@Test
public void renderCustomTypePaper() throws Exception {
    // setup
    config.setTemplateFileNameForDocType("paper", "paper." + templateExtension);
    DocumentTypes.addDocumentType("paper");
    db.updateSchema();

    Crawler crawler = new Crawler(db, config);
    crawler.crawl();
    Parser parser = new Parser(config);
    Renderer renderer = new Renderer(db, config);

    String filename = "published-paper.html";
    File sampleFile = new File(sourceFolder.getPath() + File.separator + "content"
            + File.separator + "papers" + File.separator + filename);
    Map<String, Object> content = parser.processFile(sampleFile);
    content.put(Crawler.Attributes.URI, "/" + filename);
    renderer.render(content);
    File outputFile = new File(destinationFolder, filename);
    Assert.assertTrue(outputFile.exists());

    // verify
    String output = FileUtils.readFileToString(outputFile, Charset.defaultCharset());
    for (String string : getOutputStrings("paper")) {
        assertThat(output).contains(string);
    }
}
StringBuilder sb = new StringBuilder();
sb.append("Processing [").append(sourceFile.getPath()).append("]... ");
String sha1 = buildHash(sourceFile);
String uri = buildURI(sourceFile);
boolean process = true;
DocumentStatus status = DocumentStatus.NEW;
for (String docType : DocumentTypes.getDocumentTypes()) {
    status = findDocumentStatus(docType, uri, sha1);
    if (status == DocumentStatus.UPDATED) {
        sb.append(" : modified ");
        crawlSourceFile(sourceFile, sha1, uri);
        crawl(sourceFile);
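The status handling above leans on a small DocumentStatus enum; a sketch of its likely shape follows. NEW and UPDATED appear in the snippet, while IDENTICAL is an assumption for the unchanged-file case.

// Sketch of the DocumentStatus enum implied above; IDENTICAL is an assumption.
public enum DocumentStatus {
    NEW,       // no stored document for this URI yet
    UPDATED,   // a stored document exists but its SHA1 differs from the source file
    IDENTICAL  // the stored SHA1 matches, so the file can be skipped
}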
private String buildURI(final File sourceFile) {
    String uri = FileUtil.asPath(sourceFile).replace(FileUtil.asPath(config.getContentFolder()), "");
    if (useNoExtensionUri(uri)) {
        // convert URI from xxx.html to xxx/index.html
        uri = createNoExtensionUri(uri);
    } else {
        uri = createUri(uri);
    }
    // strip off leading / to enable generating non-root based sites
    if (uri.startsWith(FileUtil.URI_SEPARATOR_CHAR)) {
        uri = uri.substring(1, uri.length());
    }
    return uri;
}
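For reference, a minimal sketch of what the two URI helpers could look like, inferred from the inline comments and the configured output extension; these bodies are assumptions, not the real implementations.

// Hypothetical helper bodies, inferred from the behavior described above.
private String createUri(String uri) {
    // swap the source extension (e.g. .md) for the configured output extension
    return uri.substring(0, uri.lastIndexOf('.')) + config.getOutputExtension();
}

private String createNoExtensionUri(String uri) {
    // turn /blog/2024/post.html into /blog/2024/post/index.html
    return uri.substring(0, uri.lastIndexOf('.')) + FileUtil.URI_SEPARATOR_CHAR + "index.html";
}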
public void crawl() {
    crawl(config.getContentFolder());
    LOGGER.info("Content detected:");
    for (String docType : DocumentTypes.getDocumentTypes()) {
        long count = db.getDocumentCount(docType);
        if (count > 0) {
            LOGGER.info("Parsed {} files of type: {}", count, docType);
        }
    }
}
Crawler c = new Crawler();
c.schedule(seedDocument);
c.waitUntilCompletion();
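Since the snippet above only shows the call site, here is a hypothetical sketch of a Crawler exposing that schedule/waitUntilCompletion API, backed by an ExecutorService; the internals are an assumption.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class Crawler {

    private final ExecutorService pool = Executors.newFixedThreadPool(4);

    // Queue a document for asynchronous processing.
    public void schedule(Object seedDocument) {
        pool.submit(() -> process(seedDocument));
    }

    // Block until all scheduled work has finished.
    public void waitUntilCompletion() {
        pool.shutdown();
        try {
            pool.awaitTermination(1, TimeUnit.HOURS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    private void process(Object document) {
        // fetch and parse the document, then schedule any follow-ups
    }
}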
/**
 * Create default {@link Utensils} for a given {@link JBakeConfiguration}
 *
 * @param config a {@link JBakeConfiguration}
 * @return a default {@link Utensils} instance
 */
public static Utensils createDefaultUtensils(JBakeConfiguration config) {
    JBakeConfigurationInspector inspector = new JBakeConfigurationInspector(config);
    inspector.inspect();
    Utensils utensils = new Utensils();
    utensils.setConfiguration(config);
    ContentStore contentStore = DBUtil.createDataStore(config);
    utensils.setContentStore(contentStore);
    utensils.setCrawler(new Crawler(contentStore, config));
    utensils.setRenderer(new Renderer(contentStore, config));
    utensils.setAsset(new Asset(config));
    return utensils;
}
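As a hedged usage sketch, the default utensils are typically handed straight to an Oven, as the mock-based test further down also does; the config variable is assumed to be an already-built JBakeConfiguration.

// Hedged usage sketch: config is assumed to be an already-built JBakeConfiguration.
Utensils utensils = createDefaultUtensils(config);
Oven oven = new Oven(utensils);
oven.bake(); // crawls content, renders documents, and copies assets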
Map<String, Object> fileContents = parser.processFile(sourceFile);
if (fileContents != null) {
    fileContents.put(Attributes.ROOTPATH, getPathToRoot(sourceFile));
    fileContents.put(String.valueOf(DocumentAttributes.SHA1), sha1);
    fileContents.put(String.valueOf(DocumentAttributes.RENDERED), false);
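The rootpath attribute set above is what the tests later assert as "../../../". A hypothetical sketch of getPathToRoot follows, emitting one "../" per directory level between the source file and the content root; the real helper may differ (no-extension URIs add an extra index.html directory level, for example).

// Hypothetical sketch: one "../" per directory level up to the content root.
private String getPathToRoot(File sourceFile) {
    StringBuilder path = new StringBuilder();
    File parent = sourceFile.getParentFile();
    while (parent != null && !parent.equals(config.getContentFolder())) {
        path.append("../");
        parent = parent.getParentFile();
    }
    return path.toString();
}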
crawler.crawl();
Thread thread = new Thread(new Crawler("https://www.google.com.pk/?gws_rd=cr&ei=-q8vUqqNDIny4QTLlYCwAQ#q=pakistan"/*new BasicDAO().getNonProcessedLink()*/));
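Note that the thread is only constructed here; it still needs thread.start() to run. For context, a hypothetical sketch of the Runnable Crawler this expects follows; the URL-fetching body is an assumption for illustration, and the commented-out BasicDAO call above hints at where persisted links would come from.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class Crawler implements Runnable {

    private final String url;

    public Crawler(String url) {
        this.url = url;
    }

    @Override
    public void run() {
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(url).openStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                // extract links from the page and queue the non-processed ones here
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}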
@Test
public void renderWithPrettyUrls() throws Exception {
    config.setUriWithoutExtension(true);
    config.setPrefixForUriWithoutExtension("/blog");
    Crawler crawler = new Crawler(db, config);
    crawler.crawl();

    Assert.assertEquals(4, db.getDocumentCount("post"));
    Assert.assertEquals(3, db.getDocumentCount("page"));

    DocumentList documents = db.getPublishedPosts();
    for (Map<String, Object> model : documents) {
        String noExtensionUri = "blog/\\d{4}/" + FilenameUtils.getBaseName((String) model.get("file")) + "/";
        Assert.assertThat(model.get("noExtensionUri"), RegexMatcher.matches(noExtensionUri));
        Assert.assertThat(model.get("uri"), RegexMatcher.matches(noExtensionUri + "index\\.html"));
        assertThat(model).containsEntry("rootpath", "../../../");
    }
}
@Test
public void shouldCrawlRenderAndCopyAssets() throws Exception {
    configuration.setTemplateFolder(folder.newFolder("template"));
    configuration.setContentFolder(folder.newFolder("content"));
    configuration.setAssetFolder(folder.newFolder("assets"));
    contentStore = spy(new ContentStore("memory", "documents" + System.currentTimeMillis()));
    Crawler crawler = mock(Crawler.class);
    Renderer renderer = mock(Renderer.class);
    Asset asset = mock(Asset.class);

    Utensils utensils = new Utensils();
    utensils.setConfiguration(configuration);
    utensils.setContentStore(contentStore);
    utensils.setRenderer(renderer);
    utensils.setCrawler(crawler);
    utensils.setAsset(asset);

    Oven oven = new Oven(utensils);
    oven.bake();

    verify(contentStore, times(1)).startup();
    verify(renderer, atLeastOnce()).renderIndex(anyString());
    verify(crawler, times(1)).crawl();
    verify(asset, times(1)).copy();
}
var crawler = new Crawler();
@Before
public void setup() throws Exception {
    currentLocale = Locale.getDefault();
    Locale.setDefault(Locale.ENGLISH);

    ModelExtractorsDocumentTypeListener listener = new ModelExtractorsDocumentTypeListener();
    DocumentTypes.addListener(listener);
    templateFolder = new File(sourceFolder, templateDir);
    if (!templateFolder.exists()) {
        throw new Exception("Cannot find template folder!");
    }
    destinationFolder = folder.getRoot();
    config.setDestinationFolder(destinationFolder);
    config.setTemplateFolder(templateFolder);
    for (String docType : DocumentTypes.getDocumentTypes()) {
        File templateFile = config.getTemplateFileByDocType(docType);
        if (templateFile != null) {
            String fileName = templateFile.getName();
            String fileBaseName = fileName.substring(0, fileName.lastIndexOf("."));
            config.setTemplateFileNameForDocType(docType, fileBaseName + "." + templateExtension);
        }
    }
    Assert.assertEquals(".html", config.getOutputExtension());
    Crawler crawler = new Crawler(db, config);
    crawler.crawl();
    parser = new Parser(config);
    renderer = new Renderer(db, config);
    setupExpectedOutputStrings();
}
public class Crawler {

    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        String[] urls = new String[]{
                "http://www.url.com/1",
                "http://www.url.com/2",
                "http://www.url.com/3"
        };
        crawler.crawl(urls);
        try {
            Thread.sleep(1000 * 60 * 15); // without this the unit test exits too early
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    // the rest of the class definition
}
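The fixed fifteen-minute sleep is fragile; a hedged alternative is to track outstanding work explicitly, for example with a CountDownLatch (hypothetical wiring, since the real crawl internals are not shown).

import java.util.concurrent.CountDownLatch;

public class CrawlerMain {

    public static void main(String[] args) throws InterruptedException {
        String[] urls = { "http://www.url.com/1", "http://www.url.com/2", "http://www.url.com/3" };
        CountDownLatch done = new CountDownLatch(urls.length);
        for (String url : urls) {
            new Thread(() -> {
                try {
                    System.out.println("crawling " + url); // fetch and process url here
                } finally {
                    done.countDown(); // signal completion for this URL
                }
            }).start();
        }
        done.await(); // returns as soon as every URL is processed, no fixed sleep
    }
}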
@Test
public void crawl() {
    Crawler crawler = new Crawler(db, config);
    crawler.crawl();

    Assert.assertEquals(4, db.getDocumentCount("post"));
    Assert.assertEquals(3, db.getDocumentCount("page"));

    DocumentList results = db.getPublishedPosts();
    assertThat(results.size()).isEqualTo(3);
    for (Map<String, Object> content : results) {
        assertThat(content)
                .containsKey(Crawler.Attributes.ROOTPATH)
                .containsValue("../../../");
    }

    DocumentList allPosts = db.getAllContent("post");
    assertThat(allPosts.size()).isEqualTo(4);
    for (Map<String, Object> content : allPosts) {
        if (content.get(Crawler.Attributes.TITLE).equals("Draft Post")) {
            assertThat(content).containsKey(Crawler.Attributes.DATE);
        }
    }

    // covers bug #213
    DocumentList publishedPostsByTag = db.getPublishedPostsByTag("blog");
    Assert.assertEquals(3, publishedPostsByTag.size());
}