/**
 * Collects the {@code src} attribute of every {@code img} element that is a
 * direct child of an element carrying the {@code vung-doc} class.
 *
 * @param doc the parsed page to scan
 * @return a new mutable list of the {@code src} values, in document order;
 *         empty when the selector matches nothing
 */
private List<String> getURLsFromChap(Document doc) {
    LOGGER.debug("Getting urls from " + doc.location());
    List<String> imageSources = new ArrayList<>();
    // The attribute value is taken verbatim; it is not resolved against the page URL.
    doc.select(".vung-doc > img").forEach(img -> imageSources.add(img.attr("src")));
    return imageSources;
}
@Override protected List<String> getURLsFromPage(Document page) { JSONObject collectionData = getCollectionData(page); if (collectionData == null) { LOGGER.error("Unable to find JSON data at URL: " + page.location()); // probably better than returning null, as the ripper will display // that nothing was found instead of a NullPointerException return new ArrayList<>(); } else { return getImageURLs(collectionData); } }
/**
 * Hands the given URL to {@code addURLToDownload} together with a prefix and
 * the current album's location, current page location and cookies.
 *
 * @param url   the URL to download
 * @param index not used by this implementation; the instance field
 *              {@code this.index} is advanced and used for the prefix instead
 */
@Override
protected void downloadURL(URL url, int index) {
    // Advance the instance-level counter first so the prefix reflects the new value.
    this.index += 1;
    addURLToDownload(
            url,
            getPrefix(this.index),
            currAlbum.location,
            currAlbum.currPage.location(),
            currAlbum.cookies);
}
/** * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output document * before converting. * @param in jsoup doc * @param out w3c doc * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Document) */ public void convert(org.jsoup.nodes.Document in, Document out) { if (!StringUtil.isBlank(in.location())) out.setDocumentURI(in.location()); org.jsoup.nodes.Element rootEl = in.child(0); // skip the #root node NodeTraversor.traverse(new W3CBuilder(out), rootEl); }
throw new IOException("No images found at " + doc.location()); LOGGER.debug("Fetching description(s) from " + doc.location()); List<String> textURLs = getDescriptionsFromPage(doc); if (!textURLs.isEmpty()) { LOGGER.debug("Found description link(s) from " + doc.location()); for (String textURL : textURLs) { if (isStopped()) {
throw new IOException("No images found at " + doc.location());
/**
 * Builds a {@code CrawlerResult} for a document: the absolute {@code href} of
 * every {@code a} element is gathered into a list, which is then combined with
 * the document's location, title and text and the supplied depth.
 *
 * @param document the parsed page
 * @param depth    the depth value passed through to the result
 * @return a Mono that emits the assembled result
 */
private Mono<CrawlerResult> getCrawlerResult(Document document, int depth) {
    return Flux.fromIterable(document.getElementsByTag("a"))
            .map(anchor -> anchor.absUrl("href"))
            .collectList()
            .map(links -> {
                // Document fields are read here, once the link list has been collected.
                return new CrawlerResult(
                        document.location(),
                        document.title(),
                        document.text(),
                        links,
                        depth);
            });
}