/**
 * Loads the page that follows the given one.
 *
 * @param page the page currently being processed
 * @return the document that the page's "next" link points to
 * @throws IOException if the page has no "next" link ("No more pages"), or if
 *                     building or loading the next URL fails
 */
@Override
public Document getNextPage(Document page) throws IOException {
    Elements nextLinks = page.select("li.page_next > a");
    if (nextLinks.isEmpty()) {
        throw new IOException("No more pages");
    }

    // The href is resolved against this ripper's own URL, so relative links work too.
    String href = nextLinks.first().attr("href");
    URL target = new URL(this.url, href);
    return Http.url(target).get();
}
/**
 * Turns root-relative image paths in an HTML fragment into absolute URLs, so that
 * clients receiving article data through the API can load the images directly.
 *
 * <p>Only {@code src} values that start with a single "/" are rewritten. Absolute URLs,
 * protocol-relative URLs ("//host/...") and paths that do not start with "/" are left
 * untouched.
 *
 * @param html   the HTML fragment to process; returned as-is when {@code null}
 * @param domain the prefix to put in front of root-relative paths, e.g.
 *               {@code http://example.com}; when blank, {@code html} is returned unchanged
 * @return the serialized child elements of the parsed body, with image paths rewritten
 */
public static String makeImageSrcToAbsolutePath(String html, String domain) {
    if (html == null || StrUtils.isBlank(domain)) {
        return html;
    }

    // Drop one trailing slash so "http://host/" + "/a.png" does not become "http://host//a.png".
    String prefix = domain.endsWith("/") ? domain.substring(0, domain.length() - 1) : domain;

    Document doc = Jsoup.parse(html);
    for (Element img : doc.select("img")) {
        String src = img.attr("src");
        // "//cdn.example.com/a.png" is protocol-relative: it already names its own host,
        // so prefixing it with the domain would corrupt the URL.
        boolean rootRelative = StrUtils.isNotBlank(src) && src.startsWith("/") && !src.startsWith("//");
        if (rootRelative) {
            img.attr("src", prefix + src);
        }
    }

    return doc.body().children().toString();
}
private void normaliseStructure(String tag, Element htmlEl) { Elements elements = this.getElementsByTag(tag); Element master = elements.first(); // will always be available as created above if not existent if (elements.size() > 1) { // dupes, move contents to master List<Node> toMove = new ArrayList<>(); for (int i = 1; i < elements.size(); i++) { Node dupe = elements.get(i); toMove.addAll(dupe.ensureChildNodes()); dupe.remove(); } for (Node dupe : toMove) master.appendChild(dupe); } // ensure parented by <html> if (!master.parent().equals(htmlEl)) { htmlEl.appendChild(master); // includes remove() } }
/**
 * Parses an HTML response body into a {@code Page} made of the document title and the
 * {@code href} value of every anchor that has one.
 *
 * @param responseBody the raw HTTP response body, read fully as a string
 * @return a page holding the title and an unmodifiable list of link targets, in document order
 * @throws IOException if the body cannot be read
 */
@Override
public Page convert(ResponseBody responseBody) throws IOException {
    String html = responseBody.string();
    Document document = Jsoup.parse(html);

    List<String> links = new ArrayList<>();
    for (Element anchor : document.select("a[href]")) {
        String target = anchor.attr("href");
        links.add(target);
    }

    List<String> readOnlyLinks = Collections.unmodifiableList(links);
    return new Page(document.title(), readOnlyLinks);
}
}
/** Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. The original document is not modified. Only elements from the dirt document's <code>body</code> are used. @param dirtyDocument Untrusted base document to clean. @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body. copySafeNodes(dirtyDocument.body(), clean.body()); return clean; }
/**
 * Fetches one page of API results and parses it as JSON.
 *
 * @param page   the page argument passed on to the API URL builder
 * @param apiKey the API key passed on to the API URL builder
 * @return the parsed response, or {@code null} if the API URL is malformed or the
 *         request fails
 */
private JSONObject getJSON(String page, String apiKey) {
    String apiURL = null;
    URL pageURL;
    try {
        apiURL = apiURLBuilder(getPhotosetID(url.toExternalForm()), page, apiKey);
        pageURL = new URL(apiURL);
    } catch (MalformedURLException e) {
        LOGGER.error("Unable to get api link " + apiURL + " is malformed");
        // Without a valid URL there is nothing to request.
        return null;
    }

    try {
        // Fetch once and reuse the body for both logging and parsing.
        String body = Http.url(pageURL).ignoreContentType().get().text();
        LOGGER.info(body);
        return new JSONObject(body);
    } catch (IOException e) {
        LOGGER.error("Unable to load api link " + apiURL, e);
        return null;
    }
}
Document doc = Http.url(url).get(); Elements metaTags = doc.getElementsByTag("meta"); if (metaTag.attr("property").equals("og:image")) { imgsrc = metaTag.attr("content"); LOGGER.info("Found URL " + imgsrc); break;//only one (useful) image possible for an "image page". LOGGER.warn("Image not found at " + this.url); return; addURLToDownload(new URL(imgsrc), prefix); } catch (IOException e) { LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
/**
 * Downloads the video referenced by the page's {@code twitter:player:stream} meta tag.
 *
 * @throws IOException if the page cannot be loaded, the meta tag is missing, or the
 *                     stream URL is malformed
 */
@Override
public void rip() throws IOException {
    LOGGER.info(" Retrieving " + this.url.toExternalForm());
    Document doc = Http.url(this.url).get();

    Elements videos = doc.select("meta[name=twitter:player:stream]");
    if (videos.isEmpty()) {
        throw new IOException("Could not find twitter:player:stream at " + url);
    }

    String vidUrl = videos.first().attr("content");
    // Decode ampersands that are still HTML-escaped in the attribute value. The previous
    // replaceAll("&", "&") replaced "&" with itself and therefore did nothing.
    vidUrl = vidUrl.replace("&amp;", "&");

    addURLToDownload(new URL(vidUrl), HOST + "_" + getGID(this.url));
    waitForThreads();
}
}
@Override public List<String> getURLsFromPage(Document doc) { List<String> results = new ArrayList<>(); String duckMoviesUrl = doc.select("iframe").attr("src"); try { Document duckDoc = Http.url(new URL(duckMoviesUrl)).get(); String videoURL = duckDoc.select("source").attr("src"); // remove any white spaces so we can download the movie without a 400 error videoURL = videoURL.replaceAll(" ", "%20"); results.add(videoURL); } catch (MalformedURLException e) { LOGGER.error(duckMoviesUrl + " is not a valid url"); } catch (IOException e) { LOGGER.error("Unable to load page " + duckMoviesUrl); e.printStackTrace(); } return results; }
private void fetchImage() { try { Document doc = Http.url(this.url) .referrer(this.url) .get(); // Find image Elements images = doc.select("#photoImageSection img"); Element image = images.first(); String imgsrc = image.attr("src"); LOGGER.info("Found URL " + imgsrc + " via " + images.get(0)); // Provide prefix and let the AbstractRipper "guess" the filename String prefix = ""; if (Utils.getConfigBoolean("download.save_order", true)) { prefix = String.format("%03d_", index); } URL imgurl = new URL(url, imgsrc); addURLToDownload(imgurl, prefix); } catch (IOException e) { LOGGER.error("[!] Exception while loading/parsing " + this.url, e); } } }
/**
 * Finds the media on the page and hands each URL straight to {@code downloadFile}.
 *
 * <p>For a video URL the player link is used. Otherwise every item link is visited and
 * the image found on that page is used. The returned list is never filled by this
 * method; it is always empty.
 *
 * @param doc the album or video page
 * @return an empty list
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    LOGGER.debug("Checking for urls");
    List<String> result = new ArrayList<>();

    if (isVideoUrl(url)) {
        String imgUrl = doc.select("div.player-container > a").attr("href");
        downloadFile(imgUrl);
        return result;
    }

    for (Element item : doc.select("div.items > div.item-container > a.item")) {
        String pageWithImageUrl = item.attr("href");
        try {
            Document imagePage = Http.url(new URL(pageWithImageUrl)).get();
            String image = imagePage.select("div.picture_container > a > img").attr("src");
            downloadFile(image);
        } catch (IOException e) {
            LOGGER.error("Was unable to load page " + pageWithImageUrl);
        }
    }
    return result;
}
/**
 * Finds the gallery that an image page belongs to.
 *
 * @param url the image page to inspect
 * @return the URL of the first gallery link found on the page
 * @throws IOException if the page cannot be loaded or contains no gallery link
 */
private URL getGalleryFromImage(URL url) throws IOException {
    Document doc = Http.url(url).get();

    for (Element link : doc.select("a[href~=^gallery\\.php.*$]")) {
        LOGGER.info("LINK: " + link.toString());

        String href = link.attr("href");
        boolean pointsToGallery = link.hasAttr("href") && href.contains("gallery.php");
        if (!pointsToGallery) {
            continue;
        }

        URL gallery = new URL("http://imagearn.com/" + href);
        LOGGER.info("[!] Found gallery from given link: " + gallery);
        return gallery;
    }

    throw new IOException("Failed to find gallery at URL " + url);
}
@Override public void rip() throws IOException { LOGGER.info("Retrieving " + this.url); Document doc = Http.url(url).get(); //Get user friendly filename from page title String title = doc.title(); Elements script = doc.select("script"); if (script.isEmpty()) { throw new IOException("Could not find script code at " + url); } //Regex assumes highest quality source is listed first Pattern p = Pattern.compile("\"source\":\"(.*?)\""); for (Element element : script) { Matcher m = p.matcher(element.data()); if (m.find()){ String vidUrl = m.group(1); addURLToDownload(new URL(vidUrl), HOST + "_" + title); } } waitForThreads(); } }
/**
 * Downloads the video whose {@code <source>} sits inside the page's {@code .wp-video}
 * player.
 *
 * @throws IOException if the page cannot be loaded, no matching source element exists,
 *                     or its {@code src} is not a valid URL
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);

    Elements videos = Http.url(url).get().select(".wp-video > video > source");
    if (videos.isEmpty()) {
        throw new IOException("Could not find Embed code at " + url);
    }

    // attr() on the collection reads the value from the first matching element that has it
    URL video = new URL(videos.attr("src"));
    String filePrefix = HOST + "_" + getGID(this.url);
    addURLToDownload(video, filePrefix);

    waitForThreads();
}
}
/**
 * Downloads the first file referenced by a {@code file:"..."} entry in the page source.
 *
 * @throws IOException if the page cannot be loaded, no such entry exists, or the first
 *                     entry is not a valid URL
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);

    String pageSource = Http.url(url).get().html();
    List<String> mp4s = Utils.between(pageSource, "file:\"", "\"");
    if (mp4s.isEmpty()) {
        throw new IOException("Could not find files at " + url);
    }

    // Only the first entry is used.
    URL video = new URL(mp4s.get(0));
    String filePrefix = HOST + "_" + getGID(this.url);
    addURLToDownload(video, filePrefix);

    waitForThreads();
}
}
public static List<URL> getURLs(URL url) throws IOException{ Response resp = Http.url(url) .ignoreContentType() .response(); Document doc = resp.parse(); List<URL> URLs = new ArrayList<>(); //Pictures Elements imgs = doc.getElementsByTag("img"); for (Element img : imgs) { if (img.hasClass("album-image")) { String imageURL = img.attr("src"); URLs.add(new URL(imageURL)); } } //Videos Elements vids = doc.getElementsByTag("video"); for (Element vid : vids) { if (vid.hasClass("album-video")) { Elements source = vid.getElementsByTag("source"); String videoURL = source.first().attr("src"); URLs.add(new URL(videoURL)); } } return URLs; } }
/** * Article API * @param URL * @param JSONObject genericScraperData * @return genericScraperData */ public JSONObject articleAPI (String url, JSONObject genericScraperData) throws MalformedURLException{ URL qurl = new URL(url); String data = ""; try { data = null;// ArticleExtractor.INSTANCE.getText(qurl); genericScraperData.put("query", qurl); genericScraperData.put("data", data); genericScraperData.put("NLP", "true"); } catch (Exception e) { if ("".equals(data)) { try { Document htmlPage = Jsoup.connect(url).get(); data = htmlPage.text(); genericScraperData.put("query", qurl); genericScraperData.put("data", data); genericScraperData.put("NLP", "false"); } catch (Exception ex) {} } } return genericScraperData; }
@Override public String getAlbumTitle(URL url) throws MalformedURLException { if (!is_profile(url)) { try { // Attempt to use album title as GID Element titleElement = getFirstPage().select("meta[property=og:title]").first(); String title = titleElement.attr("content"); title = title.substring(title.lastIndexOf('/') + 1); return getHost() + "_" + getGID(url) + "_" + title.trim(); } catch (IOException e) { // Fall back to default album naming convention LOGGER.info("Unable to find title at " + url); } return super.getAlbumTitle(url); } return url.toExternalForm().split("/u/")[1]; }
private String vscoImageToURL(String url) throws IOException{ Document page = Jsoup.connect(url).userAgent(USER_AGENT) .get(); //create Elements filled only with Elements with the "meta" tag. Elements metaTags = page.getElementsByTag("meta"); String result = ""; for(Element metaTag : metaTags){ //find URL inside meta-tag with property of "og:image" if (metaTag.attr("property").equals("og:image")){ String givenURL = metaTag.attr("content"); givenURL = givenURL.replaceAll("\\?h=[0-9]+", "");//replace the "?h=xxx" tag at the end of the URL (where each x is a number) result = givenURL; LOGGER.debug("Found image URL: " + givenURL); break;//immediately stop after getting URL (there should only be 1 image to be downloaded) } } //Means website changed, things need to be fixed. if (result.isEmpty()){ LOGGER.error("Could not find image URL at: " + url); } return result; }