org.jsoup.nodes.Document java code examples

Refine search

/**
 * Loads the page that follows {@code page}.
 *
 * <p>The first {@code li.page_next > a} anchor is located and its {@code href} is
 * resolved against this object's {@code url}.
 *
 * @param page the page currently being processed
 * @return the document fetched from the next-page link
 * @throws IOException if the page contains no next-page link; also propagated from
 *         resolving the link or fetching it
 */
@Override
public Document getNextPage(Document page) throws IOException {
  Elements nextAnchors = page.select("li.page_next > a");
  if (nextAnchors.isEmpty()) {
    throw new IOException("No more pages");
  }
  String href = nextAnchors.first().attr("href");
  URL target = new URL(this.url, href);
  return Http.url(target).get();
}

/**
 * Rewrites root-relative image sources in an HTML fragment to absolute URLs, so that
 * API clients receiving article HTML can display the images directly.
 *
 * <p>Only {@code src} values that start with a single {@code /} are prefixed with
 * {@code domain}. Protocol-relative sources ({@code //host/path}) already name their
 * host and are left untouched.
 *
 * <p>Note: the result is built from the child <em>elements</em> of the parsed body.
 *
 * @param html   the HTML fragment to process
 * @param domain the prefix to prepend to root-relative sources; when blank,
 *               {@code html} is returned unchanged
 * @return the serialized child elements of the parsed body, with image sources rewritten
 */
public static String makeImageSrcToAbsolutePath(String html, String domain) {
  if (StrUtils.isBlank(domain)) {
    return html;
  }
  Document doc = Jsoup.parse(html);
  // select() never returns null; an empty result simply skips the loop.
  for (Element img : doc.select("img")) {
    String src = img.attr("src");
    boolean rootRelative = StrUtils.isNotBlank(src)
        && src.startsWith("/")
        && !src.startsWith("//"); // "//cdn.example.com/a.png" is already host-qualified
    if (rootRelative) {
      img.attr("src", domain + src);
    }
  }
  return doc.body().children().toString();
}

/**
 * Merges every element named {@code tag} into the first one found and makes sure that
 * first element is a child of {@code htmlEl}.
 *
 * @param tag    tag name to consolidate
 * @param htmlEl element that must end up as the parent of the surviving element
 */
private void normaliseStructure(String tag, Element htmlEl) {
  Elements matches = this.getElementsByTag(tag);
  // NOTE(review): assumes the caller has already guaranteed at least one match;
  // first() would otherwise yield null and the parent check below would fail.
  Element primary = matches.first();

  if (matches.size() > 1) {
    // Collect the children of every duplicate first, detaching each duplicate as we go,
    // and only then re-home the collected nodes under the primary element.
    List<Node> orphans = new ArrayList<>();
    for (int idx = 1; idx < matches.size(); idx++) {
      Node duplicate = matches.get(idx);
      orphans.addAll(duplicate.ensureChildNodes());
      duplicate.remove();
    }
    for (Node orphan : orphans) {
      primary.appendChild(orphan);
    }
  }

  // Re-parent under htmlEl when it currently lives elsewhere.
  boolean alreadyUnderHtml = primary.parent().equals(htmlEl);
  if (!alreadyUnderHtml) {
    htmlEl.appendChild(primary); // appendChild also detaches from the old parent
  }
}

 /**
  * Parses the response body as HTML and builds a {@code Page} from the document title
  * and the {@code href} value of every {@code a[href]} element.
  *
  * @param responseBody body whose string content is parsed
  * @return a {@code Page} constructed from the title and an unmodifiable list of hrefs
  * @throws IOException propagated from reading the body
  */
 @Override public Page convert(ResponseBody responseBody) throws IOException {
  String html = responseBody.string();
  Document parsed = Jsoup.parse(html);

  List<String> hrefs = new ArrayList<>();
  for (Element anchor : parsed.select("a[href]")) {
   String href = anchor.attr("href");
   hrefs.add(href);
  }

  List<String> readOnlyHrefs = Collections.unmodifiableList(hrefs);
  return new Page(parsed.title(), readOnlyHrefs);
 }
}

/**
 Builds a new, clean document from the supplied dirty document, keeping only the elements
 allowed by the whitelist. The dirty document itself is not modified, and only content
 from the dirty document's <code>body</code> is considered.
 @param dirtyDocument Untrusted base document to clean.
 @return cleaned document.
 */
public Document clean(Document dirtyDocument) {
  Validate.notNull(dirtyDocument);

  Document cleaned = Document.createShell(dirtyDocument.baseUri());
  Element dirtyBody = dirtyDocument.body();
  // Frameset documents have no body; in that case the clean shell keeps its empty body.
  if (dirtyBody != null) {
    copySafeNodes(dirtyBody, cleaned.body());
  }
  return cleaned;
}

/**
 * Collects the {@code src} attribute of every {@code img} that is a direct child of a
 * {@code .vung-doc} element.
 *
 * @param doc chapter page to scan
 * @return the {@code src} values in document order; empty when nothing matches
 */
private List<String> getURLsFromChap(Document doc) {
  LOGGER.debug("Getting urls from " + doc.location());
  List<Element> chapterImages = doc.select(".vung-doc > img");
  List<String> sources = new ArrayList<>();
  for (int i = 0; i < chapterImages.size(); i++) {
    sources.add(chapterImages.get(i).attr("src"));
  }
  return sources;
}

/**
 * Requests one page from the API and parses the response text as JSON.
 *
 * <p>The API URL is produced by {@code apiURLBuilder} from the photoset id of this
 * object's {@code url}, the page and the API key. The response is fetched once and the
 * same text is both logged and parsed.
 *
 * @param page   page value handed to {@code apiURLBuilder}
 * @param apiKey API key handed to {@code apiURLBuilder}
 * @return the parsed response, or {@code null} when the API URL is malformed or the
 *         request fails
 */
private JSONObject getJSON(String page, String apiKey) {
  String apiURL = null;
  URL pageURL;
  try {
    apiURL = apiURLBuilder(getPhotosetID(url.toExternalForm()), page, apiKey);
    pageURL = new URL(apiURL);
  } catch (MalformedURLException e) {
    LOGGER.error("Unable to get api link " + apiURL + " is malformed");
    // Without a valid URL there is nothing to request.
    return null;
  }
  try {
    // Fetch once; reuse the text for both the log line and the parse.
    String responseText = Http.url(pageURL).ignoreContentType().get().text();
    LOGGER.info(responseText);
    return new JSONObject(responseText);
  } catch (IOException e) {
    LOGGER.error("Unable to load api link " + apiURL, e);
    return null;
  }
}

  Document doc = Http.url(url).get();
  Elements metaTags = doc.getElementsByTag("meta");
    if (metaTag.attr("property").equals("og:image")) {
      imgsrc = metaTag.attr("content");
      LOGGER.info("Found URL " + imgsrc);
      break;//only one (useful) image possible for an "image page".
    LOGGER.warn("Image not found at " + this.url);
    return;
  addURLToDownload(new URL(imgsrc), prefix);
} catch (IOException e) {
  LOGGER.error("[!] Exception while loading/parsing " + this.url, e);

  /**
   * Downloads the video referenced by the page's {@code twitter:player:stream} meta tag.
   *
   * @throws IOException if the page has no such meta tag; also propagated from the fetch
   *         or from building the video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("    Retrieving " + this.url.toExternalForm());
    Document page = Http.url(this.url).get();

    Elements streamMetas = page.select("meta[name=twitter:player:stream]");
    if (streamMetas.isEmpty()) {
      throw new IOException("Could not find twitter:player:stream at " + url);
    }

    // The attribute value may still carry HTML-escaped ampersands.
    String streamUrl = streamMetas.first().attr("content").replace("&amp;", "&");
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(new URL(streamUrl), prefix);
    waitForThreads();
  }
}

/**
 * Resolves the video URL embedded in a page via an iframe.
 *
 * <p>Reads the {@code src} of the page's {@code iframe}, loads that document, and takes
 * the {@code src} of its {@code source} element. Failures are logged and yield an empty
 * list rather than an exception.
 *
 * @param doc page containing the iframe
 * @return a list holding the video URL, or an empty list if it could not be resolved
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  List<String> results = new ArrayList<>();
  String duckMoviesUrl = doc.select("iframe").attr("src");
  try {
    Document duckDoc = Http.url(new URL(duckMoviesUrl)).get();
    String videoURL = duckDoc.select("source").attr("src");
    if (videoURL.isEmpty()) {
      // attr() yields "" when nothing matched; don't hand an empty URL to the caller.
      LOGGER.error("No video source found at " + duckMoviesUrl);
    } else {
      // remove any white spaces so we can download the movie without a 400 error
      results.add(videoURL.replace(" ", "%20"));
    }
  } catch (MalformedURLException e) {
    LOGGER.error(duckMoviesUrl + " is not a valid url");
  } catch (IOException e) {
    // Route the stack trace through the logger instead of stderr.
    LOGGER.error("Unable to load page " + duckMoviesUrl, e);
  }
  return results;
}

  /**
   * Loads this object's {@code url}, finds the first image inside
   * {@code #photoImageSection} and queues it for download.
   *
   * <p>Load/parse failures and a missing image are logged; nothing is thrown.
   */
  private void fetchImage() {
    try {
      Document doc = Http.url(this.url)
                .referrer(this.url)
                .get();
      // Find image
      Element image = doc.select("#photoImageSection img").first();
      if (image == null) {
        // first() returns null when the selector matched nothing.
        LOGGER.error("[!] Could not find image at " + this.url);
        return;
      }
      String imgsrc = image.attr("src");
      LOGGER.info("Found URL " + imgsrc + " via " + image);
      // Provide prefix and let the AbstractRipper "guess" the filename
      String prefix = "";
      if (Utils.getConfigBoolean("download.save_order", true)) {
        prefix = String.format("%03d_", index);
      }
      // Resolve against the page URL so relative sources work too.
      URL imgurl = new URL(url, imgsrc);
      addURLToDownload(imgurl, prefix);
    } catch (IOException e) {
      LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
    }
  }
}

/**
 * Triggers downloads for the media found on {@code doc}.
 *
 * <p>For a video URL the {@code href} of {@code div.player-container > a} is downloaded.
 * Otherwise every {@code a.item} link is followed and the image found on that page is
 * downloaded. Downloads go through {@code downloadFile}; the returned list is never
 * populated here.
 *
 * @param doc the page to scan
 * @return an empty list
 */
@Override
public List<String> getURLsFromPage(Document doc) {
  LOGGER.debug("Checking for urls");
  List<String> result = new ArrayList<>();

  if (isVideoUrl(url)) {
    String videoHref = doc.select("div.player-container > a").attr("href");
    downloadFile(videoHref);
    return result;
  }

  for (Element item : doc.select("div.items > div.item-container > a.item")) {
    String itemPageUrl = item.attr("href");
    try {
      Document itemPage = Http.url(new URL(itemPageUrl)).get();
      String imageSrc = itemPage.select("div.picture_container > a > img").attr("src");
      downloadFile(imageSrc);
    } catch (IOException e) {
      LOGGER.error("Was unable to load page " + itemPageUrl);
    }
  }
  return result;
}

/**
 * Finds the gallery an image page belongs to.
 *
 * <p>Loads {@code url} and returns the first {@code gallery.php} link found on it,
 * prefixed with {@code http://imagearn.com/}.
 *
 * @param url image page to inspect
 * @return the gallery URL built from the first matching link
 * @throws IOException if no gallery link is present; also propagated from the fetch
 */
private URL getGalleryFromImage(URL url) throws IOException {
  Document imagePage = Http.url(url).get();
  for (Element anchor : imagePage.select("a[href~=^gallery\\.php.*$]")) {
    LOGGER.info("LINK: " + anchor.toString());
    boolean pointsToGallery = anchor.hasAttr("href")
        && anchor.attr("href").contains("gallery.php");
    if (!pointsToGallery) {
      continue;
    }
    URL galleryUrl = new URL("http://imagearn.com/" + anchor.attr("href"));
    LOGGER.info("[!] Found gallery from given link: " + galleryUrl);
    return galleryUrl;
  }
  throw new IOException("Failed to find gallery at URL " + url);
}

  /**
   * Queues a download for each {@code script} element whose data contains a
   * {@code "source":"..."} entry, naming files after the page title.
   *
   * @throws IOException if the page has no {@code script} elements; also propagated from
   *         the fetch or from building a video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    // The page title doubles as a user-friendly file name.
    String pageTitle = page.title();

    Elements scripts = page.select("script");
    if (scripts.isEmpty()) {
      throw new IOException("Could not find script code at " + url);
    }

    // Only the first match per script is used; the regex assumes the highest quality
    // source is listed first.
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
    for (Element scriptTag : scripts) {
      Matcher match = sourcePattern.matcher(scriptTag.data());
      if (!match.find()) {
        continue;
      }
      URL videoUrl = new URL(match.group(1));
      addURLToDownload(videoUrl, HOST + "_" + pageTitle);
    }
    waitForThreads();
  }
}

  /**
   * Downloads the video referenced by the first {@code .wp-video > video > source}
   * element that carries a {@code src} attribute.
   *
   * @throws IOException if no such element exists; also propagated from the fetch or
   *         from building the video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    Elements sources = page.select(".wp-video > video > source");
    if (sources.isEmpty()) {
      throw new IOException("Could not find Embed code at " + url);
    }

    URL videoUrl = new URL(sources.attr("src"));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(videoUrl, prefix);
    waitForThreads();
  }
}

  /**
   * Downloads the first file URL that {@code Utils.between} extracts from the page HTML
   * using the delimiters {@code file:"} and {@code "}.
   *
   * @throws IOException if nothing is extracted; also propagated from the fetch or from
   *         building the video URL
   */
  @Override
  public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    String pageHtml = Http.url(url).get().html();

    List<String> fileUrls = Utils.between(pageHtml, "file:\"", "\"");
    if (fileUrls.isEmpty()) {
      throw new IOException("Could not find files at " + url);
    }

    URL videoUrl = new URL(fileUrls.get(0));
    String prefix = HOST + "_" + getGID(this.url);
    addURLToDownload(videoUrl, prefix);
    waitForThreads();
  }
}

  /**
   * Collects the media URLs found on an album page.
   *
   * <p>Pictures are {@code img} elements with class {@code album-image}; videos are
   * {@code video} elements with class {@code album-video}, whose first {@code source}
   * child supplies the URL. A video without a {@code source} child is skipped.
   *
   * @param url album page to load
   * @return picture URLs followed by video URLs, each group in document order
   * @throws IOException propagated from the request, from parsing the response, or from
   *         building a URL out of a {@code src} value
   */
  public static List<URL> getURLs(URL url) throws IOException{

    Response resp = Http.url(url)
              .ignoreContentType()
              .response();

    Document doc = resp.parse();

    List<URL> urls = new ArrayList<>();
    //Pictures
    for (Element img : doc.getElementsByTag("img")) {
      if (img.hasClass("album-image")) {
        urls.add(new URL(img.attr("src")));
      }
    }
    //Videos
    for (Element vid : doc.getElementsByTag("video")) {
      if (!vid.hasClass("album-video")) {
        continue;
      }
      Element firstSource = vid.getElementsByTag("source").first();
      if (firstSource == null) {
        // first() returns null when the video has no <source> child.
        continue;
      }
      urls.add(new URL(firstSource.attr("src")));
    }

    return urls;
  }
}

/**
 * Article API: stores the text of the page at {@code url} in {@code genericScraperData}.
 *
 * <p>Three keys are written: {@code "query"} (the parsed URL), {@code "data"} (the page
 * text, empty if it could not be obtained) and {@code "NLP"} ({@code "true"} when the
 * text came from the article extractor, {@code "false"} when the plain Jsoup text
 * fallback was used). The article extractor call is currently disabled, so the fallback
 * is always taken.
 *
 * @param url address of the article to scrape
 * @param genericScraperData object the results are written into
 * @return {@code genericScraperData}, the same instance that was passed in
 * @throws MalformedURLException if {@code url} cannot be parsed
 */
public JSONObject articleAPI (String url, JSONObject genericScraperData) throws MalformedURLException{
  URL qurl = new URL(url);
  if (genericScraperData == null) {
    // Nothing to write into; hand back what we were given, as before.
    return genericScraperData;
  }

  String data = "";
  boolean extracted = false;
  try {
    String articleText = null;// ArticleExtractor.INSTANCE.getText(qurl);
    if (articleText != null && !articleText.isEmpty()) {
      data = articleText;
      extracted = true;
    }
  }
  catch (Exception e) {
    // Extractor failure is not fatal; the plain-text fallback below takes over.
    extracted = false;
  }

  if (!extracted) {
    try {
      Document htmlPage = Jsoup.connect(url).get();
      data = htmlPage.text();
    } catch (Exception ignored) {
      // Best effort: the page could not be fetched, so "data" stays empty.
    }
  }

  genericScraperData.put("query", qurl);
  genericScraperData.put("data", data);
  genericScraperData.put("NLP", extracted ? "true" : "false");
  return genericScraperData;
}

/**
 * Builds the album title for {@code url}.
 *
 * <p>For profile URLs the title is the part of the URL after {@code /u/}. Otherwise the
 * first page's {@code og:title} meta tag is used: the text after its last {@code /} is
 * appended to the host and GID. If the page cannot be loaded or has no such tag, the
 * superclass naming is used instead.
 *
 * @param url album or profile URL
 * @return the album title
 * @throws MalformedURLException as declared by the overridden method
 */
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
  if (is_profile(url)) {
    return url.toExternalForm().split("/u/")[1];
  }
  try {
    // Attempt to use album title as GID
    Element titleElement = getFirstPage().select("meta[property=og:title]").first();
    if (titleElement != null) {
      String title = titleElement.attr("content");
      title = title.substring(title.lastIndexOf('/') + 1);
      return getHost() + "_" + getGID(url) + "_" + title.trim();
    }
    // first() returns null when the tag is absent; treat it like a failed lookup.
    LOGGER.info("Unable to find title at " + url);
  } catch (IOException e) {
    // Fall back to default album naming convention
    LOGGER.info("Unable to find title at " + url);
  }
  return super.getAlbumTitle(url);
}

/**
 * Extracts the image URL from an image page's {@code og:image} meta tag.
 *
 * <p>Any {@code ?h=<digits>} fragment is stripped from the tag's {@code content} value.
 * Only the first {@code og:image} tag is considered.
 *
 * @param url address of the image page
 * @return the image URL, or an empty string (after logging an error) if no
 *         {@code og:image} tag with content was found
 * @throws IOException propagated from fetching the page
 */
private String vscoImageToURL(String url) throws IOException{
  Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

  String imageUrl = "";
  for (Element meta : page.getElementsByTag("meta")) {
    if (!"og:image".equals(meta.attr("property"))) {
      continue;
    }
    // Drop the "?h=xxx" part (xxx being digits) from the URL.
    imageUrl = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
    LOGGER.debug("Found image URL: " + imageUrl);
    // There should be only one image to download, so stop at the first hit.
    break;
  }

  // An empty result means the site layout changed and this code needs fixing.
  if (imageUrl.isEmpty()) {
    LOGGER.error("Could not find image URL at: " + url);
  }
  return imageUrl;
}

Javadoc

A HTML Document.

Most used methods

select
body
Accessor to the document's body element.
getElementsByTag
text
Set the text of the body of this document. Any existing nodes within the body will be cleared.
getElementById
html
getElementsByClass
outputSettings
Set the document's output settings.
toString
head
Accessor to the document's head element.
title
Set the document's title element. Updates the existing element, or adds title to head if not present
getElementsByAttributeValue

Popular in Java

Finding current android device location
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided supplier.
scheduleAtFixedRate (Timer)
setScale (BigDecimal)
Proxy (java.net)
This class represents proxy server settings. A created instance of Proxy stores a type and an address and is immutable.
HashMap (java.util)
HashMap is an implementation of Map. All optional operations are supported. All elements are permitted as keys or values, including null.
Stack (java.util)
Stack is a Last-In/First-Out (LIFO) data structure which represents a stack of objects. It enables users to push objects onto and pop objects from the stack.
Callable (java.util.concurrent)
A task that returns a result and may throw an exception. Implementors define a single method with no arguments called call.
ImageIO (javax.imageio)
Notification (javax.management)
Top plugins for Android Studio

How to use Document in org.jsoup.nodes

Best Java code snippets using org.jsoup.nodes.Document (Showing top 20 results out of 4,104)

Refine search

How to use
Document
in
org.jsoup.nodes