org.jsoup.nodes.Document.baseUri java code examples

/**
 Builds a new {@link Element} for the given tag name, carrying this document's base URI.
 The element is only constructed here; it is not added as a child of this document.
 @param tagName element tag name (e.g. {@code a})
 @return the newly constructed element
 */
public Element createElement(String tagName) {
  // Resolve the tag with case-preserving parse settings.
  final Tag tag = Tag.valueOf(tagName, ParseSettings.preserveCase);
  final String documentBaseUri = this.baseUri();
  return new Element(tag, documentBaseUri);
}

/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * If the next element is already a document it is returned as-is. Otherwise a new document is
 * created, a clone of the element is appended to it, and the iterator's current entry is replaced
 * by that new document.
 *
 * @param elementIterator iterator positioned before the element to check; its current entry may be replaced
 * @return the element itself when it is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
  Element element = elementIterator.next();
  if (element instanceof Document) {
    return element;
  }
  // A detached element has no owner document; fall back to the element's own base URI
  // instead of failing with a NullPointerException.
  Document owner = element.ownerDocument();
  String baseUri = owner != null ? owner.baseUri() : element.baseUri();
  Document root = new Document(baseUri);
  // Append a copy, so the original element itself is not added to the new document.
  root.appendChild(element.clone());
  // Replace the entry just returned by next() with the wrapping document.
  elementIterator.set(root);
  return root;
}

/**
 Produces a new, clean document from the supplied dirty document, keeping only the nodes that
 {@code copySafeNodes} copies across. The dirty document is only read from here, and only the
 content of its <code>body</code> is considered.
 @param dirtyDocument Untrusted base document to clean.
 @return cleaned document.
 */
public Document clean(Document dirtyDocument) {
  Validate.notNull(dirtyDocument);
  Document cleanedDoc = Document.createShell(dirtyDocument.baseUri());
  // Frameset documents won't have a body; in that case the clean document keeps its empty body.
  if (dirtyDocument.body() == null) {
    return cleanedDoc;
  }
  copySafeNodes(dirtyDocument.body(), cleanedDoc.body());
  return cleanedDoc;
}

/**
 Tests whether the <b>body</b> of the input document is valid against the whitelist. The document is
 considered valid when copying its body into a fresh shell discards nothing, and when its
 <code>head</code> has no child nodes.
 <p>
 This method can be used as a validator for user input. An invalid document can still be cleaned
 with the {@link #clean(Document)} method. When used as a validator, it is recommended to still clean
 the document, to ensure enforced attributes are set correctly and that the output is tidied.
 </p>
 @param dirtyDocument document to test
 @return true if no tags or attributes need to be removed; false if they do
 */
public boolean isValid(Document dirtyDocument) {
  Validate.notNull(dirtyDocument);
  Document shell = Document.createShell(dirtyDocument.baseUri());
  // NOTE(review): unlike clean(Document), body() is not null-checked here — confirm how
  // body-less (frameset) documents should be treated.
  if (copySafeNodes(dirtyDocument.body(), shell.body()) != 0) {
    return false;
  }
  // Only the body was copied, but we started from a shell, so also make sure there is nothing in the head.
  return dirtyDocument.head().childNodes().isEmpty();
}

    document.baseUri());
document.child(0).before(doctype);

/**
 * Reports the document URI, which is the base URI of the held {@code document}.
 *
 * @return the value of {@code document.baseUri()}
 */
@Override
public String getDocumentURI() {
  final String documentUri = document.baseUri();
  return documentUri;
}

/**
 * Returns the base URI of the held {@code document} as the document URI.
 *
 * @return the value of {@code document.baseUri()}
 */
@Override
public String getDocumentURI() {
  final String uri = document.baseUri();
  return uri;
}

// Obtain the document from con and store its base URI in returnObj under RETURN_FINAL_URL.
// NOTE(review): presumably the base URI is the URL the document was finally loaded from
// (after any redirects) — confirm.
Document doc = con.get();
String uri = doc.baseUri();
returnObj.put(RETURN_FINAL_URL, uri);

/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * If the next element is already a document it is returned as-is. Otherwise a new document is
 * created, a clone of the element is appended to it, and the iterator's current entry is replaced
 * by that new document.
 *
 * @param elementIterator iterator positioned before the element to check; its current entry may be replaced
 * @return the element itself when it is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
  Element element = elementIterator.next();
  if (element instanceof Document) {
    return element;
  }
  // A detached element has no owner document; fall back to the element's own base URI
  // instead of failing with a NullPointerException.
  Document owner = element.ownerDocument();
  String baseUri = owner != null ? owner.baseUri() : element.baseUri();
  Document root = new Document(baseUri);
  // Append a copy, so the original element itself is not added to the new document.
  root.appendChild(element.clone());
  // Replace the entry just returned by next() with the wrapping document.
  elementIterator.set(root);
  return root;
}

/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * If the next element is already a document it is returned as-is. Otherwise a new document is
 * created, a clone of the element is appended to it, and the iterator's current entry is replaced
 * by that new document.
 *
 * @param elementIterator iterator positioned before the element to check; its current entry may be replaced
 * @return the element itself when it is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
  Element element = elementIterator.next();
  if (element instanceof Document) {
    return element;
  }
  // A detached element has no owner document; fall back to the element's own base URI
  // instead of failing with a NullPointerException.
  Document owner = element.ownerDocument();
  String baseUri = owner != null ? owner.baseUri() : element.baseUri();
  Document root = new Document(baseUri);
  // Append a copy, so the original element itself is not added to the new document.
  root.appendChild(element.clone());
  // Replace the entry just returned by next() with the wrapping document.
  elementIterator.set(root);
  return root;
}

/**
 * Collects the reserved items of the given page and of every following page into {@code items}.
 * Each following page is fetched with {@code httpGet}, parsed, and given the base URI of the
 * page before it.
 *
 * @param resDoc first page of the reservation list
 * @param items  list that the parsed items of all pages are appended to
 * @throws IOException declared for the calls made while loading pages
 */
private void loadResList(Document resDoc, List<ReservedItem> items) throws IOException {
  Document page = resDoc;
  while (true) {
    items.addAll(parseResList(page));
    String nextPageUrl = findNextPageUrl(page);
    if (nextPageUrl == null) {
      // No further page: everything has been collected.
      return;
    }
    Document nextPage = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding()));
    // Carry the base URI over from the page that linked here.
    nextPage.setBaseUri(page.baseUri());
    page = nextPage;
  }
}

/**
 * Collects the lent items of the given page and of every following page into {@code items}.
 * Each following page is fetched with {@code httpGet}, parsed, and given the base URI of the
 * page before it.
 *
 * @param lentDoc first page of the lent-media list
 * @param items   list that the parsed items of all pages are appended to
 * @throws IOException declared for the calls made while loading pages
 */
private void loadMediaList(Document lentDoc, List<LentItem> items)
    throws IOException {
  Document page = lentDoc;
  while (true) {
    items.addAll(parseMediaList(page));
    String nextPageUrl = findNextPageUrl(page);
    if (nextPageUrl == null) {
      // No further page: everything has been collected.
      return;
    }
    Document nextPage = Jsoup.parse(httpGet(nextPageUrl, getDefaultEncoding()));
    // Carry the base URI over from the page that linked here.
    nextPage.setBaseUri(page.baseUri());
    page = nextPage;
  }
}

/**
 * Builds a {@link News} object from the current document.
 * <p>
 * The content element is mandatory: if it cannot be extracted, the failure is logged and rethrown.
 * The URL is taken from the document's base URI when that is non-null. Time and title are
 * best-effort: a failure extracting either one is logged and the field is left unset.
 *
 * @return the populated news object
 * @throws Exception wrapping the original failure when the content element cannot be extracted
 */
public News getNews() throws Exception {
  News news = new News();
  Element contentElement;
  try {
    contentElement = getContentElement();
    news.setContentElement(contentElement);
  } catch (Exception ex) {
    LOG.info("news content extraction failed,extraction abort", ex);
    // Kept as a wrapping Exception so callers that inspect getCause() see the same shape as before.
    throw new Exception(ex);
  }
  if (doc.baseUri() != null) {
    news.setUrl(doc.baseUri());
  }
  try {
    news.setTime(getTime(contentElement));
  } catch (Exception ex) {
    // Previously logged as a "title" failure, which misreported what actually failed.
    LOG.info("news time extraction failed", ex);
  }
  try {
    news.setTitle(getTitle(contentElement));
  } catch (Exception ex) {
    LOG.info("title extraction failed", ex);
  }
  return news;
}

Timber.d("URI : %s", doc.baseUri());
if (doc.baseUri().contains(HENTAICAFE.getUrl() + "/78-2/") ||           // ignore tags page
    doc.baseUri().contains(HENTAICAFE.getUrl() + "/artists/")) {    // ignore artist page

result = new Content();
String url = doc.baseUri();
// Take the first five characters of the base URI; for an https URL this is "https".
// NOTE(review): substring(0,5) throws if the base URI is shorter than five characters —
// confirm the base URI is never empty here.
String protocol = url.substring(0,5);
// The five-character cut drops the colon of an https URL, so add it back.
if ("https".equals(protocol)) protocol = "https:";

/**
 * Fills a new {@link Content} from the given document: URL, cover image URL, title,
 * page count, an empty attribute map and the {@code Site.PANDA} site.
 *
 * @param doc parsed page to read the content details from
 * @return the populated content
 */
@Override
protected Content parseContent(Document doc) {
  Content content = new Content();

  // Keep the part of the base URI from the first '/' found at or after index 9.
  // NOTE(review): presumably this strips scheme and host; indexOf returns -1 when no such '/'
  // exists, which makes substring throw — confirm the base URI always has a path.
  String baseUri = doc.baseUri();
  int pathStart = baseUri.indexOf('/', 9);
  content.setUrl(baseUri.substring(pathStart));

  content.setCoverImageUrl(
      doc.select("div#imgholder").select("a").select("img").attr("src"));

  content.setTitle(
      doc.select("div#mangainfo").select("div").select("h1").text());

  // The page count is the number after the last '/' in the value of the last page option.
  // NOTE(review): last() is dereferenced unchecked and parseInt is unguarded — confirm the
  // page selector is always present and numeric.
  String lastOptionUrl = doc.select("div#selectpage").select("select").select("option")
      .last()
      .attr("value");
  String lastPageNumber = lastOptionUrl.substring(lastOptionUrl.lastIndexOf('/') + 1);
  content.setQtyPages(Integer.parseInt(lastPageNumber));

  content.setAttributes(new AttributeMap());
  content.setSite(Site.PANDA);
  return content;
}

/**
 * Creates a new shell document with the input's base URI and copies the safe nodes of the
 * input's body into the shell's body.
 *
 * @param document document to filter
 * @return the new document; its body is left empty when the input has no body
 */
@Override
public Document runFilter(Document document) {
  final Document filtered = Document.createShell(document.baseUri());
  // Frameset documents won't have a body; the filtered document then keeps its empty body.
  if (document.body() == null) {
    return filtered;
  }
  copySafeNodes(document.body(), filtered.body());
  return filtered;
}

// Build a TextNode from the element's text, passing the document's base URI along with it.
return new TextNode(element.getText(), document.baseUri());

parseAttributes(attributes, AttributeType.CHARACTER, characterElements, true);
if (doc.baseUri().contains("comics")) {
  result.setSite(Site.ASMHENTAI_COMICS);
} else {

static Document getBootstrapPage(BootstrapContext context) {
  Document document = new Document("");
  DocumentType doctype = new DocumentType("html", "", "",
      document.baseUri());
  document.appendChild(doctype);
  Element html = document.appendElement("html");

Popular in Java

Creating JSON documents from java classes using gson
getResourceAsStream (ClassLoader)
getSharedPreferences (Context)
putExtra (Intent)
ServerSocket (java.net)
This class represents a server-side socket that waits for incoming client connections. A ServerSocke
Time (java.sql)
Java representation of an SQL TIME value. Provides utilities to format and parse the time's represen
StringTokenizer (java.util)
Breaks a string into tokens; new code should probably use String#split. // Legacy code: StringTo
Handler (java.util.logging)
A Handler object accepts a logging request and exports the desired messages to a target, for example
GridLayout (java.awt)
The GridLayout class is a layout manager that lays out a container's components in a rectangular gri
BoxLayout (javax.swing)
Best IntelliJ plugins

How to use the baseUri method in org.jsoup.nodes.Document

Best Java code snippets using org.jsoup.nodes.Document.baseUri (Showing top 20 results out of 315)

How to use
baseUri
method
in
org.jsoup.nodes.Document