/**
 * Builds a new {@link Element} for the given tag name, using this document's base URI.
 * The returned element is not made a child of this document.
 *
 * @param tagName element tag name (e.g. {@code a})
 * @return the newly created element
 */
public Element createElement(String tagName) {
    // The tag is resolved with case-preserving parse settings.
    final Tag tag = Tag.valueOf(tagName, ParseSettings.preserveCase);
    final String base = this.baseUri();
    return new Element(tag, base);
}
/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document
 * can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * An element that is not a document is cloned into a newly created document, and that
 * document replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator whose next element is checked; it is advanced by one position
 * @return the element itself if it already is a document, otherwise the new document wrapping a
 *         clone of the element
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() yields null for an element that is not attached to any document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/** Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist. The original document is not modified. Only elements from the dirt document's <code>body</code> are used. @param dirtyDocument Untrusted base document to clean. @return cleaned document. */ public Document clean(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body. copySafeNodes(dirtyDocument.body(), clean.body()); return clean; }
/** Determines if the input document <b>body</b>is valid, against the whitelist. It is considered valid if all the tags and attributes in the input HTML are allowed by the whitelist, and that there is no content in the <code>head</code>. <p> This method can be used as a validator for user input. An invalid document will still be cleaned successfully using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document to ensure enforced attributes are set correctly, and that the output is tidied. </p> @param dirtyDocument document to test @return true if no tags or attributes need to be removed; false if they do */ public boolean isValid(Document dirtyDocument) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); return numDiscarded == 0 && dirtyDocument.head().childNodes().size() == 0; // because we only look at the body, but we start from a shell, make sure there's nothing in the head }
document.baseUri()); document.child(0).before(doctype);
/**
 * Returns the document URI, taken from the base URI of {@code document}.
 *
 * @return the value of {@code document.baseUri()}
 */
@Override
public String getDocumentURI() {
    final String uri = document.baseUri();
    return uri;
}
/**
 * Returns the document URI, taken from the base URI of {@code document}.
 *
 * @return the value of {@code document.baseUri()}
 */
@Override
public String getDocumentURI() {
    final String uri = document.baseUri();
    return uri;
}
// Fetch the document through con.get(), read its base URI, and store that URI in returnObj
// under the RETURN_FINAL_URL key.
// NOTE(review): presumably con is a jsoup Connection, so baseUri() would be the URL the response
// was finally served from (after redirects) — confirm against the caller.
Document doc = con.get(); String uri = doc.baseUri(); returnObj.put(RETURN_FINAL_URL, uri);
/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document
 * can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * An element that is not a document is cloned into a newly created document, and that
 * document replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator whose next element is checked; it is advanced by one position
 * @return the element itself if it already is a document, otherwise the new document wrapping a
 *         clone of the element
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() yields null for an element that is not attached to any document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/**
 * Ensures the next element of the iterator is a {@link Document}, because only a document
 * can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * An element that is not a document is cloned into a newly created document, and that
 * document replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator whose next element is checked; it is advanced by one position
 * @return the element itself if it already is a document, otherwise the new document wrapping a
 *         clone of the element
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() yields null for an element that is not attached to any document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/**
 * Adds the reserved items parsed from {@code resDoc} to {@code items} and, if the page links to a
 * next page, fetches that page and processes it the same way (recursively).
 *
 * @param resDoc page to parse reserved items from
 * @param items  list the parsed items are appended to
 * @throws IOException declared for the page fetch performed through {@code httpGet}
 */
private void loadResList(Document resDoc, List<ReservedItem> items) throws IOException {
    items.addAll(parseResList(resDoc));

    final String nextPageUrl = findNextPageUrl(resDoc);
    if (nextPageUrl == null) {
        // No further page: nothing more to load.
        return;
    }

    final String html = httpGet(nextPageUrl, getDefaultEncoding());
    final Document nextPage = Jsoup.parse(html);
    // The freshly parsed page inherits the base URI of the page that linked to it.
    nextPage.setBaseUri(resDoc.baseUri());
    loadResList(nextPage, items);
}
/**
 * Adds the lent items parsed from {@code lentDoc} to {@code items} and, if the page links to a
 * next page, fetches that page and processes it the same way (recursively).
 *
 * @param lentDoc page to parse lent items from
 * @param items   list the parsed items are appended to
 * @throws IOException declared for the page fetch performed through {@code httpGet}
 */
private void loadMediaList(Document lentDoc, List<LentItem> items) throws IOException {
    items.addAll(parseMediaList(lentDoc));

    final String nextPageUrl = findNextPageUrl(lentDoc);
    if (nextPageUrl == null) {
        // No further page: nothing more to load.
        return;
    }

    final String html = httpGet(nextPageUrl, getDefaultEncoding());
    final Document nextPage = Jsoup.parse(html);
    // The freshly parsed page inherits the base URI of the page that linked to it.
    nextPage.setBaseUri(lentDoc.baseUri());
    loadMediaList(nextPage, items);
}
/**
 * Extracts a {@link News} object from {@code doc}.
 * <p>
 * Content extraction is mandatory: if it fails, the failure is logged and rethrown wrapped in an
 * {@link Exception}. The URL is set from the document's base URI when that is non-null. Time and
 * title extraction are best-effort: a failure is logged and the corresponding field is left unset.
 *
 * @return the news populated with content element, URL, time and title as far as available
 * @throws Exception if the content element cannot be extracted
 */
public News getNews() throws Exception {
    News news = new News();
    Element contentElement;
    try {
        contentElement = getContentElement();
        news.setContentElement(contentElement);
    } catch (Exception ex) {
        LOG.info("news content extraction failed,extraction abort", ex);
        throw new Exception(ex);
    }

    if (doc.baseUri() != null) {
        news.setUrl(doc.baseUri());
    }

    try {
        news.setTime(getTime(contentElement));
    } catch (Exception ex) {
        // This branch guards time extraction; the message previously said "title" by mistake.
        LOG.info("news time extraction failed", ex);
    }

    try {
        news.setTitle(getTitle(contentElement));
    } catch (Exception ex) {
        LOG.info("title extraction failed", ex);
    }
    return news;
}
// Start a fresh Content and take the first five characters of the document's base URI as the protocol.
// A prefix of exactly "https" is completed to "https:"; any other prefix (e.g. "http:") is left as is.
// NOTE(review): substring(0,5) throws StringIndexOutOfBoundsException when the base URI has fewer
// than five characters (e.g. an empty base URI) — confirm the document always carries an absolute URL.
result = new Content(); String url = doc.baseUri(); String protocol = url.substring(0,5); if ("https".equals(protocol)) protocol = "https:";
/**
 * Builds a {@link Content} from the given page: URL, cover image URL, title and page count, plus
 * an empty attribute map and the {@code Site.PANDA} site marker.
 *
 * @param doc parsed page to read from
 * @return the populated content
 */
@Override
protected Content parseContent(Document doc) {
    Content result = new Content();

    // Keep the part of the base URI starting at the first '/' found at index 9 or later;
    // starting the search there skips the slashes of a leading "http://" or "https://".
    String pageUri = doc.baseUri();
    int pathStart = pageUri.indexOf('/', 9);
    result.setUrl(pageUri.substring(pathStart));

    String coverUrl = doc.select("div#imgholder").select("a").select("img").attr("src");
    result.setCoverImageUrl(coverUrl);

    String title = doc.select("div#mangainfo").select("div").select("h1").text();
    result.setTitle(title);

    // The page count is the number after the last '/' in the value of the last page option.
    String lastOptionUrl = doc.select("div#selectpage").select("select").select("option").last().attr("value");
    int lastSlash = lastOptionUrl.lastIndexOf('/');
    String pageCountText = lastOptionUrl.substring(lastSlash + 1);
    result.setQtyPages(Integer.parseInt(pageCountText));

    result.setAttributes(new AttributeMap());
    result.setSite(Site.PANDA);

    return result;
}
@Override public Document runFilter(Document document) { final Document clean = Document.createShell(document.baseUri()); if (document.body() != null) // frameset documents won't have a body. the clean doc will have empty body. copySafeNodes(document.body(), clean.body()); return clean; }
// Wrap the element's text in a new TextNode, created together with the document's base URI.
return new TextNode(element.getText(), document.baseUri());
parseAttributes(attributes, AttributeType.CHARACTER, characterElements, true); if (doc.baseUri().contains("comics")) { result.setSite(Site.ASMHENTAI_COMICS); } else {
static Document getBootstrapPage(BootstrapContext context) { Document document = new Document(""); DocumentType doctype = new DocumentType("html", "", "", document.baseUri()); document.appendChild(doctype); Element html = document.appendElement("html");