/**
 * Renders this attribute as HTML text, for example {@code href="index.html"}.
 *
 * @return the HTML form of this attribute
 * @throws SerializationException if the underlying writer reports an {@link IOException}
 */
public String html() {
    final StringBuilder sb = new StringBuilder();
    // A throwaway empty document is used only as a source of output settings.
    final Document.OutputSettings settings = new Document("").outputSettings();
    try {
        html(sb, settings);
    } catch (IOException ioe) {
        throw new SerializationException(ioe);
    }
    return sb.toString();
}
/**
 * Takes the next element from the iterator and makes sure it is a {@link Document},
 * because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * A non-document element is cloned into a newly created document, and that document
 * replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator positioned before the element to check; it is advanced by one
 * @return the element itself when it already is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() returns null for an element that is not attached to a document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/** Get the HTML representation of these attributes. @return HTML @throws SerializationException if the HTML representation of the attributes cannot be constructed. */ public String html() { StringBuilder accum = new StringBuilder(); try { html(accum, (new Document("")).outputSettings()); // output settings a bit funky, but this html() seldom used } catch (IOException e) { // ought never happen throw new SerializationException(e); } return accum.toString(); }
/**
 * Prepares this builder for a new parse run: validates the required arguments and
 * (re)creates the document, reader, tokeniser and element stack.
 *
 * @param input    source of the characters to parse; must not be null
 * @param baseUri  base URI given to the new document; must not be null
 * @param errors   error list handed to the tokeniser and stored on this builder
 * @param settings parse settings stored on this builder
 */
protected void initialiseParse(Reader input, String baseUri, ParseErrorList errors, ParseSettings settings) {
    Validate.notNull(input, "String input must not be null");
    Validate.notNull(baseUri, "BaseURI must not be null");

    // Fresh document plus the caller-supplied settings.
    this.settings = settings;
    this.doc = new Document(baseUri);

    // Input side: character reader feeding the tokeniser.
    this.reader = new CharacterReader(input);
    this.errors = errors;
    this.baseUri = baseUri;

    // Reset per-parse working state.
    this.currentToken = null;
    this.tokeniser = new Tokeniser(this.reader, errors);
    this.stack = new ArrayList<>(32);
}
/**
 * Resolves the output settings to use for this node.
 *
 * @return the owner document's output settings, or those of a new empty document
 *         when this node has no owner document
 */
Document.OutputSettings getOutputSettings() {
    final Document ownerDoc = ownerDocument();
    if (ownerDoc == null) {
        // No owning document: borrow the settings of a fresh, empty one.
        return new Document("").outputSettings();
    }
    return ownerDoc.outputSettings();
}
/**
 * Builds an empty document skeleton that further elements can be added to.
 *
 * @param baseUri base URI of the new document; must not be null
 * @return a document containing an {@code html} element with {@code head} and {@code body} children
 */
public static Document createShell(String baseUri) {
    Validate.notNull(baseUri);

    final Document shell = new Document(baseUri);
    final Element root = shell.appendElement("html");
    root.appendElement("head");
    root.appendElement("body");

    return shell;
}
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException { if (input == null) // empty body return new Document(baseUri); input = ConstrainableInputStream.wrap(input, bufferSize, 0);
/**
 * Creates a design context by delegating to the single-argument constructor
 * with a new {@code Document} built from an empty string.
 */
public DesignContext() { this(new Document("")); }
Document doc = new Document(""); DocumentType docType = new DocumentType("html", "", "", ""); doc.appendChild(docType);
public static Document get(String url) { int trys = 3; try { return get(url, trys); } catch (Exception e) { } // 4次请求之后无法解析返回空文档 return new Document(""); }
public static Document proxyGet(String url, String ip, int port) { int trys = 3; try { return proxyGet(url, trys, ip, port); } catch (Exception e) { e.printStackTrace(); } // 4次请求之后无法解析返回空文档 return new Document(""); }
/**
 * Returns the outer HTML of this element.
 * <p>
 * The element and all of its children are walked recursively to produce the
 * result, so avoid calling this more often than needed.
 *
 * @return the outer HTML for the element
 */
public String getOuterHTML() {
    // Convert into a throwaway empty document, then serialise the converted node.
    final Document scratch = new Document("");
    final String outer = ElementUtil.toJsoup(scratch, this).outerHtml();
    return outer;
}
/**
 * Parses markup held in a string into a new document.
 * <p>
 * The parser reports its SAX events to a {@code Handler} constructed around the
 * document that is returned.
 *
 * @param data    the markup text to parse
 * @param baseUri base URI given to the new document
 * @return the document that was passed to the content handler
 * @throws SAXException if the parser reports a SAX error
 * @throws IOException  if reading the input fails
 */
public Document parse(String data, String baseUri) throws SAXException, IOException {
    final InputSource input = new InputSource(new StringReader(data));

    final SAXParser parser = new SAXParser();
    final Document result = new Document(baseUri);
    parser.setContentHandler(new Handler(result));
    parser.setErrorHandler(new LocalErrorHandler());

    parser.parse(input);
    return result;
}
/**
 * Takes the next element from the iterator and makes sure it is a {@link Document},
 * because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * A non-document element is cloned into a newly created document, and that document
 * replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator positioned before the element to check; it is advanced by one
 * @return the element itself when it already is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() returns null for an element that is not attached to a document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/**
 * Takes the next element from the iterator and makes sure it is a {@link Document},
 * because only a document can be selected on.
 * See: https://github.com/code4craft/webmagic/issues/113
 * <p>
 * A non-document element is cloned into a newly created document, and that document
 * replaces the element in the underlying list via {@link ListIterator#set(Object)}.
 *
 * @param elementIterator iterator positioned before the element to check; it is advanced by one
 * @return the element itself when it already is a document, otherwise the new wrapping document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() returns null for an element that is not attached to a document;
    // fall back to the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    elementIterator.set(root);
    return root;
}
/**
 * Parses markup read from a byte stream into a new document.
 * <p>
 * The parser reports its SAX events to a {@code Handler} constructed around the
 * document that is returned.
 *
 * @param data    the byte stream to parse
 * @param baseUri base URI given to the new document
 * @return the document that was passed to the content handler
 * @throws SAXException if the parser reports a SAX error
 * @throws IOException  if reading the input fails
 */
public Document parse(InputStream data, String baseUri) throws SAXException, IOException {
    final InputSource input = new InputSource(data);

    final SAXParser parser = new SAXParser();
    final Document result = new Document(baseUri);
    parser.setContentHandler(new Handler(result));
    parser.setErrorHandler(new LocalErrorHandler());

    parser.parse(input);
    return result;
}
public Component(Element elem, AttributesRequire attrs) throws Exception { Document doc = new Document(""); doc.appendElement("body"); doc.body().appendChild(elem); renderedElement = renderTemplate(doc, attrs); }
/**
 * Renders this object to an HTML string.
 * <p>
 * The element from {@code toElement()} is appended to a new empty document,
 * {@code RenderUtil.applyMessages} and {@code RenderUtil.applyClearAction} are
 * applied to that document, and its HTML is returned.
 *
 * @return the HTML of the processed document
 */
public String toHtml() {
    final Document page = new Document("");
    page.appendChild(toElement());

    // Post-processing steps run on the whole document, in this order.
    RenderUtil.applyMessages(page);
    RenderUtil.applyClearAction(page, true);

    final String markup = page.html();
    return markup;
}
static Document getBootstrapPage(BootstrapContext context) { Document document = new Document(""); DocumentType doctype = new DocumentType("html", "", "", document.baseUri());
static Document postprocess(Element topNode) { Log.i("postprocess"); Document doc = new Document(""); if (topNode == null) { return doc; } removeNodesWithNegativeScores(topNode); replaceLineBreaksWithSpaces(topNode); removeUnlikelyChildNodes(topNode); removeTagsButRetainContent(topNode); removeTagsNotLikelyToBeParagraphs(topNode); removeTopLevelTagsNotLikelyToBeParagraphs(topNode); removeShortParagraphs(topNode); removeDisallowedAttributes(topNode); for (Node node : topNode.childNodes()) { doc.appendChild(node.clone()); // TODO: Don’t copy each item separately. } return doc; }