/**
 * Parses the HTML resource identified by the given URI string into a DOM document.
 *
 * @param uriString the URI of the HTML document, handed directly to the builder's parse method
 * @return the parsed DOM document
 * @throws OkapiException if anything goes wrong while parsing; the original failure is kept as the cause
 */
private Document parseHTMLDocument (String uriString) {
    HtmlDocumentBuilder htmlParser = new HtmlDocumentBuilder();
    try {
        Document parsed = htmlParser.parse(uriString);
        return parsed;
    }
    catch ( Throwable failure ) {
        // Every failure, including errors, is converted into the project's exception type.
        String description = "Error parsing an HTML document.\n" + failure.getMessage();
        throw new OkapiException(description, failure);
    }
}
/**
 * Creates a document builder backed by the JAXP DOM implementation and
 * configured with the supplied XML violation policy.
 *
 * @param xmlPolicy the XML violation policy forwarded to the two-argument constructor
 */
public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
    this(jaxpDOMImplementation(), xmlPolicy);
}
/**
 * Convenience setter that applies one violation policy to the name, xmlns,
 * content space, content non-XML char and comment policies in a single call.
 * No other setting (such as the streamability policy or doctype reporting) is
 * touched by this method.
 *
 * <p>The xmlns policy is the one exception to "same policy everywhere": when
 * {@code FATAL} is requested, {@code ALTER_INFOSET} is applied to it instead.
 *
 * @param xmlPolicy the policy to apply
 */
public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
    final XmlViolationPolicy xmlnsChoice;
    if (xmlPolicy == XmlViolationPolicy.FATAL) {
        xmlnsChoice = XmlViolationPolicy.ALTER_INFOSET;
    } else {
        xmlnsChoice = xmlPolicy;
    }

    setNamePolicy(xmlPolicy);
    setXmlnsPolicy(xmlnsChoice);
    setContentSpacePolicy(xmlPolicy);
    setContentNonXmlCharPolicy(xmlPolicy);
    setCommentPolicy(xmlPolicy);
}
/**
 * Parses a string of HTML and converts the resulting DOM tree into an {@code XdmNode}.
 *
 * <p>The HTML parser is created with the {@code ALTER_INFOSET} violation policy and
 * uses the runtime's entity resolver. The DOM it produces is wrapped in a
 * {@link DOMSource} and built into a node by a document builder obtained from the
 * runtime's processor.
 *
 * @param text the HTML markup to parse
 * @return the node built from the parsed HTML
 * @throws XProcException wrapping any exception raised while parsing or building
 */
private XdmNode parseHTML(String text) {
    HtmlDocumentBuilder parser = new HtmlDocumentBuilder(XmlViolationPolicy.ALTER_INFOSET);
    parser.setEntityResolver(runtime.getResolver());

    try {
        Document dom = parser.parse(new InputSource(new StringReader(text)));
        DocumentBuilder xdmBuilder = runtime.getProcessor().newDocumentBuilder();
        return xdmBuilder.build(new DOMSource(dom));
    } catch (Exception failure) {
        throw new XProcException(failure);
    }
}
}
/**
 * {@inheritDoc}
 *
 * <p>Parses {@code src} as HTML (doctype errors are not reported), evaluates the
 * XPath expression held in {@code expr} against the parsed document, and writes
 * every matching node to {@code out} as XHTML.
 */
@Override
public void render(DriverRequest httpRequest, String src, Writer out) throws IOException {
    try {
        HtmlDocumentBuilder parser = new HtmlDocumentBuilder();
        parser.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
        InputSource input = new InputSource(new StringReader(src));
        Document parsed = parser.parse(input);

        NodeList selected = (NodeList) expr.evaluate(parsed, XPathConstants.NODESET);

        // The serializer is handed to Dom2Sax twice, once for each constructor argument.
        XhtmlSerializer xhtmlOut = new XhtmlSerializer(out);
        Dom2Sax emitter = new Dom2Sax(xhtmlOut, xhtmlOut);

        int index = 0;
        while (index < selected.getLength()) {
            emitter.parse(selected.item(index));
            index++;
        }
    } catch (XPathExpressionException xpathFailure) {
        throw new ProcessingFailedException("Failed to evaluate XPath expression", xpathFailure);
    } catch (SAXException saxFailure) {
        throw new ProcessingFailedException("Unable to parse source", saxFailure);
    }
}
}
/** In this subclass, returns a complete Document, even if the source is just a fragment. */ @Override public HTMLDocument parse(InputSource source) throws SAXException, IOException{ //This is the only parse() method that really does something, the others just //adapt parameters. HTMLDocument d=(HTMLDocument)htb.parse(source); return clean(d); }
/**
 * Supplies a freshly constructed {@link HtmlDocumentBuilder} as the document builder.
 *
 * @return a new {@code HtmlDocumentBuilder}
 * @throws ParserConfigurationException declared by the overridden signature
 */
@Override
public DocumentBuilder newDocumentBuilder() throws ParserConfigurationException {
    final DocumentBuilder htmlBuilder = new HtmlDocumentBuilder();
    return htmlBuilder;
}
/**
 * Creates and configures the tokenizer driver the first time it is needed.
 *
 * <p>This class wraps different tree builders depending on configuration; this
 * method hides that from the user of the class. If a driver already exists the
 * method does nothing. Otherwise it builds a driver around a new tokenizer for
 * the current tree builder and copies the handlers, policies and flags held by
 * this object onto the driver and the tree builder.
 */
private void lazyInit() {
    if (driver != null) {
        return;
    }

    this.driver = new Driver(newTokenizer(treeBuilder, false));

    // Settings are applied in the same sequence as before; the order is kept on purpose.
    this.driver.setErrorHandler(errorHandler);
    this.driver.setTransitionHandler(transitionHandler);
    this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
    this.driver.setCheckingNormalization(checkingNormalization);
    this.driver.setCommentPolicy(commentPolicy);
    this.driver.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
    this.driver.setContentSpacePolicy(contentSpacePolicy);
    this.driver.setMappingLangToXmlLang(mappingLangToXmlLang);
    this.driver.setXmlnsPolicy(xmlnsPolicy);
    this.driver.setHeuristics(heuristics);

    for (CharacterHandler registered : characterHandlers) {
        this.driver.addCharacterHandler(registered);
    }

    this.treeBuilder.setDocumentModeHandler(documentModeHandler);
    this.treeBuilder.setScriptingEnabled(scriptingEnabled);
    this.treeBuilder.setReportingDoctype(reportingDoctype);
    this.treeBuilder.setNamePolicy(namePolicy);
}
// Initialize the driver on first use, then hand the input to it for tokenizing.
if (driver == null) {
    lazyInit();
}
driver.tokenize(is);
/**
 * Parses a string of HTML and converts the resulting DOM tree into an {@code XdmNode}.
 *
 * <p>The HTML parser is created with the {@code ALTER_INFOSET} violation policy and
 * uses the runtime's entity resolver. The DOM it produces is wrapped in a
 * {@link DOMSource} and built into a node by a document builder obtained from the
 * runtime's processor.
 *
 * @param text the HTML markup to parse
 * @return the node built from the parsed HTML
 * @throws XProcException wrapping any exception raised while parsing or building
 */
private XdmNode parseHTML(String text) {
    HtmlDocumentBuilder parser = new HtmlDocumentBuilder(XmlViolationPolicy.ALTER_INFOSET);
    parser.setEntityResolver(runtime.getResolver());

    try {
        Document dom = parser.parse(new InputSource(new StringReader(text)));
        DocumentBuilder xdmBuilder = runtime.getProcessor().newDocumentBuilder();
        return xdmBuilder.build(new DOMSource(dom));
    } catch (Exception failure) {
        throw new XProcException(failure);
    }
}
}
/**
 * {@inheritDoc}
 *
 * <p>Parses {@code src} as HTML (doctype errors are not reported), runs the
 * parsed document through {@code transformer}, and writes the transformation
 * result to {@code out} as XHTML.
 *
 * <p>Parsing and serialization can both raise {@link SAXException}; they are
 * handled separately so the reported message names the step that actually failed.
 *
 * @throws ProcessingFailedException if the source cannot be parsed, the
 *         transformation fails, or the result cannot be serialized
 */
@Override
public void render(DriverRequest httpRequest, String src, Writer out) throws IOException {
    Document document;
    try {
        HtmlDocumentBuilder htmlDocumentBuilder = new HtmlDocumentBuilder();
        htmlDocumentBuilder.setDoctypeExpectation(DoctypeExpectation.NO_DOCTYPE_ERRORS);
        document = htmlDocumentBuilder.parse(new InputSource(new StringReader(src)));
    } catch (SAXException e) {
        // A SAX failure here comes from reading the input, not from writing the output.
        throw new ProcessingFailedException("Unable to parse source", e);
    }

    try {
        Source source = new DOMSource(document);
        DOMResult result = new DOMResult();
        transformer.transform(source, result);

        // The serializer is handed to Dom2Sax twice, once for each constructor argument.
        XhtmlSerializer serializer = new XhtmlSerializer(out);
        Dom2Sax dom2Sax = new Dom2Sax(serializer, serializer);
        dom2Sax.parse(result.getNode());
    } catch (TransformerException e) {
        throw new ProcessingFailedException("Failed to transform source", e);
    } catch (SAXException e) {
        throw new ProcessingFailedException("Failed to serialize transformation result", e);
    }
}
}
/**
 * Parses an HTML file and post-processes the resulting document.
 *
 * <p>The file is parsed with the shared {@code htmlb} builder, the result is
 * passed through {@code removeCopyright}, and that output is passed through
 * {@code rewriteIds} together with {@code htmlIdPattern}.
 *
 * @param f the HTML file to parse
 * @return the document returned by {@code rewriteIds}
 * @throws SAXException if the builder reports a parse error
 * @throws IOException if the file cannot be read
 */
private Document parseHtml(final File f) throws SAXException, IOException {
    final Document parsed = htmlb.parse(f);
    final Document withoutCopyright = removeCopyright(parsed);
    return rewriteIds(withoutCopyright, htmlIdPattern);
}
/**
 * Creates the builder and its wrapped {@code HtmlDocumentBuilder}, which is
 * constructed with a new {@code HTMLDOMImplementation}.
 */
public HTMLBuilder() {
    htb = new HtmlDocumentBuilder(new HTMLDOMImplementation());
}
/**
 * Creates and configures the tokenizer driver the first time it is needed.
 *
 * <p>This class wraps different tree builders depending on configuration; this
 * method hides that from the user of the class. If a driver already exists the
 * method does nothing. Otherwise it builds a driver around a new tokenizer for
 * the current tree builder and copies the handlers, policies and flags held by
 * this object onto the driver and the tree builder.
 */
private void lazyInit() {
    if (driver != null) {
        return;
    }

    this.driver = new Driver(newTokenizer(treeBuilder, false));

    // Settings are applied in the same sequence as before; the order is kept on purpose.
    this.driver.setErrorHandler(errorHandler);
    this.driver.setTransitionHandler(transitionHandler);
    this.treeBuilder.setErrorHandler(treeBuilderErrorHandler);
    this.driver.setCheckingNormalization(checkingNormalization);
    this.driver.setCommentPolicy(commentPolicy);
    this.driver.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
    this.driver.setContentSpacePolicy(contentSpacePolicy);
    this.driver.setMappingLangToXmlLang(mappingLangToXmlLang);
    this.driver.setXmlnsPolicy(xmlnsPolicy);
    this.driver.setHeuristics(heuristics);

    for (CharacterHandler registered : characterHandlers) {
        this.driver.addCharacterHandler(registered);
    }

    this.treeBuilder.setDocumentModeHandler(documentModeHandler);
    this.treeBuilder.setScriptingEnabled(scriptingEnabled);
    this.treeBuilder.setReportingDoctype(reportingDoctype);
    this.treeBuilder.setNamePolicy(namePolicy);
}
// Initialize the driver on first use, then hand the input to it for tokenizing.
if (driver == null) {
    lazyInit();
}
driver.tokenize(is);
// Parse `is` into a DOM document using a newly created HtmlDocumentBuilder.
// The try block opened here continues beyond this excerpt.
Document dom; try { dom = new HtmlDocumentBuilder().parse(is);
/**
 * Convenience setter that applies one violation policy to the name, xmlns,
 * content space, content non-XML char and comment policies in a single call.
 * No other setting (such as the streamability policy or doctype reporting) is
 * touched by this method.
 *
 * <p>The xmlns policy is the one exception to "same policy everywhere": when
 * {@code FATAL} is requested, {@code ALTER_INFOSET} is applied to it instead.
 *
 * @param xmlPolicy the policy to apply
 */
public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
    final XmlViolationPolicy xmlnsChoice;
    if (xmlPolicy == XmlViolationPolicy.FATAL) {
        xmlnsChoice = XmlViolationPolicy.ALTER_INFOSET;
    } else {
        xmlnsChoice = xmlPolicy;
    }

    setNamePolicy(xmlPolicy);
    setXmlnsPolicy(xmlnsChoice);
    setContentSpacePolicy(xmlPolicy);
    setContentNonXmlCharPolicy(xmlPolicy);
    setCommentPolicy(xmlPolicy);
}
// Parse the source with the wrapped builder; the result is cast to HTMLDocument.
HTMLDocument d = (HTMLDocument) htb.parse(source);
/**
 * One-time fixture setup: creates a namespace-aware XML document builder and
 * an HTML document builder, and reads the log level from the system property
 * named by {@code LOG_LEVEL} (falling back to {@code -2} when it is not set).
 *
 * @throws Exception if the XML document builder cannot be created or the
 *         property value is not a valid integer
 */
@BeforeClass
public static void setUpBeforeClass() throws Exception {
    final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    db = factory.newDocumentBuilder();
    htmlb = new HtmlDocumentBuilder();

    final String configured = System.getProperty(LOG_LEVEL);
    if (configured != null) {
        level = Integer.parseInt(configured);
    } else {
        level = -2;
    }
}
/**
 * Creates a document builder backed by the JAXP DOM implementation and
 * configured with the supplied XML violation policy.
 *
 * @param xmlPolicy the XML violation policy forwarded to the two-argument constructor
 */
public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
    this(jaxpDOMImplementation(), xmlPolicy);
}