Refine search
@Override public String select(String text) { try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try {
cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile))); } else { cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAdvancedXmlEscape(this.advancedxmlescape); props.setUseCdataForScriptAndStyle(this.usecdata); props.setTranslateSpecialEntities(this.specialentities); props.setRecognizeUnicodeChars(this.unicodechars); props.setOmitUnknownTags(this.omitunknowntags); props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent); props.setOmitDeprecatedTags(this.omitdeprtags); props.setTreatDeprecatedTagsAsContent(this.treatdeprtagsascontent); props.setOmitComments(this.omitcomments); props.setOmitXmlDeclaration(this.omitxmldecl); props.setOmitDoctypeDeclaration(this.omitdoctypedecl); props.setOmitHtmlEnvelope(this.omithtmlenvelope); props.setUseEmptyElementTags(this.useemptyelementtags); props.setAllowMultiWordAttributes(this.allowmultiwordattributes); props.setAllowHtmlInsideAttributes(this.allowhtmlinsideattributes); props.setIgnoreQuestAndExclam(this.ignoreqe); props.setNamespacesAware(this.namespacesaware); props.setHyphenReplacementInComment(this.hyphenreplacement); props.setPruneTags(this.prunetags); props.setBooleanAttributeValues(this.booleanatts);
HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); try { URL url = new URL(playUrl); URLConnection conn = url.openConnection(); TagNode node = cleaner.clean(new InputStreamReader(conn.getInputStream())); Object[] new_nodes = node.evaluateXPath("//*[@class='recent-change']"); Object[] version_nodes = node.evaluateXPath("//*[@itemprop='softwareVersion']"); whatsNew += info_node.getAllChildren().get(0).toString().trim() + "\n";
// Configure the cleaner used to parse the fetched page.
final HtmlCleaner mCleaner = new HtmlCleaner();
CleanerProperties props = mCleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);

/* URL from which the data is fetched, and the XPath selecting the data. */
String mSiteUrl = "http://www.example.com";
String mXPath = "//div";

// Establish the connection.
URL url = new URL(mSiteUrl);
final URLConnection mCCon = url.openConnection();

// TagNode storing the parsed data received from the URL. The reader (and the
// underlying connection stream) is closed as soon as parsing has finished.
final TagNode mGetDataFromUrl;
try (InputStreamReader reader = new InputStreamReader(mCCon.getInputStream())) {
    mGetDataFromUrl = mCleaner.clean(reader);
}

// Evaluate the XPath that selects the data to retrieve.
Object[] mPageData = mGetDataFromUrl.evaluateXPath(mXPath);

// Only read the first match when the XPath matched at least one node.
if (mPageData.length > 0) {
    TagNode mXPathParsedData = (TagNode) mPageData[0];
    // All text of the first matching div ends up in mData. toString() is used
    // before trim() so this works whatever text type getText() returns.
    String mData = mXPathParsedData.getText().toString().trim();
}
// Create a cleaner and adjust its properties before parsing contentStr.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setUseCdataForScriptAndStyle(false);
props.setOmitComments(true);
props.setOmitUnknownTags(true);
props.setOmitDoctypeDeclaration(true);
props.setOmitXmlDeclaration(true);
props.setRecognizeUnicodeChars(false);
props.setAdvancedXmlEscape(true);
props.setTranslateSpecialEntities(false);
props.setNamespacesAware(false);
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
// Parse the content into a TagNode tree.
TagNode nodes = cleaner.clean(contentStr);
// Project helper applied to the parsed tree; its implementation is not visible here.
ExtractUtils.cleanInvalidAttributes(nodes);
// Declared here; assigned by code outside this excerpt.
Document doc;
/**
 * Validates that the given HTML is not empty once it has been cleaned.
 *
 * @param html the HTML markup to check
 * @return a set holding {@link ValidatorMessages#HTML_IS_EMPTY} when {@code isEmpty}
 *         reports the cleaned markup as empty; an empty set otherwise
 */
public Set<String> validateNonEmpty(String html) {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();

    // Strip everything that is not actual content before checking for emptiness.
    final CleanerProperties cleanerProps = htmlCleaner.getProperties();
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setOmitHtmlEnvelope(true);
    cleanerProps.setOmitComments(true);
    cleanerProps.setNamespacesAware(false);
    cleanerProps.setDeserializeEntities(true);

    final Set<String> messages = new HashSet<>();
    final boolean empty = isEmpty(htmlCleaner.clean(html));
    if (empty) {
        messages.add(ValidatorMessages.HTML_IS_EMPTY);
    }
    return messages;
}
/** * htmlcleaner로 html string을 xml string으로 바꿔주는 메소드. * @param source * @return */ private String toXML(String source){ try { CleanerProperties props = new CleanerProperties(); props.setTranslateSpecialEntities(true); props.setOmitComments(true); props.setPruneTags("script,style"); // namespace를 무시한다. props.setNamespacesAware(false); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(true); HtmlCleaner cl = new HtmlCleaner(props); TagNode tagNode = cl.clean(source); source = new PrettyXmlSerializer(props).getXmlAsString(tagNode); } catch (IOException e) { logger.error("",e); } return source; } //test용
/** * Cleans the relevant file and generates a valid XML file ready for processing to Sel 2 java File. * * @param absoluteFilename - name of the file to convert. * @return String - location of the converted file. */ public String convertToXML(String absoluteFilename) throws Exception { FileHandler fromSelIDE = new FileHandler(absoluteFilename); FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true); if (fromSelIDE.getFile().isDirectory()) { LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName()); return null; } //Clean up html so that we can read it as XML properly HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties XMLPrefs = cleaner.getProperties(); XMLPrefs.setUseEmptyElementTags(true); XMLPrefs.setTranslateSpecialEntities(true); XMLPrefs.setTransResCharsToNCR(true); XMLPrefs.setOmitComments(true); XMLPrefs.setOmitComments(true); XMLPrefs.setOmitDoctypeDeclaration(true); XMLPrefs.setNamespacesAware(false); TagNode tagNode = new HtmlCleaner(XMLPrefs).clean(fromSelIDE.getFile()); new PrettyXmlSerializer(XMLPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8"); toXML.close(); return toXML.getAbsoluteFile(); }
// Parse htmlString with namespace handling switched off.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
// Root of the parsed tree.
TagNode mainNode = cleaner.clean(htmlString);
private Document createDom(String data) { HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setUseCdataForScriptAndStyle(false); props.setRecognizeUnicodeChars(true); props.setUseEmptyElementTags(true); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(false); props.setBooleanAttributeValues("empty"); props.setAllowHtmlInsideAttributes(true); props.setPruneTags("script,style"); data = XmlUtils.removeNamespace(data); TagNode tagNode = cleaner.clean(data); org.w3c.dom.Document doc = null; try {
private void init() { // Initialize HTMLCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); props.setNamespacesAware(false); // Initialize DomSerializer domSerializer = new DomSerializer(props); // Initialize xml parser try { DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); documentBuilder = documentBuilderFactory.newDocumentBuilder(); } catch (ParserConfigurationException e) { // THIS CAN NEVER HAPPEN } }
/**
 * Parses HTML source into a W3C DOM document.
 * <p>
 * The cleaner configured here (HTML allowed inside attributes, multi-word
 * attributes allowed, unicode characters recognized, comments omitted) is the one
 * used for parsing, and the same properties are passed to the DOM serializer.
 *
 * @param source the HTML markup to parse
 * @return the resulting document, or {@code null} if the DOM could not be
 *         created because of a {@link ParserConfigurationException}
 * @throws InterruptedException declared for callers; not thrown by the visible code
 * @throws IOException declared for callers; may originate from the cleaner
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    try {
        // Use the configured cleaner and its properties rather than fresh defaults.
        TagNode tagNode = cleaner.clean(source);
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        // Existing contract: report the failure and signal it with null.
        ex.printStackTrace();
        return null;
    }
}
private Document clean(String content) throws ParserConfigurationException { HtmlCleaner cleaner = new HtmlCleaner(); TagNode rootNode = cleaner.clean(content); // convert to DOM CleanerProperties properties = new CleanerProperties(); properties.setOmitComments(true); DomSerializer domSerializer = new DomSerializer(properties); Document doc = domSerializer.createDOM(rootNode); return doc; }
/**
 * Creates the function with a cleaner that omits comments, translates special
 * entities, converts reserved characters to NCRs and prunes a fixed list of tags.
 */
public CleanHtmlFunction() {
    HtmlCleaner htmlCleaner = new HtmlCleaner();

    CleanerProperties settings = htmlCleaner.getProperties();
    settings.setOmitComments(true);
    settings.setTranslateSpecialEntities(true);
    settings.setTransResCharsToNCR(true);
    // remove all tags that contain uninteresting content
    settings.setPruneTags("style,script,form,object,audio,video");

    this.cleaner = htmlCleaner;
}
// Clean a small snippet whose div and table are left unclosed and whose td
// carries a multi-word id attribute.
TagNode tagNode = new HtmlCleaner().clean(
        "<div><table><td id='1234 foo 5678'>Hello</td>");
// Convert the cleaned tree into a W3C DOM document using default properties.
org.w3c.dom.Document doc = new DomSerializer(
        new CleanerProperties()).createDOM(tagNode);
/**
 * Creates and configures the shared cleaner the first time it is called;
 * later calls return immediately.
 */
private static synchronized void initCleaner() {
    if (htmlCleanerInitialized) {
        return;
    }

    cleaner = new HtmlCleaner();
    CleanerProperties settings = cleaner.getProperties();
    settings.setOmitComments(true);
    settings.setOmitXmlDeclaration(true);

    htmlCleanerInitialized = true;
}
/**
 * Builds the processor's parser, whitelist filter and serializer from the given
 * configuration.
 *
 * @param config supplies the comment-omission flag, the whitelist elements, the
 *               javascript-protocol flag and the serializer selection
 */
public HtmlProcessorImpl(final HtmlProcessorConfig config) {
    this.config = config;

    // One properties object is shared by the parser and the serializer.
    final CleanerProperties cleanerProps = new CleanerProperties();
    cleanerProps.setOmitHtmlEnvelope(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setOmitComments(config.isOmitComments());

    this.parser = new HtmlCleaner(cleanerProps);
    this.filter = new WhitelistHtmlFilter(
            config.getWhitelistElements(),
            config.isOmitJavascriptProtocol());
    this.serializer = HtmlSerializerFactory.create(config.getSerializer(), cleanerProps);
}
// Build the default cleaner properties. The names namespacesAware, configuration
// and getDefaultCleanerTransformations are defined outside this excerpt.
CleanerProperties defaultProperties = new CleanerProperties();
defaultProperties.setOmitUnknownTags(true);
defaultProperties.setUseEmptyElementTags(false);
defaultProperties.setUseCdataForScriptAndStyle(true);
defaultProperties.setIgnoreQuestAndExclam(true);
defaultProperties.setOmitCdataOutsideScriptAndStyle(true);
defaultProperties.setNamespacesAware(namespacesAware);
// Transformations are derived from the configuration by a helper not shown here.
defaultProperties.setCleanerTransformations(getDefaultCleanerTransformations(configuration));
defaultProperties.setTranslateSpecialEntities(false);
defaultProperties.setHtmlVersion(4);
/**
 * Builds a new <code>HtmlCleaner</code> with two properties switched on:
 * <ul>
 * <li>omitXmlDeclaration : true</li>
 * <li>omitDoctypeDeclaration : true</li>
 * </ul>
 *
 * @return the newly created and configured cleaner
 */
protected HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner configured = new HtmlCleaner();
    // Neither the XML declaration nor the doctype should be emitted.
    configured.getProperties().setOmitXmlDeclaration(true);
    configured.getProperties().setOmitDoctypeDeclaration(true);
    return configured;
}
}
/** * @param configuration the configuration to use for the cleaning * @return the default {@link CleanerProperties} to be used for cleaning. */ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration) { CleanerProperties defaultProperties = new CleanerProperties(); defaultProperties.setOmitUnknownTags(true); defaultProperties.setNamespacesAware(true); // HTML Cleaner uses the compact notation by default but we don't want that since: // - it's more work and not required since not compact notation is valid XHTML // - expanded elements can also be rendered fine in browsers that only support HTML. defaultProperties.setUseEmptyElementTags(false); // Wrap script and style content in CDATA blocks defaultProperties.setUseCdataForScriptAndStyle(true); // Handle the NAMESPACE_AWARE configuration property String param = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE); boolean namespacesAware = (param != null) ? Boolean.parseBoolean(param) : defaultProperties.isNamespacesAware(); defaultProperties.setNamespacesAware(namespacesAware); return defaultProperties; }