/*
 * Cleans an HTML fragment that leaves its <div> and <table> unclosed into a
 * TagNode tree, then converts that tree into a W3C DOM Document using a
 * DomSerializer built on freshly constructed CleanerProperties.
 */
TagNode tagNode = new HtmlCleaner().clean( "<div><table><td id='1234 foo 5678'>Hello</td>"); org.w3c.dom.Document doc = new DomSerializer( new CleanerProperties()).createDOM(tagNode);
HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try {
HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try {
/*
 * Builds CleanerProperties through an anonymous subclass whose
 * getTagInfoProvider() always returns DefaultTagProvider.getInstance(), creates a
 * SimpleHtmlSerializer with those properties, and serializes tagNode (declared
 * outside this excerpt) to a String.
 * NOTE(review): the meaning of the boolean passed to getAsString is not visible
 * here - presumably it omits the envelope; confirm against the library docs.
 */
CleanerProperties props = new CleanerProperties() { @Override public ITagInfoProvider getTagInfoProvider() { return DefaultTagProvider.getInstance(); } }; HtmlSerializer simpleHtmlSerializer = new SimpleHtmlSerializer(props); String message = simpleHtmlSerializer.getAsString(tagNode, true);
/**
 * Creates a cleaner bound to the supplied tag info provider and properties.
 *
 * @param tagInfoProvider provider used for tag filtering and balancing; when
 *                        {@code null}, {@code DefaultTagProvider.INSTANCE} is used
 * @param properties      properties used during parsing and serializing; when
 *                        {@code null}, a new {@code CleanerProperties} is created
 */
public HtmlCleaner(ITagInfoProvider tagInfoProvider, CleanerProperties properties) {
    if (properties == null) {
        this.properties = new CleanerProperties();
    } else {
        this.properties = properties;
    }

    // The chosen provider is always written into the properties, replacing any
    // provider they carried before.
    ITagInfoProvider effectiveProvider = tagInfoProvider;
    if (effectiveProvider == null) {
        effectiveProvider = DefaultTagProvider.INSTANCE;
    }
    this.properties.setTagInfoProvider(effectiveProvider);
}
/**
 * Creates a {@code Serializer} for the given serializer kind, delegating to the
 * two-argument overload with a freshly constructed {@code CleanerProperties}.
 *
 * @param serializer the serializer selection to build from
 * @return whatever the two-argument {@code create} returns for these arguments
 */
public static Serializer create(final HtmlSerializer serializer) {
    final CleanerProperties freshProperties = new CleanerProperties();
    return create(serializer, freshProperties);
}
/*
 * Cleans a fragment whose <html>, <div> and <p> tags are never closed, then
 * converts the resulting TagNode tree into a DOM Document using a DomSerializer
 * built on freshly constructed CleanerProperties.
 */
TagNode tagNode = new HtmlCleaner().clean("<html><div><p>test"); Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
/*
 * Creates a SimpleHtmlSerializer on freshly constructed CleanerProperties and
 * writes tagNode to the file named by fileName, passing "utf-8" as the third
 * argument (presumably the output charset - confirm). Both tagNode and fileName
 * are declared outside this excerpt.
 */
final CleanerProperties props = new CleanerProperties(); final SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(props); htmlSerializer.writeToFile(tagNode, fileName, "utf-8");
/*
 * Cleans a fragment whose <html>, <div> and <p> tags are never closed, keeps the
 * DomSerializer (built on freshly constructed CleanerProperties) in a named
 * local, and uses it to convert the TagNode tree into a W3C DOM Document.
 */
TagNode tagNode = new HtmlCleaner().clean("<html><div><p>test"); DomSerializer ser = new DomSerializer(new CleanerProperties()); org.w3c.dom.Document doc = ser.createDOM(tagNode);
final CleanerProperties props = new CleanerProperties(); final HtmlCleaner htmlCleaner = new HtmlCleaner(props); final SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(props); // make 10 threads using the same cleaner and the same serializer for (int i = 1; i <= 10; i++) { final String url = "http://search.eim.ebay.eu/Art/2-1/?en=100&ep=" + i; final String fileName = "c:/temp/ebay_art" + i + ".xml"; new Thread(new Runnable() { public void run() { try { TagNode tagNode = htmlCleaner.clean(new URL(url)); htmlSerializer.writeToFile(tagNode, fileName, "utf-8"); } catch (IOException e) { e.printStackTrace(); } } }).start(); }
/*
 * Converts tagNode (declared outside this excerpt) into a DOM Document using a
 * DomSerializer built on freshly constructed CleanerProperties.
 */
Document doc = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
/**
 * Cleans the given HTML with a default {@code HtmlCleaner} and returns the
 * result as a DOM document.
 *
 * @param html the HTML markup to clean
 * @return the DOM document created from the cleaned tag tree
 * @throws ParserConfigurationException if the DOM serializer cannot create a document
 */
public static Document toXhtml(String html) throws ParserConfigurationException {
    TagNode cleanedRoot = new HtmlCleaner().clean(html);
    return new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
}
public static void main(String[] args) { try{ CleanerProperties props = new CleanerProperties(); props.setTranslateSpecialEntities(true); props.setTransResCharsToNCR(true);
private Document clean(String content) throws ParserConfigurationException { HtmlCleaner cleaner = new HtmlCleaner(); TagNode rootNode = cleaner.clean(content); // convert to DOM CleanerProperties properties = new CleanerProperties(); properties.setOmitComments(true); DomSerializer domSerializer = new DomSerializer(properties); Document doc = domSerializer.createDOM(rootNode); return doc; }
/**
 * Cleans the given HTML source and converts it into a DOM document.
 *
 * <p>The cleaner is configured to allow HTML inside attributes, allow
 * multi-word attributes, recognize Unicode characters and omit comments. The
 * configured cleaner and its properties are the ones actually used for parsing
 * and DOM conversion; earlier code configured them and then parsed with a
 * brand-new default cleaner, discarding every setting.
 *
 * @param source the HTML markup to clean
 * @return the DOM document, or {@code null} if the DOM serializer could not
 *         create one
 * @throws InterruptedException declared for compatibility with existing callers
 * @throws IOException          declared for compatibility with existing callers
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    try {
        TagNode tagNode = cleaner.clean(source);
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        // Best-effort contract kept from the original: report and return null.
        ex.printStackTrace();
        return null;
    }
}
/** * @param configuration the configuration to use for the cleaning * @return the default {@link CleanerProperties} to be used for cleaning. */ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration configuration) { CleanerProperties defaultProperties = new CleanerProperties(); defaultProperties.setOmitUnknownTags(true); defaultProperties.setNamespacesAware(true); // HTML Cleaner uses the compact notation by default but we don't want that since: // - it's more work and not required since not compact notation is valid XHTML // - expanded elements can also be rendered fine in browsers that only support HTML. defaultProperties.setUseEmptyElementTags(false); // Wrap script and style content in CDATA blocks defaultProperties.setUseCdataForScriptAndStyle(true); // Handle the NAMESPACE_AWARE configuration property String param = configuration.getParameters().get(HTMLCleanerConfiguration.NAMESPACES_AWARE); boolean namespacesAware = (param != null) ? Boolean.parseBoolean(param) : defaultProperties.isNamespacesAware(); defaultProperties.setNamespacesAware(namespacesAware); return defaultProperties; }
/**
 * Constructor - creates the instance with specified tag info provider and
 * specified properties.
 *
 * <p>Provider selection:
 * <ul>
 *   <li>a non-null {@code tagInfoProvider} is always installed into the properties;</li>
 *   <li>otherwise, a provider already present in the properties is left untouched;</li>
 *   <li>otherwise, the default provider is chosen from the properties' HTML
 *       version: {@code Html4TagProvider} for {@code HTML_4}, else
 *       {@code Html5TagProvider}.</li>
 * </ul>
 *
 * @param tagInfoProvider Provider for tag filtering and balancing; may be {@code null}
 * @param properties      Properties used during parsing and serializing; when
 *                        {@code null}, a new {@code CleanerProperties} is created
 */
public HtmlCleaner(ITagInfoProvider tagInfoProvider, CleanerProperties properties) {
    this.properties = properties == null ? new CleanerProperties() : properties;

    if (tagInfoProvider != null) {
        // An explicitly supplied provider always wins.
        this.properties.setTagInfoProvider(tagInfoProvider);
    } else if (this.properties.getTagInfoProvider() == null) {
        // Nothing supplied and nothing preconfigured: use the default for the HTML version.
        if (this.properties.getHtmlVersion() == HTML_4) {
            this.properties.setTagInfoProvider(Html4TagProvider.INSTANCE);
        } else {
            this.properties.setTagInfoProvider(Html5TagProvider.INSTANCE);
        }
    }
}
/* Creates a fresh CleanerProperties instance and enables its omitUnknownTags option. */
CleanerProperties defaultProperties = new CleanerProperties(); defaultProperties.setOmitUnknownTags(true);
/** * htmlcleaner로 html string을 xml string으로 바꿔주는 메소드. * @param source * @return */ private String toXML(String source){ try { CleanerProperties props = new CleanerProperties(); props.setTranslateSpecialEntities(true); props.setOmitComments(true); props.setPruneTags("script,style"); // namespace를 무시한다. props.setNamespacesAware(false); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(true); HtmlCleaner cl = new HtmlCleaner(props); TagNode tagNode = cl.clean(source); source = new PrettyXmlSerializer(props).getXmlAsString(tagNode); } catch (IOException e) { logger.error("",e); } return source; } //test용
/**
 * Creates a processor from the given configuration.
 *
 * <p>A single {@code CleanerProperties} instance - HTML envelope and XML
 * declaration omitted, comment omission taken from the configuration - is
 * shared by the parser and the serializer. The whitelist filter is built from
 * the configuration's whitelist elements and its omit-javascript-protocol flag.
 *
 * @param config the processor configuration
 */
public HtmlProcessorImpl(final HtmlProcessorConfig config) {
    this.config = config;

    final CleanerProperties cleanerProps = new CleanerProperties();
    cleanerProps.setOmitHtmlEnvelope(true);
    cleanerProps.setOmitXmlDeclaration(true);
    cleanerProps.setOmitComments(config.isOmitComments());

    this.parser = new HtmlCleaner(cleanerProps);
    this.filter = new WhitelistHtmlFilter(
        config.getWhitelistElements(),
        config.isOmitJavascriptProtocol());
    this.serializer = HtmlSerializerFactory.create(config.getSerializer(), cleanerProps);
}