// Cleans a markup string whose <div> and <table> tags are never closed, then converts the resulting TagNode into an org.w3c.dom.Document through a DomSerializer built on a freshly constructed CleanerProperties.
TagNode tagNode = new HtmlCleaner().clean( "<div><table><td id='1234 foo 5678'>Hello</td>"); org.w3c.dom.Document doc = new DomSerializer( new CleanerProperties()).createDOM(tagNode);
@Override public String select(String text) { try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
List<String> results = new ArrayList<String>(); try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
// Cleans htmlPage with a new HtmlCleaner and stores the result in rootNode (both htmlPage and rootNode are declared outside this snippet).
HtmlCleaner cleaner = new HtmlCleaner(); rootNode = cleaner.clean(htmlPage);
// Runs htmlPage through a newly created HtmlCleaner; the cleaned result is assigned to rootNode, which is declared elsewhere.
HtmlCleaner cleaner = new HtmlCleaner(); rootNode = cleaner.clean(htmlPage);
@Override public SimpleSystem connect(String... connectionParams) throws Exception { // no connection parameters needed yet cleaner = new HtmlCleaner(); return this; }
/**
 * Creates the utility with the given output encoding and a new {@link HtmlCleaner}.
 *
 * @param outputEncoding the encoding used to write; passed through
 *        {@code Arg.checkNotBlank} before being stored
 */
public HtmlUtils(final String outputEncoding) {
    this.outputEncoding = Arg.checkNotBlank("outputEncoding", outputEncoding);
    this.cleaner = new HtmlCleaner();
}
HtmlCleaner cleaner = new HtmlCleaner(); final String siteUrl = "http://www.themoscowtimes.com/"; TagNode node = cleaner.clean(new URL(siteUrl)); // serialize to xml file new PrettyXmlSerializer(props).writeToFile( node , "cleaned.xml", "utf-8" );
/**
 * Cleans the file {@code c:/test.xml}, evaluates a fixed XPath expression against it and
 * prints the text of the first result. Any exception raised along the way is reported
 * through its stack trace.
 *
 * @param arg command-line arguments; not used
 */
public static void main(String[] arg) {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    try {
        final TagNode root = htmlCleaner.clean(new File("c:/test.xml"));
        final Object[] matches = root.evaluateXPath("//div/a[text(.,'In')]");
        // An empty result makes the index access throw; that is reported by the catch below.
        final TagNode firstMatch = (TagNode) matches[0];
        System.out.println(firstMatch.getText());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
// Open the page with a browser-like User-Agent header.
URL urlSB = new URL("https://www.groupon.com/browse/chicago?z=skip");
URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:5.0) Gecko/20100101 Firefox/25.0");
urlConnection.connect();

// Configure the cleaner before parsing.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);

// Close the response stream once parsing is done instead of leaking it.
TagNode tagNodeRoot;
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(pageStream);
}

// Write the cleaned tree as pretty-printed XML.
new PrettyXmlSerializer(props).writeToFile(tagNodeRoot, "cleaned.xml", "utf-8");
// Open the page with a browser-like User-Agent header.
final URL urlSB = new URL("http://www.groupon.com/browse/chicago?z=skip");
final URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0");
urlConnection.connect();

// Configure the cleaner before parsing.
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);

// Close the response stream once parsing is done instead of leaking it;
// tagNodeRoot stays final and is assigned exactly once inside the try block.
final TagNode tagNodeRoot;
try (java.io.InputStream pageStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(pageStream);
}
// Clean the text "hello world" and print it as serialized by SimpleHtmlSerializer.
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties properties = cleaner.getProperties();

// Parse first, then set up the serializer that shares the cleaner's properties.
TagNode node = cleaner.clean("hello world");
final Serializer serializer = new SimpleHtmlSerializer(properties);

// Serialize into an in-memory buffer and print its contents.
StringWriter writer = new StringWriter();
serializer.write(node, writer, "UTF-8");
System.out.println(writer);
HtmlCleaner cleaner = new HtmlCleaner();
// The "..." is the snippet's own placeholder for the HTML source to clean.
TagNode root = cleaner.clean(...);

// getElementsByName takes (name, isRecursive); true searches the whole subtree,
// so <tr> elements nested below the root are found as well.
TagNode[] trNodes = root.getElementsByName("tr", true);
for (TagNode trNode : trNodes) {
    System.out.println("All text inside this <tr> tag (including children): " + trNode.getText());
}
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small demo: cleans a fixed HTML fragment and prints the {@code style} attribute
 * of the first {@code div} element found in the cleaned tree.
 */
public class Test {

    /** The fragment to clean: a positioned div wrapping a single text input. */
    private static final String SAMPLE_HTML =
            "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; "
            + "WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; "
            + "LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" "
            + "x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\">"
            + "<input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" "
            + "title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" "
            + "size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";

    /**
     * Cleans {@link #SAMPLE_HTML} and prints the div's style attribute to standard output.
     *
     * @param args command-line arguments; not used
     * @throws Throwable whatever the cleaning or lookup raises
     */
    public static void main(String[] args) throws Throwable {
        TagNode cleanedRoot = new HtmlCleaner().clean(SAMPLE_HTML);
        // Recursive lookup (second argument true) for the div element.
        TagNode divNode = cleanedRoot.findElementByName("div", true);
        System.out.println(divNode.getAttributeByName("style"));
    }
}
/**
 * Cleans the given HTML and converts the cleaned tree into a DOM document.
 *
 * @param html the markup to clean
 * @return the DOM created by a {@code DomSerializer} that uses a new {@code CleanerProperties}
 * @throws ParserConfigurationException if the DOM cannot be created
 */
public static Document toXhtml(String html) throws ParserConfigurationException {
    final TagNode cleanedRoot = new HtmlCleaner().clean(html);
    return new DomSerializer(new CleanerProperties()).createDOM(cleanedRoot);
}
/**
 * Creates and configures the shared cleaner on the first call. While
 * {@code htmlCleanerInitialized} is true, further calls return without doing anything.
 * The created cleaner is set to omit comments and the XML declaration.
 */
private static synchronized void initCleaner() {
    if (htmlCleanerInitialized) {
        return;
    }

    cleaner = new HtmlCleaner();
    final CleanerProperties cleanerSettings = cleaner.getProperties();
    cleanerSettings.setOmitComments(true);
    cleanerSettings.setOmitXmlDeclaration(true);

    htmlCleanerInitialized = true;
}
/**
 * Cleans the given HTML with a cleaner built on this object's properties and returns
 * the result of the {@code getAsString(TagNode, String)} overload, using the charset
 * taken from those properties.
 *
 * @param htmlContent the markup to clean
 * @return the string produced by the overload for the cleaned tree
 */
public String getAsString(String htmlContent) {
    final TagNode cleanedRoot = new HtmlCleaner(this.props).clean(htmlContent);
    return getAsString(cleanedRoot, props.getCharset());
}
/**
 * Creates the function with its own {@link HtmlCleaner}, configured to omit comments,
 * translate special entities, enable the {@code transResCharsToNCR} option, and prune
 * the listed tags.
 */
public CleanHtmlFunction() {
    this.cleaner = new HtmlCleaner();
    CleanerProperties p = cleaner.getProperties();
    p.setOmitComments(true);
    p.setTranslateSpecialEntities(true);
    p.setTransResCharsToNCR(true);
    // remove all tags that contain uninteresting content
    // (the comment sits on its own line so the call below is not commented out)
    p.setPruneTags("style,script,form,object,audio,video");
}
/**
 * Builds a new cleaner with this class's settings: no CDATA wrapping for script and
 * style, {@code script} and {@code style} tags pruned, and both the
 * treat-unknown-tags-as-content and omit-unknown-tags options switched on.
 *
 * @return the newly created, configured cleaner
 */
private HtmlCleaner getHtmlCleaner() {
    final HtmlCleaner configured = new HtmlCleaner();

    configured.getProperties().setUseCdataForScriptAndStyle(false);
    configured.getProperties().setPruneTags("script,style");

    // Unknown-tag handling: both flags are set, in the same order as before.
    configured.getProperties().setTreatUnknownTagsAsContent(true);
    configured.getProperties().setOmitUnknownTags(true);

    return configured;
}
/**
 * Builds the processor from the given configuration. The cleaner properties omit the
 * HTML envelope and the XML declaration; comment omission follows
 * {@code config.isOmitComments()}. The same properties object is handed to both the
 * parser and the serializer factory.
 *
 * @param config supplies the comment setting, whitelist elements, javascript-protocol
 *        flag and serializer choice
 */
public HtmlProcessorImpl(final HtmlProcessorConfig config) {
    this.config = config;

    final CleanerProperties cleanerSettings = new CleanerProperties();
    cleanerSettings.setOmitHtmlEnvelope(true);
    cleanerSettings.setOmitXmlDeclaration(true);
    cleanerSettings.setOmitComments(config.isOmitComments());

    this.parser = new HtmlCleaner(cleanerSettings);
    this.filter = new WhitelistHtmlFilter(
            config.getWhitelistElements(),
            config.isOmitJavascriptProtocol());
    this.serializer = HtmlSerializerFactory.create(config.getSerializer(), cleanerSettings);
}