try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result;
try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result;
// Run the cleaner over htmlPage and keep whatever it returns in rootNode.
// NOTE(review): the identical assignment appears twice in a row; the second call
// simply overwrites the first result — confirm whether the repetition is intentional.
rootNode = cleaner.clean(htmlPage);
rootNode = cleaner.clean(htmlPage);
/**
 * Cleans the given HTML with the shared {@code parser}.
 *
 * @param html markup to clean; {@code null} is treated as the empty string
 * @return the value returned by {@code parser.clean} for the (possibly substituted) markup
 */
private TagNode parse(String html) {
    // Substitute an empty document for null so the parser never receives a null argument.
    final String markup = (html == null) ? "" : html;
    return parser.clean(markup);
}
HtmlCleaner cleaner = new HtmlCleaner(); final String siteUrl = "http://www.themoscowtimes.com/"; TagNode node = cleaner.clean(new URL(siteUrl)); // serialize to xml file new PrettyXmlSerializer(props).writeToFile( node , "cleaned.xml", "utf-8" );
/**
 * Creates instance from the content downloaded from specified URL.
 * HTML encoding is resolved following the attempts in the sequence:
 * <ol>
 *   <li>reading Content-Type response header,</li>
 *   <li>analyzing META tags at the beginning of the html,</li>
 *   <li>using platform's default charset.</li>
 * </ol>
 * This overload forwards to {@code clean(url, charset)} using the charset
 * taken from {@code properties}.
 *
 * @param url location of the HTML content to clean
 * @return the result of the delegated {@code clean(url, charset)} call
 * @throws IOException propagated from the delegated call
 */
public TagNode clean(URL url) throws IOException {
    final String configuredCharset = properties.getCharset();
    return clean(url, configuredCharset);
}
/**
 * Cleans the file {@code c:/test.xml}, evaluates a fixed XPath expression against
 * the result and prints the text of the first match to standard output.
 * Any exception raised along the way (for example when the result array is empty)
 * is caught and its stack trace is printed.
 *
 * @param arg command-line arguments (not used)
 */
public static void main(String[] arg) {
    final HtmlCleaner htmlCleaner = new HtmlCleaner();
    try {
        final File input = new File("c:/test.xml");
        final TagNode root = htmlCleaner.clean(input);
        final Object[] matches = root.evaluateXPath("//div/a[text(.,'In')]");
        // Only the first match is of interest; it is expected to be a TagNode.
        final TagNode firstMatch = (TagNode) matches[0];
        System.out.println(firstMatch.getText());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
// Fetch the page with a browser-like User-Agent, clean it, and write it to "cleaned.xml".
URL urlSB = new URL("https://www.groupon.com/browse/chicago?z=skip");
URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:5.0) Gecko/20100101 Firefox/25.0");
urlConnection.connect();
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
TagNode tagNodeRoot;
// Close the response stream as soon as parsing finishes, even if cleaning throws.
try (java.io.InputStream responseStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(responseStream);
}
new PrettyXmlSerializer(props).writeToFile(tagNodeRoot, "cleaned.xml", "utf-8");
// Fetch the page with a browser-like User-Agent and clean it into a TagNode tree.
final URL urlSB = new URL("http://www.groupon.com/browse/chicago?z=skip");
final URLConnection urlConnection = urlSB.openConnection();
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0");
urlConnection.connect();
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
final TagNode tagNodeRoot;
// Close the response stream as soon as parsing finishes, even if cleaning throws.
try (java.io.InputStream responseStream = urlConnection.getInputStream()) {
    tagNodeRoot = cleaner.clean(responseStream);
}
// Clean a literal string and print its serialized HTML form to standard output.
final HtmlCleaner cleaner = new HtmlCleaner();
final CleanerProperties properties = cleaner.getProperties();
final Serializer serializer = new SimpleHtmlSerializer(properties);
// The serializer writes into an in-memory buffer rather than a file.
StringWriter writer = new StringWriter();
TagNode node = cleaner.clean("hello world");
serializer.write(node, writer, "UTF-8");
final String rendered = writer.toString();
System.out.println(rendered);
public TagNode clean(String htmlContent) { try { return clean( new StringReader(htmlContent), new CleanTimeValues() ); } catch (IOException e) { // should never happen because reading from StringReader throw new HtmlCleanerException(e); } }
public TagNode clean(String htmlContent) { try { return clean(new StringReader(htmlContent), new CleanTimeValues()); } catch (IOException e) { // should never happen because reading from StringReader throw new HtmlCleanerException(e); } }
// Example: print the text of every node returned by root.getElementsByName("tr").
// NOTE: the "..." passed to clean() is a placeholder for the real HTML input, so this
// snippet does not compile as written.
HtmlCleaner cleaner = new HtmlCleaner(); TagNode root= cleaner.clean(...); TagNode[] trNodes= root.getElementsByName("tr"); for (TagNode trNode : trNodes) { System.out.println("All text inside this <tr> tag (including children): " + trNode.getText()); }
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small demo: cleans a fixed HTML fragment and prints the {@code style}
 * attribute of the first {@code div} element found in the result.
 */
public class Test {

    /**
     * Runs the demo.
     *
     * @param args command-line arguments (not used)
     * @throws Throwable anything raised while cleaning or querying the fragment
     */
    public static void main(String[] args) throws Throwable {
        final HtmlCleaner htmlCleaner = new HtmlCleaner();
        // Fixed input: a div with an upper-case inline style wrapping a single input element.
        final String html = "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\"><input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";
        final TagNode root = htmlCleaner.clean(html);
        final TagNode divNode = root.findElementByName("div", true);
        final String styleValue = divNode.getAttributeByName("style");
        System.out.println(styleValue);
    }
}
/**
 * Cleans the given HTML and converts the cleaned tree into a W3C DOM document.
 * The DOM serializer is configured with a newly created {@code CleanerProperties}.
 *
 * @param html the HTML text to convert
 * @return the DOM document produced by {@code DomSerializer.createDOM}
 * @throws ParserConfigurationException propagated from the DOM serializer
 */
public static Document toXhtml(String html) throws ParserConfigurationException {
    final TagNode root = new HtmlCleaner().clean(html);
    final CleanerProperties serializerProps = new CleanerProperties();
    return new DomSerializer(serializerProps).createDOM(root);
}
// Clean html and convert the resulting TagNode into a W3C DOM document (myW3cDoc).
// The DomSerializer is given the same properties object the cleaner itself exposes.
HtmlCleaner cleaner = new HtmlCleaner(); TagNode node = cleaner.clean(html); DomSerializer ser = new DomSerializer(cleaner.getProperties()); Document myW3cDoc = ser.createDOM(node);
/**
 * Cleans the given HTML with a new {@code HtmlCleaner} built from this object's
 * {@code props}, then delegates to {@code getAsString(TagNode, String)} using the
 * charset taken from the same properties.
 *
 * @param htmlContent the HTML text to clean and serialize
 * @return the string returned by the delegated {@code getAsString} overload
 */
public String getAsString(String htmlContent) {
    final TagNode cleanedRoot = new HtmlCleaner(this.props).clean(htmlContent);
    final String charset = props.getCharset();
    return getAsString(cleanedRoot, charset);
}
/**
 * Cleans the given HTML with a new {@code HtmlCleaner} built from this object's
 * {@code props}, then delegates to {@code getAsString(TagNode, String)} using the
 * charset taken from the same properties.
 *
 * @param htmlContent the HTML text to clean and serialize
 * @return the string returned by the delegated {@code getAsString} overload
 */
public String getAsString(String htmlContent) {
    final TagNode cleanedRoot = new HtmlCleaner(this.props).clean(htmlContent);
    final String charset = props.getCharset();
    return getAsString(cleanedRoot, charset);
}
private Document clean(String content) throws ParserConfigurationException { HtmlCleaner cleaner = new HtmlCleaner(); TagNode rootNode = cleaner.clean(content); // convert to DOM CleanerProperties properties = new CleanerProperties(); properties.setOmitComments(true); DomSerializer domSerializer = new DomSerializer(properties); Document doc = domSerializer.createDOM(rootNode); return doc; }