org.htmlcleaner.HtmlCleaner.clean Java code examples

try {
  HtmlCleaner htmlCleaner = new HtmlCleaner();
  TagNode tagNode = htmlCleaner.clean(text);
  Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
  Object result;

try {
  HtmlCleaner htmlCleaner = new HtmlCleaner();
  TagNode tagNode = htmlCleaner.clean(text);
  Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
  Object result;

// Run the cleaner over htmlPage and store the result in rootNode.
rootNode = cleaner.clean(htmlPage);

// Pass htmlPage to cleaner.clean(...) and keep what it returns in rootNode.
rootNode = cleaner.clean(htmlPage);

/**
 * Hands the given markup to {@code parser.clean}, substituting the empty
 * string when the argument is {@code null}.
 *
 * @param html markup to clean; may be {@code null}
 * @return whatever {@code parser.clean} returns for the (possibly substituted) input
 */
private TagNode parse(String html) {
  final String source = (html != null) ? html : "";
  return parser.clean(source);
}

 final String siteUrl = "http://www.themoscowtimes.com/";
HtmlCleaner cleaner = new HtmlCleaner();

// Clean the content found at siteUrl into a tag tree.
URL pageUrl = new URL(siteUrl);
TagNode node = cleaner.clean(pageUrl);

// Serialize the cleaned tree to an XML file.
PrettyXmlSerializer xmlSerializer = new PrettyXmlSerializer(props);
xmlSerializer.writeToFile(node, "cleaned.xml", "utf-8");

/**
 * Cleans the content downloaded from the given URL.
 * The HTML encoding is resolved by trying, in order: (1) the Content-Type
 * response header, (2) META tags at the beginning of the HTML, and
 * (3) the platform's default charset.
 *
 * @param url location of the content to download and clean
 * @return the node returned by the {@code clean(url, charset)} overload
 * @throws IOException if the delegated {@code clean(url, charset)} call throws it
 */
public TagNode clean(URL url) throws IOException {
  // The charset handed to the two-argument overload comes from the configured properties.
  final String charset = properties.getCharset();
  return clean(url, charset);
}

/**
 * Cleans the file {@code c:/test.xml}, evaluates an XPath expression against
 * the result and prints the text of the first returned node. Any exception
 * raised along the way is printed as a stack trace.
 *
 * @param arg command-line arguments (unused)
 */
public static void main(String[] arg) {
  final HtmlCleaner cleaner = new HtmlCleaner();
  try {
    final File input = new File("c:/test.xml");
    final TagNode root = cleaner.clean(input);
    final Object[] matches = root.evaluateXPath("//div/a[text(.,'In')]");
    // Only the first entry of the XPath result is used.
    final TagNode firstMatch = (TagNode) matches[0];
    System.out.println(firstMatch.getText());
  } catch (Exception e) {
    e.printStackTrace();
  }
}

 URL urlSB = new URL("https://www.groupon.com/browse/chicago?z=skip");
URLConnection urlConnection = urlSB.openConnection();
// Add a User-Agent request header before connecting.
urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:5.0) Gecko/20100101 Firefox/25.0");
urlConnection.connect();
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
// Close the response stream once cleaning has finished (or failed) so the
// underlying connection resources are not leaked.
TagNode tagNodeRoot;
try (java.io.InputStream responseStream = urlConnection.getInputStream()) {
  tagNodeRoot = cleaner.clean(responseStream);
}
// Write the cleaned tree to an XML file using the same properties.
new PrettyXmlSerializer(props).writeToFile(tagNodeRoot, "cleaned.xml", "utf-8");

final URL urlSB = new URL("http://www.groupon.com/browse/chicago?z=skip");
   final URLConnection urlConnection = urlSB.openConnection();
   // Add a User-Agent request header before connecting.
   urlConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0");
   urlConnection.connect();
   final HtmlCleaner cleaner = new HtmlCleaner();
   final CleanerProperties props = cleaner.getProperties();
   props.setNamespacesAware(false);
   // Close the response stream once cleaning has finished (or failed) so the
   // underlying connection resources are not leaked.
   final TagNode tagNodeRoot;
   try (java.io.InputStream responseStream = urlConnection.getInputStream()) {
     tagNodeRoot = cleaner.clean(responseStream);
   }

final HtmlCleaner cleaner = new HtmlCleaner();
 final CleanerProperties properties = cleaner.getProperties();
 final Serializer serializer = new SimpleHtmlSerializer(properties);
 // Clean a short text input and write the serialized tree into an in-memory buffer.
 StringWriter writer = new StringWriter();
 TagNode node = cleaner.clean("hello world");
 serializer.write(node, writer, "UTF-8");
 // Print what the serializer wrote.
 String markup = writer.toString();
 System.out.println(markup);

/**
 * Cleans HTML supplied as an in-memory string by delegating to the
 * reader-based {@code clean} overload.
 *
 * @param htmlContent HTML text to clean
 * @return the node returned by the reader-based overload
 * @throws HtmlCleanerException wrapping any {@link IOException} raised by the delegate
 */
public TagNode clean(String htmlContent) {
  try {
    final StringReader contentReader = new StringReader(htmlContent);
    return clean(contentReader, new CleanTimeValues());
  } catch (IOException unexpected) {
    // should never happen because reading from StringReader
    throw new HtmlCleanerException(unexpected);
  }
}

/**
 * Wraps the given HTML string in a {@link StringReader} and passes it, with a
 * fresh {@code CleanTimeValues}, to the reader-based {@code clean} overload.
 *
 * @param htmlContent HTML text to clean
 * @return the node returned by the reader-based overload
 * @throws HtmlCleanerException if the delegate reports an {@link IOException}
 */
public TagNode clean(String htmlContent) {
  try {
    final StringReader source = new StringReader(htmlContent);
    final CleanTimeValues cleanTimeValues = new CleanTimeValues();
    return clean(source, cleanTimeValues);
  } catch (IOException ioFailure) {
    // should never happen because reading from StringReader
    throw new HtmlCleanerException(ioFailure);
  }
}

 // NOTE: the argument to clean(...) is elided in this snippet; supply the HTML source to clean.
 HtmlCleaner cleaner = new HtmlCleaner();
TagNode root= cleaner.clean(...);
// Look up the elements named "tr" under the cleaned root.
TagNode[] trNodes= root.getElementsByName("tr");
// Print the text of each returned element.
for (TagNode trNode : trNodes) {
  System.out.println("All text inside this <tr> tag (including children): " + trNode.getText());
}

 import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small demo: cleans a fixed HTML fragment, looks up a {@code div} element in
 * the result and prints the value of its {@code style} attribute.
 */
public class Test {

  /** Markup handed to the cleaner: a styled {@code div} wrapping a single {@code input}. */
  private static final String SAMPLE_HTML = "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\"><input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";

  public static void main(String[] args) throws Throwable {
    final TagNode root = new HtmlCleaner().clean(SAMPLE_HTML);
    final TagNode divNode = root.findElementByName("div", true);
    final String styleValue = divNode.getAttributeByName("style");
    System.out.println(styleValue);
  }
}

 /**
  * Cleans the given HTML and turns the resulting tag tree into a
  * {@code Document} via {@code DomSerializer.createDOM}.
  *
  * @param html HTML text to clean
  * @return the document built from the cleaned tag tree
  * @throws ParserConfigurationException if {@code createDOM} raises it
  */
 public static Document toXhtml(String html) throws ParserConfigurationException {
  final TagNode cleanedRoot = new HtmlCleaner().clean(html);
  // The serializer is given a fresh CleanerProperties instance, not the cleaner's own.
  final CleanerProperties serializerProperties = new CleanerProperties();
  return new DomSerializer(serializerProperties).createDOM(cleanedRoot);
}

 HtmlCleaner cleaner = new HtmlCleaner();
// The serializer is built from the cleaner's own properties object.
DomSerializer ser = new DomSerializer(cleaner.getProperties());
// Clean the HTML, then convert the resulting tag tree into a DOM document.
TagNode node = cleaner.clean(html);
Document myW3cDoc = ser.createDOM(node);

/**
 * Cleans the given HTML with a cleaner built from this object's properties
 * and returns the string produced by {@code getAsString(TagNode, String)}.
 *
 * @param htmlContent HTML text to clean and serialize
 * @return the serialized form of the cleaned tree, using the charset from the properties
 */
public String getAsString(String htmlContent) {
  final TagNode cleanedRoot = new HtmlCleaner(this.props).clean(htmlContent);
  final String charset = props.getCharset();
  return getAsString(cleanedRoot, charset);
}

/**
 * Convenience overload: runs an {@code HtmlCleaner} configured with
 * {@code props} over the supplied HTML, then delegates to
 * {@code getAsString(TagNode, String)} with the charset taken from {@code props}.
 *
 * @param htmlContent HTML text to clean and serialize
 * @return the string returned by the delegated {@code getAsString} call
 */
public String getAsString(String htmlContent) {
  final HtmlCleaner configuredCleaner = new HtmlCleaner(this.props);
  final TagNode root = configuredCleaner.clean(htmlContent);
  final String outputCharset = props.getCharset();
  return getAsString(root, outputCharset);
}

 /**
  * Cleans the given HTML with comment omission enabled and converts the
  * resulting tag tree into a {@code Document}.
  *
  * <p>The same {@code CleanerProperties} instance is given to both the cleaner
  * and the serializer. Previously the cleaner ran with default properties and
  * the omit-comments setting only reached the serializer, so it did not
  * influence the cleaning step.
  *
  * @param content HTML text to clean
  * @return the document built from the cleaned tag tree
  * @throws ParserConfigurationException if {@code DomSerializer.createDOM} raises it
  */
 private Document clean(String content) throws ParserConfigurationException {
  // Configure the properties before creating the cleaner so they apply while cleaning.
  CleanerProperties properties = new CleanerProperties();
  properties.setOmitComments(true);

  HtmlCleaner cleaner = new HtmlCleaner(properties);
  TagNode rootNode = cleaner.clean(content);

  // convert to DOM
  DomSerializer domSerializer = new DomSerializer(properties);
  return domSerializer.createDOM(rootNode);
}

Javadoc

Basic version of the cleaning call.

Popular methods of HtmlCleaner

<init>
getProperties
getInnerHtml
addAttributesToTag
Add attributes from the specified map to the specified tag. If some attribute already exists, it is preser…
addIfNeededToPruneSet
addPossibleHeadCandidate
Checks if specified tag with specified info is candidate for moving to head section.
addPruneNode
calculateRootNode
Assigns the root node to an internal variable and adds necessary xmlns attributes if the cleaner is namespaces…
closeAll
Close all unclosed tags if there are any.
closeSnippet
Forced closing
createDocumentNodes
createTagNode

Popular in Java

Finding current android device location
scheduleAtFixedRate (ScheduledExecutorService)
runOnUiThread (Activity)
setRequestProperty (URLConnection)
FileNotFoundException (java.io)
Thrown when a file specified by a program cannot be found.
Hashtable (java.util)
A plug-in replacement for JDK1.5 java.util.Hashtable. This version is based on org.cliffc.high_scale
ThreadPoolExecutor (java.util.concurrent)
An ExecutorService that executes each submitted task using one of possibly several pooled threads, n
Handler (java.util.logging)
A Handler object accepts a logging request and exports the desired messages to a target, for example
Options (org.apache.commons.cli)
Main entry-point into the library. Options represents a collection of Option objects, which describ
Response (javax.ws.rs.core)
Defines the contract between a returned instance and the runtime when an application needs to provid
Top Vim plugins

How to use the clean method in org.htmlcleaner.HtmlCleaner

Best Java code snippets using org.htmlcleaner.HtmlCleaner.clean (Showing top 20 results out of 315)

How to use the clean method in org.htmlcleaner.HtmlCleaner