org.htmlcleaner.HtmlCleaner java code examples

Refine search

// Parse htmlPage with a cleaner created via the no-arg constructor and keep the
// resulting tree in rootNode (declared outside this excerpt).
HtmlCleaner cleaner = new HtmlCleaner();
rootNode = cleaner.clean(htmlPage);

@Override
public String select(String text) {
  try {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tagNode = htmlCleaner.clean(text);
    Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    Object result;
    try {

  cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile)));
} else {
  cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setAdvancedXmlEscape(this.advancedxmlescape);
props.setUseCdataForScriptAndStyle(this.usecdata);
props.setTranslateSpecialEntities(this.specialentities);
props.setRecognizeUnicodeChars(this.unicodechars);
props.setOmitUnknownTags(this.omitunknowntags);
  cleaner.initCleanerTransformations(transInfos);
  try {
    if ( src != null && (src.startsWith("http://") || src.startsWith("https://")) ) {
      node = cleaner.clean(new URL(src), incharset);
    } else if (src != null) {
      node = cleaner.clean(new File(src), incharset);
    } else {
      node = cleaner.clean(text);
    final Object[] xpathResult = node.evaluateXPath(nodebyxpath);
    for (Object element : xpathResult) {
      if ( element instanceof TagNode ) {

HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);
try {
  URL url = new URL(playUrl);
  URLConnection conn = url.openConnection();
  TagNode node = cleaner.clean(new InputStreamReader(conn.getInputStream()));
  Object[] new_nodes = node.evaluateXPath("//*[@class='recent-change']");
  Object[] version_nodes = node.evaluateXPath("//*[@itemprop='softwareVersion']");
    whatsNew += info_node.getAllChildren().get(0).toString().trim()
        + "\n";

 // Configure the cleaner used to parse the downloaded page.
final HtmlCleaner mCleaner = new HtmlCleaner();
CleanerProperties props = mCleaner.getProperties();
props.setAllowHtmlInsideAttributes(true);
props.setAllowMultiWordAttributes(true);
props.setRecognizeUnicodeChars(true);
props.setOmitComments(true);

// URL from which the data is fetched, and the XPath selecting the nodes of interest.
String mSiteUrl = "http://www.example.com";
String mXPath = "//div";

// Establish the connection.
URL url = new URL(mSiteUrl);
final URLConnection mCCon = url.openConnection();

// TagNode storing the data received from the URL.
final TagNode mGetDataFromUrl =
    mCleaner.clean(new InputStreamReader(mCCon.getInputStream()));

// Evaluate the XPath to locate the data to retrieve.
Object[] mPageData = mGetDataFromUrl.evaluateXPath(mXPath);

// Only read the first match when the XPath actually matched something.
if (mPageData.length > 0) {
  TagNode mXPathParsedData = (TagNode) mPageData[0];
  // All text in the div is in mData. getText() does not return a String,
  // so convert explicitly before trimming.
  String mData = mXPathParsedData.getText().toString().trim();
}

// Create a cleaner and adjust its properties before parsing contentStr.
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setUseCdataForScriptAndStyle(false);
props.setOmitComments(true);
props.setOmitUnknownTags(true);
props.setOmitDoctypeDeclaration(true);
props.setOmitXmlDeclaration(true);
// Parse the content, then pass the resulting tree to ExtractUtils.cleanInvalidAttributes.
TagNode nodes = cleaner.clean(contentStr);
ExtractUtils.cleanInvalidAttributes(nodes);
// Declared only; presumably assigned by code that follows this excerpt.
Document doc;

/**
 * Checks whether the given HTML is considered empty once it has been cleaned.
 *
 * @param html the HTML markup to validate
 * @return a set containing {@code ValidatorMessages.HTML_IS_EMPTY} when {@code isEmpty}
 *     reports the cleaned tree as empty; otherwise an empty set
 */
public Set<String> validateNonEmpty(String html) {
  final HtmlCleaner htmlCleaner = new HtmlCleaner();

  // Tune the cleaner's own properties before parsing.
  final CleanerProperties config = htmlCleaner.getProperties();
  config.setOmitXmlDeclaration(true);
  config.setOmitHtmlEnvelope(true);
  config.setOmitComments(true);
  config.setNamespacesAware(false);
  config.setDeserializeEntities(true);

  final Set<String> messages = new HashSet<>();
  final boolean empty = isEmpty(htmlCleaner.clean(html));
  if (empty) {
    messages.add(ValidatorMessages.HTML_IS_EMPTY);
  }
  return messages;
}

/**
 * Cleans the relevant file and generates a valid XML file ready for processing to Sel 2 java File.
 *
 * <p>The output file is placed in the directory named by the {@code java.io.tmpdir} system
 * property, using the source file name with an {@code .xml} suffix, and is written as UTF-8.
 *
 * @param absoluteFilename - name of the file to convert.
 * @return String - location of the converted file, or {@code null} when
 *     {@code absoluteFilename} refers to a directory.
 * @throws Exception propagated from the file handling, cleaning or serialization calls.
 */
public String convertToXML(String absoluteFilename) throws Exception {
  FileHandler fromSelIDE = new FileHandler(absoluteFilename);
  if (fromSelIDE.getFile().isDirectory()) {
    LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName());
    return null;
  }
  // Create the output handler only after the directory check, so the early return above
  // does not leave an unused (and never closed) handler behind.
  FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true);
  //Clean up html so that we can read it as XML properly
  HtmlCleaner cleaner = new HtmlCleaner();
  CleanerProperties xmlPrefs = cleaner.getProperties();
  xmlPrefs.setUseEmptyElementTags(true);
  xmlPrefs.setTranslateSpecialEntities(true);
  xmlPrefs.setTransResCharsToNCR(true);
  xmlPrefs.setOmitComments(true);
  xmlPrefs.setOmitDoctypeDeclaration(true);
  xmlPrefs.setNamespacesAware(false);
  try {
    // xmlPrefs are the cleaner's own properties, so the cleaner is reused as-is.
    TagNode tagNode = cleaner.clean(fromSelIDE.getFile());
    new PrettyXmlSerializer(xmlPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8");
  } finally {
    // Close the output even when cleaning or serialization fails.
    toXML.close();
  }
  return toXML.getAbsoluteFile();
}

 // Parse htmlString with namespace awareness switched off on the cleaner's properties.
 HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setNamespacesAware(false);
TagNode mainNode = cleaner.clean(htmlString);

/**
 * Converts an HTML string into an XML string using HtmlCleaner.
 *
 * <p>Comments are omitted, {@code script} and {@code style} tags are pruned and namespaces
 * are ignored. If an {@link IOException} occurs, it is logged and the original
 * {@code source} is returned unchanged.
 *
 * @param source the HTML markup to convert
 * @return the XML produced by {@code PrettyXmlSerializer}, or {@code source} itself when
 *     the conversion fails with an {@link IOException}
 */
private String toXML(String source){
  CleanerProperties props = new CleanerProperties();
  props.setTranslateSpecialEntities(true);
  props.setOmitComments(true);
  props.setPruneTags("script,style");
  // Ignore namespaces.
  props.setNamespacesAware(false);
  props.setAdvancedXmlEscape(true);
  try {
    TagNode tagNode = new HtmlCleaner(props).clean(source);
    return new PrettyXmlSerializer(props).getXmlAsString(tagNode);
  } catch (IOException e) {
    logger.error("Failed to convert HTML to XML", e);
    // Fall back to the untouched input, as callers receive on failure.
    return source;
  }
}
// for testing

HtmlCleaner cleaner = new HtmlCleaner()
CleanerProperties props = cleaner.getProperties()
TagNode node = cleaner.clean( pageContent )
TagNode titleNode = node.findElementByName("title", true);

private Document createDom(String data) {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(false);
    props.setRecognizeUnicodeChars(true);
    props.setUseEmptyElementTags(true);
    props.setAdvancedXmlEscape(true);
    props.setTranslateSpecialEntities(false);
        data = XmlUtils.removeNamespace(data);
      TagNode tagNode = cleaner.clean(data);
      org.w3c.dom.Document doc = null;
      try {

protected HttpMethod submitLanguageSelectionForm(HttpMethod logonMethod) throws IOException {
  PostMethod postLanguageFormMethod;
  HtmlCleaner cleaner = new HtmlCleaner();
    TagNode node = cleaner.clean(logonMethod.getResponseBodyAsStream());
    List forms = node.getElementListByName("form", true);
    TagNode languageForm;
      throw new IOException("Form not found");
    String languageMethodPath = languageForm.getAttributeByName("action");
    List inputList = languageForm.getElementListByName("input", true);
    for (Object input : inputList) {
      String name = ((TagNode) input).getAttributeByName("name");

 /**
  * Cleans the given markup with HtmlCleaner and converts the resulting tree to a DOM document.
  *
  * @param content the HTML markup to clean
  * @return the DOM document created from the cleaned tree
  * @throws ParserConfigurationException propagated from the DOM serialization
  */
 private Document clean(String content) throws ParserConfigurationException {
  TagNode cleanedRoot = new HtmlCleaner().clean(content);

  // The serializer gets its own properties, with comments omitted.
  CleanerProperties serializerProps = new CleanerProperties();
  serializerProps.setOmitComments(true);

  return new DomSerializer(serializerProps).createDOM(cleanedRoot);
}

/**
 * Initializes the {@code cleaner}, {@code domSerializer} and {@code documentBuilder} fields.
 *
 * @throws IllegalStateException if a {@code DocumentBuilder} cannot be created
 */
private void init() {

  // Initialize HTMLCleaner
  cleaner = new HtmlCleaner();
  CleanerProperties props = cleaner.getProperties();
  props.setAllowHtmlInsideAttributes(true);
  props.setAllowMultiWordAttributes(true);
  props.setRecognizeUnicodeChars(true);
  props.setOmitComments(true);
  props.setNamespacesAware(false);

  // Initialize DomSerializer with the same properties the cleaner uses
  domSerializer = new DomSerializer(props);

  // Initialize xml parser
  try {
    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
    documentBuilder = documentBuilderFactory.newDocumentBuilder();
  } catch (ParserConfigurationException e) {
    // Not expected for an unconfigured factory, but fail fast rather than leave
    // documentBuilder unset and hit a NullPointerException later.
    throw new IllegalStateException("Unable to create a DocumentBuilder", e);
  }
}

 /**
  * Cleans an HTML source string and converts it into a DOM document.
  *
  * <p>The cleaner is configured to allow HTML inside attributes, allow multi-word attributes,
  * recognize Unicode characters and omit comments. The same properties are used for the DOM
  * serialization.
  *
  * @param source the HTML markup to convert
  * @return the DOM document, or {@code null} if a {@link ParserConfigurationException} occurs
  * @throws InterruptedException declared for callers; not thrown by the code in this method
  * @throws IOException declared for callers; may be propagated by the cleaning step
  */
 public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
  HtmlCleaner cleaner = new HtmlCleaner();
  CleanerProperties props = cleaner.getProperties();
  props.setAllowHtmlInsideAttributes(true);
  props.setAllowMultiWordAttributes(true);
  props.setRecognizeUnicodeChars(true);
  props.setOmitComments(true);

  try {
    // Use the cleaner and properties configured above. Previously a fresh HtmlCleaner and
    // fresh CleanerProperties were used here, so none of the settings took effect.
    TagNode tagNode = cleaner.clean(source);
    return new DomSerializer(props).createDOM(tagNode);
  } catch (ParserConfigurationException ex) {
    ex.printStackTrace();
    return null;
  }
}

 import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small demo: cleans a fixed HTML fragment and prints the {@code style} attribute of the
 * first {@code div} element found in the cleaned tree.
 */
public class Test {

  /**
   * Runs the demo.
   *
   * @param args ignored
   * @throws Throwable anything raised while cleaning or reading the tree
   */
  public static void main(String[] args) throws Throwable {
    String markup = "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\"><input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";

    TagNode root = new HtmlCleaner().clean(markup);
    TagNode firstDiv = root.findElementByName("div", true);
    String styleValue = firstDiv.getAttributeByName("style");
    System.out.println(styleValue);
  }
}

/**
 * Cleans the given HTML with the {@code parser} field, treating {@code null} as an empty string.
 *
 * @param html the markup to parse; may be {@code null}
 * @return the tree produced by {@code parser.clean}
 */
private TagNode parse(String html) {
  String markup = (html != null) ? html : "";
  return parser.clean(markup);
}

/**
 * Creates the function with an HtmlCleaner that omits comments, translates special entities,
 * converts reserved characters to numeric character references and prunes a fixed set of tags.
 */
public CleanHtmlFunction() {
  HtmlCleaner htmlCleaner = new HtmlCleaner();

  CleanerProperties settings = htmlCleaner.getProperties();
  settings.setOmitComments(true);
  settings.setTranslateSpecialEntities(true);
  settings.setTransResCharsToNCR(true);
  // Drop every tag whose content is not interesting for the output.
  settings.setPruneTags("style,script,form,object,audio,video");

  this.cleaner = htmlCleaner;
}

 // Clean a fragment whose div and table are never closed, then convert the resulting
 // tree to a W3C DOM document using default CleanerProperties.
 TagNode tagNode = new HtmlCleaner().clean(
    "<div><table><td id='1234 foo 5678'>Hello</td>");
org.w3c.dom.Document doc = new DomSerializer(
    new CleanerProperties()).createDOM(tagNode);

Javadoc

Main HtmlCleaner class.

It represents the public interface to the user. Its task is to call the tokenizer with the specified source HTML, traverse the list of produced tokens and create the internal object model. It also offers a set of methods to write the resulting XML to a string, file or any output stream.

Typical usage is the following:

// create an instance of HtmlCleaner
HtmlCleaner cleaner = new HtmlCleaner();

// take default cleaner properties
CleanerProperties props = cleaner.getProperties();

// customize cleaner's behavior with property setters
props.setXXX(...);

// Clean HTML taken from simple string, file, URL, input stream,
// input source or reader. Result is root node of created
// tree-like structure. Single cleaner instance may be safely used
// multiple times.
TagNode node = cleaner.clean(...);

// optionally find parts of the DOM or modify some nodes
TagNode[] myNodes = node.getElementsByXXX(...);
// and/or
Object[] myNodes = node.evaluateXPath(xPathExpression);
// and/or
aNode.removeFromTree();
// and/or
aNode.addAttribute(attName, attValue);
// and/or
aNode.removeAttribute(attName, attValue);
// and/or
cleaner.setInnerHtml(aNode, htmlContent);
// and/or do some other tree manipulation/traversal

// serialize a node to a file, output stream, DOM, JDom...
new XXXSerializer(props).writeXmlXXX(aNode, ...);
myJDom = new JDomSerializer(props, true).createJDom(aNode);
myDom = new DomSerializer(props, true).createDOM(aNode);

Most used methods

clean
<init>
getProperties
getInnerHtml
addAttributesToTag
Add attributes from specified map to the specified tag. If some attribute already exist it is preser
addIfNeededToPruneSet
addPossibleHeadCandidate
Checks if specified tag with specified info is candidate for moving to head section.
addPruneNode
calculateRootNode
Assigns root node to internal variable and adds necessary xmlns attributes if cleaner is namespaces
closeAll
Close all unclosed tags if there are any.
closeSnippet
Forced closing
createDocumentNodes

Popular in Java

Start an intent from android
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
runOnUiThread (Activity)
requestLocationUpdates (LocationManager)
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
File (java.io)
An "abstract" representation of a file system entity identified by a pathname. The pathname may be a
PrintWriter (java.io)
Wraps either an existing OutputStream or an existing Writer and provides convenience methods for prin
LinkedList (java.util)
Doubly-linked list implementation of the List and Deque interfaces. Implements all optional list oper
ServletException (javax.servlet)
Defines a general exception a servlet can throw when it encounters difficulty.
IsNull (org.hamcrest.core)
Is the value null?
Top Sublime Text plugins

How to use HtmlCleaner in org.htmlcleaner

Best Java code snippets using org.htmlcleaner.HtmlCleaner (Showing top 20 results out of 315)

Refine search

How to use
HtmlCleaner
in
org.htmlcleaner