// Clean the input and detach the first <div> whose id attribute is "something", if one exists.
TagNode root = htmlCleaner.clean( stream );
// Attribute tests need the '@' prefix inside the XPath predicate.
Object[] found = root.evaluateXPath( "//div[@id='something']" );
// Test the first match itself: the result array as a whole can never be a TagNode.
if( found.length > 0 && found[0] instanceof TagNode ) {
    ((TagNode)found[0]).removeFromTree();
}
// Clean an HTML fragment whose <div> and <table> are left unclosed, then convert
// the resulting node tree into a W3C DOM document.
HtmlCleaner fragmentCleaner = new HtmlCleaner();
TagNode tagNode = fragmentCleaner.clean( "<div><table><td id='1234 foo 5678'>Hello</td>");
DomSerializer fragmentSerializer = new DomSerializer( new CleanerProperties() );
org.w3c.dom.Document doc = fragmentSerializer.createDOM(tagNode);
@Override public String select(String text) { try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try {
/**
 * Cleans the given HTML text with a new {@link HtmlCleaner} built from this
 * object's properties and serializes the resulting tree to a string.
 *
 * @param htmlContent HTML text to clean and serialize
 * @return serialized output, produced with the charset taken from the properties
 */
public String getAsString(String htmlContent) {
    final TagNode cleanedRoot = new HtmlCleaner(this.props).clean(htmlContent);
    return getAsString(cleanedRoot, props.getCharset());
}
// Clean htmlPage with a newly created HtmlCleaner (no-arg constructor) and store the result in rootNode.
HtmlCleaner cleaner = new HtmlCleaner(); rootNode = cleaner.clean(htmlPage);
/**
 * Creates the function and configures its {@link HtmlCleaner}.
 * <p>
 * The cleaner's properties are adjusted in place: comments are omitted,
 * {@code translateSpecialEntities} and {@code transResCharsToNCR} are switched on,
 * and the tags {@code style, script, form, object, audio, video} are pruned.
 */
public CleanHtmlFunction() {
    this.cleaner = new HtmlCleaner();
    CleanerProperties p = cleaner.getProperties();
    p.setOmitComments(true);
    p.setTranslateSpecialEntities(true);
    p.setTransResCharsToNCR(true);
    // remove all tags that contain uninteresting content
    // (this comment must sit on its own line so it does not swallow the call below)
    p.setPruneTags("style,script,form,object,audio,video");
}
/**
 * Creates and configures the shared cleaner the first time it is called.
 * Later calls return immediately because {@code htmlCleanerInitialized} is already set.
 * The method is {@code static synchronized}, so concurrent callers are serialized.
 */
private static synchronized void initCleaner() {
    if (htmlCleanerInitialized) {
        return;
    }
    cleaner = new HtmlCleaner();
    final CleanerProperties settings = cleaner.getProperties();
    settings.setOmitComments(true);
    settings.setOmitXmlDeclaration(true);
    htmlCleanerInitialized = true;
}
/**
 * Decides whether a node should be written in minimized (self-closing) form.
 * <p>
 * That is the case when the node is empty, its tag info (if any) permits the
 * minimized form, and either the properties ask for empty-element tags or the
 * tag info marks the tag as an empty tag.
 *
 * @param tagNode node about to be serialized
 * @return {@code true} if the minimized syntax applies to this node
 */
protected boolean isMinimizedTagSyntax(TagNode tagNode) {
    final TagInfo info = props.getTagInfoProvider().getTagInfo(tagNode.getName());
    if (!tagNode.isEmpty()) {
        return false;
    }
    if (info != null && !info.isMinimizedTagPermitted()) {
        return false;
    }
    if (props.isUseEmptyElementTags()) {
        return true;
    }
    return info != null && info.isEmptyTag();
}
/**
 * Tells whether the given node is a content node or a tag whose registered
 * display type is {@code Display.inline}.
 *
 * @param node node to inspect; anything that is neither a ContentNode nor a TagNode yields {@code false}
 * @return {@code true} for content nodes and for tags with known inline display
 */
private boolean isContentOrInline(Object node) {
    if (node instanceof ContentNode) {
        return true;
    }
    if (!(node instanceof TagNode)) {
        return false;
    }
    final String tagName = ((TagNode) node).getName();
    final TagInfo info = props.getTagInfoProvider().getTagInfo(tagName);
    return info != null && info.getDisplay() == Display.inline;
}
/**
 * Resolves the destination tag name for the given tag.
 *
 * @param tagName original tag name
 * @return the destination tag of the matching transformation, or {@code tagName}
 *         itself when no transformation is registered or the lookup yields {@code null}
 */
public String getTagName(String tagName) {
    if (!hasTransformationForTag(tagName)) {
        return tagName;
    }
    final TagTransformation mapping = getTransformation(tagName);
    return mapping != null ? mapping.getDestTag() : tagName;
}
public ConfigFileTagProvider(InputSource inputSource) { try { new ConfigParser(this).parse(inputSource); } catch (Exception e) { throw new HtmlCleanerException("Error parsing tag configuration file!", e); } }
/**
 * Writes the given node to the output stream using the charset configured in the
 * properties, optionally leaving out the node's own open and close tags.
 *
 * @param tagNode Node to be written
 * @param out Output stream
 * @param omitEnvelope Tells whether to skip open and close tag of the node.
 * @throws IOException propagated from the charset-aware overload this method delegates to
 */
public void writeToStream(TagNode tagNode, OutputStream out, boolean omitEnvelope) throws IOException {
    final String charset = props.getCharset();
    writeToStream(tagNode, out, charset, omitEnvelope);
}
/**
 * Serializes the given node to a string using the charset configured in the properties.
 *
 * @param tagNode Node to serialize to string
 * @param omitEnvelope Tells whether to skip open and close tag of the node.
 * @return Output as string
 */
public String getAsString(TagNode tagNode, boolean omitEnvelope) {
    final String charset = props.getCharset();
    return getAsString(tagNode, charset, omitEnvelope);
}
/**
 * Registers the tag definitions for {@code span}, {@code style}, {@code bgsound},
 * {@code meta} and {@code base}, in that order.
 *
 * @param tagInfo not read; each registered definition is created fresh inside the method
 */
public void styleElements(TagInfo tagInfo) {
    // Inline, body-level element that needs a closing tag.
    this.put("span",
            new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));

    // Head-level elements.
    this.put("style",
            new TagInfo("style", ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none));
    this.put("bgsound",
            new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
    this.put("meta",
            new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
    this.put("base",
            new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
}
/**
 * Registers the two HTML5 edit tags, {@code ins} and {@code del}.
 *
 * @param tagInfo not read; each registered definition is created fresh inside the method
 */
public void editTags(TagInfo tagInfo) {
    final TagInfo insInfo =
            new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    this.put("ins", insInfo);

    final TagInfo delInfo =
            new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    this.put("del", delInfo);
}
List<String> results = new ArrayList<String>(); try { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode tagNode = htmlCleaner.clean(text); Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); Object result; try {
/**
 * Parses raw HTML with a cleaner configured from this object's properties and
 * returns the serialized form of the cleaned tree.
 *
 * @param htmlContent HTML text to clean and serialize
 * @return serialized output, produced with the charset taken from the properties
 */
public String getAsString(String htmlContent) {
    final HtmlCleaner parser = new HtmlCleaner(this.props);
    final TagNode parsedRoot = parser.clean(htmlContent);
    final String charset = props.getCharset();
    return getAsString(parsedRoot, charset);
}
// Clean htmlPage with a newly created HtmlCleaner (no-arg constructor) and store the result in rootNode.
HtmlCleaner cleaner = new HtmlCleaner(); rootNode = cleaner.clean(htmlPage);
/**
 * Decides whether a node should be written in minimized (self-closing) form:
 * the node must be empty, its tag info (if any) must permit the minimized form,
 * and either the properties request empty-element tags or the tag info marks
 * the tag as an empty tag.
 *
 * @param tagNode node about to be serialized
 * @return {@code true} if the minimized syntax applies to this node
 */
protected boolean isMinimizedTagSyntax(TagNode tagNode) { final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName()); return tagNode.isEmpty() && (tagInfo == null || tagInfo.isMinimizedTagPermitted()) && ( props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()) ); } protected void serializeOpenTag(TagNode tagNode, Writer writer) throws IOException {
/**
 * Reports whether a node is a content node, or a tag node whose tag info is
 * known and has display type {@code Display.inline}.
 *
 * @param node node to classify
 * @return {@code true} for content nodes and known inline tags, {@code false} otherwise
 */
private boolean isContentOrInline(Object node) {
    if (node instanceof ContentNode) {
        return true;
    }
    if (node instanceof TagNode) {
        final TagNode tag = (TagNode) node;
        final TagInfo tagDetails = props.getTagInfoProvider().getTagInfo(tag.getName());
        return tagDetails != null && tagDetails.getDisplay() == Display.inline;
    }
    return false;
}