Refine search
// Parse the input and detach the first <div id="something"> from the tree, if there is one.
TagNode root = htmlCleaner.clean(stream);
// "@id" addresses the id attribute; a bare "id" in the predicate would instead refer to a
// child element named <id>, so the original expression could not match on the attribute.
Object[] found = root.evaluateXPath("//div[@id='something']");
// The result is an Object[]; the type check has to be made on the first element,
// not on the array itself (an array is never a TagNode).
if (found.length > 0 && found[0] instanceof TagNode) {
    ((TagNode) found[0]).removeFromTree();
}
HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); try { URL url = new URL(playUrl); URLConnection conn = url.openConnection(); TagNode node = cleaner.clean(new InputStreamReader(conn.getInputStream())); Object[] new_nodes = node.evaluateXPath("//*[@class='recent-change']"); Object[] version_nodes = node.evaluateXPath("//*[@itemprop='softwareVersion']"); whatsNew += info_node.getAllChildren().get(0).toString().trim() + "\n"; version = ver.getAllChildren().get(0).toString().trim();
if (startingTag.hasAttribute("xmlns")) { cleanTimeValues.namespace.pop(); if (i > 0 && tag != null && tag.isContinueAfter(closedTag.getName())) { TagNode cloned = closedTag.makeCopy(); cloned.setAutoGenerated(true); nodeIterator.add(cloned); nodeIterator.previous(); String tagName = startTagToken.getName(); if (startTagToken.hasAttribute("xmlns")) { String ns = startTagToken.getAttributeByName("xmlns"); Map<String, String> attributes = startTagToken.getAttributes(); attributes.put("xmlns", "http://www.w3.org/1999/xhtml"); startTagToken.setAttributes(attributes); startTagToken.removeAttribute("xmlns"); } else { startTagToken.removeAttribute("xmlns"); } else { cleanTimeValues.namespace.push(ns); startTagToken.addNamespaceDeclaration("", ns); startTagToken.setForeignMarkup(true);
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small driver: cleans a fixed HTML fragment with HtmlCleaner and prints the
 * {@code style} attribute of the first {@code div} found in the cleaned tree.
 */
public class Test {

    /** The HTML fragment fed to the cleaner. */
    private static final String SAMPLE_HTML = "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\"><input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";

    public static void main(String[] args) throws Throwable {
        TagNode cleanedRoot = new HtmlCleaner().clean(SAMPLE_HTML);
        // Recursive lookup of the first <div> in the cleaned document.
        TagNode firstDiv = cleanedRoot.findElementByName("div", true);
        String styleValue = firstDiv.getAttributeByName("style");
        System.out.println(styleValue);
    }
}
HtmlCleaner cleaner = new HtmlCleaner(); TagNode pageData = cleaner.clean(headContentsStr); TagNode headElement = pageData.findElementByName("head", true); if (headElement.hasAttribute("prefix")) String namespaceData = headElement.getAttributeByName("prefix"); Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); Matcher matcher = pattern.matcher(namespaceData); TagNode[] metaData = pageData.getElementsByName("meta", true); for (TagNode metaElement : metaData) if (metaElement.hasAttribute("property")) target = "property"; else if (metaElement.hasAttribute("name")) target = "name"; if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); break;
if ( i > 0 && tag != null && tag.isContinueAfter(closedTag.getName()) ) { TagNode cloned = closedTag.makeCopy(); cloned.setAutoGenerated(true); nodeIterator.add( cloned ); nodeIterator.previous(); String tagName = startTagToken.getName(); addAttributesToTag(cleanTimeValues.htmlNode, startTagToken.getAttributes()); nodeIterator.set(null); addAttributesToTag(cleanTimeValues.bodyNode, startTagToken.getAttributes()); nodeIterator.set(null); addAttributesToTag(cleanTimeValues.headNode, startTagToken.getAttributes()); nodeIterator.set(null); requiredParentStartToken.setAutoGenerated(true); nodeIterator.previous(); nodeIterator.add(requiredParentStartToken); boolean certainty = startTagToken.hasAttribute("id") ? false : true; while (closedIt.hasPrevious()) { TagNode currStartToken = (TagNode) closedIt.previous(); if ( tag.isCopy(currStartToken.getName()) ) { toBeCopied.add(0, currStartToken); } else {
public void process(Page page) { // TODO Auto-generated method stub HtmlCleaner htmlCleaner = new HtmlCleaner(); // 相当于htmlcleaner对页面进行处理 TagNode rootNode = htmlCleaner.clean(page.getContent()); if(page.getUrl().startsWith("http://item.jd.com")){//表示是商品详情页 processProduct(page, rootNode); }else{//处理页面的url String next_url = HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"J_topPage\"]/a[2]", "href"); if(!next_url.equals("javascript:;")){ System.out.println("http://list.jd.com"+next_url.replace("&", "&")); String x = "http://list.jd.com"+next_url.replace("&", "&"); page.addUrl(x); } try { Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a"); for (Object object : evaluateXPath) { TagNode tagNode = (TagNode)object; String goodsUrl = tagNode.getAttributeByName("href"); page.addUrl(goodsUrl); } } catch (XPatherException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /**
protected HttpMethod submitLanguageSelectionForm(HttpMethod logonMethod) throws IOException { PostMethod postLanguageFormMethod; HtmlCleaner cleaner = new HtmlCleaner(); TagNode node = cleaner.clean(logonMethod.getResponseBodyAsStream()); List forms = node.getElementListByName("form", true); TagNode languageForm; throw new IOException("Form not found"); String languageMethodPath = languageForm.getAttributeByName("action"); List inputList = languageForm.getElementListByName("input", true); for (Object input : inputList) { String name = ((TagNode) input).getAttributeByName("name"); String value = ((TagNode) input).getAttributeByName("value"); if (name != null && value != null) { postLanguageFormMethod.addParameter(name, value); List selectList = languageForm.getElementListByName("select", true); for (Object select : selectList) { String name = ((TagNode) select).getAttributeByName("name"); List optionList = ((TagNode) select).getElementListByName("option", true); String value = null; for (Object option : optionList) { if (((TagNode) option).getAttributeByName("selected") != null) { value = ((TagNode) option).getAttributeByName("value"); break;
/**
 * Replaces the children of the given tag node with the cleaned form of the supplied html.
 * The content is wrapped in a tag with the node's name (carrying a marker attribute) and
 * then in tags named after each of the node's ancestors; that string is cleaned, the
 * marked node is looked up in the result, and its children are set on the original node.
 *
 * @param node    node whose content is replaced; nothing happens when it is null
 * @param content html to place inside the node
 */
public void setInnerHtml(TagNode node, String content) {
    if (node == null) {
        return;
    }

    final String ownName = node.getName();
    final StringBuilder markup = new StringBuilder();
    markup.append("<").append(ownName).append(" " + MARKER_ATTRIBUTE + "=''>");
    markup.append(content);
    markup.append("</").append(ownName).append(">");

    // Surround the fragment with the ancestor tags, innermost first.
    for (TagNode ancestor = node.getParent(); ancestor != null; ancestor = ancestor.getParent()) {
        final String ancestorName = ancestor.getName();
        markup.insert(0, "<" + ancestorName + ">");
        markup.append("</").append(ancestorName).append(">");
    }

    final TagNode recleanedRoot = clean(markup.toString());
    final TagNode markedNode = recleanedRoot.findElementHavingAttribute(MARKER_ATTRIBUTE, true);
    if (markedNode != null) {
        node.setChildren(markedNode.getAllChildren());
    }
}
/**
 * Cleans {@code c:/test.xml}, evaluates a fixed XPath expression against the result and
 * prints the text of the first matching node.
 *
 * @param arg unused
 */
public static void main(String[] arg) {
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
        TagNode nodes = cleaner.clean(new File("c:/test.xml"));
        // NOTE(review): text(.,'In') is not standard XPath 1.0 syntax - confirm the
        // evaluator accepts it and selects what is intended.
        Object[] objects = nodes.evaluateXPath("//div/a[text(.,'In')]");
        // Check the result before indexing and casting, instead of letting an empty or
        // non-element result surface as an exception in the catch-all below.
        if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
            System.out.println(((TagNode) objects[0]).getText());
        } else {
            System.err.println("No matching element found");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
final TagNode rootNode = htmlCleaner.clean(html); final TagNode[] anchorTags = rootNode.getElementsByName("a", true); String documentPath = anchorTag.getAttributeByName("href"); if (StringUtils.isBlank(documentPath)) { continue; documentPath); final ContentNode textContent = new ContentNode(anchorTag.getText().toString()); anchorTag.getParent().insertChildAfter(anchorTag, textContent); anchorTag.getParent().removeChild(anchorTag); continue; } else { final TagNode[] imageTags = rootNode.getElementsByName("img", true); final String srcPath = imageTag.getAttributeByName("src"); if (StringUtils.isBlank(srcPath)) { continue; final TagNode[] targetNodes = rootNode.getElementsByName("body", false); if (targetNodes.length > 0) { TagNode bodyNode = targetNodes[0]; return htmlCleaner.getInnerHtml(bodyNode); } else { log.warn("Cannot rewrite content for '{}' because there is no 'body' element" + node.getPath());
if (properties.isOmitHtmlEnvelope()) { List bodyChildren = cleanTimeValues.bodyNode.getAllChildren(); cleanTimeValues.rootNode = new TagNode(null); if (bodyChildren != null) { for (Iterator iterator = bodyChildren.iterator(); iterator.hasNext(); ) { Object currChild = iterator.next(); cleanTimeValues.rootNode.addChild(currChild); Map<String, String> atts = cleanTimeValues.rootNode.getAttributes(); if (cleanTimeValues.rootNode.hasAttribute("xmlns")) { cleanTimeValues.rootNode.addNamespaceDeclaration("", cleanTimeValues.rootNode.getAttributeByName("xmlns")); if (properties.isNamespacesAware() && namespacePrefixes != null) { Iterator<String> iterator = namespacePrefixes.iterator(); while (iterator.hasNext()) { if (Thread.currentThread().isInterrupted()) { handleInterruption(); return; cleanTimeValues.rootNode.addAttribute(xmlnsAtt, prefix);
pushNesting(cleanTimeValues); cleanTimeValues._headOpened = false; cleanTimeValues._bodyOpened = false; cleanTimeValues._headTags.clear(); cleanTimeValues.allTags.clear(); cleanTimeValues.pruneTagSet = new HashSet<ITagNodeCondition>(this.properties.getPruneTagSet()); cleanTimeValues.allowTagSet = new HashSet<ITagNodeCondition>(this.properties.getAllowTagSet()); this.transformations = this.properties.getCleanerTransformations(); cleanTimeValues.pruneNodeSet.clear(); cleanTimeValues.htmlNode = this.newTagNode("html"); cleanTimeValues.bodyNode = this.newTagNode("body"); cleanTimeValues.headNode = this.newTagNode("head"); cleanTimeValues.rootNode = null; cleanTimeValues.htmlNode.addChild(cleanTimeValues.headNode); cleanTimeValues.htmlNode.addChild(cleanTimeValues.bodyNode); while (iterator.hasNext()) { TagNode tagNode = iterator.next(); TagNode parent = tagNode.getParent(); if (parent != null) { parent.removeChild(tagNode); cleanTimeValues.rootNode.setDocType( htmlTokenizer.getDocType() ); popNesting(cleanTimeValues); return cleanTimeValues.rootNode;
/**
 * Cleans the given html and picks a node from the result.
 *
 * @param html    markup to clean
 * @param tagName name of the wanted element; when null, empty, or equal (ignoring case)
 *                to the cleaned root's name, the root itself is returned
 * @return the root, or the first element returned by a recursive by-name lookup, or
 *         {@code null} when that lookup finds nothing
 * @throws RuntimeException wrapping any exception raised while cleaning or searching
 */
private static TagNode getTargetTagNode(String html, String tagName) {
    try {
        final TagNode cleanedRoot = getHtmlCleaner().clean(html);

        final boolean wantsRoot = tagName == null
                || tagName.isEmpty()
                || tagName.equalsIgnoreCase(cleanedRoot.getName());
        if (wantsRoot) {
            return cleanedRoot;
        }

        final TagNode[] matches = cleanedRoot.getElementsByName(tagName, true);
        return matches.length == 0 ? null : matches[0];
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
/**
 * Cleans the content, evaluates this extractor's xpath against it and returns one
 * wrapped string per match.
 *
 * @param content markup to extract from
 * @return the wrapped matches in evaluation order; an empty list when nothing matched
 *         (a warning is logged in that case)
 * @throws ExtractException wrapping any exception raised while cleaning, evaluating or wrapping
 */
@Override
public List<String> extractList(String content) {
    final List<String> results = new ArrayList<>();
    try {
        final HtmlCleaner cleaner = getHtmlCleaner();
        final TagNode cleanedRoot = cleaner.clean(content);
        final Object[] matches = cleanedRoot.evaluateXPath(xpath);

        if (matches == null || matches.length == 0) {
            logger.warn("not found content,xpath:{}", xpath);
            logger.debug("content:{}", content);
            return results;
        }

        for (Object match : matches) {
            results.add(wrap(match, cleaner));
        }
    } catch (Exception e) {
        throw new ExtractException(e);
    }
    return results;
}
protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException { if (!isForbiddenTag(tagNode)) { String tagName = tagNode.getName(); Map<String, String> tagAtttributes = tagNode.getAttributes(); if (props.isAddNewlineToHeadAndBody() && isHeadOrBody(tagName)) { writer.write("\n"); if (!tagNode.getText().toString().startsWith(CData.SAFE_BEGIN_CDATA)) { writer.write(CData.SAFE_BEGIN_CDATA); if (!tagNode.getText().toString().equals("")) { char firstchar = tagNode.getText().toString().charAt(0); if (firstchar != '\n' && firstchar != '\r') writer.write("\n");
if (properties.isOmitHtmlEnvelope()) { List bodyChildren = cleanTimeValues.bodyNode.getAllChildren(); cleanTimeValues.rootNode = new TagNode(null); if (bodyChildren != null) { for(Iterator iterator = bodyChildren.iterator(); iterator.hasNext(); ) { Object currChild = iterator.next(); cleanTimeValues.rootNode.addChild(currChild); Map<String, String> atts = cleanTimeValues.rootNode.getAttributes(); if (properties.isNamespacesAware() && namespacePrefixes != null) { Iterator<String> iterator = namespacePrefixes.iterator(); while (iterator.hasNext()) { cleanTimeValues.rootNode.addAttribute(xmlnsAtt, prefix);
private boolean satisfy(TagNode tagNode, boolean override) { String name = tagNode.getName(); TagInfo tagInfo = tagInfoProvider.getTagInfo(name); CharSequence contentString = tagNode.getText(); if(isEmptyString(contentString)) { if (tagNode.isEmpty()) { return true; } else { for(Object child: tagNode.getAllChildren()) {
try { objs = tagNode.evaluateXPath(xpath); if (objs != null && objs.length > 0) { TagNode paramNode = (TagNode) obj; objs = paramNode.evaluateXPath(paramTitleXpath); String paramTitle = null; if (objs != null && objs.length > 0) { TagNode paramTitleNode = (TagNode) objs[0]; paramTitle = paramTitleNode.getText().toString(); objs = paramNode.evaluateXPath(paramValueXpath); JSONObject dlJsonObject = null; if (objs != null && objs.length > 0) { dlJsonObject = new JSONObject(); TagNode dlNode = (TagNode) objs[0]; List<TagNode> childTagList = dlNode.getChildTagList(); for (int i = 0; i < childTagList.size(); i = i + 2) { if (chileTagValue.getAttributeByName("class") != null) { // 处理表格中的?提示单元格,它多了class="Ptable-tips"的属性 dlJsonObject.put(childTagTitle.getText().toString().trim(), chileTagValue.getText().toString().trim());
String tagName = tag.getName(); String[] linkAttrNames = tagNameAndLinkAttrs.get(tagName); link = tag.getAttributeByName(linkAttrName); tag.addAttribute(linkAttrName, rewrittenLink);