/**
 * Looks up an attribute on the wrapped {@code tagNode}.
 *
 * @param name name of the attribute to read
 * @return whatever {@code tagNode.getAttributeByName(name)} yields
 */
@Override
public String getAttribute(final String name) {
    final String value = tagNode.getAttributeByName(name);
    return value;
}
/**
 * Tests whether the given node carries the configured attribute with the configured value.
 *
 * @param tagNode node to test
 * @return {@code false} when the node, {@code attName} or {@code attValue} is null; otherwise
 *         whether {@code attValue} matches the node's attribute, honouring {@code isCaseSensitive}
 */
public boolean satisfy(TagNode tagNode) {
    if (tagNode == null || attName == null || attValue == null) {
        return false;
    }

    // Read the attribute once; both comparison modes use the same value.
    final String actual = tagNode.getAttributeByName(attName);
    if (isCaseSensitive) {
        return attValue.equals(actual);
    }
    return attValue.equalsIgnoreCase(actual);
}
}
/**
 * Checks the node's {@code attName} attribute against {@code attValue}.
 *
 * @param tagNode node whose attribute is compared
 * @return {@code true} only when nothing involved is null and the attribute value matches
 *         {@code attValue} (case-insensitively unless {@code isCaseSensitive} is set)
 */
public boolean satisfy(TagNode tagNode) {
    final boolean comparable = tagNode != null && attName != null && attValue != null;
    if (!comparable) {
        return false;
    }

    final String nodeValue = tagNode.getAttributeByName(attName);
    return isCaseSensitive
            ? attValue.equals(nodeValue)
            : attValue.equalsIgnoreCase(nodeValue);
}
}
// Gather every non-empty "href" from the <a> elements returned for linkElements[i].
List<String> links = new ArrayList<String>();
for (TagNode anchor : linkElements[i].getElementListByName("a", false)) {
    String href = anchor.getAttributeByName("href");
    // Anchors without an href, or with an empty one, contribute nothing.
    if (href == null || href.length() == 0) {
        continue;
    }
    links.add(href);
}
/**
 * Gets the value of the given attribute on the node selected by an XPath.
 *
 * @param tagNode node passed to {@code getTagNodeByXpath} as the starting point
 * @param xPath   XPath expression passed to {@code getTagNodeByXpath}
 * @param attr    name of the attribute to read from the selected node
 * @return the attribute value, or {@code null} when no node was selected
 */
public static String getAttributeByAttr(TagNode tagNode, String xPath, String attr) {
    TagNode node = getTagNodeByXpath(tagNode, xPath);
    // NOTE(review): getTagNodeByXpath is defined elsewhere; this assumes it can hand back
    // null when nothing matches. Returning null here avoids a NullPointerException in that case.
    if (node == null) {
        return null;
    }
    return node.getAttributeByName(attr);
}
}
/**
 * Builds a list of URLs from an attribute of every node matched by an XPath.
 *
 * <p>Each match is cast to {@code TagNode}, its {@code attr} attribute is read, and
 * {@code "https:"} is prepended to the value. Matches whose attribute is absent are skipped.
 *
 * @param tagNode node the XPath is evaluated against
 * @param attr    name of the attribute holding the URL
 * @param xpath   XPath expression selecting the nodes
 * @return the collected URLs (possibly empty), or {@code null} if evaluating the XPath
 *         threw an {@code XPatherException}
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        if (matches != null) {
            for (Object match : matches) {
                String url = ((TagNode) match).getAttributeByName(attr);
                // A null value would otherwise be concatenated into the bogus entry "https:null".
                if (url == null) {
                    continue;
                }
                urls.add("https:" + url);
            }
        }
        return urls;
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Kept as null (not an empty list) so existing callers that test for null keep working.
    return null;
}
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

/**
 * Small demo: cleans a fixed HTML fragment with HtmlCleaner and prints the
 * {@code style} attribute of the {@code div} element found in the result.
 */
public class Test {

    public static void main(String[] args) throws Throwable {
        String markup = "<div style=\"Z-INDEX: 654; BORDER-BOTTOM: 0px; POSITION: absolute; BORDER-LEFT: 0px; WIDTH: 80px; HEIGHT: 22px; BORDER-TOP: 0px; TOP: 64px; CURSOR: auto; BORDER-RIGHT: 0px; LEFT: 240px\" id=\"textboxElt11286249556014dIi15v\" lineid=\"lineid\" pos_rel=\"false\" x1=\"240\" x2=\"320\" y1=\"64\" y2=\"86\"><input style=\"WIDTH: 80px; HEIGHT: 20px\" id=\"textboxElt11286249556014dIi15v_textbox\" title=\"Enter Registration Number Here\" tabindex=\"1\" value=\" \" maxlength=\"15\" size=\"10\" name=\"scheduled_tribe_registration_number_text\"></input></div>";

        // Parse the fragment, then search the cleaned tree for the div.
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode root = htmlCleaner.clean(markup);
        TagNode divNode = root.findElementByName("div", true);

        String styleValue = divNode.getAttributeByName("style");
        System.out.println(styleValue);
    }
}
/**
 * Reads an attribute from the first node matched by an XPath.
 *
 * @param tagNode node the XPath is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the node
 * @return the attribute value of the first match; {@code null} when nothing matched or
 *         when evaluating the XPath threw an {@code XPatherException}
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }

    if (matches == null || matches.length == 0) {
        return null;
    }

    // Only the first match is consulted.
    TagNode first = (TagNode) matches[0];
    return first.getAttributeByName(attr);
}
// Read the attribute whose name is held in linkAttrName from tag and keep it in link.
link = tag.getAttributeByName(linkAttrName);
throw new IOException("Form not found"); String languageMethodPath = languageForm.getAttributeByName("action"); String name = ((TagNode) input).getAttributeByName("name"); String value = ((TagNode) input).getAttributeByName("value"); if (name != null && value != null) { postLanguageFormMethod.addParameter(name, value); String name = ((TagNode) select).getAttributeByName("name"); List optionList = ((TagNode) select).getElementListByName("option", true); String value = null; for (Object option : optionList) { if (((TagNode) option).getAttributeByName("selected") != null) { value = ((TagNode) option).getAttributeByName("value"); break;
if (chileTagValue.getAttributeByName("class") != null) { // Handle the "?" hint cell in the table; it carries an extra class="Ptable-tips" attribute
public static String snapFromHtmlWithCookies(Context context, String xPath, String attrToSnap, String urlString, String cookies) throws IOException, XPatherException { String snap = ""; // create an instance of HtmlCleaner HtmlCleaner cleaner = new HtmlCleaner(); // take default cleaner properties CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); URL url = new URL(urlString); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setDoOutput(true); // optional cookies connection.setRequestProperty(context.getString(R.string.cookie_prefix), cookies); connection.connect(); // use the cleaner to "clean" the HTML and return it as a TagNode object TagNode root = cleaner.clean(new InputStreamReader(connection.getInputStream())); Object[] foundNodes = root.evaluateXPath(xPath); if (foundNodes.length > 0) { TagNode foundNode = (TagNode) foundNodes[0]; snap = foundNode.getAttributeByName(attrToSnap); } return snap; }
if (headElement.hasAttribute("prefix")) String namespaceData = headElement.getAttributeByName("prefix"); Pattern pattern = Pattern.compile("(([A-Za-z0-9_]+):\\s+(http:\\/\\/ogp.me\\/ns(\\/\\w+)*#))\\s*"); Matcher matcher = pattern.matcher(namespaceData); target = "name"; if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) setProperty(namespace, metaElement.getAttributeByName(target), metaElement.getAttributeByName("content")); break;
result.addAll( evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, isFilterContext, null) ); } else { String attValue = node.getAttributeByName(name); if (attValue != null) { result.addAll( evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, isFilterContext, null) );
// Keep the value of foundNode's attribute named by attrToStrip in snap.
snap = foundNode.getAttributeByName(attrToStrip);
result.addAll(evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, isFilterContext, null)); } else { String attValue = node.getAttributeByName(name); if (attValue != null) { result.addAll(evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, isFilterContext, null));
// Pass the root node's own "xmlns" attribute value to addNamespaceDeclaration under the
// empty-string key (presumably the default namespace; confirm against the TagNode API).
cleanTimeValues.rootNode.addNamespaceDeclaration("", cleanTimeValues.rootNode.getAttributeByName("xmlns"));
public void process(Page page) { // TODO Auto-generated method stub HtmlCleaner htmlCleaner = new HtmlCleaner(); // 相当于htmlcleaner对页面进行处理 TagNode rootNode = htmlCleaner.clean(page.getContent()); if(page.getUrl().startsWith("http://item.jd.com")){//表示是商品详情页 processProduct(page, rootNode); }else{//处理页面的url String next_url = HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"J_topPage\"]/a[2]", "href"); if(!next_url.equals("javascript:;")){ System.out.println("http://list.jd.com"+next_url.replace("&", "&")); String x = "http://list.jd.com"+next_url.replace("&", "&"); page.addUrl(x); } try { Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a"); for (Object object : evaluateXPath) { TagNode tagNode = (TagNode)object; String goodsUrl = tagNode.getAttributeByName("href"); page.addUrl(goodsUrl); } } catch (XPatherException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /**
} else if (forms.size() > 1) { for (Object form : forms) { if ("logonForm".equals(((TagNode) form).getAttributeByName("name"))) { logonForm = ((TagNode) form); String logonMethodPath = logonForm.getAttributeByName("action"); String type = ((TagNode) input).getAttributeByName("type"); String name = ((TagNode) input).getAttributeByName("name"); String value = ((TagNode) input).getAttributeByName("value"); if ("hidden".equalsIgnoreCase(type) && name != null && value != null) { ((PostMethod) logonMethod).addParameter(name, value); List frameList = node.getElementListByName("frame", true); if (frameList.size() == 1) { String src = ((TagNode) frameList.get(0)).getAttributeByName("src"); if (src != null) { LOGGER.debug("Frames detected in form page, try frame content");
String documentPath = anchorTag.getAttributeByName("href"); if (StringUtils.isBlank(documentPath)) { continue; final String srcPath = imageTag.getAttributeByName("src"); if (StringUtils.isBlank(srcPath)) { continue;