// Parse the stream, then detach the first <div> whose id attribute equals "something".
TagNode root = htmlCleaner.clean( stream );
// '@id' selects the id attribute; without the '@' the predicate is not an attribute test.
Object[] found = root.evaluateXPath( "//div[@id='something']" );
// Test the first element, not the array itself: an Object[] is never a TagNode.
if( found.length > 0 && found[0] instanceof TagNode ) {
    ((TagNode)found[0]).removeFromTree();
}
// Parse the stream, then detach the first <div> whose id attribute equals "something".
TagNode root = htmlCleaner.clean( stream );
// '@id' selects the id attribute; without the '@' the predicate is not an attribute test.
Object[] found = root.evaluateXPath( "//div[@id='something']" );
// Test the first element, not the array itself: an Object[] is never a TagNode.
if( found.length > 0 && found[0] instanceof TagNode ) {
    ((TagNode)found[0]).removeFromTree();
}
/**
 * Demo entry point: cleans the file {@code c:/test.xml}, evaluates a fixed XPath
 * expression against it and prints the text of the first matching tag node.
 * Prints a message to standard error when the expression selects no tag node.
 *
 * @param arg command-line arguments (unused)
 */
public static void main(String[] arg) {
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
        TagNode nodes = cleaner.clean(new File("c:/test.xml"));
        Object[] objects = nodes.evaluateXPath("//div/a[text(.,'In')]");
        // Guard before indexing/casting: the result may be empty, and its
        // elements are only typed as Object.
        if (objects == null || objects.length == 0 || !(objects[0] instanceof TagNode)) {
            System.err.println("No tag node matched the XPath expression");
            return;
        }
        System.out.println(((TagNode) objects[0]).getText());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Returns the first tag node selected by an XPath expression.
 *
 * @param tagNode node the expression is evaluated against
 * @param xPath   XPath expression to evaluate
 * @return the first result when it is a {@link TagNode}; {@code null} when the
 *         expression selects nothing, when the first result is not a tag node,
 *         or when evaluation throws {@link XPatherException}
 */
public static TagNode getTagNodeByXpath(TagNode tagNode, String xPath) {
    try {
        Object[] matches = tagNode.evaluateXPath(xPath);
        // Results are typed as Object, so check the type before casting
        // instead of risking a ClassCastException.
        if (matches != null && matches.length > 0 && matches[0] instanceof TagNode) {
            return (TagNode) matches[0];
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    return null;
}
/**
/**
 * Returns the trimmed text of the first tag node selected by an XPath expression.
 *
 * @param tagNode node the expression is evaluated against
 * @param xpath   XPath expression to evaluate
 * @return the trimmed text of the first result when it is a {@link TagNode};
 *         {@code null} when the expression selects nothing, when the first
 *         result is not a tag node, or when evaluation throws
 *         {@link XPatherException}
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        // Results are typed as Object, so check the type before casting.
        if (objs != null && objs.length > 0 && objs[0] instanceof TagNode) {
            TagNode titleNode = (TagNode) objs[0];
            return titleNode.getText().toString().trim();
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Collects a list of URLs from the tag nodes selected by an XPath expression.
 * Each URL is the value of the given attribute prefixed with {@code "https:"}.
 *
 * @param tagNode node the expression is evaluated against
 * @param attr    name of the attribute holding the URL
 * @param xpath   XPath expression to evaluate
 * @return the URLs in result order (empty when nothing usable is selected), or
 *         {@code null} when evaluation throws {@link XPatherException}
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs != null) {
            for (Object obj : objs) {
                // Results are typed as Object; skip anything that is not a tag node.
                if (!(obj instanceof TagNode)) {
                    continue;
                }
                String url = ((TagNode) obj).getAttributeByName(attr);
                // Concatenating a null value would add the literal "https:null".
                if (url != null) {
                    urls.add("https:" + url);
                }
            }
        }
        return urls;
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Kept as null (not an empty list) so existing callers can still detect a failed evaluation.
    return null;
}
/**
 * Returns an attribute value of the first tag node selected by an XPath expression.
 *
 * @param tagNode node the expression is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression to evaluate
 * @return whatever {@code getAttributeByName(attr)} returns for the first result
 *         when it is a {@link TagNode}; {@code null} when the expression selects
 *         nothing, when the first result is not a tag node, or when evaluation
 *         throws {@link XPatherException}
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        // Results are typed as Object, so check the type before casting.
        if (objs != null && objs.length > 0 && objs[0] instanceof TagNode) {
            return ((TagNode) objs[0]).getAttributeByName(attr);
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    return null;
}
try { objs = tagNode.evaluateXPath(xpath); if (objs != null && objs.length > 0) { TagNode paramNode = (TagNode) obj; objs = paramNode.evaluateXPath(paramTitleXpath); String paramTitle = null; if (objs != null && objs.length > 0) { objs = paramNode.evaluateXPath(paramValueXpath); JSONObject dlJsonObject = null; if (objs != null && objs.length > 0) {
/**
 * Cleans {@code content}, evaluates {@code xpath} against the cleaned tree and
 * returns every result passed through {@code wrap}, in result order.
 * When nothing is selected, a warning (with the xpath) and a debug entry
 * (with the content) are logged and an empty list is returned.
 *
 * @param content markup to clean and query
 * @return the wrapped results; empty when the expression selects nothing
 * @throws ExtractException wrapping any exception raised while cleaning,
 *         evaluating or wrapping
 */
@Override
public List<String> extractList(String content) {
    final List<String> results = new ArrayList<>();
    try {
        final HtmlCleaner cleaner = getHtmlCleaner();
        final TagNode rootNode = cleaner.clean(content);
        final Object[] matches = rootNode.evaluateXPath(xpath);

        final boolean nothingFound = matches == null || matches.length == 0;
        if (nothingFound) {
            logger.warn("not found content,xpath:{}", xpath);
            logger.debug("content:{}", content);
            return results;
        }

        for (Object match : matches) {
            results.add(wrap(match, cleaner));
        }
    } catch (Exception e) {
        throw new ExtractException(e);
    }
    return results;
}
/**
 * Cleans {@code content}, evaluates {@code xpath} against the cleaned tree and
 * returns the first result passed through {@code wrap}.
 * When nothing is selected, a warning (with the xpath) and a debug entry
 * (with the content) are logged and the empty string is returned.
 *
 * @param content markup to clean and query
 * @return the wrapped first result, or {@code ""} when the expression selects nothing
 * @throws ExtractException wrapping any exception raised while cleaning,
 *         evaluating or wrapping
 */
@Override
public String extract(String content) {
    try {
        final HtmlCleaner cleaner = getHtmlCleaner();
        final TagNode rootNode = cleaner.clean(content);
        final Object[] matches = rootNode.evaluateXPath(xpath);

        if (matches == null || matches.length == 0) {
            logger.warn("not found content,xpath:{}", xpath);
            logger.debug("content:{}", content);
            return "";
        }
        return wrap(matches[0], cleaner);
    } catch (Exception e) {
        throw new ExtractException(e);
    }
}
page.addField("price", ""+price); Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr"); JSONArray jsonArray2 = new JSONArray(); for (Object object : evaluateXPath) { if(!trNode.getText().toString().trim().equals("")){//把tr为空的标签过滤掉 JSONObject jsonObject2 = new JSONObject(); Object[] evaluateXPath2 = trNode.evaluateXPath("//th"); if(evaluateXPath2!=null && evaluateXPath2.length>0){ jsonObject2.put("value", thNode.getText().toString()); }else{ evaluateXPath2 = trNode.evaluateXPath("//td");
// Opens a connection to url, parses the response body with the cleaner, then runs two
// XPath queries on the parsed tree: elements whose class attribute equals 'recent-change'
// and elements whose itemprop attribute equals 'softwareVersion'.
// NOTE(review): the InputStreamReader is created without an explicit charset and is not
// closed in the visible code — confirm whether that is handled elsewhere.
URLConnection conn = url.openConnection(); TagNode node = cleaner.clean(new InputStreamReader(conn.getInputStream())); Object[] new_nodes = node.evaluateXPath("//*[@class='recent-change']"); Object[] version_nodes = node.evaluateXPath("//*[@itemprop='softwareVersion']");
public static String snapFromHtmlWithCookies(Context context, String xPath, String attrToSnap, String urlString, String cookies) throws IOException, XPatherException { String snap = ""; // create an instance of HtmlCleaner HtmlCleaner cleaner = new HtmlCleaner(); // take default cleaner properties CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); URL url = new URL(urlString); HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setDoOutput(true); // optional cookies connection.setRequestProperty(context.getString(R.string.cookie_prefix), cookies); connection.connect(); // use the cleaner to "clean" the HTML and return it as a TagNode object TagNode root = cleaner.clean(new InputStreamReader(connection.getInputStream())); Object[] foundNodes = root.evaluateXPath(xPath); if (foundNodes.length > 0) { TagNode foundNode = (TagNode) foundNodes[0]; snap = foundNode.getAttributeByName(attrToSnap); } return snap; }
try { Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span"); TagNode tagNode = (TagNode) objects[0];
// Evaluate the expression held in xPath against root; the results come back as an
// untyped Object[], so elements need a type check before being used as tag nodes.
Object[] foundNodes = root.evaluateXPath(xPath);
public void process(Page page) { // TODO Auto-generated method stub HtmlCleaner htmlCleaner = new HtmlCleaner(); // 相当于htmlcleaner对页面进行处理 TagNode rootNode = htmlCleaner.clean(page.getContent()); if(page.getUrl().startsWith("http://item.jd.com")){//表示是商品详情页 processProduct(page, rootNode); }else{//处理页面的url String next_url = HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"J_topPage\"]/a[2]", "href"); if(!next_url.equals("javascript:;")){ System.out.println("http://list.jd.com"+next_url.replace("&", "&")); String x = "http://list.jd.com"+next_url.replace("&", "&"); page.addUrl(x); } try { Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a"); for (Object object : evaluateXPath) { TagNode tagNode = (TagNode)object; String goodsUrl = tagNode.getAttributeByName("href"); page.addUrl(goodsUrl); } } catch (XPatherException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /**
final Object[] xpathResult = node.evaluateXPath(nodeByXPath); int i; for (i = 0; i < xpathResult.length; i++) {
final Object[] xpathResult = node.evaluateXPath(nodeByXPath); int i; for (i = 0; i < xpathResult.length; i++) {
final Object[] xpathResult = node.evaluateXPath(nodebyxpath); for (Object element : xpathResult) { if ( element instanceof TagNode ) {