org.htmlcleaner.TagNode.evaluateXPath java code examples

 // Parse the stream and detach the first <div id="something"> element, if one exists.
 TagNode root = htmlCleaner.clean( stream );
// "@id" selects the id attribute; a bare "id" would be read as a child element step.
Object[] found = root.evaluateXPath( "//div[@id='something']" );
// Test the first result, not the array itself: an Object[] is never a TagNode.
if( found.length > 0 && found[0] instanceof TagNode ) {
  ((TagNode)found[0]).removeFromTree();
}

 // Parse the stream and detach the first <div id="something"> element, if one exists.
 TagNode root = htmlCleaner.clean( stream );
// "@id" selects the id attribute; a bare "id" would be read as a child element step.
Object[] found = root.evaluateXPath( "//div[@id='something']" );
// Test the first result, not the array itself: an Object[] is never a TagNode.
if( found.length > 0 && found[0] instanceof TagNode ) {
  ((TagNode)found[0]).removeFromTree();
}

/**
 * Cleans the file {@code c:/test.xml}, evaluates an XPath expression against it and
 * prints the text of the first matching node.
 *
 * @param arg command-line arguments (unused)
 */
public static void main(String[] arg){
    HtmlCleaner cleaner = new HtmlCleaner();
    try {
      TagNode nodes = cleaner.clean(new File("c:/test.xml"));
      // NOTE(review): "text(.,'In')" is not among the expression forms listed in the
      // evaluateXPath Javadoc; confirm the intended predicate.
      Object[] objects = nodes.evaluateXPath("//div/a[text(.,'In')]");
      // Check the result explicitly instead of relying on an
      // ArrayIndexOutOfBoundsException / ClassCastException to signal "no match".
      if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
        System.out.println(((TagNode)objects[0]).getText());
      } else {
        System.err.println("No matching node found");
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
}

/**
 * Returns the first node selected by the given XPath expression.
 *
 * @param tagNode node on which the expression is evaluated
 * @param xPath   XPath expression to evaluate
 * @return the first result if it is a {@code TagNode}; {@code null} when there is no
 *         result, when the first result is not a {@code TagNode} (for example an
 *         attribute value), or when evaluation throws {@code XPatherException}
 */
public static TagNode getTagNodeByXpath(TagNode tagNode,String xPath){
  try {
    Object[] results = tagNode.evaluateXPath(xPath);
    // evaluateXPath returns Object[]; expressions such as ".../@href" yield non-node
    // values, so verify the type instead of casting blindly.
    if (results != null && results.length > 0 && results[0] instanceof TagNode) {
      return (TagNode) results[0];
    }
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return null;
}
/**

/**
 * Returns the trimmed text of the first node selected by the given XPath expression.
 *
 * @param tagNode node on which the expression is evaluated
 * @param xpath   XPath expression to evaluate
 * @return the trimmed text of the first result if it is a {@code TagNode};
 *         {@code null} when there is no result, when the first result is not a
 *         {@code TagNode}, or when evaluation throws {@code XPatherException}
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
  try {
    Object[] objs = tagNode.evaluateXPath(xpath);
    // Guard the cast: the result array is Object[] and may hold non-node values.
    if (objs != null && objs.length > 0 && objs[0] instanceof TagNode) {
      TagNode titleNode = (TagNode) objs[0];
      return titleNode.getText().toString().trim();
    }
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return null;
}

/**
 * Builds a list of URLs from the attribute {@code attr} of every node selected by
 * the given XPath expression. Each attribute value is prefixed with {@code "https:"}
 * (presumably the values are protocol-relative URLs such as {@code //host/path};
 * verify against the pages being scraped).
 *
 * @param tagNode node on which the expression is evaluated
 * @param attr    name of the attribute holding the URL
 * @param xpath   XPath expression to evaluate
 * @return the collected URLs (possibly empty), or {@code null} when evaluation
 *         throws {@code XPatherException}
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
  List<String> urls = new ArrayList<>();
  try {
    Object[] objs = tagNode.evaluateXPath(xpath);
    if (objs != null) {
      for (Object obj : objs) {
        // Skip non-node results rather than failing with ClassCastException.
        if (!(obj instanceof TagNode)) {
          continue;
        }
        String url = ((TagNode) obj).getAttributeByName(attr);
        // A node without the attribute would otherwise contribute "https:null".
        if (url != null) {
          urls.add("https:" + url);
        }
      }
    }
    return urls;
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return null;
}

/**
 * Returns the value of attribute {@code attr} on the first node selected by the
 * given XPath expression.
 *
 * @param tagNode node on which the expression is evaluated
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression to evaluate
 * @return the value returned by {@code getAttributeByName(attr)} for the first
 *         result if it is a {@code TagNode}; {@code null} when there is no result,
 *         when the first result is not a {@code TagNode}, or when evaluation throws
 *         {@code XPatherException}
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
  try {
    Object[] objs = tagNode.evaluateXPath(xpath);
    // Guard the cast: the result array is Object[] and may hold non-node values.
    if (objs != null && objs.length > 0 && objs[0] instanceof TagNode) {
      return ((TagNode) objs[0]).getAttributeByName(attr);
    }
  } catch (XPatherException e) {
    e.printStackTrace();
  }
  return null;
}

try {
  objs = tagNode.evaluateXPath(xpath);
  if (objs != null && objs.length > 0) {
      TagNode paramNode = (TagNode) obj;
      objs = paramNode.evaluateXPath(paramTitleXpath);
      String paramTitle = null;
      if (objs != null && objs.length > 0) {
      objs = paramNode.evaluateXPath(paramValueXpath);
      JSONObject dlJsonObject = null;
      if (objs != null && objs.length > 0) {

/**
 * Cleans {@code content}, evaluates the configured XPath against it and returns
 * every match converted through {@code wrap}.
 *
 * @param content markup to clean and query
 * @return the wrapped matches; an empty list when nothing matches (a warning is
 *         logged in that case)
 * @throws ExtractException wrapping any exception raised while cleaning,
 *         evaluating or wrapping
 */
@Override
public List<String> extractList(String content) {
  List<String> results = new ArrayList<>();
  try {
    HtmlCleaner cleaner = getHtmlCleaner();
    TagNode root = cleaner.clean(content);
    Object[] matches = root.evaluateXPath(xpath);
    if (matches == null || matches.length == 0) {
      logger.warn("not found content,xpath:{}", xpath);
      logger.debug("content:{}", content);
      return results;
    }
    for (Object match : matches) {
      results.add(wrap(match, cleaner));
    }
  } catch (Exception e) {
    throw new ExtractException(e);
  }
  return results;
}

/**
 * Cleans {@code content}, evaluates the configured XPath against it and returns the
 * first match converted through {@code wrap}.
 *
 * @param content markup to clean and query
 * @return the wrapped first match, or the empty string when nothing matches (a
 *         warning is logged in that case)
 * @throws ExtractException wrapping any exception raised while cleaning,
 *         evaluating or wrapping
 */
@Override
public String extract(String content) {
  try {
    HtmlCleaner cleaner = getHtmlCleaner();
    TagNode root = cleaner.clean(content);
    Object[] matches = root.evaluateXPath(xpath);
    boolean hasMatch = matches != null && matches.length > 0;
    if (hasMatch) {
      return wrap(matches[0], cleaner);
    }
    logger.warn("not found content,xpath:{}", xpath);
    logger.debug("content:{}", content);
  } catch (Exception e) {
    throw new ExtractException(e);
  }
  return "";
}

page.addField("price", ""+price);
Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"product-detail-2\"]/table/tbody/tr");
JSONArray jsonArray2 = new JSONArray();
for (Object object : evaluateXPath) {
  if(!trNode.getText().toString().trim().equals("")){//把tr为空的标签过滤掉
    JSONObject jsonObject2 = new JSONObject();
    Object[] evaluateXPath2 = trNode.evaluateXPath("//th");
    if(evaluateXPath2!=null && evaluateXPath2.length>0){
      jsonObject2.put("value", thNode.getText().toString());
    }else{
      evaluateXPath2 = trNode.evaluateXPath("//td");

// Open a connection to `url` and parse the response body into a TagNode tree.
URLConnection conn = url.openConnection();
// NOTE(review): InputStreamReader is created without an explicit charset and is not closed here.
TagNode node = cleaner.clean(new InputStreamReader(conn.getInputStream()));
// Select every element whose class attribute equals 'recent-change'.
Object[] new_nodes = node.evaluateXPath("//*[@class='recent-change']");
// Select every element whose itemprop attribute equals 'softwareVersion'.
Object[] version_nodes = node.evaluateXPath("//*[@itemprop='softwareVersion']");

/**
 * Downloads the page at {@code urlString}, sending {@code cookies} in the request
 * header named by {@code R.string.cookie_prefix}, cleans the HTML and returns the
 * value of attribute {@code attrToSnap} on the first node matched by {@code xPath}.
 *
 * @param context    used to resolve the cookie header name from resources
 * @param xPath      XPath expression locating the node of interest
 * @param attrToSnap name of the attribute to read from the matched node
 * @param urlString  address of the page to download
 * @param cookies    value sent in the cookie request header
 * @return the attribute value, or the empty string when no node matches, the first
 *         match is not a {@code TagNode}, or the node lacks the attribute
 * @throws IOException      if the URL is malformed or the connection fails
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public static String snapFromHtmlWithCookies(Context context, String xPath, String attrToSnap, String urlString,
         String cookies) throws IOException, XPatherException {
     String snap = "";
     // create an instance of HtmlCleaner
     HtmlCleaner cleaner = new HtmlCleaner();
     // take default cleaner properties
     CleanerProperties props = cleaner.getProperties();
     props.setAllowHtmlInsideAttributes(true);
     props.setAllowMultiWordAttributes(true);
     props.setRecognizeUnicodeChars(true);
     props.setOmitComments(true);
     URL url = new URL(urlString);
     HttpURLConnection connection = (HttpURLConnection) url.openConnection();
     try {
         // NOTE(review): setDoOutput(true) is intended for requests that send a body;
         // kept as in the original, confirm it is really wanted for a plain fetch.
         connection.setDoOutput(true);
         // optional cookies
         connection.setRequestProperty(context.getString(R.string.cookie_prefix), cookies);
         connection.connect();
         // Close the response reader even if cleaning or XPath evaluation fails.
         try (InputStreamReader reader = new InputStreamReader(connection.getInputStream())) {
             // use the cleaner to "clean" the HTML and return it as a TagNode object
             TagNode root = cleaner.clean(reader);
             Object[] foundNodes = root.evaluateXPath(xPath);
             if (foundNodes != null && foundNodes.length > 0 && foundNodes[0] instanceof TagNode) {
                 String value = ((TagNode) foundNodes[0]).getAttributeByName(attrToSnap);
                 // Keep the "" default instead of overwriting it with null.
                 if (value != null) {
                     snap = value;
                 }
             }
         }
     } finally {
         // Release the connection on success and on failure.
         connection.disconnect();
     }
     return snap;
}

try {
  Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
  TagNode tagNode = (TagNode) objects[0];

Object[] foundNodes = root.evaluateXPath(xPath);

/**
 * Processes a downloaded page. Product detail pages (URL starting with
 * {@code http://item.jd.com}) are handed to {@code processProduct}; any other page
 * is treated as a listing page, from which the next-page link and the links found
 * under the {@code plist} element are added to the page's URLs.
 *
 * @param page the downloaded page; discovered URLs are added to it
 */
public void process(Page page) {
  HtmlCleaner htmlCleaner = new HtmlCleaner();
  // Let HtmlCleaner parse the page content into a node tree.
  TagNode rootNode = htmlCleaner.clean(page.getContent());
  if(page.getUrl().startsWith("http://item.jd.com")){// product detail page
    processProduct(page, rootNode);
  }else{// listing page: collect further URLs
    String next_url = HtmlUtils.getAttributeByAttr(rootNode, "//*[@id=\"J_topPage\"]/a[2]", "href");
    // Null-safe comparison: the helper presumably returns null when the pager link
    // is missing (verify against HtmlUtils); the original would throw NPE then.
    if(next_url != null && !"javascript:;".equals(next_url)){
      // Build the absolute URL once and reuse it for both the log line and the queue.
      String x = "http://list.jd.com"+next_url.replace("&amp;", "&");
      System.out.println(x);
      page.addUrl(x);
    }
    try {
      Object[] evaluateXPath = rootNode.evaluateXPath("//*[@id=\"plist\"]/ul/li/div/div[1]/a");
      for (Object object : evaluateXPath) {
        // Skip anything that is not an element node instead of casting blindly.
        if (!(object instanceof TagNode)) {
          continue;
        }
        String goodsUrl = ((TagNode) object).getAttributeByName("href");
        // An anchor without href yields null; do not queue it.
        if (goodsUrl != null) {
          page.addUrl(goodsUrl);
        }
      }
    } catch (XPatherException e) {
      e.printStackTrace();
    }
  }
}
/**

final Object[] xpathResult = node.evaluateXPath(nodeByXPath);
int i;
for (i = 0; i < xpathResult.length; i++) {

final Object[] xpathResult = node.evaluateXPath(nodeByXPath);
int i;
for (i = 0; i < xpathResult.length; i++) {

final Object[] xpathResult = node.evaluateXPath(nodebyxpath);
for (Object element : xpathResult) {
  if ( element instanceof TagNode ) {

Javadoc

Evaluates an XPath expression on the given node.
This is not a fully featured XPath parser and evaluator. The examples below show the supported elements:

 
 
//div//a 
//div//a[@id][@class] 
/body/*[1]/@type 
//div[3]//a[@id][@href='r/n4'] 
//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a 
//div[2]/@*[2] 
data(//div//a[@id][@class]) 
//p/last() 
//body//div[3][@class]//span[12.2<position()] 
data(//a['v' < @id])

Popular methods of TagNode

Popular in Java

Making http post requests using okhttp
getSharedPreferences (Context)
scheduleAtFixedRate (ScheduledExecutorService)
onRequestPermissionsResult (Fragment)
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
Point (java.awt)
A point representing a location in (x,y) coordinate space, specified in integer precision.
JOptionPane (javax.swing)
Runner (org.openjdk.jmh.runner)
Scheduler (org.quartz)
This is the main interface of a Quartz Scheduler. A Scheduler maintains a registry of org.quartz.Job
Top PhpStorm plugins

How to use the evaluateXPath method in org.htmlcleaner.TagNode

Best Java code snippets using org.htmlcleaner.TagNode.evaluateXPath (Showing top 19 results out of 315)

How to use
evaluateXPath
method
in
org.htmlcleaner.TagNode