Refine search
/**
 * Visit a remark (comment) node, copying its HTML verbatim into the
 * accumulated result.
 * @param remarkNode The remark node encountered during traversal.
 */
public void visitRemarkNode (Remark remarkNode)
{
    String html = remarkNode.toHtml ();
    modifiedResult.append (html);
}
/**
 * Accept nodes whose text contains a match for the configured pattern.
 * @param node The node to test.
 * @return <code>true</code> if the regular expression is found anywhere
 *         in the node's text, <code>false</code> otherwise.
 */
@Override
public boolean accept(Node node) {
    boolean found = content.matcher(node.getText()).find();
    return found;
}
}
list = node.getChildren (); if (null != list) count = list.size (); for (int i = 0; i < count; i++) if (child == list.elementAt (i))
/**
 * Determine the offset at which extraction should begin for the given tag.
 * @param tag The tag whose page positions are examined.
 * @return The tag's end position when only the content is wanted
 *         (skipping the tag itself), otherwise the tag's start position.
 */
private int getBeginOffset(Node tag) {
    return onlyContent ? tag.getEndPosition() : tag.getStartPosition();
}
/**
 * Add a child to the given tag, lazily creating the parent's child list
 * the first time a child is attached.
 * @param parent The parent tag.
 * @param child The child node.
 */
protected void addChild (Tag parent, Node child)
{
    // create the child list on first use
    if (null == parent.getChildren ())
        parent.setChildren (new NodeList ());
    // link both directions: child -> parent and parent -> child
    child.setParent (parent);
    parent.getChildren ().add (child);
}
public void visitEndTag(Tag tag) { Node parent; parent = tag.getParent (); // process only those nodes not processed by a parent if (null == parent) // an orphan end tag modifiedResult.append(tag.toHtml()); else if (null == parent.getParent ()) // a top level tag with no parents modifiedResult.append(parent.toHtml()); }
for (int j=0; j<links.size(); j++) CompositeTag beginTag = (CompositeTag)links.elementAt(j); Tag endTag = beginTag.getEndTag(); int beginTagEnd = endTag.getStartPosition (); int endTagEnd = endTag.getEndPosition ();
/**
 * Recursively copy a node list, re-parenting every clone under the
 * supplied cloned parent and duplicating each node's subtree.
 * @param tree The list of nodes to duplicate.
 * @param clonedParent The already-cloned parent the copies belong to.
 * @return A new list containing deep copies of every node in <code>tree</code>.
 */
private static NodeList deepClone(NodeList tree, Node clonedParent) {
    NodeList copies = new NodeList();
    int count = tree.size();
    for (int index = 0; index < count; index++) {
        Node original = tree.elementAt(index);
        Node copy = cloneOnlyNode(original, clonedParent);
        copies.add(copy);
        // descend only when the original actually has children
        NodeList children = original.getChildren();
        if (children != null) {
            copy.setChildren(deepClone(children, copy));
        }
    }
    return copies;
}
/**
 * Render the single tag carrying the given attribute/value pair.
 * @param attribute The attribute name to match.
 * @param value The attribute value to match.
 * @return The matching tag's HTML when exactly one match exists,
 *         otherwise a diagnostic message reporting the actual match count.
 * @throws Exception If the underlying page cannot be parsed.
 */
private String getValueOfTagWithAttributeValue(String attribute, String value) throws Exception {
    NodeList matches = getMatchingTags(new HasAttributeFilter(attribute, value));
    int count = matches.size();
    if (count == 1)
        return matches.elementAt(0).toHtml();
    return String.format("There are %d matches, there should be 1.", count);
}
NodeList links = new NodeList (); parser = createParserParsingAnInputString(output); links = parser.extractAllNodesThatMatch(filter); for (int j=0; j<links.size(); j++) CompositeTag jStartTag = (CompositeTag)links.elementAt(j); Tag jEndTag = jStartTag.getEndTag(); int jStartTagBegin = jStartTag.getStartPosition (); int jEndTagEnd = jEndTag.getEndPosition (); for (int k=0; k<links.size(); k++) Tag kEndTag = kStartTag.getEndTag(); int kStartTagBegin = kStartTag.getStartPosition (); int kEndTagEnd = kEndTag.getEndPosition (); if ((k!=j) && (kStartTagBegin>jStartTagBegin) && (kEndTagEnd<jEndTagEnd))
lexer.setPosition (next.getStartPosition ()); node = null; stack.add (ret); ret = next; attributes.addElement (new Attribute (name, null)); Tag opener = lexer.getNodeFactory ().createTagNode ( lexer.getPage (), next.getStartPosition (), next.getEndPosition (), attributes); for (int i = stack.size () - 1; (-1 == index) && (i >= 0); i--) Tag boffo = (Tag)stack.elementAt (i); if (name.equals (boffo.getTagName ())) index = i; node.doSemanticAction ();
/**
 * Extract the row tags of the first table in the list.
 * @param tables A list whose first element is the table of interest;
 *               any further elements are ignored.
 * @return All <code>tr</code> descendants of the first table, or an
 *         empty list when the table has no children.
 */
private NodeList getRows(NodeList tables) {
    Node table = tables.elementAt(0);
    NodeList children = table.getChildren();
    if (children == null)
        return new NodeList();
    return children.extractAllNodesThatMatch(new TagNameFilter("tr"));
}
NodeFilter filter = new AndFilter(new TagNameFilter("li"),new HasAttributeFilter("class","image-item")); NodeList list = parser.parse(filter); if (list.size() == 0) { parser.reset(); filter = new AndFilter(new TagNameFilter("li"),new HasAttributeFilter("class","image-item ")); list = parser.parse(filter); if (list.size() == 0) { parser.reset(); filter = new TagNameFilter("li"); list = parser.parse(filter); for (int i = 0; i < list.size(); i++) { Node item = list.elementAt(i); NodeList childs = item.getChildren(); String uri = ((LinkTag)childs.elementAt(0)).getLink(); String count = "0"; if (childs.size() == 6) { count = childs.elementAt(5).getFirstChild().getFirstChild().getLastChild().getText();
/**
 * Filter the list with the given filter.
 * @param filter The filter to use.
 * @param recursive If <code>true</code> digs into the children recursively.
 * @return A new node array containing the nodes accepted by the filter.
 * This is a linear list and preserves the nested structure of the returned
 * nodes only.
 */
public NodeList extractAllNodesThatMatch (NodeFilter filter, boolean recursive)
{
    NodeList accepted = new NodeList ();
    for (int index = 0; index < size; index++)
    {
        Node candidate = nodeData[index];
        // test the node itself before descending, preserving document order
        if (filter.accept (candidate))
            accepted.add (candidate);
        if (recursive)
        {
            NodeList kids = candidate.getChildren ();
            if (null != kids)
                accepted.add (kids.extractAllNodesThatMatch (filter, recursive));
        }
    }
    return (accepted);
}
/**
 * Add the textual contents of the end tag of this node to the buffer.
 * A zero-length end tag (start position equal to end position) is treated
 * as a virtual tag and suppressed in verbatim mode.
 * NOTE(review): the guard reads the <code>mEndTag</code> field directly but
 * the append goes through <code>getEndTag()</code> — presumably the accessor
 * just returns the field; confirm no subclass overrides it differently.
 * @param verbatim If <code>true</code> return as close to the original
 * page text as possible.
 * @param sb The buffer to append to.
 */
protected void putEndTagInto (StringBuffer sb, boolean verbatim) { // eliminate virtual tags if (!verbatim || !(mEndTag.getStartPosition () == mEndTag.getEndPosition ())) sb.append (getEndTag ().toHtml()); }
/**
 * Pull the two table cells out of a row and record them in the map.
 * Rows with no children, or with a cell count other than exactly two,
 * are silently ignored.
 * @param map Destination for the extracted key/value pair.
 * @param row The table row to scan for <code>td</code> cells.
 */
private void extractColumns(Map<String, String> map, Node row) {
    NodeList children = row.getChildren();
    if (children == null)
        return;
    NodeList cells = children.extractAllNodesThatMatch(new TagNameFilter("td"));
    if (cells.size() == 2)
        addColsToMap(map, cells);
}
/**
 * Convenience method to apply a {@link StringBean} to the filter results.
 * This may yield duplicate or multiple text elements if the node list
 * contains nodes from two or more levels in the same nested tag hierarchy,
 * but if the node list contains only one tag, it provides access to the
 * text within the node.
 * @return The textual contents of the nodes that pass through the filter set,
 * as collected by the StringBean.
 */
public String getText ()
{
    NodeList list = getNodes ();
    if (0 == list.size ())
        return ("");
    StringBean collector = new StringBean ();
    for (int i = 0; i < list.size (); i++)
        list.elementAt (i).accept (collector);
    return (collector.getStrings ());
}
int defaultSize = 22; try { for (NodeIterator e = list.elements(); e.hasMoreNodes(); ) { Node node = e.nextNode(); StringBuilder _sbBuilder = null; if (node instanceof ParagraphTag) { _sbBuilder = getNewHtml(node.getChildren()); if (_sbBuilder != null) { rt.append(_sbBuilder.toString()); String str = node.toHtml(); rt.append(str); String str = node.toHtml(); str = clearLastSpace(str);
/**
 * Lex the content of the container at <code>current</code> and convert each
 * lexical token into an item: remarks become comments, text becomes simple
 * text, and tags are delegated to <code>processTag</code>.
 * @param nodes The full container list (passed through to tag processing).
 * @param current Index of the container whose content is parsed.
 * @param items Destination list receiving the produced items.
 * @throws BrixException If an unknown token type is seen or lexing fails.
 */
private void parseNode(List<AbstractContainer> nodes, int current, List<Item> items) {
    final AbstractContainer container = nodes.get(current);
    final Lexer lexer = new Lexer(container.getDataAsString());
    try {
        for (Node token = lexer.nextNode(); token != null; token = lexer.nextNode()) {
            if (token instanceof Remark) {
                items.add(new SimpleComment(token.getText()));
            } else if (token instanceof Text) {
                items.add(new SimpleText(token.toHtml()));
            } else if (token instanceof org.htmlparser.Tag) {
                processTag(nodes, current, items, (org.htmlparser.Tag) token);
            } else {
                throw new BrixException("Unknown node type " + token.getClass().getName());
            }
        }
    } catch (ParserException e) {
        // wrap with the container path so the failing node is identifiable
        throw new BrixException("Couldn't parse node content: '" + container.getPath() + "'", e);
    }
}