net.htmlparser.jericho.Source.getParseText java code examples

/**
 * Positions this iterator on the deepest register node that matches the parse text at {@code pos}.
 * <p>
 * Starting at {@code root}, one child node is followed for each successive character of the parse text
 * until no child matches. The cursor then climbs back up through the {@code parent} links until it
 * reaches a node whose {@code tagTypes} is not {@code null}, or becomes {@code null} if there is no such node.
 *
 * @param source  the source document whose parse text is examined.
 * @param pos  the position in the parse text at which matching starts.
 */
public ProspectiveTagTypeIterator(final Source source, final int pos) {
  // an out-of-range pos leaves the cursor at the root before the climb below (intended to yield an empty iterator)
  final ParseText parseText=source.getParseText();
  cursor=root;
  try {
    // descend one level per matching character
    for (int offset=0; ; offset++) {
      final TagTypeRegister child=cursor.getChild(parseText.charAt(pos+offset));
      if (child==null) break;
      cursor=child;
    }
  } catch (IndexOutOfBoundsException ex) {
    // Deliberately ignored: not avoiding this exception is expensive, but it only happens in the very rare
    // circumstance that the end of file is encountered in the middle of a potential tag.
  }
  // climb until a node carrying a list of tag types is found, or the top of the tree is passed
  while (cursor!=null && cursor.tagTypes==null) cursor=cursor.parent;
}

/**
 * Constructs the character reference that begins at the specified position, if there is one.
 * <p>
 * The character at {@code begin} must be {@code '&'}. If the following character is {@code '#'} the work is
 * delegated to {@code NumericCharacterReference.construct}, otherwise to {@code CharacterEntityReference.construct}.
 *
 * @param source  the source document.
 * @param begin  the position of the candidate {@code '&'} character.
 * @param unterminatedCharacterReferenceSettings  settings passed on to the delegate constructors.
 * @return the character reference, or {@code null} if the character at {@code begin} is not {@code '&'}
 *         or an {@link IndexOutOfBoundsException} is raised while examining the text.
 */
static CharacterReference construct(final Source source, final int begin, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
  try {
    if (source.getParseText().charAt(begin)!='&') {
      return null;
    }
    if (source.getParseText().charAt(begin+1)=='#') {
      return NumericCharacterReference.construct(source,begin,unterminatedCharacterReferenceSettings);
    }
    return CharacterEntityReference.construct(source,begin,unterminatedCharacterReferenceSettings.characterEntityReferenceMaxCodePoint);
  } catch (IndexOutOfBoundsException ex) {
    // running off the end of the text means there is no complete reference here
    return null;
  }
}

/**
 * Indicates whether the parse text at {@code pos} marks the end of the attributes,
 * i.e. whether it is a {@code '>'} character or the sequence {@code "/>"}.
 * <p>
 * NOTE(review): {@code isClosingSlashIgnored} is not consulted by this override, so {@code "/>"}
 * is always treated as an end marker — confirm that this is intentional.
 *
 * @param source  the source document.
 * @param pos  the position in the source document to test.
 * @param isClosingSlashIgnored  unused by this implementation.
 * @return {@code true} if the attributes end at {@code pos}.
 */
@Override
public boolean atEndOfAttributes(final Source source, final int pos, final boolean isClosingSlashIgnored) {
  final ParseText text = source.getParseText();
  if (text.charAt(pos) == '>') {
    return true;
  }
  return text.containsAt("/>", pos);
}

  /**
   * Indicates whether the parse text at {@code pos} marks the end of the attributes.
   * <p>
   * A {@code '>'} character always ends the attributes. The sequence {@code "/>"} ends them
   * only when {@code isClosingSlashIgnored} is {@code false}.
   *
   * @param source  the source document.
   * @param pos  the position in the source document to test.
   * @param isClosingSlashIgnored  whether a {@code "/>"} sequence should not count as an end marker.
   * @return {@code true} if the attributes end at {@code pos}.
   */
  public boolean atEndOfAttributes(final Source source, final int pos, final boolean isClosingSlashIgnored) {
    final ParseText text=source.getParseText();
    if (text.charAt(pos)=='>') return true;
    if (isClosingSlashIgnored) return false;
    return text.containsAt("/>",pos);
  }
}

/**
 * Guesses whether a document is XML by inspecting its first non-text segment.
 * <p>
 * The result is {@code true} when the segment is a {@code Tag} that is either an XML declaration or whose
 * parse text, between the tag's begin and end, contains the text {@code "xhtml"}.
 * The second test is intended for DOCTYPE declarations, but the tag type is not checked,
 * so any tag containing that text qualifies.
 *
 * @param firstNonTextSegment  the first non-text segment of the document, may be {@code null}.
 * @return {@code true} if the document is considered to be XML.
 */
private static boolean isXML(final Segment firstNonTextSegment) {
  // instanceof is false for null, so this also rejects a missing segment
  if (!(firstNonTextSegment instanceof Tag)) return false;
  final Tag tag=(Tag)firstNonTextSegment;
  if (tag.getTagType()==StartTagType.XML_DECLARATION) return true;
  final int xhtmlPos=tag.source.getParseText().indexOf("xhtml",tag.begin,tag.end);
  return xhtmlPos!=-1;
}

  /**
   * Finds the end of the tag by scanning forward for the first {@code '>'} that is not enclosed in double quotes.
   * <p>
   * Every {@code '"'} character toggles the quoted state. The character at {@code pos} is always examined,
   * and scanning stops once the position reaches {@code source.getEnd()}.
   *
   * @param source  the source document.
   * @param pos  the position at which scanning starts.
   * @return the position immediately after the closing {@code '>'}, or {@code -1} if none is found.
   */
  protected int getEnd(final Source source, int pos) {
    final ParseText parseText=source.getParseText();
    boolean quoted=false;
    while (true) {
      switch (parseText.charAt(pos)) {
        case '"':
          quoted=!quoted;
          break;
        case '>':
          if (!quoted) return pos+1;
          break;
        default:
          break;
      }
      pos++;
      if (pos>=source.getEnd()) return -1;
    }
  }
}

  /**
   * Finds the end of a tag of this type, taking care not to swallow a downlevel hidden conditional comment.
   * <p>
   * This tag type shares its start delimiter with the downlevel hidden conditional comment, and the
   * closing delimiter of that other tag type must not appear inside this tag. Take the following example:
   * <pre>
   * &lt;!--[if IE]&gt; ... &lt;![endif]--&gt; ... &lt;!--[if !(IE 5)]&gt;&lt;!--&gt; ... &lt;!--&lt;![endif]--&gt;
   * </pre>
   * With the default implementation the parser would recognise the first tag as:
   * <pre>
   * &lt;!--[if IE]&gt; ... &lt;![endif]--&gt; ... &lt;!--[if !(IE 5)]&gt;&lt;!--&gt;
   * </pre>
   *
   * @param source  the source document.
   * @param pos  the position in the source document from which to search.
   * @return the position immediately after this type's closing delimiter, {@code -1} if the closing delimiter
   *         of the downlevel hidden conditional comment is not found, or {@code -2} if the text turns out
   *         to be a downlevel hidden conditional comment.
   */
  protected int getEnd(final Source source, final int pos) {
    final String hiddenIfClosingDelimiter=MicrosoftConditionalCommentTagTypes.DOWNLEVEL_HIDDEN_IF.getClosingDelimiter();
    final int delimiterBegin=source.getParseText().indexOf(hiddenIfClosingDelimiter,pos);
    if (delimiterBegin==-1) {
      return -1;
    }
    final String ownClosingDelimiter=getClosingDelimiter();
    if (!source.getParseText().containsAt(ownClosingDelimiter,delimiterBegin)) {
      // this is a downlevel hidden conditional comment, so fail this tag type silently without displaying a log message
      return -2;
    }
    return delimiterBegin+ownClosingDelimiter.length();
  }
}

/**
 * Searches backwards from {@code pos} for the nearest tag that is included in searches.
 * <p>
 * The parse text is scanned backwards for {@code '<'} characters (this assumes that all tags start with
 * {@code '<'}). Each candidate position is passed to {@code getTagAt}, and the first tag found for which
 * {@code includeInSearch()} is {@code true} is returned.
 *
 * @param source  the source document.
 * @param pos  the position from which to search backwards.
 * @param breakAtPos  the limit passed to {@code ParseText.lastIndexOf} for the backwards search.
 * @return the tag found, or {@code null} if there is none (including when {@code pos} is out of range).
 * @throws AssertionError if an {@link IndexOutOfBoundsException} is raised internally, which is not expected
 *         during a backwards search; the original exception is attached as the cause.
 */
static final Tag getPreviousTagUncached(final Source source, final int pos, final int breakAtPos) {
  try {
    final ParseText parseText=source.getParseText();
    int begin=pos;
    do {
      // parseText.lastIndexOf returns -1 if pos is out of range
      begin=parseText.lastIndexOf('<',begin,breakAtPos);
      if (begin==-1) return null;
      final Tag tag=getTagAt(source,begin,false);
      if (tag!=null && tag.includeInSearch()) return tag;
    } while ((begin-=1)>=0);
  } catch (IndexOutOfBoundsException ex) {
    // keep the original exception as the cause so the failure can be diagnosed
    // (initCause is used rather than the two-argument constructor to stay compatible with older Java versions)
    final AssertionError error=new AssertionError("Unexpected internal exception");
    error.initCause(ex);
    throw error;
  }
  return null;
}

/**
 * Causes the specified range of the source text to be ignored when parsing.
 * <p>
 * On the first call an {@code OutputDocument} is created from the current parse text and the
 * {@code parseText} field is reset to {@code null}. Every call then replaces the given range
 * with spaces in that output document.
 * <p>
 * See the documentation of the {@link Segment#ignoreWhenParsing()} method for more information.
 *
 * @param begin  the beginning character position in the source text.
 * @param end  the end character position in the source text.
 * @throws IllegalStateException if a full sequential parse has already been performed.
 */
public void ignoreWhenParsing(final int begin, final int end) {
  if (wasFullSequentialParseCalled()) {
    throw new IllegalStateException("ignoreWhenParsing can not be used after a full sequential parse has been performed");
  }
  if (parseTextOutputDocument==null) {
    // the output document must be built from the parse text before the parseText field is cleared
    parseTextOutputDocument=new OutputDocument(getParseText());
    parseText=null;
  }
  parseTextOutputDocument.replaceWithSpaces(begin,end);
}

/**
 * Finds the first character reference located at or after the specified position.
 * <p>
 * Each {@code '&'} found in the parse text from {@code pos} onwards is offered to
 * {@code construct}, and the first non-null result is returned.
 *
 * @param source  the source document.
 * @param pos  the position from which to search forwards.
 * @param unterminatedCharacterReferenceSettings  settings passed on to {@code construct}.
 * @return the next character reference, or {@code null} if none is found.
 */
private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
  final ParseText parseText=source.getParseText();
  for (int ampPos=parseText.indexOf('&',pos); ampPos!=-1; ampPos=parseText.indexOf('&',ampPos+1)) {
    final CharacterReference candidate=construct(source,ampPos,unterminatedCharacterReferenceSettings);
    if (candidate!=null) return candidate;
  }
  return null;
}

  /**
   * Constructs a start tag with an empty name spanning from {@code pos} to just after the next {@code '>'}.
   * <p>
   * If error logging is enabled, an error is logged stating that the content does not match a registered
   * start tag type.
   *
   * @param source  the source document.
   * @param pos  the position at which the possible tag begins.
   * @return the constructed tag, or {@code null} if no {@code '>'} is found after {@code pos}.
   */
  protected Tag constructTagAt(final Source source, final int pos) {
    final int closingDelimiterPos=source.getParseText().indexOf('>',pos+1);
    if (closingDelimiterPos==-1) {
      return null;
    }
    final int tagEnd=closingDelimiterPos+1;
    final Tag tag=constructStartTag(source,pos,tagEnd,"",null);
    if (source.logger.isErrorEnabled()) {
      final String message=source.getRowColumnVector(tag.getBegin())
        .appendTo(new StringBuilder(200).append("Encountered possible StartTag at "))
        .append(" whose content does not match a registered StartTagType")
        .toString();
      source.logger.error(message);
    }
    return tag;
  }
}

  /**
   * Returns the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document.
   * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
   * <p>
   * This default implementation looks for the first occurrence of the
   * {@linkplain #getClosingDelimiter() closing delimiter} in the parse text, searching from the specified position,
   * and returns the position immediately after the end of it.
   * <p>
   * If the closing delimiter is not found, the value <code>-1</code> is returned.
   *
   * @param source  the {@link Source} document.
   * @param pos  the position in the source document.
   * @return the {@linkplain Tag#getEnd() end} of a tag of this type, or <code>-1</code> if the end of the tag can not be found.
   */
  protected int getEnd(final Source source, final int pos) {
    final int delimiterBegin=source.getParseText().indexOf(getClosingDelimiter(),pos);
    if (delimiterBegin==-1) {
      return -1;
    }
    return delimiterBegin+getClosingDelimiter().length();
  }
}

/**
 * Finds the last character reference located at or before the specified position.
 * <p>
 * Each {@code '&'} found in the parse text, searching backwards from {@code pos}, is offered to
 * {@code construct}, and the first non-null result is returned.
 *
 * @param source  the source document.
 * @param pos  the position from which to search backwards.
 * @param unterminatedCharacterReferenceSettings  settings passed on to {@code construct}.
 * @return the previous character reference, or {@code null} if none is found.
 */
private static CharacterReference getPrevious(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
  final ParseText parseText=source.getParseText();
  for (int ampPos=parseText.lastIndexOf('&',pos); ampPos!=-1; ampPos=parseText.lastIndexOf('&',ampPos-1)) {
    final CharacterReference candidate=construct(source,ampPos,unterminatedCharacterReferenceSettings);
    if (candidate!=null) return candidate;
  }
  return null;
}

  /**
   * Constructs an end tag at {@code pos} whose name is the text between the start delimiter and the
   * next closing delimiter.
   * <p>
   * If error logging is enabled, an error is logged stating that the content does not match a registered
   * end tag type.
   *
   * @param source  the source document.
   * @param pos  the position at which the possible tag begins.
   * @return the constructed end tag.
   */
  protected Tag constructTagAt(final Source source, final int pos) {
    final ParseText parseText=source.getParseText();
    final int nameBegin=pos+getStartDelimiter().length();
    final String closingDelimiter=getClosingDelimiter();
    final int nameEnd=parseText.indexOf(closingDelimiter,nameBegin);
    // per the original note, getName throws IndexOutOfBoundsException when the closing delimiter was not found (nameEnd==-1)
    final String name=source.getName(nameBegin,nameEnd);
    final int tagEnd=nameEnd+closingDelimiter.length();
    final EndTag endTag=constructEndTag(source,pos,tagEnd,name);
    if (source.logger.isErrorEnabled()) {
      final String message=source.getRowColumnVector(pos)
        .appendTo(new StringBuilder(200).append("Encountered possible EndTag at "))
        .append(" whose content does not match a registered EndTagType")
        .toString();
      source.logger.error(message);
    }
    return endTag;
  }
}

/**
 * Searches forwards from {@code pos} for the nearest tag that is included in searches.
 * <p>
 * The parse text is scanned for {@code '<'} characters (this assumes that all tags start with {@code '<'}).
 * Each candidate position is passed to {@code getTagAt}, and the first tag found for which
 * {@code includeInSearch()} is {@code true} is returned.
 *
 * @param source  the source document.
 * @param pos  the position from which to search forwards.
 * @param breakAtPos  the limit passed to {@code ParseText.indexOf} for the forwards search.
 * @return the tag found, or {@code null} if there is none (including when {@code pos} is out of range).
 */
static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) {
  try {
    final ParseText parseText=source.getParseText();
    int searchFrom=pos;
    while (true) {
      // parseText.indexOf returns -1 if the position is out of range
      final int candidate=parseText.indexOf('<',searchFrom,breakAtPos);
      if (candidate==-1) return null;
      final Tag tag=getTagAt(source,candidate,false);
      if (tag!=null && tag.includeInSearch()) return tag;
      searchFrom=candidate+1;
      if (searchFrom>=source.end) break;
    }
  } catch (IndexOutOfBoundsException ex) {
    // This should only happen when the end of file is reached in the middle of a tag.
    // Nothing needs to be done to handle it as there are no more tags anyway.
  }
  return null;
}

/**
 * Searches forwards from {@code pos} for the nearest tag of the specified type.
 * <p>
 * The parse text is scanned for the start delimiter of {@code tagType}. Each candidate position is passed
 * to {@code getTagAt}, and the first tag found whose type is {@code tagType} is returned.
 * A {@code null} tag type delegates to the overload that takes no tag type.
 *
 * @param source  the source document.
 * @param pos  the position from which to search forwards.
 * @param tagType  the type of tag to search for, or {@code null} to search for any tag.
 * @param breakAtPos  the limit passed to {@code ParseText.indexOf} for the forwards search.
 * @return the tag found, or {@code null} if there is none (including when {@code pos} is out of range).
 */
static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
  if (tagType==null) {
    return getNextTagUncached(source,pos,breakAtPos);
  }
  final String startDelimiter=tagType.getStartDelimiter();
  try {
    final ParseText parseText=source.getParseText();
    int searchFrom=pos;
    while (true) {
      // parseText.indexOf returns -1 if the position is out of range
      final int candidate=parseText.indexOf(startDelimiter,searchFrom,breakAtPos);
      if (candidate==-1) return null;
      final Tag tag=getTagAt(source,candidate,false);
      if (tag!=null && tag.getTagType()==tagType) return tag;
      searchFrom=candidate+1;
      if (searchFrom>=source.end) break;
    }
  } catch (IndexOutOfBoundsException ex) {
    // This should only happen when the end of file is reached in the middle of a tag.
    // Nothing needs to be done to handle it as there are no more tags anyway.
  }
  return null;
}

/**
 * Searches backwards from {@code pos} for the nearest tag of the specified type.
 * <p>
 * The parse text is scanned backwards for the start delimiter of {@code tagType}. Each candidate position
 * is passed to {@code getTagAt}, and the first tag found whose type is {@code tagType} is returned.
 * A {@code null} tag type delegates to the overload that takes no tag type.
 * <p>
 * An {@link IndexOutOfBoundsException} should never happen during a backwards search, so it is not
 * handled here and propagates to the caller unchanged.
 *
 * @param source  the source document.
 * @param pos  the position from which to search backwards.
 * @param tagType  the type of tag to search for, or {@code null} to search for any tag.
 * @param breakAtPos  the limit passed to {@code ParseText.lastIndexOf} for the backwards search.
 * @return the tag found, or {@code null} if there is none (including when {@code pos} is out of range).
 */
static final Tag getPreviousTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
  if (tagType==null) {
    return getPreviousTagUncached(source,pos,breakAtPos);
  }
  final String startDelimiter=tagType.getStartDelimiter();
  final ParseText parseText=source.getParseText();
  int searchFrom=pos;
  while (true) {
    // parseText.lastIndexOf returns -1 if the position is out of range
    final int candidate=parseText.lastIndexOf(startDelimiter,searchFrom,breakAtPos);
    if (candidate==-1) return null;
    final Tag tag=getTagAt(source,candidate,false);
    if (tag!=null && tag.getTagType()==tagType) return tag;
    searchFrom=candidate-1;
    if (searchFrom<0) return null;
  }
}

/**
 * Collects the segments containing the URIs referenced by <code>url(...)</code> constructs in the CSS of the specified segment.
 * <p>
 * If the segment contains no start tags it is assumed to be the value of a <code>style</code> attribute and is
 * scanned directly. Otherwise the values of all <code>style</code> attributes and the content of all
 * <code>style</code> elements inside the segment are scanned, and the result is sorted.
 *
 * @param segment  the segment to scan, may be <code>null</code>.
 * @return the list of URI segments, empty if <code>segment</code> is <code>null</code>, has zero length, or
 *         (when treated as an attribute value) contains no <code>url(</code>.
 */
public static List<Segment> getStyleURISegments(final Segment segment) {
  if (segment==null || segment.length()==0) return Collections.emptyList();
  if (segment.getFirstStartTag()==null) {
    // no start tags in this segment, assume the segment is a style attribute value
    int urlDelimiterStartPos=segment.getSource().getParseText().indexOf("url(",segment.getBegin(),segment.getEnd());
    if (urlDelimiterStartPos==-1) return Collections.emptyList();
    return addURLSegmentsFromCSS(new ArrayList<Segment>(),new Segment(segment.getSource(),urlDelimiterStartPos,segment.getEnd()));
  }
  List<Segment> uriSegments=new ArrayList<Segment>();
  for (StartTag startTag : segment.getAllStartTags("style",null)) {
    final Segment styleValueSegment=startTag.getAttributes().get("style").getValueSegment();
    // A style attribute written without a value has no value segment; skip it instead of
    // passing null on to addURLSegmentsFromCSS, which would dereference it.
    if (styleValueSegment==null) continue;
    addURLSegmentsFromCSS(uriSegments,styleValueSegment);
  }
  for (Element element : segment.getAllElements(HTMLElementName.STYLE)) {
    addURLSegmentsFromCSS(uriSegments,element.getContent());
  }
  Collections.sort(uriSegments);
  return uriSegments;
}

/**
 * Appends the text of the specified segment to {@code appendable}, replacing each recognised character
 * reference with the output of its {@code appendCharTo} method.
 * <p>
 * The search for {@code '&'} starts {@code searchBegin} characters into the segment, but the text
 * before that point is still copied to the output.
 *
 * @param appendable  the destination.
 * @param segment  the segment whose text is decoded.
 * @param searchBegin  the offset from the start of the segment at which to begin looking for character references.
 * @param insideAttributeValue  used to select the unterminated character reference settings.
 * @param convertNonBreakingSpaces  passed on to {@code CharacterReference.appendCharTo}.
 * @return the {@code appendable} argument.
 * @throws IOException if the destination raises it.
 */
private static Appendable appendDecode(final Appendable appendable, final Segment segment, final int searchBegin, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
  final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
  final Source source=segment.source;
  final ParseText parseText=source.getParseText();
  final int segmentEnd=segment.getEnd();
  // start of the text that has not been copied to the output yet
  int pendingBegin=segment.getBegin();
  int ampPos=parseText.indexOf('&',pendingBegin+searchBegin,segmentEnd);
  while (ampPos!=-1) {
    final CharacterReference characterReference=CharacterReference.construct(source,ampPos,unterminatedCharacterReferenceSettings);
    final int resumeAt;
    if (characterReference==null) {
      // not a character reference, so just move past this ampersand
      resumeAt=ampPos+1;
    } else {
      // Don't use appendable.append(source,begin,pos) as it checks source.length() which may throw an exception when using StreamedSource.
      appendable.append(source.substring(pendingBegin,ampPos));
      characterReference.appendCharTo(appendable,convertNonBreakingSpaces);
      pendingBegin=characterReference.getEnd();
      resumeAt=pendingBegin;
    }
    ampPos=parseText.indexOf('&',resumeAt,segmentEnd);
  }
  // copy whatever follows the last decoded reference
  appendable.append(source.substring(pendingBegin,segmentEnd));
  return appendable;
}

/**
 * Adds to {@code uriSegments} a segment for the URI inside every {@code url(...)} construct found in the
 * specified CSS segment.
 * <p>
 * White space after the opening bracket and before the closing bracket is excluded from the URI, as is
 * a single quote character at either end. Scanning stops at the first construct that is incomplete or
 * yields an empty URI.
 *
 * @param uriSegments  the list to which the URI segments are added.
 * @param cssSegment  the segment containing the CSS text.
 * @return the {@code uriSegments} argument.
 */
private static List<Segment> addURLSegmentsFromCSS(final List<Segment> uriSegments, final Segment cssSegment) {
  final Source source=cssSegment.getSource();
  final ParseText parseText=source.getParseText();
  final int limit=cssSegment.getEnd();
  int searchFrom=cssSegment.getBegin();
  while (true) {
    final int urlKeywordPos=parseText.indexOf("url(",searchFrom,limit);
    if (urlKeywordPos==-1) break;
    // skip over "url(" and any white space that follows it
    int uriBegin=urlKeywordPos+4;
    while (uriBegin<limit && Segment.isWhiteSpace(parseText.charAt(uriBegin))) uriBegin++;
    if (uriBegin>=limit) break;
    if (isQuote(parseText.charAt(uriBegin))) {
      // skip the opening quote
      uriBegin++;
      if (uriBegin>=limit) break;
    }
    final int closingBracketPos=parseText.indexOf(')',uriBegin,limit);
    if (closingBracketPos==-1) break;
    // walk back from the closing bracket over white space and one closing quote
    int uriEnd=closingBracketPos;
    while (Segment.isWhiteSpace(parseText.charAt(uriEnd-1))) uriEnd--;
    if (isQuote(parseText.charAt(uriEnd-1))) uriEnd--;
    if (uriEnd<=uriBegin) break;
    uriSegments.add(new Segment(source,uriBegin,uriEnd));
    // continue searching from the closing bracket
    searchFrom=closingBracketPos;
  }
  return uriSegments;
}

Javadoc

Returns the ParseText of this source document.

This method is normally only of interest to users who wish to create custom tag types.

The parse text is defined as the entire text of the source document in lower case, with all Segment#ignoreWhenParsing() segments replaced by space characters.

Popular methods of Source

<init>
getAllElements
getChildElements
Returns a list of the top-level Element in the document element hierarchy. The objects in the list a
fullSequentialParse
Parses all of the Tag in this source document sequentially from beginning to end. Calling this metho
getRow
Returns the row number of the specified character position in the source document.
setLogger
Sets the Logger that handles log messages. Specifying a null argument disables logging completely fo
subSequence
Returns a new character sequence that is a subsequence of this source document.
getAllStartTags
getNextEndTag
Returns the EndTag of the specified EndTagType beginning at or immediately following the specified p
toString
Returns the source text as a String.
getAllTags
Returns a list of all Tag in this source document. Calling this method on the Source object performs
getEnd

Popular in Java

Parsing JSON documents to java classes using gson
setRequestProperty (URLConnection)
requestLocationUpdates (LocationManager)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
PrintWriter (java.io)
Wraps either an existing OutputStream or an existing Writer and provides convenience methods for prin
URI (java.net)
A Uniform Resource Identifier that identifies an abstract or physical resource, as specified by RFC
HashMap (java.util)
HashMap is an implementation of Map. All optional operations are supported. All elements are permitte
Options (org.apache.commons.cli)
Main entry-point into the library. Options represents a collection of Option objects, which describ
FileUtils (org.apache.commons.io)
General file manipulation utilities. Facilities are provided in the following areas: * writing to a
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
Best IntelliJ plugins

How to use the getParseText method in net.htmlparser.jericho.Source

Best Java code snippets using net.htmlparser.jericho.Source.getParseText (Showing top 20 results out of 315)

How to use
getParseText
method
in
net.htmlparser.jericho.Source