net.htmlparser.jericho.ParseText.indexOf java code examples

/**
 * Finds the first character reference that can be constructed at an ampersand
 * located at or after the given position.
 *
 * @param source  the source document to search.
 * @param pos  the position at which the search for <code>'&amp;'</code> starts.
 * @param unterminatedCharacterReferenceSettings  passed through unchanged to <code>construct</code>.
 * @return the first character reference found, or <code>null</code> if no ampersand from <code>pos</code> onwards yields one.
 */
private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
  final ParseText parseText=source.getParseText();
  // Visit each '&' in turn; an ampersand that does not start a character reference is skipped.
  for (int ampersandPos=parseText.indexOf('&',pos); ampersandPos!=-1; ampersandPos=parseText.indexOf('&',ampersandPos+1)) {
    final CharacterReference candidate=construct(source,ampersandPos,unterminatedCharacterReferenceSettings);
    if (candidate!=null) return candidate;
  }
  return null;
}

/**
 * Indicates whether the given segment suggests that the document is XML.
 * <p>
 * The segment must be a {@link Tag}; it is treated as XML if it is an XML declaration
 * or if its text contains "xhtml".
 *
 * @param firstNonTextSegment  the segment to inspect, may be <code>null</code>.
 * @return <code>true</code> if the segment indicates an XML document, otherwise <code>false</code>.
 */
private static boolean isXML(final Segment firstNonTextSegment) {
  // instanceof is false for null, so this also rejects a null argument
  if (!(firstNonTextSegment instanceof Tag)) return false;
  final Tag tag=(Tag)firstNonTextSegment;
  if (tag.getTagType()==StartTagType.XML_DECLARATION) return true;
  // A DOCTYPE declaration containing the text "xhtml" marks an XHTML (and therefore XML) document.
  // NOTE(review): the tag type is not checked here, so any tag containing "xhtml" matches - confirm this is intended.
  return tag.source.getParseText().indexOf("xhtml",tag.begin,tag.end)!=-1;
}

  /**
   * Returns the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document.
   * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
   * <p>
   * This default implementation looks for the first occurrence of the
   * {@linkplain #getClosingDelimiter() closing delimiter} at or after the specified position and
   * returns the position immediately following it.
   * <p>
   * The value <code>-1</code> is returned if the closing delimiter is not found.
   *
   * @param source  the {@link Source} document.
   * @param pos  the position in the source document.
   * @return the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document, or <code>-1</code> if the end of the tag can not be found.
   */
  protected int getEnd(final Source source, final int pos) {
    final int delimiterBegin=source.getParseText().indexOf(getClosingDelimiter(),pos);
    if (delimiterBegin==-1) return -1;
    // the tag ends immediately after its closing delimiter
    return delimiterBegin+getClosingDelimiter().length();
  }
}

  /**
   * Constructs a start tag with an empty name spanning from <code>pos</code> up to and including
   * the next <code>'&gt;'</code>, and logs an error reporting that its content does not match a
   * registered start tag type.
   *
   * @param source  the source document.
   * @param pos  the position at which the tag begins.
   * @return the constructed tag, or <code>null</code> if no <code>'&gt;'</code> follows <code>pos</code>.
   */
  protected Tag constructTagAt(final Source source, final int pos) {
    final int gtPos=source.getParseText().indexOf('>',pos+1);
    if (gtPos==-1) return null;
    final Tag unregisteredTag=constructStartTag(source,pos,gtPos+1,"",null);
    if (source.logger.isErrorEnabled()) {
      // message layout: prefix, row/column of the tag, suffix
      final StringBuilder message=source.getRowColumnVector(unregisteredTag.getBegin()).appendTo(new StringBuilder(200).append("Encountered possible StartTag at "));
      message.append(" whose content does not match a registered StartTagType");
      source.logger.error(message.toString());
    }
    return unregisteredTag;
  }
}

/**
 * Returns the next segment between <code>begin</code> and <code>end</code> that is not a tag,
 * and advances <code>pos</code> to the end of the returned segment.
 * <p>
 * Unless the legacy iterator compatibility mode is active, character references are returned as
 * segments of their own: plain text preceding a character reference is returned first and the
 * character reference is remembered so that the following call returns it.
 *
 * @param begin  the start of the range to examine.
 * @param end  the end of the range to examine.
 * @return a {@link CharacterReference} or a plain {@link Segment}.
 */
private Segment nextNonTagSegment(final int begin, final int end) {
  // legacy mode never splits out character references
  if (legacyIteratorCompatabilityMode) return new Segment(source,begin,pos=end);

  // a character reference remembered by the previous call is handed out first
  final CharacterReference pending=characterReferenceAtCurrentPosition;
  if (pending!=null) {
    characterReferenceAtCurrentPosition=null;
    pos=pending.end;
    return pending;
  }

  final ParseText parseText=source.getParseText();
  for (int ampersandPos=parseText.indexOf('&',begin,end); ampersandPos!=-1; ampersandPos=parseText.indexOf('&',ampersandPos+1,end)) {
    final CharacterReference found=CharacterReference.construct(source,ampersandPos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
    if (found==null) continue; // this '&' does not start a character reference
    if (ampersandPos==begin) {
      // the range starts with the character reference itself
      pos=found.end;
      return found;
    }
    // return the text in front of the character reference now, the reference itself on the next call
    pos=found.begin;
    characterReferenceAtCurrentPosition=found;
    return new Segment(source,begin,pos);
  }

  // no character reference in range: the whole range is one segment
  pos=end;
  return new Segment(source,begin,end);
}

  /**
   * Constructs an end tag whose name is the text between the start delimiter at <code>pos</code>
   * and the next closing delimiter, and logs an error reporting that its content does not match a
   * registered end tag type.
   *
   * @param source  the source document.
   * @param pos  the position at which the tag begins.
   * @return the constructed end tag.
   */
  protected Tag constructTagAt(final Source source, final int pos) {
    final int nameBegin=pos+getStartDelimiter().length();
    final int nameEnd=source.getParseText().indexOf(getClosingDelimiter(),nameBegin);
    // No explicit check for a missing closing delimiter:
    // getName throws IndexOutOfBoundsException if nameEnd==-1
    final String name=source.getName(nameBegin,nameEnd);
    final int tagEnd=nameEnd+getClosingDelimiter().length();
    final EndTag endTag=constructEndTag(source,pos,tagEnd,name);
    if (source.logger.isErrorEnabled()) {
      // message layout: prefix, row/column of the tag, suffix
      final StringBuilder message=source.getRowColumnVector(pos).appendTo(new StringBuilder(200).append("Encountered possible EndTag at "));
      message.append(" whose content does not match a registered EndTagType");
      source.logger.error(message.toString());
    }
    return endTag;
  }
}

/**
 * Searches forward from <code>pos</code> for the next tag that is to be included in searches.
 *
 * @param source  the source document.
 * @param pos  the position at which the search starts.
 * @param breakAtPos  the upper bound passed to the <code>'&lt;'</code> search.
 * @return the next matching tag, or <code>null</code> if there is none (including when <code>pos</code> is out of range).
 */
static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) {
  try {
    final ParseText parseText=source.getParseText();
    int searchFrom=pos;
    while (true) {
      // this assumes that all tags start with '<'
      // parseText.indexOf returns -1 if the position is out of range.
      final int begin=parseText.indexOf('<',searchFrom,breakAtPos);
      if (begin==-1) return null;
      final Tag tag=getTagAt(source,begin,false);
      if (tag!=null && tag.includeInSearch()) return tag;
      // not a usable tag at this '<': resume one character further on, unless the source is exhausted
      searchFrom=begin+1;
      if (searchFrom>=source.end) return null;
    }
  } catch (IndexOutOfBoundsException ex) {
    // this should only happen when the end of file is reached in the middle of a tag.
    // nothing needs to be done as there are no more tags anyway.
  }
  return null;
}

/**
 * Appends the text of the given segment to <code>appendable</code>, replacing every character
 * reference found from offset <code>searchBegin</code> onwards with the character it represents.
 *
 * @param appendable  the destination.
 * @param segment  the segment whose text is decoded.
 * @param searchBegin  offset, relative to the start of the segment, at which the search for character references starts.
 * @param insideAttributeValue  selects the unterminated character reference settings of the current compatibility mode.
 * @param convertNonBreakingSpaces  passed through to <code>CharacterReference.appendCharTo</code>.
 * @return the <code>appendable</code> argument.
 * @throws IOException if the appendable throws it.
 */
private static Appendable appendDecode(final Appendable appendable, final Segment segment, final int searchBegin, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException {
  final Config.UnterminatedCharacterReferenceSettings settings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue);
  final Source source=segment.source;
  final ParseText parseText=source.getParseText();
  final int end=segment.getEnd();
  // everything before copiedUpTo has already been written to the appendable
  int copiedUpTo=segment.getBegin();
  int ampersandPos=parseText.indexOf('&',copiedUpTo+searchBegin,end);
  while (ampersandPos!=-1) {
    final CharacterReference reference=CharacterReference.construct(source,ampersandPos,settings);
    final int resumeFrom;
    if (reference==null) {
      // a plain ampersand stays part of the literal text
      resumeFrom=ampersandPos+1;
    } else {
      // Don't use appendable.append(source,begin,pos) as it checks source.length() which may throw an exception when using StreamedSource.
      appendable.append(source.substring(copiedUpTo,ampersandPos));
      reference.appendCharTo(appendable,convertNonBreakingSpaces);
      copiedUpTo=reference.getEnd();
      resumeFrom=copiedUpTo;
    }
    ampersandPos=parseText.indexOf('&',resumeFrom,end);
  }
  // trailing literal text after the last character reference
  appendable.append(source.substring(copiedUpTo,end));
  return appendable;
}

int searchPos=pos;
while (searchPos<source.end) {
  searchPos=parseText.indexOf(searchString.toLowerCase(),searchPos);
  if (searchPos==-1) return null;
  final Tag tag=source.getEnclosingTag(searchPos);

/**
 * Returns the first tag found at a <code>'&lt;'</code> character at or after <code>pos</code>,
 * updating <code>source.fullSequentialParseData[0]</code> as a side effect.
 *
 * @param source  the source document being parsed.
 * @param parseText  the parse text that is searched for <code>'&lt;'</code>.
 * @param pos  the position at which the search starts.
 * @param assumeNoNestedTags  passed through to <code>TagType.getTagAt</code>; when <code>true</code> the update of <code>fullSequentialParseData[0]</code> is skipped.
 * @return the next tag, or <code>null</code> if there is none.
 */
private static final Tag parseAllgetNextTag(final Source source, final ParseText parseText, final int pos, final boolean assumeNoNestedTags) {
  try {
    int begin=pos;
    do {
      begin=parseText.indexOf('<',begin); // this assumes that all tags start with '<'
      if (begin==-1) return null;
      final Tag tag=TagType.getTagAt(source,begin,false,assumeNoNestedTags);
      if (tag!=null) {
        if (!assumeNoNestedTags) {
          // POSSIBLE BUG:
          // It appears that this code should be executed even if assumeNoNestedTags is true.
          // This was originally not the case when first created, but the subsequent addition of the SCRIPT element handling means it should always be executed.
          // This should be proven and fixed if assumeNoNestedTags is ever allowed to be true (at present it is hard coded to false).
          final TagType tagType=tag.getTagType();
          // Only a tag that ends beyond the stored value and is neither a DOCTYPE declaration
          // nor an unregistered start/end tag changes the stored value.
          if (tag.end>source.fullSequentialParseData[0]
              && tagType!=StartTagType.DOCTYPE_DECLARATION
              && tagType!=StartTagType.UNREGISTERED && tagType!=EndTagType.UNREGISTERED) {
            // A normal SCRIPT start tag that is not an empty-element tag stores Integer.MAX_VALUE; every other tag stores its own end.
            // NOTE(review): presumably the stored value is the position before which no further tags are recognised - confirm against its readers.
            source.fullSequentialParseData[0]=(tagType==StartTagType.NORMAL && tag.name==HTMLElementName.SCRIPT && !((StartTag)tag).isEmptyElementTag()) ? Integer.MAX_VALUE : tag.end;
          }
        }
        return tag;
      }
      // no tag at this '<': continue with the next character unless the end of the source has been reached
    } while ((begin+=1)<source.end);
  } catch (IndexOutOfBoundsException ex) {
    // this should only happen when the end of file is reached in the middle of a tag.
    // we don't have to do anything to handle it as there are no more tags anyway.
  }
  return null;
}

  /**
   * Returns the end of a tag of this type, starting the search at the specified position.
   *
   * @param source  the source document.
   * @param pos  the position in the source document.
   * @return the position immediately after this tag type's closing delimiter, <code>-1</code> if no
   *  downlevel hidden closing delimiter is found, or <code>-2</code> if the delimiter found does not
   *  belong to this tag type.
   */
  protected int getEnd(final Source source, final int pos) {
    // This override exists because this tag type has the same start delimiter as the downlevel hidden conditional comment,
    // and the closing delimiter of that other tag type must not appear inside this tag.
    // Consider:
    // <!--[if IE]> ... <![endif]--> ... <!--[if !(IE 5)]><!--> ... <!--<![endif]-->
    // With the default implementation the parser would recognise the first tag as:
    // <!--[if IE]> ... <![endif]--> ... <!--[if !(IE 5)]><!-->
    final String hiddenClosingDelimiter=MicrosoftConditionalCommentTagTypes.DOWNLEVEL_HIDDEN_IF.getClosingDelimiter();
    final int delimiterBegin=source.getParseText().indexOf(hiddenClosingDelimiter,pos);
    if (delimiterBegin==-1) return -1;
    if (!source.getParseText().containsAt(getClosingDelimiter(),delimiterBegin)) {
      // this is a downlevel hidden conditional comment, so fail this tag type silently without displaying a log message
      return -2;
    }
    return delimiterBegin+getClosingDelimiter().length();
  }
}

int begin=pos;
do {
  begin=parseText.indexOf(searchString,begin);
  if (begin==-1) return null;
  final EndTag endTag=(EndTag)source.getTagAt(begin);

/**
 * Returns the first start tag beginning at or after <code>pos</code> that has the specified
 * attribute, as determined by <code>StartTag.hasAttribute(attributeName,valueRegexPattern)</code>.
 * <p>
 * If a full sequential parse has been performed the start tags are simply walked in order.
 * Otherwise the parse text is searched for the attribute name and each hit is resolved to its
 * enclosing tag.
 *
 * @param source  the source document.
 * @param pos  the position at which the search starts.
 * @param attributeName  the attribute name to look for, must not be <code>null</code> or empty.
 * @param valueRegexPattern  passed through to <code>StartTag.hasAttribute</code>.
 * @return the matching start tag, or <code>null</code> if there is none.
 * @throws IllegalArgumentException if <code>attributeName</code> is <code>null</code> or empty.
 */
static StartTag getNext(final Source source, final int pos, final String attributeName, final Pattern valueRegexPattern) {
  if (attributeName==null || attributeName.length()==0) throw new IllegalArgumentException("attributeName must not be null or empty");
  if (source.wasFullSequentialParseCalled()) {
    StartTag startTag=source.getNextStartTag(pos);
    while (true) {
      if (startTag==null) return null;
      if (startTag.hasAttribute(attributeName,valueRegexPattern)) return startTag;
      startTag=startTag.getNextStartTag();
    }
  } else {
    // Lower-case the search string once, outside the loop, and independently of the JVM default
    // locale (under a Turkish locale "ID" would otherwise become "\u0131d").
    // NOTE(review): assumes the parse text is a locale-independent lower-case view of the source - confirm.
    final String searchString=attributeName.toLowerCase(java.util.Locale.ROOT);
    final ParseText parseText=source.getParseText();
    int searchPos=pos;
    while (searchPos<source.end) {
      searchPos=parseText.indexOf(searchString,searchPos);
      if (searchPos==-1) return null;
      final Tag tag=source.getEnclosingTag(searchPos);
      if (tag==null || !(tag instanceof StartTag)) {
        // the hit is not inside a start tag: move on by one character
        searchPos++;
        continue;
      }
      // a tag that begins before pos is outside the requested range even though the hit itself is not
      if (tag.begin>=pos) {
        final StartTag startTag=(StartTag)tag;
        if (startTag.hasAttribute(attributeName,valueRegexPattern)) return startTag;
      }
      // skip the rest of this tag; it has already been examined
      searchPos=tag.end;
    }
    return null;
  }
}

/**
 * Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>.
 * <p>
 * The check is designed to be relatively inexpensive while giving an accurate result in most normal situations.
 * Determining exactly whether the source document is XML would require a much more complex analysis of the text.
 * <p>
 * The algorithm is as follows:
 * <ol class="HalfSeparated">
 *  <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
 *  <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
 *   "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
 *   also an XML document.
 *  <li>Otherwise the document is assumed to be normal HTML, and therefore not an XML document.
 * </ol>
 *
 * @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
 */
public boolean isXML() {
  // rule 1: the document starts with an XML declaration
  final Tag firstTag=getTagAt(0);
  if (firstTag!=null && firstTag.getTagType()==StartTagType.XML_DECLARATION) return true;
  // rule 2: there is a DOCTYPE declaration and it contains the text "xhtml"
  final Tag doctypeTag=getNextTag(0,StartTagType.DOCTYPE_DECLARATION);
  if (doctypeTag==null) return false;
  return getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1;
}

/**
 * Searches forward from <code>pos</code> for the next tag of the specified type.
 * <p>
 * A <code>null</code> tag type delegates to the overload that accepts any tag.
 *
 * @param source  the source document.
 * @param pos  the position at which the search starts.
 * @param tagType  the required tag type, or <code>null</code> for any tag.
 * @param breakAtPos  the upper bound passed to the start delimiter search.
 * @return the next matching tag, or <code>null</code> if there is none (including when <code>pos</code> is out of range).
 */
static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) {
  if (tagType==null) return getNextTagUncached(source,pos,breakAtPos);
  final String startDelimiter=tagType.getStartDelimiter();
  try {
    final ParseText parseText=source.getParseText();
    int searchFrom=pos;
    while (true) {
      // parseText.indexOf returns -1 if the position is out of range.
      final int begin=parseText.indexOf(startDelimiter,searchFrom,breakAtPos);
      if (begin==-1) return null;
      final Tag tag=getTagAt(source,begin,false);
      if (tag!=null && tag.getTagType()==tagType) return tag;
      // the delimiter here belongs to no tag or to a tag of another type: resume one character further on
      searchFrom=begin+1;
      if (searchFrom>=source.end) return null;
    }
  } catch (IndexOutOfBoundsException ex) {
    // this should only happen when the end of file is reached in the middle of a tag.
    // nothing needs to be done as there are no more tags anyway.
  }
  return null;
}

/**
 * Returns the segments of all URIs referenced through <code>url(</code> in the CSS contained in
 * the given segment.
 * <p>
 * If the segment contains no start tags it is treated as the value of a <code>style</code>
 * attribute. Otherwise the values of all <code>style</code> attributes and the content of all
 * <code>STYLE</code> elements inside the segment are examined, and the result is sorted.
 *
 * @param segment  the segment to examine, may be <code>null</code>.
 * @return the URI segments found; an empty list if the segment is <code>null</code>, empty, or contains no <code>url(</code>.
 */
public static List<Segment> getStyleURISegments(final Segment segment) {
  if (segment==null || segment.length()==0) return Collections.emptyList();
  if (segment.getFirstStartTag()==null) {
    // no start tags in this segment, assume the segment is a style attribute value
    int urlDelimiterStartPos=segment.getSource().getParseText().indexOf("url(",segment.getBegin(),segment.getEnd());
    if (urlDelimiterStartPos==-1) return Collections.emptyList();
    return addURLSegmentsFromCSS(new ArrayList<Segment>(),new Segment(segment.getSource(),urlDelimiterStartPos,segment.getEnd()));
  }
  List<Segment> uriSegments=new ArrayList<Segment>();
  for (StartTag startTag : segment.getAllStartTags("style",null)) {
    final Segment styleValueSegment=startTag.getAttributes().get("style").getValueSegment();
    // A style attribute written without a value (e.g. <div style>) has no value segment;
    // skip it instead of passing null on, which would be dereferenced by addURLSegmentsFromCSS.
    if (styleValueSegment!=null) addURLSegmentsFromCSS(uriSegments,styleValueSegment);
  }
  for (Element element : segment.getAllElements(HTMLElementName.STYLE)) {
    addURLSegmentsFromCSS(uriSegments,element.getContent());
  }
  // the two passes above collect attribute values first and STYLE elements second, so sort the combined result
  Collections.sort(uriSegments);
  return uriSegments;
}

int begin=pos;
do {
  begin=parseText.indexOf(startDelimiter,begin);
  if (begin==-1) return null;
  final StartTag startTag=(StartTag)Tag.getTagAt(source,begin,false);

/**
 * Adds to <code>uriSegments</code> a segment for the argument of every <code>url(...)</code>
 * found in the given CSS segment, excluding surrounding white space and one optional quote
 * character on each side.
 * <p>
 * Scanning stops at the first <code>url(</code> that is unterminated or has an empty argument.
 *
 * @param uriSegments  the list to add to.
 * @param cssSegment  the segment containing CSS text.
 * @return the <code>uriSegments</code> argument.
 */
private static List<Segment> addURLSegmentsFromCSS(final List<Segment> uriSegments, final Segment cssSegment) {
  final Source source=cssSegment.getSource();
  final ParseText parseText=source.getParseText();
  final int limit=cssSegment.getEnd();
  int searchFrom=cssSegment.getBegin();
  while (true) {
    final int urlTokenPos=parseText.indexOf("url(",searchFrom,limit);
    if (urlTokenPos==-1) break;

    // step over "url(", leading white space and an optional opening quote
    int uriBegin=urlTokenPos+4;
    while (uriBegin<limit && Segment.isWhiteSpace(parseText.charAt(uriBegin))) uriBegin++;
    if (uriBegin>=limit) break;
    if (isQuote(parseText.charAt(uriBegin))) {
      uriBegin++;
      if (uriBegin>=limit) break;
    }

    final int closingBracketPos=parseText.indexOf(')',uriBegin,limit);
    if (closingBracketPos==-1) break;

    // walk back from ')' over trailing white space and an optional closing quote
    int uriEnd=closingBracketPos;
    while (Segment.isWhiteSpace(parseText.charAt(uriEnd-1))) uriEnd--;
    if (isQuote(parseText.charAt(uriEnd-1))) uriEnd--;
    if (uriEnd<=uriBegin) break; // empty argument

    uriSegments.add(new Segment(source,uriBegin,uriEnd));
    searchFrom=closingBracketPos;
  }
  return uriSegments;
}

Javadoc

Returns the index within this parse text of the first occurrence of the specified character, starting the search at the position specified by fromIndex.

If the specified character is not found then -1 is returned.

Popular methods of ParseText

charAt
Returns the character at the specified index.
containsAt
Indicates whether this parse text contains the specified string at the specified position. This meth
lastIndexOf
Returns the index within this parse text of the last occurrence of the specified string, searching b
length
Returns the length of the parse text.
subSequence
Returns a new character sequence that is a subsequence of this sequence.

Popular in Java

Updating database using SQL prepared statement
notifyDataSetChanged (ArrayAdapter)
runOnUiThread (Activity)
setScale (BigDecimal)
MalformedURLException (java.net)
This exception is thrown when a program attempts to create a URL from an incorrect specification.
Collection (java.util)
Collection is the root of the collection hierarchy. It defines operations on data collections and t
Properties (java.util)
A Properties object is a Hashtable where the keys and values must be Strings. Each property can have
Queue (java.util)
A collection designed for holding elements prior to processing. Besides basic java.util.Collection o
UUID (java.util)
UUID is an immutable representation of a 128-bit universally unique identifier (UUID). There are mul
GridBagLayout (java.awt)
The GridBagLayout class is a flexible layout manager that aligns components vertically and horizonta
Best IntelliJ plugins

How to use the indexOf method in net.htmlparser.jericho.ParseText

Best Java code snippets using net.htmlparser.jericho.ParseText.indexOf (Showing top 18 results out of 315)

How to use
indexOf
method
in
net.htmlparser.jericho.ParseText