/**
 * Finds the first character reference located at or after the specified position.
 * <p>
 * Each ampersand from <code>pos</code> onwards is tried in turn as the start of a character reference;
 * the first one for which <code>construct</code> yields a non-null result is returned.
 *
 * @param source  the source document to search.
 * @param pos  the position at which the search for an ampersand starts.
 * @param unterminatedCharacterReferenceSettings  passed unchanged to <code>construct</code> for every candidate.
 * @return the first character reference found, or <code>null</code> if no candidate ampersand yields one.
 */
private static CharacterReference getNext(final Source source, int pos, final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
    final ParseText parseText=source.getParseText();
    for (int ampersandPos=parseText.indexOf('&',pos); ampersandPos!=-1; ampersandPos=parseText.indexOf('&',ampersandPos+1)) {
        final CharacterReference candidate=construct(source,ampersandPos,unterminatedCharacterReferenceSettings);
        if (candidate!=null) {
            return candidate;
        }
        // not a character reference at this ampersand: resume the search one character further on
    }
    return null;
}
private static boolean isXML(final Segment firstNonTextSegment) { if (firstNonTextSegment==null || !(firstNonTextSegment instanceof Tag)) return false; Tag tag=(Tag)firstNonTextSegment; if (tag.getTagType()==StartTagType.XML_DECLARATION) return true; // if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document: if (tag.source.getParseText().indexOf("xhtml",tag.begin,tag.end)!=-1) return true; return false; }
/**
 * Returns the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document.
 * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
 * <p>
 * This default implementation looks for the first occurrence of the
 * {@linkplain #getClosingDelimiter() closing delimiter} at or after the specified position and
 * returns the position immediately following that delimiter.
 * <p>
 * The value <code>-1</code> is returned when no closing delimiter is found.
 *
 * @param source  the {@link Source} document.
 * @param pos  the position in the source document.
 * @return the {@linkplain Tag#getEnd() end} of a tag of this type, starting from the specified position in the specified source document, or <code>-1</code> if the end of the tag can not be found.
 */
protected int getEnd(final Source source, final int pos) {
    final int delimiterBegin=source.getParseText().indexOf(getClosingDelimiter(),pos);
    if (delimiterBegin==-1) {
        return -1;
    }
    // the tag ends just past the closing delimiter
    return delimiterBegin+getClosingDelimiter().length();
}
}
/**
 * Constructs a start tag beginning at the specified position and ending just after the next
 * '<code>&gt;</code>' character, then logs an error (if error logging is enabled) noting that its content
 * does not match a registered StartTagType.
 *
 * @param source  the {@link Source} document.
 * @param pos  the position in the source document at which the tag begins.
 * @return the constructed tag, or <code>null</code> if no '<code>&gt;</code>' is found after <code>pos</code>.
 */
protected Tag constructTagAt(final Source source, final int pos) {
    final int closingDelimiterPos=source.getParseText().indexOf('>',pos+1);
    if (closingDelimiterPos==-1) {
        return null;
    }
    // the tag is built with an empty name and a null final argument
    final Tag tag=constructStartTag(source,pos,closingDelimiterPos+1,"",null);
    if (source.logger.isErrorEnabled()) {
        final StringBuilder prefix=new StringBuilder(200).append("Encountered possible StartTag at ");
        source.logger.error(
            source.getRowColumnVector(tag.getBegin())
                .appendTo(prefix)
                .append(" whose content does not match a registered StartTagType")
                .toString());
    }
    return tag;
}
}
/**
 * Produces the next segment of non-tag content between <code>begin</code> and <code>end</code>,
 * advancing <code>pos</code> to the end of whatever is returned.
 * <p>
 * Unless legacy iterator compatibility mode is on, character references are returned as separate segments:
 * <ul>
 *  <li>a character reference remembered from the previous call is returned first;
 *  <li>a character reference starting exactly at <code>begin</code> is returned directly;
 *  <li>a character reference starting later is remembered for the next call, and the plain text in front of it is returned;
 *  <li>otherwise the whole range is returned as one segment.
 * </ul>
 *
 * @param begin  the start of the range to examine.
 * @param end  the end of the range to examine.
 * @return the next segment, which is either a {@link CharacterReference} or a plain {@link Segment}.
 */
private Segment nextNonTagSegment(final int begin, final int end) {
    if (legacyIteratorCompatabilityMode) {
        // legacy mode never splits out character references
        pos=end;
        return new Segment(source,begin,end);
    }
    final CharacterReference pending=characterReferenceAtCurrentPosition;
    if (pending!=null) {
        // hand out the reference that the previous call discovered behind a run of plain text
        characterReferenceAtCurrentPosition=null;
        pos=pending.end;
        return pending;
    }
    final ParseText parseText=source.getParseText();
    for (int ampersandPos=parseText.indexOf('&',begin,end); ampersandPos!=-1; ampersandPos=parseText.indexOf('&',ampersandPos+1,end)) {
        final CharacterReference found=CharacterReference.construct(source,ampersandPos,Config.UnterminatedCharacterReferenceSettings.ACCEPT_ALL);
        if (found==null) {
            continue;
        }
        if (ampersandPos==begin) {
            pos=found.end;
            return found;
        }
        // plain text precedes the reference: return the text now and keep the reference for the next call
        pos=found.begin;
        characterReferenceAtCurrentPosition=found;
        return new Segment(source,begin,pos);
    }
    pos=end;
    return new Segment(source,begin,end);
}
protected Tag constructTagAt(final Source source, final int pos) { final ParseText parseText=source.getParseText(); final int nameBegin=pos+getStartDelimiter().length(); final int nameEnd=parseText.indexOf(getClosingDelimiter(),nameBegin); final String name=source.getName(nameBegin,nameEnd); // throws IndexOutOfBoundsException if nameEnd==-1 final EndTag endTag=constructEndTag(source,pos,nameEnd+getClosingDelimiter().length(),name); if (source.logger.isErrorEnabled()) source.logger.error(source.getRowColumnVector(pos).appendTo(new StringBuilder(200).append("Encountered possible EndTag at ")).append(" whose content does not match a registered EndTagType").toString()); return endTag; } }
static final Tag getNextTagUncached(final Source source, final int pos, final int breakAtPos) { // returns null if pos is out of range. try { final ParseText parseText=source.getParseText(); int begin=pos; do { begin=parseText.indexOf('<',begin,breakAtPos); // this assumes that all tags start with '<' // parseText.lastIndexOf and indexOf return -1 if pos is out of range. if (begin==-1) return null; final Tag tag=getTagAt(source,begin,false); if (tag!=null && tag.includeInSearch()) return tag; } while ((begin+=1)<source.end); } catch (IndexOutOfBoundsException ex) { // this should only happen when the end of file is reached in the middle of a tag. // we don't have to do anything to handle it as there are no more tags anyway. } return null; }
private static Appendable appendDecode(final Appendable appendable, final Segment segment, final int searchBegin, final boolean insideAttributeValue, final boolean convertNonBreakingSpaces) throws IOException { final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(insideAttributeValue); final Source source=segment.source; final ParseText parseText=source.getParseText(); final int end=segment.getEnd(); int begin=segment.getBegin(); int pos=parseText.indexOf('&',begin+searchBegin,end); while (pos!=-1) { final CharacterReference characterReference=CharacterReference.construct(source,pos,unterminatedCharacterReferenceSettings); if (characterReference!=null) { appendable.append(source.substring(begin,pos)); // Don't use appendable.append(source,begin,pos) as it checks source.length() which may throw an exception when using StreamedSource. characterReference.appendCharTo(appendable,convertNonBreakingSpaces); begin=characterReference.getEnd(); pos=parseText.indexOf('&',begin,end); } else { pos=parseText.indexOf('&',pos+1,end); } } appendable.append(source.substring(begin,end)); return appendable; }
int searchPos=pos; while (searchPos<source.end) { searchPos=parseText.indexOf(searchString.toLowerCase(),searchPos); if (searchPos==-1) return null; final Tag tag=source.getEnclosingTag(searchPos);
private static final Tag parseAllgetNextTag(final Source source, final ParseText parseText, final int pos, final boolean assumeNoNestedTags) { try { int begin=pos; do { begin=parseText.indexOf('<',begin); // this assumes that all tags start with '<' if (begin==-1) return null; final Tag tag=TagType.getTagAt(source,begin,false,assumeNoNestedTags); if (tag!=null) { if (!assumeNoNestedTags) { // POSSIBLE BUG: // It appears that this code should be executed even if assumeNoNestedTags is true. // This was originally not the case when first created, but the subsequent addition of the SCRIPT element handling means it should always be executed. // This should be proven and fixed if assumeNoNestedTags is ever allowed to be true (at present it is hard coded to false). final TagType tagType=tag.getTagType(); if (tag.end>source.fullSequentialParseData[0] && tagType!=StartTagType.DOCTYPE_DECLARATION && tagType!=StartTagType.UNREGISTERED && tagType!=EndTagType.UNREGISTERED) { source.fullSequentialParseData[0]=(tagType==StartTagType.NORMAL && tag.name==HTMLElementName.SCRIPT && !((StartTag)tag).isEmptyElementTag()) ? Integer.MAX_VALUE : tag.end; } } return tag; } } while ((begin+=1)<source.end); } catch (IndexOutOfBoundsException ex) { // this should only happen when the end of file is reached in the middle of a tag. // we don't have to do anything to handle it as there are no more tags anyway. } return null; }
protected int getEnd(final Source source, final int pos) { // This method needs to be overridden because this tag type shares the same start delimiter as the downlevel hidden conditional comment. // The closing delimiter of the other tag type must not appear inside this tag. // Take the following example: // <!--[if IE]> ... <![endif]--> ... <!--[if !(IE 5)]><!--> ... <!--<![endif]--> // If the default implementation were used, then the parser would recognise the first tag as: // <!--[if IE]> ... <![endif]--> ... <!--[if !(IE 5)]><!--> final int delimiterBegin=source.getParseText().indexOf(MicrosoftConditionalCommentTagTypes.DOWNLEVEL_HIDDEN_IF.getClosingDelimiter(),pos); if (delimiterBegin==-1) return -1; if (source.getParseText().containsAt(getClosingDelimiter(),delimiterBegin)) return delimiterBegin+getClosingDelimiter().length(); // this is a downlevel hidden conditional comment, so fail this tag type silently without displaying a log message return -2; } }
int begin=pos; do { begin=parseText.indexOf(searchString,begin); if (begin==-1) return null; final EndTag endTag=(EndTag)source.getTagAt(begin);
/**
 * Returns the first start tag beginning at or after <code>pos</code> that has the specified attribute
 * (as determined by <code>StartTag.hasAttribute(attributeName,valueRegexPattern)</code>).
 * <p>
 * If a full sequential parse has already been performed, the start tags are simply walked in order.
 * Otherwise the parse text is searched for occurrences of the lower-cased attribute name, and the tag
 * enclosing each occurrence is tested.
 *
 * @param source  the source document to search.
 * @param pos  the position at which to start searching.
 * @param attributeName  the attribute name to look for, must not be <code>null</code> or empty.
 * @param valueRegexPattern  passed unchanged to <code>hasAttribute</code>.
 * @return the first matching start tag, or <code>null</code> if there is none.
 * @throws IllegalArgumentException if <code>attributeName</code> is <code>null</code> or empty.
 */
static StartTag getNext(final Source source, final int pos, final String attributeName, final Pattern valueRegexPattern) {
    if (attributeName==null || attributeName.length()==0) {
        throw new IllegalArgumentException("attributeName must not be null or empty");
    }
    if (source.wasFullSequentialParseCalled()) {
        for (StartTag startTag=source.getNextStartTag(pos); startTag!=null; startTag=startTag.getNextStartTag()) {
            if (startTag.hasAttribute(attributeName,valueRegexPattern)) {
                return startTag;
            }
        }
        return null;
    }
    // The search key never changes, so lower-case it once instead of on every loop iteration.
    // NOTE(review): toLowerCase() uses the default locale; consider a fixed locale if the parse text
    // is lower-cased independently of locale - confirm against ParseText before changing.
    final String searchString=attributeName.toLowerCase();
    final ParseText parseText=source.getParseText();
    int searchPos=pos;
    while (searchPos<source.end) {
        searchPos=parseText.indexOf(searchString,searchPos);
        if (searchPos==-1) {
            return null;
        }
        final Tag tag=source.getEnclosingTag(searchPos);
        // instanceof is false for null, so this also covers text that is not inside any tag
        if (!(tag instanceof StartTag)) {
            searchPos++;
            continue;
        }
        // a tag that begins before pos may still enclose the match; it is skipped but the search resumes after it
        if (tag.begin>=pos) {
            final StartTag startTag=(StartTag)tag;
            if (startTag.hasAttribute(attributeName,valueRegexPattern)) {
                return startTag;
            }
        }
        searchPos=tag.end;
    }
    return null;
}
/** * Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>. * <p> * The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in * most normal situations. * An exact determination of whether the source document is XML would require a much more complex analysis of the text. * <p> * The algorithm is as follows: * <ol class="HalfSeparated"> * <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document. * <li>If the document contains a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text * "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence * also an XML document. * <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document. * </ol> * * @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>. */ public boolean isXML() { final Tag xmlDeclarationTag=getTagAt(0); if (xmlDeclarationTag!=null && xmlDeclarationTag.getTagType()==StartTagType.XML_DECLARATION) return true; final Tag doctypeTag=getNextTag(0,StartTagType.DOCTYPE_DECLARATION); // if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document: if (doctypeTag!=null && getParseText().indexOf("xhtml",doctypeTag.begin,doctypeTag.end)!=-1) return true; return false; }
static final Tag getNextTagUncached(final Source source, final int pos, final TagType tagType, final int breakAtPos) { // returns null if pos is out of range. if (tagType==null) return getNextTagUncached(source,pos,breakAtPos); final String startDelimiter=tagType.getStartDelimiter(); try { final ParseText parseText=source.getParseText(); int begin=pos; do { begin=parseText.indexOf(startDelimiter,begin,breakAtPos); // parseText.lastIndexOf and indexOf return -1 if pos is out of range. if (begin==-1) return null; final Tag tag=getTagAt(source,begin,false); if (tag!=null && tag.getTagType()==tagType) return tag; } while ((begin+=1)<source.end); } catch (IndexOutOfBoundsException ex) { // this should only happen when the end of file is reached in the middle of a tag. // we don't have to do anything to handle it as there are no more tags anyway. } return null; }
public static List<Segment> getStyleURISegments(final Segment segment) { if (segment==null || segment.length()==0) return Collections.emptyList(); if (segment.getFirstStartTag()==null) { // no start tags in this segment, assume the segment is a style attribute value int urlDelimiterStartPos=segment.getSource().getParseText().indexOf("url(",segment.getBegin(),segment.getEnd()); if (urlDelimiterStartPos==-1) return Collections.emptyList(); return addURLSegmentsFromCSS(new ArrayList<Segment>(),new Segment(segment.getSource(),urlDelimiterStartPos,segment.getEnd())); } List<Segment> uriSegments=new ArrayList<Segment>(); for (StartTag startTag : segment.getAllStartTags("style",null)) { addURLSegmentsFromCSS(uriSegments,startTag.getAttributes().get("style").getValueSegment()); } for (Element element : segment.getAllElements(HTMLElementName.STYLE)) { addURLSegmentsFromCSS(uriSegments,element.getContent()); } Collections.sort(uriSegments); return uriSegments; }
int begin=pos; do { begin=parseText.indexOf(startDelimiter,begin); if (begin==-1) return null; final StartTag startTag=(StartTag)Tag.getTagAt(source,begin,false);
/**
 * Scans the specified CSS segment for <code>url(...)</code> references and adds a segment for each URI to the specified list.
 * <p>
 * White space directly inside the brackets and one quote character on either side of the URI are excluded from the added segment.
 * Scanning stops at the first reference that is unterminated or empty.
 *
 * @param uriSegments  the list that receives the URI segments.
 * @param cssSegment  the segment containing the CSS text to scan.
 * @return the <code>uriSegments</code> argument.
 */
private static List<Segment> addURLSegmentsFromCSS(final List<Segment> uriSegments, final Segment cssSegment) {
    final Source source=cssSegment.getSource();
    final ParseText parseText=source.getParseText();
    final int limit=cssSegment.getEnd();
    int pos=cssSegment.getBegin();
    while (true) {
        pos=parseText.indexOf("url(",pos,limit);
        if (pos==-1) {
            break;
        }
        pos+=4; // step over "url("
        while (pos<limit && Segment.isWhiteSpace(parseText.charAt(pos))) {
            pos++;
        }
        if (pos>=limit) {
            break;
        }
        if (isQuote(parseText.charAt(pos))) {
            // skip the opening quote
            pos++;
            if (pos>=limit) {
                break;
            }
        }
        final int uriBegin=pos;
        final int closingBracketPos=parseText.indexOf(')',uriBegin,limit);
        if (closingBracketPos==-1) {
            break;
        }
        // walk back from the closing bracket over trailing white space and at most one closing quote
        int uriEnd=closingBracketPos;
        while (Segment.isWhiteSpace(parseText.charAt(uriEnd-1))) {
            uriEnd--;
        }
        if (isQuote(parseText.charAt(uriEnd-1))) {
            uriEnd--;
        }
        if (uriEnd<=uriBegin) {
            break;
        }
        uriSegments.add(new Segment(source,uriBegin,uriEnd));
        pos=closingBracketPos;
    }
    return uriSegments;
}