/**
 * Creates a {@link URINormalisationFilter} wrapping the given token stream.
 *
 * @param input the upstream token stream to wrap
 * @return the normalising filter over {@code input}
 */
@Override
public TokenStream create(final TokenStream input) {
  final URINormalisationFilter filter = new URINormalisationFilter(input);
  return filter;
}
/**
 * Builds a URI normalisation filter over the given token stream.
 * Registers the term and position-increment attributes on the stream and
 * allocates a 256-character working buffer for the URI being normalised.
 *
 * @param input the upstream token stream to normalise
 */
public URINormalisationFilter(final TokenStream input) {
  super(input);
  this.termAtt = this.addAttribute(CharTermAttribute.class);
  this.posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
  this.termBuffer = CharBuffer.allocate(256);
}
protected void nextToken() { // There is still delimiters while (this.findNextToken()) { // SRN-66: skip tokens with less than 4 characters if (end - start < 4) { start = end; continue; } this.updateToken(); _nTokens++; return; } // No more delimiters, we have to return the full URI as last step this.updateFinalToken(); _isNormalising = false; }
/**
 * Decides whether a character ends the current sub-token: either a
 * delimiter or an uppercase letter (camel-case boundary).
 *
 * @param c the character to test
 * @return {@code true} if the character is a token break point
 */
protected boolean isBreakPoint(final int c) {
  if (this.isDelim(c)) {
    return true;
  }
  return this.isUppercase(c);
}
/**
 * Positions {@code start}/{@code end} around the next sub-token of the
 * buffered URI: skips leading delimiters, then extends the token until the
 * next break point (delimiter or uppercase character).
 *
 * @return {@code true} if a sub-token was found, {@code false} when the
 *         buffer is exhausted
 */
protected boolean findNextToken() {
  // Advance past any run of delimiters.
  while (start < termLength && this.isDelim(termBuffer.get(start))) {
    start++;
  }
  if (start >= termLength) {
    return false;
  }
  // Extend the token; it always contains at least one character.
  end = start + 1;
  while (end < termLength && !this.isBreakPoint(termBuffer.get(end))) {
    end++;
  }
  return true;
}
@Override public final boolean incrementToken() throws java.io.IOException { // While we are normalising the URI if (_isNormalising) { this.posIncrAtt.setPositionIncrement(1); // reset the position increment this.nextToken(); return true; } // Otherwise, get next URI token and start normalisation if (input.incrementToken()) { termLength = termAtt.length(); this.updateBuffer(); _isNormalising = true; start = end = 0; _nTokens =0; this.skipScheme(); this.nextToken(); return true; } return false; }
/** * Given the type of URI normalisation, apply the right sequence of operations * and filters to the token stream. */ private TokenStream applyURINormalisation(TokenStream in) { switch (normalisationType) { case NONE: return new URITrailingSlashFilter(in); // here, trailing slash filter is after localname filtering, in order to // avoid filtering subdirectory instead of localname case LOCALNAME: in = new URILocalnameFilter(in); return new URITrailingSlashFilter(in); // here, trailing slash filter is before localname filtering, in order to // avoid trailing slash checking on every tokens generated by the // URI normalisation filter case FULL: in = new URITrailingSlashFilter(in); return new URINormalisationFilter(in); default: throw new EnumConstantNotPresentException(URINormalisation.class, normalisationType.toString()); } }
/**
 * Asserts that the given tokenizer, wrapped in a {@link URINormalisationFilter},
 * produces the expected token images (and, optionally, the expected token
 * types) for the given input, followed by end of stream.
 *
 * @param t the tokenizer under test; must expose a {@link CharTermAttribute}
 * @param input the text to tokenize
 * @param expectedImages the expected token strings, in order
 * @param expectedTypes the expected token types, or {@code null} to skip
 *        type checking
 * @throws Exception if the stream fails or an assertion does not hold
 */
public void assertNormalisesTo(final Tokenizer t, final String input,
                               final String[] expectedImages,
                               final String[] expectedTypes)
throws Exception {
  assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
  final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

  TypeAttribute typeAtt = null;
  if (expectedTypes != null) {
    assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
    typeAtt = t.getAttribute(TypeAttribute.class);
  }

  t.setReader(new StringReader(input));

  final TokenStream filter = new URINormalisationFilter(t);
  // BUG FIX: reset() must be called on the outermost stream AFTER wrapping,
  // so the whole chain (filter and tokenizer) is reset before consumption.
  // Previously only the tokenizer was reset, before the filter existed,
  // leaving the filter's internal state uninitialised.
  filter.reset();

  for (int i = 0; i < expectedImages.length; i++) {
    assertTrue("token " + i + " exists", filter.incrementToken());
    assertEquals(expectedImages[i], termAtt.toString());
    if (expectedTypes != null) {
      assertEquals(expectedTypes[i], typeAtt.type());
    }
  }
  assertFalse("end of stream", filter.incrementToken());
  filter.end();
  filter.close();
}