/**
 * Creates a shallow copy of this reference. The backing character array is
 * <b>not</b> duplicated: the returned object and this object point at the
 * same {@code chars}, with the same {@code offset} and {@code length}.
 *
 * @return a new {@code CharsRef} sharing this instance's characters
 * @see #deepCopyOf
 */
@Override
public CharsRef clone() {
  final CharsRef shallowCopy = new CharsRef(chars, offset, length);
  return shallowCopy;
}
/** Sole constructor; backs this builder with a freshly constructed {@link CharsRef}. */
public CharsRefBuilder() {
  this.ref = new CharsRef();
}
@Override public CharsRef subtract(CharsRef output, CharsRef inc) { assert output != null; assert inc != null; if (inc == NO_OUTPUT) { // no prefix removed return output; } else if (inc.length == output.length) { // entire output removed return NO_OUTPUT; } else { assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length; assert inc.length > 0; return new CharsRef(output.chars, output.offset + inc.length, output.length-inc.length); } }
/**
 * Concatenates {@code prefix} and {@code output}.
 *
 * @param prefix the leading characters; must not be null
 * @param output the trailing characters; must not be null
 * @return {@code output} when {@code prefix} is {@code NO_OUTPUT},
 *         {@code prefix} when {@code output} is {@code NO_OUTPUT}, otherwise
 *         a newly allocated {@code CharsRef} holding both, prefix first
 */
@Override
public CharsRef add(CharsRef prefix, CharsRef output) {
  assert prefix != null;
  assert output != null;

  if (prefix == NO_OUTPUT) {
    return output;
  }
  if (output == NO_OUTPUT) {
    return prefix;
  }

  assert prefix.length > 0;
  assert output.length > 0;

  final int totalLength = prefix.length + output.length;
  final CharsRef joined = new CharsRef(totalLength);
  // prefix goes at the front of the new array, output directly after it
  System.arraycopy(prefix.chars, prefix.offset, joined.chars, 0, prefix.length);
  System.arraycopy(output.chars, output.offset, joined.chars, prefix.length, output.length);
  joined.length = totalLength;
  return joined;
}
@Override public CharsRef common(CharsRef output1, CharsRef output2) { assert output1 != null; assert output2 != null; int pos1 = output1.offset; int pos2 = output2.offset; int stopAt1 = pos1 + Math.min(output1.length, output2.length); while(pos1 < stopAt1) { if (output1.chars[pos1] != output2.chars[pos2]) { break; } pos1++; pos2++; } if (pos1 == output1.offset) { // no common prefix return NO_OUTPUT; } else if (pos1 == output1.offset + output1.length) { // output1 is a prefix of output2 return output1; } else if (pos2 == output2.offset + output2.length) { // output2 is a prefix of output1 return output2; } else { return new CharsRef(output1.chars, output1.offset, pos1-output1.offset); } }
@Override public CharSequence subSequence(int start, int end) { // NOTE: must do a real check here to meet the specs of CharSequence FutureObjects.checkFromToIndex(start, end, length); return new CharsRef(chars, offset + start, end - start); }
/**
 * Builds a {@code CharsRef} backed by a private copy of the characters
 * referenced by <code>other</code>.
 * <p>
 * The result has an offset of zero and a length of {@code other.length}.
 *
 * @param other the reference whose characters are copied
 * @return a new {@code CharsRef} over the copied characters
 */
public static CharsRef deepCopyOf(CharsRef other) {
  final int from = other.offset;
  final int to = other.offset + other.length;
  final char[] copied = ArrayUtil.copyOfSubArray(other.chars, from, to);
  return new CharsRef(copied, 0, other.length);
}
/**
 * Produces a new {@link CharsRef} with the same content as this builder.
 * The first {@code ref.length} characters of the builder's array are copied,
 * and the result has an offset of zero.
 */
public CharsRef toCharsRef() {
  final int len = ref.length;
  final char[] snapshot = ArrayUtil.copyOfSubArray(ref.chars, 0, len);
  return new CharsRef(snapshot, 0, len);
}
/**
 * Reads one output value from {@code in}: a vInt character count followed by
 * that many vInts, each narrowed to a {@code char}.
 *
 * @param in the input to read from
 * @return {@code NO_OUTPUT} when the count is zero, otherwise a newly
 *         allocated {@code CharsRef} holding the characters read
 * @throws IOException if reading from {@code in} fails
 */
@Override
public CharsRef read(DataInput in) throws IOException {
  final int len = in.readVInt();
  if (len == 0) {
    return NO_OUTPUT;
  }

  final CharsRef output = new CharsRef(len);
  final char[] buffer = output.chars;
  for (int i = 0; i < len; i++) {
    buffer[i] = (char) in.readVInt();
  }
  output.length = len;
  return output;
}
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef}
 * representing strings in UTF-8. These strings must be binary-sorted.
 *
 * <p>Each input is decoded to UTF-16 into a scratch array that is grown as
 * needed and reused across iterations, then handed to the builder through a
 * single reused {@link CharsRef}.
 *
 * @param input the UTF-8 terms, in binary-sorted order
 * @return the automaton produced from the completed builder
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  char[] scratch = new char[0];
  final CharsRef term = new CharsRef();
  for (BytesRef utf8 : input) {
    // sized from the UTF-8 byte count before decoding into it
    scratch = ArrayUtil.grow(scratch, utf8.length);
    term.chars = scratch;
    term.length = UnicodeUtil.UTF8toUTF16(utf8, scratch);
    builder.add(term);
  }

  final Automaton.Builder automaton = new Automaton.Builder();
  convert(automaton, builder.complete(), new IdentityHashMap<State,Integer>());
  return automaton.finish();
}
/**
 * Delegates to the superclass analysis. When that rejects the text with an
 * {@link IllegalArgumentException} and {@code lenient} is set, the failure is
 * logged at info level and an empty {@code CharsRef} is returned instead.
 *
 * @param text the text to analyze
 * @param reuse scratch builder passed through to the superclass
 * @return the superclass result, or a {@code CharsRef} over {@code ""} when a
 *         rule is ignored in lenient mode
 * @throws IOException propagated from the superclass
 * @throws IllegalArgumentException rethrown unchanged when not lenient
 */
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (!lenient) {
      throw ex;
    }
    logger.info("Synonym rule for [" + text + "] was ignored");
    return new CharsRef("");
  }
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Runs the inherited analysis on {@code text}. If it fails with an
 * {@link IllegalArgumentException}, the outcome depends on {@code lenient}:
 * in lenient mode the failure is logged at info level and an empty
 * {@code CharsRef} is returned; otherwise the exception is rethrown.
 *
 * @param text the text to analyze
 * @param reuse scratch builder forwarded to the superclass
 * @return the analyzed form, or a {@code CharsRef} over {@code ""} for an
 *         ignored rule
 * @throws IOException propagated from the superclass
 */
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (lenient == false) {
      // strict mode: let the caller see the original failure
      throw ex;
    }
    logger.info("Synonym rule for [" + text + "] was ignored");
    return new CharsRef("");
  }
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them
 * into an FST mapping each source string to its replacement.
 *
 * <p>Every line must split on whitespace into exactly three tokens. The
 * first token is not inspected here; the second is the source string and the
 * third is its replacement.
 *
 * @param reader source of the conversion lines; its line number is used in error reports
 * @param num number of lines to consume
 * @return the FST built from all mappings
 * @throws IOException if reading fails or the FST builder reports an I/O error
 * @throws ParseException if the input ends before {@code num} lines were read,
 *         or a line does not have exactly three whitespace-separated tokens
 * @throws IllegalStateException if the same source string is mapped twice
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // Sorted map: entries are fed to the FST builder in key order
  // (presumably required by the builder -- confirm against its contract).
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of stream; report a truncated file
      // as a parse error instead of failing with a NullPointerException.
      throw new ParseException("unexpected end of input: expected " + num
          + " conversion lines but found " + i, reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    // scratchInts is refilled with the key for every entry
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); } }
/**
 * Returns a shallow clone of this instance. The underlying characters are
 * <b>not</b> copied; they are shared by the returned object and this object.
 *
 * @return a new {@code CharsRef} with this instance's array, offset and length
 * @see #deepCopyOf
 */
@Override
public CharsRef clone() {
  return new CharsRef(this.chars, this.offset, this.length);
}
@Override public CharSequence subSequence(int start, int end) { // NOTE: must do a real check here to meet the specs of CharSequence if (start < 0 || end > length || start > end) { throw new IndexOutOfBoundsException(); } return new CharsRef(chars, offset + start, end - start); }
/**
 * Treats {@code line} as a comma-separated list of terms (each trimmed) and
 * registers every ordered pair of non-equal terms with the builder.
 *
 * @param line one line of comma-separated terms
 * @return always {@code true}
 * @throws IOException declared by the overridden method; not thrown directly here
 */
@Override
public boolean processLine(String line) throws IOException {
  final List<String> terms = newArrayList(Splitter.on(',').trimResults().split(line));
  for (String term : terms) {
    for (String synonym : terms) {
      if (term.equals(synonym)) {
        // never map a term onto an equal term
        continue;
      }
      // NOTE(review): the trailing 'true' flag is opaque from here -- check builder.add's signature
      builder.add(new CharsRef(term), new CharsRef(synonym), true);
    }
  }
  return true;
}
/**
 * Extracts the text between the first and last single quote of {@code line},
 * collapses each doubled quote ({@code ''}) into one, and runs the result
 * through {@code analyze}.
 *
 * @param line the line containing a single-quoted term
 * @param reuse scratch {@code CharsRef} to pass along; a new one with
 *        capacity 8 is created when null
 * @return whatever {@code analyze} returns for the extracted text
 * @throws IOException propagated from {@code analyze}
 */
private CharsRef parseSynonym(String line, CharsRef reuse) throws IOException {
  final CharsRef scratch = (reuse != null) ? reuse : new CharsRef(8);

  final int open = line.indexOf('\'') + 1;   // first char after the opening quote
  final int close = line.lastIndexOf('\'');  // position of the closing quote
  final String quoted = line.substring(open, close);
  final String text = quoted.replace("''", "'");
  return analyze(analyzer, text, scratch);
}
@Override public CharSequence subSequence(int start, int end) { // NOTE: must do a real check here to meet the specs of CharSequence FutureObjects.checkFromToIndex(start, end, length); return new CharsRef(chars, offset + start, end - start); }