/** Just maps each UTF16 unit (char) to the ints in an * IntsRef. */ public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) { final int charLimit = s.length(); scratch.setLength(charLimit); scratch.grow(charLimit); for (int idx = 0; idx < charLimit; idx++) { scratch.setIntAt(idx, (int) s.charAt(idx)); } return scratch.get(); }
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch.get(), output); } return builder.finish(); }
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String,String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); } if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String,String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); BytesRef spare = new BytesRef(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); intsSpare.copyUTF8Bytes(bytesRef); builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); }
private void updateFST(SortedMap<String, Double> weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, Double> entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry .getValue().longValue()); } fst = fstBuilder.finish(); }
new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions); for (IntsRef string; (string = finiteStrings.next()) != null; count++) { Util.toBytesRef(string, scratch); Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder analyzed = new BytesRefBuilder(); BytesRef surface = new BytesRef(); IntsRefBuilder scratchInts = new IntsRefBuilder(); ByteArrayDataInput input = new ByteArrayDataInput(); analyzed.append((byte) dedup); Util.toIntsRef(analyzed.get(), scratchInts); builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); } else { int payloadOffset = input.getPosition() + surface.length; System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); br.length = br.bytes.length; builder.add(scratchInts.get(), outputs.newPair(cost, br));
public void finishTerm(long defaultWeight) throws IOException { ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); int deduplicator = 0; analyzed.append((byte) 0); analyzed.setLength(analyzed.length() + 1); analyzed.grow(analyzed.length()); for (int i = 0; i < count; i++) { analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++); Util.toIntsRef(analyzed.get(), scratchInts); SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload)); } seenSurfaceForms.clear(); count = 0; }
/** If this automaton accepts a single input, return it. Else, return null. * The automaton must be deterministic. */ public static IntsRef getSingleton(Automaton a) { if (a.isDeterministic() == false) { throw new IllegalArgumentException("input automaton must be deterministic"); } IntsRefBuilder builder = new IntsRefBuilder(); HashSet<Integer> visited = new HashSet<>(); int s = 0; Transition t = new Transition(); while (true) { visited.add(s); if (a.isAccept(s) == false) { if (a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append(t.min); s = t.dest; continue; } } } else if (a.getNumTransitions(s) == 0) { return builder.get(); } // Automaton accepts more than one string: return null; } }
assert lastInput.length() == 0 || input.compareTo(lastInput.get()) >= 0: "inputs are added out of order lastInput=" + lastInput.get() + " vs input=" + input; assert validOutput(output); final int pos1Stop = Math.min(lastInput.length(), input.length); while(true) { frontier[pos1].inputCount++; if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) { break; freezeTail(prefixLenPlus1); if (lastInput.length() != input.length || prefixLenPlus1 != input.length + 1) { lastNode.isFinal = true; lastNode.output = NO_OUTPUT; assert validOutput(lastOutput); if (lastInput.length() == input.length && prefixLenPlus1 == 1+input.length) { lastInput.copyInts(input);
/** * Constructor. * * @param a Automaton to create finite string from. * @param startState The starting state for each path. * @param endState The state where each path should stop or -1 if only accepted states should be final. */ public FiniteStringsIterator(Automaton a, int startState, int endState) { this.a = a; this.endState = endState; this.nodes = new PathNode[16]; for (int i = 0, end = nodes.length; i < end; i++) { nodes[i] = new PathNode(); } this.string = new IntsRefBuilder(); this.pathStates = new BitSet(a.getNumStates()); this.string.setLength(0); this.emitEmptyString = a.isAccept(0); // Start iteration with node startState. if (a.getNumTransitions(startState) > 0) { pathStates.set(startState); nodes[0].resetState(a, startState); string.append(startState); } }
IntsRefBuilder scratchInts = new IntsRefBuilder(); IntsRefBuilder currentOrds = new IntsRefBuilder(); Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); currentOrds = new IntsRefBuilder(); // must be this way currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); success2 = true; } finally {
int state; int pos = 0; savedStates.grow(seekBytesRef.length()+1); savedStates.setIntAt(0, 0); linear = false; for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { visited[state] = curGen; int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff); if (nextState == -1) break; savedStates.setIntAt(pos+1, nextState); if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false; final int newState = runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); if (newState >= 0 && runAutomaton.isAccept(newState))
/** Returns final FST. NOTE: this will return null if * nothing is accepted by the FST. */ public FST<T> finish() throws IOException { final UnCompiledNode<T> root = frontier[0]; // minimize nodes in the last word's suffix freezeTail(0); if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) { if (fst.emptyOutput == null) { return null; } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { // empty string got pruned return null; } } else { if (minSuffixCount2 != 0) { compileAllTargets(root, lastInput.length()); } } //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); fst.finish(compileNode(root, lastInput.length()).node); return fst; }
private void freezeTail(int prefixLenPlus1) throws IOException { for(int idx=lastInput.length(); idx >= downTo; idx--) { parent.deleteLast(lastInput.intAt(idx-1), node); } else { compileAllTargets(node, lastInput.length()-idx); parent.replaceLast(lastInput.intAt(idx-1), compileNode(node, 1+lastInput.length()-idx), nextFinalOutput, isFinal); parent.replaceLast(lastInput.intAt(idx-1), node, nextFinalOutput,
for (int i = builder.length(); --i > 0;) builder.setCharAt(i, Character.toLowerCase(builder.charAt(i))); this.utf32 = UTF16ToUTF32(builder, utf32Builder).get(); builder.setLength(0); maxPathsBuilder.clear(); maxPathsBuilder.grow(utf32.length + 1); Arrays.fill(maxPathsBuilder.ints(), 0, utf32.length + 1, Integer.MAX_VALUE);
/** Starting from node, find the top N min cost * completions to a final node. */ public static <T> TopResults<T> shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN, boolean allowEmptyString) throws IOException { // All paths are kept, so we can pass topN for // maxQueueDepth and the pruning is admissible: TopNSearcher<T> searcher = new TopNSearcher<>(fst, topN, topN, comparator); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRefBuilder()); return searcher.search(); }
/** Reverse lookup (lookup by output instead of by input), * in the special case when your FSTs outputs are * strictly ascending. This locates the input/output * pair where the output is equal to the target, and will * return null if that output does not exist. * * <p>NOTE: this only works with {@code FST<Long>}, only * works when the outputs are ascending in order with * the inputs. * For example, simple ordinals (0, 1, * 2, ...), or file offets (when appending to a file) * fit this. */ public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException { final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); FST.Arc<Long> scratchArc = new FST.Arc<>(); final IntsRefBuilder result = new IntsRefBuilder(); return getByOutput(fst, targetOutput, in, arc, scratchArc, result); }
@Override public String toString() { return "input=" + input.get() + " output=" + output + " context=" + context + " boost=" + boost + " payload=" + payload; } }