public GraphTokenStreamFiniteStrings(TokenStream in) throws IOException { Automaton aut = build(in); this.det = Operations.removeDeadStates(Operations.determinize(aut, DEFAULT_MAX_DETERMINIZED_STATES)); }
/** * Returns the longest BytesRef that is a suffix of all accepted strings. * Worst case complexity: exponential in number of states (this calls * determinize). * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. * @return common suffix, which can be an empty (length 0) BytesRef (never null) */ public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) { // reverse the language of the automaton, then reverse its common prefix. Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates); BytesRef ref = getCommonPrefixBytesRef(r); reverseBytes(ref); return ref; }
a = Operations.determinize(a, maxDeterminizedStates); this.automaton = a; points = a.getStartPoints();
/**
 * Builds a (deterministic) automaton accepting the complement of the
 * language of the given automaton.
 * <p>
 * The input is determinized and totalized, the accept flag of every state is
 * flipped, and dead states are removed from the result.
 * <p>
 * Complexity: linear in number of states if already deterministic and
 * exponential otherwise.
 *
 * @param a automaton to complement
 * @param maxDeterminizedStates maximum number of states determinizing the
 *   automaton can result in. Set higher to allow more complex queries and
 *   lower to prevent memory exhaustion.
 * @return the complemented automaton with dead states removed
 */
public static Automaton complement(Automaton a, int maxDeterminizedStates) {
  final Automaton total = totalize(determinize(a, maxDeterminizedStates));
  final int stateCount = total.getNumStates();
  int state = 0;
  while (state < stateCount) {
    // Invert acceptance: accepted strings become rejected and vice versa.
    final boolean wasAccepting = total.isAccept(state);
    total.setAccept(state, !wasAccepting);
    state++;
  }
  return removeDeadStates(total);
}
a = Operations.determinize(a, maxDeterminizedStates);
protected Automaton convertAutomaton(Automaton a) { if (queryPrefix != null) { a = Operations.concatenate(Arrays.asList(queryPrefix, a)); // This automaton should not blow up during determinize: a = Operations.determinize(a, Integer.MAX_VALUE); } return a; }
automaton = Operations.determinize(automaton, maxDeterminizedStates);
/** Creates a new SimpleSplitPatternTokenizerFactory */ public SimplePatternSplitTokenizerFactory(Map<String,String> args) { super(args); maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES); dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates); if (args.isEmpty() == false) { throw new IllegalArgumentException("Unknown parameters: " + args); } }
/** Creates a new SimplePatternTokenizerFactory */ public SimplePatternTokenizerFactory(Map<String,String> args) { super(args); maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES); dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates); if (args.isEmpty() == false) { throw new IllegalArgumentException("Unknown parameters: " + args); } }
final Automaton toLookupAutomaton(final CharSequence key) throws IOException { // TODO: is there a Reader from a CharSequence? // Turn tokenstream into automaton: Automaton automaton = null; try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) { automaton = getTokenStreamToAutomaton().toAutomaton(ts); } automaton = replaceSep(automaton); // TODO: we can optimize this somewhat by determinizing // while we convert // This automaton should not blow up during determinize: automaton = Operations.determinize(automaton, Integer.MAX_VALUE); return automaton; }
@Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { // FLORIAN EDIT: get converted Automaton from superclass Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); // This automaton should not blow up during determinize: utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); return utf8automaton; } else { return super.convertAutomaton(a); } }
return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
/** * Converts the tokenStream to an automaton. Does *not* close it. */ public Automaton toAutomaton(boolean unicodeAware) throws IOException { // TODO refactor this // maybe we could hook up a modified automaton from TermAutomatonQuery here? // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: final TokenStreamToAutomaton tsta; if (preserveSep) { tsta = new EscapingTokenStreamToAutomaton(SEP_LABEL); } else { // When we're not preserving sep, we don't steal 0xff // byte, so we don't need to do any escaping: tsta = new TokenStreamToAutomaton(); } tsta.setPreservePositionIncrements(preservePositionIncrements); tsta.setUnicodeArcs(unicodeAware); Automaton automaton = tsta.toAutomaton(inputTokenStream); // TODO: we can optimize this somewhat by determinizing // while we convert automaton = replaceSep(automaton, preserveSep, SEP_LABEL); // This automaton should not blow up during determinize: return Operations.determinize(automaton, maxGraphExpansions); }
protected Automaton convertAutomaton(Automaton a) { if (queryPrefix != null) { a = Operations.concatenate(Arrays.asList(queryPrefix, a)); // This automaton should not blow up during determinize: a = Operations.determinize(a, Integer.MAX_VALUE); } return a; }
protected Automaton convertAutomaton(Automaton a) { if (queryPrefix != null) { a = Operations.concatenate(Arrays.asList(queryPrefix, a)); // This automaton should not blow up during determinize: a = Operations.determinize(a, Integer.MAX_VALUE); } return a; }
protected Automaton convertAutomaton(Automaton a) { if (queryPrefix != null) { a = Operations.concatenate(Arrays.asList(queryPrefix, a)); // This automaton should not blow up during determinize: a = Operations.determinize(a, Integer.MAX_VALUE); } return a; }
/**
 * When {@code unicodeAware} is set, converts the automaton with
 * {@code UTF32ToUTF8} and determinizes it with a budget of
 * {@code DEFAULT_MAX_DETERMINIZED_STATES} states; otherwise returns the
 * automaton unchanged.
 *
 * @param a automaton to convert
 * @return {@code a} itself, or its determinized UTF-8 conversion
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
  if (!unicodeAware) {
    return a;
  }
  final Automaton utf8 = new UTF32ToUTF8().convert(a);
  return Operations.determinize(utf8, DEFAULT_MAX_DETERMINIZED_STATES);
}
@Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { // FLORIAN EDIT: get converted Automaton from superclass Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); // This automaton should not blow up during determinize: utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); return utf8automaton; } else { return super.convertAutomaton(a); } }
@Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { // FLORIAN EDIT: get converted Automaton from superclass Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); // This automaton should not blow up during determinize: utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); return utf8automaton; } else { return super.convertAutomaton(a); } }
private Factory(String regexString, int maxDeterminizedStates) { Automaton automaton = new RegExp(regexString).toAutomaton(maxDeterminizedStates); forward = new OffsetReturningRunAutomaton(automaton, false); if (hasLeadingWildcard(automaton)) { Automaton reversed = Operations.determinize(Operations.reverse( new RegExp("(" + regexString + ").*").toAutomaton(maxDeterminizedStates)), maxDeterminizedStates); reverse = new AcceptReturningReverseRunAutomaton(reversed); } else { reverse = null; } }