/**
 * Returns the value of this node.
 *
 * @return for a boundary node, the larger of {@code queryIndex} and
 *         {@code databaseIndex}; otherwise 0 when the node has a match
 *         and 1 when it does not
 */
public int getValue() {
    if (!isBoundary()) {
        if (hasMatch()) {
            return 0;
        }
        return 1;
    }
    return max(queryIndex, databaseIndex);
}
/**
 * Tells whether the query word and the database word of this node are
 * equal.
 *
 * <p>{@code getQueryWord()} returns {@code null} when {@code queryIndex}
 * is not positive. Such a node has no query word to compare, so it is
 * reported as a non-match instead of failing with a
 * {@code NullPointerException}.
 *
 * @return {@code true} if a query word is present and equals the database
 *         word, {@code false} otherwise
 */
public boolean hasMatch() {
    String queryWord = getQueryWord();
    if (queryWord == null) {
        return false;
    }
    return queryWord.equals(getDatabaseWord());
}
/**
 * Looks up the query word for this node.
 *
 * @return the element of {@code query} at {@code getQueryIndex()} when
 *         {@code queryIndex} is positive, otherwise {@code null}
 */
public String getQueryWord() {
    return queryIndex > 0 ? query.get(getQueryIndex()) : null;
}
/** * Aligns query sequence with the previously built database. * @param words list words to look for * @param range range of database to look for alignment * * @return indices of alignment */ public int[] align(List<String> words, Range range) { if (range.upperEndpoint() - range.lowerEndpoint() < tupleSize || words.size() < tupleSize) { return alignTextSimple(refWords.subList(range.lowerEndpoint(), range.upperEndpoint()), words, range.lowerEndpoint()); } int[] result = new int[words.size()]; fill(result, -1); int lastIndex = 0; for (Alignment.Node node : new Alignment(getTuples(words), range) .getIndices()) { // for (int j = 0; j < tupleSize; ++j) lastIndex = max(lastIndex, node.getQueryIndex()); for (; lastIndex < node.getQueryIndex() + tupleSize; ++lastIndex) result[lastIndex] = node.getDatabaseIndex() + lastIndex - node.getQueryIndex(); } return result; }
/**
 * Adds a word as an Item to this WordRelation object.
 *
 * <p>The new item is created as a daughter of the current token item, gets
 * the word stored under its "name" feature, and is appended to the
 * relation.
 *
 * @param word the word to add
 */
public void addWord(String word) {
    Item wordItem = tokenToWords.getTokenItem().createDaughter();
    wordItem.getFeatures().setString("name", word);
    relation.appendItem(wordItem);
}
/** * Removes the postpunctuation characters from the current token. Copies * those postpunctuation characters to the class variable * 'postpunctuation'. */ private void removeTokenPostpunctuation() { if (token == null) { return; } final String tokenWord = token.getWord(); int tokenLength = tokenWord.length(); int position = tokenLength - 1; while (position > 0 && postpunctuationSymbols.indexOf((int) tokenWord .charAt(position)) != -1) { position--; } if (tokenLength - 1 != position) { // Copy postpunctuation from token token.setPostpunctuation(tokenWord.substring(position + 1)); // truncate token at postpunctuation token.setWord(tokenWord.substring(0, position + 1)); } else { token.setPostpunctuation(""); } }
/**
 * Builds an aligner over {@code database} with tuple size 1, aligns
 * {@code query} against it and asserts that the resulting indices are
 * exactly {@code result}, in order.
 *
 * @param database reference words
 * @param query    words to align
 * @param result   expected alignment indices
 */
private void align(List<String> database, List<String> query, Integer... result) {
    int[] alignment = new LongTextAligner(database, 1).align(query);
    assertThat(Utilities.asList(alignment), contains(result));
}
}
/**
 * Returns true if the given token item contains a token that is in a
 * section-like context, e.g., "chapter" or "act".
 *
 * <p>The check lower-cases the item's "p.name" feature and looks it up in
 * {@code SECTION_TYPES}.
 *
 * @param tokenItem the token item to check
 *
 * @return true or false
 */
public static boolean sectionLike(Item tokenItem) {
    Object previousName = tokenItem.findFeature("p.name");
    String sectionType = ((String) previousName).toLowerCase();
    return inKingSectionLikeMap(sectionType, SECTION_TYPES);
}
/**
 * Returns true if the given token matches part of a phone number.
 *
 * <p>The decision uses the "n.name", "n.n.name", "p.name" and "p.p.name"
 * features of the {@code tokenItem} that is in scope (not a parameter of
 * this method).
 *
 * @param tokenVal the string value of the token
 *
 * @return true or false
 */
private boolean matchesPartPhoneNumber(String tokenVal) {
    String n_name = (String) tokenItem.findFeature("n.name");
    String n_n_name = (String) tokenItem.findFeature("n.n.name");
    String p_name = (String) tokenItem.findFeature("p.name");
    String p_p_name = (String) tokenItem.findFeature("p.p.name");

    boolean matches3DigitsP_name = matches(threeDigitsPattern, p_name);

    if (matches(threeDigitsPattern, tokenVal)) {
        // "p.name" not digits, then three digits and four digits follow.
        if (!matches(digitsPattern, p_name)
                && matches(threeDigitsPattern, n_name)
                && matches(fourDigitsPattern, n_n_name)) {
            return true;
        }
        // "n.name" matches the seven-digit phone number pattern.
        if (matches(sevenPhoneNumberPattern, n_name)) {
            return true;
        }
        // "p.p.name" not digits, "p.name" three digits, "n.name" four digits.
        if (!matches(digitsPattern, p_p_name)
                && matches3DigitsP_name
                && matches(fourDigitsPattern, n_name)) {
            return true;
        }
    }

    // Four-digit token: "n.name" not digits, "p.name" and "p.p.name" three digits.
    return matches(fourDigitsPattern, tokenVal)
            && !matches(digitsPattern, n_name)
            && matches3DigitsP_name
            && matches(threeDigitsPattern, p_p_name);
}
/**
 * Aligns audio with a transcript given as a single string. The transcript
 * is first expanded into words by the current tokenizer and then passed
 * to the list-based overload.
 *
 * @param audioUrl   location of the audio
 * @param transcript transcript text
 * @return the result of the list-based {@code align} overload
 * @throws IOException declared by this method; raised by the calls it
 *                     delegates to
 */
public List<WordResult> align(URL audioUrl, String transcript)
        throws IOException {
    List<String> words = getTokenizer().expand(transcript);
    return align(audioUrl, words);
}
/**
 * Aligns the query sequence with the previously built database, using a
 * range that runs from 0 to the number of reference words.
 *
 * @param query list of words to look for
 *
 * @return indices of alignment
 */
public int[] align(List<String> query) {
    Range wholeDatabase = new Range(0, refWords.size());
    return align(query, wholeDatabase);
}
/**
 * Fills {@code database} with 100000 words drawn from a four-word
 * dictionary using a fixed random seed, then builds the aligner over it
 * with tuple size 3.
 */
@BeforeClass
public void setUp() {
    final String[] dictionary = {"foo", "bar", "baz", "quz"};
    final Random rng = new Random(42);

    database = new ArrayList<String>();
    for (int count = 0; count < 100000; count++) {
        int pick = rng.nextInt(dictionary.length);
        database.add(dictionary[pick]);
    }

    aligner = new LongTextAligner(database, 3);
}
/**
 * Looks up the database word for this node.
 *
 * @return the element of {@code reftup} at {@code getDatabaseIndex()} when
 *         {@code databaseIndex} is positive, otherwise {@code null}
 */
public String getDatabaseWord() {
    return databaseIndex > 0 ? reftup.get(getDatabaseIndex()) : null;
}
/**
 * Lists the nodes reachable from this one in a single step.
 *
 * <p>Candidates are added in this order: both indices advanced, only the
 * database index advanced, only the query index advanced. A candidate is
 * included only while the index it advances is still below the size of
 * {@code indices} (query) or {@code shifts} (database).
 *
 * @return up to three neighbouring nodes
 */
public List<Node> adjacent() {
    boolean queryHasNext = queryIndex < indices.size();
    boolean databaseHasNext = databaseIndex < shifts.size();

    List<Node> neighbours = new ArrayList<Node>(3);
    if (queryHasNext && databaseHasNext) {
        neighbours.add(new Node(queryIndex + 1, databaseIndex + 1));
    }
    if (databaseHasNext) {
        neighbours.add(new Node(queryIndex, databaseIndex + 1));
    }
    if (queryHasNext) {
        neighbours.add(new Node(queryIndex + 1, databaseIndex));
    }
    return neighbours;
}
public SpeechAligner(String amPath, String dictPath, String g2pPath) throws MalformedURLException, IOException { Configuration configuration = new Configuration(); configuration.setAcousticModelPath(amPath); configuration.setDictionaryPath(dictPath); context = new Context(configuration); if (g2pPath != null) { context.setLocalProperty("dictionary->g2pModelPath", g2pPath); context.setLocalProperty("dictionary->g2pMaxPron", "2"); } context.setLocalProperty("lexTreeLinguist->languageModel", "dynamicTrigramModel"); recognizer = context.getInstance(Recognizer.class); grammar = context.getInstance(AlignerGrammar.class); languageModel = context.getInstance(DynamicTrigramModel.class); setTokenizer(new SimpleTokenizer()); }
/**
 * Assigns a new {@code USEnglishTokenizer} to {@code expander} before each
 * test method.
 */
@BeforeMethod
public void setupMethod() {
    expander = new USEnglishTokenizer();
}
/**
 * Checks small alignments. Each call passes the database, the query, and
 * the expected database index for each query word, where -1 is expected
 * for a query word with no counterpart.
 */
@Test
public void shouldAlignText() {
    // Single-word database.
    align(asList("foo"), asList("bar"), -1);
    align(asList("foo"), asList("foo"), 0);

    // Single-word query against a two-word database.
    align(asList("foo", "bar"), asList("foo"), 0);
    align(asList("foo", "bar"), asList("bar"), 1);

    // Query longer than the database.
    align(asList("foo"), asList("foo", "bar"), 0, -1);
    align(asList("bar"), asList("foo", "bar"), -1, 0);

    // Query skipping or starting inside the database.
    align(asList("foo", "bar", "baz"), asList("foo", "baz"), 0, 2);
    align(asList("foo", "bar", "42", "baz", "qux"), asList("42", "baz"), 2, 3);
}
/**
 * Returns true if the given token item contains a token that is in a
 * king-like context, e.g., "King" or "Louis".
 *
 * <p>The lower-cased "p.name" feature is looked up in {@code KING_NAMES}
 * first. Only if that fails is the lower-cased "p.p.name" feature looked
 * up in {@code KING_TITLES}.
 *
 * @param tokenItem the token item to check
 *
 * @return true or false
 */
public static boolean kingLike(Item tokenItem) {
    String kingName =
            ((String) tokenItem.findFeature("p.name")).toLowerCase();
    if (inKingSectionLikeMap(kingName, KING_NAMES)) {
        return true;
    }

    String kingTitle =
            ((String) tokenItem.findFeature("p.p.name")).toLowerCase();
    return inKingSectionLikeMap(kingTitle, KING_TITLES);
}
/**
 * Expands {@code text} with the tokenizer and checks that the joined
 * tokens equal {@code expanded}.
 *
 * @param text     input text
 * @param expanded expected joined expansion
 */
@Test(dataProvider = "data")
public void textToWords(String text, String expanded) {
    List<String> tokens = expander.expand(text);
    String joined = Utilities.join(tokens);
    assertThat(joined, equalTo(expanded));
}
}
/**
 * Aligns {@code words} with the shared aligner and checks the resulting
 * index list against {@code matcher}.
 *
 * @param words   query words
 * @param matcher expectation on the list of alignment indices
 */
@Test(dataProvider = "words")
public void align(List<String> words, Matcher<List<Integer>> matcher) {
    int[] alignment = aligner.align(words);
    assertThat(Utilities.asList(alignment), matcher);
}