/**
 * Builds an aligner over {@code database} with a tuple size of 1, aligns
 * {@code query} against it and asserts that the resulting indices are
 * exactly {@code result}, in order.
 *
 * @param database words forming the database
 * @param query    words to align
 * @param result   expected alignment indices
 */
private void align(List<String> database, List<String> query, Integer... result) {
    int[] actual = new LongTextAligner(database, 1).align(query);
    assertThat(Utilities.asList(actual), contains(result));
}
}
/**
 * Creates a text aligner that serves requests for aligning a sequence of
 * words with the provided database sequence. Sequences are aligned by
 * tuples comprising one or more subsequent words.
 *
 * @param words     list of words forming the database
 * @param tupleSize size of a tuple, must be greater or equal to 1
 */
public LongTextAligner(List<String> words, int tupleSize) {
    assert words != null;
    assert tupleSize > 0;

    this.tupleSize = tupleSize;
    this.refWords = words;

    reftup = getTuples(words);
    tupleIndex = new HashMap<String, ArrayList<Integer>>();

    // Record, for every distinct tuple, each position at which it occurs
    // in the tuple sequence.
    int position = 0;
    for (String tuple : reftup) {
        ArrayList<Integer> positions = tupleIndex.get(tuple);
        if (positions == null) {
            positions = new ArrayList<Integer>();
            tupleIndex.put(tuple, positions);
        }
        positions.add(position);
        position++;
    }
}
/**
 * Aligns the query sequence with the whole previously built database,
 * i.e. the range from 0 to the number of reference words.
 *
 * @param query list of words to look for
 *
 * @return indices of alignment
 */
public int[] align(List<String> query) {
    Range wholeDatabase = new Range(0, refWords.size());
    return align(query, wholeDatabase);
}
/** * Aligns query sequence with the previously built database. * @param words list words to look for * @param range range of database to look for alignment * * @return indices of alignment */ public int[] align(List<String> words, Range range) { if (range.upperEndpoint() - range.lowerEndpoint() < tupleSize || words.size() < tupleSize) { return alignTextSimple(refWords.subList(range.lowerEndpoint(), range.upperEndpoint()), words, range.lowerEndpoint()); } int[] result = new int[words.size()]; fill(result, -1); int lastIndex = 0; for (Alignment.Node node : new Alignment(getTuples(words), range) .getIndices()) { // for (int j = 0; j < tupleSize; ++j) lastIndex = max(lastIndex, node.getQueryIndex()); for (; lastIndex < node.getQueryIndex() + tupleSize; ++lastIndex) result[lastIndex] = node.getDatabaseIndex() + lastIndex - node.getQueryIndex(); } return result; }
/**
 * Reads the whitespace-separated tokens of the classpath resource
 * {@code transcription-small.txt} and builds the aligner under test from
 * them with a tuple size of 2.
 *
 * @throws IOException           if the resource stream cannot be opened
 * @throws IllegalStateException if the resource is not on the classpath
 */
@BeforeClass
public void setUp() throws IOException {
    URL url = getClass().getResource("transcription-small.txt");
    if (url == null) {
        // getResource returns null for a missing resource; fail with a
        // readable message instead of a bare NullPointerException.
        throw new IllegalStateException(
                "Test resource not found: transcription-small.txt");
    }

    ArrayList<String> words = new ArrayList<String>();
    Scanner scanner = new Scanner(url.openStream());
    try {
        while (scanner.hasNext()) {
            words.add(scanner.next());
        }
    } finally {
        // Closing the scanner also closes the underlying stream, even
        // when reading fails part-way through.
        scanner.close();
    }
    aligner = new LongTextAligner(words, 2);
}
/**
 * Aligns the database slice covering positions 100 to 199 and expects the
 * alignment to be exactly those positions. The test is currently disabled.
 */
@Test(invocationTimeOut = 10000, invocationCount = 1, enabled = false)
public void alignShortSequence() {
    final int from = 100;
    List<String> query = database.subList(from, 200);

    Integer[] expected = new Integer[query.size()];
    int i = 0;
    while (i < expected.length) {
        expected[i] = from + i;
        i++;
    }

    int[] actual = aligner.align(query);
    assertThat(Utilities.asList(actual), contains(expected));
}
/** * Aligns query sequence with the previously built database. * @param words list words to look for * @param range range of database to look for alignment * * @return indices of alignment */ public int[] align(List<String> words, Range range) { if (range.upperEndpoint() - range.lowerEndpoint() < tupleSize || words.size() < tupleSize) { return alignTextSimple(refWords.subList(range.lowerEndpoint(), range.upperEndpoint()), words, range.lowerEndpoint()); } int[] result = new int[words.size()]; fill(result, -1); int lastIndex = 0; for (Alignment.Node node : new Alignment(getTuples(words), range) .getIndices()) { // for (int j = 0; j < tupleSize; ++j) lastIndex = max(lastIndex, node.getQueryIndex()); for (; lastIndex < node.getQueryIndex() + tupleSize; ++lastIndex) result[lastIndex] = node.getDatabaseIndex() + lastIndex - node.getQueryIndex(); } return result; }
/**
 * Fills the database with 100000 words drawn from a four-word dictionary
 * using a fixed-seed random generator, then builds the aligner under test
 * with a tuple size of 3.
 */
@BeforeClass
public void setUp() {
    String[] dictionary = {"foo", "bar", "baz", "quz"};
    Random rng = new Random(42);

    database = new ArrayList<String>();
    int remaining = 100000;
    while (remaining-- > 0) {
        int pick = rng.nextInt(dictionary.length);
        database.add(dictionary[pick]);
    }

    aligner = new LongTextAligner(database, 3);
}
// NOTE(review): these statements look like a non-contiguous excerpt;
// `transcript`, `words`, `wr` and `range` are declared outside the visible lines.
// Aligner over the transcript, configured with TUPLE_SIZE as its tuple size.
LongTextAligner aligner = new LongTextAligner(transcript, TUPLE_SIZE);
// Word results keyed by Integer; TreeMap keeps the keys in ascending order.
// presumably the key is a transcript position - confirm against the code that fills it
Map<Integer, WordResult> alignedWords = new TreeMap<Integer, WordResult>();
// FIFO queue of ranges; how it is filled and consumed is not visible here.
Queue<Range> ranges = new LinkedList<Range>();
// Append the spelling of the word carried by `wr` to the query word list.
words.add(wr.getWord().getSpelling());
// Align the collected words against the given database range.
int[] alignment = aligner.align(words, range);
/**
 * Aligns the database slice covering positions 1999 to 8776 and expects
 * the alignment to be exactly those positions. The test is currently
 * disabled.
 */
@Test(invocationTimeOut = 10000, invocationCount = 1, enabled = false)
public void alignLongSequence() {
    final int start = 1999;
    List<String> query = database.subList(start, 8777);

    // The aligner returns one index per query word, and contains() needs
    // one expected item per element, so a single expected value could
    // never match. A slice of the database is expected to align to its own
    // positions, the same expectation alignShortSequence uses.
    Integer[] ids = new Integer[query.size()];
    for (int i = 0; i < ids.length; ++i) {
        ids[i] = start + i;
    }
    assertThat(Utilities.asList(aligner.align(query)), contains(ids));
}
}
/**
 * Creates a text aligner that serves requests for aligning a sequence of
 * words with the provided database sequence. Sequences are aligned by
 * tuples comprising one or more subsequent words.
 *
 * @param words     list of words forming the database
 * @param tupleSize size of a tuple, must be greater or equal to 1
 */
public LongTextAligner(List<String> words, int tupleSize) {
    assert words != null;
    assert tupleSize > 0;

    this.tupleSize = tupleSize;
    this.refWords = words;

    reftup = getTuples(words);
    tupleIndex = new HashMap<String, ArrayList<Integer>>();

    // Record, for every distinct tuple, each position at which it occurs
    // in the tuple sequence.
    int position = 0;
    for (String tuple : reftup) {
        ArrayList<Integer> positions = tupleIndex.get(tuple);
        if (positions == null) {
            positions = new ArrayList<Integer>();
            tupleIndex.put(tuple, positions);
        }
        positions.add(position);
        position++;
    }
}
// NOTE(review): these statements look like a non-contiguous excerpt;
// `transcript`, `words`, `wr` and `range` are declared outside the visible lines.
// Aligner over the transcript, configured with TUPLE_SIZE as its tuple size.
LongTextAligner aligner = new LongTextAligner(transcript, TUPLE_SIZE);
// Word results keyed by Integer; TreeMap keeps the keys in ascending order.
// presumably the key is a transcript position - confirm against the code that fills it
Map<Integer, WordResult> alignedWords = new TreeMap<Integer, WordResult>();
// FIFO queue of ranges; how it is filled and consumed is not visible here.
Queue<Range> ranges = new LinkedList<Range>();
// Append the spelling of the word carried by `wr` to the query word list.
words.add(wr.getWord().getSpelling());
// Align the collected words against the given database range.
int[] alignment = aligner.align(words, range);
/**
 * Aligns each word list supplied by the {@code "words"} data provider and
 * checks the resulting indices against the accompanying matcher.
 *
 * @param words   words to align
 * @param matcher expectation for the list of alignment indices
 */
@Test(dataProvider = "words")
public void align(List<String> words, Matcher<List<Integer>> matcher) {
    int[] alignment = aligner.align(words);
    assertThat(Utilities.asList(alignment), matcher);
}
/**
 * Aligns the query sequence with the whole previously built database,
 * i.e. the range from 0 to the number of reference words.
 *
 * @param query list of words to look for
 *
 * @return indices of alignment
 */
public int[] align(List<String> query) {
    Range wholeDatabase = new Range(0, refWords.size());
    return align(query, wholeDatabase);
}