/**
 * Deserializes the JSON form of a span-list field back into a {@code ListField<Span>}.
 * Reads the array stored under {@code JsonConstants.FIELD_VALUE} and converts each
 * element to a {@link Span}.
 *
 * @param p    the JSON parser positioned at the serialized field
 * @param ctxt deserialization context (unused)
 * @return a {@code ListField<Span>} holding the deserialized spans
 * @throws IOException if reading the JSON tree fails
 */
@Override
public ListField<Span> deserialize(JsonParser p, DeserializationContext ctxt)
        throws IOException, JsonProcessingException {
    JsonNode node = p.getCodec().readTree(p);
    JsonNode fieldValueNode = node.get(JsonConstants.FIELD_VALUE);
    // Fix: the original constructed a new ObjectMapper for every array element;
    // ObjectMapper creation is expensive, so build one and reuse it.
    ObjectMapper mapper = new ObjectMapper();
    ArrayList<Span> spanList = new ArrayList<>(fieldValueNode.size());
    for (int i = 0; i < fieldValueNode.size(); i++) {
        JsonNode spanValueNode = fieldValueNode.get(i);
        spanList.add(mapper.convertValue(spanValueNode, Span.class));
    }
    return new ListField<Span>(spanList);
}
// NOTE(review): fragment of a larger tuple-merge routine — resultFields,
// mergeSpanList, innerPayloadField, resultPayload, outerTuple, and
// addFieldPrefix are all declared outside this excerpt.
resultFields.add(new ListField<Span>(mergeSpanList));
// Inner payload is read here; presumably folded into resultPayload in the
// surrounding (unseen) code — TODO confirm against the full method.
List<Span> innerPayload = innerPayloadField.getValue();
ListField<Span> outerPayloadField = outerTuple.getField(SchemaConstants.PAYLOAD);
List<Span> outerPayload = outerPayloadField.getValue();
// Prefix outer-tuple payload span attribute names with "outer_" so the merged
// payload can distinguish which side each span came from.
resultPayload.addAll(outerPayload.stream().map(span -> addFieldPrefix(span, "outer_")).collect(Collectors.toList()));
resultFields.add(new ListField<Span>(resultPayload));
/**
 * Builds a lookup from each configured label to the list of spans stored in
 * the input tuple's field of that name.
 *
 * @param inputTuple tuple whose label fields are read
 * @return map of label name to its corresponding span list
 */
private Map<String, List<Span>> fetchLabelSpans(Tuple inputTuple) {
    Map<String, List<Span>> spansByLabel = new HashMap<>();
    for (String label : this.labelList) {
        ListField<Span> labelField = inputTuple.getField(label);
        spansByLabel.put(label, labelField.getValue());
    }
    return spansByLabel;
}
/**
 * Collects, for every configured label, the set of distinct escaped span
 * values stored in the input tuple's field of that name.
 *
 * @param inputTuple tuple to read label fields from
 * @return map of label name to its set of escaped attribute values
 * @throws DataflowException if a configured label is absent from the tuple's schema
 */
private Map<String, Set<String>> fetchLabelValues(Tuple inputTuple) throws DataflowException {
    Map<String, Set<String>> valuesByLabel = new HashMap<>();
    for (String label : this.labelList) {
        // fail fast on a label the schema does not contain
        if (!inputTuple.getSchema().containsAttribute(label)) {
            throw new DataflowException("label " + label + " does not exist");
        }
        ListField<Span> labelField = inputTuple.getField(label);
        Set<String> escapedValues = labelField.getValue().stream()
                .map(Span::getValue)
                .map(value -> escapeString(value))
                .collect(Collectors.toSet());
        valuesByLabel.put(label, escapedValues);
    }
    return valuesByLabel;
}
/**
 * Builds an output tuple from the Lucene document identified by {@code docID},
 * optionally appending a payload field built from the document's term vectors.
 *
 * @param docID Lucene document id to materialize
 * @return tuple conforming to {@code outputSchema}
 * @throws IOException    if the Lucene index cannot be read
 * @throws ParseException if field parsing fails
 */
private Tuple constructTuple(int docID) throws IOException, ParseException {
    Document luceneDocument = luceneIndexSearcher.doc(docID);
    ArrayList<IField> fields = documentToFields(luceneDocument);
    if (payloadAdded) {
        // append the payload spans derived from term vectors as a list field
        ArrayList<Span> payloadSpans = buildPayloadFromTermVector(fields, docID);
        fields.add(new ListField<Span>(payloadSpans));
    }
    return new Tuple(outputSchema, fields.stream().toArray(IField[]::new));
}
/**
 * Drains the input operator, counting occurrences of each (lower-cased) token
 * in the payload spans of the configured attribute, then materializes the
 * counts sorted by descending frequency into {@code sortedWordCountMap} and
 * resets {@code wordCountIterator}.
 *
 * @throws TexeraException if reading from the input operator fails
 */
private void computeWordCount() throws TexeraException {
    Tuple tuple;
    HashMap<String, Integer> wordCountMap = new HashMap<>();
    while ((tuple = this.inputOperator.getNextTuple()) != null) {
        if (addPayload) {
            // input had no payload: generate one from the tuple text
            tuple = new Tuple.Builder(tuple)
                    .add(SchemaConstants.PAYLOAD_ATTRIBUTE,
                            new ListField<Span>(DataflowUtils.generatePayloadFromTuple(
                                    tuple, predicate.getLuceneAnalyzerString())))
                    .build();
        }
        // Fix: use the shared constant instead of the magic string "payload",
        // consistent with the rest of this file.
        ListField<Span> payloadField = tuple.getField(SchemaConstants.PAYLOAD);
        for (Span span : payloadField.getValue()) {
            if (span.getAttributeName().equals(predicate.getAttribute())) {
                String key = span.getValue().toLowerCase();
                // Fix: merge() does one lookup instead of the original
                // get()+get()+put() triple access.
                wordCountMap.merge(key, 1, Integer::sum);
            }
        }
    }
    // sort entries by descending count
    sortedWordCountMap = wordCountMap.entrySet().stream()
            .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
            .collect(Collectors.toList());
    wordCountIterator = sortedWordCountMap.iterator();
}
/**
 * Benchmarks fuzzy-token matching: runs every query in {@code queryList}
 * against the abstract attribute of {@code tableName}, recording per-query
 * match time (seconds, 4 decimal places) in {@code timeResults} and the span
 * count in {@code totalResultCount}.
 *
 * @param queryList         queries to execute
 * @param threshold         fuzzy-match similarity threshold
 * @param luceneAnalyzerStr analyzer used for tokenization
 * @param tableName         table to search
 * @param bool              unused flag kept for interface compatibility
 * @throws TexeraException on operator failure
 * @throws IOException     on index access failure
 */
public static void match(ArrayList<String> queryList, double threshold, String luceneAnalyzerStr,
        String tableName, boolean bool) throws TexeraException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    for (String query : queryList) {
        FuzzyTokenSourcePredicate predicate = new FuzzyTokenSourcePredicate(query, attributeNames,
                luceneAnalyzerStr, threshold, tableName, SchemaConstants.SPAN_LIST);
        FuzzyTokenMatcherSourceOperator source = new FuzzyTokenMatcherSourceOperator(predicate);

        long startMillis = System.currentTimeMillis();
        source.open();
        int matchCount = 0;
        for (Tuple tuple = source.getNextTuple(); tuple != null; tuple = source.getNextTuple()) {
            ListField<Span> spanListField = tuple.getField(SchemaConstants.SPAN_LIST);
            matchCount += spanListField.getValue().size();
        }
        source.close();
        long endMillis = System.currentTimeMillis();

        double elapsedSeconds = (endMillis - startMillis) / 1000.0;
        timeResults.add(Double.parseDouble(String.format("%.4f", elapsedSeconds)));
        totalResultCount += matchCount;
    }
}
/**
 * Runs NLP span extraction over each configured attribute of the input tuple.
 * Returns {@code null} when no attribute produced any span (tuple filtered
 * out); otherwise returns the tuple extended with the result attribute
 * holding all extracted spans.
 *
 * @param inputTuple tuple to process
 * @return extended tuple, or {@code null} if nothing matched
 * @throws TexeraException on extraction failure
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> extractedSpans = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        IField attributeField = inputTuple.getField(attributeName);
        extractedSpans.addAll(extractNlpSpans(attributeField, attributeName));
    }
    if (extractedSpans.isEmpty()) {
        return null;
    }
    return new Tuple.Builder(inputTuple)
            .add(predicate.getResultAttribute(), AttributeType.LIST,
                    new ListField<Span>(extractedSpans))
            .build();
}
/**
 * Benchmarks keyword matching: runs every query in {@code queryList} against
 * the abstract attribute of {@code tableName}, recording per-query match time
 * (seconds, 4 decimal places) in {@code timeResults} and the span count in
 * {@code totalResultCount}.
 *
 * @param queryList         queries to execute
 * @param opType            keyword matching mode
 * @param luceneAnalyzerStr analyzer used for tokenization
 * @param tableName         table to search
 * @throws TexeraException on operator failure
 * @throws IOException     on index access failure
 */
public static void match(ArrayList<String> queryList, KeywordMatchingType opType,
        String luceneAnalyzerStr, String tableName) throws TexeraException, IOException {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);
    for (String query : queryList) {
        KeywordSourcePredicate predicate = new KeywordSourcePredicate(query, attributeNames,
                luceneAnalyzerStr, opType, tableName, SchemaConstants.SPAN_LIST);
        KeywordMatcherSourceOperator source = new KeywordMatcherSourceOperator(predicate);

        long startMillis = System.currentTimeMillis();
        source.open();
        int matchCount = 0;
        for (Tuple tuple = source.getNextTuple(); tuple != null; tuple = source.getNextTuple()) {
            ListField<Span> spanListField = tuple.getField(SchemaConstants.SPAN_LIST);
            matchCount += spanListField.getValue().size();
        }
        source.close();
        long endMillis = System.currentTimeMillis();

        double elapsedSeconds = (endMillis - startMillis) / 1000.0;
        timeResults.add(Double.parseDouble(String.format("%.4f", elapsedSeconds)));
        totalResultCount += matchCount;
    }
}
private IField createSpanListField() { List<Span> list = new ArrayList<Span>(); // The key value will be: // For RegexMatcher : "n.*k" // For NamedEntityMatcher : LOCATION // For DictionaryMatcher: "new york" - For DictionaryMatcher the key and // value are same // For KeyWordMatcher: "new york" - the value can be "new" or "york" Span span1 = new Span("description", 18, 26, "LOCATION", "new york"); Span span2 = new Span("description", 52, 63, "LOCATION", "los angeles"); list.add(span1); list.add(span2); IField spanListField = new ListField<Span>(list); return spanListField; }
// Appends phrase-matching spans for dictionary matching.
// NOTE(review): truncated in this excerpt — the per-attribute matching logic
// inside the for-loop continues beyond the end of this fragment.
public List<Span> appendPhraseMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<List<String>> queryTokenList, List<Set<String>> queryTokenSetList, List<List<String>> queryTokenListWithStopwords, List<String> queryList) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    // read the pre-computed token payload attached to the tuple
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    // keep, per query index, only the payload spans whose tokens are relevant
    Map<Integer, List<Span>> relevantSpansMap = filterRelevantSpans(payload, queryTokenSetList);
    for (String attributeName : attributeNames) {
public static List<Tuple> getOneToOneResultTuple() throws ParseException { // Build the expected result Tuple List<Span> spanList = new ArrayList<Span>(); Span span1 = new Span(TEXT, 0, sentence1.length(), PropertyNameConstants.NLP_SPLIT_KEY, sentence1); spanList.add(span1); Span span2 = new Span(TEXT, sentence1.length()+1, sentence1.length()+sentence2.length()+1, PropertyNameConstants.NLP_SPLIT_KEY, sentence2); spanList.add(span2); Tuple tuple1 = getOneToOneTestTuple().get(0); Tuple returnTuple = new Tuple.Builder(tuple1).add(SchemaConstants.SPAN_LIST_ATTRIBUTE, new ListField<Span>(spanList)).build(); return Arrays.asList(returnTuple); }
// Appends conjunction-matching spans for dictionary matching.
// NOTE(review): truncated in this excerpt — the per-attribute matching logic
// inside the for-loop continues beyond the end of this fragment.
private List<Span> appendConjunctionMatchingSpans4Dictionary(Tuple inputTuple, List<String> attributeNames, List<Set<String>> queryTokenSetList, List<String> queryList) throws DataflowException {
    List<Span> matchingResults = new ArrayList<>();
    // read the pre-computed token payload attached to the tuple
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    // keep, per query index, only the payload spans whose tokens are relevant
    Map<Integer, List<Span>> relevantSpansMap = filterRelevantSpans(payload, queryTokenSetList);
    for (String attributeName : attributeNames) {
// NOTE(review): fragment cut mid-method (unbalanced braces) — the `return
// null;` belongs to a guard on an earlier, unseen line, and the ONE_TO_MANY
// branch is truncated. Code left byte-identical.
return null;
// carry over the current tuple's fields, then append the sentence-list field
outputFields.addAll(currentTuple.getFields());
outputFields.add(new ListField<Span>(computeSentenceList(currentTuple)));
} else if(predicate.getOutputType() == RegexOutputType.ONE_TO_MANY) {
    // one output tuple per sentence: refill the sentence buffer when empty
    if(currentSentenceList.isEmpty()) {
public static void matchRegex(List<String> regexes, String tableName) throws TexeraException, IOException { List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT); for(String regex: regexes){ // analyzer should generate grams all in lower case to build a lower // case index. RegexSourcePredicate predicate = new RegexSourcePredicate(regex, attributeNames, tableName, SchemaConstants.SPAN_LIST); RegexMatcherSourceOperator regexSource = new RegexMatcherSourceOperator(predicate); long startMatchTime = System.currentTimeMillis(); regexSource.open(); int counter = 0; Tuple nextTuple = null; while ((nextTuple = regexSource.getNextTuple()) != null) { ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST); List<Span> spanList = spanListField.getValue(); counter += spanList.size(); } regexSource.close(); long endMatchTime = System.currentTimeMillis(); double matchTime = (endMatchTime - startMatchTime) / 1000.0; totalMatchingTime += matchTime; totalRegexResultCount += counter; } }
// NOTE(review): switch-case fragment — the enclosing switch (presumably on
// the attribute type) is outside this excerpt. Wraps the raw value in a
// single-element string ListField.
field = new ListField<String>(Arrays.asList(fieldValue));
break;
// Appends conjunction-matching spans for a single keyword query.
// NOTE(review): truncated in this excerpt — the per-attribute matching logic
// inside the for-loop continues beyond the end of this fragment.
private List<Span> appendConjunctionMatchingSpans(Tuple inputTuple, List<String> attributeNames, Set<String> queryTokenSet, String queryKeyword) throws DataflowException {
    // read the pre-computed token payload attached to the tuple
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : attributeNames) {
// NOTE(review): fragment of a larger getNextTuple-style routine —
// outputFields and computeSentenceList are defined outside this excerpt.
// Guard: no more input tuples, propagate end-of-stream.
if (currentTuple == null) return null;
// carry over the current tuple's fields, then append the sentence-list field
outputFields.addAll(currentTuple.getFields());
outputFields.add(new ListField<Span>(computeSentenceList(currentTuple)));
// Appends phrase-matching spans for a single keyword query.
// NOTE(review): truncated in this excerpt — the per-attribute matching logic
// inside the for-loop continues beyond the end of this fragment.
private List<Span> appendPhraseMatchingSpans(Tuple inputTuple, List<String> attributeNames, List<String> queryTokenList, List<String> queryTokenListWithStopwords, String queryKeyword) throws DataflowException {
    // read the pre-computed token payload attached to the tuple
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : attributeNames) {
@Test public void testGetNextTuplePeopleFirstName() throws Exception { String query = "g[^\\s]*"; List<Tuple> exactResults = RegexMatcherTestHelper.getQueryResults( PEOPLE_TABLE, query, Arrays.asList(TestConstants.FIRST_NAME)); List<Tuple> expectedResults = new ArrayList<Tuple>(); // expected to match "brad lie angelina" List<Tuple> data = TestConstants.getSamplePeopleTuples(); Schema spanSchema = new Schema.Builder().add(TestConstants.SCHEMA_PEOPLE).add(RESULTS, AttributeType.LIST).build(); List<Span> spans = new ArrayList<Span>(); spans.add(new Span(TestConstants.FIRST_NAME, 11, 17, query, "gelina")); IField spanField = new ListField<Span>(new ArrayList<Span>(spans)); List<IField> fields = new ArrayList<IField>(data.get(2).getFields()); fields.add(spanField); expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()]))); // expected to match "george lin lin" spans.clear(); spans.add(new Span(TestConstants.FIRST_NAME, 0, 6, query, "george")); spanField = new ListField<Span>(new ArrayList<Span>(spans)); fields = new ArrayList<IField>(data.get(3).getFields()); fields.add(spanField); expectedResults.add(new Tuple(spanSchema, fields.toArray(new IField[fields.size()]))); Assert.assertTrue(TestUtils.equals(expectedResults, exactResults)); }