/**
 * Runs an {@link NlpEntityOperator} over the {@code MedlineIndexWriter.ABSTRACT} attribute of the
 * tuples produced by a scan-based source on {@code tableName}, then adds the elapsed time and the
 * number of extracted spans to the running totals.
 *
 * <p>The measured time covers {@code open()}, the whole iteration and {@code close()}; it is
 * accumulated into {@code totalMatchingTime} in seconds. The span count is accumulated into
 * {@code totalResults}. Neither total is updated when the run fails with an exception.
 *
 * @param tableName name of the table handed to the scan source predicate
 * @param tokenType entity type the NLP predicate is built with
 * @throws Exception if building, opening, iterating or closing the operator fails
 */
public static void matchNLP(String tableName, NlpEntityType tokenType) throws Exception {
    List<String> attributeNames = Arrays.asList(MedlineIndexWriter.ABSTRACT);

    ISourceOperator sourceOperator = new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));
    NlpEntityPredicate nlpEntityPredicate =
            new NlpEntityPredicate(tokenType, attributeNames, SchemaConstants.SPAN_LIST);
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(nlpEntityPredicate);
    nlpEntityOperator.setInputOperator(sourceOperator);

    long startMatchTime = System.currentTimeMillis();
    int counter = 0;

    nlpEntityOperator.open();
    try {
        Tuple nextTuple;
        while ((nextTuple = nlpEntityOperator.getNextTuple()) != null) {
            ListField<Span> spanListField = nextTuple.getField(SchemaConstants.SPAN_LIST);
            List<Span> spanList = spanListField.getValue();
            counter += spanList.size();
        }
    } finally {
        // Release the operator even when a tuple fails to process, so a failed run does not leak it.
        nlpEntityOperator.close();
    }

    long endMatchTime = System.currentTimeMillis();
    // Milliseconds to seconds.
    double matchTime = (endMatchTime - startMatchTime) / 1000.0;

    totalMatchingTime += matchTime;
    totalResults += counter;
}
/**
 * Returns the category label for the given entity type.
 *
 * @param nlpEntityType the entity type to classify
 * @return {@code "POS"} when {@code isPOSTokenType} accepts the type, {@code "NE_ALL"} otherwise
 */
private static String getNlpTypeIndicator(NlpEntityType nlpEntityType) {
    return isPOSTokenType(nlpEntityType) ? "POS" : "NE_ALL";
}
/**
 * Runs an {@link NlpEntityOperator} on top of a scan-based source over {@code tableName} and
 * collects every tuple it emits.
 *
 * @param tableName      name of the table handed to the scan source predicate
 * @param attributeNames attributes the NLP predicate is built with
 * @param nlpEntityType  entity type the NLP predicate is built with
 * @param limit          value passed to the operator's {@code setLimit}
 * @param offset         value passed to the operator's {@code setOffset}
 * @return the tuples returned by the operator, in the order they were produced; empty if none
 * @throws Exception if building, opening, iterating or closing the operator fails
 */
public List<Tuple> getQueryResults(String tableName, List<String> attributeNames,
        NlpEntityType nlpEntityType, int limit, int offset) throws Exception {
    ScanBasedSourceOperator scanSource = new ScanBasedSourceOperator(new ScanSourcePredicate(tableName));

    NlpEntityPredicate nlpEntityPredicate = new NlpEntityPredicate(nlpEntityType, attributeNames, RESULTS);
    NlpEntityOperator nlpEntityOperator = new NlpEntityOperator(nlpEntityPredicate);
    nlpEntityOperator.setInputOperator(scanSource);
    nlpEntityOperator.setLimit(limit);
    nlpEntityOperator.setOffset(offset);

    List<Tuple> results = new ArrayList<>();

    nlpEntityOperator.open();
    try {
        Tuple nextTuple;
        while ((nextTuple = nlpEntityOperator.getNextTuple()) != null) {
            results.add(nextTuple);
        }
    } finally {
        // Always close the operator, including when iteration throws, so it is not leaked.
        nlpEntityOperator.close();
    }

    return results;
}
/**
 * Creates a new {@link NlpEntityOperator} constructed from this object.
 *
 * @return a freshly constructed operator
 */
@Override
public NlpEntityOperator newOperator() {
    NlpEntityOperator operator = new NlpEntityOperator(this);
    return operator;
}
if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) { props.setProperty("annotators", "tokenize, ssplit, pos"); if (posPipeline == null) { if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) { stanfordNlpConstant = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); } else { NlpEntityType nlpEntityType = mapNlpEntityType(stanfordNlpConstant); if (nlpEntityType == null) { continue; if (spanList.size() >= 1 && (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("NE_ALL"))) { Span previousSpan = spanList.get(spanList.size() - 1); if (previousSpan.getAttributeName().equals(span.getAttributeName()) && (span.getStart() - previousSpan.getEnd() <= 1) && previousSpan.getKey().equals(span.getKey())) { Span newSpan = mergeTwoSpans(previousSpan, span); span = newSpan; spanList.remove(spanList.size() - 1);
/**
 * Builds the query plan from {@code getLogicalPlan2()} and checks its operator graph from the
 * sink downwards: a {@code TupleSink} fed by a {@code Join}, whose inner input is a
 * {@code RegexMatcher} and outer input is an {@code NlpEntityOperator}; both of those read from
 * distinct outputs of one shared {@code OneToNBroadcastConnector}, which in turn is fed by a
 * {@code KeywordMatcherSourceOperator}.
 */
@Test
public void testLogicalPlan2() throws Exception {
    LogicalPlan plan = getLogicalPlan2();
    Plan builtPlan = plan.buildQueryPlan();

    // Root of the plan.
    ISink sink = builtPlan.getRoot();
    Assert.assertTrue(sink instanceof TupleSink);

    // The sink is fed by the join.
    IOperator joinOperator = ((TupleSink) sink).getInputOperator();
    Assert.assertTrue(joinOperator instanceof Join);

    // The two join inputs.
    IOperator innerInput = ((Join) joinOperator).getInnerInputOperator();
    Assert.assertTrue(innerInput instanceof RegexMatcher);

    IOperator outerInput = ((Join) joinOperator).getOuterInputOperator();
    Assert.assertTrue(outerInput instanceof NlpEntityOperator);

    // Each join input reads from a connector output.
    IOperator regexUpstream = ((RegexMatcher) innerInput).getInputOperator();
    Assert.assertTrue(regexUpstream instanceof ConnectorOutputOperator);

    IOperator nlpUpstream = ((NlpEntityOperator) outerInput).getInputOperator();
    Assert.assertTrue(nlpUpstream instanceof ConnectorOutputOperator);

    // The two connector outputs must use different output indices.
    HashSet<Integer> outputIndices = new HashSet<>();
    outputIndices.add(((ConnectorOutputOperator) regexUpstream).getOutputIndex());
    outputIndices.add(((ConnectorOutputOperator) nlpUpstream).getOutputIndex());
    Assert.assertEquals(outputIndices.size(), 2);

    // Both outputs must belong to the very same connector instance.
    OneToNBroadcastConnector regexConnector = ((ConnectorOutputOperator) regexUpstream).getOwnerConnector();
    OneToNBroadcastConnector nlpConnector = ((ConnectorOutputOperator) nlpUpstream).getOwnerConnector();
    Assert.assertSame(regexConnector, nlpConnector);

    // The shared connector is fed by the keyword source.
    IOperator source = regexConnector.getInputOperator();
    Assert.assertTrue(source instanceof KeywordMatcherSourceOperator);
}
/**
 * Extracts NLP spans from every attribute named by the predicate and, if any were found,
 * returns a copy of the input tuple extended with the collected span list.
 *
 * @param inputTuple the tuple to process
 * @return a tuple built from {@code inputTuple} with an extra {@code AttributeType.LIST} field
 *         under the predicate's result attribute, or {@code null} when no span was extracted
 * @throws TexeraException declared by the operator contract
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TexeraException {
    List<Span> spans = new ArrayList<>();
    for (String attribute : predicate.getAttributeNames()) {
        IField attributeField = inputTuple.getField(attribute);
        spans.addAll(extractNlpSpans(attributeField, attribute));
    }

    if (!spans.isEmpty()) {
        Tuple.Builder builder = new Tuple.Builder(inputTuple);
        return builder
                .add(predicate.getResultAttribute(), AttributeType.LIST, new ListField<Span>(spans))
                .build();
    }

    // Nothing matched in any attribute.
    return null;
}
Assert.assertTrue(connectorOut1 instanceof ConnectorOutputOperator); IOperator connectorOut2 = ((NlpEntityOperator) join1Input2).getInputOperator(); Assert.assertTrue(connectorOut2 instanceof ConnectorOutputOperator);
/**
 * Checks that the output schemas reported by the logical plan from {@code getLogicalPlan2()}
 * equal the schemas reported by the corresponding operators of the built query plan while the
 * join is open. Covers the join, the keyword source, the regex matcher and the NLP entity
 * operator.
 */
@Test
public void testGetOutputSchema2() throws Exception {
    LogicalPlan plan = getLogicalPlan2();
    Plan builtPlan = plan.buildQueryPlan();

    // Walk the built plan from the sink down to the keyword source.
    ISink sink = builtPlan.getRoot();
    IOperator joinOperator = ((TupleSink) sink).getInputOperator();

    IOperator regexOperator = ((Join) joinOperator).getInnerInputOperator();
    IOperator nlpOperator = ((Join) joinOperator).getOuterInputOperator();

    IOperator regexUpstream = ((RegexMatcher) regexOperator).getInputOperator();
    IOperator nlpUpstream = ((NlpEntityOperator) nlpOperator).getInputOperator();

    OneToNBroadcastConnector regexConnector = ((ConnectorOutputOperator) regexUpstream).getOwnerConnector();
    // Not used further; retained so the cast and lookup on the second branch still run.
    OneToNBroadcastConnector nlpConnector = ((ConnectorOutputOperator) nlpUpstream).getOwnerConnector();

    IOperator sourceOperator = regexConnector.getInputOperator();

    // Schemas reported by the physical operators, read while the join is open.
    joinOperator.open();
    Schema joinSchemaFromOperator = joinOperator.getOutputSchema();
    Schema sourceSchemaFromOperator = sourceOperator.getOutputSchema();
    Schema regexSchemaFromOperator = regexOperator.getOutputSchema();
    Schema nlpSchemaFromOperator = nlpOperator.getOutputSchema();
    joinOperator.close();

    // Schemas reported by the logical plan for the same operators.
    Schema joinSchemaFromPlan = plan.getOperatorOutputSchema(JOIN_DISTANCE_ID);
    Schema sourceSchemaFromPlan = plan.getOperatorOutputSchema(KEYWORD_SOURCE_ID);
    Schema regexSchemaFromPlan = plan.getOperatorOutputSchema(REGEX_ID);
    Schema nlpSchemaFromPlan = plan.getOperatorOutputSchema(NLP_ENTITY_ID);

    Assert.assertEquals(joinSchemaFromOperator, joinSchemaFromPlan);
    Assert.assertEquals(sourceSchemaFromOperator, sourceSchemaFromPlan);
    Assert.assertEquals(regexSchemaFromOperator, regexSchemaFromPlan);
    Assert.assertEquals(nlpSchemaFromOperator, nlpSchemaFromPlan);
}