/**
 * Parses a whitespace-separated sentence of {@code token_tag} pairs into a
 * {@link POSSample}, e.g. {@code "The_DT dog_NN"}.
 *
 * @param sentenceString the sentence to parse
 * @return the parsed {@link POSSample}
 * @throws InvalidFormatException if any token lacks an underscore separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

  String[] sentence = new String[tokenTags.length];
  String[] tags = new String[tokenTags.length];

  for (int i = 0; i < tokenTags.length; i++) {
    String tokenTag = tokenTags[i];
    // The tag follows the LAST '_', so the token itself may contain underscores.
    int split = tokenTag.lastIndexOf("_");
    if (split == -1) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + tokenTag + "'!");
    }
    sentence[i] = tokenTag.substring(0, split);
    tags[i] = tokenTag.substring(split + 1);
  }

  return new POSSample(sentence, tags);
}
/**
 * Reads the next paragraph from the underlying AD corpus stream and converts
 * its parse tree into a {@link POSSample}.
 *
 * @return the next sample, or {@code null} once the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
public POSSample read() throws IOException {
  Sentence paragraph = this.adSentenceStream.read();
  if (paragraph == null) {
    return null;
  }

  List<String> sentence = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  // Walk the syntax tree, collecting token/tag pairs into the two lists.
  process(paragraph.getRoot(), sentence, tags);

  return new POSSample(sentence, tags);
}
/**
 * Reads the next CoNLL-U sentence and maps each word line to a token/POS-tag
 * pair of a {@link POSSample}.
 *
 * @return the next sample, or {@code null} when no sentences remain
 * @throws IOException if the underlying sample stream fails
 */
@Override
public POSSample read() throws IOException {
  ConlluSentence sentence = samples.read();
  if (sentence == null) {
    return null;
  }

  List<String> tokens = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  for (ConlluWordLine line : sentence.getWordLines()) {
    tokens.add(line.getForm());
    // Tag selection (universal vs. language-specific) is driven by 'tagset'.
    tags.add(line.getPosTag(tagset));
  }
  return new POSSample(tokens, tags);
}
}
/**
 * Reads the next parse from the underlying stream and converts its tag nodes
 * into a {@link POSSample}.
 *
 * @return the next sample, or {@code null} once the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
public POSSample read() throws IOException {
  Parse parse = samples.read();
  if (parse == null) {
    return null;
  }

  List<String> sentence = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  for (Parse tagNode : parse.getTagNodes()) {
    sentence.add(tagNode.getCoveredText());
    tags.add(tagNode.getType());
  }
  return new POSSample(sentence, tags);
}
}
// NOTE(review): fragment without its enclosing method — 'sample' is assigned from
// read() and then immediately overwritten; presumably these two statements belong
// to different branches of the enclosing logic — TODO confirm against full context.
sample = read();
// Materialize the collected token/tag lists into the String[] pair POSSample expects.
sample = new POSSample(tokens.toArray(new String[tokens.size()]), tags.toArray(new String[tags.size()]));
// Best-effort recovery: report the unparsable sentence and substitute an empty
// sample so the surrounding stream keeps producing instead of failing.
System.out.println("Error during parsing, ignoring sentence: " + sentence);
sample = new POSSample(new String[]{}, new String[]{});
/**
 * Reads the next parse and flattens its tag nodes into parallel token/tag
 * arrays for a {@link POSSample}.
 *
 * @return the next sample, or {@code null} once the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
public POSSample read() throws IOException {
  Parse parse = samples.read();
  if (parse == null) {
    return null;
  }

  Parse[] nodes = parse.getTagNodes();
  String[] toks = new String[nodes.length];
  String[] preds = new String[nodes.length];

  int ti = 0;
  for (Parse tok : nodes) {
    toks[ti] = tok.getCoveredText();
    preds[ti] = tok.getType();
    ti++;
  }

  return new POSSample(toks, preds);
}
}
/**
 * Evaluates the given reference {@link POSSample} by re-tagging its sentence
 * with the {@link POSTagger} and scoring each predicted tag against the
 * reference tag, updating the word accuracy accumulator.
 *
 * @param reference the reference {@link POSSample}
 * @return the predicted {@link POSSample}
 */
@Override
protected POSSample processSample(POSSample reference) {
  // Note: getAddictionalContext() is the (misspelled) name of the actual API method.
  String[] predictedTags = tagger.tag(reference.getSentence(), reference.getAddictionalContext());
  String[] referenceTags = reference.getTags();

  for (int i = 0; i < referenceTags.length; i++) {
    // 1 for a match, 0 for a miss; the running mean of these is the word accuracy.
    wordAccuracy.add(referenceTags[i].equals(predictedTags[i]) ? 1 : 0);
  }

  return new POSSample(reference.getSentence(), predictedTags);
}
/**
 * Regression smoke test: POS-tags and chunks the Leipzig sample sentences with
 * the pre-trained models, then compares a digest over all emitted chunk tags
 * against a known-good value to detect any behavioral drift in tagger/chunker.
 */
@Test
public void evalChunkerModel() throws Exception {
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);

  // Pre-trained models are loaded from the external OpenNLP data directory.
  POSTagger tagger = new POSTaggerME(new POSModel(
      new File(getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));

  Chunker chunker = new ChunkerME(new ChunkerModel(
      new File(getOpennlpDataDir(), "models-sf/en-chunker.bin")));

  try (ObjectStream<LeipzigTestSample> lines = createLineWiseStream()) {
    LeipzigTestSample line;
    while ((line = lines.read()) != null) {
      // Tag the pre-tokenized sentence, then chunk it from the tagged sample.
      POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText()));
      String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
      for (String chunk : chunks) {
        digest.update(chunk.getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  // Expected digest from a known-good run; any model/output change alters it.
  Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"),
      new BigInteger(1, digest.digest()));
}
// Tag the whitespace-tokenized input line and echo the sample (token_tag form)
// to stdout.
String[] tags = tagger.tag(whitespaceTokenizerLine);
POSSample sample = new POSSample(whitespaceTokenizerLine, tags);
System.out.println(sample.toString());
/**
 * Parses a sentence of whitespace-separated {@code token_tag} pairs into a
 * {@link POSSample}.
 *
 * @param sentenceString the sentence to parse
 * @return the parsed {@link POSSample}
 * @throws InvalidFormatException if a token contains no {@code '_'} separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

  int count = tokenTags.length;
  String[] sentence = new String[count];
  String[] tags = new String[count];

  int i = 0;
  for (String tokenTag : tokenTags) {
    // Split on the last underscore so tokens containing '_' still parse.
    int split = tokenTag.lastIndexOf("_");
    if (split < 0) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + tokenTag + "'!");
    }
    sentence[i] = tokenTag.substring(0, split);
    tags[i] = tokenTag.substring(split + 1);
    i++;
  }

  return new POSSample(sentence, tags);
}
/**
 * Parses a whitespace-separated {@code token_tag} sentence into a
 * {@link POSSample}.
 *
 * @param sentenceString the sentence string
 * @return the resulting {@link POSSample}
 * @throws InvalidFormatException if any pair lacks the {@code '_'} separator
 */
public static POSSample parse(String sentenceString) throws InvalidFormatException {
  String[] tokenTags = WhitespaceTokenizer.INSTANCE.tokenize(sentenceString);

  String[] sentence = new String[tokenTags.length];
  String[] tags = new String[tokenTags.length];

  for (int i = 0; i < tokenTags.length; i++) {
    String pair = tokenTags[i];
    // Use the last underscore as the separator; tokens may contain '_'.
    int sep = pair.lastIndexOf("_");
    if (sep == -1) {
      throw new InvalidFormatException("Cannot find \"_\" inside token '" + pair + "'!");
    }
    sentence[i] = pair.substring(0, sep);
    tags[i] = pair.substring(sep + 1);
  }

  return new POSSample(sentence, tags);
}
/**
 * Reads the next paragraph from the AD sentence stream and converts it into a
 * {@link POSSample}.
 *
 * @return the next sample, or {@code null} when the stream is exhausted
 * @throws IOException if the underlying stream fails
 */
public POSSample read() throws IOException {
  Sentence paragraph;
  if ((paragraph = this.adSentenceStream.read()) == null) {
    return null;
  }

  List<String> sentence = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  // Traverse the parse tree rooted at the paragraph, filling both lists.
  process(paragraph.getRoot(), sentence, tags);

  return new POSSample(sentence, tags);
}
/**
 * Converts the next AD paragraph into a {@link POSSample}.
 *
 * @return the next sample, or {@code null} at end of stream
 * @throws IOException if reading the underlying stream fails
 */
public POSSample read() throws IOException {
  Sentence paragraph = adSentenceStream.read();
  if (paragraph != null) {
    Node root = paragraph.getRoot();
    List<String> words = new ArrayList<>();
    List<String> posTags = new ArrayList<>();
    // Recursively collect token/tag pairs from the tree.
    process(root, words, posTags);
    return new POSSample(words, posTags);
  }
  return null;
}
/**
 * Converts the tag nodes of the next parse into a {@link POSSample}.
 *
 * @return the next sample, or {@code null} at end of stream
 * @throws IOException if reading the underlying stream fails
 */
public POSSample read() throws IOException {
  Parse parse = samples.read();
  if (parse != null) {
    List<String> words = new ArrayList<>();
    List<String> posTags = new ArrayList<>();
    for (Parse node : parse.getTagNodes()) {
      words.add(node.getCoveredText());
      posTags.add(node.getType());
    }
    return new POSSample(words, posTags);
  }
  return null;
}
}
/**
 * Maps the word lines of the next CoNLL-U sentence to a {@link POSSample}.
 *
 * @return the next sample, or {@code null} when the stream is drained
 * @throws IOException if the underlying sample stream fails
 */
@Override
public POSSample read() throws IOException {
  ConlluSentence sentence = samples.read();
  if (sentence == null) {
    return null;
  }

  List<String> tokens = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  for (ConlluWordLine wordLine : sentence.getWordLines()) {
    tokens.add(wordLine.getForm());
    tags.add(wordLine.getPosTag(tagset));
  }

  return new POSSample(tokens, tags);
}
}
/**
 * Reads the next CoNLL-U sentence and builds a {@link POSSample} from its
 * word forms and POS tags.
 *
 * @return the next sample, or {@code null} when no sentences remain
 * @throws IOException if the underlying sample stream fails
 */
@Override
public POSSample read() throws IOException {
  ConlluSentence next = samples.read();
  if (next == null) {
    return null;
  }

  List<String> forms = new ArrayList<>();
  List<String> posTags = new ArrayList<>();
  for (ConlluWordLine wordLine : next.getWordLines()) {
    forms.add(wordLine.getForm());
    // The configured tagset selects which POS column is used.
    posTags.add(wordLine.getPosTag(tagset));
  }
  return new POSSample(forms, posTags);
}
}
/**
 * Builds a {@link POSSample} from the tag nodes of the next parse.
 *
 * @return the next sample, or {@code null} once the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
public POSSample read() throws IOException {
  Parse parse = samples.read();
  if (parse == null) {
    return null;
  }

  List<String> sentence = new ArrayList<>();
  List<String> tags = new ArrayList<>();
  // Each tag node contributes its covered text as token and its type as tag.
  for (Parse tagNode : parse.getTagNodes()) {
    sentence.add(tagNode.getCoveredText());
    tags.add(tagNode.getType());
  }

  return new POSSample(sentence, tags);
}
}
/**
 * Collects the tokens covered by the given sentence annotation, together with
 * their POS feature values, and appends the result as a {@link POSSample} to
 * {@code mPOSSamples}.
 *
 * @param tcas the CAS holding the annotations
 * @param sentence the sentence annotation bounding the tokens to collect
 */
private void process(CAS tcas, AnnotationFS sentence) {
  FSIndex<AnnotationFS> allTokens = tcas.getAnnotationIndex(mTokenType);
  ContainingConstraint containingConstraint = new ContainingConstraint(sentence);

  List<String> tokens = new ArrayList<>();
  List<String> tags = new ArrayList<>();

  // Iterate only over the tokens located inside the sentence annotation.
  Iterator<AnnotationFS> containingTokens =
      tcas.createFilteredIterator(allTokens.iterator(), containingConstraint);
  while (containingTokens.hasNext()) {
    AnnotationFS tokenAnnotation = containingTokens.next();
    tokens.add(tokenAnnotation.getCoveredText().trim());
    tags.add(tokenAnnotation.getFeatureValueAsString(mPOSFeature));
  }

  mPOSSamples.add(new POSSample(tokens, tags));
}
/**
 * Flattens the tag nodes of the next parse into parallel token and tag arrays
 * and returns them as a {@link POSSample}.
 *
 * @return the next sample, or {@code null} at end of stream
 * @throws IOException if reading from the underlying stream fails
 */
public POSSample read() throws IOException {
  Parse parse = samples.read();
  if (parse == null) {
    return null;
  }

  Parse[] tagNodes = parse.getTagNodes();
  int n = tagNodes.length;
  String[] tokens = new String[n];
  String[] tags = new String[n];

  for (int i = 0; i < n; i++) {
    tokens[i] = tagNodes[i].getCoveredText();
    tags[i] = tagNodes[i].getType();
  }

  return new POSSample(tokens, tags);
}
}