/**
 * Builds a part-of-speech tagging annotator.
 *
 * The annotator is constructed under the fixed name {@code "pos"}, which the
 * {@code POSTaggerAnnotator(String, Properties)} constructor receives as its
 * annotator name.
 *
 * @param properties Configuration handed unchanged to the annotator constructor
 * @return A newly constructed {@code POSTaggerAnnotator}
 */
public Annotator posTagger(Properties properties) {
  return new POSTaggerAnnotator("pos", properties);
}
/**
 * Create a POS tagger annotator.
 *
 * The model is obtained by calling {@code loadModel(posLoc, verbose)} and the
 * result is forwarded, together with the remaining arguments, to another
 * constructor of this class.
 *
 * @param posLoc Location of POS tagger model (may be file path, classpath resource, or URL)
 * @param verbose Whether to show verbose information on model loading
 * @param maxSentenceLength Sentences longer than this length will be skipped in processing
 * @param numThreads The number of threads for the POS tagger annotator to use
 */
public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength, int numThreads) {
  this(loadModel(posLoc, verbose), maxSentenceLength, numThreads);
}
@Override public void annotate(Annotation annotation) { // turn the annotation into a sentence if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) { if (nThreads == 1) { for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { doOneSentence(sentence); } } else { MulticoreWrapper<CoreMap, CoreMap> wrapper = new MulticoreWrapper<>(nThreads, new POSTaggerProcessor()); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { wrapper.put(sentence); while (wrapper.peek()) { wrapper.poll(); } } wrapper.join(); while (wrapper.peek()) { wrapper.poll(); } } } else { throw new RuntimeException("unable to find words/tokens in: " + annotation); } }
// Run posAnnotator and then parseAnnotator over the document (in that order), then read the
// document's SentencesAnnotation into `sentences`.
// NOTE(review): presumably the parse step relies on the tags added by the POS step - confirm before reordering.
posAnnotator.annotate(document); parseAnnotator.annotate(document); sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
/** * Takes in a list of words and POS tags them. Tagging is done in place - the * returned CoreLabels are the same ones you passed in, with tags added. * * @param text * List of tokens to tag * @return Tokens with tags */ public List<? extends CoreLabel> processText(List<? extends CoreLabel> text) { // cdm 2009: copying isn't necessary; the POS tagger's apply() // method does not change the parameter passed in. But I think you // can't have it correctly generic without copying. Sigh. // if the text size is more than the max length allowed if (text.size() > maxSentenceLength) { return processTextLargerThanMaxLen(text); } ArrayList<TaggedWord> tagged = pos.apply(new ArrayList<CoreLabel>(text)); // copy in the tags Iterator<TaggedWord> taggedIter = tagged.iterator(); for (CoreLabel word : text) { TaggedWord cur = taggedIter.next(); word.setTag(cur.tag()); } return text; }
// Run posAnnotator and then parseAnnotator over the document (in that order), then read the
// document's SentencesAnnotation into `sentences`.
// NOTE(review): presumably the parse step relies on the tags added by the POS step - confirm before reordering.
posAnnotator.annotate(document); parseAnnotator.annotate(document); sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
/**
 * Assembles a four-stage annotation pipeline: tokenizer, sentence splitter,
 * POS tagger, and time annotator, added in that order.
 *
 * NOTE(review): the boolean constructor arguments are presumably verbosity
 * flags - confirm against the annotator constructors.
 *
 * @return A new pipeline containing the four annotators
 */
private static AnnotationPipeline makeNumericPipeline() {
  final AnnotationPipeline numericPipeline = new AnnotationPipeline();

  // Tokenization and sentence splitting.
  numericPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
  numericPipeline.addAnnotator(new WordsToSentencesAnnotator(false));

  // Tagging, then time annotation.
  numericPipeline.addAnnotator(new POSTaggerAnnotator(false));
  numericPipeline.addAnnotator(new TimeAnnotator(true));

  return numericPipeline;
}
public POSTaggerAnnotator(String annotatorName, Properties props) { String posLoc = props.getProperty(annotatorName + ".model"); if (posLoc == null) { posLoc = DefaultPaths.DEFAULT_POS_MODEL; } boolean verbose = PropertiesUtils.getBool(props, annotatorName + ".verbose", false); this.pos = loadModel(posLoc, verbose); this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", Integer.MAX_VALUE); this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads", PropertiesUtils.getInt(props, "nthreads", 1)); this.reuseTags = PropertiesUtils.getBool(props, annotatorName + ".reuseTags", false); }
@Override public void annotate(Annotation annotation) { // turn the annotation into a sentence if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) { if (nThreads == 1) { for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { doOneSentence(sentence); } } else { MulticoreWrapper<CoreMap, CoreMap> wrapper = new MulticoreWrapper<>(nThreads, new POSTaggerProcessor()); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { wrapper.put(sentence); while (wrapper.peek()) { wrapper.poll(); } } wrapper.join(); while (wrapper.peek()) { wrapper.poll(); } } } else { throw new RuntimeException("unable to find words/tokens in: " + annotation); } }
// Run posAnnotator and then parseAnnotator over the document (in that order), then read the
// document's SentencesAnnotation into `sentences`.
// NOTE(review): presumably the parse step relies on the tags added by the POS step - confirm before reordering.
posAnnotator.annotate(document); parseAnnotator.annotate(document); sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception { // useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false")); AnnotationPipeline pipeline = new AnnotationPipeline(); if (tokenize) { pipeline.addAnnotator(new TokenizerAnnotator(false, "en")); pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); } pipeline.addAnnotator(new POSTaggerAnnotator(false)); // pipeline.addAnnotator(new NumberAnnotator(false)); // pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false)); String timeAnnotator = props.getProperty("timeAnnotator", "sutime"); switch (timeAnnotator) { case "gutime": useGUTime = true; pipeline.addAnnotator(new GUTimeAnnotator("gutime", props)); break; case "heideltime": requiredDocDateFormat = "yyyy-MM-dd"; pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props)); break; case "sutime": pipeline.addAnnotator(new TimeAnnotator("sutime", props)); break; default: throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator); } return pipeline; }
/**
 * Create a POS tagger annotator.
 *
 * The model is obtained by calling {@code loadModel(posLoc, verbose)} and the
 * result is forwarded, together with the remaining arguments, to another
 * constructor of this class.
 *
 * @param posLoc Location of POS tagger model (may be file path, classpath resource, or URL)
 * @param verbose Whether to show verbose information on model loading
 * @param maxSentenceLength Sentences longer than this length will be skipped in processing
 * @param numThreads The number of threads for the POS tagger annotator to use
 */
public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength, int numThreads) {
  this(loadModel(posLoc, verbose), maxSentenceLength, numThreads);
}
@Override public void annotate(Annotation annotation) { // turn the annotation into a sentence if (annotation.has(CoreAnnotations.SentencesAnnotation.class)) { if (nThreads == 1) { for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { doOneSentence(sentence); } } else { MulticoreWrapper<CoreMap, CoreMap> wrapper = new MulticoreWrapper<CoreMap, CoreMap>(nThreads, new POSTaggerProcessor()); for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) { wrapper.put(sentence); while (wrapper.peek()) { wrapper.poll(); } } wrapper.join(); while (wrapper.peek()) { wrapper.poll(); } } } else { throw new RuntimeException("unable to find words/tokens in: " + annotation); } }
// Run posAnnotator and then parseAnnotator over the document (in that order), then read the
// document's SentencesAnnotation into `sentences`.
// NOTE(review): presumably the parse step relies on the tags added by the POS step - confirm before reordering.
posAnnotator.annotate(document); parseAnnotator.annotate(document); sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
/**
 * Builds a part-of-speech tagging annotator.
 *
 * The annotator is constructed under the fixed name {@code "pos"}, which the
 * {@code POSTaggerAnnotator(String, Properties)} constructor receives as its
 * annotator name.
 *
 * @param properties Configuration handed unchanged to the annotator constructor
 * @return A newly constructed {@code POSTaggerAnnotator}
 */
public Annotator posTagger(Properties properties) {
  return new POSTaggerAnnotator("pos", properties);
}
/**
 * Create a POS tagger annotator.
 *
 * The model is obtained by calling {@code loadModel(posLoc, verbose)} and the
 * result is forwarded, together with the remaining arguments, to another
 * constructor of this class.
 *
 * @param posLoc Location of POS tagger model (may be file path, classpath resource, or URL)
 * @param verbose Whether to show verbose information on model loading
 * @param maxSentenceLength Sentences longer than this length will be skipped in processing
 * @param numThreads The number of threads for the POS tagger annotator to use
 */
public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength, int numThreads) {
  this(loadModel(posLoc, verbose), maxSentenceLength, numThreads);
}
/**
 * Creates the POS tagger annotator named {@code "pos"} from {@code properties}.
 *
 * @return A newly constructed {@code POSTaggerAnnotator}
 * @throws RuntimeException wrapping any exception raised during construction
 */
@Override
public Annotator create() {
  Annotator tagger;
  try {
    tagger = new POSTaggerAnnotator("pos", properties);
  } catch (Exception e) {
    // Surface every construction failure as an unchecked exception, keeping the cause.
    throw new RuntimeException(e);
  }
  return tagger;
}
/**
 * Creates a POS tagger annotator from a model location.
 *
 * The model is obtained by calling {@code loadModel(posLoc, verbose)} and the
 * result is forwarded, with {@code verbose} and {@code maxSentenceLength}, to
 * another constructor of this class.
 *
 * @param posLoc Location of the POS tagger model, as accepted by {@code loadModel}
 * @param verbose Passed both to {@code loadModel} and to the delegate constructor
 * @param maxSentenceLength Maximum sentence length, passed to the delegate constructor
 */
public POSTaggerAnnotator(String posLoc, boolean verbose, int maxSentenceLength) {
  this(loadModel(posLoc, verbose), verbose, maxSentenceLength);
}
/**
 * Creates a POS tagger annotator from {@code props}.
 *
 * Reads {@code pos.maxlen} (defaulting to {@code Integer.MAX_VALUE} when
 * absent) and {@code pos.model} (defaulting to
 * {@code DefaultPaths.DEFAULT_POS_MODEL}), and always passes {@code true} as
 * the verbose flag.
 *
 * @return A newly constructed {@code POSTaggerAnnotator}
 * @throws RuntimeException wrapping any exception raised while reading the
 *         properties or constructing the annotator (including a malformed
 *         {@code pos.maxlen})
 */
public Annotator create() {
  try {
    final String configuredMaxLen = props.getProperty("pos.maxlen");
    final int maxLen =
        (configuredMaxLen == null) ? Integer.MAX_VALUE : Integer.parseInt(configuredMaxLen);
    final String modelLocation =
        props.getProperty("pos.model", DefaultPaths.DEFAULT_POS_MODEL);
    return new POSTaggerAnnotator(modelLocation, true, maxLen);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
});
public POSTaggerAnnotator(String annotatorName, Properties props) { String posLoc = props.getProperty(annotatorName + ".model"); if (posLoc == null) { posLoc = DefaultPaths.DEFAULT_POS_MODEL; } boolean verbose = PropertiesUtils.getBool(props, annotatorName + ".verbose", false); this.pos = loadModel(posLoc, verbose); this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", Integer.MAX_VALUE); this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads", PropertiesUtils.getInt(props, "nthreads", 1)); this.reuseTags = PropertiesUtils.getBool(props, annotatorName + ".reuseTags", false); }