/**
 * Splits a raw date string into tokens and stores them in {@code tokens}.
 *
 * <p>Each hyphen is padded with spaces so that it becomes a separate token, each
 * comma is replaced by a space, and the resulting text is run through
 * {@link PTBTokenizer}. The {@code tokens} list is reset on every call, so a
 * {@code null} input leaves it empty.
 *
 * @param inputDate the date text to tokenize; {@code null} is reported on
 *     standard output and otherwise ignored
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<>();
  if (inputDate == null) {
    System.out.println("Null input date");
    // Stop here: continuing would dereference the null input and crash.
    return;
  }

  // Literal replacements give the same text as the regex "[-]" / "," substitutions
  // without compiling a pattern on every call.
  String str = inputDate.replace("-", " - ").replace(',', ' ');

  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
  while (tokenizer.hasNext()) {
    Word nextToken = tokenizer.next();
    tokens.add(nextToken.toString());
  }

  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Test program for demonstrating the Stemmer. The stem of every word is
 * written to standard output, followed by a single space; a newline is
 * printed at the end.
 *
 * <p>Two modes are supported:
 * <ul>
 *   <li>{@code Stemmer -file file-name}: the file is read as UTF-8, split into
 *       tokens with PTBTokenizer, and each token is stemmed;</li>
 *   <li>{@code Stemmer word word ...}: every argument is stemmed as a word.</li>
 * </ul>
 *
 * <p>Note that the word stemmed is expected to be in lower case: forcing lower
 * case must be done outside the Stemmer class.
 *
 * @param args either {@code -file} followed by a file name, or the words to stem
 * @throws IOException if the input file cannot be opened
 */
public static void main(String[] args) throws IOException {
  final String usage = "Usage: Stemmer -file file-name | Stemmer word word ...";
  if (args.length == 0) {
    // Nothing to stem; explain the invocation instead of failing on args[0].
    System.err.println(usage);
    return;
  }

  Stemmer s = new Stemmer();
  if (args[0].equals("-file")) {
    if (args.length < 2) {
      // "-file" needs a file name after it.
      System.err.println(usage);
      return;
    }
    // try-with-resources closes the file even if tokenizing or stemming fails.
    try (FileInputStream fileStream = new FileInputStream(args[1]);
         InputStreamReader fileReader = new InputStreamReader(fileStream, "utf-8")) {
      Iterator<Word> it = PTBTokenizer.newPTBTokenizer(fileReader);
      while (it.hasNext()) {
        Word token = it.next();
        System.out.print(s.stem(token.word()));
        System.out.print(' ');
      }
    }
  } else {
    for (String arg : args) {
      System.out.print(s.stem(arg));
      System.out.print(' ');
    }
  }
  System.out.println();
}
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
for (String line; (line = reader.readLine()) != null; ) { System.out.println("Processing sentence: " + line); PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line)); List<Word> words = ptb.tokenize(); Tree parseTree = lp.parseTree(words);
// Tokenize the whole text held in `doc` into CoreLabel tokens in one pass.
// NOTE(review): the two boolean arguments (false, true) are presumably the
// "tokenize newlines" and "invertible" flags of the three-argument
// newPTBTokenizer overload -- confirm against the PTBTokenizer API.
PTBTokenizer<CoreLabel> ptb = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(doc)), false, true);
// Drain the tokenizer into a list.
List<CoreLabel> words = ptb.tokenize();
/**
 * Creates a PTBTokenizer over the given reader that produces {@link Word}
 * tokens, with carriage returns handled as ordinary whitespace.
 *
 * @param r the Reader supplying the text to be tokenized
 * @return a PTBTokenizer that turns the stream into {@link Word} objects
 */
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
  // Delegate to the two-argument overload with its boolean flag switched off.
  PTBTokenizer<Word> wordTokenizer = newPTBTokenizer(r, false);
  return wordTokenizer;
}
/**
 * Breaks a date string into tokens and stores them in {@code tokens}.
 *
 * <p>Hyphens are surrounded with spaces so each becomes its own token, commas
 * are turned into spaces, and the result is tokenized with
 * {@link PTBTokenizer}. {@code tokens} is replaced by a fresh list on every
 * call and stays empty when the input is {@code null}.
 *
 * @param inputDate the date text to tokenize; a {@code null} value is reported
 *     on standard output and produces no tokens
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<String>();
  if (inputDate == null) {
    System.out.println("Null input date");
    // Bail out: the code below would otherwise throw a NullPointerException.
    return;
  }

  // Plain-text replacement is equivalent to the regex "[-]" -> " - " substitution
  // and avoids building a Pattern and Matcher for every date.
  String spacedHyphens = inputDate.replace("-", " - ");
  String str = spacedHyphens.replace(",", " ");

  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
  while (tokenizer.hasNext()) {
    Word nextToken = tokenizer.next();
    tokens.add(nextToken.toString());
  }

  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Tokenizes a date string and stores the token texts in {@code tokens}.
 *
 * <p>Before tokenizing with {@link PTBTokenizer}, every hyphen is rewritten as
 * {@code " - "} and every comma as a space. The {@code tokens} list is
 * recreated on each call; for a {@code null} input it is left empty.
 *
 * @param inputDate the date text to tokenize; {@code null} only triggers a
 *     message on standard output
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<>();
  if (inputDate == null) {
    System.out.println("Null input date");
    // Without this return the null value reaches the string handling below
    // and raises a NullPointerException.
    return;
  }

  // Both substitutions are literal, so String.replace yields the same text as
  // the regex versions without compiling a pattern per call.
  String str = inputDate.replace("-", " - ").replace(",", " ");

  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
  while (tokenizer.hasNext()) {
    tokens.add(tokenizer.next().toString());
  }

  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Tokenizes a sentence with PTBTokenizer and returns the word strings,
 * preceded by the CoNLL root symbol.
 *
 * @param sentence the text to tokenize
 * @return an array whose first element is {@code is2.io.CONLLReader09.ROOT}
 *     and whose remaining elements are the token strings in order
 */
@Override
public String[] tokenize(String sentence) {
  Reader sentenceReader = new StringReader(sentence);
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(sentenceReader);

  List<String> forms = new ArrayList<String>();
  while (ptb.hasNext()) {
    Word word = ptb.next();
    forms.add(word.word());
  }

  // Slot 0 is reserved for the root symbol; tokens follow from index 1.
  String[] result = new String[forms.size() + 1];
  result[0] = is2.io.CONLLReader09.ROOT;
  for (int idx = 0; idx < forms.size(); idx++) {
    result[idx + 1] = forms.get(idx);
  }
  return result;
}
/**
 * Converts a date string into a list of tokens kept in {@code tokens}.
 *
 * <p>Hyphens are isolated by surrounding them with spaces, commas are replaced
 * with spaces, and {@link PTBTokenizer} splits the result. {@code tokens} is
 * reset at the start of every call, so it is empty after a {@code null} input.
 *
 * @param inputDate the date text to tokenize; {@code null} is reported on
 *     standard output and yields an empty token list
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<String>();
  if (inputDate == null) {
    System.out.println("Null input date");
    // Return instead of falling through into a NullPointerException.
    return;
  }

  // Literal replace calls: same output as the regex-based substitutions, with
  // no Pattern compiled on each invocation.
  String str = inputDate.replace("-", " - ").replace(',', ' ');

  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
  while (tokenizer.hasNext()) {
    Word nextToken = tokenizer.next();
    tokens.add(nextToken.toString());
  }

  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Splits a sentence into PTB tokens and returns their word strings with the
 * CoNLL root symbol placed in front.
 *
 * @param sentence the text to tokenize
 * @return an array starting with {@code is2.io.CONLLReader09.ROOT}, followed by
 *     one entry per token in reading order
 */
@Override
public String[] tokenize(String sentence) {
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));

  List<String> forms = new ArrayList<>();
  while (ptb.hasNext()) {
    forms.add(ptb.next().word());
  }

  // The root symbol always occupies index 0 of the result.
  forms.add(0, is2.io.CONLLReader09.ROOT);
  return forms.toArray(new String[forms.size()]);
}
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
for (String line; (line = reader.readLine()) != null;) { PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line)); List<Word> words = ptb.tokenize(); if (!words.isEmpty()) {
/**
 * Tokenizes a sentence with PTBTokenizer and returns each token together with
 * its begin and end positions, shifted by the running {@code startpos} offset.
 *
 * <p>The first element of the result is a root entry built from
 * {@code is2.io.CONLLReader09.ROOT} with positions 0 and 0. After the tokens
 * have been collected, {@code startpos} is advanced by the sentence length
 * plus one.
 *
 * @param sentence the text to tokenize
 * @return the root entry followed by one {@code StringInText} per token
 */
public StringInText[] tokenizeplus(String sentence) {
  PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));

  List<StringInText> spans = new ArrayList<>();
  while (ptb.hasNext()) {
    Word word = ptb.next();
    spans.add(new StringInText(
        word.word(),
        word.beginPosition() + startpos,
        word.endPosition() + startpos));
  }

  // The root entry always sits at index 0, ahead of the real tokens.
  spans.add(0, new StringInText(is2.io.CONLLReader09.ROOT, 0, 0));

  // NOTE(review): the extra 1 presumably covers a separator character between
  // consecutive sentences -- confirm with the callers.
  startpos += sentence.length() + 1;

  return spans.toArray(new StringInText[spans.size()]);
}
@Override public Tree next() { if (line == null) { throw new NoSuchElementException(); } Reader lineReader = new StringReader(line); line = null; List<Word> words; if (tokenized) { words = WhitespaceTokenizer.newWordWhitespaceTokenizer(lineReader).tokenize(); } else { words = PTBTokenizer.newPTBTokenizer(lineReader).tokenize(); } if (!words.isEmpty()) { // the parser throws an exception if told to parse an empty sentence. Tree parseTree = lp.apply(words); return parseTree; } else { return new SimpleTree(); } }
while ((line = reader.readLine()) != null) { System.out.println("Processing sentence: " + line); PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(line)); List<Word> words = ptb.tokenize(); lp.parse(words);