/**
 * Converts a single PTB token into its presentable form. For instance,
 * it transforms {@code -LRB-} into {@code (}.
 *
 * @param ptbText The PTB token to convert
 * @return The result of {@link #ptb2Text(String)} on the space-padded token,
 *         with leading and trailing whitespace trimmed
 */
public static String ptbToken2Text(String ptbText) {
  // Surround the token with one space on each side before converting it;
  // the padding is stripped again from the converted result.
  final String padded = " " + ptbText + " ";
  final String converted = ptb2Text(padded);
  return converted.trim();
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * The words are first joined with spaces into a single String, which is
 * then handed to {@link #ptb2Text(String)}.
 *
 * @param ptbWords A list of String
 * @return A presentable version of the given PTB-tokenized words
 */
public static String ptb2Text(List<String> ptbWords) {
  final String joined = StringUtils.join(ptbWords);
  return ptb2Text(joined);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * Only the {@code word()} value of each element is used (so that additional
 * text such as POS tags does not creep in); the collected words are then
 * passed to {@link #ptb2Text(List)}.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  final List<String> tokens = new ArrayList<>(ptbWords.size());
  for (HasWord token : ptbWords) {
    final String surface = token.word();
    tokens.add(surface);
  }
  return ptb2Text(tokens);
}
/**
 * Untokenizes PTB-tokenized text and reports throughput on standard error.
 * <p>
 * With an empty {@code inputFileList}, text is read from standard input and
 * written to standard output. Otherwise each input is processed in order:
 * output goes to standard output when {@code outputFileList} is {@code null},
 * or else to the file named at the same index of {@code outputFileList}.
 * <p>
 * Standard output is flushed but never closed by this method. Closing a writer
 * that wraps {@code System.out} closes the underlying stream, so with several
 * inputs everything after the first one would be silently dropped.
 *
 * @param inputFileList  Input sources, each opened via {@code IOUtils.readerFromString};
 *                       an empty list means standard input is read instead
 * @param outputFileList Output file names parallel to {@code inputFileList},
 *                       or {@code null} to write to standard output
 * @param charset        Name of the character encoding used for reading and writing
 * @throws IOException If opening, reading or writing any stream fails
 */
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
  final long start = System.nanoTime();
  int numTokens = 0;
  int sz = inputFileList.size();
  if (sz == 0) {
    Reader r = new InputStreamReader(System.in, charset);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
    try {
      numTokens = ptb2Text(r, writer);
    } finally {
      // Push buffered text out even on failure, but leave System.out open.
      writer.flush();
    }
  } else {
    for (int j = 0; j < sz; j++) {
      try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset)) {
        if (outputFileList == null) {
          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
          try {
            numTokens += ptb2Text(r, writer);
          } finally {
            // Flush only: closing here would close System.out for the remaining inputs.
            writer.flush();
          }
        } else {
          try (BufferedWriter writer = new BufferedWriter(
                   new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset))) {
            numTokens += ptb2Text(r, writer);
          }
        }
      }
    }
  }
  final long duration = System.nanoTime() - start;
  // duration is in nanoseconds; convert to seconds for the rate.
  final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
  System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}
String sent = SentenceUtils.listToString(outputTree.yield(), false); if(ptb2text) { pw.println(PTBTokenizer.ptb2Text(sent)); } else { pw.println(sent);
/**
 * Untokenizes the given text by delegating to {@code PTBTokenizer.ptb2Text(String)}.
 *
 * @param tokenized The tokenized text
 * @return Whatever {@code PTBTokenizer.ptb2Text} returns for {@code tokenized}
 */
public String unTokenize(String tokenized) {
  final String untokenized = PTBTokenizer.ptb2Text(tokenized);
  return untokenized;
}
/**
 * Converts a single PTB token into its presentable form. For instance,
 * it transforms {@code -LRB-} into {@code (}.
 *
 * @param ptbText The PTB token to convert
 * @return The result of {@link #ptb2Text(String)} on the space-padded token,
 *         with leading and trailing whitespace trimmed
 */
public static String ptbToken2Text(String ptbText) {
  // Surround the token with one space on each side before converting it;
  // the padding is stripped again from the converted result.
  final String padded = " " + ptbText + " ";
  final String converted = ptb2Text(padded);
  return converted.trim();
}
/**
 * Converts a single PTB token into its presentable form. For instance,
 * it transforms {@code -LRB-} into {@code (}.
 *
 * @param ptbText The PTB token to convert
 * @return The result of {@link #ptb2Text(String)} on the space-padded token,
 *         with leading and trailing whitespace trimmed
 */
public static String ptbToken2Text(String ptbText) {
  // Surround the token with one space on each side before converting it;
  // the padding is stripped again from the converted result.
  final String padded = " " + ptbText + " ";
  final String converted = ptb2Text(padded);
  return converted.trim();
}
/**
 * Converts a single PTB token into its presentable form. For instance,
 * it transforms {@code -LRB-} into {@code (}.
 *
 * @param ptbText The PTB token to convert
 * @return The result of {@link #ptb2Text(String)} on the space-padded token,
 *         with leading and trailing whitespace trimmed
 */
public static String ptbToken2Text(String ptbText) {
  // Surround the token with one space on each side before converting it;
  // the padding is stripped again from the converted result.
  final String padded = " " + ptbText + " ";
  final String converted = ptb2Text(padded);
  return converted.trim();
}
/**
 * Converts a single PTB token into its presentable form. For instance,
 * it transforms {@code -LRB-} into {@code (}.
 *
 * @param ptbText The PTB token to convert
 * @return The result of {@link #ptb2Text(String)} on the space-padded token,
 *         with leading and trailing whitespace trimmed
 */
public static String ptbToken2Text(String ptbText) {
  // Surround the token with one space on each side before converting it;
  // the padding is stripped again from the converted result.
  final String padded = " " + ptbText + " ";
  final String converted = ptb2Text(padded);
  return converted.trim();
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * The words are first joined with spaces into a single String, which is
 * then handed to {@link #ptb2Text(String)}.
 *
 * @param ptbWords A list of String
 * @return A presentable version of the given PTB-tokenized words
 */
public static String ptb2Text(List<String> ptbWords) {
  final String joined = StringUtils.join(ptbWords);
  return ptb2Text(joined);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * The words are first joined with spaces into a single String, which is
 * then handed to {@link #ptb2Text(String)}.
 *
 * @param ptbWords A list of String
 * @return A presentable version of the given PTB-tokenized words
 */
public static String ptb2Text(List<String> ptbWords) {
  final String joined = StringUtils.join(ptbWords);
  return ptb2Text(joined);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * The words are first joined with spaces into a single String, which is
 * then handed to {@link #ptb2Text(String)}.
 *
 * @param ptbWords A list of String
 * @return A presentable version of the given PTB-tokenized words
 */
public static String ptb2Text(List<String> ptbWords) {
  final String joined = StringUtils.join(ptbWords);
  return ptb2Text(joined);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * The words are first joined with spaces into a single String, which is
 * then handed to {@link #ptb2Text(String)}.
 *
 * @param ptbWords A list of String
 * @return A presentable version of the given PTB-tokenized words
 */
public static String ptb2Text(List<String> ptbWords) {
  final String joined = StringUtils.join(ptbWords);
  return ptb2Text(joined);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * Only the {@code word()} value of each element is used, to prevent
 * additional text (e.g., POS tags) from creeping in. The collected words
 * are passed to {@link #ptb2Text(List)}, which joins them with spaces and
 * calls {@link #ptb2Text(String)} on the result.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  final List<String> tokens = new ArrayList<>(ptbWords.size());
  for (HasWord token : ptbWords) {
    final String surface = token.word();
    tokens.add(surface);
  }
  return ptb2Text(tokens);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * Only the {@code word()} value of each element is used (so that additional
 * text such as POS tags does not creep in); the collected words are then
 * passed to {@link #ptb2Text(List)}.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  final List<String> tokens = new ArrayList<>(ptbWords.size());
  for (HasWord token : ptbWords) {
    final String surface = token.word();
    tokens.add(surface);
  }
  return ptb2Text(tokens);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * Only the {@code word()} value of each element is used (so that additional
 * text such as POS tags does not creep in); the collected words are then
 * passed to {@link #ptb2Text(List)}.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  final List<String> tokens = new ArrayList<>(ptbWords.size());
  for (HasWord token : ptbWords) {
    final String surface = token.word();
    tokens.add(surface);
  }
  return ptb2Text(tokens);
}
/**
 * Produces a presentable version of a list of PTB-tokenized words.
 * Only the {@code word()} value of each element is used, to prevent
 * additional text (e.g., POS tags) from creeping in. The collected words
 * are passed to {@link #ptb2Text(List)}, which joins them with spaces and
 * calls {@link #ptb2Text(String)} on the result.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  final List<String> tokens = new ArrayList<>(ptbWords.size());
  for (HasWord token : ptbWords) {
    final String surface = token.word();
    tokens.add(surface);
  }
  return ptb2Text(tokens);
}
/**
 * Untokenizes PTB-tokenized text and reports throughput on standard error.
 * <p>
 * With an empty {@code inputFileList}, text is read from standard input and
 * written to standard output. Otherwise each input is processed in order:
 * output goes to standard output when {@code outputFileList} is {@code null},
 * or else to the file named at the same index of {@code outputFileList}.
 * <p>
 * Readers and file writers are closed even when untokenization throws.
 * Standard output is flushed but never closed by this method. Closing a writer
 * that wraps {@code System.out} closes the underlying stream, so with several
 * inputs everything after the first one would be silently dropped.
 *
 * @param inputFileList  Input sources, each opened via {@code IOUtils.readerFromString};
 *                       an empty list means standard input is read instead
 * @param outputFileList Output file names parallel to {@code inputFileList},
 *                       or {@code null} to write to standard output
 * @param charset        Name of the character encoding used for reading and writing
 * @throws IOException If opening, reading or writing any stream fails
 */
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
  final long start = System.nanoTime();
  int numTokens = 0;
  int sz = inputFileList.size();
  if (sz == 0) {
    Reader r = new InputStreamReader(System.in, charset);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
    try {
      numTokens = ptb2Text(r, writer);
    } finally {
      // Push buffered text out even on failure, but leave System.out open.
      writer.flush();
    }
  } else {
    for (int j = 0; j < sz; j++) {
      // try-with-resources guarantees the reader is released if anything below throws.
      try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset)) {
        if (outputFileList == null) {
          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
          try {
            numTokens += ptb2Text(r, writer);
          } finally {
            // Flush only: closing here would close System.out for the remaining inputs.
            writer.flush();
          }
        } else {
          try (BufferedWriter writer = new BufferedWriter(
                   new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset))) {
            numTokens += ptb2Text(r, writer);
          }
        }
      }
    }
  }
  final long duration = System.nanoTime() - start;
  // duration is in nanoseconds; convert to seconds for the rate.
  final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
  System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}
/**
 * Untokenizes PTB-tokenized text and reports throughput on standard error.
 * <p>
 * With an empty {@code inputFileList}, text is read from standard input and
 * written to standard output. Otherwise each input is processed in order:
 * output goes to standard output when {@code outputFileList} is {@code null},
 * or else to the file named at the same index of {@code outputFileList}.
 * <p>
 * Standard output is flushed but never closed by this method. Closing a writer
 * that wraps {@code System.out} closes the underlying stream, so with several
 * inputs everything after the first one would be silently dropped.
 *
 * @param inputFileList  Input sources, each opened via {@code IOUtils.readerFromString};
 *                       an empty list means standard input is read instead
 * @param outputFileList Output file names parallel to {@code inputFileList},
 *                       or {@code null} to write to standard output
 * @param charset        Name of the character encoding used for reading and writing
 * @throws IOException If opening, reading or writing any stream fails
 */
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
  final long start = System.nanoTime();
  int numTokens = 0;
  int sz = inputFileList.size();
  if (sz == 0) {
    Reader r = new InputStreamReader(System.in, charset);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
    try {
      numTokens = ptb2Text(r, writer);
    } finally {
      // Push buffered text out even on failure, but leave System.out open.
      writer.flush();
    }
  } else {
    for (int j = 0; j < sz; j++) {
      try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset)) {
        if (outputFileList == null) {
          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
          try {
            numTokens += ptb2Text(r, writer);
          } finally {
            // Flush only: closing here would close System.out for the remaining inputs.
            writer.flush();
          }
        } else {
          try (BufferedWriter writer = new BufferedWriter(
                   new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset))) {
            numTokens += ptb2Text(r, writer);
          }
        }
      }
    }
  }
  final long duration = System.nanoTime() - start;
  // duration is in nanoseconds; convert to seconds for the rate.
  final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
  System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}