bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance()); } catch (Exception e) { bd = new BasicDocument<>(); bd.setTitle(title()); bd.setLabels(labels()); bd.setTokenizerFactory(tokenizerFactory);
/** * Initializes a new BasicDocument with the given list of words and title. */ public BasicDocument<L> init(List<? extends Word> words, String title) { // initializes the List of labels and sets the title setTitle(title); // no original text originalText = null; // adds all of the given words to the list maintained by this document addAll(words); return (this); }
/**
 * Replaces every label currently assigned to this Document with the single
 * given label: the label list is cleared first, then the label is passed to
 * {@code addLabel}.
 * Calling {@code setLabel(null)} is documented to clear all labels, which
 * relies on {@code addLabel} ignoring a null label.
 *
 * @param label the only label this document should carry afterwards
 */
public void setLabel(L label) {
  // drop whatever was assigned before ...
  this.labels.clear();
  // ... then register the replacement
  this.addLabel(label);
}
/**
 * Creates a new Document for the given text. The default implementation
 * delegates to the static {@code BasicDocument.init(text, keepOriginalText)}
 * factory and returns the document it builds. Subclasses may wish to extract
 * additional information from the text and/or return another document
 * subclass with additional meta-data.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // The factory creates the instance itself; a separately allocated
  // BasicDocument would simply be discarded, so none is created here.
  return BasicDocument.init(text, keepOriginalText);
}
/** * Inits a new BasicDocument with the given text contents and title. * The text is tokenized using {@link #parse(String)} to populate the list of words * ("" is used if text is null). If specified, a reference to the * original text is also maintained so that the text() method returns the * text given to this constructor. Returns a reference to this * BasicDocument * for convenience (so it's more like a constructor, but inherited). */ public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) { BasicDocument<L> basicDocument = new BasicDocument<>(); // initializes the List of labels and sets the title basicDocument.setTitle(title); // stores the original text as specified if (keepOriginalText) { basicDocument.originalText = text; } else { basicDocument.originalText = null; } // populates the words by parsing the text basicDocument.parse(text == null ? "" : text); return basicDocument; }
/**
 * For internal debugging purposes only.
 * Writes the state of the given BasicDocument (title, labels, original text
 * and words) through {@code log.info}, followed by an empty log call.
 *
 * @param bd the document whose state is reported
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  log.info("BasicDocument:");

  // each line is built immediately before it is logged, so the accessors
  // are invoked in the same order as the output appears
  final String titleLine = "\tTitle: " + bd.title();
  log.info(titleLine);

  final String labelsLine = "\tLabels: " + bd.labels();
  log.info(labelsLine);

  final String originalTextLine = "\tOriginalText: " + bd.originalText();
  log.info(originalTextLine);

  final String wordsLine = "\tWords: " + bd;
  log.info(wordsLine);

  log.info();
}
/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path,
    // replacing the manual flush()/close() pair
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));

    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Convenience factory taking no arguments: delegates to
 * {@code init((String) null, null, true)}, i.e. null text, null title and
 * keepOriginalText set to true.
 *
 * @return the document produced by the three-argument String overload
 */
public static <L> BasicDocument<L> init() {
  // a String-typed null selects the String overload of init
  final String noText = null;
  return init(noText, null, true);
}
/**
 * Creates a BasicDocument via the no-argument constructor and then appends
 * every element of the given collection to it.
 * The parameter accepts any collection whose elements are Words or a subtype
 * of Word, matching the {@code addAll} call it feeds.
 *
 * @param d the words to add to the new document
 */
public BasicDocument(Collection<? extends Word> d) {
  this();
  addAll(d);
}
/**
 * For internal debugging purposes only. Builds a document from a short
 * HTML-like string, runs it through a {@code StripTagsProcessor} and then a
 * {@code WordToSentenceProcessor}, printing the document to stdout before
 * and after each step.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  // BasicDocument.init creates the document itself; no separate instance is needed
  Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
  System.out.println("Before:");
  System.out.println(htmlDoc);

  Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
  System.out.println("After:");
  System.out.println(txtDoc);

  Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
  System.out.println("Sentences:");
  System.out.println(sentences);
}
}
/**
 * For internal debugging purposes only.
 * Dumps the title, labels, original text and words of the given
 * BasicDocument to stderr, followed by a blank line.
 *
 * @param bd the document whose state is printed
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  System.err.println("BasicDocument:");

  // each line is assembled right before it is printed, so the accessors are
  // called in the same order as the output appears
  final String titleLine = "\tTitle: " + bd.title();
  System.err.println(titleLine);

  final String labelsLine = "\tLabels: " + bd.labels();
  System.err.println(labelsLine);

  final String originalTextLine = "\tOriginalText: " + bd.originalText();
  System.err.println(originalTextLine);

  final String wordsLine = "\tWords: " + bd;
  System.err.println(wordsLine);

  System.err.println();
}
/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path,
    // replacing the manual flush()/close() pair
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));

    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/** * Inits a new BasicDocument with the given text contents and title. * The text is tokenized using {@link #parse(String)} to populate the list of words * ("" is used if text is null). If specified, a reference to the * original text is also maintained so that the text() method returns the * text given to this constructor. Returns a reference to this * BasicDocument * for convenience (so it's more like a constructor, but inherited). */ public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) { BasicDocument<L> basicDocument = new BasicDocument<L>(); // initializes the List of labels and sets the title basicDocument.setTitle(title); // stores the original text as specified if (keepOriginalText) { basicDocument.originalText = text; } else { basicDocument.originalText = null; } // populates the words by parsing the text basicDocument.parse(text == null ? "" : text); return basicDocument; }
/**
 * Populates this document from the given words without a title; simply
 * forwards to {@code init(words, null)}.
 *
 * @param words the words to add to this document
 * @return whatever the two-argument overload returns
 */
public BasicDocument<L> init(List<? extends Word> words) {
  // null stands for "no title"
  return this.init(words, null);
}
/**
 * Tokenizes the given text to populate the list of words this Document
 * represents. This default implementation wraps the whole text in a
 * StringReader, obtains a tokenizer for it from the current tokenizer
 * factory and adds every token it produces. Subclasses should override this
 * method to parse documents in non-standard formats, and/or to pull the
 * title of the document from the text; they may do additional processing
 * and then just call super.parse. The given text may be empty ("") but is
 * expected never to be null.
 *
 * @param text the text to tokenize
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  final StringReader source = new StringReader(text);
  final Tokenizer<Word> tokenizer = tokenizerFactory.getTokenizer(source);
  // append all tokens of the text to this document
  addAll(tokenizer.tokenize());
}
bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance()); } catch (Exception e) { bd = new BasicDocument<>(); bd.setTitle(title()); bd.setLabels(labels()); bd.setTokenizerFactory(tokenizerFactory);
Document<HasWord, Word, Word> d; if (filename.startsWith("http://")) { Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename)); DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>(); d = notags.processDocument(dpre); } else { d = new BasicDocument<HasWord>().init(new File(filename));
/**
 * For internal debugging purposes only.
 * Writes a short report on the given BasicDocument to stderr: a header line,
 * then its title, labels, original text and words, then a blank line.
 *
 * @param bd the document to report on
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  System.err.println("BasicDocument:");

  // every field is read just before its line is emitted, keeping the order
  // of accessor calls and output interleaved
  final String title = "\tTitle: " + bd.title();
  System.err.println(title);

  final String labels = "\tLabels: " + bd.labels();
  System.err.println(labels);

  final String original = "\tOriginalText: " + bd.originalText();
  System.err.println(original);

  final String words = "\tWords: " + bd;
  System.err.println(words);

  System.err.println();
}
/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));

    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer on every path,
    // replacing the manual flush()/close() pair
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));

    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/** * Inits a new BasicDocument with the given text contents and title. * The text is tokenized using {@link #parse(String)} to populate the list of words * ("" is used if text is null). If specified, a reference to the * original text is also maintained so that the text() method returns the * text given to this constructor. Returns a reference to this * BasicDocument * for convenience (so it's more like a constructor, but inherited). */ public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) { BasicDocument<L> basicDocument = new BasicDocument<>(); // initializes the List of labels and sets the title basicDocument.setTitle(title); // stores the original text as specified if (keepOriginalText) { basicDocument.originalText = text; } else { basicDocument.originalText = null; } // populates the words by parsing the text basicDocument.parse(text == null ? "" : text); return basicDocument; }