edu.stanford.nlp.ling.BasicDocument java code examples

 bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance());
} catch (Exception e) {
 bd = new BasicDocument<>();
bd.setTitle(title());
bd.setLabels(labels());
bd.setTokenizerFactory(tokenizerFactory);

/**
 * Fills this document from an already-tokenized list of words and gives it
 * the supplied title. No original text is kept, because the document was not
 * built from raw text.
 *
 * @param words the words to append to this document
 * @param title the title handed to {@code setTitle}
 * @return this document, so the call can be chained onto a constructor
 */
public BasicDocument<L> init(List<? extends Word> words, String title) {
  this.setTitle(title);
  // a word list carries no source text, so drop any stored reference
  this.originalText = null;
  this.addAll(words);
  return this;
}

/**
 * Replaces every label currently assigned to this Document with the single
 * given label: the label list is emptied first, then the label is handed to
 * {@code addLabel}.
 *
 * @param label the only label this Document should carry afterwards
 */
public void setLabel(L label) {
  this.labels.clear();
  this.addLabel(label);
}

/**
 * Creates a new Document for the given text by delegating to the static
 * two-argument {@code BasicDocument.init} overload, passing along this
 * instance's {@code keepOriginalText} setting. Subclasses may override this
 * to extract additional information from the text and/or return another
 * document subclass with additional meta-data.
 *
 * @param text the raw text to turn into a document
 * @return the document produced by {@code BasicDocument.init}
 */
protected BasicDocument<L> parseDocumentText(String text) {
  // The previous version also allocated a throwaway BasicDocument here and
  // discarded it immediately; the static factory builds its own instance.
  return BasicDocument.init(text, keepOriginalText);
}

/**
 * Builds a new BasicDocument from the given text contents and title.
 * The text is tokenized using {@link #parse(String)} to populate the list of
 * words ("" is parsed when the text is null). When requested, a reference to
 * the original text is stored on the document; otherwise the stored original
 * text is null.
 *
 * @param text the text to tokenize; may be null
 * @param title the title handed to {@code setTitle}
 * @param keepOriginalText whether to retain a reference to {@code text}
 * @return the newly created and populated document
 */
public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) {
  final BasicDocument<L> doc = new BasicDocument<>();
  doc.setTitle(title);
  // keep the caller's text only on request
  doc.originalText = keepOriginalText ? text : null;
  // parse is never handed null; an absent text is tokenized as ""
  final String toParse = (text != null) ? text : "";
  doc.parse(toParse);
  return doc;
}

/**
 * For internal debugging purposes only.
 * Reports the title, labels, original text and words of the given
 * BasicDocument through {@code log.info}, one call per item, followed by an
 * argument-less {@code log.info()} call.
 *
 * @param bd the document whose state is reported
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  log.info("BasicDocument:");
  final String titleLine = "\tTitle: " + bd.title();
  log.info(titleLine);
  final String labelsLine = "\tLabels: " + bd.labels();
  log.info(labelsLine);
  final String originalLine = "\tOriginalText: " + bd.originalText();
  log.info(originalLine);
  final String wordsLine = "\tWords: " + bd;
  log.info(wordsLine);
  log.info();
}

/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer even if print fails,
    // so the temp file handle is never leaked
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    // NOTE(review): presumably needs network access to fetch the URL; confirm
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}

/**
 * Convenience factory equivalent to {@code init((String) null, null, true)}:
 * no text, no title, and the keep-original-text flag set.
 *
 * @return the document produced by the three-argument String overload
 */
public static <L> BasicDocument<L> init() {
  // typed as String so the String-text overload is the one selected
  final String noText = null;
  final String noTitle = null;
  return init(noText, noTitle, true);
}

/**
 * Creates a document via the no-argument constructor and then appends every
 * word of the given collection to it.
 *
 * @param d the words to add to the new document
 */
public BasicDocument(Collection<Word> d) {
  this();
  this.addAll(d);
}

 /**
  * For internal debugging purposes only. Builds a document from a small HTML
  * string, runs it through a StripTagsProcessor and then a
  * WordToSentenceProcessor, and prints the document to stdout before and
  * after each step.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
   // The previous version started by allocating a BasicDocument that was
   // discarded immediately; the static init factory builds the document used.
   Document<String, Word, Word> htmlDoc = BasicDocument.init("top text <h1>HEADING text</h1> this is <p>new paragraph<br>next line<br/>xhtml break etc.");
   System.out.println("Before:");
   System.out.println(htmlDoc);
   Document<String, Word, Word> txtDoc = new StripTagsProcessor<String, Word>(true).processDocument(htmlDoc);
   System.out.println("After:");
   System.out.println(txtDoc);
   Document<String, Word, List<Word>> sentences = new WordToSentenceProcessor<Word>().processDocument(txtDoc);
   System.out.println("Sentences:");
   System.out.println(sentences);
 }
}

/**
 * For internal debugging purposes only.
 * Writes the title, labels, original text and words of the given
 * BasicDocument to stderr, one line each, followed by a blank line.
 *
 * @param bd the document whose state is printed
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  System.err.println("BasicDocument:");
  final String titleLine = "\tTitle: " + bd.title();
  System.err.println(titleLine);
  final String labelsLine = "\tLabels: " + bd.labels();
  System.err.println(labelsLine);
  final String originalLine = "\tOriginalText: " + bd.originalText();
  System.err.println(originalLine);
  final String wordsLine = "\tWords: " + bd;
  System.err.println(wordsLine);
  System.err.println();
}

/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer even if print fails,
    // so the temp file handle is never leaked
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    // NOTE(review): presumably needs network access to fetch the URL; confirm
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}

/**
 * Builds a new BasicDocument from the given text contents and title.
 * The text is tokenized using {@link #parse(String)} to populate the list of
 * words ("" is parsed when the text is null). When requested, a reference to
 * the original text is stored on the document; otherwise the stored original
 * text is null.
 *
 * @param text the text to tokenize; may be null
 * @param title the title handed to {@code setTitle}
 * @param keepOriginalText whether to retain a reference to {@code text}
 * @return the newly created and populated document
 */
public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) {
  final BasicDocument<L> doc = new BasicDocument<L>();
  doc.setTitle(title);
  // keep the caller's text only on request
  doc.originalText = keepOriginalText ? text : null;
  // parse is never handed null; an absent text is tokenized as ""
  final String toParse = (text != null) ? text : "";
  doc.parse(toParse);
  return doc;
}

/**
 * Convenience overload equivalent to {@code init(words, null)}: initializes
 * this document from the given words without supplying a title.
 *
 * @param words the words to append to this document
 * @return whatever the two-argument overload returns
 */
public BasicDocument<L> init(List<? extends Word> words) {
  final String noTitle = null;
  return init(words, noTitle);
}

/**
 * Tokenizes the given text and appends the resulting words to this Document.
 * This implementation obtains a tokenizer from the current tokenizerFactory
 * over a reader on the text and adds everything it produces. Subclasses
 * should override this method to parse documents in non-standard formats,
 * and/or to pull the title of the document from the text; they may do their
 * own processing and then call super.parse.
 *
 * @param text the text to tokenize; may be empty ("") but must not be null
 * @see #setTokenizerFactory
 */
protected void parse(String text) {
  final StringReader source = new StringReader(text);
  final Tokenizer<Word> tokenizer = tokenizerFactory.getTokenizer(source);
  this.addAll(tokenizer.tokenize());
}

 bd = ErasureUtils.<BasicDocument<L>>uncheckedCast(getClass().newInstance());
} catch (Exception e) {
 bd = new BasicDocument<>();
bd.setTitle(title());
bd.setLabels(labels());
bd.setTokenizerFactory(tokenizerFactory);

Document<HasWord, Word, Word> d;
if (filename.startsWith("http://")) {
 Document<HasWord, Word, Word> dpre = new BasicDocument<HasWord>().init(new URL(filename));
 DocumentProcessor<Word, Word, HasWord, Word> notags = new StripTagsProcessor<>();
 d = notags.processDocument(dpre);
} else {
 d = new BasicDocument<HasWord>().init(new File(filename));

/**
 * For internal debugging purposes only.
 * Writes the title, labels, original text and words of the given
 * BasicDocument to stderr, one line each, followed by a blank line.
 *
 * @param bd the document whose state is printed
 */
public static <L> void printState(BasicDocument<L> bd) throws Exception {
  System.err.println("BasicDocument:");
  final String titleLine = "\tTitle: " + bd.title();
  System.err.println(titleLine);
  final String labelsLine = "\tLabels: " + bd.labels();
  System.err.println(labelsLine);
  final String originalLine = "\tOriginalText: " + bd.originalText();
  System.err.println(originalLine);
  final String wordsLine = "\tWords: " + bd;
  System.err.println(wordsLine);
  System.err.println();
}

/**
 * For internal debugging purposes only. Creates BasicDocument instances from
 * a String, a Reader, a temporary File and a URL, and prints the state of
 * each one via {@code printState}. Any exception is caught and its stack
 * trace printed.
 *
 * @param args ignored
 */
public static void main(String[] args) {
  try {
    printState(BasicDocument.init("this is the text", "this is the title [String]", true));
    printState(BasicDocument.init(new StringReader("this is the text"), "this is the title [Reader]", true));
    File f = File.createTempFile("BasicDocumentTestFile", null);
    f.deleteOnExit();
    // try-with-resources flushes and closes the writer even if print fails,
    // so the temp file handle is never leaked
    try (PrintWriter out = new PrintWriter(new FileWriter(f))) {
      out.print("this is the text");
    }
    printState(new BasicDocument<String>().init(f, "this is the title [File]", true));
    // NOTE(review): presumably needs network access to fetch the URL; confirm
    printState(new BasicDocument<String>().init(new URL("http://www.stanford.edu/~jsmarr/BasicDocumentTestFile.txt"), "this is the title [URL]", true));
  } catch (Exception e) {
    e.printStackTrace();
  }
}

/**
 * Builds a new BasicDocument from the given text contents and title.
 * The text is tokenized using {@link #parse(String)} to populate the list of
 * words ("" is parsed when the text is null). When requested, a reference to
 * the original text is stored on the document; otherwise the stored original
 * text is null.
 *
 * @param text the text to tokenize; may be null
 * @param title the title handed to {@code setTitle}
 * @param keepOriginalText whether to retain a reference to {@code text}
 * @return the newly created and populated document
 */
public static <L> BasicDocument<L> init(String text, String title, boolean keepOriginalText) {
  final BasicDocument<L> doc = new BasicDocument<>();
  doc.setTitle(title);
  // keep the caller's text only on request
  doc.originalText = keepOriginalText ? text : null;
  // parse is never handed null; an absent text is tokenized as ""
  final String toParse = (text != null) ? text : "";
  doc.parse(toParse);
  return doc;
}

Javadoc

Basic implementation of Document that should be suitable for most needs. BasicDocument is an ArrayList for storing words and performs tokenization during construction. Override #parse(String) to provide support for custom document formats or to do a custom job of tokenization. BasicDocument should only be used for documents that are small enough to store in memory. The easiest way to use BasicDocuments is to construct them and call an init method in the same line (we use init methods instead of constructors because they're inherited and allow subclasses to have other more specific constructors). For example, to read in a File named file and tokenize it, you can call:

Document doc=new BasicDocument().init(file);

Most used methods

<init>
addAll
addLabel
Adds the given label to the List of labels for this Document if it is not null.
init
Initializes a new BasicDocument with the given list of words and title.
labels
Returns the complete List of labels for this Document. This is an empty collection if none have been
originalText
Returns the text originally used to construct this document, or null if there was no original text.
parse
Tokenizes the given text to populate the list of words this Document represents. The default impleme
printState
For internal debugging purposes only. Prints the state of the given BasicDocument to stderr.
setLabels
Removes all currently assigned labels for this Document then adds all of the given labels.
setTitle
Sets the title of this Document to the given title. If the given title is null, sets the title to ""
setTokenizerFactory
Sets the tokenizerFactory to be used by #parse(String). Set this tokenizer before calling one of the
title
Returns the title of this document. The title may be empty ("") but will never be null.

Popular in Java

Making http requests using okhttp
getSystemService (Context)
setRequestProperty (URLConnection)
setContentView (Activity)
Thread (java.lang)
A thread is a thread of execution in a program. The Java Virtual Machine allows an application to ha
HttpURLConnection (java.net)
An URLConnection for HTTP (RFC 2616 [http://tools.ietf.org/html/rfc2616]) used to send and receive d
Collection (java.util)
Collection is the root of the collection hierarchy. It defines operations on data collections and t
SortedMap (java.util)
A map that has its keys ordered. The sorting is according to either the natural ordering of its keys
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
HttpServlet (javax.servlet.http)
Provides an abstract class to be subclassed to create an HTTP servlet suitable for a Web site. A sub
Top Vim plugins

How to use BasicDocument in edu.stanford.nlp.ling

Best Java code snippets using edu.stanford.nlp.ling.BasicDocument (Showing top 20 results out of 315)

How to use
BasicDocument
in
edu.stanford.nlp.ling