/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/** Verifies nested-tag XML output under HierarchicalTokenizationFilter:
    labels such as "ANIMAL|MAMMAL" render as nested elements
    (&lt;ANIMAL&gt;&lt;MAMMAL&gt;...), and the Pattern-accepting constructor
    drops hierarchy levels whose names match the pattern. */
public void testNestedXMLTokenizationFilter ()
{
  LabelAlphabet dict = new LabelAlphabet ();
  String document = "the quick brown fox leapt over the lazy dog";
  StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ());
  // "|" separates hierarchy levels within a single label name.
  Label O = dict.lookupLabel ("O");
  Label ANML = dict.lookupLabel ("ANIMAL");
  Label ANML_MAMM = dict.lookupLabel ("ANIMAL|MAMMAL");
  Label VB = dict.lookupLabel ("VERB");
  Label ANML_JJ = dict.lookupLabel ("ANIMAL|ADJ");
  Label ANML_JJ_MAMM = dict.lookupLabel ("ANIMAL|ADJ|MAMMAL");
  // One tag per token of the nine-word document above.
  LabelSequence tags = new LabelSequence (new Label[] { O, ANML, ANML, ANML_MAMM, VB, O, ANML, ANML_JJ, ANML_JJ_MAMM });
  DocumentExtraction extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter ());
  String actualXml = extr.toXmlString();
  String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
  // Test the ignore function: hierarchy levels matching "AD.*" (here ADJ)
  // are omitted from the emitted element nesting.
  extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*")));
  actualXml = extr.toXmlString();
  expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
}
/** Checks flat XML rendering of a simple labeling: consecutive tokens with
    the same label merge into one element, and background ("O") tokens
    appear as bare text. */
public void testToXml ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  // One label per token of the nine-word document.
  LabelSequence labeling = new LabelSequence (new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, "O");
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks flat XML rendering of a simple labeling: consecutive tokens with
    the same label merge into one element, and background ("O") tokens
    appear as bare text. */
public void testToXml ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  // One label per token of the nine-word document.
  LabelSequence labeling = new LabelSequence (new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, "O");
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks BIO decoding through BIOTokenizationFilter: a "B-" label starts a
    new span even immediately after a span of the same type, while an "I-"
    label continues the current span. */
public void testToXmlBIO ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label beginVerb = alphabet.lookupLabel ("B-VERB");
  Label insideVerb = alphabet.lookupLabel ("I-VERB");
  // One label per token; the second B-ANIMAL forces a span break at "fox".
  LabelSequence labeling = new LabelSequence (new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, null, "O", new BIOTokenizationFilter ());
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks BIO decoding through BIOTokenizationFilter: a "B-" label starts a
    new span even immediately after a span of the same type, while an "I-"
    label continues the current span. */
public void testToXmlBIO ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label beginVerb = alphabet.lookupLabel ("B-VERB");
  Label insideVerb = alphabet.lookupLabel ("I-VERB");
  // One label per token; the second B-ANIMAL forces a span break at "fox".
  LabelSequence labeling = new LabelSequence (new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, null, "O", new BIOTokenizationFilter ());
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}