cc.mallet.extract.DocumentExtraction java code examples

/**
 * Creates an extraction from a sequence output by some kind of per-sequence labeler,
 *  such as an HMM or a CRF.  The extraction will contain a single document.
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
 this.dict = dict;
 this.extractor = extractor;
 // Wrap the labeled sequence as a document extraction and register it.
 addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}

/**
 * Writes an HTML comparison of two extractions for the documents with indices in
 * [start, end), framed by writeHeader / writeFooter.  A document is rendered only when
 * predictionsMatch reports that the two predicted label sequences differ.  Documents whose
 * text differs between the two extractions are reported on stderr and skipped.
 */
private static void writeDualExtractions (PrintWriter out, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2,
                     int start, int end, boolean showLattice)
{
 writeHeader (out);
 for (int docIdx = start; docIdx < end; docIdx++) {
  DocumentExtraction first = e1.getDocumentExtraction (docIdx);
  DocumentExtraction second = e2.getDocumentExtraction (docIdx);
  String desc = first.getName();
  String firstText = ((CharSequence) first.getDocument ()).toString();
  String secondText = ((CharSequence) second.getDocument ()).toString();
  if (!firstText.equals (secondText)) {
   System.err.println ("Skipping document "+docIdx+": Extractions don't match");
   continue;
  }
  // Nothing to show when both extractors predicted the same labels.
  if (predictionsMatch (first.getPredictedLabels (), second.getPredictedLabels ())) {
   continue;
  }
  ExtorInfo info1 = infoForDoc (firstText, "CRF1::"+desc, "C1I"+docIdx, first, extor1, showLattice);
  ExtorInfo info2 = infoForDoc (firstText, "CRF2::"+desc, "C2I"+docIdx, second, extor2, showLattice);
  if (!showLattice) {
   // add links from errors.html --> lattice.html
   info1.link = info2.link = computeLatticeFname (docIdx);
  }
  dualLattice2html (out, desc, info1, info2);
 }
 writeFooter (out);
}

/**
 * Adds a document extraction to this object.  The document is appended to byDocs and a
 * Record of its extracted spans is appended to records.  When the document has target
 * spans, a Record of them, named "TRUE:" plus the document name, is appended to trueRecords.
 */
public void addDocumentExtraction (DocumentExtraction docseq)
{
 byDocs.add (docseq);
 String docName = docseq.getName ();
 records.add (new Record (docName, docseq.getExtractedSpans ()));
 if (docseq.getTargetSpans () == null) {
  return;
 }
 trueRecords.add (new Record ("TRUE:"+docName, docseq.getTargetSpans ()));
}

/**
 * Wraps the labeled sequence in a DocumentExtraction named "Extraction", passes it to
 * the confidence estimator, and returns the extraction's spans.
 * The document argument is not read by this implementation.
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag,
                      Tokenization input, Sequence seq)
{
 String background = backgroundTag.toString();
 // The fifth constructor argument is deliberately passed as null, as in the original call.
 DocumentExtraction labeled = new DocumentExtraction("Extraction", dict, input, seq, null, background);
 confidenceEstimator.estimateConfidence(labeled);
 return labeled.getExtractedSpans();
}

 /**
  * Collects a document's input tokens, predicted labels and target labels into an ExtorInfo.
  * When showLattice is set, the input is also run through the extractor's feature pipe and
  * the info additionally receives the piped feature vectors, a MaxLatticeDefault built from
  * the extractor's CRF, and that lattice's best output sequence.
  * The doc argument is not read by this method.
  */
 private static ExtorInfo infoForDoc (String doc, String desc, String idx, DocumentExtraction docextr,
                     CRFExtractor extor, boolean showLattice)
 {
  TokenSequence tokens = (TokenSequence) docextr.getInput ();
  LabelSequence gold = docextr.getTarget ();
  Sequence guess = docextr.getPredictedLabels ();
  ExtorInfo info = new ExtorInfo (tokens, guess, gold, desc, idx);

  if (!showLattice) {
   return info;
  }

  CRF crf = extor.getCrf();
  // xxx perhaps piping the input and building the lattice could be a transducer method???
  Instance carrier = extor.getFeaturePipe().pipe(new Instance (tokens, null, null, null));
  Object piped = carrier.getData ();
  info.fvs = (FeatureVectorSequence) piped;
  info.lattice = new MaxLatticeDefault (crf, (Sequence) piped, null);
  info.bestStates = info.lattice.bestOutputSequence();
  return info;
 }

/**
 * Sets a confidence on every non-background span extracted from the given document.
 * The document's input is piped through this.featurePipe, a SumLatticeDefault is built over
 * the piped input, and each span's confidence comes from
 * confidenceEstimator.estimateConfidenceFor on a Segment covering that span's boundaries.
 */
public void estimateConfidence (DocumentExtraction documentExtraction) {
 Tokenization tokens = documentExtraction.getInput();
 // WARNING: the input Tokenization will likely already carry many
 // features appended the last time it went through a featurePipe.
 // To avoid recomputing features redundantly, the caller may want
 // to set this.featurePipe = TokenSequence2FeatureVectorSequence
 Instance carrier = this.featurePipe.pipe(new Instance(tokens, null, null, null));
 Sequence features = (Sequence) carrier.getData();
 Sequence predicted = documentExtraction.getPredictedLabels();
 LabeledSpans spans = documentExtraction.getExtractedSpans();
 SumLatticeDefault lattice = new SumLatticeDefault (this.confidenceEstimator.getTransducer(), features);
 for (int s = 0; s < spans.size(); s++) {
  LabeledSpan span = spans.getLabeledSpan(s);
  if (!span.isBackground()) {
   int[] bounds = getSegmentBoundaries(tokens, span);
   // The predicted labels are passed for both label-sequence arguments of Segment.
   Segment segment = new Segment(features, predicted, predicted, bounds[0], bounds[1], null, null);
   span.setConfidence(confidenceEstimator.estimateConfidenceFor(segment, lattice));
  }
 }
}

/**
 * Checks toXmlString on a nine-token sentence tagged with O / ANIMAL / VERB labels,
 * using "O" as the background tag, against a fixed expected XML string.
 */
public void testToXml () {
 String sentence = "the quick brown fox leapt over the lazy dog";
 StringTokenization tokens = new StringTokenization (sentence, new CharSequenceLexer ());

 LabelAlphabet labels = new LabelAlphabet ();
 Label O = labels.lookupLabel ("O");
 Label ANML = labels.lookupLabel ("ANIMAL");
 Label VB = labels.lookupLabel ("VERB");
 Label[] perToken = new Label[] { O, ANML, ANML, ANML, VB, O, O, ANML, ANML };

 DocumentExtraction extr = new DocumentExtraction ("Test", labels, tokens, new LabelSequence (perToken), "O");

 String xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
 String xmlBody = "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
 assertEquals (xmlDeclaration + xmlBody, extr.toXmlString());
}

/**
 * Writes every document of the extraction as HTML via lattice2html, framed by
 * writeHeader / writeFooter.  When showLattice is false, each document's info
 * gets its link set to "lattice.html".
 */
public static void extraction2html (Extraction extraction, CRFExtractor extor, PrintWriter out, boolean showLattice)
{
 writeHeader (out);
 for (int n = 0; n < extraction.getNumDocuments (); n++) {
  DocumentExtraction current = extraction.getDocumentExtraction (n);
  String desc = current.getName();
  CharSequence text = (CharSequence) current.getDocument ();
  ExtorInfo info = infoForDoc (text.toString(), desc, "N"+n, current, extor, showLattice);
  if (!showLattice) {
   info.link = "lattice.html";
  }
  lattice2html (out, info);
 }
 writeFooter (out);
}

/**
 * Steps through the extracted and target spans of a document together.  At each step the
 * current extracted span and current target span are intersected with each other and both
 * results are recorded; then whichever span ends first (both, on a tie) is advanced.
 * Stops as soon as either list is exhausted.
 */
private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
{
 LabeledSpans gold = docExtr.getTargetSpans ();
 LabeledSpans guessed = docExtr.getExtractedSpans ();
 LabeledSpans guessedOut = new LabeledSpans (guessed.getDocument ());
 LabeledSpans goldOut = new LabeledSpans (guessed.getDocument ());

 int p = 0;
 int t = 0;
 while (p < guessed.size() && t < gold.size ()) {
  LabeledSpan guessedSpan = guessed.getLabeledSpan (p);
  LabeledSpan goldSpan = gold.getLabeledSpan (t);

  LabeledSpan guessedPart = (LabeledSpan) guessedSpan.intersection (goldSpan);
  LabeledSpan goldPart = (LabeledSpan) goldSpan.intersection (guessedSpan);
  guessedOut.add (guessedPart);
  goldOut.add (goldPart);

  // Decide both advances before moving either index.
  boolean advanceGuessed = guessedSpan.getEndIdx () <= goldSpan.getEndIdx ();
  boolean advanceGold = goldSpan.getEndIdx () <= guessedSpan.getEndIdx ();
  if (advanceGuessed) {
   p++;
  }
  if (advanceGold) {
   t++;
  }
 }
 assert (guessedOut.size() == goldOut.size());
 return new DualLabeledSpans (guessedOut, goldOut);
}

// Writes the opening of one document's HTML page: the title (built from the document
// name), the stylesheet link, and the class and right/wrong legends.
// NOTE(review): this snippet is truncated -- the method's braces and the rest of its body are not shown.
private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr)
 String name = docExtr.getName ();
 out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>");
 out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />");
 out.println ("</HEAD><BODY>");
 // The label alphabet is taken from the first extracted span.
 // NOTE(review): presumably fails when the document has no extracted spans -- confirm.
 outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ());
 outputRightWrongLegend (out);

/**
 * Convenience overload: delegates to the two-argument toXmlDocument with "doc" and
 * Namespace.NO_NAMESPACE.
 */
public Document toXmlDocument ()
{
 final String rootName = "doc";
 return toXmlDocument (rootName, Namespace.NO_NAMESPACE);
}

 /**
  * Writes index.html into the given directory: an HTML ordered list with one entry per
  * document in the extraction.  Entry i links to {@code extraction<i>.html} and is labeled
  * with the document's name.
  *
  * @param directory  directory in which index.html is created
  * @param extraction extraction whose documents are listed
  * @throws IOException if index.html cannot be opened for writing
  */
 private static void outputIndex (File directory, Extraction extraction) throws IOException
 {
  PrintWriter out = new PrintWriter (new FileWriter (new File (directory, "index.html")));
  try {
   out.println ("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>");
   for (int i = 0; i < extraction.getNumDocuments(); i++) {
    String name = extraction.getDocumentExtraction (i).getName ();
    out.println ("  <LI><A HREF=\"extraction"+i+".html\">"+name+"</A></LI>");
   }
   out.println ("</OL></BODY></HTML>");
  } finally {
   // Close the writer even if a document lookup throws, so the file handle is not leaked.
   out.close ();
  }
 }
}

return new Document (generateElement (rootEltName, wholeDoc, roots, children));

 /**
  * Collects a document's input tokens, predicted labels and target labels into an ExtorInfo.
  * When showLattice is set, the input is also run through the extractor's feature pipe and
  * the info additionally receives the piped feature vectors, a MaxLatticeDefault built from
  * the extractor's CRF, and that lattice's best output sequence.
  * The doc argument is not read by this method.
  */
 private static ExtorInfo infoForDoc (String doc, String desc, String idx, DocumentExtraction docextr,
                     CRFExtractor extor, boolean showLattice)
 {
  TokenSequence tokens = (TokenSequence) docextr.getInput ();
  LabelSequence gold = docextr.getTarget ();
  Sequence guess = docextr.getPredictedLabels ();
  ExtorInfo info = new ExtorInfo (tokens, guess, gold, desc, idx);

  if (!showLattice) {
   return info;
  }

  CRF crf = extor.getCrf();
  // xxx perhaps piping the input and building the lattice could be a transducer method???
  Instance carrier = extor.getFeaturePipe().pipe(new Instance (tokens, null, null, null));
  Object piped = carrier.getData ();
  info.fvs = (FeatureVectorSequence) piped;
  info.lattice = new MaxLatticeDefault (crf, (Sequence) piped, null);
  info.bestStates = info.lattice.bestOutputSequence();
  return info;
 }

/**
 * Sets a confidence on every non-background span extracted from the given document.
 * The document's input is piped through this.featurePipe, a SumLatticeDefault is built over
 * the piped input, and each span's confidence comes from
 * confidenceEstimator.estimateConfidenceFor on a Segment covering that span's boundaries.
 */
public void estimateConfidence (DocumentExtraction documentExtraction) {
 Tokenization tokens = documentExtraction.getInput();
 // WARNING: the input Tokenization will likely already carry many
 // features appended the last time it went through a featurePipe.
 // To avoid recomputing features redundantly, the caller may want
 // to set this.featurePipe = TokenSequence2FeatureVectorSequence
 Instance carrier = this.featurePipe.pipe(new Instance(tokens, null, null, null));
 Sequence features = (Sequence) carrier.getData();
 Sequence predicted = documentExtraction.getPredictedLabels();
 LabeledSpans spans = documentExtraction.getExtractedSpans();
 SumLatticeDefault lattice = new SumLatticeDefault (this.confidenceEstimator.getTransducer(), features);
 for (int s = 0; s < spans.size(); s++) {
  LabeledSpan span = spans.getLabeledSpan(s);
  if (!span.isBackground()) {
   int[] bounds = getSegmentBoundaries(tokens, span);
   // The predicted labels are passed for both label-sequence arguments of Segment.
   Segment segment = new Segment(features, predicted, predicted, bounds[0], bounds[1], null, null);
   span.setConfidence(confidenceEstimator.estimateConfidenceFor(segment, lattice));
  }
 }
}

/**
 * Checks toXmlString on a nine-token sentence tagged with O / ANIMAL / VERB labels,
 * using "O" as the background tag, against a fixed expected XML string.
 */
public void testToXml () {
 String sentence = "the quick brown fox leapt over the lazy dog";
 StringTokenization tokens = new StringTokenization (sentence, new CharSequenceLexer ());

 LabelAlphabet labels = new LabelAlphabet ();
 Label O = labels.lookupLabel ("O");
 Label ANML = labels.lookupLabel ("ANIMAL");
 Label VB = labels.lookupLabel ("VERB");
 Label[] perToken = new Label[] { O, ANML, ANML, ANML, VB, O, O, ANML, ANML };

 DocumentExtraction extr = new DocumentExtraction ("Test", labels, tokens, new LabelSequence (perToken), "O");

 String xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n";
 String xmlBody = "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
 assertEquals (xmlDeclaration + xmlBody, extr.toXmlString());
}

/**
 * Wraps the labeled sequence in a DocumentExtraction named "Extraction", passes it to
 * the confidence estimator, and returns the extraction's spans.
 * The document argument is not read by this implementation.
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag,
                      Tokenization input, Sequence seq)
{
 String background = backgroundTag.toString();
 // The fifth constructor argument is deliberately passed as null, as in the original call.
 DocumentExtraction labeled = new DocumentExtraction("Extraction", dict, input, seq, null, background);
 confidenceEstimator.estimateConfidence(labeled);
 return labeled.getExtractedSpans();
}

/**
 * Writes every document of the extraction as HTML via lattice2html, framed by
 * writeHeader / writeFooter.  When showLattice is false, each document's info
 * gets its link set to "lattice.html".
 */
public static void extraction2html (Extraction extraction, CRFExtractor extor, PrintWriter out, boolean showLattice)
{
 writeHeader (out);
 for (int n = 0; n < extraction.getNumDocuments (); n++) {
  DocumentExtraction current = extraction.getDocumentExtraction (n);
  String desc = current.getName();
  CharSequence text = (CharSequence) current.getDocument ();
  ExtorInfo info = infoForDoc (text.toString(), desc, "N"+n, current, extor, showLattice);
  if (!showLattice) {
   info.link = "lattice.html";
  }
  lattice2html (out, info);
 }
 writeFooter (out);
}

/**
 * Steps through the extracted and target spans of a document together.  At each step the
 * current extracted span and current target span are intersected with each other and both
 * results are recorded; then whichever span ends first (both, on a tie) is advanced.
 * Stops as soon as either list is exhausted.
 */
private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
{
 LabeledSpans gold = docExtr.getTargetSpans ();
 LabeledSpans guessed = docExtr.getExtractedSpans ();
 LabeledSpans guessedOut = new LabeledSpans (guessed.getDocument ());
 LabeledSpans goldOut = new LabeledSpans (guessed.getDocument ());

 int p = 0;
 int t = 0;
 while (p < guessed.size() && t < gold.size ()) {
  LabeledSpan guessedSpan = guessed.getLabeledSpan (p);
  LabeledSpan goldSpan = gold.getLabeledSpan (t);

  LabeledSpan guessedPart = (LabeledSpan) guessedSpan.intersection (goldSpan);
  LabeledSpan goldPart = (LabeledSpan) goldSpan.intersection (guessedSpan);
  guessedOut.add (guessedPart);
  goldOut.add (goldPart);

  // Decide both advances before moving either index.
  boolean advanceGuessed = guessedSpan.getEndIdx () <= goldSpan.getEndIdx ();
  boolean advanceGold = goldSpan.getEndIdx () <= guessedSpan.getEndIdx ();
  if (advanceGuessed) {
   p++;
  }
  if (advanceGold) {
   t++;
  }
 }
 assert (guessedOut.size() == goldOut.size());
 return new DualLabeledSpans (guessedOut, goldOut);
}

// Writes the opening of one document's HTML page: the title (built from the document
// name), the stylesheet link, and the class and right/wrong legends.
// NOTE(review): this snippet is truncated -- the method's braces and the rest of its body are not shown.
private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr)
 String name = docExtr.getName ();
 out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>");
 out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />");
 out.println ("</HEAD><BODY>");
 // The label alphabet is taken from the first extracted span.
 // NOTE(review): presumably fails when the document has no extracted spans -- confirm.
 outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ());
 outputRightWrongLegend (out);

Javadoc

Created: Oct 12, 2004

Most used methods

toXmlString

Popular in Java

Reading from database using SQL prepared statement
getContentResolver (Context)
getSharedPreferences (Context)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
MalformedURLException (java.net)
This exception is thrown when a program attempts to create an URL from an incorrect specification.
Proxy (java.net)
This class represents proxy server settings. A created instance of Proxy stores a type and an addres
StringUtils (org.apache.commons.lang)
Operations on java.lang.String that are null safe. * IsEmpty/IsBlank - checks if a String contains
Rectangle (java.awt)
A Rectangle specifies an area in a coordinate space that is enclosed by the Rectangle object's top-
Reflections (org.reflections)
Reflections one-stop-shop object. Reflections scans your classpath, indexes the metadata, allows you t
Top Sublime Text plugins

How to use DocumentExtraction in cc.mallet.extract

Best Java code snippets using cc.mallet.extract.DocumentExtraction (Showing top 20 results out of 315)

How to use
DocumentExtraction
in
cc.mallet.extract