/**
 * Builds an extraction from the output sequence of a per-sequence labeler,
 * such as an HMM or a CRF. The resulting extraction holds a single document.
 *
 * @param extractor  extractor recorded on this extraction
 * @param dict       label alphabet recorded on this extraction and handed to the document extraction
 * @param name       name given to the single document extraction
 * @param input      tokenization handed to the document extraction
 * @param output     labeler output sequence handed to the document extraction
 * @param background background tag string handed to the document extraction
 */
public Extraction(Extractor extractor, LabelAlphabet dict, String name,
                  Tokenization input, Sequence output, String background) {
    this.extractor = extractor;
    this.dict = dict;
    // Wrap the labeler output as one document and register it immediately.
    addDocumentExtraction(new DocumentExtraction(name, dict, input, output, background));
}
private static void writeDualExtractions (PrintWriter out, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2, int start, int end, boolean showLattice) { writeHeader (out); for (int i = start; i < end; i++) { DocumentExtraction doc1 = e1.getDocumentExtraction (i); DocumentExtraction doc2 = e2.getDocumentExtraction (i); String desc = doc1.getName(); String doc1Str = ((CharSequence) doc1.getDocument ()).toString(); String doc2Str = ((CharSequence) doc2.getDocument ()).toString(); if (!doc1Str.equals (doc2Str)) { System.err.println ("Skipping document "+i+": Extractions don't match"); continue; } Sequence targ1 = doc1.getPredictedLabels (); Sequence targ2 = doc2.getPredictedLabels (); if (!predictionsMatch (targ1, targ2)) { ExtorInfo info1 = infoForDoc (doc1Str, "CRF1::"+desc, "C1I"+i, doc1, extor1, showLattice); ExtorInfo info2 = infoForDoc (doc1Str, "CRF2::"+desc, "C2I"+i, doc2, extor2, showLattice); if (!showLattice) { // add links from errors.html --> lattice.html info1.link = info2.link = computeLatticeFname (i); } dualLattice2html (out, desc, info1, info2); } } writeFooter (out); }
/**
 * Registers a document extraction with this extraction.
 *
 * <p>The document is appended to {@code byDocs}, and a {@code Record} built
 * from its name and extracted spans is appended to {@code records}. When the
 * document has target spans, a second record named {@code "TRUE:" + name} is
 * appended to {@code trueRecords}.
 *
 * @param docseq the document extraction to add
 */
public void addDocumentExtraction(DocumentExtraction docseq) {
    byDocs.add(docseq);
    records.add(new Record(docseq.getName(), docseq.getExtractedSpans()));

    if (docseq.getTargetSpans() == null) {
        // No target spans available: nothing to add to trueRecords.
        return;
    }
    trueRecords.add(new Record("TRUE:"+docseq.getName(), docseq.getTargetSpans()));
}
/**
 * Builds a {@code DocumentExtraction} named "Extraction" from the given
 * input and label sequence, runs the confidence estimator over it, and
 * returns its extracted spans.
 *
 * @param dict          label alphabet passed to the document extraction
 * @param document      not used by this implementation
 * @param backgroundTag label whose {@code toString()} is passed as the background tag
 * @param input         tokenization passed to the document extraction
 * @param seq           label sequence passed to the document extraction
 * @return the extracted spans of the newly built document extraction
 */
public LabeledSpans constructLabeledSpans(LabelAlphabet dict, Object document, Label backgroundTag,
                                          Tokenization input, Sequence seq) {
    String background = backgroundTag.toString();
    // The fifth constructor argument is passed as null, as in the original call.
    DocumentExtraction docExtraction =
        new DocumentExtraction("Extraction", dict, input, seq, null, background);
    confidenceEstimator.estimateConfidence(docExtraction);
    return docExtraction.getExtractedSpans();
}
private static ExtorInfo infoForDoc (String doc, String desc, String idx, DocumentExtraction docextr, CRFExtractor extor, boolean showLattice) { // Instance c2 = new Instance (doc, null, null, null, extor.getTokenizationPipe ()); // TokenSequence input = (TokenSequence) c2.getData (); TokenSequence input = (TokenSequence) docextr.getInput (); LabelSequence target = docextr.getTarget (); Sequence predicted = docextr.getPredictedLabels (); ExtorInfo info = new ExtorInfo (input, predicted, target, desc, idx); if (showLattice == true) { CRF crf = extor.getCrf(); // xxx perhaps the next two lines could be a transducer method??? Instance carrier = extor.getFeaturePipe().pipe(new Instance (input, null, null, null)); info.fvs = (FeatureVectorSequence) carrier.getData (); info.lattice = new MaxLatticeDefault (crf, (Sequence) carrier.getData(), null); info.bestStates = info.lattice.bestOutputSequence(); } return info; }
public void estimateConfidence (DocumentExtraction documentExtraction) { Tokenization input = documentExtraction.getInput(); // WARNING: input Tokenization will likely already have many // features appended from the last time it was passed through a // featurePipe. To avoid a redundant calculation of features, the // caller may want to set this.featurePipe = // TokenSequence2FeatureVectorSequence Instance carrier = this.featurePipe.pipe(new Instance(input, null, null, null)); Sequence pipedInput = (Sequence) carrier.getData(); Sequence prediction = documentExtraction.getPredictedLabels(); LabeledSpans labeledSpans = documentExtraction.getExtractedSpans(); SumLatticeDefault lattice = new SumLatticeDefault (this.confidenceEstimator.getTransducer(), pipedInput); for (int i=0; i < labeledSpans.size(); i++) { LabeledSpan span = labeledSpans.getLabeledSpan(i); if (span.isBackground()) continue; int[] segmentBoundaries = getSegmentBoundaries(input, span); Segment segment = new Segment(pipedInput, prediction, prediction, segmentBoundaries[0], segmentBoundaries[1], null, null); span.setConfidence(confidenceEstimator.estimateConfidenceFor(segment, lattice)); } }
/**
 * Checks that {@code DocumentExtraction.toXmlString()} wraps the ANIMAL and
 * VERB spans of a tagged nine-token sentence in matching XML elements, with
 * "O" as the background tag.
 */
public void testToXml() {
    String text = "the quick brown fox leapt over the lazy dog";
    LabelAlphabet dict = new LabelAlphabet();
    StringTokenization toks = new StringTokenization(text, new CharSequenceLexer());

    // Labels are looked up in the same order as before: O, ANIMAL, VERB.
    Label outside = dict.lookupLabel("O");
    Label animal = dict.lookupLabel("ANIMAL");
    Label verb = dict.lookupLabel("VERB");

    // One label per token of the sentence.
    Label[] perToken = {
        outside, animal, animal, animal, verb, outside, outside, animal, animal
    };
    LabelSequence tags = new LabelSequence(perToken);

    DocumentExtraction extr = new DocumentExtraction("Test", dict, toks, tags, "O");
    String actualXml = extr.toXmlString();

    String expectedXml =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
        + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

    assertEquals(expectedXml, actualXml);
}
/**
 * Writes one {@code lattice2html} entry per document of the extraction,
 * framed by {@code writeHeader} and {@code writeFooter}.
 *
 * @param extraction  extraction whose documents are rendered
 * @param extor       extractor forwarded to {@code infoForDoc}
 * @param out         destination writer
 * @param showLattice forwarded to {@code infoForDoc}; when false, each info's
 *                    link is set to "lattice.html"
 */
public static void extraction2html(Extraction extraction, CRFExtractor extor,
                                   PrintWriter out, boolean showLattice) {
    writeHeader(out);

    int docIdx = 0;
    while (docIdx < extraction.getNumDocuments()) {
        DocumentExtraction current = extraction.getDocumentExtraction(docIdx);
        String desc = current.getName();
        String text = ((CharSequence) current.getDocument()).toString();

        ExtorInfo info = infoForDoc(text, desc, "N"+docIdx, current, extor, showLattice);
        if (!showLattice) {
            info.link = "lattice.html";
        }
        lattice2html(out, info);

        docIdx++;
    }

    writeFooter(out);
}
/**
 * Walks the extracted (predicted) spans and the target spans of a document
 * in parallel and builds two equally long span lists from their pairwise
 * intersections.
 *
 * <p>At each step the current predicted span is intersected with the current
 * target span and vice versa, and both results are appended. The span that
 * ends first is then advanced; when both end at the same index, both
 * advance. The walk stops as soon as either list is exhausted.
 *
 * @param docExtr document extraction supplying target and extracted spans
 * @return the paired predicted and target intersection spans
 */
private static DualLabeledSpans intersectSpans(DocumentExtraction docExtr) {
    LabeledSpans trueSpans = docExtr.getTargetSpans();
    LabeledSpans predSpans = docExtr.getExtractedSpans();

    // Both result lists are built over the predicted spans' document.
    LabeledSpans predOut = new LabeledSpans(predSpans.getDocument());
    LabeledSpans trueOut = new LabeledSpans(predSpans.getDocument());

    int p = 0;
    int t = 0;
    while (p < predSpans.size() && t < trueSpans.size()) {
        LabeledSpan predSpan = predSpans.getLabeledSpan(p);
        LabeledSpan trueSpan = trueSpans.getLabeledSpan(t);

        LabeledSpan predPiece = (LabeledSpan) predSpan.intersection(trueSpan);
        LabeledSpan truePiece = (LabeledSpan) trueSpan.intersection(predSpan);
        predOut.add(predPiece);
        trueOut.add(truePiece);

        // Decide both advances before moving either index.
        boolean advancePred = predSpan.getEndIdx() <= trueSpan.getEndIdx();
        boolean advanceTrue = trueSpan.getEndIdx() <= predSpan.getEndIdx();
        if (advancePred) {
            p++;
        }
        if (advanceTrue) {
            t++;
        }
    }

    assert (predOut.size() == trueOut.size());
    return new DualLabeledSpans(predOut, trueOut);
}
private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr) String name = docExtr.getName (); out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>"); out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />"); out.println ("</HEAD><BODY>"); outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ()); outputRightWrongLegend (out);
/**
 * Convenience overload: delegates to {@code toXmlDocument(String, Namespace)}
 * with the root element name "doc" and {@code Namespace.NO_NAMESPACE}.
 *
 * @return the document produced by the two-argument overload
 */
public Document toXmlDocument() {
    final String rootEltName = "doc";
    return toXmlDocument(rootEltName, Namespace.NO_NAMESPACE);
}
/**
 * Writes {@code index.html} into the given directory: an ordered HTML list
 * with one entry per document of the extraction, each linking to
 * {@code extraction<i>.html} and labelled with the document's name.
 *
 * <p>The writer is closed in a {@code finally} block so the underlying file
 * handle is released even if reading a document from the extraction throws
 * part-way through.
 *
 * @param directory  directory in which index.html is created
 * @param extraction extraction whose documents are listed
 * @throws IOException if index.html cannot be opened for writing
 */
private static void outputIndex(File directory, Extraction extraction) throws IOException {
    PrintWriter out = new PrintWriter(new FileWriter(new File(directory, "index.html")));
    try {
        out.println("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>");
        for (int i = 0; i < extraction.getNumDocuments(); i++) {
            String name = extraction.getDocumentExtraction(i).getName();
            out.println(" <LI><A HREF=\"extraction"+i+".html\">"+name+"</A></LI>");
        }
        out.println("</OL></BODY></HTML>");
    } finally {
        // Always release the file, including on an exception inside the loop.
        out.close();
    }
}
} // closes an enclosing scope opened outside this excerpt
return new Document (generateElement (rootEltName, wholeDoc, roots, children));
private static ExtorInfo infoForDoc (String doc, String desc, String idx, DocumentExtraction docextr, CRFExtractor extor, boolean showLattice) { // Instance c2 = new Instance (doc, null, null, null, extor.getTokenizationPipe ()); // TokenSequence input = (TokenSequence) c2.getData (); TokenSequence input = (TokenSequence) docextr.getInput (); LabelSequence target = docextr.getTarget (); Sequence predicted = docextr.getPredictedLabels (); ExtorInfo info = new ExtorInfo (input, predicted, target, desc, idx); if (showLattice == true) { CRF crf = extor.getCrf(); // xxx perhaps the next two lines could be a transducer method??? Instance carrier = extor.getFeaturePipe().pipe(new Instance (input, null, null, null)); info.fvs = (FeatureVectorSequence) carrier.getData (); info.lattice = new MaxLatticeDefault (crf, (Sequence) carrier.getData(), null); info.bestStates = info.lattice.bestOutputSequence(); } return info; }
public void estimateConfidence (DocumentExtraction documentExtraction) { Tokenization input = documentExtraction.getInput(); // WARNING: input Tokenization will likely already have many // features appended from the last time it was passed through a // featurePipe. To avoid a redundant calculation of features, the // caller may want to set this.featurePipe = // TokenSequence2FeatureVectorSequence Instance carrier = this.featurePipe.pipe(new Instance(input, null, null, null)); Sequence pipedInput = (Sequence) carrier.getData(); Sequence prediction = documentExtraction.getPredictedLabels(); LabeledSpans labeledSpans = documentExtraction.getExtractedSpans(); SumLatticeDefault lattice = new SumLatticeDefault (this.confidenceEstimator.getTransducer(), pipedInput); for (int i=0; i < labeledSpans.size(); i++) { LabeledSpan span = labeledSpans.getLabeledSpan(i); if (span.isBackground()) continue; int[] segmentBoundaries = getSegmentBoundaries(input, span); Segment segment = new Segment(pipedInput, prediction, prediction, segmentBoundaries[0], segmentBoundaries[1], null, null); span.setConfidence(confidenceEstimator.estimateConfidenceFor(segment, lattice)); } }
/**
 * Checks that {@code DocumentExtraction.toXmlString()} wraps the ANIMAL and
 * VERB spans of a tagged nine-token sentence in matching XML elements, with
 * "O" as the background tag.
 */
public void testToXml() {
    String text = "the quick brown fox leapt over the lazy dog";
    LabelAlphabet dict = new LabelAlphabet();
    StringTokenization toks = new StringTokenization(text, new CharSequenceLexer());

    // Labels are looked up in the same order as before: O, ANIMAL, VERB.
    Label outside = dict.lookupLabel("O");
    Label animal = dict.lookupLabel("ANIMAL");
    Label verb = dict.lookupLabel("VERB");

    // One label per token of the sentence.
    Label[] perToken = {
        outside, animal, animal, animal, verb, outside, outside, animal, animal
    };
    LabelSequence tags = new LabelSequence(perToken);

    DocumentExtraction extr = new DocumentExtraction("Test", dict, toks, tags, "O");
    String actualXml = extr.toXmlString();

    String expectedXml =
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
        + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";

    assertEquals(expectedXml, actualXml);
}
/**
 * Builds a {@code DocumentExtraction} named "Extraction" from the given
 * input and label sequence, runs the confidence estimator over it, and
 * returns its extracted spans.
 *
 * @param dict          label alphabet passed to the document extraction
 * @param document      not used by this implementation
 * @param backgroundTag label whose {@code toString()} is passed as the background tag
 * @param input         tokenization passed to the document extraction
 * @param seq           label sequence passed to the document extraction
 * @return the extracted spans of the newly built document extraction
 */
public LabeledSpans constructLabeledSpans(LabelAlphabet dict, Object document, Label backgroundTag,
                                          Tokenization input, Sequence seq) {
    String background = backgroundTag.toString();
    // The fifth constructor argument is passed as null, as in the original call.
    DocumentExtraction docExtraction =
        new DocumentExtraction("Extraction", dict, input, seq, null, background);
    confidenceEstimator.estimateConfidence(docExtraction);
    return docExtraction.getExtractedSpans();
}
/**
 * Writes one {@code lattice2html} entry per document of the extraction,
 * framed by {@code writeHeader} and {@code writeFooter}.
 *
 * @param extraction  extraction whose documents are rendered
 * @param extor       extractor forwarded to {@code infoForDoc}
 * @param out         destination writer
 * @param showLattice forwarded to {@code infoForDoc}; when false, each info's
 *                    link is set to "lattice.html"
 */
public static void extraction2html(Extraction extraction, CRFExtractor extor,
                                   PrintWriter out, boolean showLattice) {
    writeHeader(out);

    int docIdx = 0;
    while (docIdx < extraction.getNumDocuments()) {
        DocumentExtraction current = extraction.getDocumentExtraction(docIdx);
        String desc = current.getName();
        String text = ((CharSequence) current.getDocument()).toString();

        ExtorInfo info = infoForDoc(text, desc, "N"+docIdx, current, extor, showLattice);
        if (!showLattice) {
            info.link = "lattice.html";
        }
        lattice2html(out, info);

        docIdx++;
    }

    writeFooter(out);
}
/**
 * Walks the extracted (predicted) spans and the target spans of a document
 * in parallel and builds two equally long span lists from their pairwise
 * intersections.
 *
 * <p>At each step the current predicted span is intersected with the current
 * target span and vice versa, and both results are appended. The span that
 * ends first is then advanced; when both end at the same index, both
 * advance. The walk stops as soon as either list is exhausted.
 *
 * @param docExtr document extraction supplying target and extracted spans
 * @return the paired predicted and target intersection spans
 */
private static DualLabeledSpans intersectSpans(DocumentExtraction docExtr) {
    LabeledSpans trueSpans = docExtr.getTargetSpans();
    LabeledSpans predSpans = docExtr.getExtractedSpans();

    // Both result lists are built over the predicted spans' document.
    LabeledSpans predOut = new LabeledSpans(predSpans.getDocument());
    LabeledSpans trueOut = new LabeledSpans(predSpans.getDocument());

    int p = 0;
    int t = 0;
    while (p < predSpans.size() && t < trueSpans.size()) {
        LabeledSpan predSpan = predSpans.getLabeledSpan(p);
        LabeledSpan trueSpan = trueSpans.getLabeledSpan(t);

        LabeledSpan predPiece = (LabeledSpan) predSpan.intersection(trueSpan);
        LabeledSpan truePiece = (LabeledSpan) trueSpan.intersection(predSpan);
        predOut.add(predPiece);
        trueOut.add(truePiece);

        // Decide both advances before moving either index.
        boolean advancePred = predSpan.getEndIdx() <= trueSpan.getEndIdx();
        boolean advanceTrue = trueSpan.getEndIdx() <= predSpan.getEndIdx();
        if (advancePred) {
            p++;
        }
        if (advanceTrue) {
            t++;
        }
    }

    assert (predOut.size() == trueOut.size());
    return new DualLabeledSpans(predOut, trueOut);
}
private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr) String name = docExtr.getName (); out.println ("<HTML><HEAD><TITLE>"+name+": Extraction from Document</TITLE>"); out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />"); out.println ("</HEAD><BODY>"); outputClassLegend (out, docExtr.getExtractedSpans ().getLabeledSpan (0).getLabel ().getLabelAlphabet ()); outputRightWrongLegend (out);