/**
 * Creates an extraction given a sequence output by some kind of per-sequence labeler, like an
 * HMM or a CRF. The extraction will contain a single document.
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  // Wrap the single (input, output) pair in a DocumentExtraction and register it.
  DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
  addDocumentExtraction (docseq);
}
/** Applies the given cleaner to every field of both the extracted and the true records. */
public void cleanFields (FieldCleaner cleaner)
{
  for (Iterator iter = records.iterator (); iter.hasNext ();) {
    cleanRecord ((Record) iter.next (), cleaner);
  }
  for (Iterator iter = trueRecords.iterator (); iter.hasNext ();) {
    cleanRecord ((Record) iter.next (), cleaner);
  }
}
/**
 * Computes and attaches confidence estimates for the spans of the given
 * document extraction. Subclasses define the actual confidence model.
 *
 * @param documentExtraction the per-document extraction to annotate with confidence scores
 */
public abstract void estimateConfidence (DocumentExtraction documentExtraction);
}
// NOTE(review): this span appears to be a garbled fragment of an evaluator method.
// The statements after the signature are not wrapped in braces, and `docnum`,
// `predField`, `trueField`, and `errorOutputStream` are never declared in the
// visible code -- presumably they come from the surrounding loop/fields of the
// original method. The full method body should be restored; do not compile as-is.
public void evaluate (String description, Extraction extraction, PrintWriter out) int numDocs = extraction.getNumDocuments (); assert numDocs == extraction.getNumRecords (); LabelAlphabet dict = extraction.getLabelAlphabet(); int numLabels = dict.size(); int[] numCorr = new int [numLabels]; Record extracted = extraction.getRecord (docnum); Record target = extraction.getTargetRecord (docnum); errorOutputStream.println ("Error in extraction! Document "+extraction.getDocumentExtraction (docnum).getName ()); errorOutputStream.println ("Predicted "+predField); errorOutputStream.println ("True "+trueField);
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
/**
 * Writes side-by-side comparison pages for two extractions into the given directory:
 * an errors.html covering all documents, plus lattice-&lt;start&gt;.html pages in chunks
 * of EXTRACTIONS_PER_FILE documents.
 *
 * @throws IOException if any output file cannot be written
 * @throws IllegalArgumentException if the two extractions have different document counts
 */
public static void viewDualResults (File dir, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2)
    throws IOException
{
  if (e1.getNumDocuments () != e2.getNumDocuments ())
    throw new IllegalArgumentException ("Extractions don't match: different number of docs.");

  // Close the writer even if writeDualExtractions throws (previously leaked on error).
  PrintWriter errorStr = new PrintWriter (new FileWriter (new File (dir, "errors.html")));
  try {
    writeDualExtractions (errorStr, e1, extor1, e2, extor2, 0, e1.getNumDocuments (), false);
  } finally {
    errorStr.close ();
  }

  int max = e1.getNumDocuments ();
  for (int start = 0; start < max; start += EXTRACTIONS_PER_FILE) {
    int end = Math.min (start + EXTRACTIONS_PER_FILE, max);
    PrintWriter latticeStr = new PrintWriter (new FileWriter (new File (dir, "lattice-"+start+".html")));
    try {
      writeDualExtractions (latticeStr, e1, extor1, e2, extor2, start, end, true);
    } finally {
      latticeStr.close ();
    }
  }
}
private static void writeDualExtractions (PrintWriter out, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2, int start, int end, boolean showLattice) { writeHeader (out); for (int i = start; i < end; i++) { DocumentExtraction doc1 = e1.getDocumentExtraction (i); DocumentExtraction doc2 = e2.getDocumentExtraction (i); String desc = doc1.getName(); String doc1Str = ((CharSequence) doc1.getDocument ()).toString(); String doc2Str = ((CharSequence) doc2.getDocument ()).toString(); if (!doc1Str.equals (doc2Str)) { System.err.println ("Skipping document "+i+": Extractions don't match"); continue; } Sequence targ1 = doc1.getPredictedLabels (); Sequence targ2 = doc2.getPredictedLabels (); if (!predictionsMatch (targ1, targ2)) { ExtorInfo info1 = infoForDoc (doc1Str, "CRF1::"+desc, "C1I"+i, doc1, extor1, showLattice); ExtorInfo info2 = infoForDoc (doc1Str, "CRF2::"+desc, "C2I"+i, doc2, extor2, showLattice); if (!showLattice) { // add links from errors.html --> lattice.html info1.link = info2.link = computeLatticeFname (i); } dualLattice2html (out, desc, info1, info2); } } writeFooter (out); }
// NOTE(review): statement fragment -- the enclosing method's header is not visible
// here, so `extraction` and SATURATION are assumed to be defined by that scope.
LabelAlphabet dict = extraction.getLabelAlphabet ();
// One display field per label name in the alphabet.
String[] fields = determineFieldNames (dict);
// Assign each field a distinct rainbow color at fixed saturation for rendering.
String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
/** Verifies that cleanFields strips markup and punctuation before per-field F1 scoring. */
public void testFieldCleaning ()
{
  Extraction extraction = createExtractionFrom (punctPred, punctTrue);
  // Remove tags, commas, and exclamation points from every extracted field.
  RegexFieldCleaner cleaner = new RegexFieldCleaner ("<.*?>|,|!");
  extraction.cleanFields (cleaner);

  ByteArrayOutputStream out = new ByteArrayOutputStream ();
  PerFieldF1Evaluator eval = new PerFieldF1Evaluator ();
  eval.evaluate ("Testing", extraction, new PrintStream (out));
  assertEquals (mpdExpected, out.toString ());
}
// NOTE(review): this span appears to be a garbled fragment of an evaluator method.
// The statements after the signature are not wrapped in braces, and `docnum`,
// `predField`, `trueField`, and `errorOutputStream` are never declared in the
// visible code -- presumably they come from the surrounding loop/fields of the
// original method. The full method body should be restored; do not compile as-is.
public void evaluate (String description, Extraction extraction, PrintWriter out) int numDocs = extraction.getNumDocuments (); assert numDocs == extraction.getNumRecords (); LabelAlphabet dict = extraction.getLabelAlphabet(); int numLabels = dict.size(); int[] numCorr = new int [numLabels]; Record extracted = extraction.getRecord (docnum); Record target = extraction.getTargetRecord (docnum); errorOutputStream.println ("Error in extraction! Document "+extraction.getDocumentExtraction (docnum).getName ()); errorOutputStream.println ("Predicted "+predField); errorOutputStream.println ("True "+trueField);
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
/**
 * Writes side-by-side comparison pages for two extractions into the given directory:
 * an errors.html covering all documents, plus lattice-&lt;start&gt;.html pages in chunks
 * of EXTRACTIONS_PER_FILE documents.
 *
 * @throws IOException if any output file cannot be written
 * @throws IllegalArgumentException if the two extractions have different document counts
 */
public static void viewDualResults (File dir, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2)
    throws IOException
{
  if (e1.getNumDocuments () != e2.getNumDocuments ())
    throw new IllegalArgumentException ("Extractions don't match: different number of docs.");

  // Close the writer even if writeDualExtractions throws (previously leaked on error).
  PrintWriter errorStr = new PrintWriter (new FileWriter (new File (dir, "errors.html")));
  try {
    writeDualExtractions (errorStr, e1, extor1, e2, extor2, 0, e1.getNumDocuments (), false);
  } finally {
    errorStr.close ();
  }

  int max = e1.getNumDocuments ();
  for (int start = 0; start < max; start += EXTRACTIONS_PER_FILE) {
    int end = Math.min (start + EXTRACTIONS_PER_FILE, max);
    PrintWriter latticeStr = new PrintWriter (new FileWriter (new File (dir, "lattice-"+start+".html")));
    try {
      writeDualExtractions (latticeStr, e1, extor1, e2, extor2, start, end, true);
    } finally {
      latticeStr.close ();
    }
  }
}
private static void writeDualExtractions (PrintWriter out, Extraction e1, CRFExtractor extor1, Extraction e2, CRFExtractor extor2, int start, int end, boolean showLattice) { writeHeader (out); for (int i = start; i < end; i++) { DocumentExtraction doc1 = e1.getDocumentExtraction (i); DocumentExtraction doc2 = e2.getDocumentExtraction (i); String desc = doc1.getName(); String doc1Str = ((CharSequence) doc1.getDocument ()).toString(); String doc2Str = ((CharSequence) doc2.getDocument ()).toString(); if (!doc1Str.equals (doc2Str)) { System.err.println ("Skipping document "+i+": Extractions don't match"); continue; } Sequence targ1 = doc1.getPredictedLabels (); Sequence targ2 = doc2.getPredictedLabels (); if (!predictionsMatch (targ1, targ2)) { ExtorInfo info1 = infoForDoc (doc1Str, "CRF1::"+desc, "C1I"+i, doc1, extor1, showLattice); ExtorInfo info2 = infoForDoc (doc1Str, "CRF2::"+desc, "C2I"+i, doc2, extor2, showLattice); if (!showLattice) { // add links from errors.html --> lattice.html info1.link = info2.link = computeLatticeFname (i); } dualLattice2html (out, desc, info1, info2); } } writeFooter (out); }
// NOTE(review): statement fragment -- the enclosing method's header is not visible
// here, so `extraction` and SATURATION are assumed to be defined by that scope.
LabelAlphabet dict = extraction.getLabelAlphabet ();
// One display field per label name in the alphabet.
String[] fields = determineFieldNames (dict);
// Assign each field a distinct rainbow color at fixed saturation for rendering.
String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
/** Verifies that cleanFields strips markup and punctuation before per-field F1 scoring. */
public void testFieldCleaning ()
{
  Extraction extraction = createExtractionFrom (punctPred, punctTrue);
  // Remove tags, commas, and exclamation points from every extracted field.
  RegexFieldCleaner cleaner = new RegexFieldCleaner ("<.*?>|,|!");
  extraction.cleanFields (cleaner);

  ByteArrayOutputStream out = new ByteArrayOutputStream ();
  PerFieldF1Evaluator eval = new PerFieldF1Evaluator ();
  eval.evaluate ("Testing", extraction, new PrintStream (out));
  assertEquals (mpdExpected, out.toString ());
}
// NOTE(review): this span appears to be a garbled fragment of an evaluator method.
// The statements after the signature are not wrapped in braces, and `docnum`,
// `predField`, `trueField`, and `errorOutputStream` are never declared in the
// visible code -- presumably they come from the surrounding loop/fields of the
// original method. The full method body should be restored; do not compile as-is.
public void evaluate (String description, Extraction extraction, PrintWriter out) int numDocs = extraction.getNumDocuments (); assert numDocs == extraction.getNumRecords (); LabelAlphabet dict = extraction.getLabelAlphabet(); int numLabels = dict.size(); int[] numCorr = new int [numLabels]; Record extracted = extraction.getRecord (docnum); Record target = extraction.getTargetRecord (docnum); errorOutputStream.println ("Error in extraction! Document "+extraction.getDocumentExtraction (docnum).getName ()); errorOutputStream.println ("Predicted "+predField); errorOutputStream.println ("True "+trueField);
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
/**
 * Computes and attaches confidence estimates for the spans of the given
 * document extraction. Subclasses define the actual confidence model.
 *
 * @param documentExtraction the per-document extraction to annotate with confidence scores
 */
public abstract void estimateConfidence (DocumentExtraction documentExtraction);
}
/**
 * Creates an extraction given a sequence output by some kind of per-sequence labeler, like an
 * HMM or a CRF. The extraction will contain a single document.
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  // Wrap the single (input, output) pair in a DocumentExtraction and register it.
  DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
  addDocumentExtraction (docseq);
}
/** Applies the given cleaner to every field of both the extracted and the true records. */
public void cleanFields (FieldCleaner cleaner)
{
  for (Iterator iter = records.iterator (); iter.hasNext ();) {
    cleanRecord ((Record) iter.next (), cleaner);
  }
  for (Iterator iter = trueRecords.iterator (); iter.hasNext ();) {
    cleanRecord ((Record) iter.next (), cleaner);
  }
}