/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Creates an extraction from the output of a per-sequence labeler,
 * such as an HMM or a CRF.  The resulting extraction contains a
 * single document.
 *
 * @param extractor  the extractor that produced this labeling
 * @param dict       label alphabet for the extracted field names
 * @param name       name of the extracted document
 * @param input      tokenization of the document text
 * @param output     label sequence produced by the labeler
 * @param background name of the label meaning "not part of any field"
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  addDocumentExtraction (new DocumentExtraction (name, dict, input, output, background));
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Builds labeled spans for a document from a predicted label sequence,
 * attaching confidence scores via this object's confidence estimator.
 *
 * @param dict          label alphabet of the tag set
 * @param document      the original document (not referenced by this body)
 * @param backgroundTag label denoting "no field"; its string form is passed on
 * @param input         tokenization of the document
 * @param seq           predicted label sequence
 * @return the extracted spans, with confidences attached
 */
public LabeledSpans constructLabeledSpans (LabelAlphabet dict, Object document, Label backgroundTag, Tokenization input, Sequence seq)
{
  DocumentExtraction docExtr = new DocumentExtraction ("Extraction", dict, input, seq, null, backgroundTag.toString ());
  confidenceEstimator.estimateConfidence (docExtr);
  return docExtr.getExtractedSpans ();
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Labels every instance in the list with the ACRF and gathers the results
 * into a single Extraction.  Each instance must carry its Tokenization
 * under the "TOKENIZATION" property.
 *
 * @param testing instances to label
 * @return an extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/**
 * Runs the CRF over every instance and gathers the labelings into a single
 * Extraction.  Assumes Instance.source contains the Tokenization object.
 *
 * @param ilist instances whose data is the input Sequence and whose source
 *              holds the corresponding Tokenization
 * @return an extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization tokenization = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter));
  }
  return result;
}
/** Verifies nested-tag XML output under HierarchicalTokenizationFilter:
    labels such as "ANIMAL|MAMMAL" render as nested elements
    (&lt;ANIMAL&gt;&lt;MAMMAL&gt;...), and the Pattern-accepting constructor
    drops hierarchy levels whose names match the pattern. */
public void testNestedXMLTokenizationFilter ()
{
  LabelAlphabet dict = new LabelAlphabet ();
  String document = "the quick brown fox leapt over the lazy dog";
  StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ());
  // "|" separates hierarchy levels within a single label name.
  Label O = dict.lookupLabel ("O");
  Label ANML = dict.lookupLabel ("ANIMAL");
  Label ANML_MAMM = dict.lookupLabel ("ANIMAL|MAMMAL");
  Label VB = dict.lookupLabel ("VERB");
  Label ANML_JJ = dict.lookupLabel ("ANIMAL|ADJ");
  Label ANML_JJ_MAMM = dict.lookupLabel ("ANIMAL|ADJ|MAMMAL");
  // One tag per token of the nine-word document above.
  LabelSequence tags = new LabelSequence (new Label[] { O, ANML, ANML, ANML_MAMM, VB, O, ANML, ANML_JJ, ANML_JJ_MAMM });
  DocumentExtraction extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter ());
  String actualXml = extr.toXmlString();
  String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy <MAMMAL>dog</MAMMAL></ADJ></ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
  // Test the ignore function: hierarchy levels matching "AD.*" (here ADJ)
  // are omitted from the emitted element nesting.
  extr = new DocumentExtraction ("Test", dict, toks, tags, null, "O", new HierarchicalTokenizationFilter (Pattern.compile ("AD.*")));
  actualXml = extr.toXmlString();
  expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the lazy <MAMMAL>dog</MAMMAL></ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
}
/** Checks flat XML rendering of a simple labeling: consecutive tokens with
    the same label merge into one element, and background ("O") tokens
    appear as bare text. */
public void testToXml ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  // One label per token of the nine-word document.
  LabelSequence labeling = new LabelSequence (new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, "O");
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks flat XML rendering of a simple labeling: consecutive tokens with
    the same label merge into one element, and background ("O") tokens
    appear as bare text. */
public void testToXml ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label verb = alphabet.lookupLabel ("VERB");
  // One label per token of the nine-word document.
  LabelSequence labeling = new LabelSequence (new Label[] { bg, animal, animal, animal, verb, bg, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, "O");
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown fox </ANIMAL><VERB>leapt </VERB>over the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks BIO decoding through BIOTokenizationFilter: a "B-" label starts a
    new span even immediately after a span of the same type, while an "I-"
    label continues the current span. */
public void testToXmlBIO ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label beginVerb = alphabet.lookupLabel ("B-VERB");
  Label insideVerb = alphabet.lookupLabel ("I-VERB");
  // One label per token; the second B-ANIMAL forces a span break at "fox".
  LabelSequence labeling = new LabelSequence (new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, null, "O", new BIOTokenizationFilter ());
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}
/** Checks BIO decoding through BIOTokenizationFilter: a "B-" label starts a
    new span even immediately after a span of the same type, while an "I-"
    label continues the current span. */
public void testToXmlBIO ()
{
  String text = "the quick brown fox leapt over the lazy dog";
  LabelAlphabet alphabet = new LabelAlphabet ();
  StringTokenization tokenization = new StringTokenization (text, new CharSequenceLexer ());
  Label bg = alphabet.lookupLabel ("O");
  Label beginAnimal = alphabet.lookupLabel ("B-ANIMAL");
  Label animal = alphabet.lookupLabel ("ANIMAL");
  Label beginVerb = alphabet.lookupLabel ("B-VERB");
  Label insideVerb = alphabet.lookupLabel ("I-VERB");
  // One label per token; the second B-ANIMAL forces a span break at "fox".
  LabelSequence labeling = new LabelSequence (new Label[] { bg, beginAnimal, animal, beginAnimal, beginVerb, insideVerb, bg, animal, animal });
  DocumentExtraction extraction = new DocumentExtraction ("Test", alphabet, tokenization, labeling, null, "O", new BIOTokenizationFilter ());
  String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n" + "<doc>the <ANIMAL>quick brown </ANIMAL><ANIMAL>fox </ANIMAL><VERB>leapt over </VERB>the <ANIMAL>lazy dog</ANIMAL></doc>\r\n";
  assertEquals (expected, extraction.toXmlString ());
}