/**
 * Creates an extraction given a sequence output by some kind of per-sequence
 * labeler, like an HMM or a CRF.  The extraction will contain a single document.
 *
 * @param extractor the extractor that produced the output
 * @param dict label alphabet used to interpret the output sequence
 * @param name name of the single document in this extraction
 * @param input tokenization of the document
 * @param output label sequence produced by the labeler
 * @param background tag marking tokens that belong to no extracted field
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
  addDocumentExtraction (docseq);
}
/**
 * Creates an extraction given a sequence output by some kind of per-sequence
 * labeler, like an HMM or a CRF.  The extraction will contain a single document.
 *
 * @param extractor the extractor that produced the output
 * @param dict label alphabet used to interpret the output sequence
 * @param name name of the single document in this extraction
 * @param input tokenization of the document
 * @param output label sequence produced by the labeler
 * @param background tag marking tokens that belong to no extracted field
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
  addDocumentExtraction (docseq);
}
/**
 * Creates an extraction given a sequence output by some kind of per-sequence
 * labeler, like an HMM or a CRF.  The extraction will contain a single document.
 *
 * @param extractor the extractor that produced the output
 * @param dict label alphabet used to interpret the output sequence
 * @param name name of the single document in this extraction
 * @param input tokenization of the document
 * @param output label sequence produced by the labeler
 * @param background tag marking tokens that belong to no extracted field
 */
public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
{
  this.extractor = extractor;
  this.dict = dict;
  DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
  addDocumentExtraction (docseq);
}
/**
 * Labels every instance in the given list with the ACRF and collects the
 * results into a single Extraction.  Each instance must carry its original
 * Tokenization under the "TOKENIZATION" property.
 *
 * @param testing instances to label; each needs a saved Tokenization
 * @return an Extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
/**
 * Labels every instance in the given list with the ACRF and collects the
 * results into a single Extraction.  Each instance must carry its original
 * Tokenization under the "TOKENIZATION" property.
 *
 * @param testing instances to label; each needs a saved Tokenization
 * @return an Extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
/**
 * Labels every instance in the given list with the ACRF and collects the
 * results into a single Extraction.  Each instance must carry its original
 * Tokenization under the "TOKENIZATION" property.
 *
 * @param testing instances to label; each needs a saved Tokenization
 * @return an Extraction containing one DocumentExtraction per instance
 * @throws IllegalArgumentException if an instance has no saved Tokenization
 */
public Extraction extract (InstanceList testing)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < testing.size (); idx++) {
    Instance inst = testing.get (idx);
    Tokenization tokenization = (Tokenization) inst.getProperty ("TOKENIZATION");
    if (tokenization == null) {
      throw new IllegalArgumentException ("To use extract(InstanceList), must save the Tokenization!");
    }
    Sequence trueLabels = (Sequence) inst.getTarget ();
    Sequence predicted = acrf.getBestLabels (inst);
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), tokenization, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Tokenization spans) { // We assume the input is unpiped. Instance carrier = featurePipe.pipe (new Instance (spans, null, null, null)); Sequence output = crf.transduce ((Sequence) carrier.getData ()); Extraction extraction = new Extraction (this, getTargetAlphabet()); DocumentExtraction docseq = new DocumentExtraction ("Extraction", getTargetAlphabet(), spans, output, null, backgroundTag, filter); extraction.addDocumentExtraction (docseq); return extraction; }
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokenizationPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next (); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence input = (Sequence) piped.getData (); Sequence target = (Sequence) piped.getTarget (); Sequence output = crf.transduce (input); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, output, target, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokenizationPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next (); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence input = (Sequence) piped.getData (); Sequence target = (Sequence) piped.getTarget (); Sequence output = crf.transduce (input); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, output, target, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokenizationPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next (); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence input = (Sequence) piped.getData (); Sequence target = (Sequence) piped.getTarget (); Sequence output = crf.transduce (input); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, output, target, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
/**
 * Decodes every instance in the list with the CRF and collects the results
 * into one Extraction.  Assumes Instance.source contains the original
 * Tokenization object.
 *
 * @param ilist instances whose source field holds their Tokenization
 * @return an Extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization spans = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), spans, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next(); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence target = (Sequence) piped.getTarget (); LabelsSequence output = acrf.getBestLabels (piped); LabelSequence ls = SliceLabelsSequence.sliceLabelsSequence (output, slice); LabelSequence lsTarget = SliceLabelsSequence.sliceLabelsSequence ((LabelsSequence) target, slice); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, ls, lsTarget, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next(); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence target = (Sequence) piped.getTarget (); LabelsSequence output = acrf.getBestLabels (piped); LabelSequence ls = SliceLabelsSequence.sliceLabelsSequence (output, slice); LabelSequence lsTarget = SliceLabelsSequence.sliceLabelsSequence ((LabelsSequence) target, slice); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, ls, lsTarget, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
/**
 * Decodes every instance in the list with the CRF and collects the results
 * into one Extraction.  Assumes Instance.source contains the original
 * Tokenization object.
 *
 * @param ilist instances whose source field holds their Tokenization
 * @return an Extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization spans = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), spans, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
/**
 * Decodes every instance in the list with the CRF and collects the results
 * into one Extraction.  Assumes Instance.source contains the original
 * Tokenization object.
 *
 * @param ilist instances whose source field holds their Tokenization
 * @return an Extraction with one DocumentExtraction per instance
 */
public Extraction extract (InstanceList ilist)
{
  Extraction result = new Extraction (this, getTargetAlphabet ());
  for (int idx = 0; idx < ilist.size (); idx++) {
    Instance inst = ilist.get (idx);
    Tokenization spans = (Tokenization) inst.getSource ();
    Sequence predicted = crf.transduce ((Sequence) inst.getData ());
    Sequence trueLabels = (Sequence) inst.getTarget ();
    DocumentExtraction doc = new DocumentExtraction (inst.getName ().toString (), getTargetAlphabet (), spans, predicted, trueLabels, backgroundTag, filter);
    result.addDocumentExtraction (doc);
  }
  return result;
}
public Extraction extract (Iterator<Instance> source) { Extraction extraction = new Extraction (this, getTargetAlphabet ()); // Put all the instances through both pipes, then get viterbi path InstanceList tokedList = new InstanceList (tokPipe); tokedList.addThruPipe (source); InstanceList pipedList = new InstanceList (getFeaturePipe ()); pipedList.addThruPipe (tokedList.iterator()); Iterator<Instance> it1 = tokedList.iterator (); Iterator<Instance> it2 = pipedList.iterator (); while (it1.hasNext()) { Instance toked = it1.next(); Instance piped = it2.next(); Tokenization tok = (Tokenization) toked.getData(); String name = piped.getName().toString(); Sequence target = (Sequence) piped.getTarget (); LabelsSequence output = acrf.getBestLabels (piped); LabelSequence ls = SliceLabelsSequence.sliceLabelsSequence (output, slice); LabelSequence lsTarget = SliceLabelsSequence.sliceLabelsSequence ((LabelsSequence) target, slice); DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok, ls, lsTarget, backgroundTag, filter); extraction.addDocumentExtraction (docseq); } return extraction; }
/**
 * Builds an Extraction from parallel arrays of SGML-tagged strings: one
 * array of predicted labelings and one of true labelings.  Each string pair
 * is piped into token/label sequences and recorded as a document named
 * "TEST"+i, with "O" as the background tag.
 *
 * @param predStrings SGML-marked predicted labelings, one per document
 * @param trueStrings SGML-marked true labelings, parallel to predStrings
 * @return an Extraction pairing each prediction with its true labeling
 */
private Extraction createExtractionFrom (String[] predStrings, String[] trueStrings)
{
  Pipe pipe = new SerialPipes (new Pipe[] {
    new SGML2TokenSequence (new CharSequenceLexer (CharSequenceLexer.LEX_NONWHITESPACE_CLASSES ), "O"),
    new Target2LabelSequence (),
    new PrintInputAndTarget (),
  });

  InstanceList predicted = new InstanceList (pipe);
  predicted.addThruPipe (new ArrayIterator (predStrings));
  InstanceList truth = new InstanceList (pipe);
  truth.addThruPipe (new ArrayIterator (trueStrings));

  LabelAlphabet dict = (LabelAlphabet) pipe.getTargetAlphabet ();
  Extraction result = new Extraction (null, dict);
  for (int idx = 0; idx < predicted.size (); idx++) {
    Instance predInst = predicted.get (idx);
    Instance trueInst = truth.get (idx);
    Tokenization input = (Tokenization) predInst.getData ();
    Sequence predSeq = (Sequence) predInst.getTarget ();
    Sequence trueSeq = (Sequence) trueInst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction ("TEST" + idx, dict, input, predSeq, trueSeq, "O"));
  }
  return result;
}
/**
 * Builds an Extraction from parallel arrays of SGML-tagged strings: one
 * array of predicted labelings and one of true labelings.  Each string pair
 * is piped into token/label sequences and recorded as a document named
 * "TEST"+i, with "O" as the background tag.
 *
 * @param predStrings SGML-marked predicted labelings, one per document
 * @param trueStrings SGML-marked true labelings, parallel to predStrings
 * @return an Extraction pairing each prediction with its true labeling
 */
private Extraction createExtractionFrom (String[] predStrings, String[] trueStrings)
{
  Pipe pipe = new SerialPipes (new Pipe[] {
    new SGML2TokenSequence (new CharSequenceLexer (CharSequenceLexer.LEX_NONWHITESPACE_CLASSES ), "O"),
    new Target2LabelSequence (),
    new PrintInputAndTarget (),
  });

  InstanceList predicted = new InstanceList (pipe);
  predicted.addThruPipe (new ArrayIterator (predStrings));
  InstanceList truth = new InstanceList (pipe);
  truth.addThruPipe (new ArrayIterator (trueStrings));

  LabelAlphabet dict = (LabelAlphabet) pipe.getTargetAlphabet ();
  Extraction result = new Extraction (null, dict);
  for (int idx = 0; idx < predicted.size (); idx++) {
    Instance predInst = predicted.get (idx);
    Instance trueInst = truth.get (idx);
    Tokenization input = (Tokenization) predInst.getData ();
    Sequence predSeq = (Sequence) predInst.getTarget ();
    Sequence trueSeq = (Sequence) trueInst.getTarget ();
    result.addDocumentExtraction (new DocumentExtraction ("TEST" + idx, dict, input, predSeq, trueSeq, "O"));
  }
  return result;
}