/**
 * Appends to {@code labeled} the subspan of {@code input} that starts at the index recorded in
 * {@code tagStart} and is bounded by {@code end}, labeled with the tag recorded in
 * {@code tagStart}.
 *
 * @param labeled       collection that receives the new labeled span
 * @param input         tokenization from which the subspan is taken
 * @param tagStart      supplies the start index ({@code start}) and the label ({@code label})
 * @param end           end index handed to {@code input.subspan}
 * @param backgroundTag label regarded as background; compared by reference with the span's label
 */
private void addLabeledSpan (LabeledSpans labeled, Tokenization input, TagStart tagStart, int end, Label backgroundTag)
{
  Span covered = input.subspan (tagStart.start, end);
  Label tag = tagStart.label;
  // Identity comparison: the span is flagged as background only when it carries the very
  // same Label object as backgroundTag.
  boolean background = (tag == backgroundTag);
  labeled.add (new LabeledSpan (covered, tag, background));
}
/**
 * Builds a {@code LabeledSpan} from {@code input.subspan (tagStart.start, end)} and
 * {@code tagStart.label}, and adds it to {@code labeled}.
 *
 * @param labeled       destination of the new labeled span
 * @param input         tokenization the subspan is cut from
 * @param tagStart      holder of the start index and of the label to attach
 * @param end           end index handed to {@code input.subspan}
 * @param backgroundTag the background label; the new span is marked as background exactly
 *                      when its label is this same object
 */
private void addLabeledSpan (LabeledSpans labeled, Tokenization input, TagStart tagStart, int end, Label backgroundTag)
{
  Span piece = input.subspan (tagStart.start, end);
  Label pieceLabel = tagStart.label;
  LabeledSpan entry = new LabeledSpan (piece, pieceLabel, pieceLabel == backgroundTag);
  labeled.add (entry);
}
/**
 * Appends to {@code labeled} the subspan of {@code input} that starts at the index recorded in
 * {@code tagStart} and is bounded by {@code end}, labeled with the tag recorded in
 * {@code tagStart}.
 *
 * @param labeled       collection that receives the new labeled span
 * @param input         tokenization from which the subspan is taken
 * @param tagStart      supplies the start index ({@code start}) and the label ({@code label})
 * @param end           end index handed to {@code input.subspan}
 * @param backgroundTag label regarded as background; compared by reference with the span's label
 */
private void addLabeledSpan (LabeledSpans labeled, Tokenization input, TagStart tagStart, int end, Label backgroundTag)
{
  Span covered = input.subspan (tagStart.start, end);
  Label tag = tagStart.label;
  // Identity comparison: the span is flagged as background only when it carries the very
  // same Label object as backgroundTag.
  boolean background = (tag == backgroundTag);
  labeled.add (new LabeledSpan (covered, tag, background));
}
/**
 * Intersects this labeled span with another labeled span.
 *
 * <p>The argument is cast to {@code LabeledSpan} unconditionally, so any other {@code Span}
 * implementation causes a {@code ClassCastException}. The result reuses this object's label,
 * background flag and confidence; only the wrapped span differs.
 *
 * @param r the span to intersect with; must be a {@code LabeledSpan}
 * @return a new {@code LabeledSpan} wrapping the intersection of the two underlying spans
 */
public Span intersection (Span r)
{
  LabeledSpan that = (LabeledSpan) r;
  Span mine = getSpan ();
  Span theirs = that.getSpan ();
  Span overlap = mine.intersection (theirs);
  return new LabeledSpan (overlap, label, isBackground, confidence);
}
/**
 * Returns a labeled span covering the intersection of this span's underlying span with the
 * underlying span of {@code r}.
 *
 * <p>{@code r} is cast to {@code LabeledSpan} without a type check. The label, background flag
 * and confidence of the result are copied from this object, not from {@code r}.
 *
 * @param r the other span; must be a {@code LabeledSpan}
 * @return a newly created {@code LabeledSpan} over the intersected span
 */
public Span intersection (Span r)
{
  LabeledSpan rhs = (LabeledSpan) r;
  Span own = getSpan ();
  Span common = own.intersection (rhs.getSpan ());
  return new LabeledSpan (common, label, isBackground, confidence);
}
/**
 * Intersects this labeled span with another labeled span.
 *
 * <p>The argument is cast to {@code LabeledSpan} unconditionally, so any other {@code Span}
 * implementation causes a {@code ClassCastException}. The result reuses this object's label,
 * background flag and confidence; only the wrapped span differs.
 *
 * @param r the span to intersect with; must be a {@code LabeledSpan}
 * @return a new {@code LabeledSpan} wrapping the intersection of the two underlying spans
 */
public Span intersection (Span r)
{
  LabeledSpan that = (LabeledSpan) r;
  Span mine = getSpan ();
  Span theirs = that.getSpan ();
  Span overlap = mine.intersection (theirs);
  return new LabeledSpan (overlap, label, isBackground, confidence);
}
/**
 * Adds a background span for the stretch between {@code docidx} and the start of {@code span},
 * if there is one.
 *
 * <p>Nothing is added unless {@code docidx} is strictly less than {@code span.getStartIdx ()}.
 * Otherwise a {@code StringSpan} over the same document, built from {@code docidx} and that
 * start index, is added to {@code labeled} with the background flag set.
 *
 * @param labeled    collection that may receive the background span
 * @param span       the span about to be emitted by the caller
 * @param docidx     document index up to which spans have already been emitted
 * @param background label to attach to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // nothing lies between docidx and the span's start
  }
  CharSequence doc = (CharSequence) span.getDocument ();
  Span gap = new StringSpan (doc, docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Emits a background-labeled filler span when {@code span} does not start at {@code docidx}.
 *
 * <p>The filler is a {@code StringSpan} over {@code span}'s document, constructed from
 * {@code docidx} and {@code span.getStartIdx ()}; it is only created when the former is
 * strictly smaller than the latter.
 *
 * @param labeled    receives the filler span, if any
 * @param span       the next span the caller is going to add
 * @param docidx     document index reached so far
 * @param background label given to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  final int gapEnd = span.getStartIdx ();
  final boolean hasGap = docidx < gapEnd;
  if (hasGap) {
    Span filler = new StringSpan ((CharSequence) span.getDocument (), docidx, gapEnd);
    LabeledSpan fillerEntry = new LabeledSpan (filler, background, true);
    labeled.add (fillerEntry);
  }
}
/**
 * Adds a background span for the stretch between {@code docidx} and the start of {@code span},
 * if there is one.
 *
 * <p>Nothing is added unless {@code docidx} is strictly less than {@code span.getStartIdx ()}.
 * Otherwise a {@code StringSpan} over the same document, built from {@code docidx} and that
 * start index, is added to {@code labeled} with the background flag set.
 *
 * @param labeled    collection that may receive the background span
 * @param span       the span about to be emitted by the caller
 * @param docidx     document index up to which spans have already been emitted
 * @param background label to attach to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // nothing lies between docidx and the span's start
  }
  CharSequence doc = (CharSequence) span.getDocument ();
  Span gap = new StringSpan (doc, docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Emits a background-labeled filler span when {@code span} does not start at {@code docidx}.
 *
 * <p>The filler is a {@code StringSpan} over {@code span}'s document, constructed from
 * {@code docidx} and {@code span.getStartIdx ()}; it is only created when the former is
 * strictly smaller than the latter.
 *
 * @param labeled    receives the filler span, if any
 * @param span       the next span the caller is going to add
 * @param docidx     document index reached so far
 * @param background label given to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  final int gapEnd = span.getStartIdx ();
  final boolean hasGap = docidx < gapEnd;
  if (hasGap) {
    Span filler = new StringSpan ((CharSequence) span.getDocument (), docidx, gapEnd);
    LabeledSpan fillerEntry = new LabeledSpan (filler, background, true);
    labeled.add (fillerEntry);
  }
}
/**
 * Adds a background span for the stretch between {@code docidx} and the start of {@code span},
 * if there is one.
 *
 * <p>Nothing is added unless {@code docidx} is strictly less than {@code span.getStartIdx ()}.
 * Otherwise a {@code StringSpan} over the same document, built from {@code docidx} and that
 * start index, is added to {@code labeled} with the background flag set.
 *
 * @param labeled    collection that may receive the background span
 * @param span       the span about to be emitted by the caller
 * @param docidx     document index up to which spans have already been emitted
 * @param background label to attach to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // nothing lies between docidx and the span's start
  }
  CharSequence doc = (CharSequence) span.getDocument ();
  Span gap = new StringSpan (doc, docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Emits a background-labeled filler span when {@code span} does not start at {@code docidx}.
 *
 * <p>The filler is a {@code StringSpan} over {@code span}'s document, constructed from
 * {@code docidx} and {@code span.getStartIdx ()}; it is only created when the former is
 * strictly smaller than the latter.
 *
 * @param labeled    receives the filler span, if any
 * @param span       the next span the caller is going to add
 * @param docidx     document index reached so far
 * @param background label given to the filler span
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  final int gapEnd = span.getStartIdx ();
  final boolean hasGap = docidx < gapEnd;
  if (hasGap) {
    Span filler = new StringSpan ((CharSequence) span.getDocument (), docidx, gapEnd);
    LabeledSpan fillerEntry = new LabeledSpan (filler, background, true);
    labeled.add (fillerEntry);
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, one span per maximal run of
 * consecutive positions that resolve to the same {@code Label} object.
 *
 * <p>For each run, {@code addBackgroundIfNecessary} is called first with the document index
 * reached so far, and then the run itself is added to {@code labeled}. A run is flagged as
 * background when its label is the same object as {@code backgroundTag}.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization from which subspans are taken; the subspans are cast to
 *                      {@code StringSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int runStart = i;
    Label runTag = dict.lookupLabel (tags.get (runStart).toString ());

    // Advance i to the first position whose label differs from runTag, or to the end of the
    // sequence. The comparison is by reference, so progress relies on lookupLabel returning
    // the same object for the same tag string.
    for (; i < tags.size (); i++) {
      Label candidate = dict.lookupLabel (tags.get (i).toString ());
      if (candidate != runTag) {
        break;
      }
    }

    StringSpan run = (StringSpan) input.subspan (runStart, i);
    addBackgroundIfNecessary (labeled, run, docidx, backgroundTag);
    docidx = run.getEndIdx ();
    labeled.add (new LabeledSpan (run, runTag, runTag == backgroundTag));
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, one span per maximal run of
 * consecutive positions that resolve to the same {@code Label} object.
 *
 * <p>For each run, {@code addBackgroundIfNecessary} is called first with the document index
 * reached so far, and then the run itself is added to {@code labeled}. A run is flagged as
 * background when its label is the same object as {@code backgroundTag}.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization from which subspans are taken; the subspans are cast to
 *                      {@code StringSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int runStart = i;
    Label runTag = dict.lookupLabel (tags.get (runStart).toString ());

    // Advance i to the first position whose label differs from runTag, or to the end of the
    // sequence. The comparison is by reference, so progress relies on lookupLabel returning
    // the same object for the same tag string.
    for (; i < tags.size (); i++) {
      Label candidate = dict.lookupLabel (tags.get (i).toString ());
      if (candidate != runTag) {
        break;
      }
    }

    StringSpan run = (StringSpan) input.subspan (runStart, i);
    addBackgroundIfNecessary (labeled, run, docidx, backgroundTag);
    docidx = run.getEndIdx ();
    labeled.add (new LabeledSpan (run, runTag, runTag == backgroundTag));
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, one span per maximal run of
 * consecutive positions that resolve to the same {@code Label} object.
 *
 * <p>For each run, {@code addBackgroundIfNecessary} is called first with the document index
 * reached so far, and then the run itself is added to {@code labeled}. A run is flagged as
 * background when its label is the same object as {@code backgroundTag}.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization from which subspans are taken; the subspans are cast to
 *                      {@code StringSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int runStart = i;
    Label runTag = dict.lookupLabel (tags.get (runStart).toString ());

    // Advance i to the first position whose label differs from runTag, or to the end of the
    // sequence. The comparison is by reference, so progress relies on lookupLabel returning
    // the same object for the same tag string.
    for (; i < tags.size (); i++) {
      Label candidate = dict.lookupLabel (tags.get (i).toString ());
      if (candidate != runTag) {
        break;
      }
    }

    StringSpan run = (StringSpan) input.subspan (runStart, i);
    addBackgroundIfNecessary (labeled, run, docidx, backgroundTag);
    docidx = run.getEndIdx ();
    labeled.add (new LabeledSpan (run, runTag, runTag == backgroundTag));
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, grouping consecutive positions into
 * one segment.
 *
 * <p>A segment starts at the current position and is extended over each following position
 * until one is reached for which {@code isBeginTag} is true or {@code tagsMatch} against the
 * segment's first label is false. For each segment, {@code addBackgroundIfNecessary} is called
 * with the document index reached so far, and then the segment is added to {@code labeled}. If
 * the segment's first label satisfies {@code isBeginTag} or {@code isInsideTag}, it is replaced
 * by {@code trimTag (dict, label)} before being attached. The background flag is computed from
 * the label actually attached, by reference comparison with {@code backgroundTag}.
 *
 * <p>NOTE(review): the helper predicates are defined outside this method; they presumably
 * implement a begin/inside tagging scheme. Confirm against their definitions.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization handed to {@code createSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int first = i;
    Label label = dict.lookupLabel (tags.get (first).toString ());

    // The first position always belongs to the segment; scan forward from the next one.
    i++;
    while (i < tags.size ()) {
      Label following = dict.lookupLabel (tags.get (i).toString ());
      boolean continues = !isBeginTag (following) && tagsMatch (label, following);
      if (!continues) {
        break;
      }
      i++;
    }

    StringSpan segment = (StringSpan) createSpan (input, first, i);
    addBackgroundIfNecessary (labeled, segment, docidx, backgroundTag);
    docidx = segment.getEndIdx ();

    Label emitted = label;
    if (isBeginTag (label) || isInsideTag (label)) {
      emitted = trimTag (dict, label);
    }
    labeled.add (new LabeledSpan (segment, emitted, emitted == backgroundTag));
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, grouping consecutive positions into
 * one segment.
 *
 * <p>A segment starts at the current position and is extended over each following position
 * until one is reached for which {@code isBeginTag} is true or {@code tagsMatch} against the
 * segment's first label is false. For each segment, {@code addBackgroundIfNecessary} is called
 * with the document index reached so far, and then the segment is added to {@code labeled}. If
 * the segment's first label satisfies {@code isBeginTag} or {@code isInsideTag}, it is replaced
 * by {@code trimTag (dict, label)} before being attached. The background flag is computed from
 * the label actually attached, by reference comparison with {@code backgroundTag}.
 *
 * <p>NOTE(review): the helper predicates are defined outside this method; they presumably
 * implement a begin/inside tagging scheme. Confirm against their definitions.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization handed to {@code createSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int first = i;
    Label label = dict.lookupLabel (tags.get (first).toString ());

    // The first position always belongs to the segment; scan forward from the next one.
    i++;
    while (i < tags.size ()) {
      Label following = dict.lookupLabel (tags.get (i).toString ());
      boolean continues = !isBeginTag (following) && tagsMatch (label, following);
      if (!continues) {
        break;
      }
      i++;
    }

    StringSpan segment = (StringSpan) createSpan (input, first, i);
    addBackgroundIfNecessary (labeled, segment, docidx, backgroundTag);
    docidx = segment.getEndIdx ();

    Label emitted = label;
    if (isBeginTag (label) || isInsideTag (label)) {
      emitted = trimTag (dict, label);
    }
    labeled.add (new LabeledSpan (segment, emitted, emitted == backgroundTag));
  }
}
/**
 * Converts a per-token tag sequence into labeled spans, grouping consecutive positions into
 * one segment.
 *
 * <p>A segment starts at the current position and is extended over each following position
 * until one is reached for which {@code isBeginTag} is true or {@code tagsMatch} against the
 * segment's first label is false. For each segment, {@code addBackgroundIfNecessary} is called
 * with the document index reached so far, and then the segment is added to {@code labeled}. If
 * the segment's first label satisfies {@code isBeginTag} or {@code isInsideTag}, it is replaced
 * by {@code trimTag (dict, label)} before being attached. The background flag is computed from
 * the label actually attached, by reference comparison with {@code backgroundTag}.
 *
 * <p>NOTE(review): the helper predicates are defined outside this method; they presumably
 * implement a begin/inside tagging scheme. Confirm against their definitions.
 *
 * @param labeled       collection that receives the spans
 * @param input         tokenization handed to {@code createSpan}
 * @param tags          tag sequence; each element's {@code toString ()} is looked up in
 *                      {@code dict}
 * @param dict          alphabet used to resolve tag names to labels
 * @param backgroundTag the background label
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int docidx = 0;
  int i = 0;
  while (i < tags.size ()) {
    final int first = i;
    Label label = dict.lookupLabel (tags.get (first).toString ());

    // The first position always belongs to the segment; scan forward from the next one.
    i++;
    while (i < tags.size ()) {
      Label following = dict.lookupLabel (tags.get (i).toString ());
      boolean continues = !isBeginTag (following) && tagsMatch (label, following);
      if (!continues) {
        break;
      }
      i++;
    }

    StringSpan segment = (StringSpan) createSpan (input, first, i);
    addBackgroundIfNecessary (labeled, segment, docidx, backgroundTag);
    docidx = segment.getEndIdx ();

    Label emitted = label;
    if (isBeginTag (label) || isInsideTag (label)) {
      emitted = trimTag (dict, label);
    }
    labeled.add (new LabeledSpan (segment, emitted, emitted == backgroundTag));
  }
}
/**
 * Checks that {@code DocumentExtraction.toXmlString ()} nests XML elements correctly when a
 * labeled span lies inside another labeled span.
 *
 * <p>Spans are first built from a flat tag sequence via
 * {@code DefaultTokenizationFilter.constructLabeledSpans}; two single-token spans (MAMMAL over
 * token 3, ADJ over token 7) are then added on top, and the serialized output is compared
 * with the expected nested markup.
 */
public void testNestedToXML ()
{
  LabelAlphabet dict = new LabelAlphabet ();
  String document = "the quick brown fox leapt over the lazy dog";
  StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ());

  // Lookup order is kept fixed: it determines the order in which labels enter the alphabet.
  Label outside = dict.lookupLabel ("O");
  Label animal = dict.lookupLabel ("ANIMAL");
  Label verb = dict.lookupLabel ("VERB");
  Label adjective = dict.lookupLabel ("ADJ");
  Label mammal = dict.lookupLabel ("MAMMAL");

  // One tag per token of the document.
  Label[] perToken = new Label[] {
    outside, animal, animal, animal, verb, outside, animal, animal, animal
  };
  LabelSequence tags = new LabelSequence (perToken);

  DefaultTokenizationFilter filter = new DefaultTokenizationFilter ();
  LabeledSpans spans = filter.constructLabeledSpans (dict, document, outside, toks, tags);

  // Nested annotations: token 3 inside the first ANIMAL run, token 7 inside the second.
  Span foxToken = toks.subspan (3, 4);
  spans.add (new LabeledSpan (foxToken, mammal, false));
  Span lazyToken = toks.subspan (7, 8);
  spans.add (new LabeledSpan (lazyToken, adjective, false));

  DocumentExtraction extraction = new DocumentExtraction ("Test", dict, toks, spans, null, "O");
  String actualXml = extraction.toXmlString ();

  String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
      + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
}
/**
 * Checks that {@code DocumentExtraction.toXmlString ()} nests XML elements correctly when a
 * labeled span lies inside another labeled span.
 *
 * <p>Spans are first built from a flat tag sequence via
 * {@code DefaultTokenizationFilter.constructLabeledSpans}; two single-token spans (MAMMAL over
 * token 3, ADJ over token 7) are then added on top, and the serialized output is compared
 * with the expected nested markup.
 */
public void testNestedToXML ()
{
  LabelAlphabet dict = new LabelAlphabet ();
  String document = "the quick brown fox leapt over the lazy dog";
  StringTokenization toks = new StringTokenization (document, new CharSequenceLexer ());

  // Lookup order is kept fixed: it determines the order in which labels enter the alphabet.
  Label outside = dict.lookupLabel ("O");
  Label animal = dict.lookupLabel ("ANIMAL");
  Label verb = dict.lookupLabel ("VERB");
  Label adjective = dict.lookupLabel ("ADJ");
  Label mammal = dict.lookupLabel ("MAMMAL");

  // One tag per token of the document.
  Label[] perToken = new Label[] {
    outside, animal, animal, animal, verb, outside, animal, animal, animal
  };
  LabelSequence tags = new LabelSequence (perToken);

  DefaultTokenizationFilter filter = new DefaultTokenizationFilter ();
  LabeledSpans spans = filter.constructLabeledSpans (dict, document, outside, toks, tags);

  // Nested annotations: token 3 inside the first ANIMAL run, token 7 inside the second.
  Span foxToken = toks.subspan (3, 4);
  spans.add (new LabeledSpan (foxToken, mammal, false));
  Span lazyToken = toks.subspan (7, 8);
  spans.add (new LabeledSpan (lazyToken, adjective, false));

  DocumentExtraction extraction = new DocumentExtraction ("Test", dict, toks, spans, null, "O");
  String actualXml = extraction.toXmlString ();

  String expectedXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n"
      + "<doc>the <ANIMAL>quick brown <MAMMAL>fox </MAMMAL></ANIMAL><VERB>leapt </VERB>over <ANIMAL>the <ADJ>lazy </ADJ>dog</ANIMAL></doc>\r\n";
  assertEquals (expectedXml, actualXml);
}