/**
 * Fills any uncovered character gap [docidx, start of span) with a span
 * labeled as background text. Does nothing when there is no gap.
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // no gap before this span
  }
  Span gap = new StringSpan ((CharSequence) span.getDocument (), docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Creates a span covering the character region [start, end) of {@code doc}.
 * The text handed to the superclass is produced by {@code constructTokenText};
 * presumably a substring of doc over [start, end) — confirm in that helper.
 */
public StringSpan (CharSequence doc, int start, int end) {
  super (constructTokenText (doc, start, end));
  this.document = doc;
  this.start = start;
  this.end = end;
}
/**
 * Returns a character span covering tokens [firstToken, lastToken).
 * When lastToken exceeds the token count, the span runs to the end
 * of the document instead of indexing past the list.
 */
public Span subspan (int firstToken, int lastToken)
{
  int startIdx = ((StringSpan) get (firstToken)).getStartIdx ();
  int endIdx = (lastToken > size ())
      ? document.length ()
      : ((StringSpan) get (lastToken - 1)).getEndIdx ();
  return new StringSpan (document, startIdx, endIdx);
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), Double.parseDouble(m.group(2))); } else { span.setFeatureValue (toks[j].intern (), 1.0); span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
/**
 * Converts a per-token tag sequence into labeled character spans.
 * Runs of consecutive tokens carrying the same label are merged into one
 * span; character gaps between spans (e.g. inter-token whitespace) are
 * emitted as background-labeled spans via addBackgroundIfNecessary.
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int i = 0;
  int docidx = 0;  // character index up to which the document is already covered
  while (i < tags.size()) {
    Label thisTag = dict.lookupLabel (tags.get(i).toString());
    int startTokenIdx = i;
    // Advance i past the run of tokens sharing thisTag.
    // Identity comparison (!=) assumes lookupLabel interns labels, returning
    // the same Label instance for equal strings — TODO confirm in LabelAlphabet.
    while (i < tags.size()) {
      Label nextTag = dict.lookupLabel (tags.get(i).toString ());
      if (thisTag != nextTag) break;
      i++;
    }
    int endTokenIdx = i;
    Span span = input.subspan(startTokenIdx, endTokenIdx);
    // Cover any gap between the previous span's end and this span's start.
    addBackgroundIfNecessary (labeled, (StringSpan) span, docidx, backgroundTag);
    docidx = ((StringSpan) span).getEndIdx ();
    labeled.add (new LabeledSpan (span, thisTag, thisTag == backgroundTag));
  }
}
/**
 * Creates a span covering tokens [startTokenIdx, endTokenIdx) of the input,
 * recording the first and last (inclusive) token indices as span properties.
 */
protected Span createSpan (Tokenization input, int startTokenIdx, int endTokenIdx)
{
  StringSpan span = (StringSpan) input.subspan (startTokenIdx, endTokenIdx);
  // Integer.valueOf instead of the deprecated Integer constructor:
  // same boxed value, uses the small-integer cache, no forced allocation.
  span.setProperty ("StartTokenIdx", Integer.valueOf (startTokenIdx));
  span.setProperty ("EndTokenIdx", Integer.valueOf (endTokenIdx - 1));
  return span;
}
/**
 * Rebuilds the data tokenization with single-space spans removed, labeling
 * every surviving span "start" (preceded by a space, or first in the
 * document) or "notstart". The concatenated non-space text becomes the
 * instance source; the label sequence becomes the target when target
 * processing is enabled.
 */
public Instance pipe (Instance carrier)
{
  StringTokenization input = (StringTokenization) carrier.getData ();
  StringTokenization filtered = new StringTokenization ((CharSequence) input.getDocument ());
  final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet ();
  LabelSequence labels = new LabelSequence (dict);
  // Register both labels up front; presumably this fixes their positions
  // in the alphabet — the locals themselves are otherwise unused.
  Label start = dict.lookupLabel ("start");
  Label notstart = dict.lookupLabel ("notstart");
  StringBuffer text = new StringBuffer ();
  boolean precededBySpace = true;  // the document start counts as a boundary
  for (int idx = 0; idx < input.size (); idx++) {
    StringSpan span = (StringSpan) input.getSpan (idx);
    if (span.getText ().equals (" ")) {
      precededBySpace = true;
      continue;
    }
    text.append (span.getText ());
    filtered.add (span);
    labels.add (precededBySpace ? "start" : "notstart");
    precededBySpace = false;
  }
  if (isTargetProcessing ())
    carrier.setTarget (labels);
  carrier.setData (filtered);
  carrier.setSource (text.toString ());
  return carrier;
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), Double.parseDouble(m.group(2))); } else { span.setFeatureValue (toks[j].intern (), 1.0); span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
/**
 * Converts a per-token tag sequence into labeled character spans.
 * Runs of consecutive tokens carrying the same label are merged into one
 * span; character gaps between spans (e.g. inter-token whitespace) are
 * emitted as background-labeled spans via addBackgroundIfNecessary.
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int i = 0;
  int docidx = 0;  // character index up to which the document is already covered
  while (i < tags.size()) {
    Label thisTag = dict.lookupLabel (tags.get(i).toString());
    int startTokenIdx = i;
    // Advance i past the run of tokens sharing thisTag.
    // Identity comparison (!=) assumes lookupLabel interns labels, returning
    // the same Label instance for equal strings — TODO confirm in LabelAlphabet.
    while (i < tags.size()) {
      Label nextTag = dict.lookupLabel (tags.get(i).toString ());
      if (thisTag != nextTag) break;
      i++;
    }
    int endTokenIdx = i;
    Span span = input.subspan(startTokenIdx, endTokenIdx);
    // Cover any gap between the previous span's end and this span's start.
    addBackgroundIfNecessary (labeled, (StringSpan) span, docidx, backgroundTag);
    docidx = ((StringSpan) span).getEndIdx ();
    labeled.add (new LabeledSpan (span, thisTag, thisTag == backgroundTag));
  }
}
/**
 * Creates a span covering tokens [startTokenIdx, endTokenIdx) of the input,
 * recording the first and last (inclusive) token indices as span properties.
 */
protected Span createSpan (Tokenization input, int startTokenIdx, int endTokenIdx)
{
  StringSpan span = (StringSpan) input.subspan (startTokenIdx, endTokenIdx);
  // Integer.valueOf instead of the deprecated Integer constructor:
  // same boxed value, uses the small-integer cache, no forced allocation.
  span.setProperty ("StartTokenIdx", Integer.valueOf (startTokenIdx));
  span.setProperty ("EndTokenIdx", Integer.valueOf (endTokenIdx - 1));
  return span;
}
/**
 * Rebuilds the data tokenization with single-space spans removed, labeling
 * every surviving span "start" (preceded by a space, or first in the
 * document) or "notstart". The concatenated non-space text becomes the
 * instance source; the label sequence becomes the target when target
 * processing is enabled.
 */
public Instance pipe (Instance carrier)
{
  StringTokenization input = (StringTokenization) carrier.getData ();
  StringTokenization filtered = new StringTokenization ((CharSequence) input.getDocument ());
  final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet ();
  LabelSequence labels = new LabelSequence (dict);
  // Register both labels up front; presumably this fixes their positions
  // in the alphabet — the locals themselves are otherwise unused.
  Label start = dict.lookupLabel ("start");
  Label notstart = dict.lookupLabel ("notstart");
  StringBuffer text = new StringBuffer ();
  boolean precededBySpace = true;  // the document start counts as a boundary
  for (int idx = 0; idx < input.size (); idx++) {
    StringSpan span = (StringSpan) input.getSpan (idx);
    if (span.getText ().equals (" ")) {
      precededBySpace = true;
      continue;
    }
    text.append (span.getText ());
    filtered.add (span);
    labels.add (precededBySpace ? "start" : "notstart");
    precededBySpace = false;
  }
  if (isTargetProcessing ())
    carrier.setTarget (labels);
  carrier.setData (filtered);
  carrier.setSource (text.toString ());
  return carrier;
}
/**
 * Fills any uncovered character gap [docidx, start of span) with a span
 * labeled as background text. Does nothing when there is no gap.
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // no gap before this span
  }
  Span gap = new StringSpan ((CharSequence) span.getDocument (), docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Returns a character span covering tokens [firstToken, lastToken).
 * When lastToken exceeds the token count, the span runs to the end
 * of the document instead of indexing past the list.
 */
public Span subspan (int firstToken, int lastToken)
{
  int startIdx = ((StringSpan) get (firstToken)).getStartIdx ();
  int endIdx = (lastToken > size ())
      ? document.length ()
      : ((StringSpan) get (lastToken - 1)).getEndIdx ();
  return new StringSpan (document, startIdx, endIdx);
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), 1.0); j++; span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);