/**
 * Fills any uncovered character gap [docidx, start of span) with a span
 * labeled as background text. Does nothing when there is no gap.
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // no gap before this span
  }
  Span gap = new StringSpan ((CharSequence) span.getDocument (), docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Creates a span covering the character region [start, end) of {@code doc}.
 * The text handed to the superclass is produced by {@code constructTokenText};
 * presumably a substring of doc over [start, end) — confirm in that helper.
 */
public StringSpan (CharSequence doc, int start, int end) {
  super (constructTokenText (doc, start, end));
  this.document = doc;
  this.start = start;
  this.end = end;
}
/**
 * Returns a character span covering tokens [firstToken, lastToken).
 * When lastToken exceeds the token count, the span runs to the end
 * of the document instead of indexing past the list.
 */
public Span subspan (int firstToken, int lastToken)
{
  int startIdx = ((StringSpan) get (firstToken)).getStartIdx ();
  int endIdx = (lastToken > size ())
      ? document.length ()
      : ((StringSpan) get (lastToken - 1)).getEndIdx ();
  return new StringSpan (document, startIdx, endIdx);
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), Double.parseDouble(m.group(2))); } else { span.setFeatureValue (toks[j].intern (), 1.0); span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
/**
 * Converts a per-token tag sequence into labeled character spans.
 * Runs of consecutive tokens carrying the same label are merged into one
 * span; character gaps between spans (e.g. inter-token whitespace) are
 * emitted as background-labeled spans via addBackgroundIfNecessary.
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int i = 0;
  int docidx = 0;  // character index up to which the document is already covered
  while (i < tags.size()) {
    Label thisTag = dict.lookupLabel (tags.get(i).toString());
    int startTokenIdx = i;
    // Advance i past the run of tokens sharing thisTag.
    // Identity comparison (!=) assumes lookupLabel interns labels, returning
    // the same Label instance for equal strings — TODO confirm in LabelAlphabet.
    while (i < tags.size()) {
      Label nextTag = dict.lookupLabel (tags.get(i).toString ());
      if (thisTag != nextTag) break;
      i++;
    }
    int endTokenIdx = i;
    Span span = input.subspan(startTokenIdx, endTokenIdx);
    // Cover any gap between the previous span's end and this span's start.
    addBackgroundIfNecessary (labeled, (StringSpan) span, docidx, backgroundTag);
    docidx = ((StringSpan) span).getEndIdx ();
    labeled.add (new LabeledSpan (span, thisTag, thisTag == backgroundTag));
  }
}
/**
 * Creates a span covering tokens [startTokenIdx, endTokenIdx) of the input,
 * recording the first and last (inclusive) token indices as span properties.
 */
protected Span createSpan (Tokenization input, int startTokenIdx, int endTokenIdx)
{
  StringSpan span = (StringSpan) input.subspan (startTokenIdx, endTokenIdx);
  // Integer.valueOf instead of the deprecated Integer constructor:
  // same boxed value, uses the small-integer cache, no forced allocation.
  span.setProperty ("StartTokenIdx", Integer.valueOf (startTokenIdx));
  span.setProperty ("EndTokenIdx", Integer.valueOf (endTokenIdx - 1));
  return span;
}
/**
 * Rebuilds the data tokenization with single-space spans removed, labeling
 * every surviving span "start" (preceded by a space, or first in the
 * document) or "notstart". The concatenated non-space text becomes the
 * instance source; the label sequence becomes the target when target
 * processing is enabled.
 */
public Instance pipe (Instance carrier)
{
  StringTokenization input = (StringTokenization) carrier.getData ();
  StringTokenization filtered = new StringTokenization ((CharSequence) input.getDocument ());
  final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet ();
  LabelSequence labels = new LabelSequence (dict);
  // Register both labels up front; presumably this fixes their positions
  // in the alphabet — the locals themselves are otherwise unused.
  Label start = dict.lookupLabel ("start");
  Label notstart = dict.lookupLabel ("notstart");
  StringBuffer text = new StringBuffer ();
  boolean precededBySpace = true;  // the document start counts as a boundary
  for (int idx = 0; idx < input.size (); idx++) {
    StringSpan span = (StringSpan) input.getSpan (idx);
    if (span.getText ().equals (" ")) {
      precededBySpace = true;
      continue;
    }
    text.append (span.getText ());
    filtered.add (span);
    labels.add (precededBySpace ? "start" : "notstart");
    precededBySpace = false;
  }
  if (isTargetProcessing ())
    carrier.setTarget (labels);
  carrier.setData (filtered);
  carrier.setSource (text.toString ());
  return carrier;
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), Double.parseDouble(m.group(2))); } else { span.setFeatureValue (toks[j].intern (), 1.0); span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);
/**
 * Converts a per-token tag sequence into labeled character spans.
 * Runs of consecutive tokens carrying the same label are merged into one
 * span; character gaps between spans (e.g. inter-token whitespace) are
 * emitted as background-labeled spans via addBackgroundIfNecessary.
 */
private void addSpansFromTags (LabeledSpans labeled, Tokenization input, Sequence tags, LabelAlphabet dict, Label backgroundTag)
{
  int i = 0;
  int docidx = 0;  // character index up to which the document is already covered
  while (i < tags.size()) {
    Label thisTag = dict.lookupLabel (tags.get(i).toString());
    int startTokenIdx = i;
    // Advance i past the run of tokens sharing thisTag.
    // Identity comparison (!=) assumes lookupLabel interns labels, returning
    // the same Label instance for equal strings — TODO confirm in LabelAlphabet.
    while (i < tags.size()) {
      Label nextTag = dict.lookupLabel (tags.get(i).toString ());
      if (thisTag != nextTag) break;
      i++;
    }
    int endTokenIdx = i;
    Span span = input.subspan(startTokenIdx, endTokenIdx);
    // Cover any gap between the previous span's end and this span's start.
    addBackgroundIfNecessary (labeled, (StringSpan) span, docidx, backgroundTag);
    docidx = ((StringSpan) span).getEndIdx ();
    labeled.add (new LabeledSpan (span, thisTag, thisTag == backgroundTag));
  }
}
/**
 * Creates a span covering tokens [startTokenIdx, endTokenIdx) of the input,
 * recording the first and last (inclusive) token indices as span properties.
 */
protected Span createSpan (Tokenization input, int startTokenIdx, int endTokenIdx)
{
  StringSpan span = (StringSpan) input.subspan (startTokenIdx, endTokenIdx);
  // Integer.valueOf instead of the deprecated Integer constructor:
  // same boxed value, uses the small-integer cache, no forced allocation.
  span.setProperty ("StartTokenIdx", Integer.valueOf (startTokenIdx));
  span.setProperty ("EndTokenIdx", Integer.valueOf (endTokenIdx - 1));
  return span;
}
/**
 * Rebuilds the data tokenization with single-space spans removed, labeling
 * every surviving span "start" (preceded by a space, or first in the
 * document) or "notstart". The concatenated non-space text becomes the
 * instance source; the label sequence becomes the target when target
 * processing is enabled.
 */
public Instance pipe (Instance carrier)
{
  StringTokenization input = (StringTokenization) carrier.getData ();
  StringTokenization filtered = new StringTokenization ((CharSequence) input.getDocument ());
  final LabelAlphabet dict = (LabelAlphabet) getTargetAlphabet ();
  LabelSequence labels = new LabelSequence (dict);
  // Register both labels up front; presumably this fixes their positions
  // in the alphabet — the locals themselves are otherwise unused.
  Label start = dict.lookupLabel ("start");
  Label notstart = dict.lookupLabel ("notstart");
  StringBuffer text = new StringBuffer ();
  boolean precededBySpace = true;  // the document start counts as a boundary
  for (int idx = 0; idx < input.size (); idx++) {
    StringSpan span = (StringSpan) input.getSpan (idx);
    if (span.getText ().equals (" ")) {
      precededBySpace = true;
      continue;
    }
    text.append (span.getText ());
    filtered.add (span);
    labels.add (precededBySpace ? "start" : "notstart");
    precededBySpace = false;
  }
  if (isTargetProcessing ())
    carrier.setTarget (labels);
  carrier.setData (filtered);
  carrier.setSource (text.toString ());
  return carrier;
}
/**
 * Fills any uncovered character gap [docidx, start of span) with a span
 * labeled as background text. Does nothing when there is no gap.
 */
private void addBackgroundIfNecessary (LabeledSpans labeled, StringSpan span, int docidx, Label background)
{
  int spanStart = span.getStartIdx ();
  if (docidx >= spanStart) {
    return; // no gap before this span
  }
  Span gap = new StringSpan ((CharSequence) span.getDocument (), docidx, spanStart);
  labeled.add (new LabeledSpan (gap, background, true));
}
/**
 * Returns a character span covering tokens [firstToken, lastToken).
 * When lastToken exceeds the token count, the span runs to the end
 * of the document instead of indexing past the list.
 */
public Span subspan (int firstToken, int lastToken)
{
  int startIdx = ((StringSpan) get (firstToken)).getStartIdx ();
  int endIdx = (lastToken > size ())
      ? document.length ()
      : ((StringSpan) get (lastToken - 1)).getEndIdx ();
  return new StringSpan (document, startIdx, endIdx);
}
public Instance pipe (Instance carrier) { Object data = carrier.getData (); if (data instanceof Tokenization) { // we're done } else if (data instanceof TokenSequence) { StringBuffer buf = new StringBuffer (); TokenSequence ts = (TokenSequence) data; StringTokenization spans = new StringTokenization (buf); // I can use a StringBuffer as the doc! Awesome! for (int i = 0; i < ts.size(); i++) { Token token = ts.get(i); int start = buf.length (); buf.append (token.getText()); int end = buf.length(); StringSpan span = new StringSpan (buf, start, end); span.setFeatures (token.getFeatures ()); span.setProperties (token.getProperties ()); spans.add (span); buf.append (" "); } carrier.setData (spans); } else { throw new IllegalArgumentException ("Can't convert "+data+" to Tokenization."); } return carrier; }
/**
 * Returns the character-index overlap of this span with {@code r}.
 * NOTE(review): if the two spans are disjoint the resulting start exceeds
 * the end; callers are presumably expected to intersect only overlapping
 * spans — confirm at call sites.
 */
public Span intersection (Span r)
{
  StringSpan that = (StringSpan) r;
  int lo = (start > that.start) ? start : that.start;
  int hi = (end < that.end) ? end : that.end;
  return new StringSpan (document, lo, hi);
}
buf.append (" "); StringSpan span = new StringSpan (buf, start, end); span.setFeatureValue (toks[j].intern (), 1.0); j++; span.setFeatureValue ((textFeaturePrefix+text).intern(), 1.0);