/**
 * Creates an {@link ArgumentEncoder} whose value encoder is a {@link LuceneTextValueEncoder}
 * backed by the given analyzer class.
 *
 * <p>The analyzer is instantiated reflectively via its single-argument
 * {@code (Version)} constructor, invoked with {@code Version.LUCENE_31}.</p>
 *
 * @param position      ordinal position of the argument being encoded
 * @param name          name of the argument / encoder field
 * @param analyzerClass fully qualified class name of the Lucene {@link Analyzer} to use
 * @return a configured encoder for the argument
 * @throws SchemaParseException if the analyzer class cannot be found, accessed, or constructed
 */
public static ArgumentEncoder newTextEncoder(int position, String name, String analyzerClass) throws SchemaParseException {
  ArgumentEncoder argEncoder = new ArgumentEncoder(position, name);
  LuceneTextValueEncoder textEncoder = new LuceneTextValueEncoder(name);
  Analyzer analyzer;
  try {
    analyzer = (Analyzer) Class.forName(analyzerClass)
        .getConstructor(Version.class)
        .newInstance(Version.LUCENE_31);
  } catch (ClassNotFoundException e) {
    throw new SchemaParseException("Can't find analyzer class " + analyzerClass, e);
  } catch (NoSuchMethodException e) {
    throw new SchemaParseException("Can't find constructor for analyzer class " + analyzerClass, e);
  } catch (InstantiationException e) {
    throw new SchemaParseException("Can't construct analyzer for class " + analyzerClass, e);
  } catch (IllegalAccessException e) {
    throw new SchemaParseException("Can't access analyzer class " + analyzerClass, e);
  } catch (InvocationTargetException e) {
    // The analyzer's constructor itself threw.
    throw new SchemaParseException("Can't construct analyzer object " + analyzerClass, e);
  }
  textEncoder.setAnalyzer(analyzer);
  argEncoder.encoder = textEncoder;
  return argEncoder;
}
// Verifies LuceneTextValueEncoder behavior with a whitespace analyzer.
// NOTE(review): this method appears truncated/garbled in this view — `builder` and `i`
// are used below without any visible declaration, and no closing brace for the method
// is visible. Compare against the upstream test source before editing further.
@Test public void testLuceneEncoding() throws Exception {
  LuceneTextValueEncoder enc = new LuceneTextValueEncoder("text");
  enc.setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_46));
  Vector v1 = new DenseVector(200);
  // Encode a multi-token string, then an empty string.
  enc.addToVector("test1 and more", v1);
  enc.flush(1, v1);
  enc.addToVector("", v1);
  enc.flush(1, v1);
  // NOTE(review): asserting a zero norm/max here after non-empty input looks suspect —
  // presumably the original test used a fresh vector for the empty-input case; confirm
  // against the complete source.
  assertEquals(0.0, v1.norm(1), 0);
  assertEquals(0.0, v1.maxValue(), 0);
  // NOTE(review): `builder` and `i` are not declared in this visible span — the loop
  // that builds a long token string has likely been cut off by extraction.
  builder.append("token_").append(i).append(' ');
  enc.addToVector(builder.toString(), v1);
  enc.flush(1, v1);
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  CharSequenceReader reader = new CharSequenceReader(originalForm);
  try {
    TokenStream stream = analyzer.tokenStream(getName(), reader);
    // Ensure the term attribute is registered before iteration begins.
    stream.addAttribute(CharTermAttribute.class);
    return new LuceneTokenIterable(stream, false);
  } catch (IOException ioe) {
    // Tokenization failures are not recoverable here; surface as unchecked.
    throw new IllegalStateException(ioe);
  }
}
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  try {
    // Register the term attribute up front so the iterable can read tokens.
    TokenStream tokens = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
    tokens.addAttribute(CharTermAttribute.class);
    return new LuceneTokenIterable(tokens, false);
  } catch (IOException e) {
    // Wrap the checked exception; callers cannot recover from a tokenizer failure.
    throw new IllegalStateException(e);
  }
}
@Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false); namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false); String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName()); Analyzer analyzer; try { analyzer = AnalyzerUtils.createAnalyzer(analyzerName); } catch (ClassNotFoundException e) { //TODO: hmmm, don't like this approach throw new IOException("Unable to create Analyzer for name: " + analyzerName, e); } String encoderName = conf.get(ENCODER_FIELD_NAME, "text"); cardinality = conf.getInt(CARDINALITY, 5000); String encClass = conf.get(ENCODER_CLASS); encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class, new Class[]{String.class}, new Object[]{encoderName}); if (encoder instanceof LuceneTextValueEncoder) { ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer); } }
/**
 * Tokenizes a string using the simplest method. This should be over-ridden for more subtle
 * tokenization.
 */
// NOTE(review): unlike the other tokenize() implementations in this source, this one does
// not catch IOException from tokenStream() — presumably the Lucene version compiled against
// here no longer declares it; confirm before unifying the implementations.
@Override
protected Iterable<String> tokenize(CharSequence originalForm) {
  CharSequenceReader reader = new CharSequenceReader(originalForm);
  TokenStream stream = analyzer.tokenStream(getName(), reader);
  // Register the term attribute before handing the stream to the iterable.
  stream.addAttribute(CharTermAttribute.class);
  return new LuceneTokenIterable(stream, false);
}