org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder java code examples

/**
 * Records a value, supplied as raw bytes, into a vector using a weight of 1.0.
 *
 * @param originalForm The value to encode, in its original byte array form.
 * @param data         The vector that receives the encoded value.
 */
public void addToVector(byte[] originalForm, Vector data) {
 // Delegate to the weighted overload with the neutral (unit) weight.
 final double unitWeight = 1.0;
 addToVector(originalForm, unitWeight, data);
}

/**
 * Adds a weighted value expressed in byte array form to a vector.  Concrete encoders
 * supply the implementation.
 *
 * @param originalForm The original form of the value as a byte array.
 * @param weight       The weight to apply to the value.
 * @param data         The vector to which the value should be added.
 */
public abstract void addToVector(byte[] originalForm, double weight, Vector data);

/**
 * Computes the weight of an interaction: the weight the first encoder reports for its value
 * (at unit weight), times the weight the second encoder reports for its value (at unit
 * weight), times the supplied scale.
 *
 * @param originalForm1 The original form of the first value as a byte array.
 * @param originalForm2 The original form of the second value as a byte array.
 * @param w             The scale applied to the product of the two encoder weights.
 * @return The combined interaction weight.
 */
protected double getWeight(byte[] originalForm1, byte[] originalForm2, double w) {
 double firstWeight = firstEncoder.getWeight(originalForm1, 1.0);
 double secondWeight = secondEncoder.getWeight(originalForm2, 1.0);
 return firstWeight * secondWeight * w;
}

// Excerpt from a test of ContinuousValueEncoder: the assertions expect the numeric value of
// the string passed to addToVector to be the amount added to the vector.
// NOTE(review): v2 and v3 are never declared in this excerpt, and the assertions on v1 after
// the second add only hold for a fresh vector — presumably lines re-creating the vectors
// were elided; confirm against the full test.
FeatureVectorEncoder enc = new ContinuousValueEncoder("foo");
Vector v1 = new DenseVector(20);
enc.addToVector("-123", v1);
assertEquals(-123, v1.minValue(), 0);
assertEquals(0, v1.maxValue(), 0);
enc.addToVector("123", v1);
assertEquals(123, v1.maxValue(), 0);
assertEquals(0, v1.minValue(), 0);
// With two probes the value is expected in two locations, so the L1 norm is 2 * 123.
enc.setProbes(2);
enc.addToVector("123", v2);
assertEquals(123, v2.maxValue(), 0);
assertEquals(2 * 123, v2.norm(1), 0);
enc.setProbes(2);
enc.addToVector("100", v3);
// The difference of the two encodings is expected to hold 123 - 100 = 23 in each location.
v1 = v2.minus(v3);
assertEquals(23, v1.maxValue(), 0);
assertEquals(2 * 23, v1.norm(1), 0);
// Adding 7 on top of the difference is expected to raise each location to 30.
enc.addToVector("7", v1);
assertEquals(30, v1.maxValue(), 0);
assertEquals(2 * 30, v1.norm(1), 0);
enc.setProbes(6);
// A null original form with weight 145 is expected to encode identically to the string "145".
enc.addToVector("145", v2);
enc.addToVector((byte[]) null, 145, v3);
assertEquals(0, v2.minus(v3).norm(1), 0);

// Excerpt: sets up encoders that all record into the same trace dictionary.
// NOTE(review): `encoder` and `traceDictionary` are declared outside this excerpt.
encoder.setProbes(2);
encoder.setTraceDictionary(traceDictionary);
// Constant-value encoders named "Intercept" and "Lines".
FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
bias.setTraceDictionary(traceDictionary);
FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
lines.setTraceDictionary(traceDictionary);
Dictionary newsGroups = new Dictionary();

/**
 * Renders a value in a human-readable form that shows how it is interpreted: a bracketed,
 * comma-separated list holding the word encoder's rendering of each token.
 *
 * @param originalForm The original form of the value as a string.
 * @return A string that a human can read.
 */
@Override
public String asString(String originalForm) {
 StringBuilder rendered = new StringBuilder("[");
 for (String token : tokenize(originalForm)) {
  // A length of 1 means only the opening bracket has been written so far.
  boolean hasContent = rendered.length() > 1;
  if (hasContent) {
   rendered.append(", ");
  }
  rendered.append(wordEncoder.asString(token));
 }
 return rendered.append(']').toString();
}

/**
 * Encodes the interaction of two values into a vector.  For every probe, each pairing of a
 * hash from the first encoder with a hash from the second encoder selects one location, and
 * the interaction weight is added at that location.
 *
 * @param originalForm1 The original form of the first value as a byte array.
 * @param originalForm2 The original form of the second value as a byte array.
 * @param weight        How much to weight this interaction
 * @param data          The vector to which the value should be added.
 */
public void addInteractionToVector(byte[] originalForm1, byte[] originalForm2, double weight, Vector data) {
 String name = getName();
 double increment = getWeight(originalForm1, originalForm2, weight);
 for (int probe = 0; probe < probes(); probe++) {
  // Each encoder cycles through its own probe count.
  Iterable<Integer> secondHashes =
    secondEncoder.hashesForProbe(originalForm2, data.size(), name, probe % secondEncoder.getProbes());
  Iterable<Integer> firstHashes =
    firstEncoder.hashesForProbe(originalForm1, data.size(), name, probe % firstEncoder.getProbes());
  for (Integer firstHash : firstHashes) {
   for (Integer secondHash : secondHashes) {
    // Combine both hashes and wrap the sum back into the vector's index range.
    int slot = (firstHash + secondHash) % data.size();
    if (isTraceEnabled()) {
     String first = new String(originalForm1, Charsets.UTF_8);
     String second = new String(originalForm2, Charsets.UTF_8);
     trace(String.format("%s:%s", first, second), slot);
    }
    data.set(slot, data.get(slot) + increment);
   }
  }
 }
}

/**
 * Maps a string plus an integer probe to a feature index in [0..numFeatures-1].
 *
 * @param term        The string.
 * @param probe       An integer that modifies the resulting hash.
 * @param numFeatures The range into which the resulting hash must fit.
 * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in
 *         term and probe.
 */
protected int hash(String term, int probe, int numFeatures) {
 long remainder = MurmurHash.hash64A(bytesForString(term), probe) % numFeatures;
 // Java's % keeps the sign of the dividend, so shift negative remainders up into range.
 return (int) (remainder < 0 ? remainder + numFeatures : remainder);
}

/**
 * Sets the number of locations in the feature vector that a value should be in.
 * This causes the cached probe locations to be recomputed.
 *
 * @param probes Number of locations to increment.
 */
@Override
public void setProbes(int probes) {
 // Update the probe count in the superclass first, then rebuild the cache from the current seed.
 super.setProbes(probes);
 cacheProbeLocations(getSeed());
}

 FeatureVectorEncoder encoder = constructor.newInstance(name);
 predictorEncoders.put(predictor, encoder);
 encoder.setTraceDictionary(traceDictionary);
} catch (InstantiationException e) {
 throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);

/**
 * Traces a sub-name given as bytes by decoding it as UTF-8 and delegating to the
 * String-based trace method.
 *
 * @param subName The sub-name as a UTF-8 encoded byte array.
 * @param n       The integer passed through with the trace entry.
 */
protected void trace(byte[] subName, int n) {
 String decodedName = new String(subName, Charsets.UTF_8);
 trace(decodedName, n);
}

/**
 * Returns every hash for the given probe.  This implementation wraps the single value from
 * hashForProbe in a one-element list; text encoders return many hashes, one for each word
 * (unique or not).  Most implementations should only implement hashForProbe for simplicity.
 *
 * @param originalForm The original byte array value.
 * @param dataSize     The length of the vector being encoded
 * @param name         The name of the variable being encoded
 * @param probe        The probe number
 * @return an Iterable of the hashes
 */
protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
 Integer onlyHash = hashForProbe(originalForm, dataSize, name, probe);
 return Collections.singletonList(onlyHash);
}

// Excerpt from a test of ConstantValueEncoder: the original form is null and the assertions
// expect the explicit weight argument to be the amount added to the vector.
// NOTE(review): v2 and v3 are never declared in this excerpt, and the assertions on v1 after
// the second add only hold for a fresh vector — presumably lines re-creating the vectors
// were elided; confirm against the full test.
FeatureVectorEncoder enc = new ConstantValueEncoder("foo");
Vector v1 = new DenseVector(20);
enc.addToVector((byte[]) null, -123, v1);
assertEquals(-123, v1.minValue(), 0);
assertEquals(0, v1.maxValue(), 0);
enc.addToVector((byte[]) null, 123, v1);
assertEquals(123, v1.maxValue(), 0);
assertEquals(0, v1.minValue(), 0);
// With two probes the weight is expected in two locations, so the L1 norm is 2 * 123.
enc.setProbes(2);
enc.addToVector((byte[]) null, 123, v2);
assertEquals(123, v2.maxValue(), 0);
assertEquals(2 * 123, v2.norm(1), 0);
enc.setProbes(2);
enc.addToVector((byte[]) null, 100, v3);
// The difference of the two encodings is expected to hold 123 - 100 = 23 in each location.
v1 = v2.minus(v3);
assertEquals(23, v1.maxValue(), 0);
assertEquals(2 * 23, v1.norm(1), 0);
// Adding 7 on top of the difference is expected to raise each location to 30.
enc.addToVector((byte[]) null, 7, v1);
assertEquals(30, v1.maxValue(), 0);
assertEquals(2 * 30, v1.norm(1), 0);

/**
 * Renders a value in a human-readable form that shows how it is interpreted: a bracketed,
 * comma-separated list holding the word encoder's rendering of each token.
 *
 * @param originalForm The original form of the value as a string.
 * @return A string that a human can read.
 */
@Override
public String asString(String originalForm) {
 StringBuilder rendered = new StringBuilder("[");
 for (String token : tokenize(originalForm)) {
  // A length of 1 means only the opening bracket has been written so far.
  boolean hasContent = rendered.length() > 1;
  if (hasContent) {
   rendered.append(", ");
  }
  rendered.append(wordEncoder.asString(token));
 }
 return rendered.append(']').toString();
}

/**
 * Encodes the interaction of two values into a vector.  For every probe, each pairing of a
 * hash from the first encoder with a hash from the second encoder selects one location, and
 * the interaction weight is added at that location.
 *
 * @param originalForm1 The original form of the first value as a byte array.
 * @param originalForm2 The original form of the second value as a byte array.
 * @param weight        How much to weight this interaction
 * @param data          The vector to which the value should be added.
 */
public void addInteractionToVector(byte[] originalForm1, byte[] originalForm2, double weight, Vector data) {
 String name = getName();
 double increment = getWeight(originalForm1, originalForm2, weight);
 for (int probe = 0; probe < probes(); probe++) {
  // Each encoder cycles through its own probe count.
  Iterable<Integer> secondHashes =
    secondEncoder.hashesForProbe(originalForm2, data.size(), name, probe % secondEncoder.getProbes());
  Iterable<Integer> firstHashes =
    firstEncoder.hashesForProbe(originalForm1, data.size(), name, probe % firstEncoder.getProbes());
  for (Integer firstHash : firstHashes) {
   for (Integer secondHash : secondHashes) {
    // Combine both hashes and wrap the sum back into the vector's index range.
    int slot = (firstHash + secondHash) % data.size();
    if (isTraceEnabled()) {
     String first = new String(originalForm1, Charsets.UTF_8);
     String second = new String(originalForm2, Charsets.UTF_8);
     trace(String.format("%s:%s", first, second), slot);
    }
    data.set(slot, data.get(slot) + increment);
   }
  }
 }
}

/**
 * Maps a string plus an integer probe to a feature index in [0..numFeatures-1].
 *
 * @param term        The string.
 * @param probe       An integer that modifies the resulting hash.
 * @param numFeatures The range into which the resulting hash must fit.
 * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in
 *         term and probe.
 */
protected int hash(String term, int probe, int numFeatures) {
 long remainder = MurmurHash.hash64A(bytesForString(term), probe) % numFeatures;
 // Java's % keeps the sign of the dividend, so shift negative remainders up into range.
 return (int) (remainder < 0 ? remainder + numFeatures : remainder);
}

/**
 * Sets the number of locations in the feature vector that a value should be in.
 * This causes the cached probe locations to be recomputed.
 *
 * @param probes Number of locations to increment.
 */
@Override
public void setProbes(int probes) {
 // Update the probe count in the superclass first, then rebuild the cache from the current seed.
 super.setProbes(probes);
 cacheProbeLocations(getSeed());
}

 FeatureVectorEncoder encoder = constructor.newInstance(name);
 predictorEncoders.put(predictor, encoder);
 encoder.setTraceDictionary(traceDictionary);
} catch (InstantiationException e) {
 throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);

/**
 * Traces a sub-name given as bytes by decoding it as UTF-8 and delegating to the
 * String-based trace method.
 *
 * @param subName The sub-name as a UTF-8 encoded byte array.
 * @param n       The integer passed through with the trace entry.
 */
protected void trace(byte[] subName, int n) {
 String decodedName = new String(subName, Charsets.UTF_8);
 trace(decodedName, n);
}

/**
 * Returns every hash for the given probe.  This implementation wraps the single value from
 * hashForProbe in a one-element list; text encoders return many hashes, one for each word
 * (unique or not).  Most implementations should only implement hashForProbe for simplicity.
 *
 * @param originalForm The original byte array value.
 * @param dataSize     The length of the vector being encoded
 * @param name         The name of the variable being encoded
 * @param probe        The probe number
 * @return an Iterable of the hashes
 */
protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
 Integer onlyHash = hashForProbe(originalForm, dataSize, name, probe);
 return Collections.singletonList(onlyHash);
}

Javadoc

General interface for objects that record features into a feature vector.

By convention, sub-classes should provide a constructor that accepts just a field name as well as setters to customize properties of the conversion such as adding tokenizers or a weight dictionary.

Most used methods

addToVector
Adds a value expressed in byte array form to a vector.
setProbes
Sets the number of locations in the feature vector that a value should be in.
asString
Converts a value into a form that would help a human understand the internals of how the value is being interpreted.
setTraceDictionary
bytesForString
getProbes
getWeight
hashForProbe
Provides the unique hash for a particular probe. For all encoders except text, this is all that is n
hashesForProbe
Returns all of the hashes for this probe. For most encoders, this is a singleton, but for text, many hashes are returned, one for each word (unique or not).
trace

Popular in Java

Making http requests using okhttp
addToBackStack (FragmentTransaction)
setContentView (Activity)
onCreateOptionsMenu (Activity)
FileNotFoundException (java.io)
Thrown when a file specified by a program cannot be found.
Random (java.util)
This class provides methods that return pseudo-random values. It is dangerous to seed Random with the
Set (java.util)
A Set is a data structure which does not allow duplicate elements.
Executors (java.util.concurrent)
Factory and utility methods for Executor, ExecutorService, ScheduledExecutorService, ThreadFactory,
Handler (java.util.logging)
A Handler object accepts a logging request and exports the desired messages to a target, for example
JButton (javax.swing)
Github Copilot alternatives

How to use FeatureVectorEncoder in org.apache.mahout.vectorizer.encoders

Best Java code snippets using org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder (Showing top 20 results out of 315)

How to use
FeatureVectorEncoder
in
org.apache.mahout.vectorizer.encoders