/**
 * Encodes a value given as a byte array into a vector using a weight of 1.0.
 *
 * <p>Convenience overload: it forwards to the three-argument
 * {@code addToVector(byte[], double, Vector)} with the weight fixed at 1.0.
 *
 * @param originalForm The original form of the value as a byte array.
 * @param data         The vector to which the value should be added.
 */
public void addToVector(byte[] originalForm, Vector data) {
  final double defaultWeight = 1.0;
  addToVector(originalForm, defaultWeight, data);
}
/**
 * Adds a weighted value expressed in byte array form to a vector. This method is abstract;
 * each concrete encoder supplies the implementation.
 *
 * @param originalForm The original form of the value as a byte array.
 * @param weight       The weight to apply to the value. The two-argument overload passes 1.0.
 * @param data         The vector to which the value should be added.
 */
public abstract void addToVector(byte[] originalForm, double weight, Vector data);
// Exercises ContinuousValueEncoder through the String overload of addToVector on a
// 20-element dense vector.
// NOTE(review): v2 and v3 are used below but not declared in this excerpt, and v1 is
// expected to hold only the latest value after each add -- the surrounding test presumably
// creates fresh vectors between steps; confirm against the full test.
FeatureVectorEncoder enc = new ContinuousValueEncoder("foo");
Vector v1 = new DenseVector(20);

// Adding "-123" is expected to yield a minimum of -123 and a maximum of 0.
enc.addToVector("-123", v1);
assertEquals(-123, v1.minValue(), 0);
assertEquals(0, v1.maxValue(), 0);

// Adding "123" is expected to yield a maximum of 123 and a minimum of 0.
enc.addToVector("123", v1);
assertEquals(123, v1.maxValue(), 0);
assertEquals(0, v1.minValue(), 0);

// With two probes the value is expected in two locations: max 123, L1 norm 2 * 123.
enc.setProbes(2);
enc.addToVector("123", v2);
assertEquals(123, v2.maxValue(), 0);
assertEquals(2 * 123, v2.norm(1), 0);

// Encoding "100" the same way and subtracting is expected to leave 23 in each of the two locations.
enc.setProbes(2);
enc.addToVector("100", v3);
v1 = v2.minus(v3);
assertEquals(23, v1.maxValue(), 0);
assertEquals(2 * 23, v1.norm(1), 0);

// Adding "7" on top of the difference is expected to raise both locations to 30.
enc.addToVector("7", v1);
assertEquals(30, v1.maxValue(), 0);
assertEquals(2 * 30, v1.norm(1), 0);

// With six probes, the string "145" and a null byte array with weight 145 are expected to
// encode identically (the difference has zero L1 norm).
enc.setProbes(6);
enc.addToVector("145", v2);
enc.addToVector((byte[]) null, 145, v3);
assertEquals(0, v2.minus(v3).norm(1), 0);
// Configure the existing encoder (declared outside this excerpt) to use two probes and to
// record its traces in traceDictionary.
encoder.setProbes(2);
encoder.setTraceDictionary(traceDictionary);

// Constant-value encoder named "Intercept", traced into the same dictionary.
FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
bias.setTraceDictionary(traceDictionary);

// Constant-value encoder named "Lines", traced into the same dictionary.
FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
lines.setTraceDictionary(traceDictionary);

// Fresh, empty dictionary; presumably maps newsgroup names to ids -- confirm against its use.
Dictionary newsGroups = new Dictionary();
/**
 * Renders a text value in a form a human can inspect: a bracketed, comma-separated list
 * holding the word encoder's rendering of every token found in the input.
 *
 * @param originalForm The original form of the value as a string.
 * @return A string of the form {@code [t1, t2, ...]} that a human can read.
 */
@Override
public String asString(String originalForm) {
  StringBuilder rendered = new StringBuilder("[");
  for (String token : tokenize(originalForm)) {
    // Anything beyond the opening bracket means earlier output exists and needs a separator.
    boolean needsSeparator = rendered.length() > 1;
    if (needsSeparator) {
      rendered.append(", ");
    }
    rendered.append(wordEncoder.asString(token));
  }
  return rendered.append(']').toString();
}
/**
 * Adds the interaction of two values to a vector.
 *
 * <p>For every probe, each hash produced by the first encoder is combined with each hash
 * produced by the second encoder; the combined index (sum modulo the vector size) is
 * incremented by the interaction weight.
 *
 * @param originalForm1 The original form of the first value as a byte array.
 * @param originalForm2 The original form of the second value as a byte array.
 * @param weight        How much to weight this interaction
 * @param data          The vector to which the value should be added.
 */
public void addInteractionToVector(byte[] originalForm1, byte[] originalForm2, double weight, Vector data) {
  String name = getName();
  double w = getWeight(originalForm1, originalForm2, weight);
  int size = data.size();

  // The trace label depends only on the two inputs, so build it once instead of decoding
  // both byte arrays for every combined hash.
  boolean tracing = isTraceEnabled();
  String traceLabel = tracing
      ? String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2, Charsets.UTF_8))
      : null;

  for (int i = 0; i < probes(); i++) {
    Iterable<Integer> jValues =
        secondEncoder.hashesForProbe(originalForm2, size, name, i % secondEncoder.getProbes());
    for (int k : firstEncoder.hashesForProbe(originalForm1, size, name, i % firstEncoder.getProbes())) {
      for (int j : jValues) {
        // Sum in long: two int hashes can overflow int for very large vectors, which would
        // yield a negative index.
        int n = (int) (((long) k + j) % size);
        if (tracing) {
          trace(traceLabel, n);
        }
        data.set(n, data.get(n) + w);
      }
    }
  }
}
/**
 * Maps a string and a probe number to a bucket in the range [0..numFeatures-1].
 *
 * @param term        The string.
 * @param probe       An integer that modifies the resulting hash.
 * @param numFeatures The range into which the resulting hash must fit.
 * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in
 *         term and probe.
 */
protected int hash(String term, int probe, int numFeatures) {
  long rawHash = MurmurHash.hash64A(bytesForString(term), probe);
  long bucket = rawHash % numFeatures;
  // Java's remainder keeps the sign of the dividend; shift negative results into range.
  return (int) (bucket < 0 ? bucket + numFeatures : bucket);
}
/**
 * Sets the number of locations in the feature vector that a value should be in.
 * This causes the cached probe locations to be recomputed.
 *
 * @param probes Number of locations to increment.
 */
@Override
public void setProbes(int probes) {
  // Store the new probe count first; the cache rebuild below runs after it is in place.
  super.setProbes(probes);
  // Recompute the cached probe locations from this encoder's seed.
  cacheProbeLocations(getSeed());
}
FeatureVectorEncoder encoder = constructor.newInstance(name); predictorEncoders.put(predictor, encoder); encoder.setTraceDictionary(traceDictionary); } catch (InstantiationException e) { throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
/**
 * Records a trace entry for a sub-name supplied as raw bytes. The bytes are decoded as
 * UTF-8 and handed to the String-based {@code trace} overload.
 *
 * @param subName The sub-name as UTF-8 encoded bytes.
 * @param n       Passed through unchanged to the String overload (presumably the vector
 *                location being traced).
 */
protected void trace(byte[] subName, int n) {
  String decodedName = new String(subName, Charsets.UTF_8);
  trace(decodedName, n);
}
/**
 * Returns every hash for the given probe. This default wraps the single value from
 * {@code hashForProbe} in a one-element list; encoders that produce several hashes per
 * probe (for example one per word of a text) override it. Most implementations should only
 * implement hashForProbe for simplicity.
 *
 * @param originalForm The original byte array value.
 * @param dataSize     The length of the vector being encoded
 * @param name         The name of the variable being encoded
 * @param probe        The probe number
 * @return an Iterable of the hashes
 */
protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
  Integer onlyHash = hashForProbe(originalForm, dataSize, name, probe);
  return Collections.singletonList(onlyHash);
}
// Exercises ConstantValueEncoder on a 20-element dense vector. The original form is always
// a null byte array, and the expected values below track the weight argument alone.
// NOTE(review): v2 and v3 are used below but not declared in this excerpt, and v1 is
// expected to hold only the latest value after each add -- the surrounding test presumably
// creates fresh vectors between steps; confirm against the full test.
FeatureVectorEncoder enc = new ConstantValueEncoder("foo");
Vector v1 = new DenseVector(20);

// A weight of -123 is expected to yield a minimum of -123 and a maximum of 0.
enc.addToVector((byte[]) null, -123, v1);
assertEquals(-123, v1.minValue(), 0);
assertEquals(0, v1.maxValue(), 0);

// A weight of 123 is expected to yield a maximum of 123 and a minimum of 0.
enc.addToVector((byte[]) null, 123, v1);
assertEquals(123, v1.maxValue(), 0);
assertEquals(0, v1.minValue(), 0);

// With two probes the weight is expected in two locations: max 123, L1 norm 2 * 123.
enc.setProbes(2);
enc.addToVector((byte[]) null, 123, v2);
assertEquals(123, v2.maxValue(), 0);
assertEquals(2 * 123, v2.norm(1), 0);

// Encoding weight 100 the same way and subtracting is expected to leave 23 in each of the two locations.
enc.setProbes(2);
enc.addToVector((byte[]) null, 100, v3);
v1 = v2.minus(v3);
assertEquals(23, v1.maxValue(), 0);
assertEquals(2 * 23, v1.norm(1), 0);

// Adding weight 7 on top of the difference is expected to raise both locations to 30.
enc.addToVector((byte[]) null, 7, v1);
assertEquals(30, v1.maxValue(), 0);
assertEquals(2 * 30, v1.norm(1), 0);
/**
 * Renders a text value in a form a human can inspect: a bracketed, comma-separated list
 * holding the word encoder's rendering of every token found in the input.
 *
 * @param originalForm The original form of the value as a string.
 * @return A string of the form {@code [t1, t2, ...]} that a human can read.
 */
@Override
public String asString(String originalForm) {
  StringBuilder rendered = new StringBuilder("[");
  for (String token : tokenize(originalForm)) {
    // Anything beyond the opening bracket means earlier output exists and needs a separator.
    boolean needsSeparator = rendered.length() > 1;
    if (needsSeparator) {
      rendered.append(", ");
    }
    rendered.append(wordEncoder.asString(token));
  }
  return rendered.append(']').toString();
}
/**
 * Adds the interaction of two values to a vector.
 *
 * <p>For every probe, each hash produced by the first encoder is combined with each hash
 * produced by the second encoder; the combined index (sum modulo the vector size) is
 * incremented by the interaction weight.
 *
 * @param originalForm1 The original form of the first value as a byte array.
 * @param originalForm2 The original form of the second value as a byte array.
 * @param weight        How much to weight this interaction
 * @param data          The vector to which the value should be added.
 */
public void addInteractionToVector(byte[] originalForm1, byte[] originalForm2, double weight, Vector data) {
  String name = getName();
  double w = getWeight(originalForm1, originalForm2, weight);
  int size = data.size();

  // The trace label depends only on the two inputs, so build it once instead of decoding
  // both byte arrays for every combined hash.
  boolean tracing = isTraceEnabled();
  String traceLabel = tracing
      ? String.format("%s:%s", new String(originalForm1, Charsets.UTF_8), new String(originalForm2, Charsets.UTF_8))
      : null;

  for (int i = 0; i < probes(); i++) {
    Iterable<Integer> jValues =
        secondEncoder.hashesForProbe(originalForm2, size, name, i % secondEncoder.getProbes());
    for (int k : firstEncoder.hashesForProbe(originalForm1, size, name, i % firstEncoder.getProbes())) {
      for (int j : jValues) {
        // Sum in long: two int hashes can overflow int for very large vectors, which would
        // yield a negative index.
        int n = (int) (((long) k + j) % size);
        if (tracing) {
          trace(traceLabel, n);
        }
        data.set(n, data.get(n) + w);
      }
    }
  }
}
/**
 * Maps a string and a probe number to a bucket in the range [0..numFeatures-1].
 *
 * @param term        The string.
 * @param probe       An integer that modifies the resulting hash.
 * @param numFeatures The range into which the resulting hash must fit.
 * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in
 *         term and probe.
 */
protected int hash(String term, int probe, int numFeatures) {
  long rawHash = MurmurHash.hash64A(bytesForString(term), probe);
  long bucket = rawHash % numFeatures;
  // Java's remainder keeps the sign of the dividend; shift negative results into range.
  return (int) (bucket < 0 ? bucket + numFeatures : bucket);
}
/**
 * Sets the number of locations in the feature vector that a value should be in.
 * This causes the cached probe locations to be recomputed.
 *
 * @param probes Number of locations to increment.
 */
@Override
public void setProbes(int probes) {
  // Store the new probe count first; the cache rebuild below runs after it is in place.
  super.setProbes(probes);
  // Recompute the cached probe locations from this encoder's seed.
  cacheProbeLocations(getSeed());
}
FeatureVectorEncoder encoder = constructor.newInstance(name); predictorEncoders.put(predictor, encoder); encoder.setTraceDictionary(traceDictionary); } catch (InstantiationException e) { throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
/**
 * Records a trace entry for a sub-name supplied as raw bytes. The bytes are decoded as
 * UTF-8 and handed to the String-based {@code trace} overload.
 *
 * @param subName The sub-name as UTF-8 encoded bytes.
 * @param n       Passed through unchanged to the String overload (presumably the vector
 *                location being traced).
 */
protected void trace(byte[] subName, int n) {
  String decodedName = new String(subName, Charsets.UTF_8);
  trace(decodedName, n);
}
/**
 * Returns every hash for the given probe. This default wraps the single value from
 * {@code hashForProbe} in a one-element list; encoders that produce several hashes per
 * probe (for example one per word of a text) override it. Most implementations should only
 * implement hashForProbe for simplicity.
 *
 * @param originalForm The original byte array value.
 * @param dataSize     The length of the vector being encoded
 * @param name         The name of the variable being encoded
 * @param probe        The probe number
 * @return an Iterable of the hashes
 */
protected Iterable<Integer> hashesForProbe(byte[] originalForm, int dataSize, String name, int probe) {
  Integer onlyHash = hashForProbe(originalForm, dataSize, name, probe);
  return Collections.singletonList(onlyHash);
}