/**
 * Returns whether the given object is a Datum that contains the same features
 * as this Datum. Labels are not compared.
 * <p>
 * The result is decided solely by {@code features.equals(d.asFeatures())}, so
 * equality with a different Datum implementation may not be symmetric.
 *
 * @param o The object to test equality with
 * @return {@code true} if {@code o} is a Datum whose features equal this Datum's features
 */
@Override
public boolean equals(Object o) {
  if (!(o instanceof Datum)) {
    return false;
  }
  // Only asFeatures() is read, so a wildcard view suffices; this avoids the
  // unchecked cast to the parameterized type and the warning suppression it required.
  Datum<?, ?> d = (Datum<?, ?>) o;
  return features.equals(d.asFeatures());
}
/**
 * Tests whether the given object is a Datum carrying the same features as this one.
 * Labels are not part of the comparison (should we change this?).
 * (CDM Feb 2012: This also doesn't correctly respect the contract for equals,
 * since it gives one-way equality with other Datums.)
 *
 * @param o The object to test equality with
 * @return Whether it is equal to this CRFDatum in terms of features
 */
@Override
public boolean equals(Object o) {
  if (o instanceof Datum) {
    Datum<?, ?> other = (Datum<?, ?>) o;
    return features.equals(other.asFeatures());
  }
  return false;
}
/**
 * Returns the class chosen for the given datum.
 * An {@code RVFDatum} is handed to {@code classOfRVFDatum}; any other datum is
 * classified from the collection returned by {@code asFeatures()}.
 *
 * @param datum The datum to classify
 * @return The label selected for the datum
 */
@Override
public L classOf(Datum<L, F> datum) {
  if (!(datum instanceof RVFDatum<?, ?>)) {
    return classOf(datum.asFeatures());
  }
  RVFDatum<L, F> rvfDatum = (RVFDatum<L, F>) datum;
  return classOfRVFDatum(rvfDatum);
}
/**
 * Adds the given datum by delegating to the two-argument {@code add} with the
 * datum's features and its label.
 *
 * @param d The datum whose features and label are added
 */
@Override
public void add(Datum<L, F> d) {
  this.add(d.asFeatures(), d.label());
}
/**
 * Adds the given datum with a weight by delegating to the three-argument
 * {@code add} with the datum's features, its label and the weight.
 *
 * @param d      The datum whose features and label are added
 * @param weight The weight passed along with the datum
 */
public void add(Datum<L, F> d, float weight) {
  this.add(d.asFeatures(), d.label(), weight);
}
public static <L,L2,F> Datum<L2,F> mapDatum(Datum<L,F> d, Map<L,L2> labelMapping, L2 defaultLabel) { // TODO: How to copy datum? L2 newLabel = labelMapping.get(d.label()); if (newLabel == null) { newLabel = defaultLabel; } if (d instanceof RVFDatum) { return new RVFDatum<>(((RVFDatum<L, F>) d).asFeaturesCounter(), newLabel); } else { return new BasicDatum<>(d.asFeatures(), newLabel); } }
/**
 * Returns the probability of the given example.
 * An {@code RVFDatum} is handed to {@code probabilityOfRVFDatum}; any other datum
 * is evaluated from its feature collection together with its own label.
 *
 * @param example The datum to evaluate
 * @return The probability computed for the example
 */
public double probabilityOf(Datum<L, F> example) {
  if (!(example instanceof RVFDatum<?, ?>)) {
    return probabilityOf(example.asFeatures(), example.label());
  }
  RVFDatum<L, F> rvfExample = (RVFDatum<L, F>) example;
  return probabilityOfRVFDatum(rvfExample);
}
Datum<L,F> datum = labeledDataset.getDatum(i); int labelID = labeledDataset.labelIndex.indexOf(datum.label()); for(F feature : datum.asFeatures()){ if(geFeatureMap.containsKey(feature)){ int geFnum = geFeatureMap.get(feature); for (F feature : datum.asFeatures()) { if (geFeatureMap.containsKey(feature)) { int geFnum = geFeatureMap.get(feature);
/**
 * Returns the score of the Datum for the specified label.
 * Ignores the true label of the Datum.
 * <p>
 * An {@code RVFDatum} is handed to {@code scoreOfRVFDatum}. For any other datum
 * the score is the sum of {@code weight(f, iLabel)} over its features plus the
 * threshold stored for the label.
 *
 * @param example The datum to score
 * @param label   The label to score the datum against
 * @return The score of the datum for {@code label}
 */
public double scoreOf(Datum<L, F> example, L label) {
  if (!(example instanceof RVFDatum<?, ?>)) {
    int labelId = labelIndex.indexOf(label);
    double total = 0.0;
    for (F feature : example.asFeatures()) {
      total += weight(feature, labelId);
    }
    return total + thresholds[labelId];
  }
  return scoreOfRVFDatum((RVFDatum<L, F>) example, label);
}
/**
 * Creates an RVFDatum from the contents of another Datum. Each feature yielded
 * by {@code m.asFeatures()} adds 1.0 to that feature's count, and the label is
 * taken from {@code m.label()}.
 * <p>
 * <i>Implementation note:</i> This constructor allocates its own counter over
 * features, but is only guaranteed correct if the label and feature names are
 * immutable.
 *
 * @param m The Datum to copy.
 */
public RVFDatum(Datum<L, F> m) {
  this.features = new ClassicCounter<>();
  for (F feature : m.asFeatures()) {
    this.features.incrementCount(feature, 1.0);
  }
  setLabel(m.label());
}
@Override public Counter<L> probabilityOf(Datum<L, F> example) { // calculate the feature indices and feature values int[] featureIndices = LogisticUtils.indicesOf(example.asFeatures(), featureIndex); double[] featureValues; if (example instanceof RVFDatum<?, ?>) { Collection<Double> featureValuesCollection = ((RVFDatum<?, ?>) example).asFeaturesCounter().values(); featureValues = LogisticUtils.convertToArray(featureValuesCollection); } else { featureValues = new double[example.asFeatures().size()]; Arrays.fill(featureValues, 1.0); } // calculate probability of each class Counter<L> result = new ClassicCounter<>(); int numClasses = labelIndex.size(); double[] sigmoids = LogisticUtils.calculateSigmoids(weights, featureIndices, featureValues); for (int c = 0; c < numClasses; c++) { L label = labelIndex.get(c); result.incrementCount(label, sigmoids[c]); } return result; }
/**
 * Returns the scores for both classes.
 * An {@code RVFDatum} is handed to {@code scoresOfRVFDatum}. Otherwise the score
 * of the datum's features is stored negated under {@code classes[0]} and as-is
 * under {@code classes[1]}.
 *
 * @param datum The datum to score
 * @return A counter holding one score for each of the two classes
 */
@Override
public Counter<L> scoresOf(Datum<L, F> datum) {
  if (datum instanceof RVFDatum<?, ?>) {
    return scoresOfRVFDatum((RVFDatum<L, F>) datum);
  }
  double score = scoreOf(datum.asFeatures());
  Counter<L> scores = new ClassicCounter<>();
  scores.setCount(classes[0], -score);
  scores.setCount(classes[1], score);
  return scores;
}
/** Construct a counter with keys the labels of the classifier and * values the score (unnormalized log probability) of each class. */ @Override public Counter<L> scoresOf(Datum<L, F> example) { if(example instanceof RVFDatum<?, ?>)return scoresOfRVFDatum((RVFDatum<L,F>)example); Collection<F> feats = example.asFeatures(); int[] features = new int[feats.size()]; int i = 0; for (F f : feats) { int index = featureIndex.indexOf(f); if (index >= 0) { features[i++] = index; // } else { //logger.info("FEATURE LESS THAN ZERO: " + f); } } int[] activeFeatures = new int[i]; synchronized (System.class) { System.arraycopy(features, 0, activeFeatures, 0, i); } Counter<L> scores = new ClassicCounter<>(); for (L lab : labels()) { scores.setCount(lab, scoreOf(activeFeatures, lab)); } return scores; }
Datum<String, String> d = makeDatum(doc, i, featureFactories); Collection<String> newFeats = new ArrayList<>(); for (String f : d.asFeatures()) { if ( ! origFeatIndex.contains(f)) { newFeats.add(f);
/**
 * Adds this datum's contribution to the per-(feature, class, class) derivative counter.
 * <p>
 * For every feature of the datum that is present in {@code labeledDataset.featureIndex},
 * and for every ordered pair of classes {@code (c, cPrime)}, the entry keyed by
 * {@code (featureId, c, cPrime)} is incremented by
 * {@code -probs[c] * (1 - probs[c]) * value} when {@code c == cPrime} and by
 * {@code probs[c] * probs[cPrime] * value} otherwise, where {@code value} is
 * {@code valueOfFeature(feature, datum)}.
 *
 * @param datum                        The datum whose features are visited
 * @param probs                        Per-class values indexed by class id
 * @param feature2classPairDerivatives Counter that receives the increments
 */
private void updateDerivative(Datum<L,F> datum, double[] probs, Counter<Triple<Integer,Integer,Integer>> feature2classPairDerivatives) {
  for (F feature : datum.asFeatures()) {
    int fID = labeledDataset.featureIndex.indexOf(feature);
    if (fID < 0) {
      continue;  // feature not in the labeled dataset's index
    }
    for (int c = 0; c < numClasses; c++) {
      for (int cPrime = 0; cPrime < numClasses; cPrime++) {
        Triple<Integer,Integer,Integer> key = new Triple<>(fID, c, cPrime);
        double delta;
        if (cPrime != c) {
          delta = probs[c]*probs[cPrime]*valueOfFeature(feature,datum);
        } else {
          delta = - probs[c]*(1-probs[c])*valueOfFeature(feature,datum);
        }
        feature2classPairDerivatives.incrementCount(key, delta);
      }
    }
  }
}
@Override // If you edit me, also take care of WeightedRVFDataset public void add(Datum<L, F> d) { if (d instanceof RVFDatum<?, ?>) { addLabel(d.label()); addFeatures(((RVFDatum<L, F>) d).asFeaturesCounter()); size++; } else { addLabel(d.label()); addFeatures(Counters.asCounter(d.asFeatures())); size++; } }
Collection<F> features = datum.asFeatures(); for (F feature : features) { int i = indexOf(featureIndex.indexOf(feature), labelIndex.indexOf(datum.label())); Collection<F> features = datum.asFeatures(); for (F feature : features) { for (int c = 0; c < numClasses; c++) {
/**
 * Adds a datum together with a source and an id: records its label, then its
 * features as a counter, then the source/id pair, then increments {@code size}.
 * An {@code RVFDatum} contributes its own feature counter; any other datum has
 * its feature collection converted with {@code Counters.asCounter}.
 *
 * @param d   The datum to add
 * @param src The source recorded for the datum
 * @param id  The id recorded for the datum
 */
public void add(Datum<L, F> d, String src, String id) {
  addLabel(d.label());
  if (d instanceof RVFDatum<?, ?>) {
    RVFDatum<L, F> rvfDatum = (RVFDatum<L, F>) d;
    addFeatures(rvfDatum.asFeaturesCounter());
  } else {
    addFeatures(Counters.asCounter(d.asFeatures()));
  }
  addSourceAndId(src, id);
  size++;
}
public Classifier<L, F> trainClassifier(Iterable<Datum<L, F>> dataIterable) { Minimizer<DiffFunction> minimizer = getMinimizer(); Index<F> featureIndex = Generics.newIndex(); Index<L> labelIndex = Generics.newIndex(); for (Datum<L, F> d : dataIterable) { labelIndex.add(d.label()); featureIndex.addAll(d.asFeatures());//If there are duplicates, it doesn't add them again. } logger.info(String.format("Training linear classifier with %d features and %d labels", featureIndex.size(), labelIndex.size())); LogConditionalObjectiveFunction<L, F> objective = new LogConditionalObjectiveFunction<>(dataIterable, logPrior, featureIndex, labelIndex); // [cdm 2014] Commented out next line. Why not use the logPrior set up previously and used at creation??? // objective.setPrior(new LogPrior(LogPrior.LogPriorType.QUADRATIC)); double[] initial = objective.initial(); double[] weights = minimizer.minimize(objective, TOL, initial); LinearClassifier<L, F> classifier = new LinearClassifier<>(objective.to2D(weights), featureIndex, labelIndex); return classifier; }
/**
 * Converts the features of a datum from counts to L1-normalized TF-IDF values.
 * <p>
 * Only features that appear as keys in {@code featureDocCounts} are kept. Each
 * occurrence of a kept feature adds 1.0 to its term frequency. The IDF of a
 * feature is {@code log((size() + 1) / (docCount + 0.5))}, and every tf*idf
 * value is finally divided by the sum of all tf*idf values.
 * <p>
 * NOTE(review): the normalizer is the plain (signed) sum of the tf*idf values and
 * is not checked for zero — confirm this is intended for all inputs.
 *
 * @param datum with a collection of features.
 * @param featureDocCounts a counter of doc-count for each feature.
 * @return RVFDatum with l1-normalized tf-idf features.
 */
public RVFDatum<L,F> getL1NormalizedTFIDFDatum(Datum<L,F> datum, Counter<F> featureDocCounts) {
  // Term frequencies, restricted to features with a known document count.
  Counter<F> tfidfFeatures = new ClassicCounter<>();
  for (F feature : datum.asFeatures()) {
    if (featureDocCounts.containsKey(feature)) {
      tfidfFeatures.incrementCount(feature, 1.0);
    }
  }

  // Replace each term frequency by tf*idf and accumulate the normalizer.
  double l1norm = 0;
  for (F feature : tfidfFeatures.keySet()) {
    double idf = Math.log(((double)(this.size()+1))/(featureDocCounts.getCount(feature)+0.5));
    double tf = tfidfFeatures.getCount(feature);
    double weighted = tf*idf;
    tfidfFeatures.setCount(feature, weighted);
    l1norm += weighted;
  }

  // Scale every value by the accumulated sum.
  for (F feature : tfidfFeatures.keySet()) {
    double scaled = tfidfFeatures.getCount(feature) / l1norm;
    tfidfFeatures.setCount(feature, scaled);
  }

  return new RVFDatum<>(tfidfFeatures, datum.label());
}