/**
 * Assigns every training point to its nearest cluster and tallies the
 * resulting cluster sizes.
 *
 * @param trainPointData data to cluster
 * @param model trained KMeans Model
 * @return map of ClusterId, count of points associated with the clusterId
 */
private static Map<Integer, Long> fetchClusterCountsFromModel(
    JavaRDD<? extends Vector> trainPointData, KMeansModel model) {
  // Map each point to its predicted cluster id, then count occurrences per id.
  JavaRDD<Integer> clusterIds = trainPointData.map(point -> model.predict(point));
  return clusterIds.countByValue();
}
/**
 * Checks that the approximate count (with a generous timeout) agrees with
 * the exact countByValue result.
 */
@Test
public void approximateResults() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));

  // Exact counts serve as the reference.
  Map<Integer, Long> exactCounts = rdd.countByValue();
  assertEquals(2, exactCounts.get(1).longValue());
  assertEquals(1, exactCounts.get(13).longValue());

  // The approximate job, given enough time, should converge to the same means.
  PartialResult<Map<Integer, BoundedDouble>> partial = rdd.countByValueApprox(1);
  Map<Integer, BoundedDouble> approxCounts = partial.getFinalValue();
  assertEquals(2.0, approxCounts.get(1).mean(), 0.01);
  assertEquals(1.0, approxCounts.get(13).mean(), 0.01);
}
/**
 * Verifies countByValueApprox: once the final value is available it should
 * match the exact per-value counts.
 */
@Test
public void approximateResults() {
  JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));

  // Baseline: exact counts.
  Map<Integer, Long> exact = data.countByValue();
  assertEquals(2, exact.get(1).longValue());
  assertEquals(1, exact.get(13).longValue());

  // Approximate counts resolved to their final value.
  PartialResult<Map<Integer, BoundedDouble>> approxJob = data.countByValueApprox(1);
  Map<Integer, BoundedDouble> resolved = approxJob.getFinalValue();
  assertEquals(2.0, resolved.get(1).mean(), 0.01);
  assertEquals(1.0, resolved.get(13).mean(), 0.01);
}
/**
 * Entry point: splits each line of the input file on single spaces and
 * prints every distinct word together with its occurrence count.
 *
 * <p>Usage: BasicFlatMap sparkMaster inputFile
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicFlatMap sparkMaster inputFile");
  }

  JavaSparkContext sc = new JavaSparkContext(
      args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));

  JavaRDD<String> lines = sc.textFile(args[1]);

  // Expand every line into its space-separated tokens.
  JavaRDD<String> words = lines.flatMap(
      new FlatMapFunction<String, String>() {
        public Iterable<String> call(String line) {
          return Arrays.asList(line.split(" "));
        }
      });

  // Count occurrences of each word and print them as "word:count".
  Map<String, Long> wordCounts = words.countByValue();
  for (Entry<String, Long> wordCount : wordCounts.entrySet()) {
    System.out.println(wordCount.getKey() + ":" + wordCount.getValue());
  }
}
}
/**
 * Exercises countByValueApprox and confirms its final means equal the
 * exact counts returned by countByValue.
 */
@Test
public void approximateResults() {
  JavaRDD<Integer> fib = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));

  // Exact counts: 1 appears twice, 13 once.
  Map<Integer, Long> counts = fib.countByValue();
  assertEquals(2, counts.get(1).longValue());
  assertEquals(1, counts.get(13).longValue());

  // Approximate counterpart must agree once fully resolved.
  PartialResult<Map<Integer, BoundedDouble>> approximation = fib.countByValueApprox(1);
  Map<Integer, BoundedDouble> bounds = approximation.getFinalValue();
  assertEquals(2.0, bounds.get(1).mean(), 0.01);
  assertEquals(1.0, bounds.get(13).mean(), 0.01);
}
/** Counts distinct values by delegating straight to the wrapped RDD. */
@Override
public Map<T, Long> countByValue() {
  Map<T, Long> valueCounts = rdd.countByValue();
  return valueCounts;
}
public NGramBuilder(String regexpFileName, String inputFileName, String unigramFileName, String bigramFileName) { JavaRDD<String> lines = jsc.textFile(inputFileName).filter(new InvalidLineFilter()); System.out.println("#(lines) = " + lines.count()); // create unigrams and save them // converter = new Converter(regexpFileName); Map<String, Long> unigrams = lines.flatMap(new UnigramFunction()).countByValue(); List<Tuple2<String, Long>> tuples = new ArrayList<Tuple2<String, Long>>(unigrams.size()); for (String word : unigrams.keySet()) { Long f = unigrams.get(word); if (f >= 2) tuples.add(new Tuple2<String, Long>(word, f)); } JavaPairRDD<String, Long> jprdd = jsc.parallelizePairs(tuples); jprdd.saveAsTextFile(unigramFileName, GzipCodec.class); // create bigrams and save them Map<Tuple2<String, String>, Long> bigrams = lines.flatMap(new BigramFunction()).countByValue(); tuples = new ArrayList<Tuple2<String, Long>>(bigrams.size()); for (Tuple2<String, String> pair : bigrams.keySet()) { Long f = bigrams.get(pair); if (f >= 2) tuples.add(new Tuple2<String, Long>(pair._1() + ',' + pair._2(), f)); } jprdd = jsc.parallelizePairs(tuples); jprdd.saveAsTextFile(bigramFileName, GzipCodec.class); }