private static Map<String,Integer> buildIDIndexOneWayMap(PMML model,
                                                          JavaRDD<String[]> parsedTestRDD,
                                                          boolean user) {
    // Add to mapping everything from the model
    List<String> ids = AppPMMLUtils.getExtensionContent(model, user ? "XIDs" : "YIDs");
    Map<String,Integer> idIndex = new HashMap<>(ids.size());
    int index = 0;
    for (String id : ids) {
        idIndex.put(id, index++);
    }
    // And from the test set, which may have a few more IDs
    int offset = user ? 0 : 1;
    for (String id : parsedTestRDD.map(tokens -> tokens[offset]).distinct().collect()) {
        if (!idIndex.containsKey(id)) {
            idIndex.put(id, index++);
        }
    }
    return idIndex;
}
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
    int offset = user ? 0 : 1;
    Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
        .distinct()
        .sortBy(s -> s, true, parsedRDD.getNumPartitions())
        .zipWithIndex()
        .mapValues(Long::intValue)
        .collectAsMap();
    // Clone, due to some serialization problems with the result of collectAsMap?
    return new HashMap<>(reverseIDLookup);
}
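// A hedged usage sketch of buildIDIndexMapping (not from the original project): it assumes a
// JavaSparkContext named `sc` and comma-separated "user,item,value" lines whose first token is
// the user ID; the file path and input format below are illustrative assumptions only.
JavaRDD<String[]> parsed = sc.textFile("data/interactions.csv")  // path is an assumption
    .map(line -> line.split(","));
Map<String,Integer> userIDToIndex = buildIDIndexMapping(parsed, true);
// Each distinct user ID now maps to a stable, dense index in [0, numUsers), ordered by ID.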
List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue =
    new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {
Broadcast<List<Integer>> allItemIDsBC = sparkContext.broadcast(positiveUserProducts.values().distinct().collect());
JavaRDD<Tuple3<Long,Long,Long>> uniqueTriangles = trianglesWithDuplicates.distinct();
private JavaRDD<Tuple2<GeoWaveInputKey, Geometry>> prepareForReproject(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
    final int numPartitions) {
    return indexedRDD.values().distinct(numPartitions);
}
@Override
public SparkStream<T> distinct() {
    return new SparkStream<>(rdd.distinct());
}
public static JavaRDD<Tuple2<Integer,Optional<String>>> joinData(JavaPairRDD<Integer, Integer> t,
                                                                 JavaPairRDD<Integer, String> u) {
    JavaRDD<Tuple2<Integer,Optional<String>>> leftJoinOutput = t.leftOuterJoin(u).values().distinct();
    return leftJoinOutput;
}
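// A hedged usage sketch of joinData (not from the original source): assumes a JavaSparkContext
// `sc`, and that Optional here is Spark's org.apache.spark.api.java.Optional (older Spark
// versions used Guava's Optional for outer joins).
JavaPairRDD<Integer, Integer> t = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 10), new Tuple2<>(2, 20)));
JavaPairRDD<Integer, String> u = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, "a")));
// Key 1 matches on both sides; key 2 exists only in t, so its String side is an absent Optional.
List<Tuple2<Integer, Optional<String>>> joined = joinData(t, u).collect();
// joined contains (10, Optional["a"]) and (20, Optional.empty), deduplicated by distinct().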
/**
 * Get a list of unique values from the specified column.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnName Name of the column to get unique values from
 * @param schema     Data schema
 * @param data       Data to get unique values from
 * @return List of unique values
 */
public static List<Writable> getUnique(String columnName, Schema schema, JavaRDD<List<Writable>> data) {
    int colIdx = schema.getIndexOfColumn(columnName);
    JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));
    return ithColumn.distinct().collect();
}
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
    ChannelInstance[] inputs,
    ChannelInstance[] outputs,
    SparkExecutor sparkExecutor,
    OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();
    assert outputs.length == this.getNumOutputs();

    final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
    final RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    final JavaRDD<Type> inputRdd = input.provideRdd();
    final JavaRDD<Type> outputRdd = inputRdd.distinct(sparkExecutor.getNumDefaultPartitions());
    this.name(outputRdd);
    output.accept(outputRdd, sparkExecutor);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
clusters = clusters.distinct();
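// Note (assumption, not from the original source): distinct() deduplicates by hashing and
// comparing elements, so the element type behind `clusters` must define value-based
// equals()/hashCode() (and be serializable) for logically equal clusters to collapse.
// A minimal, hypothetical element type sketch:
public final class Cluster implements java.io.Serializable {
    private final long id;
    public Cluster(long id) { this.id = id; }
    @Override public boolean equals(Object o) {
        return o instanceof Cluster && ((Cluster) o).id == this.id;
    }
    @Override public int hashCode() { return Long.hashCode(id); }
}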
@Test
public void testTotalPutsBatching() throws Exception {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    HoodieWriteConfig config = getConfig();
    HBaseIndex index = new HBaseIndex(config);
    HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);

    // Start a commit and generate test data
    String newCommitTime = writeClient.startCommit();
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 250);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
    HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, config, jsc);

    // Insert the generated records and commit the upsert
    JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
    writeClient.commit(newCommitTime, writeStatues);

    // Mock hbaseConnection and related entities
    Connection hbaseConnection = Mockito.mock(Connection.class);
    HTable table = Mockito.mock(HTable.class);
    Mockito.when(hbaseConnection.getTable(TableName.valueOf(tableName))).thenReturn(table);
    Mockito.when(table.get((List<Get>) anyObject())).thenReturn(new Result[0]);

    // Only for test: set the hbaseConnection to the mocked object
    index.setHbaseConnection(hbaseConnection);

    // Count the distinct file IDs generated by the write
    int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count();

    index.updateLocation(writeStatues, jsc, hoodieTable);
    // With batchSize = 100, about 3 put batches should be executed, and at most
    // numberOfDataFileIds, since each fileId should ideally get its updates in a single batch
    Mockito.verify(table, atMost(numberOfDataFileIds)).put((List<Put>) anyObject());
}
: resultWithDuplicates.distinct();
javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
assertTrue(javaRDD.filter(
    record -> (record.getCurrentLocation() != null
        && record.getCurrentLocation().getCommitTime().equals(newCommitTime))).distinct().count() == 200);
List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect();