/**
 * Configures which fields of the CSV file containing edges data should be included and which
 * should be skipped. The positions in the string (read from position 0 to its length) define
 * whether the field at the corresponding position in the CSV schema should be included. The
 * parser will look at the first {@code n} fields, where {@code n} is the length of the mask
 * string. The parser will skip over all fields where the character at the corresponding position
 * in the string is {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value
 * {@code false}). The result contains the fields where the corresponding position in
 * the string is {@code '1'}, {@code 'T'}, or {@code 't'} (representing the value {@code true}).
 *
 * @param mask The string mask defining which fields to include and which to skip.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsEdges(String mask) {
	this.edgeReader.includeFields(mask);
	return this;
}
/**
 * Configures which fields of the edges CSV file are read and which are ignored. The parser
 * inspects the first {@code n} fields, where {@code n} is the length of the given boolean
 * array. A field is kept when the flag at the corresponding position is {@code true} and
 * dropped when it is {@code false}, so the number of fields in the result equals the number
 * of {@code true} entries in the array.
 *
 * @param edgeFields The flags describing which fields to include from the edges CSV file.
 * @return This GraphCsvReader instance, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsEdges(boolean... edgeFields) {
	edgeReader.includeFields(edgeFields);
	return this;
}
/**
 * Configures which fields of the vertices CSV file are read and which are ignored. The parser
 * inspects the first {@code n} fields, where {@code n} is the length of the given boolean
 * array. A field is kept when the flag at the corresponding position is {@code true} and
 * dropped when it is {@code false}, so the number of fields in the result equals the number
 * of {@code true} entries in the array.
 *
 * <p>The call is a no-op when no vertex reader is configured.
 *
 * @param vertexFields The flags describing which fields to include from the vertices CSV file.
 * @return This GraphCsvReader instance, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsVertices(boolean... vertexFields) {
	if (vertexReader != null) {
		vertexReader.includeFields(vertexFields);
	}
	return this;
}
/**
 * Configures which fields of the CSV file containing vertices data should be included and which
 * should be skipped. The positions in the string (read from position 0 to its length) define
 * whether the field at the corresponding position in the CSV schema should be included. The
 * parser will look at the first {@code n} fields, where {@code n} is the length of the mask
 * string. The parser will skip over all fields where the character at the corresponding position
 * in the string is {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value
 * {@code false}). The result contains the fields where the corresponding position in
 * the string is {@code '1'}, {@code 'T'}, or {@code 't'} (representing the value {@code true}).
 *
 * @param mask The string mask defining which fields to include and which to skip.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsVertices(String mask) {
	if (this.vertexReader != null) {
		this.vertexReader.includeFields(mask);
	}
	return this;
}
/**
 * Configures which fields of the CSV file containing edges data should be included and which
 * should be skipped. The bits in the value (read from least significant to most significant)
 * define whether the field at the corresponding position in the CSV schema should be included.
 * The parser will look at the first {@code n} fields, where {@code n} is the position of the
 * most significant non-zero bit. The parser will skip over all fields where the bit at the
 * corresponding position is zero, and include the fields where the corresponding bit is one.
 *
 * <p>Examples:
 * <ul>
 * <li>A mask of {@code 0x7} would include the first three fields.</li>
 * <li>A mask of {@code 0x26} (binary {@code 100110}) would skip the first field, include fields
 * two and three, skip fields four and five, and include field six.</li>
 * </ul>
 *
 * @param mask The bit mask defining which fields to include and which to skip.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsEdges(long mask) {
	this.edgeReader.includeFields(mask);
	return this;
}
/**
 * Configures which fields of the CSV file containing vertices data should be included and which
 * should be skipped. The bits in the value (read from least significant to most significant)
 * define whether the field at the corresponding position in the CSV schema should be included.
 * The parser will look at the first {@code n} fields, where {@code n} is the position of the
 * most significant non-zero bit. The parser will skip over all fields where the bit at the
 * corresponding position is zero, and include the fields where the corresponding bit is one.
 *
 * <p>Examples:
 * <ul>
 * <li>A mask of {@code 0x7} would include the first three fields.</li>
 * <li>A mask of {@code 0x26} (binary {@code 100110}) would skip the first field, include fields
 * two and three, skip fields four and five, and include field six.</li>
 * </ul>
 *
 * @param mask The bit mask defining which fields to include and which to skip.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsVertices(long mask) {
	if (this.vertexReader != null) {
		this.vertexReader.includeFields(mask);
	}
	return this;
}
/**
 * Configures which fields of the CSV file should be included and which should be skipped. The
 * positions in the string (read from position 0 to its length) define whether the field at
 * the corresponding position in the CSV schema should be included. The parser will look at the
 * first {@code n} fields, where {@code n} is the length of the mask string. The parser will
 * skip over all fields where the character at the corresponding position in the string is
 * {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value {@code false}). The result
 * contains the fields where the corresponding position in the string is {@code '1'},
 * {@code 'T'}, or {@code 't'} (representing the value {@code true}).
 *
 * @param mask The string mask defining which fields to include and which to skip.
 * @return The CSV reader instance itself, to allow for fluent function chaining.
 * @throws IllegalArgumentException If the mask contains a character other than
 *         {@code '0'}, {@code '1'}, {@code 'T'}, {@code 't'}, {@code 'F'}, or {@code 'f'}.
 */
public CsvReader includeFields(String mask) {
	boolean[] includedMask = new boolean[mask.length()];

	for (int i = 0; i < mask.length(); i++) {
		char c = mask.charAt(i);
		if (c == '1' || c == 'T' || c == 't') {
			includedMask[i] = true;
		} else if (c != '0' && c != 'F' && c != 'f') {
			// The previous message claimed only '0' and '1' were legal, although the code
			// deliberately accepts 'T'/'t'/'F'/'f' as well. Report the full alphabet and
			// the offending character.
			throw new IllegalArgumentException(
				"Mask string may contain only '0', '1', 'T', 't', 'F', and 'f', but found '" + c + "'.");
		}
	}

	return includeFields(includedMask);
}
/**
 * Creates a DataSet of orders from the pipe-delimited orders CSV file, mapped onto the
 * {@link Order} tuple type.
 *
 * <p>The include mask {@code "110010010"} keeps CSV fields 1, 2, 5, and 8 and skips the rest.
 *
 * @param env The execution environment used to create the CSV source.
 * @param ordersPath Path to the orders CSV file.
 * @return A DataSet containing the selected order fields.
 */
private static DataSet<Order> getOrdersDataSet(ExecutionEnvironment env, String ordersPath) {
	return env.readCsvFile(ordersPath)
		.fieldDelimiter("|")
		.includeFields("110010010")
		.tupleType(Order.class);
}
/**
 * Creates a DataSet of customers from the pipe-delimited customer CSV file.
 *
 * <p>The include mask {@code "11110100"} keeps CSV fields 1-4 and 6, which are read as
 * the five tuple components (Integer, String, String, Integer, Double).
 *
 * @param env The execution environment used to create the CSV source.
 * @param customerPath Path to the customer CSV file.
 * @return A DataSet of customer tuples with the selected fields.
 */
private static DataSet<Tuple5<Integer, String, String, Integer, Double>> getCustomerDataSet(ExecutionEnvironment env, String customerPath) {
	return env.readCsvFile(customerPath)
		.fieldDelimiter("|")
		.includeFields("11110100")
		.types(Integer.class, String.class, String.class, Integer.class, Double.class);
}
/**
 * Creates a DataSet of line items from the pipe-delimited lineitem CSV file.
 *
 * <p>The include mask {@code "1000011010000000"} keeps CSV fields 1, 6, 7, and 9, which are
 * read as the four tuple components (Integer, Double, Double, String).
 *
 * @param env The execution environment used to create the CSV source.
 * @param lineitemPath Path to the lineitem CSV file.
 * @return A DataSet of lineitem tuples with the selected fields.
 */
private static DataSet<Tuple4<Integer, Double, Double, String>> getLineitemDataSet(ExecutionEnvironment env, String lineitemPath) {
	return env.readCsvFile(lineitemPath)
		.fieldDelimiter("|")
		.includeFields("1000011010000000")
		.types(Integer.class, Double.class, Double.class, String.class);
}
/**
 * Creates a DataSet of line items from the pipe-delimited lineitem CSV file, mapped onto the
 * {@link Lineitem} tuple type.
 *
 * <p>The include mask {@code "1000011000100000"} keeps CSV fields 1, 6, 7, and 11 and skips
 * the rest.
 *
 * @param env The execution environment used to create the CSV source.
 * @param lineitemPath Path to the lineitem CSV file.
 * @return A DataSet containing the selected lineitem fields.
 */
private static DataSet<Lineitem> getLineitemDataSet(ExecutionEnvironment env, String lineitemPath) {
	return env.readCsvFile(lineitemPath)
		.fieldDelimiter("|")
		.includeFields("1000011000100000")
		.tupleType(Lineitem.class);
}
/**
 * Creates a DataSet of nations from the pipe-delimited nation CSV file.
 *
 * <p>The include mask {@code "1100"} keeps CSV fields 1 and 2, which are read as the two
 * tuple components (Integer, String).
 *
 * @param env The execution environment used to create the CSV source.
 * @param nationPath Path to the nation CSV file.
 * @return A DataSet of (id, name) nation tuples.
 */
private static DataSet<Tuple2<Integer, String>> getNationsDataSet(ExecutionEnvironment env, String nationPath) {
	return env.readCsvFile(nationPath)
		.fieldDelimiter("|")
		.includeFields("1100")
		.types(Integer.class, String.class);
}
}
/**
 * Creates a DataSet of customers from the pipe-delimited customer CSV file, mapped onto the
 * {@link Customer} tuple type.
 *
 * <p>The include mask {@code "10000010"} keeps CSV fields 1 and 7 and skips the rest.
 *
 * @param env The execution environment used to create the CSV source.
 * @param customerPath Path to the customer CSV file.
 * @return A DataSet containing the selected customer fields.
 */
private static DataSet<Customer> getCustomerDataSet(ExecutionEnvironment env, String customerPath) {
	return env.readCsvFile(customerPath)
		.fieldDelimiter("|")
		.includeFields("10000010")
		.tupleType(Customer.class);
}
/**
 * Creates a DataSet of orders from the pipe-delimited orders CSV file.
 *
 * <p>The include mask {@code "110010000"} keeps CSV fields 1, 2, and 5, which are read as
 * the three tuple components (Integer, Integer, String).
 *
 * @param env The execution environment used to create the CSV source.
 * @param ordersPath Path to the orders CSV file.
 * @return A DataSet of order tuples with the selected fields.
 */
private static DataSet<Tuple3<Integer, Integer, String>> getOrdersDataSet(ExecutionEnvironment env, String ordersPath) {
	return env.readCsvFile(ordersPath)
		.fieldDelimiter("|")
		.includeFields("110010000")
		.types(Integer.class, Integer.class, String.class);
}
private static DataSet<Tuple2<String, String>> getVisitsDataSet(ExecutionEnvironment env, ParameterTool params) { // Create DataSet for visits relation (URL, Date) if (params.has("visits")) { return env.readCsvFile(params.get("visits")) .fieldDelimiter("|") .includeFields("011000000") .types(String.class, String.class); } else { System.out.println("Executing WebLogAnalysis example with default visits data set."); System.out.println("Use --visits to specify file input."); return WebLogData.getVisitDataSet(env); } }
/**
 * Verifies that all dense include-field specifications — boolean flags, the equivalent
 * mask strings, and a bit mask — produce the same included-fields mask.
 */
@Test
public void testIncludeFieldsDense() {
	final boolean[] expected = {true, true, true};

	CsvReader reader = getCsvReader();
	reader.includeFields(true, true, true);
	Assert.assertTrue(Arrays.equals(expected, reader.includedMask));

	// The three string spellings of an all-true mask must be equivalent.
	for (String mask : new String[] {"ttt", "TTT", "111"}) {
		reader = getCsvReader();
		reader.includeFields(mask);
		Assert.assertTrue(Arrays.equals(expected, reader.includedMask));
	}

	reader = getCsvReader();
	reader.includeFields(0x7L);
	Assert.assertTrue(Arrays.equals(expected, reader.includedMask));
}
/**
 * Verifies that sparse include-field specifications — boolean flags, several equivalent
 * mask strings (including mixed-case), and a bit mask — all produce the same
 * included-fields mask, with trailing excluded fields trimmed.
 */
@Test
public void testIncludeFieldsSparse() {
	final boolean[] expected = {false, true, true, false, false, true};

	CsvReader reader = getCsvReader();
	reader.includeFields(false, true, true, false, false, true, false, false);
	Assert.assertTrue(Arrays.equals(expected, reader.includedMask));

	// All string spellings of the same mask must be equivalent.
	for (String mask : new String[] {"fttfftff", "FTTFFTFF", "01100100", "0t1f0TFF"}) {
		reader = getCsvReader();
		reader.includeFields(mask);
		Assert.assertTrue(Arrays.equals(expected, reader.includedMask));
	}

	reader = getCsvReader();
	reader.includeFields(0x26L);
	Assert.assertTrue(Arrays.equals(expected, reader.includedMask));
}
@Test public void testIllegalCharInStringMask() { CsvReader reader = getCsvReader(); try { reader.includeFields("1t0Tfht"); Assert.fail("Reader accepted an invalid mask string"); } catch (IllegalArgumentException e) { // expected } }
/**
 * Reads a CSV file into POJOs while both skipping a column and reordering the target
 * fields: of each row, columns one and three are kept and mapped (in that order) onto
 * the POJO fields {@code f2} and {@code f1}.
 */
@Test
public void testPOJOTypeWithFieldsOrderAndFieldsSelection() throws Exception {
	final String inputData = "3,2.20,ABC\n5,5.1,DEF\n1,3.30,DEF\n10,3.30,GHI";
	final String dataPath = createInputData(inputData);
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<POJOItem> pojos = env.readCsvFile(dataPath)
		.includeFields(true, false, true)
		.pojoType(POJOItem.class, new String[]{"f2", "f1"});

	List<POJOItem> result = pojos.collect();

	expected = "ABC,3,0.00\nDEF,5,0.00\nDEF,1,0.00\nGHI,10,0.00";
	compareResultAsText(result, expected);
}
public static void tcph3(String[] args) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(Integer.parseInt(args[0])); //order id, order status, order data, order prio, ship prio DataSet<Tuple5<Long, String, String, String, Integer>> orders = env.readCsvFile(args[1]) .fieldDelimiter("|").lineDelimiter("\n") .includeFields("101011001").types(Long.class, String.class, String.class, String.class, Integer.class) .name(ORDERS); //order id, extended price DataSet<Tuple2<Long, Double>> lineItems = env.readCsvFile(args[2]) .fieldDelimiter("|").lineDelimiter("\n") .includeFields("100001").types(Long.class, Double.class) .name(LINEITEM); DataSet<Tuple2<Long, Integer>> filterO = orders.flatMap(new FilterO()).name(MAPPER_NAME); DataSet<Tuple3<Long, Integer, Double>> joinLiO = filterO.join(lineItems).where(0).equalTo(0).with(new JoinLiO()).name(JOIN_NAME); DataSet<Tuple3<Long, Integer, Double>> aggLiO = joinLiO.groupBy(0, 1).reduceGroup(new AggLiO()).name(REDUCE_NAME); aggLiO.writeAsCsv(args[3], "\n", "|").name(SINK); env.execute(); }