/**
 * Reads the orders relation from {@code ordersPath} as '|'-delimited CSV.
 * Only the columns at positions 0, 1, and 4 of the CSV schema are kept
 * (see the "110010000" mask) and parsed as (Integer, Integer, String).
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of 3-tuples parsed from the orders file.
 */
private static DataSet<Tuple3<Integer, Integer, String>> getOrdersDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(ordersPath)
            .includeFields("110010000")
            .fieldDelimiter('|')
            .types(Integer.class, Integer.class, String.class);
}
/**
 * Creates a CSV reader to read a comma separated value (CSV) file. The reader has options to
 * define parameters and field types and will eventually produce the DataSet that corresponds to
 * the read and parsed CSV input.
 *
 * @param filePath The path of the CSV file.
 * @return A CsvReader that can be used to configure the CSV input.
 */
public CsvReader readCsvFile(String filePath) {
    CsvReader reader = new CsvReader(filePath, this);
    return reader;
}
/**
 * Provides the pages input: either parsed from {@code pagesInputPath} as a
 * space-delimited, newline-terminated file of Long page ids, or the built-in
 * default data set when no file input was configured.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of 1-tuples holding page ids.
 */
private static DataSet<Tuple1<Long>> getPagesDataSet(ExecutionEnvironment env) {
    if (!fileOutput) {
        // No file input configured: fall back to the bundled sample data.
        return PageRankData.getDefaultPagesDataSet(env);
    }
    return env.readCsvFile(pagesInputPath)
            .fieldDelimiter(' ')
            .lineDelimiter("\n")
            .types(Long.class);
}
/**
 * Reads the customer relation from {@code customerPath} as '|'-delimited CSV.
 * Only the columns at positions 0 and 6 of the CSV schema are kept
 * (see the "10000010" mask) and parsed into {@link Customer} tuples.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of Customer tuples parsed from the customer file.
 */
private static DataSet<Customer> getCustomerDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(customerPath)
            .includeFields("10000010")
            .fieldDelimiter('|')
            .tupleType(Customer.class);
}
private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet(ExecutionEnvironment env) { // Create DataSet for ranks relation (Rank, URL, Avg-Visit-Duration) if(fileOutput) { return env.readCsvFile(ranksPath) .fieldDelimiter('|') .types(Integer.class, String.class, Integer.class); } else { return WebLogData.getRankDataSet(env); } }
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 1-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0> DataSource<Tuple1<T0>> types(Class<T0> type0) {
    TupleTypeInfo<Tuple1<T0>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(type0);
    CsvInputFormat<Tuple1<T0>> format = new CsvInputFormat<Tuple1<T0>>(path);
    configureInputFormat(format, type0);
    return new DataSource<Tuple1<T0>>(executionContext, format, typeInfo);
}
/**
 * Configures which fields of the CSV file should be included and which should be skipped. The
 * positions in the string (read from position 0 to its length) define whether the field at
 * the corresponding position in the CSV schema should be included. The parser will look at
 * the first {@code n} fields, where {@code n} is the length of the mask string.
 * The parser will skip over all fields where the character at the corresponding position
 * in the string is {@code '0'}, {@code 'F'}, or {@code 'f'} (representing the value
 * {@code false}). The result contains the fields where the corresponding position in
 * the boolean array is {@code '1'}, {@code 'T'}, or {@code 't'} (representing the value {@code true}).
 *
 * @param mask The string mask defining which fields to include and which to skip.
 * @return The CSV reader instance itself, to allow for fluent function chaining.
 * @throws IllegalArgumentException If the mask contains a character other than the
 *                                  accepted ones listed above.
 */
public CsvReader includeFields(String mask) {
    boolean[] includedMask = new boolean[mask.length()];

    for (int i = 0; i < mask.length(); i++) {
        char c = mask.charAt(i);
        if (c == '1' || c == 'T' || c == 't') {
            includedMask[i] = true;
        } else if (c != '0' && c != 'F' && c != 'f') {
            // BUGFIX: the previous message claimed only '0' and '1' are legal,
            // although 'T'/'t'/'F'/'f' are accepted as well. Report the actual
            // rule and the offending character/position to aid debugging.
            throw new IllegalArgumentException("Mask string may contain only '0', '1', "
                    + "'T', 't', 'F', and 'f', but found '" + c + "' at position " + i + ".");
        }
    }

    return includeFields(includedMask);
}
/**
 * Provides the vertex ids: either parsed from {@code verticesPath} as a CSV
 * file of Longs (unwrapped from their 1-tuples), or the built-in default data
 * set when no file input was configured.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of vertex ids.
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env) {
    if (!fileOutput) {
        // No file input configured: fall back to the bundled sample data.
        return ConnectedComponentsData.getDefaultVertexDataSet(env);
    }
    return env.readCsvFile(verticesPath)
            .types(Long.class)
            .map(new MapFunction<Tuple1<Long>, Long>() {
                @Override
                public Long map(Tuple1<Long> tuple) {
                    // Unwrap the single field from the 1-tuple.
                    return tuple.f0;
                }
            });
}
/**
 * Reads the lineitem relation from {@code lineitemPath} as '|'-delimited CSV.
 * Only the columns at positions 0, 5, 6, and 10 of the CSV schema are kept
 * (see the "1000011000100000" mask) and parsed into {@link Lineitem} tuples.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of Lineitem tuples parsed from the lineitem file.
 */
private static DataSet<Lineitem> getLineitemDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(lineitemPath)
            .includeFields("1000011000100000")
            .fieldDelimiter('|')
            .tupleType(Lineitem.class);
}
private static DataSet<Tuple2<String, String>> getDocumentsDataSet(ExecutionEnvironment env) { // Create DataSet for documents relation (URL, Doc-Text) if(fileOutput) { return env.readCsvFile(documentsPath) .fieldDelimiter('|') .types(String.class, String.class); } else { return WebLogData.getDocumentDataSet(env); } }
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 2-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1> DataSource<Tuple2<T0, T1>> types(Class<T0> type0, Class<T1> type1) {
    TupleTypeInfo<Tuple2<T0, T1>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(type0, type1);
    CsvInputFormat<Tuple2<T0, T1>> format = new CsvInputFormat<Tuple2<T0, T1>>(path);
    configureInputFormat(format, type0, type1);
    return new DataSource<Tuple2<T0, T1>>(executionContext, format, typeInfo);
}
return includeFields(fieldsArray);
/**
 * Provides the vertex ids: either parsed from {@code verticesPath} as a CSV
 * file of Longs (unwrapped from their 1-tuples), or the built-in default data
 * set when no file input was configured.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of vertex ids.
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env) {
    if (!fileOutput) {
        // No file input configured: fall back to the bundled sample data.
        return ConnectedComponentsData.getDefaultVertexDataSet(env);
    }
    return env.readCsvFile(verticesPath)
            .types(Long.class)
            .map(new MapFunction<Tuple1<Long>, Long>() {
                @Override
                public Long map(Tuple1<Long> tuple) {
                    // Unwrap the single field from the 1-tuple.
                    return tuple.f0;
                }
            });
}
/**
 * Reads the lineitem relation from {@code lineitemPath} as '|'-delimited CSV.
 * Only the columns at positions 0 and 5 of the CSV schema are kept
 * (see the "1000010000000000" mask) and parsed as (Integer, Double).
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of 2-tuples parsed from the lineitem file.
 */
private static DataSet<Tuple2<Integer, Double>> getLineitemDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(lineitemPath)
            .includeFields("1000010000000000")
            .fieldDelimiter('|')
            .types(Integer.class, Double.class);
}
/**
 * Reads the customer relation from {@code customerPath} as '|'-delimited CSV.
 * Only the columns at positions 0 and 6 of the CSV schema are kept
 * (see the "10000010" mask) and parsed into {@link Customer} tuples.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of Customer tuples parsed from the customer file.
 */
private static DataSet<Customer> getCustomerDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(customerPath)
            .includeFields("10000010")
            .fieldDelimiter('|')
            .tupleType(Customer.class);
}
/**
 * Provides the pages input: either parsed from {@code pagesInputPath} as a
 * space-delimited, newline-terminated file of Long page ids, or the built-in
 * default data set when no file input was configured.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of 1-tuples holding page ids.
 */
private static DataSet<Tuple1<Long>> getPagesDataSet(ExecutionEnvironment env) {
    if (!fileOutput) {
        // No file input configured: fall back to the bundled sample data.
        return PageRankData.getDefaultPagesDataSet(env);
    }
    return env.readCsvFile(pagesInputPath)
            .fieldDelimiter(' ')
            .lineDelimiter("\n")
            .types(Long.class);
}
private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet(ExecutionEnvironment env) { // Create DataSet for ranks relation (Rank, URL, Avg-Visit-Duration) if(fileOutput) { return env.readCsvFile(ranksPath) .fieldDelimiter('|') .types(Integer.class, String.class, Integer.class); } else { return WebLogData.getRankDataSet(env); } }
/**
 * Specifies the types for the CSV fields. This method parses the CSV data to a 5-tuple
 * which has fields of the specified types.
 * This method is overloaded for each possible length of the tuples to support type safe
 * creation of data sets through CSV parsing.
 *
 * @param type0 The type of CSV field 0 and the type of field 0 in the returned tuple type.
 * @param type1 The type of CSV field 1 and the type of field 1 in the returned tuple type.
 * @param type2 The type of CSV field 2 and the type of field 2 in the returned tuple type.
 * @param type3 The type of CSV field 3 and the type of field 3 in the returned tuple type.
 * @param type4 The type of CSV field 4 and the type of field 4 in the returned tuple type.
 * @return The {@link eu.stratosphere.api.java.DataSet} representing the parsed CSV data.
 */
public <T0, T1, T2, T3, T4> DataSource<Tuple5<T0, T1, T2, T3, T4>> types(Class<T0> type0, Class<T1> type1, Class<T2> type2, Class<T3> type3, Class<T4> type4) {
    TupleTypeInfo<Tuple5<T0, T1, T2, T3, T4>> typeInfo =
            TupleTypeInfo.getBasicTupleTypeInfo(type0, type1, type2, type3, type4);
    CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>> format =
            new CsvInputFormat<Tuple5<T0, T1, T2, T3, T4>>(path);
    configureInputFormat(format, type0, type1, type2, type3, type4);
    return new DataSource<Tuple5<T0, T1, T2, T3, T4>>(executionContext, format, typeInfo);
}
/**
 * Reads the orders relation from {@code ordersPath} as '|'-delimited CSV.
 * Only the columns at positions 0, 1, and 4 of the CSV schema are kept
 * (see the "110010000" mask) and parsed as (Integer, Integer, String).
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of 3-tuples parsed from the orders file.
 */
private static DataSet<Tuple3<Integer, Integer, String>> getOrdersDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(ordersPath)
            .includeFields("110010000")
            .fieldDelimiter('|')
            .types(Integer.class, Integer.class, String.class);
}
/**
 * Reads the lineitem relation from {@code lineitemPath} as '|'-delimited CSV.
 * Only the columns at positions 0, 5, 6, and 10 of the CSV schema are kept
 * (see the "1000011000100000" mask) and parsed into {@link Lineitem} tuples.
 *
 * @param env The execution environment used to create the data source.
 * @return The DataSet of Lineitem tuples parsed from the lineitem file.
 */
private static DataSet<Lineitem> getLineitemDataSet(ExecutionEnvironment env) {
    return env.readCsvFile(lineitemPath)
            .includeFields("1000011000100000")
            .fieldDelimiter('|')
            .tupleType(Lineitem.class);
}