org.apache.flink.api.java.io.CsvReader java code examples

Refine search

ExecutionEnvironment

/**
 * Supplies the String/String/Integer triplets used by the job.
 *
 * <p>When {@code fileOutput} is set, the triplets are parsed from the tab-separated,
 * newline-delimited file at {@code userSongTripletsInputPath}; otherwise they are obtained
 * from {@code MusicProfilesData}.
 *
 * @param env the execution environment used to create the data set
 * @return the triplet data set
 */
private static DataSet<Tuple3<String, String, Integer>> getUserSongTripletsData(ExecutionEnvironment env) {
  // No input file configured: fall back to the data provided by MusicProfilesData.
  if (!fileOutput) {
    return MusicProfilesData.getUserSongTriplets(env);
  }

  return env.readCsvFile(userSongTripletsInputPath)
      .lineDelimiter("\n")
      .fieldDelimiter("\t")
      .types(String.class, String.class, Integer.class);
}

/**
 * Reads the pipe-delimited file at {@code ordersPath} into a data set of {@code Order} tuples.
 *
 * <p>Only the columns flagged with {@code 1} in the include mask (positions 0, 1, 4 and 7) are
 * parsed; all other columns are skipped.
 *
 * @param env the execution environment used to create the CSV source
 * @param ordersPath location of the orders file
 * @return the parsed orders
 */
private static DataSet<Order> getOrdersDataSet(ExecutionEnvironment env, String ordersPath) {
  return env
      .readCsvFile(ordersPath)
      .fieldDelimiter("|")
      .includeFields("110010010")
      .tupleType(Order.class);
}

/**
 * Creates a graph CSV reader for a vertex file and an edge file.
 *
 * <p>A separate {@link CsvReader} is created for each of the two paths, both bound to the given
 * execution environment. The mapper is left unset ({@code null}).
 *
 * @param vertexPath path handed to the vertex {@link CsvReader}
 * @param edgePath path handed to the edge {@link CsvReader}
 * @param context execution environment passed to both readers and stored on this instance
 */
public GraphCsvReader(Path vertexPath, Path edgePath, ExecutionEnvironment context) {
  this.vertexPath = vertexPath;
  this.edgePath = edgePath;
  this.vertexReader = new CsvReader(vertexPath, context);
  this.edgeReader = new CsvReader(edgePath, context);
  // No mapper is configured by this constructor.
  this.mapper = null;
  this.executionContext = context;
}

  /**
   * Reads the pipe-delimited file at {@code nationPath} as (Integer, String) pairs.
   *
   * <p>The include mask keeps the first two columns and skips the following two.
   *
   * @param env the execution environment used to create the CSV source
   * @param nationPath location of the nations file
   * @return the parsed pairs
   */
  private static DataSet<Tuple2<Integer, String>> getNationsDataSet(ExecutionEnvironment env, String nationPath) {
    return env
        .readCsvFile(nationPath)
        .fieldDelimiter("|")
        .includeFields("1100")
        .types(Integer.class, String.class);
  }
}

/**
 * Builds and runs a batch job that filters orders, joins them with line items on the first
 * field, groups the join result by its first two fields and writes the reduced result as CSV.
 *
 * @param args {@code args[0]}: parallelism, {@code args[1]}: orders input path,
 *     {@code args[2]}: line-item input path, {@code args[3]}: output path
 * @throws Exception if the arguments cannot be read or the job fails
 */
public static void tcph3(String[] args) throws Exception {
  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
  final int parallelism = Integer.parseInt(args[0]);
  env.setParallelism(parallelism);

  // Orders: order id, order status, order data, order prio, ship prio.
  // NOTE(review): the mask below selects columns 0, 2, 4, 5 and 8 — confirm that column 8
  // (rather than 7) really holds the ship priority in the input layout.
  final DataSet<Tuple5<Long, String, String, String, Integer>> orderRows =
      env.readCsvFile(args[1])
          .fieldDelimiter("|")
          .lineDelimiter("\n")
          .includeFields("101011001")
          .types(Long.class, String.class, String.class, String.class, Integer.class)
          .name(ORDERS);

  // Line items: order id, extended price.
  final DataSet<Tuple2<Long, Double>> lineItemRows =
      env.readCsvFile(args[2])
          .fieldDelimiter("|")
          .lineDelimiter("\n")
          .includeFields("100001")
          .types(Long.class, Double.class)
          .name(LINEITEM);

  // Filter the orders, then join the survivors with the line items on field 0 of both sides.
  final DataSet<Tuple2<Long, Integer>> filteredOrders =
      orderRows.flatMap(new FilterO()).name(MAPPER_NAME);
  final DataSet<Tuple3<Long, Integer, Double>> joined =
      filteredOrders.join(lineItemRows)
          .where(0)
          .equalTo(0)
          .with(new JoinLiO())
          .name(JOIN_NAME);

  // Group by fields 0 and 1 and reduce each group.
  final DataSet<Tuple3<Long, Integer, Double>> aggregated =
      joined.groupBy(0, 1).reduceGroup(new AggLiO()).name(REDUCE_NAME);

  aggregated.writeAsCsv(args[3], "\n", "|").name(SINK);
  env.execute();
}

final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
env.getConfig().setGlobalJobParameters(params);
if (params.has("input")) {
  data = env.readCsvFile(params.get("input"))
      .fieldDelimiter(" ")
      .includeFields(true, true)
      .pojoType(Data.class);
} else {
  System.out.println("Executing LinearRegression example with default input data set.");

/**
 * Creates a stream of {@code Order} tuples from the pipe-delimited file at {@code ordersPath}.
 *
 * <p>A {@link CsvReader} is used only to build the tuple input format; the format is then
 * attached to the streaming environment through either {@code createInputV2} or
 * {@code createInput}, depending on {@code useSourceV2}.
 *
 * @param env the streaming environment that receives the input
 * @param ordersPath location of the orders file
 * @param useSourceV2 selects {@code createInputV2} when {@code true}, {@code createInput} otherwise
 * @return the stream produced by the chosen source
 */
private static SingleOutputStreamOperator<Order> getOrdersDataStream(StreamExecutionEnvironment env, String ordersPath, boolean useSourceV2) {
  final CsvReader ordersReader =
      new CsvReader(ordersPath, ExecutionEnvironment.getExecutionEnvironment());
  // The setters configure the reader in place, so their return values are not needed here.
  ordersReader.fieldDelimiter("|");
  ordersReader.includeFields("110010010");

  final TupleCsvInputFormat<Order> orderFormat =
      ordersReader.generateTupleCsvInputFormat(Order.class);

  if (!useSourceV2) {
    return env.createInput(orderFormat, orderFormat.getTupleTypeInfo(), "Order source v1");
  }
  return env.createInputV2(orderFormat, orderFormat.getTupleTypeInfo(), "Order source v2");
}

/**
 * Provides the input for the example: the semicolon-delimited file named by the
 * {@code input} parameter when present, otherwise the built-in example tuples.
 *
 * @param env the execution environment used to create the data set
 * @param params command-line parameters; {@code input} is optional
 * @return the input data set
 */
@SuppressWarnings("unchecked")
private static DataSet<StringTriple> getDataSet(ExecutionEnvironment env, ParameterTool params) {
  if (!params.has("input")) {
    // No file given: tell the user and use the example collection instead.
    System.out.println("Executing EmptyFieldsCountAccumulator example with default input data set.");
    System.out.println("Use --input to specify file input.");
    return env.fromCollection(getExampleInputTuples());
  }

  return env.readCsvFile(params.get("input"))
      .fieldDelimiter(";")
      .pojoType(StringTriple.class);
}

/**
 * Passing {@code null} as the POJO field order is expected to raise a
 * {@link NullPointerException}.
 */
@Test(expected = NullPointerException.class)
public void testPOJOTypeWithoutFieldsOrder() throws Exception {
  // The file content is irrelevant; the call is expected to fail before any data is read.
  final String emptyContent = "";
  final String csvPath = createInputData(emptyContent);

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
  env.readCsvFile(csvPath)
      .pojoType(POJOItem.class, null);
}

/**
 * Provides the centroids for the K-Means example.
 *
 * <p>If the {@code centroids} parameter is present, the space-delimited file it names is parsed
 * into {@code Centroid} POJOs with the field order {@code id, x, y}; otherwise the default
 * centroid data set from {@code KMeansData} is used.
 *
 * @param params command-line parameters; {@code centroids} is optional
 * @param env the execution environment used to create the data set
 * @return the centroid data set
 */
private static DataSet<Centroid> getCentroidDataSet(ParameterTool params, ExecutionEnvironment env) {
  if (params.has("centroids")) {
    return env.readCsvFile(params.get("centroids"))
        .fieldDelimiter(" ")
        .pojoType(Centroid.class, "id", "x", "y");
  }

  System.out.println("Executing K-Means example with default centroid data set.");
  System.out.println("Use --centroids to specify file input.");
  return KMeansData.getDefaultCentroidDataSet(env);
}

/**
 * Creates the data set for the ranks relation (Rank, URL, Avg-Visit-Duration).
 *
 * <p>The pipe-delimited file named by the {@code ranks} parameter is used when present;
 * otherwise the default data from {@code WebLogData} is returned.
 *
 * @param env the execution environment used to create the data set
 * @param params command-line parameters; {@code ranks} is optional
 * @return the ranks data set
 */
private static DataSet<Tuple3<Integer, String, Integer>> getRanksDataSet(ExecutionEnvironment env, ParameterTool params) {
  if (!params.has("ranks")) {
    System.out.println("Executing WebLogAnalysis example with default ranks data set.");
    System.out.println("Use --ranks to specify file input.");
    return WebLogData.getRankDataSet(env);
  }

  return env.readCsvFile(params.get("ranks"))
      .fieldDelimiter("|")
      .types(Integer.class, String.class, Integer.class);
}

/**
 * Reads a three-column CSV while skipping the middle column and mapping the remaining two
 * columns to the POJO fields {@code f2} and {@code f1}, then compares the collected result
 * with the expected text.
 */
@Test
public void testPOJOTypeWithFieldsOrderAndFieldsSelection() throws Exception {
  final String csvContent = "3,2.20,ABC\n5,5.1,DEF\n1,3.30,DEF\n10,3.30,GHI";
  final String csvPath = createInputData(csvContent);

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
  // Keep columns 0 and 2; the field names are given in the order f2, f1.
  DataSet<POJOItem> items =
      env.readCsvFile(csvPath)
          .includeFields(true, false, true)
          .pojoType(POJOItem.class, "f2", "f1");
  List<POJOItem> collected = items.collect();

  expected = "ABC,3,0.00\nDEF,5,0.00\nDEF,1,0.00\nGHI,10,0.00";
  compareResultAsText(collected, expected);
}

/**
 * Parses two CSV rows into tuples of eight {@code Value} types and checks that the collected
 * tuples match the input text.
 */
@Test
public void testValueTypes() throws Exception {
  final String csvContent = "ABC,true,1,2,3,4,5.0,6.0\nBCD,false,1,2,3,4,5.0,6.0";
  final String csvPath = createInputData(csvContent);

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
  DataSet<Tuple8<StringValue, BooleanValue, ByteValue, ShortValue, IntValue, LongValue, FloatValue, DoubleValue>> rows =
      env.readCsvFile(csvPath)
          .types(
              StringValue.class,
              BooleanValue.class,
              ByteValue.class,
              ShortValue.class,
              IntValue.class,
              LongValue.class,
              FloatValue.class,
              DoubleValue.class);
  List<Tuple8<StringValue, BooleanValue, ByteValue, ShortValue, IntValue, LongValue, FloatValue, DoubleValue>> collected =
      rows.collect();

  // The expected output is the input itself.
  expected = csvContent;
  compareResultAsTuples(collected, expected);
}

/**
 * Provides the vertex data set.
 *
 * <p>When {@code fileOutput} is set, rows of (Long, Double, Double) are read from
 * {@code verticesInputPath} and each row is turned into a vertex whose id is the first field
 * and whose value is a {@code Point} built from the other two. Otherwise the default vertices
 * from {@code EuclideanGraphData} are returned.
 *
 * @param env the execution environment used to create the data set
 * @return the vertex data set
 */
private static DataSet<Vertex<Long, Point>> getVerticesDataSet(ExecutionEnvironment env) {
  if (!fileOutput) {
    return EuclideanGraphData.getDefaultVertexDataSet(env);
  }

  return env.readCsvFile(verticesInputPath)
      .lineDelimiter("\n")
      .types(Long.class, Double.class, Double.class)
      .map(new MapFunction<Tuple3<Long, Double, Double>, Vertex<Long, Point>>() {
        @Override
        public Vertex<Long, Point> map(Tuple3<Long, Double, Double> row) throws Exception {
          final Point location = new Point(row.f1, row.f2);
          return new Vertex<>(row.f0, location);
        }
      });
}

/**
 * Configures which fields of the CSV file containing edges data should be included and which should be skipped. The
 * parser will look at the first {@code n} fields, where {@code n} is the length of the boolean
 * array. The parser will skip over all fields where the boolean value at the corresponding position
 * in the array is {@code false}. The result contains the fields where the corresponding position in
 * the boolean array is {@code true}.
 * The number of fields in the result is consequently equal to the number of times that {@code true}
 * occurs in the fields array.
 *
 * @param edgeFields The array of flags that describes which fields are to be included from the CSV file for edges.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader includeFieldsEdges(boolean ... edgeFields) {
  // Delegates to the edge reader only.
  this.edgeReader.includeFields(edgeFields);
  return this;
}

/**
 * Provides the vertex ids for the Connected Components example.
 *
 * <p>If the {@code vertices} parameter is present, the file it names is read as single-column
 * rows of {@code Long} and each row is unwrapped to its value. Otherwise the default vertex
 * data set from {@code ConnectedComponentsData} is used.
 *
 * @param env the execution environment used to create the data set
 * @param params command-line parameters; {@code vertices} is optional
 * @return the vertex id data set
 */
private static DataSet<Long> getVertexDataSet(ExecutionEnvironment env, ParameterTool params) {
  if (params.has("vertices")) {
    return env.readCsvFile(params.get("vertices")).types(Long.class).map(
      new MapFunction<Tuple1<Long>, Long>() {
        // @Override lets the compiler verify this really implements MapFunction#map.
        @Override
        public Long map(Tuple1<Long> value) {
          return value.f0;
        }
      });
  } else {
    System.out.println("Executing Connected Components example with default vertices data set.");
    System.out.println("Use --vertices to specify file input.");
    return ConnectedComponentsData.getDefaultVertexDataSet(env);
  }
}

/**
 * Requesting a custom {@code Value} type is expected to be rejected with an
 * {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testWithInvalidValueType2() throws Exception {
  // CsvReader doesn't support custom Value type
  getCsvReader().types(ValueItem.class);
}

/**
 * Builds a {@link CsvReader} for a fixed placeholder path on a local environment created with
 * an argument of {@code 1}.
 *
 * @return a new reader instance
 */
private static CsvReader getCsvReader() {
  final ExecutionEnvironment localEnv = ExecutionEnvironment.createLocalEnvironment(1);
  return new CsvReader("/some/none/existing/path", localEnv);
}

/**
 * Configures the delimiter that separates rows for the CSV reader used to read the edges.
 * The linebreak character ({@code '\n'}) is used by default.
 *
 * @param delimiter The delimiter that separates the rows.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader lineDelimiterEdges(String delimiter) {
  // Delegates to the edge reader only.
  edgeReader.lineDelimiter(delimiter);
  return this;
}

/**
 * Configures the delimiter that separates fields in a row for the CSV reader used to read the edges.
 * The comma character ({@code ','}) is used by default.
 *
 * @param delimiter The delimiter that separates the fields in a row.
 * @return The GraphCsvReader instance itself, to allow for fluent function chaining.
 */
public GraphCsvReader fieldDelimiterEdges(String delimiter) {
  // Delegates to the edge reader only.
  this.edgeReader.fieldDelimiter(delimiter);
  return this;
}

Javadoc

A builder class to instantiate a CSV parsing data source. The CSV reader configures the field types, the delimiters (row and field), the fields that should be included or skipped, and other flags such as whether to skip the initial line as the header.

Most used methods

types
Specifies the types for the CSV fields. This method parses the CSV data to a 25-tuple which has fiel
fieldDelimiter
Configures the delimiter that separates the fields within a row. The comma character ( ',') is used
includeFields
Configures which fields of the CSV file should be included and which should be skipped. The parser w
lineDelimiter
Configures the delimiter that separates the lines/rows. The linebreak character ( '\n') is used by d
<init>
tupleType
Configures the reader to read the CSV data and parse it to the given type. The type must be a subcla
ignoreComments
Configures the string that starts comments. By default comments will be treated as invalid lines. Th
pojoType
Configures the reader to read the CSV data and parse it to the given type. All fields of the typ
ignoreFirstLine
Sets the CSV reader to ignore the first line. This is useful for files that contain a header line.
ignoreInvalidLines
Sets the CSV reader to ignore any invalid lines. This is useful for files that contain an empty line
parseQuotedStrings
Enables quoted String parsing. Field delimiters in quoted Strings are ignored. A String is parsed as
configureInputFormat

Popular in Java

Reactive rest calls using spring rest template
getContentResolver (Context)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
compareTo (BigDecimal)
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un
Timestamp (java.sql)
A Java representation of the SQL TIMESTAMP type. It provides the capability of representing the SQL
Table (com.google.common.collect)
A collection that associates an ordered pair of keys, called a row key and a column key, with a sing
VirtualMachine (com.sun.tools.attach)
A Java virtual machine. A VirtualMachine represents a Java virtual machine to which this Java vir
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
Reflections (org.reflections)
Reflections one-stop-shop object. Reflections scans your classpath, indexes the metadata, allows you t
Best plugins for Eclipse

How to use CsvReader in org.apache.flink.api.java.io

Best Java code snippets using org.apache.flink.api.java.io.CsvReader (Showing top 20 results out of 315)

Refine search

How to use
CsvReader
in
org.apache.flink.api.java.io