@Override public void open(FileInputSplit split) throws IOException { super.open(split); @SuppressWarnings("unchecked") FieldParser<Value>[] fieldParsers = (FieldParser<Value>[]) getFieldParsers(); // create the value holders this.parsedValues = new Value[fieldParsers.length]; for (int i = 0; i < fieldParsers.length; i++) { this.parsedValues[i] = fieldParsers[i].createValue(); } //left to right evaluation makes access [0] okay //this marker is used to fasten up readRecord, so that it doesn't have to check each call if the line ending is set to default if(this.getDelimiter().length == 1 && this.getDelimiter()[0] == '\n' ) { this.lineDelimiterIsLinebreak = true; } }
@Override public Record readRecord(Record reuse, byte[] bytes, int offset, int numBytes) throws ParseException { /* * Fix to support windows line endings in CSVInputFiles with standard delimiter setup = \n */ //Find windows end line, so find chariage return before the newline if(this.lineDelimiterIsLinebreak == true && bytes[offset + numBytes -1] == '\r') { //reduce the number of bytes so that the Carriage return is not taken as data numBytes--; } if (parseRecord(parsedValues, bytes, offset, numBytes)) { // valid parse, map values into pact record for (int i = 0; i < parsedValues.length; i++) { reuse.setField(targetPositions[i], parsedValues[i]); } return reuse; } else { return null; } }
/**
 * Creates a CSV input format with the given field delimiter and field types.
 *
 * @param fieldDelimiter the character that separates fields within a record
 * @param fields the value types of the fields, in order of appearance
 */
public CsvInputFormat(char fieldDelimiter, Class<? extends Value>... fields) {
	super();
	this.setFieldDelimiter(fieldDelimiter);
	this.setFieldTypes(fields);
}
// Assembles the test plan: three CSV sources, two joins, and two maps wired via a
// broadcast variable, draining into a single-parallelism sink.
@Override
protected Plan getTestJob() {
	// Sc1 generates M parameters a,b,c for second degree polynomials P(x) = ax^2 + bx + c identified by id
	FileDataSource sc1 = new FileDataSource(new CsvInputFormat(), sc1Path);
	CsvInputFormat.configureRecordFormat(sc1).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1)
		.field(IntValue.class, 2).field(IntValue.class, 3);

	// Sc2 generates N x values to be evaluated with the polynomial identified by id
	FileDataSource sc2 = new FileDataSource(new CsvInputFormat(), sc2Path);
	CsvInputFormat.configureRecordFormat(sc2).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1);

	// Sc3 generates N y values to be evaluated with the polynomial identified by id
	FileDataSource sc3 = new FileDataSource(new CsvInputFormat(), sc3Path);
	CsvInputFormat.configureRecordFormat(sc3).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1);

	// Jn1 matches x and y values on id and emits (id, x, y) triples
	JoinOperator jn1 = JoinOperator.builder(Jn1.class, StringValue.class, 0, 0).input1(sc2).input2(sc3).build();

	// Jn2 matches polynomial and arguments by id, computes p = min(P(x),P(y)) and emits (id, p) tuples
	JoinOperator jn2 = JoinOperator.builder(Jn2.class, StringValue.class, 0, 0).input1(jn1).input2(sc1).build();

	// Mp1 selects (id, x, y) triples where x = y and broadcasts z (=x=y) to Mp2
	MapOperator mp1 = MapOperator.builder(Mp1.class).input(jn1).build();

	// Mp2 filters out all p values which can be divided by z
	MapOperator mp2 = MapOperator.builder(Mp2.class).setBroadcastVariable("z", mp1).input(jn2).build();

	// Single writer so the result file is deterministic for the test's verification.
	FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	output.setDegreeOfParallelism(1);
	output.setInput(mp2);

	return new Plan(output);
}
/**
 * Builds the job-graph input vertex that reads the points file as four
 * space-delimited {@code LongValue} fields and forwards its output locally.
 *
 * @param jobGraph the graph the vertex is added to
 * @param pointsPath path of the points input file
 * @param numSubTasks degree of parallelism for the vertex
 * @param serializer serializer factory for the produced records
 * @return the configured input vertex
 */
@SuppressWarnings("unchecked")
private static JobInputVertex createPointsInput(JobGraph jobGraph, String pointsPath, int numSubTasks,
		TypeSerializerFactory<?> serializer) {
	CsvInputFormat pointsInFormat =
		new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class, LongValue.class);
	JobInputVertex vertex = JobGraphUtils.createInput(
		pointsInFormat, pointsPath, "Input[Points]", jobGraph, numSubTasks, numSubTasks);

	TaskConfig config = new TaskConfig(vertex.getConfiguration());
	config.addOutputShipStrategy(ShipStrategyType.FORWARD);
	config.setOutputSerializer(serializer);

	return vertex;
}
"Field delimiter must be a single character"); } else { setFieldDelimiter(fieldDelimStr.charAt(0)); if (getNumberOfNonNullFields() > 0) { throw new IllegalConfigurationException("Mixing configuration via instance parameters and config parameters is not possible."); setFieldTypes(types); this.targetPositions = new int[getNumberOfNonNullFields()]; for (int i = 0; i < this.targetPositions.length; i++) { this.targetPositions[i] = i; if (getNumberOfNonNullFields() == 0) { throw new IllegalConfigurationException("No fields configured in the CsvInputFormat.");
/**
 * Creates a CSV input format with the given field types and the default
 * field delimiter.
 *
 * @param fields the value types of the fields, in order of appearance
 */
public CsvInputFormat(Class<? extends Value>... fields) {
	super();
	this.setFieldTypes(fields);
}
/**
 * Creates a CSV input format with the given field delimiter; field types
 * must be configured separately.
 *
 * @param fieldDelimiter the character that separates fields within a record
 */
public CsvInputFormat(char fieldDelimiter) {
	super();
	this.setFieldDelimiter(fieldDelimiter);
}
public void setFieldTypes(Class<? extends Value> ... fieldTypes) { if (fieldTypes == null) { throw new IllegalArgumentException("Field types must not be null."); } // sanity check for (Class<? extends Value> type : fieldTypes) { if (type != null && !Value.class.isAssignableFrom(type)) { throw new IllegalArgumentException("The types must be subclasses if " + Value.class.getName()); } } setFieldTypesGeneric(fieldTypes); }
public void setFields(int[] sourceFieldIndices, Class<? extends Value>[] fieldTypes) { Preconditions.checkNotNull(fieldTypes); // sanity check for (Class<? extends Value> type : fieldTypes) { if (!Value.class.isAssignableFrom(type)) { throw new IllegalArgumentException("The types must be subclasses if " + Value.class.getName()); } } setFieldsGeneric(sourceFieldIndices, fieldTypes); }
// Builds a pass-through plan (single IntValue field, '|'-delimited) whose sink is
// globally ordered ascending over a uniform integer range partitioning.
@Override
public Plan getPlan(String... args) throws IllegalArgumentException {
	// parse program parameters; defaults: parallelism 1, empty paths
	int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String recordsPath = (args.length > 1 ? args[1] : "");
	String output = (args.length > 2 ? args[2] : "");

	FileDataSource source = new FileDataSource(CsvInputFormat.class, recordsPath);
	source.setDegreeOfParallelism(numSubtasks);
	CsvInputFormat.configureRecordFormat(source)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.field(IntValue.class, 0);

	// lenient(true): tolerate records that are missing fields instead of failing
	FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output);
	sink.setDegreeOfParallelism(numSubtasks);
	CsvOutputFormat.configureRecordFormat(sink)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.lenient(true)
		.field(IntValue.class, 0);

	// Global ascending order on field 0, range-partitioned uniformly over the full int range.
	sink.setGlobalOrder(new Ordering(0, IntValue.class, Order.ASCENDING),
		new UniformIntegerDistribution(Integer.MIN_VALUE, Integer.MAX_VALUE));
	sink.setInput(source);

	return new Plan(sink);
}
FileDataSource orders1 = new FileDataSource(new CsvInputFormat(), orders1Path, "Orders 1"); CsvInputFormat.configureRecordFormat(orders1) .recordDelimiter('\n') .fieldDelimiter('|') .field(StringValue.class, 5, 8); // order prio FileDataSource orders2 = new FileDataSource(new CsvInputFormat(), orders2Path, "Orders 2"); CsvInputFormat.configureRecordFormat(orders2) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource lineitems = new FileDataSource(new CsvInputFormat(), lineitemsPath, "LineItems"); CsvInputFormat.configureRecordFormat(lineitems) .recordDelimiter('\n') .fieldDelimiter('|') .build(); FileDataSource partJoin1 = new FileDataSource(new CsvInputFormat(), partJoin1Path, "Part Join 1"); CsvInputFormat.configureRecordFormat(partJoin1) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource partJoin2 = new FileDataSource(new CsvInputFormat(), partJoin2Path, "Part Join 2"); CsvInputFormat.configureRecordFormat(partJoin2) .recordDelimiter('\n') .fieldDelimiter('|')
/**
 * Builds the job-graph input vertex that reads the models file as four
 * space-delimited {@code LongValue} fields and broadcasts its output.
 *
 * @param jobGraph the graph the vertex is added to
 * @param pointsPath path of the models input file
 * @param numSubTasks degree of parallelism for the vertex
 * @param serializer serializer factory for the produced records
 * @return the configured input vertex
 */
@SuppressWarnings("unchecked")
private static JobInputVertex createModelsInput(JobGraph jobGraph, String pointsPath, int numSubTasks,
		TypeSerializerFactory<?> serializer) {
	CsvInputFormat modelsInFormat =
		new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class, LongValue.class);
	JobInputVertex vertex = JobGraphUtils.createInput(
		modelsInFormat, pointsPath, "Input[Models]", jobGraph, numSubTasks, numSubTasks);

	TaskConfig config = new TaskConfig(vertex.getConfiguration());
	config.addOutputShipStrategy(ShipStrategyType.BROADCAST);
	config.setOutputSerializer(serializer);

	return vertex;
}
/**
 * Array-based convenience variant of {@link #setFieldTypes}; simply delegates
 * to the varargs overload.
 *
 * @param fieldTypes the value classes of the fields, in order of appearance
 */
public void setFieldTypesArray(Class<? extends Value>[] fieldTypes) {
	this.setFieldTypes(fieldTypes);
}
FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders"); orders.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(orders) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers"); customers.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(customers) .recordDelimiter('\n') .fieldDelimiter('|')
private static JobInputVertex createEdgesInput(JobGraph jobGraph, String edgesPath, int numSubTasks, TypeSerializerFactory<?> serializer, TypeComparatorFactory<?> comparator) { // edges @SuppressWarnings("unchecked") CsvInputFormat edgesInFormat = new CsvInputFormat(' ', LongValue.class, LongValue.class); JobInputVertex edgesInput = JobGraphUtils.createInput(edgesInFormat, edgesPath, "EdgesInput", jobGraph, numSubTasks, numSubTasks); TaskConfig edgesInputConfig = new TaskConfig(edgesInput.getConfiguration()); { edgesInputConfig.setOutputSerializer(serializer); edgesInputConfig.addOutputShipStrategy(ShipStrategyType.PARTITION_HASH); edgesInputConfig.setOutputComparator(comparator, 0); } return edgesInput; }
FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders"); CsvInputFormat.configureRecordFormat(orders) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource lineitems = new FileDataSource(new CsvInputFormat(), lineitemsPath, "LineItems"); CsvInputFormat.configureRecordFormat(lineitems) .recordDelimiter('\n') .fieldDelimiter('|')
// Initial solution set: space-delimited (Long, Double) records.
FileDataSource initialSolutionSet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, DoubleValue.class), solutionSetInput, "Initial Solution Set");

// Initial workset / delta set: same (Long, Double) layout as the solution set.
FileDataSource initialDeltaSet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, DoubleValue.class), deltasInput, "Initial DeltaSet");

// Dependency set: space-delimited (Long, Long, Long) records.
FileDataSource dependencySet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class), dependencySetInput, "Dependency Set");
CsvInputFormat docsFormat = new CsvInputFormat('|', StringValue.class, StringValue.class); FileDataSource docs = new FileDataSource(docsFormat, docsInput, "Docs Input"); FileDataSource ranks = new FileDataSource(new CsvInputFormat(), ranksInput, "Ranks input"); CsvInputFormat.configureRecordFormat(ranks) .recordDelimiter('\n') .fieldDelimiter('|') CsvInputFormat visitsFormat = new CsvInputFormat('|', null, StringValue.class, StringValue.class); FileDataSource visits = new FileDataSource(visitsFormat, visitsInput, "Visits input:q");
// Data points: '|'-delimited (Int id, Double, Double, Double) records.
FileDataSource pointsSource = new FileDataSource(
	new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
	dataPointInput, "Data Points");

// Cluster centers: identical '|'-delimited (Int id, Double, Double, Double) layout.
FileDataSource clustersSource = new FileDataSource(
	new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
	clusterInput, "Centers");