@Override public void open(FileInputSplit split) throws IOException { super.open(split); @SuppressWarnings("unchecked") FieldParser<Value>[] fieldParsers = (FieldParser<Value>[]) getFieldParsers(); // create the value holders this.parsedValues = new Value[fieldParsers.length]; for (int i = 0; i < fieldParsers.length; i++) { this.parsedValues[i] = fieldParsers[i].createValue(); } //left to right evaluation makes access [0] okay //this marker is used to fasten up readRecord, so that it doesn't have to check each call if the line ending is set to default if(this.getDelimiter().length == 1 && this.getDelimiter()[0] == '\n' ) { this.lineDelimiterIsLinebreak = true; } }
@Override public Record readRecord(Record reuse, byte[] bytes, int offset, int numBytes) throws ParseException { /* * Fix to support windows line endings in CSVInputFiles with standard delimiter setup = \n */ //Find windows end line, so find chariage return before the newline if(this.lineDelimiterIsLinebreak == true && bytes[offset + numBytes -1] == '\r') { //reduce the number of bytes so that the Carriage return is not taken as data numBytes--; } if (parseRecord(parsedValues, bytes, offset, numBytes)) { // valid parse, map values into pact record for (int i = 0; i < parsedValues.length; i++) { reuse.setField(targetPositions[i], parsedValues[i]); } return reuse; } else { return null; } }
/**
 * Creates a CSV input format with the given field delimiter and field types.
 *
 * @param fieldDelimiter the character that separates fields within a record
 * @param fields the value types of the fields, in order of appearance
 */
public CsvInputFormat(char fieldDelimiter, Class<? extends Value>... fields) {
	super();
	this.setFieldDelimiter(fieldDelimiter);
	this.setFieldTypes(fields);
}
// Assembles the test plan: three CSV sources, two joins, and two maps wired via a
// broadcast variable, draining into a single-parallelism sink.
@Override
protected Plan getTestJob() {
	// Sc1 generates M parameters a,b,c for second degree polynomials P(x) = ax^2 + bx + c identified by id
	FileDataSource sc1 = new FileDataSource(new CsvInputFormat(), sc1Path);
	CsvInputFormat.configureRecordFormat(sc1).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1)
		.field(IntValue.class, 2).field(IntValue.class, 3);

	// Sc2 generates N x values to be evaluated with the polynomial identified by id
	FileDataSource sc2 = new FileDataSource(new CsvInputFormat(), sc2Path);
	CsvInputFormat.configureRecordFormat(sc2).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1);

	// Sc3 generates N y values to be evaluated with the polynomial identified by id
	FileDataSource sc3 = new FileDataSource(new CsvInputFormat(), sc3Path);
	CsvInputFormat.configureRecordFormat(sc3).fieldDelimiter(' ').field(StringValue.class, 0).field(IntValue.class, 1);

	// Jn1 matches x and y values on id and emits (id, x, y) triples
	JoinOperator jn1 = JoinOperator.builder(Jn1.class, StringValue.class, 0, 0).input1(sc2).input2(sc3).build();

	// Jn2 matches polynomial and arguments by id, computes p = min(P(x),P(y)) and emits (id, p) tuples
	JoinOperator jn2 = JoinOperator.builder(Jn2.class, StringValue.class, 0, 0).input1(jn1).input2(sc1).build();

	// Mp1 selects (id, x, y) triples where x = y and broadcasts z (=x=y) to Mp2
	MapOperator mp1 = MapOperator.builder(Mp1.class).input(jn1).build();

	// Mp2 filters out all p values which can be divided by z
	MapOperator mp2 = MapOperator.builder(Mp2.class).setBroadcastVariable("z", mp1).input(jn2).build();

	// Single writer so the result file is deterministic for the test's verification.
	FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	output.setDegreeOfParallelism(1);
	output.setInput(mp2);

	return new Plan(output);
}
/**
 * Builds the job-graph input vertex that reads the points file as four
 * space-delimited {@code LongValue} fields and forwards its output locally.
 *
 * @param jobGraph the graph the vertex is added to
 * @param pointsPath path of the points input file
 * @param numSubTasks degree of parallelism for the vertex
 * @param serializer serializer factory for the produced records
 * @return the configured input vertex
 */
@SuppressWarnings("unchecked")
private static JobInputVertex createPointsInput(JobGraph jobGraph, String pointsPath, int numSubTasks,
		TypeSerializerFactory<?> serializer) {
	CsvInputFormat pointsInFormat =
		new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class, LongValue.class);
	JobInputVertex vertex = JobGraphUtils.createInput(
		pointsInFormat, pointsPath, "Input[Points]", jobGraph, numSubTasks, numSubTasks);

	TaskConfig config = new TaskConfig(vertex.getConfiguration());
	config.addOutputShipStrategy(ShipStrategyType.FORWARD);
	config.setOutputSerializer(serializer);

	return vertex;
}
"Field delimiter must be a single character"); } else { setFieldDelimiter(fieldDelimStr.charAt(0)); if (getNumberOfNonNullFields() > 0) { throw new IllegalConfigurationException("Mixing configuration via instance parameters and config parameters is not possible."); setFieldTypes(types); this.targetPositions = new int[getNumberOfNonNullFields()]; for (int i = 0; i < this.targetPositions.length; i++) { this.targetPositions[i] = i; if (getNumberOfNonNullFields() == 0) { throw new IllegalConfigurationException("No fields configured in the CsvInputFormat.");
/**
 * Creates a CSV input format with the given field types and the default
 * field delimiter.
 *
 * @param fields the value types of the fields, in order of appearance
 */
public CsvInputFormat(Class<? extends Value>... fields) {
	super();
	this.setFieldTypes(fields);
}
/**
 * Creates a CSV input format with the given field delimiter; field types
 * must be configured separately.
 *
 * @param fieldDelimiter the character that separates fields within a record
 */
public CsvInputFormat(char fieldDelimiter) {
	super();
	this.setFieldDelimiter(fieldDelimiter);
}
public void setFieldTypes(Class<? extends Value> ... fieldTypes) { if (fieldTypes == null) { throw new IllegalArgumentException("Field types must not be null."); } // sanity check for (Class<? extends Value> type : fieldTypes) { if (type != null && !Value.class.isAssignableFrom(type)) { throw new IllegalArgumentException("The types must be subclasses if " + Value.class.getName()); } } setFieldTypesGeneric(fieldTypes); }
public void setFields(int[] sourceFieldIndices, Class<? extends Value>[] fieldTypes) { Preconditions.checkNotNull(fieldTypes); // sanity check for (Class<? extends Value> type : fieldTypes) { if (!Value.class.isAssignableFrom(type)) { throw new IllegalArgumentException("The types must be subclasses if " + Value.class.getName()); } } setFieldsGeneric(sourceFieldIndices, fieldTypes); }
// Builds a pass-through plan (single IntValue field, '|'-delimited) whose sink is
// globally ordered ascending over a uniform integer range partitioning.
@Override
public Plan getPlan(String... args) throws IllegalArgumentException {
	// parse program parameters; defaults: parallelism 1, empty paths
	int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
	String recordsPath = (args.length > 1 ? args[1] : "");
	String output = (args.length > 2 ? args[2] : "");

	FileDataSource source = new FileDataSource(CsvInputFormat.class, recordsPath);
	source.setDegreeOfParallelism(numSubtasks);
	CsvInputFormat.configureRecordFormat(source)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.field(IntValue.class, 0);

	// lenient(true): tolerate records that are missing fields instead of failing
	FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output);
	sink.setDegreeOfParallelism(numSubtasks);
	CsvOutputFormat.configureRecordFormat(sink)
		.recordDelimiter('\n')
		.fieldDelimiter('|')
		.lenient(true)
		.field(IntValue.class, 0);

	// Global ascending order on field 0, range-partitioned uniformly over the full int range.
	sink.setGlobalOrder(new Ordering(0, IntValue.class, Order.ASCENDING),
		new UniformIntegerDistribution(Integer.MIN_VALUE, Integer.MAX_VALUE));
	sink.setInput(source);

	return new Plan(sink);
}
FileDataSource orders1 = new FileDataSource(new CsvInputFormat(), orders1Path, "Orders 1"); CsvInputFormat.configureRecordFormat(orders1) .recordDelimiter('\n') .fieldDelimiter('|') .field(StringValue.class, 5, 8); // order prio FileDataSource orders2 = new FileDataSource(new CsvInputFormat(), orders2Path, "Orders 2"); CsvInputFormat.configureRecordFormat(orders2) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource lineitems = new FileDataSource(new CsvInputFormat(), lineitemsPath, "LineItems"); CsvInputFormat.configureRecordFormat(lineitems) .recordDelimiter('\n') .fieldDelimiter('|') .build(); FileDataSource partJoin1 = new FileDataSource(new CsvInputFormat(), partJoin1Path, "Part Join 1"); CsvInputFormat.configureRecordFormat(partJoin1) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource partJoin2 = new FileDataSource(new CsvInputFormat(), partJoin2Path, "Part Join 2"); CsvInputFormat.configureRecordFormat(partJoin2) .recordDelimiter('\n') .fieldDelimiter('|')
/**
 * Builds the job-graph input vertex that reads the models file as four
 * space-delimited {@code LongValue} fields and broadcasts its output.
 *
 * @param jobGraph the graph the vertex is added to
 * @param pointsPath path of the models input file
 * @param numSubTasks degree of parallelism for the vertex
 * @param serializer serializer factory for the produced records
 * @return the configured input vertex
 */
@SuppressWarnings("unchecked")
private static JobInputVertex createModelsInput(JobGraph jobGraph, String pointsPath, int numSubTasks,
		TypeSerializerFactory<?> serializer) {
	CsvInputFormat modelsInFormat =
		new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class, LongValue.class);
	JobInputVertex vertex = JobGraphUtils.createInput(
		modelsInFormat, pointsPath, "Input[Models]", jobGraph, numSubTasks, numSubTasks);

	TaskConfig config = new TaskConfig(vertex.getConfiguration());
	config.addOutputShipStrategy(ShipStrategyType.BROADCAST);
	config.setOutputSerializer(serializer);

	return vertex;
}
/**
 * Array-based convenience variant of {@link #setFieldTypes}; simply delegates
 * to the varargs overload.
 *
 * @param fieldTypes the value classes of the fields, in order of appearance
 */
public void setFieldTypesArray(Class<? extends Value>[] fieldTypes) {
	this.setFieldTypes(fieldTypes);
}
FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders"); orders.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(orders) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers"); customers.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(customers) .recordDelimiter('\n') .fieldDelimiter('|')
private static JobInputVertex createEdgesInput(JobGraph jobGraph, String edgesPath, int numSubTasks, TypeSerializerFactory<?> serializer, TypeComparatorFactory<?> comparator) { // edges @SuppressWarnings("unchecked") CsvInputFormat edgesInFormat = new CsvInputFormat(' ', LongValue.class, LongValue.class); JobInputVertex edgesInput = JobGraphUtils.createInput(edgesInFormat, edgesPath, "EdgesInput", jobGraph, numSubTasks, numSubTasks); TaskConfig edgesInputConfig = new TaskConfig(edgesInput.getConfiguration()); { edgesInputConfig.setOutputSerializer(serializer); edgesInputConfig.addOutputShipStrategy(ShipStrategyType.PARTITION_HASH); edgesInputConfig.setOutputComparator(comparator, 0); } return edgesInput; }
FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders"); CsvInputFormat.configureRecordFormat(orders) .recordDelimiter('\n') .fieldDelimiter('|') FileDataSource lineitems = new FileDataSource(new CsvInputFormat(), lineitemsPath, "LineItems"); CsvInputFormat.configureRecordFormat(lineitems) .recordDelimiter('\n') .fieldDelimiter('|')
// Initial solution set: space-delimited (Long, Double) records.
FileDataSource initialSolutionSet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, DoubleValue.class), solutionSetInput, "Initial Solution Set");

// Initial workset / delta set: same (Long, Double) layout as the solution set.
FileDataSource initialDeltaSet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, DoubleValue.class), deltasInput, "Initial DeltaSet");

// Dependency set: space-delimited (Long, Long, Long) records.
FileDataSource dependencySet = new FileDataSource(
	new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class), dependencySetInput, "Dependency Set");
CsvInputFormat docsFormat = new CsvInputFormat('|', StringValue.class, StringValue.class); FileDataSource docs = new FileDataSource(docsFormat, docsInput, "Docs Input"); FileDataSource ranks = new FileDataSource(new CsvInputFormat(), ranksInput, "Ranks input"); CsvInputFormat.configureRecordFormat(ranks) .recordDelimiter('\n') .fieldDelimiter('|') CsvInputFormat visitsFormat = new CsvInputFormat('|', null, StringValue.class, StringValue.class); FileDataSource visits = new FileDataSource(visitsFormat, visitsInput, "Visits input:q");
// Data points: '|'-delimited (Int id, Double, Double, Double) records.
FileDataSource pointsSource = new FileDataSource(
	new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
	dataPointInput, "Data Points");

// Cluster centers: identical '|'-delimited (Int id, Double, Double, Double) layout.
FileDataSource clustersSource = new FileDataSource(
	new CsvInputFormat('|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
	clusterInput, "Centers");