eu.stratosphere.api.java.record.operators.FileDataSource java code examples

// File source named "DanglingPageWithRankInput": reads pageWithRankInputPath through a
// DanglingPageRankInputFormat.
FileDataSource pageWithRankInput = new FileDataSource(new DanglingPageRankInputFormat(),
  pageWithRankInputPath, "DanglingPageWithRankInput");
// numVertices is stored in the source's parameters under the format's NUM_VERTICES_PARAMETER key.
pageWithRankInput.getParameters().setLong(DanglingPageRankInputFormat.NUM_VERTICES_PARAMETER, numVertices);
// Second file source: reads adjacencyListInputPath through an ImprovedAdjacencyListInputFormat.
// NOTE(review): the operator name "AdjancencyListInput" looks like a misspelling of "Adjacency";
// it is a runtime string, so it is left untouched here.
FileDataSource adjacencyListInput = new FileDataSource(new ImprovedAdjacencyListInputFormat(),
  adjacencyListInputPath, "AdjancencyListInput");

  /**
   * Assembles the "TeraSort" plan: one file data source wired directly into one file data sink
   * on which a global ascending order over field 0 is requested.
   *
   * @param args optional positional arguments: [0] number of subtasks (defaults to 1),
   *             [1] input path (defaults to ""), [2] output path (defaults to "")
   * @return the plan named "TeraSort", rooted at the data sink
   * @throws IllegalArgumentException declared by the signature; a non-numeric first argument
   *             makes {@code Integer.parseInt} throw its subclass NumberFormatException
   */
  @Override
  public Plan getPlan(String... args) throws IllegalArgumentException {
    // positional job parameters, each with a fallback when the argument is absent
    final int parallelism;
    if (args.length > 0) {
      parallelism = Integer.parseInt(args[0]);
    } else {
      parallelism = 1;
    }
    final String inputPath = args.length > 1 ? args[1] : "";
    final String outputPath = args.length > 2 ? args[2] : "";

    // reading side
    final FileDataSource teraSource = new FileDataSource(new TeraInputFormat(), inputPath, "Data Source");
    teraSource.setDegreeOfParallelism(parallelism);

    // writing side, with the requested global order
    final FileDataSink teraSink = new FileDataSink(new TeraOutputFormat(), outputPath, "Data Sink");
    teraSink.setDegreeOfParallelism(parallelism);
    final Ordering ascendingOnKey = new Ordering(0, TeraKey.class, Order.ASCENDING);
    final TeraDistribution keyDistribution = new TeraDistribution();
    teraSink.setGlobalOrder(ascendingOnKey, keyDistribution);

    // connect the source straight to the sink
    teraSink.setInput(teraSource);

    return new Plan(teraSink, "TeraSort");
  }
}

/**
 * Builds the "WordCount Example" plan: a text file source feeds a map operator
 * ("Tokenize Lines"), which feeds a reduce operator ("Count Words") keyed on field 0,
 * whose result is written by a CSV file sink ("Word Counts").
 *
 * @param numSubTasks default parallelism set on the returned plan
 * @param input path handed to the file data source
 * @param output path handed to the file data sink
 * @return the assembled plan
 */
static Plan getTestPlanPlan(int numSubTasks, String input, String output) {
  // text source; the charset parameter is set to ASCII
  FileDataSource lineSource = new FileDataSource(new TextInputFormat(), input, "Input Lines");
  lineSource.setParameter(TextInputFormat.CHARSET_NAME, "ASCII");

  MapOperator tokenizer = MapOperator.builder(new TokenizeLine()).input(lineSource).name("Tokenize Lines").build();

  ReduceOperator counter =
      ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(tokenizer).name("Count Words").build();

  // the CSV format constructor is what triggers the unchecked warning
  @SuppressWarnings("unchecked")
  FileDataSink countSink = new FileDataSink(
      new CsvOutputFormat("\n", " ", StringValue.class, IntValue.class),
      output,
      counter,
      "Word Counts");

  Plan wordCount = new Plan(countSink, "WordCount Example");
  wordCount.setDefaultParallelism(numSubTasks);
  return wordCount;
}

/**
 * Builds the "WordCount Example" plan from positional arguments: text file source,
 * map operator "Tokenize Lines", reduce operator "Count Words" keyed on field 0, and a
 * CSV file sink "Word Counts".
 *
 * @param args optional positional arguments: [0] number of subtasks (defaults to 1),
 *             [1] input path (defaults to ""), [2] output path (defaults to "")
 * @return the assembled plan with its default parallelism set to the number of subtasks
 */
@Override
public Plan getPlan(String... args) {
  // positional job parameters with fallbacks
  int parallelism = 1;
  if (args.length > 0) {
    parallelism = Integer.parseInt(args[0]);
  }
  String inputPath = "";
  if (args.length > 1) {
    inputPath = args[1];
  }
  String outputPath = "";
  if (args.length > 2) {
    outputPath = args[2];
  }

  FileDataSource lineSource = new FileDataSource(new TextInputFormat(), inputPath, "Input Lines");

  MapOperator tokenizer = MapOperator.builder(new TokenizeLine()).input(lineSource).name("Tokenize Lines").build();

  ReduceOperator counter =
      ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(tokenizer).name("Count Words").build();

  // the CSV format constructor is what triggers the unchecked warning
  @SuppressWarnings("unchecked")
  FileDataSink countSink = new FileDataSink(
      new CsvOutputFormat("\n", " ", StringValue.class, IntValue.class),
      outputPath,
      counter,
      "Word Counts");

  Plan wordCount = new Plan(countSink, "WordCount Example");
  wordCount.setDefaultParallelism(parallelism);
  return wordCount;
}

/**
 * Creates a configuration builder that can be used to set the input format's parameters to the config in a fluent
 * fashion.
 * 
 * @param target The file data source to configure; the returned builder is constructed from this source and
 *               from the configuration returned by its {@code getParameters()}.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureRecordFormat(FileDataSource target) {
  // A new builder is created on every call.
  return new ConfigBuilder(target, target.getParameters());
}

// Three CSV-backed file sources, each constructed with ' ' as the first CsvInputFormat argument.
// The first two declare the field types (LongValue, DoubleValue) and read solutionSetInput and deltasInput.
FileDataSource initialSolutionSet = new FileDataSource(new CsvInputFormat(' ', LongValue.class, DoubleValue.class), solutionSetInput, "Initial Solution Set");
FileDataSource initialDeltaSet = new FileDataSource(new CsvInputFormat(' ', LongValue.class, DoubleValue.class), deltasInput, "Initial DeltaSet");
// The third declares three LongValue fields and reads dependencySetInput.
FileDataSource dependencySet = new FileDataSource(new CsvInputFormat(' ', LongValue.class, LongValue.class, LongValue.class), dependencySetInput, "Dependency Set");

/**
 * Builds the plan for the union test: two file sources are both added as inputs of a single
 * map operator, and the map operator's output goes to one file sink with parallelism 1.
 * Paths and parallelism are read from the test configuration under the "UnionTest#" keys.
 *
 * @return the plan rooted at the result sink
 */
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected Plan getTestJob() {
  // a configured value of "empty" selects the empty input file for that side
  final String firstPath;
  if (config.getString("UnionTest#Input1Path", "").equals("empty")) {
    firstPath = emptyInPath;
  } else {
    firstPath = inPath;
  }
  final String secondPath;
  if (config.getString("UnionTest#Input2Path", "").equals("empty")) {
    secondPath = emptyInPath;
  } else {
    secondPath = inPath;
  }

  // first source
  FileDataSource firstSource = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
  DelimitedInputFormat.configureDelimitedFormat(firstSource).recordDelimiter('\n');
  firstSource.setDegreeOfParallelism(config.getInteger("UnionTest#NoSubtasks", 1));

  // second source
  FileDataSource secondSource = new FileDataSource(new ContractITCaseInputFormat(), secondPath);
  DelimitedInputFormat.configureDelimitedFormat(secondSource).recordDelimiter('\n');
  secondSource.setDegreeOfParallelism(config.getInteger("UnionTest#NoSubtasks", 1));

  // mapper that receives both sources
  MapOperator unionMapper = MapOperator.builder(new TestMapper()).build();
  unionMapper.setDegreeOfParallelism(config.getInteger("UnionTest#NoSubtasks", 1));

  // single-task sink
  FileDataSink resultSink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
  resultSink.setDegreeOfParallelism(1);

  // wiring: sink <- mapper <- (first source, second source)
  resultSink.setInput(unionMapper);
  unionMapper.addInput(firstSource);
  unionMapper.addInput(secondSource);

  return new Plan(resultSink);
}

/**
 * Builds the "WordCount Example" plan and configures its CSV sink through
 * {@code CsvOutputFormat.configureRecordFormat}: record delimiter '\n', field delimiter ' ',
 * a StringValue at position 0 and an IntValue at position 1.
 *
 * @param args optional positional arguments: [0] number of subtasks (defaults to 1),
 *             [1] input path (defaults to ""), [2] output path (defaults to "")
 * @return the assembled plan with its default parallelism set to the number of subtasks
 */
@Override
public Plan getPlan(String... args) {
  // positional job parameters with fallbacks
  int parallelism = 1;
  if (args.length > 0) {
    parallelism = Integer.parseInt(args[0]);
  }
  String inputPath = "";
  if (args.length > 1) {
    inputPath = args[1];
  }
  String outputPath = "";
  if (args.length > 2) {
    outputPath = args[2];
  }

  FileDataSource lineSource = new FileDataSource(new TextInputFormat(), inputPath, "Input Lines");

  MapOperator tokenizer = MapOperator.builder(new TokenizeLine())
      .input(lineSource)
      .name("Tokenize Lines")
      .build();

  ReduceOperator counter = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
      .input(tokenizer)
      .name("Count Words")
      .build();

  FileDataSink countSink = new FileDataSink(new CsvOutputFormat(), outputPath, counter, "Word Counts");

  // the sink's record layout is set after construction via the fluent builder
  CsvOutputFormat.configureRecordFormat(countSink)
      .recordDelimiter('\n')
      .fieldDelimiter(' ')
      .field(StringValue.class, 0)
      .field(IntValue.class, 1);

  Plan wordCount = new Plan(countSink, "WordCount Example");
  wordCount.setDefaultParallelism(parallelism);
  return wordCount;
}

// File source named "PageWithRank Input": reads pageWithRankInputPath through a
// DanglingPageRankInputFormat.
FileDataSource pageWithRankInput = new FileDataSource(new DanglingPageRankInputFormat(),
  pageWithRankInputPath, "PageWithRank Input");
// numVertices is stored in the source's parameters under the NUM_VERTICES_CONFIG_PARAM key.
pageWithRankInput.getParameters().setLong(NUM_VERTICES_CONFIG_PARAM, numVertices);
// Second file source: reads adjacencyListInputPath through an ImprovedAdjacencyListInputFormat.
// NOTE(review): the operator name "AdjancencyListInput" looks like a misspelling of "Adjacency";
// it is a runtime string, so it is left untouched here.
FileDataSource adjacencyListInput = new FileDataSource(new ImprovedAdjacencyListInputFormat(),
  adjacencyListInputPath, "AdjancencyListInput");

// File source named "Input Edges": reads edgeInput through an EdgeInputFormat.
FileDataSource edges = new FileDataSource(new EdgeInputFormat(), edgeInput, "Input Edges");
// The value of `delimiter` is passed to the source under the format's ID_DELIMITER_CHAR key.
edges.setParameter(EdgeInputFormat.ID_DELIMITER_CHAR, delimiter);

/**
 * Builds a plan that connects a CSV file source directly to a CSV file sink and requests a
 * global ascending order on field 0 of the sink, using a uniform integer distribution that
 * spans Integer.MIN_VALUE to Integer.MAX_VALUE.
 *
 * @param args optional positional arguments: [0] number of subtasks (defaults to 1),
 *             [1] records path (defaults to ""), [2] output path (defaults to "")
 * @return the plan rooted at the sink
 * @throws IllegalArgumentException declared by the signature; a non-numeric first argument
 *             makes {@code Integer.parseInt} throw its subclass NumberFormatException
 */
@Override
public Plan getPlan(String... args) throws IllegalArgumentException {
  // positional program parameters with fallbacks
  int parallelism = 1;
  if (args.length > 0) {
    parallelism = Integer.parseInt(args[0]);
  }
  String inputPath = "";
  if (args.length > 1) {
    inputPath = args[1];
  }
  String outputPath = "";
  if (args.length > 2) {
    outputPath = args[2];
  }

  // source: '\n'-delimited records, '|'-delimited fields, one IntValue at position 0
  FileDataSource recordSource = new FileDataSource(CsvInputFormat.class, inputPath);
  recordSource.setDegreeOfParallelism(parallelism);
  CsvInputFormat.configureRecordFormat(recordSource).recordDelimiter('\n').fieldDelimiter('|').field(IntValue.class, 0);

  // sink: same delimiters, lenient flag switched on
  FileDataSink sortedSink = new FileDataSink(CsvOutputFormat.class, outputPath);
  sortedSink.setDegreeOfParallelism(parallelism);
  CsvOutputFormat.configureRecordFormat(sortedSink).recordDelimiter('\n').fieldDelimiter('|').lenient(true).field(IntValue.class, 0);

  // global order request on the sink
  Ordering ascendingOnFirstField = new Ordering(0, IntValue.class, Order.ASCENDING);
  UniformIntegerDistribution fullIntRange = new UniformIntegerDistribution(Integer.MIN_VALUE, Integer.MAX_VALUE);
  sortedSink.setGlobalOrder(ascendingOnFirstField, fullIntRange);

  sortedSink.setInput(recordSource);

  return new Plan(sortedSink);
}

/**
 * Builds a plan in which two file sources each pass through their own map operator and both
 * map operators are added as inputs of one file sink, then compiles that plan (default
 * parallelism 4) with the PactCompiler and turns the result into a job graph.
 *
 * @return the job graph produced by the NepheleJobGraphGenerator
 * @throws Exception declared by the signature; nothing in this method throws it explicitly
 */
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected JobGraph getJobGraph() throws Exception {
  // a side reads the text input only when its config flag is true, otherwise the empty input
  final String firstPath;
  if (config.getBoolean("input1PathHasData", false)) {
    firstPath = textInput;
  } else {
    firstPath = emptyInput;
  }
  final String secondPath;
  if (config.getBoolean("input2PathHasData", false)) {
    secondPath = textInput;
  } else {
    secondPath = emptyInput;
  }

  FileDataSource firstSource = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
  FileDataSource secondSource = new FileDataSource(new ContractITCaseInputFormat(), secondPath);

  MapOperator firstMapper = MapOperator.builder(new TestMapper()).build();
  MapOperator secondMapper = MapOperator.builder(new TestMapper()).build();

  FileDataSink resultSink = new FileDataSink(new ContractITCaseOutputFormat(), resultDir);

  // wiring: each source into its mapper, both mappers into the sink
  firstMapper.setInput(firstSource);
  secondMapper.setInput(secondSource);
  resultSink.addInput(firstMapper);
  resultSink.addInput(secondMapper);

  Plan unionPlan = new Plan(resultSink);
  unionPlan.setDefaultParallelism(4);

  // plan -> optimized plan -> job graph
  OptimizedPlan optimized = new PactCompiler(new DataStatistics()).compile(unionPlan);
  return new NepheleJobGraphGenerator().compileJobGraph(optimized);
}

// Third positional argument is the output path; defaults to "" when fewer than three arguments are given.
String output    = args.length > 2 ? args[2] : "";
// File source named "Input Edges with Degrees": reads edgeInput through an EdgeWithDegreesInputFormat.
FileDataSource edges = new FileDataSource(new EdgeWithDegreesInputFormat(), edgeInput, "Input Edges with Degrees");
// '|' is set under the format's VERTEX_DELIMITER_CHAR key and ',' under its DEGREE_DELIMITER_CHAR key.
edges.setParameter(EdgeWithDegreesInputFormat.VERTEX_DELIMITER_CHAR, '|');
edges.setParameter(EdgeWithDegreesInputFormat.DEGREE_DELIMITER_CHAR, ',');

/**
 * Builds the "Iteration with union test" plan: a bulk iteration limited to 2 iterations whose
 * step is a map operator that receives the iteration's partial solution twice as input.
 *
 * @param numSubTasks default parallelism set on the returned plan
 * @param input path handed to the file data source
 * @param output path handed to the file data sink
 * @return the assembled plan
 */
private static Plan getPlan(int numSubTasks, String input, String output) {
  // the source itself runs with parallelism 1
  FileDataSource pointSource = new FileDataSource(new PointInFormat(), input, "Input");
  pointSource.setDegreeOfParallelism(1);

  BulkIteration loop = new BulkIteration("Loop");
  loop.setInput(pointSource);
  loop.setMaximumNumberOfIterations(2);

  // the partial solution is passed as both inputs of the mapper
  @SuppressWarnings("unchecked")
  MapOperator unionMapper = MapOperator.builder(new IdentityMapper())
      .input(loop.getPartialSolution(), loop.getPartialSolution())
      .name("map")
      .build();

  loop.setNextPartialSolution(unionMapper);

  FileDataSink pointSink = new FileDataSink(new PointOutFormat(), output, loop, "Output");

  Plan iterationPlan = new Plan(pointSink, "Iteration with union test");
  iterationPlan.setDefaultParallelism(numSubTasks);
  return iterationPlan;
}

/**
 * Builds the "Iteration with AllReducer (keyless Reducer)" plan: a bulk iteration capped at
 * NUM_ITERATIONS whose step is a reduce operator built without key fields, with the
 * iteration result written by a CSV file sink.
 *
 * @param numSubTasks default parallelism set on the returned plan
 * @param input path handed to the file data source
 * @param output path handed to the file data sink
 * @return the assembled plan
 */
static Plan getTestPlanPlan(int numSubTasks, String input, String output) {
  FileDataSource textSource = new FileDataSource(TextInputFormat.class, input, "input");

  BulkIteration loop = new BulkIteration("Loop");
  loop.setInput(textSource);
  loop.setMaximumNumberOfIterations(NUM_ITERATIONS);

  // reducer over the partial solution; no key class or key position is given to the builder
  ReduceOperator summing =
      ReduceOperator.builder(new SumReducer()).input(loop.getPartialSolution()).name("Compute sum (Reduce)").build();

  loop.setNextPartialSolution(summing);

  // the CSV format constructor is what triggers the unchecked warning
  @SuppressWarnings("unchecked")
  FileDataSink resultSink = new FileDataSink(
      new CsvOutputFormat("\n", " ", StringValue.class),
      output,
      loop,
      "Output");

  Plan iterationPlan = new Plan(resultSink, "Iteration with AllReducer (keyless Reducer)");
  iterationPlan.setDefaultParallelism(numSubTasks);
  return iterationPlan;
}

// File source named "Input Edges": reads edgeInput through an EdgeInputFormat.
FileDataSource edges = new FileDataSource(new EdgeInputFormat(), edgeInput, "Input Edges");
// The value of `delimiter` is passed to the source under the format's ID_DELIMITER_CHAR key.
edges.setParameter(EdgeInputFormat.ID_DELIMITER_CHAR, delimiter);

/**
 * Builds the plan for the map test: one file source feeds one map operator, whose output
 * goes to a file sink with parallelism 1. The source and mapper parallelism is read from the
 * test configuration key "MapTest#NoSubtasks" (default 1).
 *
 * @return the plan rooted at the result sink
 */
@Override
protected Plan getTestJob() {
  // source with '\n' as record delimiter
  FileDataSource lineSource = new FileDataSource(new ContractITCaseInputFormat(), inPath);
  DelimitedInputFormat.configureDelimitedFormat(lineSource).recordDelimiter('\n');
  lineSource.setDegreeOfParallelism(config.getInteger("MapTest#NoSubtasks", 1));

  MapOperator mapperUnderTest = MapOperator.builder(new TestMapper()).build();
  mapperUnderTest.setDegreeOfParallelism(config.getInteger("MapTest#NoSubtasks", 1));

  // single-task sink
  FileDataSink resultSink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
  resultSink.setDegreeOfParallelism(1);

  // wiring: sink <- mapper <- source
  resultSink.setInput(mapperUnderTest);
  mapperUnderTest.setInput(lineSource);

  return new Plan(resultSink);
}

/**
 * Builds the "SecondarySort Example" plan: a CSV source of two IntValue fields feeds a reduce
 * operator keyed on field 0 with an ascending group order on field 1, and the result is
 * written by a CSV sink that emits both IntValue fields.
 *
 * @param args optional positional arguments: [0] number of subtasks (defaults to 1),
 *             [1] input path (defaults to ""), [2] output path (defaults to "")
 * @return the assembled plan with its default parallelism set to the number of subtasks
 */
@Override
public Plan getPlan(String... args) {
  // positional job parameters with fallbacks
  int parallelism = 1;
  if (args.length > 0) {
    parallelism = Integer.parseInt(args[0]);
  }
  String inputPath = "";
  if (args.length > 1) {
    inputPath = args[1];
  }
  String outputPath = "";
  if (args.length > 2) {
    outputPath = args[2];
  }

  // the CSV format constructor is what triggers the unchecked warning
  @SuppressWarnings("unchecked")
  FileDataSource pairSource =
      new FileDataSource(new CsvInputFormat(' ', IntValue.class, IntValue.class), inputPath, "Input");

  // reducer keyed on the first field
  ReduceOperator groupSorter =
      ReduceOperator.builder(new IdentityReducer(), IntValue.class, 0).input(pairSource).name("Reducer").build();

  // within each group, order by the second field
  groupSorter.setGroupOrder(new Ordering(1, IntValue.class, Order.ASCENDING));

  // sink and its record layout
  FileDataSink sortedSink = new FileDataSink(new CsvOutputFormat(), outputPath, groupSorter, "Sorted Output");
  CsvOutputFormat.configureRecordFormat(sortedSink).recordDelimiter('\n').fieldDelimiter(' ').field(IntValue.class, 0).field(IntValue.class, 1);

  Plan secondarySort = new Plan(sortedSink, "SecondarySort Example");
  secondarySort.setDefaultParallelism(parallelism);
  return secondarySort;
}

/**
 * Builds the plan for the co-group test: a left and a right file source feed a co-group
 * operator keyed on field 0 of both sides, whose output goes to a file sink with parallelism 1.
 * Parallelism and the local/ship strategy hints are read from the test configuration under
 * the "CoGroupTest#" keys.
 *
 * @return the plan rooted at the result sink
 */
@Override
protected Plan getTestJob() {
  // left side
  FileDataSource leftSource = new FileDataSource(new CoGroupTestInFormat(), leftInPath);
  DelimitedInputFormat.configureDelimitedFormat(leftSource).recordDelimiter('\n');
  leftSource.setDegreeOfParallelism(config.getInteger("CoGroupTest#NoSubtasks", 1));

  // right side
  FileDataSource rightSource = new FileDataSource(new CoGroupTestInFormat(), rightInPath);
  DelimitedInputFormat.configureDelimitedFormat(rightSource).recordDelimiter('\n');
  rightSource.setDegreeOfParallelism(config.getInteger("CoGroupTest#NoSubtasks", 1));

  // operator under test, plus the compiler hints taken from the configuration
  CoGroupOperator coGrouper = CoGroupOperator.builder(new TestCoGrouper(), StringValue.class, 0, 0).build();
  coGrouper.setDegreeOfParallelism(config.getInteger("CoGroupTest#NoSubtasks", 1));
  coGrouper.getParameters().setString(
      PactCompiler.HINT_LOCAL_STRATEGY, config.getString("CoGroupTest#LocalStrategy", ""));
  coGrouper.getParameters().setString(
      PactCompiler.HINT_SHIP_STRATEGY, config.getString("CoGroupTest#ShipStrategy", ""));

  // single-task sink
  FileDataSink resultSink = new FileDataSink(new CoGroupOutFormat(), resultPath);
  resultSink.setDegreeOfParallelism(1);

  // wiring: sink <- co-group <- (left, right)
  resultSink.setInput(coGrouper);
  coGrouper.setFirstInput(leftSource);
  coGrouper.setSecondInput(rightSource);

  return new Plan(resultSink);
}

// File source over recordsPath using a CsvInputFormat constructed with ',' and three IntValue field types.
FileDataSource source = new FileDataSource(new CsvInputFormat(',', IntValue.class, IntValue.class, IntValue.class), recordsPath);

Javadoc

Operator for input nodes which read data from files. (For Record data model)

Most used methods

Popular in Java

Creating JSON documents from java classes using gson
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
scheduleAtFixedRate (Timer)
compareTo (BigDecimal)
FileReader (java.io)
A specialized Reader that reads from a file in the file system. All read requests made by calling me
DecimalFormat (java.text)
A concrete subclass of NumberFormat that formats decimal numbers. It has a variety of features desig
MessageFormat (java.text)
Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
SAXParseException (org.xml.sax)
Encapsulate an XML parse error or warning. This module, both source code and documentation, is in t
Top Sublime Text plugins

How to use FileDataSource in eu.stratosphere.api.java.record.operators

Best Java code snippets using eu.stratosphere.api.java.record.operators.FileDataSource (Showing top 20 results out of 315)

How to use
FileDataSource
in
eu.stratosphere.api.java.record.operators