@Override public Plan getPlan(String... args) throws IllegalArgumentException { // parse job parameters final int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); final String input = (args.length > 1 ? args[1] : ""); final String output = (args.length > 2 ? args[2] : ""); // This task will read the input data and generate the key/value pairs final FileDataSource source = new FileDataSource(new TeraInputFormat(), input, "Data Source"); source.setDegreeOfParallelism(numSubTasks); // This task writes the sorted data back to disk final FileDataSink sink = new FileDataSink(new TeraOutputFormat(), output, "Data Sink"); sink.setDegreeOfParallelism(numSubTasks); sink.setGlobalOrder(new Ordering(0, TeraKey.class, Order.ASCENDING), new TeraDistribution()); sink.setInput(source); return new Plan(sink, "TeraSort"); } }
/**
 * Creates a configuration builder that can be used to set the target sink's
 * output format parameters on its config in a fluent fashion.
 *
 * @param target The file data sink whose parameters are to be configured.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureRecordFormat(FileDataSink target) { return new ConfigBuilder(target.getParameters()); }
@Override
protected Plan getTestJob() {
	final int parallelism = config.getInteger("MapTest#NoSubtasks", 1);

	// Source: newline-delimited test records.
	FileDataSource source = new FileDataSource(new ContractITCaseInputFormat(), inPath);
	DelimitedInputFormat.configureDelimitedFormat(source).recordDelimiter('\n');
	source.setDegreeOfParallelism(parallelism);

	// Mapper under test, fed directly from the source.
	MapOperator mapper = MapOperator.builder(new TestMapper()).build();
	mapper.setDegreeOfParallelism(parallelism);
	mapper.setInput(source);

	// Single-writer sink so the result lands in one file.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	sink.setDegreeOfParallelism(1);
	sink.setInput(mapper);

	return new Plan(sink);
}
		// (continuation) finish building the reduce operator above
		.name("Count Words")
		.build();

// Sink: writes the word-count records as newline-delimited CSV.
FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
// Format configuration chain continues past this fragment.
CsvOutputFormat.configureRecordFormat(out)
		.recordDelimiter('\n')
@Override protected JobGraph getJobGraph() throws Exception { // init data source FileDataSource input = new FileDataSource(new ContractITCaseInputFormat(), inputPath); // init (working) map task MapOperator testMapper = MapOperator.builder(TestMapper.class).build(); // init data sink FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath); // compose working program output.setInput(testMapper); testMapper.setInput(input); // generate plan Plan plan = new Plan(output); plan.setDefaultParallelism(4); // optimize and compile plan PactCompiler pc = new PactCompiler(new DataStatistics()); OptimizedPlan op = pc.compile(plan); // return job graph of working job NepheleJobGraphGenerator jgg = new NepheleJobGraphGenerator(); return jgg.compileJobGraph(op); }
// Source: reads comma-separated triples of integers.
FileDataSource source = new FileDataSource(new CsvInputFormat(',', IntValue.class, IntValue.class, IntValue.class), recordsPath);

// Sink: emits only field 2, one value per line.
FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output);
CsvOutputFormat.configureRecordFormat(sink)
	.recordDelimiter('\n')
	.field(IntValue.class, 2);

// Request a global order over all three fields (DESC, ASC, DESC),
// range-partitioned by the matching custom distribution.
sink.setGlobalOrder(
	new Ordering(0, IntValue.class, Order.DESCENDING)
		.appendOrdering(1, IntValue.class, Order.ASCENDING)
		.appendOrdering(2, IntValue.class, Order.DESCENDING),
	new TripleIntDistribution(Order.DESCENDING, Order.ASCENDING, Order.DESCENDING));
sink.setInput(source);
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the given name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation class used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 * @param name The given name for the sink, used in plans, logs and progress messages.
 */
public FileDataSink(Class<? extends FileOutputFormat<Record>> f, String filePath, Operator<Record> input, String name) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath, name); setInput(input);
}
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected JobGraph getJobGraph() throws Exception {
	// Each input path may independently be the real data or the empty file,
	// depending on the configuration flags.
	final String firstPath = config.getBoolean("input1PathHasData", false) ? textInput : emptyInput;
	final String secondPath = config.getBoolean("input2PathHasData", false) ? textInput : emptyInput;

	FileDataSource firstSource = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
	FileDataSource secondSource = new FileDataSource(new ContractITCaseInputFormat(), secondPath);

	MapOperator firstMapper = MapOperator.builder(new TestMapper()).build();
	firstMapper.setInput(firstSource);
	MapOperator secondMapper = MapOperator.builder(new TestMapper()).build();
	secondMapper.setInput(secondSource);

	// The sink unions the outputs of both mappers.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultDir);
	sink.addInput(firstMapper);
	sink.addInput(secondMapper);

	Plan plan = new Plan(sink);
	plan.setDefaultParallelism(4);

	// Optimize the plan and translate it into a Nephele job graph.
	OptimizedPlan optimized = new PactCompiler(new DataStatistics()).compile(plan);
	return new NepheleJobGraphGenerator().compileJobGraph(optimized);
}
.name("Count Words") .build(); FileDataSink out = new FileDataSink(new CsvOutputFormat(), OUT_FILE, reduceNode, "Word Counts"); CsvOutputFormat.configureRecordFormat(out) .recordDelimiter('\n') out.setGlobalOrder(ordering, new SimpleDistribution(new StringValue[] {new StringValue("N")}));
// Sink: writes the records produced by the anti-join operator.
FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, antiJoinVisits, "Result");
result.setDegreeOfParallelism(numSubTasks);
// Format configuration chain continues past this fragment.
CsvOutputFormat.configureRecordFormat(result)
	.recordDelimiter('\n')
// (continuation) right-hand side of the 'result' sink assignment begun above
new FileDataSink(new StringTupleDataOutFormat(), this.outputPath, "Output");
result.setDegreeOfParallelism(this.degreeOfParallelism);
// The sink consumes the output of the group-by-return-flag operator.
result.setInput(groupByReturnFlag);
public Plan getPlan(int numSubTasks, String output) { List<Object> tmp = new ArrayList<Object>(); int pos = 0; for (String s : WordCountData.COUNTS.split("\n")) { List<Object> tmpInner = new ArrayList<Object>(); tmpInner.add(pos++); tmpInner.add(Integer.parseInt(s.split(" ")[1])); tmp.add(tmpInner); } // test serializable iterator input, the input record is {id, word} CollectionDataSource source = new CollectionDataSource(new SerializableIteratorTest(), "test_iterator"); // test collection input, the input record is {id, count} CollectionDataSource source2 = new CollectionDataSource(tmp, "test_collection"); JoinOperator join = JoinOperator.builder(Join.class, IntValue.class, 0, 0) .input1(source).input2(source2).build(); FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, join, "Collection Join"); CsvOutputFormat.configureRecordFormat(out) .recordDelimiter('\n') .fieldDelimiter(' ') .field(StringValue.class, 0) .field(IntValue.class, 1); Plan plan = new Plan(out, "CollectionDataSource"); plan.setDefaultParallelism(numSubTasks); return plan; }
@Override protected JobGraph getFailingJobGraph() throws Exception { // init data source FileDataSource input = new FileDataSource(new ContractITCaseInputFormat(), inputPath); // init failing map task MapOperator testMapper = MapOperator.builder(FailingMapper.class).build(); // init data sink FileDataSink output = new FileDataSink(new ContractITCaseOutputFormat(), resultPath); // compose failing program output.setInput(testMapper); testMapper.setInput(input); // generate plan Plan plan = new Plan(output); plan.setDefaultParallelism(4); // optimize and compile plan PactCompiler pc = new PactCompiler(new DataStatistics()); OptimizedPlan op = pc.compile(plan); // return job graph of failing job NepheleJobGraphGenerator jgg = new NepheleJobGraphGenerator(); return jgg.compileJobGraph(op); }
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the default name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 */
public FileDataSink(FileOutputFormat<Record> f, String filePath, Operator<Record> input) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath); setInput(input);
}
@SuppressWarnings({ "deprecation", "unchecked" })
@Override
protected Plan getTestJob() {
	final int parallelism = config.getInteger("UnionTest#NoSubtasks", 1);

	// Either input may be swapped for the empty file via configuration.
	String firstPath = config.getString("UnionTest#Input1Path", "").equals("empty") ? emptyInPath : inPath;
	String secondPath = config.getString("UnionTest#Input2Path", "").equals("empty") ? emptyInPath : inPath;

	FileDataSource first = new FileDataSource(new ContractITCaseInputFormat(), firstPath);
	DelimitedInputFormat.configureDelimitedFormat(first).recordDelimiter('\n');
	first.setDegreeOfParallelism(parallelism);

	FileDataSource second = new FileDataSource(new ContractITCaseInputFormat(), secondPath);
	DelimitedInputFormat.configureDelimitedFormat(second).recordDelimiter('\n');
	second.setDegreeOfParallelism(parallelism);

	// The mapper consumes the union of both sources.
	MapOperator mapper = MapOperator.builder(new TestMapper()).build();
	mapper.setDegreeOfParallelism(parallelism);
	mapper.addInput(first);
	mapper.addInput(second);

	// Single-writer sink so the result lands in one file.
	FileDataSink sink = new FileDataSink(new ContractITCaseOutputFormat(), resultPath);
	sink.setDegreeOfParallelism(1);
	sink.setInput(mapper);

	return new Plan(sink);
}
@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); FileDataSource source = new FileDataSource(new TextInputFormat(), dataInput, "Input Lines"); MapOperator mapper = MapOperator.builder(new TokenizeLine()) .input(source) .name("Tokenize Lines") .build(); ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0) .input(mapper) .name("Count Words") .build(); @SuppressWarnings("unchecked") FileDataSink out = new FileDataSink(new CsvOutputFormat("\n", " ", StringValue.class, IntValue.class), output, reducer, "Word Counts"); Plan plan = new Plan(out, "WordCount Example"); plan.setDefaultParallelism(numSubTasks); return plan; }
// Strategy hint: build the hash table on the operator's second input.
closeTriads.setParameter("LOCAL_STRATEGY", "LOCAL_STRATEGY_HASH_BUILD_SECOND");

// Sink: writes field 2 of each closed-triad record, one per line.
FileDataSink triangles = new FileDataSink(new CsvOutputFormat(), output, "Output");
CsvOutputFormat.configureRecordFormat(triangles)
	.recordDelimiter('\n')
	.field(StringValue.class, 2);

// Wiring: candidate triads (first input) are closed against the edge
// set (second input); the sink consumes the closed triads.
triangles.setInput(closeTriads);
closeTriads.setSecondInput(edges);
closeTriads.setFirstInput(buildTriads);
@Override public Plan getPlan(String... args) throws IllegalArgumentException { // parse program parameters int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String recordsPath = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); FileDataSource source = new FileDataSource(CsvInputFormat.class, recordsPath); source.setDegreeOfParallelism(numSubtasks); CsvInputFormat.configureRecordFormat(source) .recordDelimiter('\n') .fieldDelimiter('|') .field(IntValue.class, 0); FileDataSink sink = new FileDataSink(CsvOutputFormat.class, output); sink.setDegreeOfParallelism(numSubtasks); CsvOutputFormat.configureRecordFormat(sink) .recordDelimiter('\n') .fieldDelimiter('|') .lenient(true) .field(IntValue.class, 0); sink.setGlobalOrder(new Ordering(0, IntValue.class, Order.ASCENDING), new UniformIntegerDistribution(Integer.MIN_VALUE, Integer.MAX_VALUE)); sink.setInput(source); return new Plan(sink); }
/**
 * Creates a FileDataSink with the provided {@link FileOutputFormat} implementation and the given name,
 * writing to the file indicated by the given path. It uses the given contract as its input.
 *
 * @param f The {@link FileOutputFormat} implementation used to encode the data.
 * @param filePath The path to the file to write the contents to.
 * @param input The contract to use as the input.
 * @param name The given name for the sink, used in plans, logs and progress messages.
 */
public FileDataSink(FileOutputFormat<Record> f, String filePath, Operator<Record> input, String name) {
	// Delegate to the input-less constructor, then attach the input.
	this(f, filePath, name); setInput(input);
}
/**
 * Creates a configuration builder that can be used to set the target sink's
 * delimited format parameters on its config in a fluent fashion.
 *
 * @param target The file data sink whose parameters are to be configured.
 * @return A config builder for setting parameters.
 */
public static ConfigBuilder configureDelimitedFormat(FileDataSink target) { return new ConfigBuilder(target.getParameters()); }