@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    while (reader.nextKeyValue()) {
        // TODO titan05 integration -- the duplicate() call may be unnecessary
        final TinkerVertex maybeNullTinkerVertex =
                deser.readHadoopVertex(reader.getCurrentKey(), reader.getCurrentValue());
        if (null != maybeNullTinkerVertex) {
            vertex = new VertexWritable(maybeNullTinkerVertex);
            //vertexQuery.filterRelationsOf(vertex); // TODO reimplement vertexquery filtering
            return true;
        }
    }
    return false;
}
@Override
public void open(HadoopInputSplit split) throws IOException {
    // enforce sequential open() calls
    synchronized (OPEN_MUTEX) {
        TaskAttemptContext context = new TaskAttemptContextImpl(configuration, new TaskAttemptID());
        try {
            this.recordReader = this.mapreduceInputFormat
                    .createRecordReader(split.getHadoopInputSplit(), context);
            this.recordReader.initialize(split.getHadoopInputSplit(), context);
        } catch (InterruptedException e) {
            throw new IOException("Could not create RecordReader.", e);
        } finally {
            this.fetched = false;
        }
    }
}
@Override
public boolean hasNext() {
    try {
        boolean retVal = curRecReader.nextKeyValue();
        if (retVal) {
            return true;
        }
        // if it returned false, we need to close the recordReader
        curRecReader.close();
        return false;
    } catch (IOException | InterruptedException e) {
        throw new RuntimeException(e);
    }
}
@Override
public Tuple2<K, V> nextRecord(Tuple2<K, V> record) throws IOException {
    if (!this.fetched) {
        fetchNext();
    }
    if (!this.hasNext) {
        return null;
    }
    try {
        record.f0 = recordReader.getCurrentKey();
        record.f1 = recordReader.getCurrentValue();
    } catch (InterruptedException e) {
        throw new IOException("Could not get KeyValue pair.", e);
    }
    this.fetched = false;
    return record;
}
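nextRecord() above relies on a fetchNext() helper and the hasNext/fetched flags that are not shown in this result; a minimal sketch of what such a helper could look like, inferred from the fields used above (an assumption, not the original source):

// Assumed helper: pre-fetches the next key/value pair so nextRecord() can
// distinguish "nothing fetched yet" from "no more records".
protected void fetchNext() throws IOException {
    try {
        this.hasNext = this.recordReader.nextKeyValue();
    } catch (InterruptedException e) {
        throw new IOException("Could not fetch next KeyValue pair.", e);
    } finally {
        this.fetched = true;
    }
}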
@Test
public void readBitcoinRawBlockInputFormatBlockVersion1() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version1.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    assertEquals(1, splits.size(), "Only one split generated for block version 1");
    RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned null RecordReader");
    reader.initialize(splits.get(0), context);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
    block = reader.getCurrentValue();
    assertEquals(482, block.getLength(), "Random block version 1 must have size of 482 bytes");
    assertFalse(reader.nextKeyValue(), "No further blocks in block version 1");
    reader.close();
}
expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
expect(inputSplit.getStart()).andReturn(0L).anyTimes();
expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();
expect(context.getConfiguration()).andReturn(conf).anyTimes();

recordReader.initialize(inputSplit, context);
assertEquals(0.0f, recordReader.getProgress(), 0.0f);

assertTrue("Expected at least one record", recordReader.nextKeyValue());
key = recordReader.getCurrentKey();
value = recordReader.getCurrentValue();
assertTrue("getCurrentKey() returned different keys for the same record",
    key == recordReader.getCurrentKey());
assertTrue("getCurrentValue() returned different values for the same record",
    value == recordReader.getCurrentValue());

assertTrue("Expected to read a second record", recordReader.nextKeyValue());
key = recordReader.getCurrentKey();
value = recordReader.getCurrentValue();

assertEquals(1.0f, recordReader.getProgress(), 0.0f);
assertFalse("Expected only 2 records", recordReader.nextKeyValue());

recordReader.close();
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount,
        int splitNumber, long maxSplitSize, CompressionCodec codec)
        throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    // ... write writeCount rows into testFile with the given codec and interval (not shown) ...

    Configuration jonconf = new Configuration(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jonconf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splitNumber, splits.size());

    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = new TaskAttemptContextImpl(jonconf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr =
            inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should equal writeCount", writeCount, readCount);
}
private static List<Text> readSplit(KeyValueTextInputFormat format, InputSplit split, Job job)
        throws IOException, InterruptedException {
    List<Text> result = new ArrayList<Text>();
    Configuration conf = job.getConfiguration();
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
    RecordReader<Text, Text> reader = format.createRecordReader(split,
        MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
    MapContext<Text, Text, Text, Text> mcontext =
        new MapContextImpl<Text, Text, Text, Text>(conf, context.getTaskAttemptID(),
            reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
        result.add(new Text(reader.getCurrentValue()));
    }
    return result;
}
@Test
public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2013encrypt.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // for decryption simply set the password
    conf.set("hadoopoffice.read.security.crypt.password", "test2");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, file);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
    InterruptedException ex = assertThrows(InterruptedException.class,
        () -> reader.initialize(splits.get(0), context),
        "Exception is thrown in case of wrong password");
}
private void runCDXTest(Configuration conf, String expected) throws Exception {
    File testFile = new File("src/test/resources/rr-test-inputs.txt");
    Path path = new Path(testFile.getAbsoluteFile().toURI().toString());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    ArchiveToCDXFileInputFormat inputFormat = ReflectionUtils
            .newInstance(ArchiveToCDXFileInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    RecordReader<Text, Text> reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);

    int position = 0;
    String value = "";
    while (reader.nextKeyValue()) {
        position += 1;
        if (position == 3)
            value = reader.getCurrentValue().toString();
    }
    // Check the third value is as expected
    log.debug(value);
    Assert.assertEquals(expected, value);
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");

    // Find RDF language
    FileSplit split = (FileSplit) genericSplit;
    Path path = split.getPath();
    Lang lang = RDFLanguages.filenameToLang(path.getName());
    if (lang == null)
        throw new IOException("There is no registered RDF language for the input file " + path.toString());

    // Select the record reader and initialize
    this.reader = this.selectRecordReader(lang);
    this.reader.initialize(split, context);
}
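The selectRecordReader(lang) call above is not shown in this result; a minimal sketch of how such a dispatch could look, using hypothetical reader classes and an assumed value type (only the RDFLanguages/Lang API is real Jena):

// Hypothetical dispatch helper -- the reader class names and the TripleWritable
// value type are illustrative assumptions, not taken from the original source.
private RecordReader<LongWritable, TripleWritable> selectRecordReader(Lang lang) throws IOException {
    if (RDFLanguages.NTRIPLES.equals(lang)) {
        return new NTriplesRecordReader();   // hypothetical line-oriented reader
    } else if (RDFLanguages.TURTLE.equals(lang)) {
        return new TurtleRecordReader();     // hypothetical whole-file reader
    }
    throw new IOException("No record reader available for RDF language " + lang.getName());
}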
@Override
public String getCurrentValue() throws IOException, InterruptedException {
    Text text = delegate.getCurrentValue();
    return text == null ? null : text.toString();
}
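This getCurrentValue() override converts the delegate's Text into a String; a minimal sketch of the kind of delegating wrapper it could belong to, assuming a Text-valued underlying reader (the class name and key type are illustrative, not from the original source):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Hypothetical wrapper: exposes a RecordReader<LongWritable, Text> as a
// RecordReader<LongWritable, String> by delegating every call.
public class StringValueRecordReader extends RecordReader<LongWritable, String> {

    private final RecordReader<LongWritable, Text> delegate;

    public StringValueRecordReader(RecordReader<LongWritable, Text> delegate) {
        this.delegate = delegate;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        delegate.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return delegate.nextKeyValue();
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return delegate.getCurrentKey();
    }

    @Override
    public String getCurrentValue() throws IOException, InterruptedException {
        Text text = delegate.getCurrentValue();
        return text == null ? null : text.toString();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return delegate.getProgress();
    }

    @Override
    public void close() throws IOException {
        delegate.close();
    }
}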
@Test
public void testRecordReader() throws Exception {
    List<String> paths = Lists.newArrayList("/path1", "/path2");
    GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);

    GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
    RecordReader<LongWritable, Text> recordReader =
        inputFormat.createRecordReader(split, new TaskAttemptContextImpl(new Configuration(),
            new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));

    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
    Assert.assertFalse(recordReader.nextKeyValue());
}
job.getConfiguration().getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY,
    SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT);
when(taskAttemptContext.getConfiguration()).thenReturn(job.getConfiguration());

RecordReader<ImmutableBytesWritable, Result> rr =
    tsif.createRecordReader(split, taskAttemptContext);
rr.initialize(split, taskAttemptContext);

while (rr.nextKeyValue()) {
    byte[] row = rr.getCurrentKey().get();
    verifyRowFromMap(rr.getCurrentKey(), rr.getCurrentValue());
    rowTracker.addRow(row);
}

rr.close();
/**
 * Simple case for record readers.
 * @throws Exception if failed
 */
@Test
public void reader_simple() throws Exception {
    Configuration conf = new ConfigurationProvider().newInstance();
    FileStatus stat = write(conf, 1);
    try (RecordReader<NullWritable, Text> reader = TemporaryInputFormat.createRecordReader()) {
        reader.initialize(
            new FileSplit(stat.getPath(), 0, stat.getLen(), null),
            new TaskAttemptContextImpl(conf, new TaskAttemptID()));
        assertThat(reader.nextKeyValue(), is(true));
        assertThat(reader.getCurrentValue(), is(new Text("Hello, world!")));
        assertThat(reader.nextKeyValue(), is(false));
        assertThat((double) reader.getProgress(), closeTo(1.0, 0.01));
    }
}
public void run() {
    long records = 0;
    try {
        TaskAttemptContext context = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
        RecordReader<Text, Text> reader =
            inFormat.createRecordReader(splits.get(sampleStep * idx), context);
        reader.initialize(splits.get(sampleStep * idx), context);
        while (reader.nextKeyValue()) {
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
                break;
            }
        }
    } catch (IOException ie) {
        System.err.println("Got an exception while reading splits " +
            StringUtils.stringifyException(ie));
        throw new RuntimeException(ie);
    } catch (InterruptedException e) {
    }
}
/**
 * Verifies that a non-null record reader can be created, and the key/value types are
 * as expected.
 */
@Test
public void testCreateRecordReader() throws IOException, InterruptedException {
    // Set up the job configuration.
    Job job = new Job();
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.STRING));
    Configuration conf = job.getConfiguration();

    FileSplit inputSplit = createMock(FileSplit.class);
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    replay(inputSplit);
    replay(context);

    AvroKeyInputFormat inputFormat = new AvroKeyInputFormat();
    @SuppressWarnings("unchecked")
    RecordReader<AvroKey<Object>, NullWritable> recordReader =
        inputFormat.createRecordReader(inputSplit, context);
    assertNotNull(recordReader);
    recordReader.close();

    verify(inputSplit);
    verify(context);
}
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
    long counter = 0;
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
    Random r = new Random();

    for (int i = 0; i < splitsToSample; i++) {
        TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
        reader.initialize(splits.get(i), samplingContext);
        while (reader.nextKeyValue()) {
            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    LOG.info(String.format("Fill: Collected %d samples from %d splits", counter, i));
                    counter++;
                    samples.add(ReflectionUtils.copy(job.getConfiguration(),
                        reader.getCurrentKey(), null));
                } else {
                    // replace a randomly chosen existing sample
                    int ind = r.nextInt(numSamples);
                    samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                        reader.getCurrentKey(), null));
                    counter++;
                    if (counter % 1000 == 0) {
                        LOG.info(String.format("Replace Random: Collected %d samples from %d splits", counter, i));
                    }
                }
            }
        }
        reader.close();
    }
    return samples.toArray();
}
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    reader.initialize(inputSplit, taskAttemptContext);
    final Configuration conf = taskAttemptContext.getConfiguration();
    if (conf.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null) {
        graphFilter = VertexProgramHelper.deserialize(
            ConfUtil.makeApacheConfiguration(conf), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
}
Job job = new Job(jobConf);
TaskAttemptContext tac = ShimLoader.getHadoopShims()
    .newTaskAttemptContext(job.getConfiguration(), reporter);
recordReader = createRecordReader(tableSplit, tac);
try {
    recordReader.initialize(tableSplit, tac);
} catch (InterruptedException e) {
    throw new IOException("Failed to initialize RecordReader", e);
}