/**
 * Returns a new {@code WarcFileRecordReader} over the given split.
 *
 * @param split    the input split (a file split or multi-file split) to read
 * @param conf     job configuration passed through to the reader
 * @param reporter unused; progress is reported via the reader itself
 * @throws IOException if the reader cannot open the split's first file
 */
public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  return new WarcFileRecordReader(conf, split);
}
}
public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException { if (split instanceof FileSplit) { this.filePathList=new Path[1]; this.filePathList[0]=((FileSplit)split).getPath(); } else if (split instanceof MultiFileSplit) { this.filePathList=((MultiFileSplit)split).getPaths(); } else { throw new IOException("InputSplit is not a file split or a multi-file split - aborting"); } fs = this.filePathList[0].getFileSystem(conf); // get the total file sizes for (int i=0; i < filePathList.length; i++) { totalFileSize += fs.getFileStatus(filePathList[i]).getLen(); } Class<? extends CompressionCodec> codecClass=null; try { codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class); compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf); } catch (ClassNotFoundException cnfEx) { compressionCodec=null; LOG.info("!!! ClassNotFoun Exception thrown setting Gzip codec"); } openNextFile(); }
/**
 * Returns a new {@code WarcFileRecordReader} over the given split.
 *
 * @param split    the input split (a file split or multi-file split) to read
 * @param conf     job configuration passed through to the reader
 * @param reporter unused; progress is reported via the reader itself
 * @throws IOException if the reader cannot open the split's first file
 */
public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  return new WarcFileRecordReader(conf, split);
}
}
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream=null; if (compressionInput!=null) { whichStream=compressionInput; } else if (currentFile!=null) { whichStream=currentFile; } if (whichStream==null) { return false; } WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream); if (newRecord==null) { // try advancing the file if (openNextFile()) { newRecord=WarcRecord.readNextWarcRecord(whichStream); } if (newRecord==null) { return false; } } totalNumBytesRead += (long)newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }
/**
 * Returns a new {@code WarcFileRecordReader} over the given split.
 *
 * @param split    the input split (a file split or multi-file split) to read
 * @param conf     job configuration passed through to the reader
 * @param reporter unused; progress is reported via the reader itself
 * @throws IOException if the reader cannot open the split's first file
 */
public RecordReader getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException {
  return new WarcFileRecordReader(conf, split);
}
}
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream=null; if (compressionInput!=null) { whichStream=compressionInput; } else if (currentFile!=null) { whichStream=currentFile; } if (whichStream==null) { return false; } WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream); if (newRecord==null) { // try advancing the file if (openNextFile()) { newRecord=WarcRecord.readNextWarcRecord(whichStream); } if (newRecord==null) { return false; } } totalNumBytesRead += (long)newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream = null; if (compressionInput != null) { whichStream = compressionInput; } else if (currentFile != null) { whichStream = currentFile; } if (whichStream == null) { return false; } WarcRecord newRecord = WarcRecord.readNextWarcRecord(whichStream); if (newRecord == null) { // try advancing the file if (openNextFile()) { newRecord = WarcRecord.readNextWarcRecord(whichStream); } if (newRecord == null) { return false; } } totalNumBytesRead += (long) newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }