protected boolean isPigLoaded() {
  try {
    Class.forName("org.apache.pig.impl.logicalLayer.schema.Schema");
    return true;
  } catch (ClassNotFoundException e) {
    LOG.info("Pig is not loaded; Pig metadata will not be written");
    return false;
  }
}
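// A hypothetical call site for the capability check above (writePigMetadata
// and extraMetaData are illustrative names, not from the original code):
// Pig-specific key/value metadata is only attached when Pig is on the classpath.
if (isPigLoaded()) {
  writePigMetadata(extraMetaData);
}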
private void log(Object o) {
  LOG.info(o);
}
private static <T> List<T> runAllInParallel(int parallelism, List<Callable<T>> toRun) throws ExecutionException {
  LOG.info("Initiating action with parallelism: " + parallelism);
  ExecutorService threadPool = Executors.newFixedThreadPool(parallelism);
  try {
    List<Future<T>> futures = new ArrayList<Future<T>>();
    for (Callable<T> callable : toRun) {
      futures.add(threadPool.submit(callable));
    }
    List<T> result = new ArrayList<T>(toRun.size());
    for (Future<T> future : futures) {
      try {
        result.add(future.get());
      } catch (InterruptedException e) {
        throw new RuntimeException("The thread was interrupted", e);
      }
    }
    return result;
  } finally {
    threadPool.shutdownNow();
  }
}
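// A minimal usage sketch for the helper above, under assumed names (paths,
// conf, readFooter are stand-ins for the caller's context): each per-file
// task becomes a Callable, and results come back in submission order.
List<Callable<ParquetMetadata>> tasks = new ArrayList<Callable<ParquetMetadata>>();
for (final Path path : paths) {
  tasks.add(new Callable<ParquetMetadata>() {
    @Override
    public ParquetMetadata call() throws Exception {
      return readFooter(conf, path); // assumed per-file work
    }
  });
}
List<ParquetMetadata> footers = runAllInParallel(5, tasks); // 5 threads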
private static List<FileStatus> getAllFileRecursively(
    List<FileStatus> files, Configuration conf) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  for (FileStatus file : files) {
    if (file.isDir()) {
      Path p = file.getPath();
      FileSystem fs = p.getFileSystem(conf);
      staticAddInputPathRecursively(result, fs, p, HiddenFileFilter.INSTANCE);
    } else {
      result.add(file);
    }
  }
  LOG.info("Total input paths to process: " + result.size());
  return result;
}
/**
 * Given a FilterPredicate, return a Filter that wraps it.
 * This method also logs the filter being used and rewrites
 * the predicate to not include the not() operator.
 */
public static Filter get(FilterPredicate filterPredicate) {
  checkNotNull(filterPredicate, "filterPredicate");
  LOG.info("Filtering using predicate: " + filterPredicate);

  // rewrite the predicate to not include the not() operator
  FilterPredicate collapsedPredicate = LogicalInverseRewriter.rewrite(filterPredicate);

  if (!filterPredicate.equals(collapsedPredicate)) {
    LOG.info("Predicate has been collapsed to: " + collapsedPredicate);
  }

  return new FilterPredicateCompat(collapsedPredicate);
}
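// A small sketch of the collapse this method performs, assuming the filter2
// FilterApi (intColumn, eq, not) from the same codebase is statically imported:
// LogicalInverseRewriter turns not(eq(x, 10)) into the equivalent notEq(x, 10),
// so downstream evaluators never have to handle a not() node.
FilterPredicate pred = not(eq(intColumn("x"), 10));
Filter filter = get(pred); // logs the original and the collapsed predicate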
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    if (Log.INFO) LOG.info("reading summary file: " + commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    if (Log.INFO) LOG.info("reading summary file: " + metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
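// Lookup order sketch for the summary files probed above (the file names shown
// are the conventional values of the two constants in parquet-mr):
//
//   basePath/_common_metadata   schema and key/value metadata only;
//                               preferred when skipRowGroups is true
//   basePath/_metadata          full footer, including row group information
//
// When neither file exists, the method returns null and the caller falls back
// to reading the individual file footers.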
protected PageHeader readPageHeader() throws IOException {
  PageHeader pageHeader;
  int initialPos = this.pos;
  try {
    pageHeader = Util.readPageHeader(this);
  } catch (IOException e) {
    // this is to work around a bug where the compressedLength of the chunk
    // is missing the size of the dictionary page header. To allow reading
    // older files (that use a dictionary) we need this: usually 13 to 19
    // bytes are missing, and if the last page is smaller than that, the
    // page header itself is truncated in the buffer.
    this.pos = initialPos; // reset the buffer to the position before the error
    LOG.info("completing the column chunk to read the page header");
    // try again from the buffer concatenated with the remainder of the stream
    pageHeader = Util.readPageHeader(new SequenceInputStream(this, f));
  }
  return pageHeader;
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    if (memSize > rowGroupSizeThreshold) {
      LOG.info(format("mem size %,d > %,d: flushing %,d records to disk.", memSize, rowGroupSizeThreshold, recordCount));
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
    } else {
      float recordSize = (float) memSize / recordCount;
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (rowGroupSizeThreshold / recordSize)) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
          );
      if (DEBUG) LOG.debug(format("checked mem at %,d; will check again at %,d", recordCount, recordCountForNextMemCheck));
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.memSize();
    if (memSize > blockSize) {
      LOG.info(format("mem size %,d > %,d: flushing %,d records to disk.", memSize, blockSize, recordCount));
      flushStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
    } else {
      float recordSize = (float) memSize / recordCount;
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (blockSize / recordSize)) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
          );
      if (DEBUG) LOG.debug(format("checked mem at %,d; will check again at %,d", recordCount, recordCountForNextMemCheck));
    }
  }
}
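// A worked example of the halfway heuristic above, with illustrative values;
// the actual MINIMUM/MAXIMUM_RECORD_COUNT_FOR_CHECK constants (100 and 10000
// here) are assumptions, not taken from this snippet.
long recordCount = 10000;                // records written so far
long memSize = 16L << 20;                // 16 MB buffered in the store
long blockSize = 128L << 20;             // 128 MB target block size
float recordSize = (float) memSize / recordCount;      // ~1,678 B per record
long projectedTotal = (long) (blockSize / recordSize); // block full at ~80,000 records
long next = Math.min(
    Math.max(100, (recordCount + projectedTotal) / 2), // halfway point: 45,000
    recordCount + 10000);                              // capped: next check at 20,000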
public CompressionCodecName getCodec() {
  CompressionCodecName codec;
  Configuration configuration = getConfiguration();
  if (isParquetCompressionSet(configuration)) { // explicit parquet config
    codec = getParquetCompressionCodec(configuration);
  } else if (isHadoopCompressionSet()) { // from hadoop config
    codec = getHadoopCompressionCodec();
  } else {
    if (INFO) LOG.info("Compression not set; defaulting to UNCOMPRESSED");
    codec = CompressionCodecName.UNCOMPRESSED;
  }
  if (INFO) LOG.info("Compression: " + codec.name());
  return codec;
}
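// A hypothetical job setup illustrating the precedence above; the config keys
// shown ("parquet.compression" and the classic Hadoop output-compression keys)
// are assumptions about the surrounding setup, not taken from this snippet.
Configuration conf = job.getConfiguration();
conf.set("parquet.compression", "SNAPPY");        // parquet config: checked first
conf.setBoolean("mapred.output.compress", true);  // hadoop config: fallback only
conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
// result: SNAPPY wins; without the parquet key, GZIP would be picked up from
// the hadoop config; with neither set, the codec defaults to UNCOMPRESSED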
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema, ReadContext readContext) {
  MessageType requestedSchema = readContext.getRequestedSchema();
  Schema requestedPigSchema = getPigSchema(configuration);

  if (requestedPigSchema == null) {
    throw new ParquetDecodingException("Missing Pig schema: ParquetLoader sets the schema in the job conf");
  }
  boolean elephantBirdCompatible = configuration.getBoolean(PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE, false);
  boolean columnIndexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
  if (elephantBirdCompatible) {
    LOG.info("Numbers will default to 0 instead of NULL; Boolean will be converted to Int");
  }
  return new TupleRecordMaterializer(requestedSchema, requestedPigSchema, elephantBirdCompatible, columnIndexAccess);
}
private CompressionCodecName getHadoopCompressionCodec() {
  CompressionCodecName codec;
  try {
    // find the right codec
    Class<?> codecClass = getHadoopOutputCompressorClass(CompressionCodecName.UNCOMPRESSED.getHadoopCompressionCodecClass());
    if (INFO) LOG.info("Compression set through hadoop codec: " + codecClass.getName());
    codec = CompressionCodecName.fromCompressionCodec(codecClass);
  } catch (CompressionCodecNotSupportedException e) {
    if (WARN) LOG.warn("codec defined in hadoop config is not supported by parquet [" + e.getCodecClass().getName() + "]; UNCOMPRESSED will be used instead", e);
    codec = CompressionCodecName.UNCOMPRESSED;
  } catch (IllegalArgumentException e) {
    if (WARN) LOG.warn("codec class not found: " + e.getMessage(), e);
    codec = CompressionCodecName.UNCOMPRESSED;
  }
  return codec;
}
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage);
    encodings.add(dictionaryPage.getEncoding());
  }
  writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
  writer.endColumn();
  if (INFO) {
    LOG.info(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
        + (dictionaryPage != null ? String.format(
            ", dic { %,d entries, %,dB raw, %,dB comp}",
            dictionaryPage.getDictionarySize(),
            dictionaryPage.getUncompressedSize(),
            dictionaryPage.getBytes().size()) // compressed size of the buffered dictionary bytes
        : ""));
  }
  encodings.clear();
  pageCount = 0;
}
public BytesInput readAsBytesInput(int size) throws IOException {
  if (pos + size > count) {
    // this is to work around a bug where the compressedLength of the chunk
    // is missing the size of the dictionary page header. To allow reading
    // older files (that use a dictionary) we need this: usually 13 to 19
    // bytes are missing.
    int l1 = count - pos; // what is left in the buffer
    int l2 = size - l1;   // what must be read from the underlying stream
    LOG.info("completed the column chunk with " + l2 + " bytes");
    return BytesInput.concat(super.readAsBytesInput(l1), BytesInput.copy(BytesInput.from(f, l2)));
  }
  return super.readAsBytesInput(size);
}
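// Layout of the workaround above: the buffered chunk ends l1 bytes short of
// the requested size, so the remaining l2 bytes are read straight from the
// underlying stream f and the two pieces are concatenated:
//
//   |<------------- size ------------->|
//   |<--- l1 (buffer) --->|<--- l2 --->|   l2 comes from f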
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage);
    encodings.add(dictionaryPage.getEncoding());
  }
  writer.writeDataPages(BytesInput.from(buf), uncompressedLength, compressedLength, totalStatistics, new ArrayList<Encoding>(encodings));
  writer.endColumn();
  if (INFO) {
    LOG.info(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount, encodings)
        + (dictionaryPage != null ? String.format(
            ", dic { %,d entries, %,dB raw, %,dB comp}",
            dictionaryPage.getDictionarySize(),
            dictionaryPage.getUncompressedSize(),
            dictionaryPage.getBytes().size()) // compressed size of the buffered dictionary bytes
        : ""));
  }
  encodings.clear();
  pageCount = 0;
}
public void initialize(MessageType requestedSchema, MessageType fileSchema,
    Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  this.requestedSchema = requestedSchema;
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = this.requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, extraMetadata, fileSchema,
      new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));

  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
private void flushRowGroupToStore() throws IOException {
  LOG.info(format("Flushing mem columnStore to file. allocated memory: %,d", columnStore.getAllocatedSize()));
  if (columnStore.getAllocatedSize() > 3 * (long) rowGroupSizeThreshold) {
    LOG.warn("Too much memory used: " + columnStore.memUsageString());
  }
  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
  }
  columnStore = null;
  pageStore = null;
}
private void flushStore() throws IOException {
  LOG.info(format("Flushing mem store to file. allocated memory: %,d", store.allocatedSize()));
  if (store.allocatedSize() > 3 * (long) blockSize) {
    LOG.warn("Too much memory used: " + store.memUsageString());
  }
  w.startBlock(recordCount);
  store.flush();
  pageStore.flushToFileWriter(w);
  recordCount = 0;
  w.endBlock();
  store = null;
  pageStore = null;
}
public void initialize(MessageType fileSchema, Map<String, String> fileMetadata,
    Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = fileSchema;
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
/**
 * end a column (once all rep, def and data have been written)
 * @throws IOException
 */
public void endColumn() throws IOException {
  state = state.endColumn();
  if (DEBUG) LOG.debug(out.getPos() + ": end column");
  currentBlock.addColumn(ColumnChunkMetaData.get(
      currentChunkPath,
      currentChunkType,
      currentChunkCodec,
      currentEncodings,
      currentStatistics,
      currentChunkFirstDataPage,
      currentChunkDictionaryPageOffset,
      currentChunkValueCount,
      compressedLength,
      uncompressedLength));
  if (DEBUG) LOG.debug("ended column chunk: " + currentColumn);
  currentColumn = null;
  this.currentBlock.setTotalByteSize(currentBlock.getTotalByteSize() + uncompressedLength);
  this.uncompressedLength = 0;
  this.compressedLength = 0;
}