OrcReader

How to use OrcReader in io.prestosql.orc

Best Java code snippets using io.prestosql.orc.OrcReader (Showing top 20 results out of 315)
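
The snippets below share a handful of static imports and constants. A plausible import block, reconstructed from how the names are used — the exact packages are assumptions based on the prestosql/presto-orc codebase of this era:

// Assumed import locations; verify against your presto-orc version.
import static io.airlift.units.DataSize.Unit.MEGABYTE;
import static io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext;
import static io.prestosql.orc.OrcEncoding.ORC;
import static io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE;
import static io.prestosql.orc.OrcReader.MAX_BATCH_SIZE;
import static java.util.Objects.requireNonNull;
import static org.joda.time.DateTimeZone.UTC;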

origin: prestosql/presto

public TempFileReader(List<Type> types, OrcDataSource dataSource)
{
  this.types = ImmutableList.copyOf(requireNonNull(types, "types is null"));
  try {
    OrcReader orcReader = new OrcReader(
        dataSource,
        ORC,
        new DataSize(1, MEGABYTE),
        new DataSize(8, MEGABYTE),
        new DataSize(8, MEGABYTE),
        new DataSize(16, MEGABYTE));
    Map<Integer, Type> includedColumns = new HashMap<>();
    for (int i = 0; i < types.size(); i++) {
      includedColumns.put(i, types.get(i));
    }
    reader = orcReader.createRecordReader(
        includedColumns,
        OrcPredicate.TRUE,
        UTC,
        newSimpleAggregatedMemoryContext(),
        INITIAL_BATCH_SIZE);
  }
  catch (IOException e) {
    throw new PrestoException(HIVE_WRITER_DATA_ERROR, "Failed to read temporary data");
  }
}
origin: prestosql/presto

OrcReader reader = new OrcReader(dataSource, ORC, readerAttributes.getMaxMergeDistance(), readerAttributes.getMaxReadSize(), readerAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE);
Map<Long, Integer> indexMap = columnIdIndex(reader.getColumnNames());
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<Integer> columnIndexes = ImmutableList.builder();
// ... (the loop that populates includedColumns and columnIndexes from indexMap is elided in this snippet)
OrcRecordReader recordReader = reader.createRecordReader(includedColumns.build(), predicate, UTC, systemMemoryUsage, INITIAL_BATCH_SIZE);
origin: prestosql/presto

private List<ColumnInfo> getColumnInfo(OrcReader reader)
{
  Optional<OrcFileMetadata> metadata = getOrcFileMetadata(reader);
  if (metadata.isPresent()) {
    return getColumnInfoFromOrcUserMetadata(metadata.get());
  }
  // support for legacy files without metadata
  return getColumnInfoFromOrcColumnTypes(reader.getColumnNames(), reader.getFooter().getTypes());
}
origin: prestosql/presto

private static ColumnStats doComputeColumnStats(OrcReader orcReader, long columnId, Type type)
    throws IOException
{
  int columnIndex = columnIndex(orcReader.getColumnNames(), columnId);
  OrcRecordReader reader = orcReader.createRecordReader(ImmutableMap.of(columnIndex, type), OrcPredicate.TRUE, UTC, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE);
  if (type.equals(BooleanType.BOOLEAN)) {
    return indexBoolean(type, reader, columnIndex, columnId);
  }
  if (type.equals(BigintType.BIGINT) ||
      type.equals(DateType.DATE) ||
      type.equals(TimestampType.TIMESTAMP)) {
    return indexLong(type, reader, columnIndex, columnId);
  }
  if (type.equals(DoubleType.DOUBLE)) {
    return indexDouble(type, reader, columnIndex, columnId);
  }
  if (type instanceof VarcharType) {
    return indexString(type, reader, columnIndex, columnId);
  }
  return null;
}
origin: io.prestosql/presto-orc

public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize, DataSize tinyStripeThreshold)
    throws IOException
{
  OrcReader orcReader = new OrcReader(orcDataSource, ORC, maxMergeDistance, maxReadSize, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE));
  // 1 for reading file footer
  assertEquals(orcDataSource.getReadCount(), 1);
  List<StripeInformation> stripes = orcReader.getFooter().getStripes();
  // Sanity check the number of stripes. This can be three or higher because of the ORC writer's low-memory mode.
  assertGreaterThanOrEqual(stripes.size(), 3);
  // verify the data source gets wrapped by CachingOrcDataSource for tiny stripes
  assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class);
  OrcRecordReader orcRecordReader = orcReader.createRecordReader(
      ImmutableMap.of(0, VARCHAR),
      (numberOfRows, statisticsByColumnIndex) -> true,
      HIVE_STORAGE_TIME_ZONE,
      newSimpleAggregatedMemoryContext(),
      INITIAL_BATCH_SIZE);
  int positionCount = 0;
  while (true) {
    int batchSize = orcRecordReader.nextBatch();
    if (batchSize <= 0) {
      break;
    }
    Block block = orcRecordReader.readBlock(VARCHAR, 0);
    positionCount += block.getPositionCount();
  }
  assertEquals(positionCount, POSITION_COUNT);
}
origin: io.prestosql/presto-orc

@Test
public void testReadUserMetadata()
    throws Exception
{
  try (TempFile tempFile = new TempFile()) {
    Map<String, String> metadata = ImmutableMap.of(
        "a", "ala",
        "b", "ma",
        "c", "kota");
    createFileWithOnlyUserMetadata(tempFile.getFile(), metadata);
    OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true);
    OrcReader orcReader = new OrcReader(orcDataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
    Footer footer = orcReader.getFooter();
    Map<String, String> readMetadata = Maps.transformValues(footer.getUserMetadata(), Slice::toStringAscii);
    assertEquals(readMetadata, metadata);
  }
}
origin: prestosql/presto

public static OrcRecordReader createReader(OrcDataSource dataSource, List<Long> columnIds, List<Type> types)
    throws IOException
{
  OrcReader orcReader = new OrcReader(dataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
  List<String> columnNames = orcReader.getColumnNames();
  assertEquals(columnNames.size(), columnIds.size());
  Map<Integer, Type> includedColumns = new HashMap<>();
  int ordinal = 0;
  for (long columnId : columnIds) {
    assertEquals(columnNames.get(ordinal), String.valueOf(columnId));
    includedColumns.put(ordinal, types.get(ordinal));
    ordinal++;
  }
  return createRecordReader(orcReader, includedColumns);
}
origin: prestosql/presto

// Excerpt from the OrcReader constructor; the signature and elided lines are marked with "..."
    throws IOException
{
  orcDataSource = wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold);
  this.orcDataSource = orcDataSource;
  requireNonNull(orcEncoding, "orcEncoding is null");
  // ...
  if (!isValidHeaderMagic(orcDataSource)) {
    throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
  }
  // ...
  checkOrcVersion(orcDataSource, postScript.getVersion());
  validateWrite(validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
  validateWrite(validation -> validation.getCompression() == compressionKind, "Unexpected compression");
  validateWrite(validation -> validation.getColumnNames().equals(getColumnNames()), "Unexpected column names");
  validateWrite(validation -> validation.getRowGroupMaxRowCount() == footer.getRowsInRowGroup(), "Unexpected rows in group");
  if (writeValidation.isPresent()) {
    writeValidation.get().validateMetadata(orcDataSource.getId(), footer.getUserMetadata());
  }
  // ...
}
origin: prestosql/presto

public static OrcRecordReader createRecordReader(OrcReader orcReader, Map<Integer, Type> includedColumns)
{
  return orcReader.createRecordReader(includedColumns, OrcPredicate.TRUE, DateTimeZone.UTC, newSimpleAggregatedMemoryContext(), MAX_BATCH_SIZE);
}
origin: prestosql/presto

private List<ColumnStats> computeShardStats(File file)
{
  try (OrcDataSource dataSource = fileOrcDataSource(defaultReaderAttributes, file)) {
    OrcReader reader = new OrcReader(dataSource, ORC, defaultReaderAttributes.getMaxMergeDistance(), defaultReaderAttributes.getMaxReadSize(), defaultReaderAttributes.getTinyStripeThreshold(), HUGE_MAX_READ_BLOCK_SIZE);
    ImmutableList.Builder<ColumnStats> list = ImmutableList.builder();
    for (ColumnInfo info : getColumnInfo(reader)) {
      computeColumnStats(reader, info.getColumnId(), info.getType()).ifPresent(list::add);
    }
    return list.build();
  }
  catch (IOException e) {
    throw new PrestoException(RAPTOR_ERROR, "Failed to read file: " + file, e);
  }
}
origin: prestosql/presto

  private static Map<String, Integer> buildPhysicalNameOrdinalMap(OrcReader reader)
  {
    ImmutableMap.Builder<String, Integer> physicalNameOrdinalMap = ImmutableMap.builder();

    int ordinal = 0;
    for (String physicalColumnName : reader.getColumnNames()) {
      physicalNameOrdinalMap.put(physicalColumnName, ordinal);
      ordinal++;
    }

    return physicalNameOrdinalMap.build();
  }
}
origin: prestosql/presto

private static Optional<OrcFileMetadata> getOrcFileMetadata(OrcReader reader)
{
  return Optional.ofNullable(reader.getFooter().getUserMetadata().get(OrcFileMetadata.KEY))
      .map(slice -> METADATA_CODEC.fromJson(slice.getBytes()));
}
origin: prestosql/presto

public static OrcRecordReader createReaderNoRows(OrcDataSource dataSource)
    throws IOException
{
  OrcReader orcReader = new OrcReader(dataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
  assertEquals(orcReader.getColumnNames().size(), 0);
  return createRecordReader(orcReader, ImmutableMap.of());
}
origin: io.prestosql/presto-orc

public OrcRecordReader createRecordReader(
    Map<Integer, Type> includedColumns,
    OrcPredicate predicate,
    long offset,
    long length,
    DateTimeZone hiveStorageTimeZone,
    AggregatedMemoryContext systemMemoryUsage,
    int initialBatchSize)
{
  return new OrcRecordReader(
      requireNonNull(includedColumns, "includedColumns is null"),
      requireNonNull(predicate, "predicate is null"),
      footer.getNumberOfRows(),
      footer.getStripes(),
      footer.getFileStats(),
      metadata.getStripeStatsList(),
      orcDataSource,
      offset,
      length,
      footer.getTypes(),
      decompressor,
      footer.getRowsInRowGroup(),
      requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"),
      hiveWriterVersion,
      metadataReader,
      maxMergeDistance,
      tinyStripeThreshold,
      maxBlockSize,
      footer.getUserMetadata(),
      systemMemoryUsage,
      writeValidation,
      initialBatchSize);
}
origin: prestosql/presto

private static List<HiveColumnHandle> getPhysicalHiveColumnHandles(List<HiveColumnHandle> columns, boolean useOrcColumnNames, OrcReader reader, Path path)
{
  if (!useOrcColumnNames) {
    return columns;
  }
  verifyFileHasColumnNames(reader.getColumnNames(), path);
  Map<String, Integer> physicalNameOrdinalMap = buildPhysicalNameOrdinalMap(reader);
  int nextMissingColumnIndex = physicalNameOrdinalMap.size();
  ImmutableList.Builder<HiveColumnHandle> physicalColumns = ImmutableList.builder();
  for (HiveColumnHandle column : columns) {
    Integer physicalOrdinal = physicalNameOrdinalMap.get(column.getName());
    if (physicalOrdinal == null) {
      // if the column is missing from the file, assign it a column number larger
      // than the number of columns in the file so the reader will fill it with nulls
      physicalOrdinal = nextMissingColumnIndex;
      nextMissingColumnIndex++;
    }
    physicalColumns.add(new HiveColumnHandle(column.getName(), column.getHiveType(), column.getTypeSignature(), physicalOrdinal, column.getColumnType(), column.getComment()));
  }
  return physicalColumns.build();
}
origin: io.prestosql/presto-orc

  static void validateFile(
      OrcWriteValidation writeValidation,
      OrcDataSource input,
      List<Type> types,
      DateTimeZone hiveStorageTimeZone,
      OrcEncoding orcEncoding)
      throws OrcCorruptionException
  {
    ImmutableMap.Builder<Integer, Type> readTypes = ImmutableMap.builder();
    for (int columnIndex = 0; columnIndex < types.size(); columnIndex++) {
      readTypes.put(columnIndex, types.get(columnIndex));
    }
    try {
      OrcReader orcReader = new OrcReader(input, orcEncoding, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(16, MEGABYTE), Optional.of(writeValidation));
      try (OrcRecordReader orcRecordReader = orcReader.createRecordReader(readTypes.build(), OrcPredicate.TRUE, hiveStorageTimeZone, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE)) {
        while (orcRecordReader.nextBatch() >= 0) {
          // ignored
        }
      }
    }
    catch (IOException e) {
      throw new OrcCorruptionException(e, input.getId(), "Validation failed");
    }
  }
}
origin: io.prestosql/presto-orc

Footer footer = new OrcReader(orcDataSource, ORC, dataSize, dataSize, dataSize, dataSize).getFooter();
io.prestosql.orc.OrcReader

Most used methods (a minimal usage sketch follows this list)

  • <init>
  • createRecordReader
  • getColumnNames
  • getFooter
  • checkOrcVersion
    Check to see if this ORC file is from a future version and, if so, warn the user that we may not be able to read all of the column encodings.
  • isValidHeaderMagic
    Does the file start with the ORC magic bytes?
  • validateFile
  • validateWrite
  • wrapWithCacheIfTiny
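
The methods above compose into a single read path: construct the reader, inspect the footer, then drain batches from a record reader. A minimal sketch of that pattern, assuming this presto-orc API version; the file path, column type, and buffer sizes are illustrative assumptions, not values from any snippet above:

// Minimal sketch; /tmp/example.orc and the VARCHAR column are hypothetical.
OrcDataSource dataSource = new FileOrcDataSource(
    new File("/tmp/example.orc"),
    new DataSize(1, MEGABYTE),
    new DataSize(1, MEGABYTE),
    new DataSize(1, MEGABYTE),
    true);
OrcReader orcReader = new OrcReader(
    dataSource,
    ORC,
    new DataSize(1, MEGABYTE),   // maxMergeDistance
    new DataSize(8, MEGABYTE),   // maxReadSize
    new DataSize(8, MEGABYTE),   // tinyStripeThreshold
    new DataSize(16, MEGABYTE)); // maxBlockSize

// The footer carries column names, stripe layout, and user metadata.
List<String> columnNames = orcReader.getColumnNames();

// Read column 0 as VARCHAR; OrcPredicate.TRUE disables stripe and row-group pruning.
OrcRecordReader recordReader = orcReader.createRecordReader(
    ImmutableMap.of(0, VARCHAR),
    OrcPredicate.TRUE,
    UTC,
    newSimpleAggregatedMemoryContext(),
    INITIAL_BATCH_SIZE);
long rows = 0;
while (recordReader.nextBatch() > 0) {
  Block block = recordReader.readBlock(VARCHAR, 0);
  rows += block.getPositionCount();
}
recordReader.close();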
