/**
 * Registers the Avro {@link Schema} found under a directory, together with the SerDe class and the
 * input/output formats of this manager's SerDe wrapper, on the given {@link HiveRegistrationUnit}.
 *
 * <p>
 * The schema is read with {@link #getDirectorySchema(Path)}; the time spent reading it is recorded
 * under the {@code HIVE_SPEC_SCHEMA_READING_TIMER} timer. When no schema is found ({@code null}),
 * the registration unit is left untouched.
 * </p>
 *
 * <p>
 * Attaching the schema itself is delegated to {@code addSchemaProperties}. That helper is expected to
 * choose between {@link #SCHEMA_LITERAL} and a {@link #SCHEMA_URL} pointing at {@link #SCHEMA_FILE_NAME},
 * based on {@link #USE_SCHEMA_FILE} and {@link #SCHEMA_LITERAL_LENGTH_LIMIT}; that decision is not made
 * in this method.
 * </p>
 *
 * @param path the directory whose schema should be registered; must be a directory
 * @param hiveUnit the registration unit to populate
 * @throws IOException if the file status lookup, the schema lookup or the SerDe lookup fails
 * @throws IllegalArgumentException if {@code path} is not a directory
 */
@Override
public void addSerDeProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
  boolean pathIsDirectory = this.fs.getFileStatus(path).isDirectory();
  Preconditions.checkArgument(pathIsDirectory, path + " is not a directory.");

  Schema directorySchema;
  try (Timer.Context timing = metricContext.timer(HIVE_SPEC_SCHEMA_READING_TIMER).time()) {
    directorySchema = getDirectorySchema(path);
  }

  // Only populate the unit when a schema was actually found.
  if (directorySchema != null) {
    String serDeClassName = this.serDeWrapper.getSerDe().getClass().getName();
    hiveUnit.setSerDeType(serDeClassName);

    String inputFormat = this.serDeWrapper.getInputFormatClassName();
    hiveUnit.setInputFormat(inputFormat);

    String outputFormat = this.serDeWrapper.getOutputFormatClassName();
    hiveUnit.setOutputFormat(outputFormat);

    addSchemaProperties(path, hiveUnit, directorySchema);
  }
}
/**
 * Get an instance of {@link HiveSerDeWrapper}.
 *
 * <p>
 * The SerDe type is upper-cased and looked up among the constants of
 * {@link HiveSerDeWrapper.BuiltInHiveSerDe}. On a match, the wrapper is built from that constant and
 * the two format parameters are ignored. Otherwise the SerDe type is treated as the class name of a
 * {@link SerDe}, and both format class names must be present.
 * </p>
 *
 * @param serDeType The SerDe type: either the name of a {@link HiveSerDeWrapper.BuiltInHiveSerDe}
 *        (matched regardless of case) or the class name of a {@link SerDe}.
 * @param inputFormatClassName The input format class name; required when serDeType is not built in.
 * @param outputFormatClassName The output format class name; required when serDeType is not built in.
 * @return a {@link HiveSerDeWrapper} for the requested SerDe type
 * @throws IllegalArgumentException if serDeType is not built in and either format class name is absent
 */
public static HiveSerDeWrapper get(String serDeType, Optional<String> inputFormatClassName,
    Optional<String> outputFormatClassName) {
  // Upper-case with a fixed locale. The no-argument toUpperCase() follows the JVM default locale, and
  // under e.g. a Turkish locale 'i' becomes dotted 'İ', so a valid built-in name containing 'i' would
  // silently fail to match its enum constant.
  Optional<BuiltInHiveSerDe> hiveSerDe =
      Enums.getIfPresent(BuiltInHiveSerDe.class, serDeType.toUpperCase(java.util.Locale.ROOT));
  if (hiveSerDe.isPresent()) {
    return new HiveSerDeWrapper(hiveSerDe.get());
  }

  // Not a built-in SerDe: the caller has to supply both format classes explicitly.
  Preconditions.checkArgument(inputFormatClassName.isPresent(),
      "Missing input format class name for SerDe " + serDeType);
  Preconditions.checkArgument(outputFormatClassName.isPresent(),
      "Missing output format class name for SerDe " + serDeType);
  return new HiveSerDeWrapper(serDeType, inputFormatClassName.get(), outputFormatClassName.get());
}
/**
 * Get an instance of {@link HiveSerDeWrapper} without specifying input/output format class names.
 *
 * <p>
 * Delegates to {@link #get(String, Optional, Optional)} with both format class names absent.
 * </p>
 *
 * @param serDeType The SerDe type. This should be one of the available
 *        {@link HiveSerDeWrapper.BuiltInHiveSerDe}s.
 * @return a {@link HiveSerDeWrapper} for the given SerDe type
 */
public static HiveSerDeWrapper get(String serDeType) {
  Optional<String> noInputFormat = Optional.absent();
  Optional<String> noOutputFormat = Optional.absent();
  return get(serDeType, noInputFormat, noOutputFormat);
}
/**
 * Creates the work units for this source, first making sure an input format class is configured.
 *
 * <p>
 * If {@code HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY} is not set in the state, it is filled in
 * with the input format class name of the deserializer configured in that state. Work unit creation
 * itself is delegated to the superclass.
 * </p>
 *
 * @param state the source state; may be updated with the input format class property
 * @return the work units produced by the superclass
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  final String inputFormatKey = HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY;

  // An explicitly configured input format wins; nothing to derive.
  if (state.contains(inputFormatKey)) {
    return super.getWorkunits(state);
  }

  String derivedInputFormat = HiveSerDeWrapper.getDeserializer(state).getInputFormatClassName();
  state.setProp(inputFormatKey, derivedInputFormat);
  return super.getWorkunits(state);
}
/**
 * Builds a {@link HiveWritableHdfsDataWriter} from this builder and the destination's properties.
 *
 * <p>
 * Unless both {@code WRITER_WRITABLE_CLASS} and {@code WRITER_OUTPUT_FORMAT_CLASS} are already set,
 * both are (re)written from the serializer configured in the destination properties.
 * </p>
 *
 * @return the new data writer
 * @throws IOException if the serializer's SerDe cannot be obtained or the writer cannot be created
 * @throws NullPointerException if no destination has been set
 * @throws IllegalArgumentException if the writer id is null or empty
 */
@SuppressWarnings("deprecation")
@Override
public DataWriter<Writable> build() throws IOException {
  Preconditions.checkNotNull(this.destination);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(this.writerId));

  State writerProps = this.destination.getProperties();

  boolean classesConfigured =
      writerProps.contains(WRITER_WRITABLE_CLASS) && writerProps.contains(WRITER_OUTPUT_FORMAT_CLASS);
  if (!classesConfigured) {
    // Derive both settings from the configured serializer.
    HiveSerDeWrapper serDeWrapper = HiveSerDeWrapper.getSerializer(writerProps);

    String writableClassName = serDeWrapper.getSerDe().getSerializedClass().getName();
    writerProps.setProp(WRITER_WRITABLE_CLASS, writableClassName);

    String outputFormatClassName = serDeWrapper.getOutputFormatClassName();
    writerProps.setProp(WRITER_OUTPUT_FORMAT_CLASS, outputFormatClassName);
  }

  return new HiveWritableHdfsDataWriter(this, writerProps);
}
/**
 * Initializes this converter: obtains the serializer and deserializer SerDes configured in the state
 * and initializes both with the Hadoop configuration and the state's properties.
 *
 * <p>
 * The deserializer is initialized first, then {@code setColumnsIfPossible(state)} runs, and only then
 * is the serializer initialized; this order is kept deliberately. Any {@link IOException} or
 * {@link SerDeException} is logged and rethrown via {@code Throwables.propagate}.
 * </p>
 *
 * @param state the work unit state carrying the SerDe configuration
 * @return this converter
 */
@Override
public HiveSerDeConverter init(WorkUnitState state) {
  super.init(state);
  Configuration hadoopConf = HadoopUtils.getConfFromState(state);

  try {
    HiveSerDeWrapper serializerWrapper = HiveSerDeWrapper.getSerializer(state);
    this.serializer = serializerWrapper.getSerDe();

    HiveSerDeWrapper deserializerWrapper = HiveSerDeWrapper.getDeserializer(state);
    this.deserializer = deserializerWrapper.getSerDe();

    this.deserializer.initialize(hadoopConf, state.getProperties());
    // Runs between the two initialize calls; the properties are re-read from the state afterwards.
    setColumnsIfPossible(state);
    this.serializer.initialize(hadoopConf, state.getProperties());

    return this;
  } catch (IOException ioe) {
    log.error("Failed to instantiate serializer and deserializer", ioe);
    throw Throwables.propagate(ioe);
  } catch (SerDeException sde) {
    log.error("Failed to initialize serializer and deserializer", sde);
    throw Throwables.propagate(sde);
  }
}
/**
 * Builds a {@link HiveWritableHdfsDataWriter} from this builder and the destination's properties.
 *
 * <p>
 * Unless both {@code WRITER_WRITABLE_CLASS} and {@code WRITER_OUTPUT_FORMAT_CLASS} are already set,
 * both are (re)written from the serializer configured in the destination properties.
 * </p>
 *
 * @return the new data writer
 * @throws IOException if the serializer's SerDe cannot be obtained or the writer cannot be created
 * @throws NullPointerException if no destination has been set
 * @throws IllegalArgumentException if the writer id is null or empty
 */
@SuppressWarnings("deprecation")
@Override
public DataWriter<Writable> build() throws IOException {
  Preconditions.checkNotNull(this.destination);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(this.writerId));

  State writerProps = this.destination.getProperties();

  boolean classesConfigured =
      writerProps.contains(WRITER_WRITABLE_CLASS) && writerProps.contains(WRITER_OUTPUT_FORMAT_CLASS);
  if (!classesConfigured) {
    // Derive both settings from the configured serializer.
    HiveSerDeWrapper serDeWrapper = HiveSerDeWrapper.getSerializer(writerProps);

    String writableClassName = serDeWrapper.getSerDe().getSerializedClass().getName();
    writerProps.setProp(WRITER_WRITABLE_CLASS, writableClassName);

    String outputFormatClassName = serDeWrapper.getOutputFormatClassName();
    writerProps.setProp(WRITER_OUTPUT_FORMAT_CLASS, outputFormatClassName);
  }

  return new HiveWritableHdfsDataWriter(this, writerProps);
}
/**
 * Initializes this converter: obtains the serializer and deserializer SerDes configured in the state
 * and initializes both with the Hadoop configuration and the state's properties.
 *
 * <p>
 * The deserializer is initialized first, then {@code setColumnsIfPossible(state)} runs, and only then
 * is the serializer initialized; this order is kept deliberately. Any {@link IOException} or
 * {@link SerDeException} is logged and rethrown via {@code Throwables.propagate}.
 * </p>
 *
 * @param state the work unit state carrying the SerDe configuration
 * @return this converter
 */
@Override
public HiveSerDeConverter init(WorkUnitState state) {
  super.init(state);
  Configuration hadoopConf = HadoopUtils.getConfFromState(state);

  try {
    HiveSerDeWrapper serializerWrapper = HiveSerDeWrapper.getSerializer(state);
    this.serializer = serializerWrapper.getSerDe();

    HiveSerDeWrapper deserializerWrapper = HiveSerDeWrapper.getDeserializer(state);
    this.deserializer = deserializerWrapper.getSerDe();

    this.deserializer.initialize(hadoopConf, state.getProperties());
    // Runs between the two initialize calls; the properties are re-read from the state afterwards.
    setColumnsIfPossible(state);
    this.serializer.initialize(hadoopConf, state.getProperties());

    return this;
  } catch (IOException ioe) {
    log.error("Failed to instantiate serializer and deserializer", ioe);
    throw Throwables.propagate(ioe);
  } catch (SerDeException sde) {
    log.error("Failed to initialize serializer and deserializer", sde);
    throw Throwables.propagate(sde);
  }
}
/**
 * Creates the work units for this source, first making sure an input format class is configured.
 *
 * <p>
 * If {@code HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY} is not set in the state, it is filled in
 * with the input format class name of the deserializer configured in that state. Work unit creation
 * itself is delegated to the superclass.
 * </p>
 *
 * @param state the source state; may be updated with the input format class property
 * @return the work units produced by the superclass
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  final String inputFormatKey = HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY;

  // An explicitly configured input format wins; nothing to derive.
  if (state.contains(inputFormatKey)) {
    return super.getWorkunits(state);
  }

  String derivedInputFormat = HiveSerDeWrapper.getDeserializer(state).getInputFormatClassName();
  state.setProp(inputFormatKey, derivedInputFormat);
  return super.getWorkunits(state);
}
/**
 * Registers the Avro {@link Schema} found under a directory, together with the SerDe class and the
 * input/output formats of this manager's SerDe wrapper, on the given {@link HiveRegistrationUnit}.
 *
 * <p>
 * The schema is read with {@link #getDirectorySchema(Path)}; the time spent reading it is recorded
 * under the {@code HIVE_SPEC_SCHEMA_READING_TIMER} timer. When no schema is found ({@code null}),
 * the registration unit is left untouched.
 * </p>
 *
 * <p>
 * Attaching the schema itself is delegated to {@code addSchemaProperties}. That helper is expected to
 * choose between {@link #SCHEMA_LITERAL} and a {@link #SCHEMA_URL} pointing at {@link #SCHEMA_FILE_NAME},
 * based on {@link #USE_SCHEMA_FILE} and {@link #SCHEMA_LITERAL_LENGTH_LIMIT}; that decision is not made
 * in this method.
 * </p>
 *
 * @param path the directory whose schema should be registered; must be a directory
 * @param hiveUnit the registration unit to populate
 * @throws IOException if the file status lookup, the schema lookup or the SerDe lookup fails
 * @throws IllegalArgumentException if {@code path} is not a directory
 */
@Override
public void addSerDeProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
  boolean pathIsDirectory = this.fs.getFileStatus(path).isDirectory();
  Preconditions.checkArgument(pathIsDirectory, path + " is not a directory.");

  Schema directorySchema;
  try (Timer.Context timing = metricContext.timer(HIVE_SPEC_SCHEMA_READING_TIMER).time()) {
    directorySchema = getDirectorySchema(path);
  }

  // Only populate the unit when a schema was actually found.
  if (directorySchema != null) {
    String serDeClassName = this.serDeWrapper.getSerDe().getClass().getName();
    hiveUnit.setSerDeType(serDeClassName);

    String inputFormat = this.serDeWrapper.getInputFormatClassName();
    hiveUnit.setInputFormat(inputFormat);

    String outputFormat = this.serDeWrapper.getOutputFormatClassName();
    hiveUnit.setOutputFormat(outputFormat);

    addSchemaProperties(path, hiveUnit, directorySchema);
  }
}
/**
 * Get the serializer {@link HiveSerDeWrapper} described by a {@link State}.
 *
 * <p>
 * Reads the SerDe type and the (possibly missing) input/output format class names from the state and
 * delegates to {@link #get(String, Optional, Optional)}.
 * </p>
 *
 * @param state The state should contain property {@link #SERDE_SERIALIZER_TYPE}, and optionally contain
 *        properties {@link #SERDE_SERIALIZER_INPUT_FORMAT_TYPE} and
 *        {@link #SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE}.
 * @return the {@link HiveSerDeWrapper} for the configured serializer
 * @throws IllegalArgumentException if {@link #SERDE_SERIALIZER_TYPE} is not set in the state
 */
public static HiveSerDeWrapper getSerializer(State state) {
  boolean typeConfigured = state.contains(SERDE_SERIALIZER_TYPE);
  Preconditions.checkArgument(typeConfigured, "Missing required property " + SERDE_SERIALIZER_TYPE);

  String serDeType = state.getProp(SERDE_SERIALIZER_TYPE);
  Optional<String> inputFormat = Optional.fromNullable(state.getProp(SERDE_SERIALIZER_INPUT_FORMAT_TYPE));
  Optional<String> outputFormat = Optional.fromNullable(state.getProp(SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE));
  return get(serDeType, inputFormat, outputFormat);
}
/**
 * Get an instance of {@link HiveSerDeWrapper}.
 *
 * <p>
 * The SerDe type is upper-cased and looked up among the constants of
 * {@link HiveSerDeWrapper.BuiltInHiveSerDe}. On a match, the wrapper is built from that constant and
 * the two format parameters are ignored. Otherwise the SerDe type is treated as the class name of a
 * {@link SerDe}, and both format class names must be present.
 * </p>
 *
 * @param serDeType The SerDe type: either the name of a {@link HiveSerDeWrapper.BuiltInHiveSerDe}
 *        (matched regardless of case) or the class name of a {@link SerDe}.
 * @param inputFormatClassName The input format class name; required when serDeType is not built in.
 * @param outputFormatClassName The output format class name; required when serDeType is not built in.
 * @return a {@link HiveSerDeWrapper} for the requested SerDe type
 * @throws IllegalArgumentException if serDeType is not built in and either format class name is absent
 */
public static HiveSerDeWrapper get(String serDeType, Optional<String> inputFormatClassName,
    Optional<String> outputFormatClassName) {
  // Upper-case with a fixed locale. The no-argument toUpperCase() follows the JVM default locale, and
  // under e.g. a Turkish locale 'i' becomes dotted 'İ', so a valid built-in name containing 'i' would
  // silently fail to match its enum constant.
  Optional<BuiltInHiveSerDe> hiveSerDe =
      Enums.getIfPresent(BuiltInHiveSerDe.class, serDeType.toUpperCase(java.util.Locale.ROOT));
  if (hiveSerDe.isPresent()) {
    return new HiveSerDeWrapper(hiveSerDe.get());
  }

  // Not a built-in SerDe: the caller has to supply both format classes explicitly.
  Preconditions.checkArgument(inputFormatClassName.isPresent(),
      "Missing input format class name for SerDe " + serDeType);
  Preconditions.checkArgument(outputFormatClassName.isPresent(),
      "Missing output format class name for SerDe " + serDeType);
  return new HiveSerDeWrapper(serDeType, inputFormatClassName.get(), outputFormatClassName.get());
}
/**
 * Get the deserializer {@link HiveSerDeWrapper} described by a {@link State}.
 *
 * <p>
 * Reads the SerDe type and the (possibly missing) input/output format class names from the state and
 * delegates to {@link #get(String, Optional, Optional)}.
 * </p>
 *
 * @param state The state should contain property {@link #SERDE_DESERIALIZER_TYPE}, and optionally contain
 *        properties {@link #SERDE_DESERIALIZER_INPUT_FORMAT_TYPE} and
 *        {@link #SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE}.
 * @return the {@link HiveSerDeWrapper} for the configured deserializer
 * @throws IllegalArgumentException if {@link #SERDE_DESERIALIZER_TYPE} is not set in the state
 */
public static HiveSerDeWrapper getDeserializer(State state) {
  boolean typeConfigured = state.contains(SERDE_DESERIALIZER_TYPE);
  Preconditions.checkArgument(typeConfigured, "Missing required property " + SERDE_DESERIALIZER_TYPE);

  String serDeType = state.getProp(SERDE_DESERIALIZER_TYPE);
  Optional<String> inputFormat = Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_INPUT_FORMAT_TYPE));
  Optional<String> outputFormat = Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE));
  return get(serDeType, inputFormat, outputFormat);
}
}
/**
 * Get an instance of {@link HiveSerDeWrapper} without specifying input/output format class names.
 *
 * <p>
 * Delegates to {@link #get(String, Optional, Optional)} with both format class names absent.
 * </p>
 *
 * @param serDeType The SerDe type. This should be one of the available
 *        {@link HiveSerDeWrapper.BuiltInHiveSerDe}s.
 * @return a {@link HiveSerDeWrapper} for the given SerDe type
 */
public static HiveSerDeWrapper get(String serDeType) {
  Optional<String> noInputFormat = Optional.absent();
  Optional<String> noOutputFormat = Optional.absent();
  return get(serDeType, noInputFormat, noOutputFormat);
}
/**
 * Get the serializer {@link HiveSerDeWrapper} described by a {@link State}.
 *
 * <p>
 * Reads the SerDe type and the (possibly missing) input/output format class names from the state and
 * delegates to {@link #get(String, Optional, Optional)}.
 * </p>
 *
 * @param state The state should contain property {@link #SERDE_SERIALIZER_TYPE}, and optionally contain
 *        properties {@link #SERDE_SERIALIZER_INPUT_FORMAT_TYPE} and
 *        {@link #SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE}.
 * @return the {@link HiveSerDeWrapper} for the configured serializer
 * @throws IllegalArgumentException if {@link #SERDE_SERIALIZER_TYPE} is not set in the state
 */
public static HiveSerDeWrapper getSerializer(State state) {
  boolean typeConfigured = state.contains(SERDE_SERIALIZER_TYPE);
  Preconditions.checkArgument(typeConfigured, "Missing required property " + SERDE_SERIALIZER_TYPE);

  String serDeType = state.getProp(SERDE_SERIALIZER_TYPE);
  Optional<String> inputFormat = Optional.fromNullable(state.getProp(SERDE_SERIALIZER_INPUT_FORMAT_TYPE));
  Optional<String> outputFormat = Optional.fromNullable(state.getProp(SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE));
  return get(serDeType, inputFormat, outputFormat);
}
/**
 * Get the deserializer {@link HiveSerDeWrapper} described by a {@link State}.
 *
 * <p>
 * Reads the SerDe type and the (possibly missing) input/output format class names from the state and
 * delegates to {@link #get(String, Optional, Optional)}.
 * </p>
 *
 * @param state The state should contain property {@link #SERDE_DESERIALIZER_TYPE}, and optionally contain
 *        properties {@link #SERDE_DESERIALIZER_INPUT_FORMAT_TYPE} and
 *        {@link #SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE}.
 * @return the {@link HiveSerDeWrapper} for the configured deserializer
 * @throws IllegalArgumentException if {@link #SERDE_DESERIALIZER_TYPE} is not set in the state
 */
public static HiveSerDeWrapper getDeserializer(State state) {
  boolean typeConfigured = state.contains(SERDE_DESERIALIZER_TYPE);
  Preconditions.checkArgument(typeConfigured, "Missing required property " + SERDE_DESERIALIZER_TYPE);

  String serDeType = state.getProp(SERDE_DESERIALIZER_TYPE);
  Optional<String> inputFormat = Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_INPUT_FORMAT_TYPE));
  Optional<String> outputFormat = Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE));
  return get(serDeType, inputFormat, outputFormat);
}
}