final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    sc.sc().addSparkListener(jobListener);

    final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
    if (!fs.exists(partitionFilePath)) {
        throw new IllegalArgumentException("File not exist: " + partitionFilePath.toString());
    }

    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath, sc.hadoopConfiguration())) {
        RowKeyWritable key = new RowKeyWritable();
        Writable value = NullWritable.get();
        while (reader.next(key, value)) {
            keys.add(key);
            logger.info(" ------- split key: {}", key);
        }
    }

    final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());
    try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
        Configuration hbaseJobConf = new Configuration();
        hbaseJobConf.addResource(confInput);
        hbaseJobConf.set("spark.hadoop.dfs.replication", "3"); // HFile, replication=3

        Job job = Job.getInstance(hbaseJobConf, cubeSegment.getStorageLocationIdentifier());
/**
 * Read the metadata from a Hadoop SequenceFile.
 *
 * @param fs   The filesystem to read from
 * @param path The file to read from
 * @return The metadata from this file
 */
public static Map<String, String> getMetadataFromSequenceFile(FileSystem fs, Path path) {
    try {
        Configuration conf = new Configuration();
        conf.setInt("io.file.buffer.size", 4096);

        // Use the configured Configuration and close the reader even if getMetadata() throws.
        SequenceFile.Metadata meta;
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
            meta = reader.getMetadata();
        }

        TreeMap<Text, Text> map = meta.getMetadata();
        Map<String, String> values = new HashMap<String, String>();
        for (Map.Entry<Text, Text> entry : map.entrySet()) {
            values.put(entry.getKey().toString(), entry.getValue().toString());
        }
        return values;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
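A minimal usage sketch for the helper above, assuming a local filesystem; the file path is a placeholder chosen for illustration.

// Illustrative usage of getMetadataFromSequenceFile; the path is a placeholder.
Configuration conf = new Configuration();
FileSystem localFs = FileSystem.getLocal(conf);
Map<String, String> metadata = getMetadataFromSequenceFile(localFs, new Path("/tmp/example.seq"));
for (Map.Entry<String, String> entry : metadata.entrySet()) {
    System.out.println(entry.getKey() + " = " + entry.getValue());
}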
public void printFile(String path) throws Exception {
    FileSystem fileSystem = FileUtil.getFileSystem(path);
    Path fsPath = new Path(path);
    SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
    BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
    System.out.println("reading file " + path);
    while (reader.next(key, value)) {
        if (mPrintOffsetsOnly) {
            System.out.println(Long.toString(key.get()));
        } else {
            // Trim the padded backing array down to the actual value length before printing.
            byte[] nonPaddedBytes = new byte[value.getLength()];
            System.arraycopy(value.getBytes(), 0, nonPaddedBytes, 0, value.getLength());
            System.out.println(Long.toString(key.get()) + ": " + new String(nonPaddedBytes));
        }
    }
    reader.close();
}
/**
 * Get token from the token sequence file.
 *
 * @param authPath path to the local token sequence file
 * @param proxyUserName user whose token should be returned
 * @return Token for proxyUserName if it exists.
 * @throws IOException
 */
private static Optional<Token<?>> getTokenFromSeqFile(String authPath, String proxyUserName) throws IOException {
    try (Closer closer = Closer.create()) {
        FileSystem localFs = FileSystem.getLocal(new Configuration());
        SequenceFile.Reader tokenReader =
                closer.register(new SequenceFile.Reader(localFs, new Path(authPath), localFs.getConf()));
        Text key = new Text();
        Token<?> value = new Token<>();
        while (tokenReader.next(key, value)) {
            LOG.info("Found token for " + key);
            if (key.toString().equals(proxyUserName)) {
                return Optional.<Token<?>> of(value);
            }
        }
    }
    return Optional.absent();
}
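One possible way to consume the helper above, assuming the caller wants to attach the token to the current user and that the surrounding code may throw IOException; the token file path and proxy user name below are placeholders.

// Illustrative only: look up the delegation token for a proxy user and, if present,
// attach it to the current UserGroupInformation. Path and user name are assumptions.
Optional<Token<?>> token = getTokenFromSeqFile("/var/run/hadoop/tokens.seq", "etl_proxy");
if (token.isPresent()) {
    UserGroupInformation.getCurrentUser().addToken(token.get());
}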
public JobState.DatasetState getInternal(String storeName, String tableName, String stateId,
        boolean sanitizeKeyForComparison) throws IOException {
    Path tablePath = new Path(new Path(this.storeRootDir, storeName), tableName);
    if (!this.fs.exists(tablePath)) {
        return null;
    }

    Configuration deserializeConf = new Configuration(this.conf);
    WritableShimSerialization.addToHadoopConfiguration(deserializeConf);
    try (@SuppressWarnings("deprecation")
    SequenceFile.Reader reader = new SequenceFile.Reader(this.fs, tablePath, deserializeConf)) {
        Object writable = reader.getValueClass() == JobState.class ? new JobState() : new JobState.DatasetState();
        Text key = new Text();
        while (reader.next(key)) {
            String stringKey = sanitizeKeyForComparison
                    ? sanitizeDatasetStatestoreNameFromDatasetURN(storeName, key.toString())
                    : key.toString();
            writable = reader.getCurrentValue(writable);
            if (stringKey.equals(stateId)) {
                if (writable instanceof JobState.DatasetState) {
@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
    log.info("Input file path:" + inputPathString);
    Path inputPath = new Path(inputPathString);

    SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
    SequenceFile.Metadata meta = reader.getMetadata();

    try {
        Text keySchema = meta.get(new Text("key.schema"));
        Text valueSchema = meta.get(new Text("value.schema"));

        if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
            throw new Exception();
        }

        // update JobConf with schemas
        conf.set("mapper.input.key.schema", keySchema.toString());
        conf.set("mapper.input.value.schema", valueSchema.toString());
    } catch (Exception e) {
        throw new IOException("Failed to load schema from file: " + inputPathString + "\n");
    }

    return super.getRecordReader(split, conf, reporter);
}
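The record reader above expects the schemas to be stored in the SequenceFile's own metadata under the keys "key.schema" and "value.schema". A sketch, not taken from the project itself, of writing a file with such metadata using the options-based writer; the output path, schema strings, and key/value classes are assumptions.

// Hypothetical writer side: store schema strings in the SequenceFile metadata so a reader
// like the one above can pick them up. Path and schema JSON are placeholders.
Configuration conf = new Configuration();
Path out = new Path("/tmp/with-schema.seq");

SequenceFile.Metadata fileMeta = new SequenceFile.Metadata();
fileMeta.set(new Text("key.schema"), new Text("\"string\""));
fileMeta.set(new Text("value.schema"), new Text("\"int\""));

try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(out),
        SequenceFile.Writer.keyClass(BytesWritable.class),
        SequenceFile.Writer.valueClass(BytesWritable.class),
        SequenceFile.Writer.metadata(fileMeta))) {
    writer.append(new BytesWritable(new byte[] { 1 }), new BytesWritable(new byte[] { 2 }));
}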
/** Tests that reading and writing ordinary Writables still works. */
@Test
public void testReadWritables() throws IOException {
    Path sequenceFilePath = new Path(new File(mTempDir.getRoot(), "output.seq").getPath());

    writeSequenceFile(sequenceFilePath, Text.class, IntWritable.class, null, null,
        new Text("one"), new IntWritable(1),
        new Text("two"), new IntWritable(2));

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    AvroSequenceFile.Reader.Options options = new AvroSequenceFile.Reader.Options()
        .withFileSystem(fs)
        .withInputPath(sequenceFilePath)
        .withConfiguration(conf);
    SequenceFile.Reader reader = new AvroSequenceFile.Reader(options);

    Text key = new Text();
    IntWritable value = new IntWritable();

    // Read the first record.
    assertTrue(reader.next(key));
    assertEquals("one", key.toString());
    reader.getCurrentValue(value);
    assertNotNull(value);
    assertEquals(1, value.get());

    // Read the second record.
    assertTrue(reader.next(key));
    assertEquals("two", key.toString());
    reader.getCurrentValue(value);
    assertNotNull(value);
    assertEquals(2, value.get());

    assertFalse("Should be no more records.", reader.next(key));
}
public static void copyTo64MB(String src, String dst) throws IOException {
    Configuration hconf = new Configuration();
    Path srcPath = new Path(src);
    Path dstPath = new Path(dst);

    FileSystem fs = FileSystem.get(hconf);
    long srcSize = fs.getFileStatus(srcPath).getLen();
    int copyTimes = (int) (67108864 / srcSize); // 64 MB
    System.out.println("Copy " + copyTimes + " times");

    Reader reader = new Reader(hconf, SequenceFile.Reader.file(srcPath));
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf);
    Text value = new Text();

    Writer writer = SequenceFile.createWriter(hconf, Writer.file(dstPath), Writer.keyClass(key.getClass()),
            Writer.valueClass(Text.class), Writer.compression(CompressionType.BLOCK, getLZOCodec(hconf)));

    int count = 0;
    while (reader.next(key, value)) {
        for (int i = 0; i < copyTimes; i++) {
            writer.append(key, value);
            count++;
        }
    }

    System.out.println("Len: " + writer.getLength());
    System.out.println("Rows: " + count);

    reader.close();
    writer.close();
}
@SuppressWarnings("unchecked") public void testReadAvroWithoutReaderSchemas() throws IOException { Path sequenceFilePath = new Path(new File(mTempDir.getRoot(), "output.seq").getPath()); new AvroKey<CharSequence>("two"), new AvroValue<>(2)); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); AvroSequenceFile.Reader.Options options = new AvroSequenceFile.Reader.Options() .withFileSystem(fs) key = (AvroKey<CharSequence>) reader.next(key); assertNotNull(key); assertEquals("one", key.datum().toString()); value = (AvroValue<Integer>) reader.getCurrentValue(value); assertNotNull(value); assertEquals(1, value.datum().intValue()); key = (AvroKey<CharSequence>) reader.next(key); assertNotNull(key); assertEquals("two", key.datum().toString()); value = (AvroValue<Integer>) reader.getCurrentValue(value); assertNotNull(value); assertEquals(2, value.datum().intValue()); assertNull("Should be no more records.", reader.next(key));
public static ClusteringPolicy readPolicy(Path path) throws IOException {
    Path policyPath = new Path(path, POLICY_FILE_NAME);
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(policyPath.toUri(), config);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, policyPath, config);
    Text key = new Text();
    ClusteringPolicyWritable cpw = new ClusteringPolicyWritable();
    reader.next(key, cpw);
    Closeables.close(reader, true);
    return cpw.getValue();
}
private String[] getContent(Configuration conf, Path path) throws Exception {
    ClassLoader prevClassLoader = ClassLoaderStack.addJarFile(
        new Path(new Path(new SqoopOptions().getJarOutputDir()), getTableName() + ".jar").toString(),
        getTableName());

    FileSystem fs = FileSystem.getLocal(conf);
    FileStatus[] stats = fs.listStatus(path);
    Path[] paths = new Path[stats.length];
    for (int i = 0; i < stats.length; i++) {
        paths[i] = stats[i].getPath();
    }

    // Read all the files adding the value lines to the list.
    List<String> strings = new ArrayList<String>();
    for (Path filePath : paths) {
        if (filePath.getName().startsWith("_") || filePath.getName().startsWith(".")) {
            continue;
        }

        // Need to use new configuration object so that it has the proper classloaders.
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, new Configuration());
        WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
        Writable value = (Writable) reader.getValueClass().newInstance();
        while (reader.next(key, value)) {
            strings.add(value.toString());
        }
    }

    ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
    return strings.toArray(new String[0]);
}
String dictInfoPath = mergedDictInfo == null ? "" : mergedDictInfo.getResourcePath();
context.write(new IntWritable(-1), new Text(tblCol + "=" + dictInfoPath));
// ...
        context.getConfiguration().get(BatchConstants.ARG_META_URL));
final String cubeName = context.getConfiguration().get(BatchConstants.ARG_CUBE_NAME);
final String segmentId = context.getConfiguration().get(BatchConstants.ARG_SEGMENT_ID);
final String statOutputPath = context.getConfiguration()
        .get(MergeDictionaryJob.OPTION_OUTPUT_PATH_STAT.getOpt());
// ...
conf = HadoopUtil.getCurrentConfiguration();
reader = new SequenceFile.Reader(fs, new Path(tempFile.getAbsolutePath()), conf);
LongWritable keyW = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
BytesWritable valueW = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
while (reader.next(keyW, valueW)) {
    if (keyW.get() == 0L) {
        // ...
CubeStatsWriter.writeCuboidStatistics(conf, new Path(statOutputPath), cuboidHLLMap, averageSamplingPercentage);
Path statisticsFilePath = new Path(statOutputPath,
        // ...
FSDataInputStream fis = fs.open(statisticsFilePath);
// ...
context.write(new IntWritable(-1), new Text(""));
@SuppressWarnings("unchecked") public static Path merge(Configuration configuration, String inputPath, String outputPath, int mapperTasks, boolean textFileFormat, boolean deleteSource) throws IOException, InstantiationException, IllegalAccessException { Class<? extends Writable> keyClass = LongWritable.class; Class<? extends Writable> valueClass = Text.class; FileSystem fs = FileSystem.get(new Configuration()); if (!textFileFormat) { FileStatus[] fileStatus = fs.globStatus(new Path(inputPath)); Preconditions.checkArgument(fileStatus.length > 0, "Invalid input path..."); SequenceFile.Reader reader = new SequenceFile.Reader(fs, fileStatus[fileStatus.length - 1].getPath(), fs.getConf()); try { keyClass = (Class<? extends Writable>) reader.getKeyClass(); valueClass = (Class<? extends Writable>) reader.getValueClass(); sLogger.info("Key type: " + keyClass.toString()); sLogger.info("Value type: " + valueClass.toString()); } catch (Exception e) { throw new RuntimeException("Error in loading key/value class"); } reader.close(); } if (textFileFormat) { return mergeTextFiles(configuration, inputPath, outputPath, mapperTasks, deleteSource); } else { return mergeSequenceFiles(configuration, inputPath, outputPath, mapperTasks, keyClass, valueClass, deleteSource); } }
private List<MyMessage> readMessages(Path path) throws IOException, InstantiationException, IllegalAccessException {
    List<MyMessage> messages = new ArrayList<MyMessage>();
    try {
        for (FileStatus file : fs.listStatus(path)) {
            if (file.isDir()) {
                messages.addAll(readMessages(file.getPath()));
            } else {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), new Configuration());
                try {
                    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
                    Text value = (Text) reader.getValueClass().newInstance();
                    while (reader.next(key, value)) {
                        messages.add(gson.fromJson(value.toString(), MyMessage.class));
                    }
                } finally {
                    reader.close();
                }
            }
        }
    } catch (FileNotFoundException e) {
        System.out.println("No camus messages were found in [" + path + "]");
    }
    return messages;
}
protected static void checkOutputData(Path dataDir, int expectedCount) throws IOException {
    List<Path> dataFiles = IOUtils.listFiles(dataDir, "*/part-*");
    Configuration hadoopConf = new Configuration();
    Writable key = new Text();
    Writable message = new Text();
    int count = 0;
    for (Path dataFile : dataFiles) {
        SequenceFile.Reader.Option fileOptions =
                SequenceFile.Reader.file(new org.apache.hadoop.fs.Path(dataFile.toUri().toString()));
        try (SequenceFile.Reader reader = new SequenceFile.Reader(hadoopConf, fileOptions)) {
            while (reader.next(key, message)) {
                count++;
    // ...
    log.info("Data file: {}", dataFile);
    SequenceFile.Reader.Option fileOptions =
            SequenceFile.Reader.file(new org.apache.hadoop.fs.Path(dataFile.toUri().toString()));
    try (SequenceFile.Reader reader = new SequenceFile.Reader(hadoopConf, fileOptions)) {
        while (reader.next(key, message)) {
            log.info(" {} = {}", key, message);
configurer.addOutputFormat("out2", SequenceFileOutputFormat.class, Text.class, IntWritable.class);
Path outDir = new Path(workDir.getPath(), job.getJobName());
FileOutputFormat.setOutputPath(configurer.getJob("out1"), new Path(outDir, "out1"));
FileOutputFormat.setOutputPath(configurer.getJob("out2"), new Path(outDir, "out2"));
// ...
String[] textOutput = readFully(textOutPath).split("\n");

Path seqOutPath = new Path(outDir, "out2/part-m-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqOutPath, mrConf);
Text key = new Text();
IntWritable value = new IntWritable();

String[] words = fileContent.split(" ");
for (int i = 0; i < words.length; i++) {
    Assert.assertEquals((i + 1) + "\t" + words[i], textOutput[i]);
    reader.next(key, value);
    Assert.assertEquals(words[i], key.toString());
    Assert.assertEquals((i + 1), value.get());
}
Assert.assertFalse(reader.next(key, value));
public ObjectWritableIterator(final Configuration configuration, final Path path) throws IOException {
    for (final FileStatus status : FileSystem.get(configuration).listStatus(path, HiddenFileFilter.instance())) {
        this.readers.add(new SequenceFile.Reader(configuration, SequenceFile.Reader.file(status.getPath())));
    }
}
public static void checkOutput(FileSystem fileSys, Configuration conf, int tasks) throws Exception {
    FileStatus[] listStatus = fileSys.globStatus(new Path(OUTPUT_PATH + "/part-*"));
    // ...
    if (!status.isDir()) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, status.getPath(), conf);
        int superStep = 0;
        int taskstep = 0;
        IntWritable key = new IntWritable();
        Text value = new Text();
        while (reader.next(key, value)) {
            assertEquals(superStep, key.get());
            taskstep++;
            // ...
        }
        reader.close();
    }
    // ...
    fileSys.delete(new Path(TMP_OUTPUT), true);
}
private Map<CamusRequest, EtlKey> getPreviousOffsets(Path[] inputs, JobContext context) throws IOException {
    Map<CamusRequest, EtlKey> offsetKeysMap = new HashMap<CamusRequest, EtlKey>();
    for (Path input : inputs) {
        FileSystem fs = input.getFileSystem(context.getConfiguration());
        for (FileStatus f : fs.listStatus(input, new OffsetFileFilter())) {
            log.info("previous offset file:" + f.getPath().toString());
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), context.getConfiguration());
            EtlKey key = new EtlKey();
            while (reader.next(key, NullWritable.get())) {
                // TODO: factor out kafka specific request functionality
                CamusRequest request = new EtlRequest(context, key.getTopic(), key.getLeaderId(), key.getPartition());
                if (offsetKeysMap.containsKey(request)) {
                    EtlKey oldKey = offsetKeysMap.get(request);
                    if (oldKey.getOffset() < key.getOffset()) {
                        offsetKeysMap.put(request, key);
                    }
                } else {
                    offsetKeysMap.put(request, key);
                }
                key = new EtlKey();
            }
            reader.close();
        }
    }
    return offsetKeysMap;
}
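Most of the excerpts above go through the SequenceFile.Reader(FileSystem, Path, Configuration) constructor, which is deprecated in current Hadoop releases; a few already use the options-based constructor via SequenceFile.Reader.file(...). For reference, a minimal sketch of a generic read loop in that style; the input path is a placeholder, and the key/value classes are discovered from the file header through reflection rather than assumed.

// Generic read loop using the non-deprecated, options-based constructor.
// The input path is a placeholder; key/value classes come from the file header.
Configuration conf = new Configuration();
Path input = new Path("/tmp/example.seq");

try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    while (reader.next(key, value)) {
        System.out.println(key + "\t" + value);
    }
}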