Refine search
/**
 * Reads the metadata from a Hadoop SequenceFile.
 *
 * @param fs the filesystem to read from
 * @param path the file to read from
 * @return the metadata of this file as String key/value pairs
 * @throws RuntimeException wrapping any IOException raised while reading
 */
public static Map<String, String> getMetadataFromSequenceFile(FileSystem fs, Path path) {
  try {
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 4096);
    // Bug fix: the configured buffer size was previously discarded because a
    // fresh Configuration() was passed to the reader instead of 'conf'.
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    TreeMap<Text, Text> map;
    try {
      map = reader.getMetadata().getMetadata();
    } finally {
      // Leak fix: close the reader even if getMetadata() throws.
      reader.close();
    }
    Map<String, String> values = new HashMap<String, String>();
    for (Map.Entry<Text, Text> entry : map.entrySet()) {
      values.put(entry.getKey().toString(), entry.getValue().toString());
    }
    return values;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Validates that every listed file is a readable SequenceFile.
 *
 * @param fs the filesystem holding the files
 * @param conf Hive configuration passed to the reader
 * @param files the input files to check
 * @return true if the list is non-empty and every file has a valid
 *         SequenceFile header; false otherwise
 */
@Override
public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files)
    throws IOException {
  if (files.isEmpty()) {
    return false;
  }
  for (FileStatus file : files) {
    // Opening the reader parses the SequenceFile header; failure means the
    // file is not a valid SequenceFile. try-with-resources guarantees the
    // stream is closed on every path (replaces manual close + closeStream).
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(fs, file.getPath(), conf)) {
      // header check only; no records are read
    } catch (IOException e) {
      return false;
    }
  }
  return true;
}
/**
 * Validates that every listed file is a readable SequenceFile.
 *
 * @param fs the filesystem holding the files
 * @param conf Hive configuration passed to the reader
 * @param files the input files to check
 * @return true if the list is non-empty and every file has a valid
 *         SequenceFile header; false otherwise
 */
@Override
public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files)
    throws IOException {
  if (files.isEmpty()) {
    return false;
  }
  for (FileStatus file : files) {
    // Constructing the reader parses the SequenceFile header; an IOException
    // here means the file is not a valid SequenceFile. try-with-resources
    // closes the stream on all paths (replaces manual close + closeStream).
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(fs, file.getPath(), conf)) {
      // header check only; no records are read
    } catch (IOException e) {
      return false;
    }
  }
  return true;
}
/**
 * Extracts the raw chunk payloads from a Chukwa data-sink file
 * (directory + fileName + ".done") and writes them to a sibling ".raw" file.
 *
 * @param directory directory containing the .done file and receiving the .raw file
 * @param fileName base name of the file (without extension)
 * @throws Exception on filesystem or configuration errors
 */
public static void extractRawLogFromdataSink(String directory, String fileName)
    throws Exception {
  ChukwaConfiguration conf = new ChukwaConfiguration();
  String fsName = conf.get("writer.hdfs.filesystem");
  FileSystem fs = FileSystem.get(new URI(fsName), conf);
  File outputFile = new File(directory + fileName + ".raw");
  ChukwaArchiveKey key = new ChukwaArchiveKey();
  ChunkImpl chunk = ChunkImpl.getBlankChunk();
  // Leak fix: try-with-resources closes both resources even if the FileWriter
  // constructor throws (the reader used to leak) or out.close() throws (the
  // reader was never closed in that case).
  try (SequenceFile.Reader r =
          new SequenceFile.Reader(fs, new Path(directory + fileName + ".done"), conf);
       FileWriter out = new FileWriter(outputFile)) {
    while (r.next(key, chunk)) {
      // NOTE(review): both new String(byte[]) and FileWriter use the platform
      // default charset here — confirm the chunk payload encoding.
      out.write(new String(chunk.getData()));
    }
  }
}
/**
 * Loads a copy listing from the given SequenceFile into a map keyed by path.
 *
 * @param listingPath the SequenceFile holding Text -> CopyListingFileStatus records
 * @return all records of the listing file
 * @throws Exception if the listing cannot be read
 */
private Map<Text, CopyListingFileStatus> getListing(Path listingPath) throws Exception {
  Map<Text, CopyListingFileStatus> listing = new HashMap<>();
  SequenceFile.Reader listingReader =
      new SequenceFile.Reader(conf, SequenceFile.Reader.file(listingPath));
  try {
    Text currentKey = new Text();
    CopyListingFileStatus currentStatus = new CopyListingFileStatus();
    while (listingReader.next(currentKey, currentStatus)) {
      listing.put(currentKey, currentStatus);
      // Fresh instances per record: the map keeps references to the objects
      // just stored, so they must not be reused as read buffers.
      currentKey = new Text();
      currentStatus = new CopyListingFileStatus();
    }
  } finally {
    listingReader.close();
  }
  return listing;
}
/** * read output from the map reduce job * @param fs the DFS FileSystem * @param jobconf configuration of the map reduce job */ public static long readOutput(FileSystem fs, JobConf jobconf) throws IOException, InterruptedException { //read outputs final Path outdir = new Path(TMP_DIR, "out"); Path infile = new Path(outdir, "reduce-out"); IntWritable nworkers = new IntWritable(); LongWritable result = new LongWritable(); long output = 0; SequenceFile.Reader reader = new SequenceFile.Reader(fs, infile, jobconf); try { reader.next(nworkers, result); output = result.get(); } finally { reader.close(); } return output; }
/**
 * Asserts that every non-directory file under outputPath holds the expected
 * double result (within delta) as its first record.
 *
 * @param conf job configuration used to open the readers
 * @param outputPath directory containing the output part files
 * @param expectedResult expected value of each file's first record
 * @param delta allowed absolute difference
 */
static void verifyOutput(HamaConfiguration conf, Path outputPath,
    double expectedResult, double delta) throws IOException {
  FileStatus[] listStatus = fs.listStatus(outputPath);
  for (FileStatus status : listStatus) {
    if (!status.isDir()) {
      SequenceFile.Reader reader =
          new SequenceFile.Reader(fs, status.getPath(), conf);
      try {
        NullWritable key = NullWritable.get();
        DoubleWritable value = new DoubleWritable();
        if (reader.next(key, value)) {
          LOG.info("Output File: " + status.getPath());
          LOG.info("key: '" + key + "' value: '" + value + "' expected: '"
              + expectedResult + "'");
          assertEquals("Expected value: '" + expectedResult + "' != '" + value
              + "'", expectedResult, value.get(), delta);
        }
      } finally {
        // Leak fix: previously the reader stayed open when assertEquals failed.
        reader.close();
      }
    }
  }
}
/**
 * Returns the (key, value) pairs stored in the given SequenceFile output,
 * caching the result in 'outputs' so each file is only read once.
 *
 * @param output path of the SequenceFile to read
 * @return all records of the file as key/value pairs
 * @throws IOException if the file cannot be read
 */
public List<Pair<Object, Object>> ensureOutput(String output) throws IOException {
  List<Pair<Object, Object>> outs = outputs.get(output);
  if (outs == null) {
    outs = new ArrayList<Pair<Object, Object>>();
    SequenceFile.Reader reader = new SequenceFile.Reader(fS, new Path(output), getConf());
    try {
      Object keyToRead = ReflectionUtils.newInstance(reader.getKeyClass(), getConf());
      Object valueToRead = ReflectionUtils.newInstance(reader.getValueClass(), getConf());
      Object readKey;
      // Bug fix: the serialization-framework next(Object) returns the
      // deserialized key, which may be a fresh object rather than the one
      // passed in; store the returned reference instead of the buffer.
      while ((readKey = reader.next(keyToRead)) != null) {
        valueToRead = reader.getCurrentValue(valueToRead);
        outs.add(new Pair<Object, Object>(readKey, valueToRead));
        // Fresh instances per record so stored pairs are not overwritten.
        keyToRead = ReflectionUtils.newInstance(reader.getKeyClass(), getConf());
        valueToRead = ReflectionUtils.newInstance(reader.getValueClass(), getConf());
      }
    } finally {
      // Leak fix: close the reader even if deserialization fails mid-stream.
      reader.close();
    }
    outputs.put(output, outs);
  }
  return outs;
}
/**
 * Reads TF-IDF vectors from reuters/tfidf-vectors/part-r-00000 and clusters
 * them with canopy clustering using cosine distance (T1=0.8, T2=0.7).
 */
public static void main(String args[]) throws Exception {
  String inputDir = "reuters";
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  String vectorsFolder = inputDir + "/tfidf-vectors";
  SequenceFile.Reader reader = new SequenceFile.Reader(fs,
      new Path(vectorsFolder + "/part-r-00000"), conf);
  List<Vector> points = new ArrayList<Vector>();
  try {
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    while (reader.next(key, value)) {
      // NOTE(review): 'value' is reused across iterations; confirm that
      // value.get() yields an independent vector each time, otherwise all
      // entries in 'points' may alias the same object.
      points.add(value.get());
    }
  } finally {
    // Leak fix: previously the reader stayed open if reading threw.
    reader.close();
  }
  System.out.println(points.size());
  List<Canopy> canopies = CanopyClusterer.createCanopies(points,
      new CosineDistanceMeasure(), 0.8, 0.7);
  List<Cluster> clusters = new ArrayList<Cluster>();
  System.out.println(canopies.size());
  for (Canopy canopy : canopies) {
    clusters.add(new Cluster(canopy.getCenter(), canopy.getId(),
        new CosineDistanceMeasure()));
  }
}
}
/**
 * Verifies that the job produced exactly one non-empty part file and that
 * each (key, count) record matches countProduct over the source paths.
 *
 * @param job the completed job whose output is checked
 * @param src source paths used to recompute the expected counts
 */
private static void checkOuterConsistency(Job job, Path[] src) throws IOException {
  Path outf = FileOutputFormat.getOutputPath(job);
  FileStatus[] outlist = cluster.getFileSystem().listStatus(outf,
      new Utils.OutputFileUtils.OutputFilesFilter());
  assertEquals("number of part files is more than 1. It is" + outlist.length,
      1, outlist.length);
  assertTrue("output file with zero length" + outlist[0].getLen(),
      0 < outlist[0].getLen());
  SequenceFile.Reader r = new SequenceFile.Reader(cluster.getFileSystem(),
      outlist[0].getPath(), job.getConfiguration());
  try {
    IntWritable k = new IntWritable();
    IntWritable v = new IntWritable();
    while (r.next(k, v)) {
      assertEquals("counts does not match", v.get(),
          countProduct(k, src, job.getConfiguration()));
    }
  } finally {
    // Leak fix: previously the reader stayed open when an assertion failed.
    r.close();
  }
}
@Override protected void setup(Context context) throws IOException, InterruptedException { // TODO Auto-generated method stub Configuration configuration=context.getConfiguration(); fileSystem=FileSystem.get(configuration); numCols=configuration.getInt("numCols", 0); path=configuration.get("outpath"); for(int i=1;i<numCols;i++){ SequenceFile.Reader reader=new SequenceFile.Reader(fileSystem, new Path(path+"/"+i+"/part-m-00000"), configuration); IndexPair indexPair=new IndexPair(); DoubleWritable doubleWritable=new DoubleWritable(); while(reader.next(indexPair, doubleWritable)){ context.write(new LongWritable(indexPair.getColIndex()), doubleWritable); } reader.close(); } } /* (non-Javadoc)
/**
 * Verifies that the job produced exactly one non-empty part file and that
 * each (key, count) record matches countProduct over the source paths.
 *
 * @param job the completed job whose output is checked
 * @param src source paths used to recompute the expected counts
 */
private static void checkOuterConsistency(Job job, Path[] src) throws IOException {
  Path outf = FileOutputFormat.getOutputPath(job);
  FileStatus[] outlist = cluster.getFileSystem().listStatus(outf,
      new Utils.OutputFileUtils.OutputFilesFilter());
  assertEquals("number of part files is more than 1. It is" + outlist.length,
      1, outlist.length);
  assertTrue("output file with zero length" + outlist[0].getLen(),
      0 < outlist[0].getLen());
  SequenceFile.Reader r = new SequenceFile.Reader(cluster.getFileSystem(),
      outlist[0].getPath(), job.getConfiguration());
  try {
    IntWritable k = new IntWritable();
    IntWritable v = new IntWritable();
    while (r.next(k, v)) {
      assertEquals("counts does not match", v.get(),
          countProduct(k, src, job.getConfiguration()));
    }
  } finally {
    // Leak fix: previously the reader stayed open when an assertion failed.
    r.close();
  }
}
/**
 * Reads the cluster centers.
 *
 * @param conf configuration used to open the reader
 * @param out unused; kept for interface compatibility with callers
 * @param centerPath SequenceFile holding (VectorWritable, NullWritable) records
 * @param fs filesystem containing centerPath
 * @return an index on the key dimension, and a cluster center on the value.
 */
public static HashMap<Integer, DoubleVector> readClusterCenters(
    Configuration conf, Path out, Path centerPath, FileSystem fs)
    throws IOException {
  HashMap<Integer, DoubleVector> centerMap = new HashMap<Integer, DoubleVector>();
  SequenceFile.Reader centerReader = new SequenceFile.Reader(fs, centerPath, conf);
  try {
    int index = 0;
    VectorWritable center = new VectorWritable();
    // Each record's key is the center vector; the value is NullWritable.
    while (centerReader.next(center, NullWritable.get())) {
      centerMap.put(index++, center.getVector());
    }
  } finally {
    // Leak fix: previously the reader stayed open if reading threw.
    centerReader.close();
  }
  return centerMap;
}
/**
 * On the first call only, asserts that the reduce-side input file exists and
 * that its compression matches the expected 'compressInput' flag.
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  if (first) {
    first = false;
    MapOutputFile mapOutputFile = new MROutputFiles();
    mapOutputFile.setConf(conf);
    Path input = mapOutputFile.getInputFile(0);
    FileSystem fs = FileSystem.get(conf);
    assertTrue("reduce input exists " + input, fs.exists(input));
    SequenceFile.Reader rdr = new SequenceFile.Reader(fs, input, conf);
    try {
      assertEquals("is reduce input compressed " + input, compressInput,
          rdr.isCompressed());
    } finally {
      // Leak fix: previously the reader stayed open when the assertion failed.
      rdr.close();
    }
  }
}
/**
 * Returns the (key, value) pairs stored in the given SequenceFile output,
 * caching the result in 'outputs' so each file is only read once.
 *
 * @param output path of the SequenceFile to read
 * @return all records of the file as key/value pairs
 * @throws IOException if the file cannot be read
 */
public List<Pair<Object, Object>> ensureOutput(String output) throws IOException {
  List<Pair<Object, Object>> outs = outputs.get(output);
  if (outs == null) {
    outs = new ArrayList<Pair<Object, Object>>();
    SequenceFile.Reader reader = new SequenceFile.Reader(fS, new Path(output), getConf());
    try {
      Object keyToRead = ReflectionUtils.newInstance(reader.getKeyClass(), getConf());
      Object valueToRead = ReflectionUtils.newInstance(reader.getValueClass(), getConf());
      Object readKey;
      // Bug fix: the serialization-framework next(Object) returns the
      // deserialized key, which may be a fresh object rather than the one
      // passed in; store the returned reference instead of the buffer.
      while ((readKey = reader.next(keyToRead)) != null) {
        valueToRead = reader.getCurrentValue(valueToRead);
        outs.add(new Pair<Object, Object>(readKey, valueToRead));
        // Fresh instances per record so stored pairs are not overwritten.
        keyToRead = ReflectionUtils.newInstance(reader.getKeyClass(), getConf());
        valueToRead = ReflectionUtils.newInstance(reader.getValueClass(), getConf());
      }
    } finally {
      // Leak fix: close the reader even if deserialization fails mid-stream.
      reader.close();
    }
    outputs.put(output, outs);
  }
  return outs;
}
/**
 * Verifies that a splits file written for a single split contains exactly
 * that (endRow, location) record and nothing else.
 */
@Test
public void testWriteSplitsFileExistingPathMultipleKeyExtents() throws Exception {
  Map<Text, String> splits = new HashMap<>();
  splits.put(new Text("zEndRow"), "location2_1234");
  Configuration conf = new Configuration();
  Path file = createSplitsFile(splits, conf, 1);
  // Leak fix: try-with-resources closes the reader even when an assertion
  // fails (the original only closed it after all asserts passed).
  try (SequenceFile.Reader reader =
      new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
    Assert.assertEquals(Text.class, reader.getKeyClass());
    Assert.assertEquals(Text.class, reader.getValueClass());
    Text key = new Text();
    Text val = new Text();
    boolean valid = reader.next(key, val);
    Assert.assertTrue(valid);
    Assert.assertEquals("zEndRow", key.toString());
    Assert.assertEquals("location2_1234", val.toString());
    valid = reader.next(key, val);
    Assert.assertFalse(valid);
  }
}
/**
 * Verifies that a splits file written with an empty-Text key round-trips the
 * single (empty, "hello, world!") record and nothing else.
 */
@Test
public void testWriteSplitsFileExistingPath() throws Exception {
  Map<Text, String> splits = new HashMap<>();
  Configuration conf = new Configuration();
  splits.put(new Text(), "hello, world!");
  Path file = createSplitsFile(splits, conf, 1);
  // Leak fix: try-with-resources closes the reader even when an assertion
  // fails (the original only closed it after all asserts passed).
  try (SequenceFile.Reader reader =
      new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
    Assert.assertEquals(Text.class, reader.getKeyClass());
    Assert.assertEquals(Text.class, reader.getValueClass());
    Text key = new Text();
    Text val = new Text();
    boolean valid = reader.next(key, val);
    Assert.assertTrue(valid);
    Assert.assertEquals("", key.toString());
    Assert.assertEquals("hello, world!", val.toString());
    valid = reader.next(key, val);
    Assert.assertFalse(valid);
  }
}
/**
 * Reads the cluster centers.
 *
 * @param conf configuration used to open the reader
 * @param out unused; kept for interface compatibility with callers
 * @param centerPath SequenceFile holding (VectorWritable, NullWritable) records
 * @param fs filesystem containing centerPath
 * @return an index on the key dimension, and a cluster center on the value.
 */
public static HashMap<Integer, DoubleVector> readClusterCenters(
    Configuration conf, Path out, Path centerPath, FileSystem fs)
    throws IOException {
  HashMap<Integer, DoubleVector> centerMap = new HashMap<Integer, DoubleVector>();
  SequenceFile.Reader centerReader = new SequenceFile.Reader(fs, centerPath, conf);
  try {
    int index = 0;
    VectorWritable center = new VectorWritable();
    // Each record's key is the center vector; the value is NullWritable.
    while (centerReader.next(center, NullWritable.get())) {
      centerMap.put(index++, center.getVector());
    }
  } finally {
    // Leak fix: previously the reader stayed open if reading threw.
    centerReader.close();
  }
  return centerMap;
}
/**
 * On the first call only, asserts that the reduce-side input file exists and
 * that its compression matches the expected 'compressInput' flag.
 */
public void reduce(WritableComparable key, Iterator values,
    OutputCollector output, Reporter reporter) throws IOException {
  if (first) {
    first = false;
    MapOutputFile mapOutputFile = new MapOutputFile();
    mapOutputFile.setConf(conf);
    Path input = mapOutputFile.getInputFile(0);
    FileSystem fs = FileSystem.get(conf);
    assertTrue("reduce input exists " + input, fs.exists(input));
    SequenceFile.Reader rdr = new SequenceFile.Reader(fs, input, conf);
    try {
      assertEquals("is reduce input compressed " + input, compressInput,
          rdr.isCompressed());
    } finally {
      // Leak fix: previously the reader stayed open when the assertion failed.
      rdr.close();
    }
  }
}
/**
 * Round-trips Java-serialized Long keys and String values through a
 * SequenceFile and verifies they read back in order.
 */
public void testJavaSerialization() throws Exception {
  Path file = new Path(System.getProperty("test.build.data", ".")
      + "/testseqser.seq");
  fs.delete(file, true);
  Writer writer = SequenceFile.createWriter(fs, conf, file, Long.class,
      String.class);
  try {
    writer.append(1L, "one");
    writer.append(2L, "two");
  } finally {
    // Leak fix: close the writer even if an append fails.
    writer.close();
  }
  Reader reader = new Reader(fs, file, conf);
  try {
    assertEquals(1L, reader.next((Object) null));
    assertEquals("one", reader.getCurrentValue((Object) null));
    assertEquals(2L, reader.next((Object) null));
    assertEquals("two", reader.getCurrentValue((Object) null));
    assertNull(reader.next((Object) null));
  } finally {
    // Leak fix: previously the reader stayed open when an assertion failed.
    reader.close();
  }
}
}