/**
 * Produces the next record by asking {@code positionForRecord} where the
 * stream returned by {@code getIn()} currently stands and fetching the
 * record at that position via {@code get(long)}.
 *
 * @return The record obtained from {@code get} for the computed position.
 * @throws IOException If computing the position or fetching the record fails.
 */
protected ArchiveRecord innerNext() throws IOException {
    final long recordStart = positionForRecord(getIn());
    return get(recordStart);
}
/**
 * Closes this object by delegating to {@code reader.close()}.
 * The method is {@code synchronized}, so it holds this instance's monitor
 * for the duration of the call.
 *
 * NOTE(review): the final closing brace on the code line below ends an
 * enclosing scope whose opening is outside this excerpt.
 *
 * @throws IOException If {@code reader.close()} fails.
 */
@Override public synchronized void close() throws IOException { reader.close(); } }
/**
 * Reports the version of this ARC file.
 *
 * <p>The version is usually read from the first record of the ARC. When the
 * first record has not been read -- e.g. on random access into the middle of
 * an ARC -- no version will have been set, and a default of 1.1 is reported
 * instead. Should more than one version of ARC ever need supporting, the
 * meta line could be consulted to work out which version this is.
 *
 * @return Version of this ARC file, or "1.1" when none has been set.
 */
public String getVersion() {
    // Nothing recorded by the superclass: fall back to the default.
    if (super.getVersion() == null) {
        return "1.1";
    }
    return super.getVersion();
}
/**
 * Creates an Archive Record against the current offset of the underlying
 * stream, as reported by {@code positionForRecord}.
 *
 * @return Archive Record created against current offset.
 * @throws IOException If the position cannot be determined or the record
 * cannot be created.
 */
public ArchiveRecord get() throws IOException {
    final long currentOffset = positionForRecord(in);
    return createArchiveRecord(this.in, currentOffset);
}
/**
 * Get record at passed <code>offset</code>.
 *
 * <p>Any record currently open is cleaned up first. Only forward seeking is
 * supported: the stream is advanced from its current record position to
 * <code>offset</code>, and a request for an offset behind the current
 * position is rejected.
 *
 * @param offset Byte index into file at which a record starts.
 * @return An Archive Record reference.
 * @throws IOException If cleanup or skipping fails, or if the stream ends
 * before <code>offset</code> is reached.
 * @throws UnsupportedOperationException If <code>offset</code> lies before
 * the current position.
 */
public ArchiveRecord get(long offset) throws IOException {
    cleanupCurrentRecord();
    long posn = positionForRecord(in);
    if (offset < posn) {
        throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
    }
    // skip() is allowed to skip fewer bytes than asked for (even zero) without
    // being at end of stream, so keep going until the whole distance has been
    // covered; otherwise the record would be parsed from the wrong position.
    long remaining = offset - posn;
    while (remaining > 0) {
        long skipped = in.skip(remaining);
        if (skipped > 0) {
            remaining -= skipped;
        } else if (in.read() >= 0) {
            // skip() made no progress; a single-byte read both advances the
            // stream and tells a stalled skip apart from end of stream.
            remaining--;
        } else {
            throw new java.io.EOFException("Reached end of stream " + remaining
                + " bytes short of requested offset " + offset);
        }
    }
    return createArchiveRecord(this.in, offset);
}
BufferedWriter cdxWriter = null; if (toFile) { String cdxFilename = stripExtension(getReaderIdentifier(), DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + " n g"; if (toFile) { String strippedFileName = getStrippedFileName(); try { for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) { ArchiveRecord r = ii.next(); if (toFile) {
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory.get( paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
/** * Output passed record using passed format specifier. * @param format What format to use outputting. * @throws IOException * @return True if handled. */ public boolean outputRecord(final String format) throws IOException { boolean result = true; if (format.equals(CDX)) { System.out.println(get().outputCdx(getStrippedFileName())); } else if(format.equals(ArchiveFileConstants.DUMP)) { // No point digesting if dumping content. setDigest(false); get().dump(); } else { result = false; } return result; }
metadata.set("version",ar.getVersion()); Iterator<ArchiveRecord> it = ar.iterator();
/**
 * Returns the record at the given offset as a {@code WARCRecord}.
 * Delegates to {@code super.get(offset)} and casts the result.
 *
 * NOTE(review): the trailing {@code @Override} on the code line below
 * annotates the next definition, which is outside this excerpt.
 *
 * @param offset Offset passed through unchanged to {@code super.get}.
 * @return The record returned by {@code super.get(offset)}, cast to
 * {@code WARCRecord}.
 * @throws IOException If {@code super.get(offset)} fails.
 */
@Override public WARCRecord get(long offset) throws IOException { return (WARCRecord)super.get(offset); } @Override
/**
 * Opens the archive file behind the given split and prepares a record
 * iterator over it.
 *
 * <p>The split's start and end offsets are recorded in {@code start},
 * {@code end} and {@code pos}; this method itself does not seek the stream
 * to {@code start}. The archive format is detected from the concrete reader
 * type; if the reader is neither an {@code ARCReader} nor a
 * {@code WARCReader}, {@code format} and {@code iter} are left untouched.
 *
 * @param genericSplit The split to read; must be a {@code FileSplit}.
 * @param context Task context supplying the job configuration.
 * @throws IOException If the file cannot be opened or no archive reader can
 * be created for it.
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);

    // If the reader cannot be created, nothing else holds the stream, so it
    // has to be closed here or the file handle leaks.
    boolean readerCreated = false;
    try {
        reader = ArchiveReaderFactory.get(file.toString(),
                new BufferedInputStream(fileIn), true);
        readerCreated = true;
    } finally {
        if (!readerCreated) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // The failure that brought us here is the one worth reporting.
            }
        }
    }

    if (reader instanceof ARCReader) {
        format = ArchiveFormat.ARC;
        iter = reader.iterator();
    }
    if (reader instanceof WARCReader) {
        format = ArchiveFormat.WARC;
        iter = reader.iterator();
    }

    this.pos = start;
}
throws IOException { try { String version = super.getVersion(); ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset, isDigest(), isStrict(), isParseHttpHeaders(), isAlignedOnFirstRecord(), version); if (version != null && super.getVersion() == null) super.setVersion(version); currentRecord(record); } catch (IOException e) {
/** * @return True if we have more records to read. * @exception RuntimeException Can throw an IOException wrapped in a * RuntimeException if a problem reading underlying stream (Corrupted * gzip, etc.). */ public boolean hasNext() { // Call close on any extant record. This will scoot us past // any content not yet read. try { cleanupCurrentRecord(); } catch (IOException e) { if (isStrict()) { throw new RuntimeException(e); } if (e instanceof EOFException) { logger.warning("Premature EOF cleaning up " + currentRecord.getHeader().toString() + ": " + e.getMessage()); return false; } // If not strict, try going again. We might be able to skip // over the bad record. logger.log(Level.WARNING,"Trying skip of failed record cleanup of " + currentRecord.getHeader().toString() + ": " + e.getMessage(), e); } return innerHasNext(); }
arcreader = ArchiveReaderFactory.get(path.getName(), datainputstream, true); arcreader.setStrict(false); if (path.getName().matches("^.+\\.warc(\\.gz)?$")) { archiveIterator = warcIndexer
/** * Returns an ArchiveRecord iterator. * Of note, on IOException, especially if ZipException reading compressed * ARCs, rather than fail the iteration, try moving to the next record. * If {@link ArchiveReader#strict} is not set, this will usually succeed. * @return An iterator over ARC records. */ public Iterator<ArchiveRecord> iterator() { // Eat up any record outstanding. try { cleanupCurrentRecord(); } catch (IOException e) { throw new RuntimeException(e); } return new ArchiveRecordIterator(); }
BufferedWriter cdxWriter = null; if (toFile) { String cdxFilename = stripExtension(getReaderIdentifier(), DOT_COMPRESSED_FILE_EXTENSION); cdxFilename = stripExtension(cdxFilename, getDotFileExtension()); cdxFilename += ('.' + CDX); cdxWriter = new BufferedWriter(new FileWriter(cdxFilename)); String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v") + " n g"; if (toFile) { String strippedFileName = getStrippedFileName(); try { for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) { ArchiveRecord r = ii.next(); if (toFile) {
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory .get(paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
/** * Output passed record using passed format specifier. * @param format What format to use outputting. * @throws IOException * @return True if handled. */ public boolean outputRecord(final String format) throws IOException { boolean result = true; if (format.equals(CDX)) { System.out.println(get().outputCdx(getStrippedFileName())); } else if(format.equals(ArchiveFileConstants.DUMP)) { // No point digesting if dumping content. setDigest(false); get().dump(); } else { result = false; } return result; }
metadata.set("version",ar.getVersion()); Iterator<ArchiveRecord> it = ar.iterator();