org.archive.io.ArchiveReader java code examples

/**
 * Produces the next record by asking for the record that sits at the
 * current record position of the stream returned by {@code getIn()}.
 *
 * @return the record obtained from {@code get(long)} for that position
 * @throws IOException propagated from locating the position or fetching
 *     the record
 */
protected ArchiveRecord innerNext() throws IOException {
  final long recordOffset = positionForRecord(getIn());
  return get(recordOffset);
}

 /**
  * Delegates to {@code reader.close()}.
  *
  * <p>The method is declared {@code synchronized}, so concurrent calls on
  * the same instance are serialized on this object's monitor.
  *
  * @throws IOException if {@code reader.close()} throws
  */
 @Override
 public synchronized void close() throws IOException {
  reader.close();
 }
}

/**
 * Reports the version of this ARC file.
 *
 * <p>The version is normally picked up from the first record of the ARC.
 * When the first record has not been read (for example on random access
 * into the middle of an ARC) the superclass has no version recorded, and
 * the default {@code "1.1"} is reported instead.  Should more ARC versions
 * appear, the meta line could be consulted to work out the real version.
 *
 * @return the version known to the superclass, or {@code "1.1"} when the
 *     superclass reports {@code null}
 */
public String getVersion() {
  if (super.getVersion() == null) {
    return "1.1";
  }
  return super.getVersion();
}

/**
 * Builds an Archive Record against the current offset of {@code in}.
 *
 * @return the record created for the current record position
 * @throws IOException propagated from locating the position or creating
 *     the record
 */
public ArchiveRecord get() throws IOException {
  final long currentOffset = positionForRecord(in);
  return createArchiveRecord(this.in, currentOffset);
}

/**
 * Get record at passed <code>offset</code>.
 *
 * <p>Any current record is cleaned up first.  Only forward seeking is
 * supported: the stream is advanced from its current record position to
 * <code>offset</code>, and a request for an earlier offset is rejected.
 *
 * @param offset Byte index into file at which a record starts.
 * @return An Archive Record reference.
 * @throws IOException if cleanup, skipping or record creation fails; an
 *     {@link java.io.EOFException} is raised when the stream ends before
 *     <code>offset</code> is reached.
 * @throws UnsupportedOperationException if <code>offset</code> lies before
 *     the current position.
 */
public ArchiveRecord get(long offset) throws IOException {
  cleanupCurrentRecord();
  long posn = positionForRecord(in);
  if (offset < posn) {
    throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
  }
  // NOTE(review): assumes `in` is a java.io.InputStream -- confirm.
  // InputStream.skip() is allowed to skip fewer bytes than requested (even
  // zero) without being at end of stream, so one call is not enough to
  // guarantee we are positioned at `offset`.
  long remaining = offset - posn;
  while (remaining > 0) {
    long skipped = in.skip(remaining);
    if (skipped > 0) {
      remaining -= skipped;
    } else if (in.read() >= 0) {
      // skip() made no progress; a single-byte read both advances the
      // stream and distinguishes "no progress" from end of stream.
      remaining--;
    } else {
      throw new java.io.EOFException("Premature EOF seeking to " + offset
          + ": " + remaining + " bytes short");
    }
  }
  return createArchiveRecord(this.in, offset);
}

BufferedWriter cdxWriter = null;
if (toFile) {
  String cdxFilename = stripExtension(getReaderIdentifier(),
    DOT_COMPRESSED_FILE_EXTENSION);
  cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
  cdxFilename += ('.' + CDX);
  cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
  + " n g";
if (toFile) {
String strippedFileName = getStrippedFileName();
try {
  for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
    ArchiveRecord r = ii.next();
    if (toFile) {

/**
 * Moves on to the next entry of {@code paths} and, if one exists, opens it
 * and prepares an archive reader and record iterator for it.
 *
 * @return {@code false} when the incremented index is past the end of
 *     {@code paths}; {@code true} once the next file has been opened
 * @throws IOException propagated from the filesystem or reader factory
 */
private boolean nextFile() throws IOException {
  currentPath++;
  if (currentPath < paths.length) {
    // Log the archive being opened; handy when debugging.
    log.info("Opening nextFile: " + paths[currentPath]);

    // Open the file and wrap it in an ArchiveReader.
    this.status = this.filesystem.getFileStatus(paths[currentPath]);
    datainputstream = this.filesystem.open(paths[currentPath]);
    final String fileName = paths[currentPath].getName();
    arcreader = (ArchiveReader) ArchiveReaderFactory.get(
        fileName, datainputstream, true);

    // Strict reading is switched on to cope with malformed archive files,
    // which otherwise cause an infinite loop.
    arcreader.setStrict(true);

    iterator = arcreader.iterator();
    this.archiveName = paths[currentPath].getName();
    return true;
  }
  return false;
}

/**
 * Writes the current record in the requested format.
 *
 * <p>For {@code CDX} the record's CDX line is printed to standard out.
 * For {@code ArchiveFileConstants.DUMP} digesting is turned off and the
 * record is dumped.  Any other format is left unhandled.
 *
 * @param format What format to use outputting.
 * @throws IOException propagated from fetching or emitting the record
 * @return True if handled.
 */
public boolean outputRecord(final String format)
throws IOException {
  if (format.equals(CDX)) {
    System.out.println(get().outputCdx(getStrippedFileName()));
    return true;
  }
  if (format.equals(ArchiveFileConstants.DUMP)) {
    // Digesting is pointless when the content is only being dumped.
    setDigest(false);
    get().dump();
    return true;
  }
  return false;
}

// Store the version reported by `ar` in the metadata under the "version" key.
metadata.set("version",ar.getVersion());
// Obtain an iterator over the ArchiveRecords that `ar` exposes.
Iterator<ArchiveRecord> it = ar.iterator();

/**
 * Fetches the record at {@code offset} through the superclass and hands
 * it back typed as a {@code WARCRecord}.
 *
 * @param offset offset passed straight through to {@code super.get(long)}
 * @return the superclass result cast to {@code WARCRecord}
 * @throws IOException propagated from {@code super.get(long)}
 */
@Override
public WARCRecord get(long offset) throws IOException {
  final WARCRecord record = (WARCRecord) super.get(offset);
  return record;
}
@Override

/**
 * Prepares this reader for the given split: records the split's start and
 * end offsets, opens the split's file, and builds an archive reader plus
 * record iterator over it.
 *
 * <p>{@code format} and {@code iter} are only assigned when the reader
 * produced by the factory is an {@code ARCReader} or a {@code WARCReader}.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context supplies the job configuration used to resolve the
 *     file system
 * @throws IOException propagated from opening the file or creating the
 *     reader
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
 final FileSplit fileSplit = (FileSplit) genericSplit;
 final Configuration conf = context.getConfiguration();

 // Byte range covered by this split.
 start = fileSplit.getStart();
 end = start + fileSplit.getLength();

 // Open the underlying file through its own file system.
 final Path archivePath = fileSplit.getPath();
 final FileSystem fileSystem = archivePath.getFileSystem(conf);
 final FSDataInputStream rawIn = fileSystem.open(archivePath);
 reader = ArchiveReaderFactory.get(
   archivePath.toString(), new BufferedInputStream(rawIn), true);

 // Remember which archive flavour we got and grab its iterator.
 if (reader instanceof ARCReader) {
  format = ArchiveFormat.ARC;
  iter = reader.iterator();
 }
 if (reader instanceof WARCReader) {
  format = ArchiveFormat.WARC;
  iter = reader.iterator();
 }

 this.pos = start;
}

  setDigest(false);
  dump(false);
} else if (format.equals(GZIP_DUMP)) {
  setDigest(false);
  dump(true);
} else if (format.equals(CDX)) {
  cdxOutput(false);   
} else if (format.equals(CDX_FILE)) {
  cdxOutput(true);
} else {
  result = false;

throws IOException {
  try {
    String version = super.getVersion();
    ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset,
        isDigest(), isStrict(), isParseHttpHeaders(),
        isAlignedOnFirstRecord(), version);
    if (version != null && super.getVersion() == null)
      super.setVersion(version);
    currentRecord(record);
  } catch (IOException e) {

/**
 * Tells whether another record can be read.
 *
 * <p>Any record still open is cleaned up first, which moves past content
 * that has not been read yet.  If that cleanup fails: in strict mode the
 * failure is rethrown wrapped in a {@code RuntimeException}; a premature
 * EOF is logged and ends iteration; any other failure is logged and the
 * check carries on, in the hope the bad record can be skipped.
 *
 * @return True if we have more records to read.
 * @exception RuntimeException wraps the IOException raised while cleaning
 *     up the current record when strict mode is on (corrupted gzip, etc.).
 */
public boolean hasNext() {
  try {
    cleanupCurrentRecord();
  } catch (IOException e) {
    if (isStrict()) {
      throw new RuntimeException(e);
    }
    // Both remaining paths report the header of the record being cleaned.
    final String headerText = currentRecord.getHeader().toString();
    if (e instanceof EOFException) {
      logger.warning("Premature EOF cleaning up " + headerText + ": "
        + e.getMessage());
      return false;
    }
    // Not strict: log and fall through so the next record can be tried.
    logger.log(Level.WARNING,
      "Trying skip of failed record cleanup of " + headerText + ": "
        + e.getMessage(),
      e);
  }
  return innerHasNext();
}

arcreader = ArchiveReaderFactory.get(path.getName(),
    datainputstream, true);
arcreader.setStrict(false);
if (path.getName().matches("^.+\\.warc(\\.gz)?$")) {
  archiveIterator = warcIndexer

/**
 * Hands out an iterator over the archive's records.
 *
 * <p>Any outstanding record is consumed before the iterator is created.
 * Of note, on IOException during iteration (especially a ZipException
 * while reading compressed ARCs) the iterator tries to move to the next
 * record rather than fail.  If {@link ArchiveReader#strict} is not set,
 * this will usually succeed.
 *
 * @return An iterator over ARC records.
 * @throws RuntimeException wrapping the IOException raised while cleaning
 *     up the outstanding record
 */
public Iterator<ArchiveRecord> iterator() {
  try {
    // Drain whatever record is still open before starting afresh.
    cleanupCurrentRecord();
    return new ArchiveRecordIterator();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}

BufferedWriter cdxWriter = null;
if (toFile) {
  String cdxFilename = stripExtension(getReaderIdentifier(),
    DOT_COMPRESSED_FILE_EXTENSION);
  cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
  cdxFilename += ('.' + CDX);
  cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
  + " n g";
if (toFile) {
String strippedFileName = getStrippedFileName();
try {
  for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
    ArchiveRecord r = ii.next();
    if (toFile) {

/**
 * Moves on to the next entry of {@code paths} and, if one exists, opens it
 * and prepares an archive reader and record iterator for it.
 *
 * @return {@code false} when the incremented index is past the end of
 *     {@code paths}; {@code true} once the next file has been opened
 * @throws IOException propagated from the filesystem or reader factory
 */
private boolean nextFile() throws IOException {
  currentPath++;
  if (currentPath < paths.length) {
    // Log the archive being opened; handy when debugging.
    log.info("Opening nextFile: " + paths[currentPath]);

    // Open the file and wrap it in an ArchiveReader.
    this.status = this.filesystem.getFileStatus(paths[currentPath]);
    datainputstream = this.filesystem.open(paths[currentPath]);
    final String fileName = paths[currentPath].getName();
    arcreader = (ArchiveReader) ArchiveReaderFactory
        .get(fileName, datainputstream, true);

    // Strict reading is switched on to cope with malformed archive files,
    // which otherwise cause an infinite loop.
    arcreader.setStrict(true);

    iterator = arcreader.iterator();
    this.archiveName = paths[currentPath].getName();
    return true;
  }
  return false;
}

/**
 * Writes the current record in the requested format.
 *
 * <p>For {@code CDX} the record's CDX line is printed to standard out.
 * For {@code ArchiveFileConstants.DUMP} digesting is turned off and the
 * record is dumped.  Any other format is left unhandled.
 *
 * @param format What format to use outputting.
 * @throws IOException propagated from fetching or emitting the record
 * @return True if handled.
 */
public boolean outputRecord(final String format)
throws IOException {
  if (format.equals(CDX)) {
    System.out.println(get().outputCdx(getStrippedFileName()));
    return true;
  }
  if (format.equals(ArchiveFileConstants.DUMP)) {
    // Digesting is pointless when the content is only being dumped.
    setDigest(false);
    get().dump();
    return true;
  }
  return false;
}

// Store the version reported by `ar` in the metadata under the "version" key.
metadata.set("version",ar.getVersion());
// Obtain an iterator over the ArchiveRecords that `ar` exposes.
Iterator<ArchiveRecord> it = ar.iterator();

Javadoc

Reader for an Archive file of Archive ArchiveRecords.

Most used methods

iterator
Returns an ArchiveRecord iterator. Of note, on IOException, especially if ZipException reading compr
get
Get record at passed offset.
close
getVersion
setStrict
cdxOutput
cleanupCurrentRecord
Cleanout the current record if there is one.
createArchiveRecord
Return an Archive Record homed on offset into is.
dump
Dump this file on STDOUT
getDeleteFileOnCloseReader
getDotFileExtension
getFileName

Popular in Java

Running tasks concurrently on multiple threads
setContentView (Activity)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
getResourceAsStream (ClassLoader)
Dictionary (java.util)
Note: Do not use this class since it is obsolete. Please use the Map interface for new implementatio
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
CountDownLatch (java.util.concurrent)
A synchronization aid that allows one or more threads to wait until a set of operations being perfor
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
JCheckBox (javax.swing)
Join (org.hibernate.mapping)
From CI to AI: The AI layer in your organization

How to use ArchiveReader in org.archive.io

Best Java code snippets using org.archive.io.ArchiveReader (Showing top 20 results out of 315)

How to use
ArchiveReader
in
org.archive.io