/**
 * Writes the recorded input of a DNS fetch as one WARC {@code response}
 * record.
 *
 * <p>The record is populated from the URI itself (url, content type), the
 * supplied timestamp and record id, and the size of the recorded input.
 * When the CrawlURI's data map carries a non-empty value under
 * {@code A_DNS_SERVER_IP_LABEL}, that value is attached as the
 * {@code HEADER_KEY_IP} extra header.
 *
 * @param curi the CrawlURI whose recorded input is written
 * @param w the writer that receives the record
 * @param baseid the record id assigned to the record
 * @param timestamp 14-digit creation date passed to the record info
 * @throws IOException if writing the record fails
 */
protected void writeDnsRecords(final CrawlURI curi, WARCWriter w,
        final URI baseid, final String timestamp) throws IOException {
    final WARCRecordInfo info = new WARCRecordInfo();

    // Fixed record metadata.
    info.setType(WARCRecordType.response);
    info.setUrl(curi.toString());
    info.setCreate14DigitDate(timestamp);
    info.setMimetype(curi.getContentType());
    info.setRecordId(baseid);
    info.setContentLength(curi.getRecorder().getRecordedInput().getSize());
    info.setEnforceLength(true);

    // Optional header: only added when a non-empty server IP was stored.
    final String dnsServerIp = (String) curi.getData().get(A_DNS_SERVER_IP_LABEL);
    if (dnsServerIp != null && !dnsServerIp.isEmpty()) {
        info.addExtraHeader(HEADER_KEY_IP, dnsServerIp);
    }

    final ReplayInputStream replay =
            curi.getRecorder().getRecordedInput().getReplayInputStream();
    info.setContentStream(replay);
    try {
        w.writeRecord(info);
    } finally {
        // Always release the replay stream, even when the write failed.
        IOUtils.closeQuietly(replay);
    }

    // NOTE(review): the result is discarded; the call is kept as in the
    // original. Presumably a leftover from a variant that returned the id.
    info.getRecordId();
}
if (digestContent) { algorithm = getDigestAlgorithm(); rec.getRecordedInput().setDigest(algorithm); } else { rec.getRecordedInput().setDigest((MessageDigest)null); rec.getRecordedInput().setLimits(getMaxLengthBytes(), 1000l * (long) getTimeoutSeconds(), (long) getMaxFetchKBSec()); rec.getRecordedInput().readToEndOfContent(contentLength); rec.getRecordedInput().getDigestValue()); + " " + curi.getUURI().toString() + " " + response.getStatusLine().getStatusCode() + " " + rec.getRecordedInput().getSize() + " " + curi.getContentType()); if (rec.getRecordedInput().isOpen()) { logger.severe(curi.toString() + " RIS still open. Should have" + " been closed by method release: " + Thread.currentThread().getName()); try { rec.getRecordedInput().close(); } catch (IOException e) { logger.log(Level.SEVERE, "second-chance RIS close failed", e);
/**
 * Copies a resource into a file.
 *
 * <p>The message body replay of {@code recis} is first written to a
 * temporary file whose path is the destination path with {@code "N"}
 * appended; the temporary file is then renamed to the destination. If the
 * copy or the rename fails, the temporary file this call created is deleted
 * (best effort) so that no partial file is left behind.
 *
 * @param recis the RecordingInputStream that recorded the contents of the
 *        resource
 * @param dest the destination file
 * @throws IOException on I/O error, or if the file rename fails
 */
private void writeToPath(RecordingInputStream recis, File dest) throws IOException {
    File tf = new File(dest.getPath() + "N");
    ReplayInputStream replayis = null;
    FileOutputStream fos = null;
    boolean completed = false;
    try {
        try {
            replayis = recis.getMessageBodyReplayInputStream();
            fos = new FileOutputStream(tf);
            replayis.readFullyTo(fos);
        } finally {
            IOUtils.closeQuietly(replayis);
            IOUtils.closeQuietly(fos);
        }
        if (!tf.renameTo(dest)) {
            throw new IOException("Can not rename " + tf.getAbsolutePath()
                    + " to " + dest.getAbsolutePath());
        }
        completed = true;
    } finally {
        // Only remove the temporary file when this call actually opened it;
        // the delete result is ignored because the original failure (if any)
        // is the one that must reach the caller.
        if (!completed && fos != null) {
            tf.delete();
        }
    }
}
protected URI writeRevisit(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord headers) throws IOException { long revisedLength = 0; // By default, truncate all data if (curi.getRevisitProfile().getProfileName().equals(PROFILE_REVISIT_IDENTICAL_DIGEST) ) { // Save response from identical digest matches revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getRecorder().getRecordedInput().getSize(); } return writeRevisit(w, timestamp, mimetype, baseid, curi, headers, revisedLength); }
/**
 * Decides whether this URI should be extracted: its recorded input must not
 * exceed {@code getMaxSizeToParse()} and its content type must start with
 * {@code application/pdf}.
 *
 * @param uri the CrawlURI under consideration
 * @return {@code true} only when both the size and content-type checks pass
 */
@Override
protected boolean shouldExtract(CrawlURI uri) {
    final long sizeLimit = getMaxSizeToParse();
    final boolean withinLimit =
            uri.getRecorder().getRecordedInput().getSize() <= sizeLimit;
    if (!withinLimit) {
        return false;
    }

    final String contentType = uri.getContentType();
    if (contentType == null) {
        return false;
    }
    return contentType.startsWith("application/pdf");
}
if (shouldWrite(curi)) { ris = curi.getRecorder().getRecordedInput() .getReplayInputStream(); return write(curi, recordLength, ris, getHostAddress(curi)); } else {
/**
 * Calculate a recommended size for an in-memory decoded-character buffer
 * of this content.
 *
 * <p>The result is the smaller of two values: half of the stream's
 * recorded buffer length (so that the 2-byte chars take no more memory
 * than the raw byte buffer already does), and the number of bytes actually
 * recorded (so that the buffer is no larger than likely necessary).
 *
 * @param inStream the recording stream whose content will be decoded
 * @return int length for in-memory decoded-character buffer
 */
static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
    final long halfOfRawBuffer = inStream.getRecordedBufferLength() / 2;
    final long recordedBytes = inStream.getSize();
    return (int) Math.min(halfOfRawBuffer, recordedBytes);
}
public void open(InputStream wrappedStream) throws IOException { logger.fine(Thread.currentThread().getName() + " opening " + wrappedStream + ", " + Thread.currentThread().getName()); if(isOpen()) { // error; should not be opening/wrapping in an unclosed // stream remains open throw new IOException("RIS already open for " +Thread.currentThread().getName()); } try { this.in = wrappedStream; this.recordingOutputStream.open(); } catch (IOException ioe) { close(); // ...and rethrow... throw ioe; } }
if (digestContent) { algorithm = getDigestAlgorithm(); recorder.getRecordedInput().setDigest(algorithm); recorder.getRecordedInput().startDigest(); } else { recorder.getRecordedInput().setDigest((MessageDigest)null); recorder.close(); curi.setContentSize(recorder.getRecordedInput().getSize()); logger.fine("read " + recorder.getRecordedInput().getSize() + " bytes from ftp data socket"); recorder.getRecordedInput().getDigestValue());
/**
 * Reads from the wrapped stream into {@code b} and copies every byte that
 * was read to the recording output stream.
 *
 * @param b destination buffer
 * @return the count returned by the wrapped stream's {@code read}
 * @throws IOException if this stream is not open, or if the underlying read
 *         or the recording write fails
 */
public int read(byte[] b) throws IOException {
    if (!isOpen()) {
        throw new IOException("Stream closed " + Thread.currentThread().getName());
    }

    final int count = this.in.read(b);
    if (count <= 0) {
        // Nothing was read, so there is nothing to record.
        return count;
    }

    assert this.recordingOutputStream != null
            : "ROS is null " + Thread.currentThread().getName();
    this.recordingOutputStream.write(b, 0, count);
    return count;
}
if (0L == recis.getResponseContentLength()) { return;
/** * Close all streams. */ public void close() { logger.fine(Thread.currentThread().getName() + " closing"); try { this.ris.close(); } catch (IOException e) { // TODO: Can we not let the exception out of here and report it // higher up in the caller? DevUtils.logger.log(Level.SEVERE, "close() ris" + DevUtils.extraInfo(), e); } try { this.ros.close(); } catch (IOException e) { DevUtils.logger.log(Level.SEVERE, "close() ros" + DevUtils.extraInfo(), e); } }
public Recorder(File file, int outBufferSize, int inBufferSize) { super(); this.backingFileBasename = file.getAbsolutePath(); this.ris = new RecordingInputStream(inBufferSize, this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX); this.ros = new RecordingOutputStream(outBufferSize, this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX); }
public void endReplays() { ArchiveUtils.closeQuietly(replayCharSequence); replayCharSequence = null; // like closeQuietly try { ris.clearForReuse(); } catch (IOException ioe) { } // like closeQuietly try { ros.clearForReuse(); } catch (IOException e) { } } }
/**
 * Close both input and output recorders.
 *
 * Recorders are the output streams to which we are recording.
 * {@link #close()} closes the stream that is being recorded and the
 * recorder. This method explicitly closes the recorder only.
 *
 * <p>Each recorder is closed in its own {@code try} block so that an
 * {@link IOException} from the input side does not prevent the output side
 * from being closed. Failures are handed to {@code DevUtils.warnHandle}
 * rather than propagated.
 */
public void closeRecorders() {
    try {
        this.ris.closeRecorder();
    } catch (IOException e) {
        DevUtils.warnHandle(e, "Convert to runtime exception?");
    }
    try {
        this.ros.closeRecorder();
    } catch (IOException e) {
        DevUtils.warnHandle(e, "Convert to runtime exception?");
    }
}
/** * Update CrawlURI internal sizes based on current transaction (and * in the case of 304s, history) * * @param curi CrawlURI * @param rec HttpRecorder */ protected void setSizes(CrawlURI curi, Recorder rec) { // set reporting size curi.setContentSize(rec.getRecordedInput().getSize()); // add contentSize to extraInfo so it's available to log in the crawl log curi.addExtraInfo("contentSize", rec.getRecordedInput().getSize()); // special handling for 304-not modified if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && curi.getFetchHistory() != null) { Map<String, Object>[] history = curi.getFetchHistory(); if (history[0] != null && history[0].containsKey(A_REFERENCE_LENGTH)) { long referenceLength = (Long) history[0].get(A_REFERENCE_LENGTH); // carry-forward previous 'reference-length' for future curi.getData().put(A_REFERENCE_LENGTH, referenceLength); // increase content-size to virtual-size for reporting curi.setContentSize(rec.getRecordedInput().getSize() + referenceLength); } } }
String boundary = BOUNDARY_START + stringToMD5(curi.toString()); ris = curi.getRecorder().getRecordedInput(). getReplayInputStream(); out = initOutputStream(curi);
/**
 * Calculate a recommended size for an in-memory decoded-character buffer
 * of this content.
 *
 * <p>Two candidate sizes are considered and the smaller one wins: half the
 * recorded buffer length of the stream (decoded chars are 2 bytes each, so
 * this keeps the char buffer within the memory the raw byte buffer already
 * uses), and the count of bytes actually recorded (no point in allocating
 * more than the content could need).
 *
 * @param inStream the recording stream whose content will be decoded
 * @return int length for in-memory decoded-character buffer
 */
static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
    final long memoryBound = inStream.getRecordedBufferLength() / 2;
    final long contentBound = inStream.getSize();
    final long recommended = Math.min(memoryBound, contentBound);
    return (int) recommended;
}