/**
 * Decides whether the given URI's recorded content should be handed to this extractor.
 *
 * <p>The URI is rejected when its recorded input size is greater than
 * {@code getMaxSizeToParse()}. Otherwise it is accepted only when its content type is
 * non-null and starts with {@code application/pdf}.
 *
 * @param uri the URI under consideration
 * @return {@code true} if the content is within the size limit and has a PDF content type
 */
@Override
protected boolean shouldExtract(CrawlURI uri) {
    final long sizeLimit = getMaxSizeToParse();
    final long recordedSize = uri.getRecorder().getRecordedInput().getSize();
    if (recordedSize > sizeLimit) {
        // Too large to parse.
        return false;
    }

    final String contentType = uri.getContentType();
    if (contentType == null) {
        return false;
    }
    return contentType.startsWith("application/pdf");
}
/** * Update CrawlURI internal sizes based on current transaction (and * in the case of 304s, history) * * @param curi CrawlURI * @param rec HttpRecorder */ protected void setSizes(CrawlURI curi, Recorder rec) { // set reporting size curi.setContentSize(rec.getRecordedInput().getSize()); // add contentSize to extraInfo so it's available to log in the crawl log curi.addExtraInfo("contentSize", rec.getRecordedInput().getSize()); // special handling for 304-not modified if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && curi.getFetchHistory() != null) { Map<String, Object>[] history = curi.getFetchHistory(); if (history[0] != null && history[0].containsKey(A_REFERENCE_LENGTH)) { long referenceLength = (Long) history[0].get(A_REFERENCE_LENGTH); // carry-forward previous 'reference-length' for future curi.getData().put(A_REFERENCE_LENGTH, referenceLength); // increase content-size to virtual-size for reporting curi.setContentSize(rec.getRecordedInput().getSize() + referenceLength); } } }
/**
 * Returns the size of the data recorded for the given URI.
 *
 * <p>When the URI has a recorder, the size of its recorded input is returned;
 * otherwise the URI's content size is used as a fallback.
 *
 * @param puri the URI to inspect
 * @return recorded input size, or the content size when no recorder is attached
 */
public static long getRecordedSize(CrawlURI puri) {
    return (puri.getRecorder() != null)
            ? puri.getRecorder().getRecordedInput().getSize()
            : puri.getContentSize();
}
/** * Get size of data recorded (transferred) * * @return recorded data size */ public long getRecordedSize() { return (getRecorder() != null) ? getRecorder() .getRecordedInput().getSize() // if unavailable fall back on content-size : getContentSize(); }
protected URI writeRevisit(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord headers) throws IOException { long revisedLength = 0; // By default, truncate all data if (curi.getRevisitProfile().getProfileName().equals(PROFILE_REVISIT_IDENTICAL_DIGEST) ) { // Save response from identical digest matches revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getRecorder().getRecordedInput().getSize(); } return writeRevisit(w, timestamp, mimetype, baseid, curi, headers, revisedLength); }
recordInfo.setRecordId(baseid); recordInfo.setExtraHeaders(namedFields); recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize()); recordInfo.setEnforceLength(true);
/**
 * Writes one WARC {@code response} record for the given URI from its recorded input.
 *
 * <p>The record carries the URI, timestamp, content type and record id supplied here,
 * with an enforced content length equal to the recorded input size. When the URI's data
 * map holds a value under {@code CoreAttributeConstants.A_WHOIS_SERVER_IP}, its string
 * form is added as the {@code HEADER_KEY_IP} extra header.
 *
 * @param w writer the record is written to
 * @param curi URI whose recorded input supplies the record content
 * @param baseid record id assigned to the record
 * @param timestamp 14-digit creation date for the record
 * @throws IOException if obtaining the replay stream or writing the record fails
 */
protected void writeWhoisRecords(WARCWriter w, CrawlURI curi, URI baseid,
        String timestamp) throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.response);
    recordInfo.setUrl(curi.toString());
    recordInfo.setCreate14DigitDate(timestamp);
    recordInfo.setMimetype(curi.getContentType());
    recordInfo.setRecordId(baseid);
    recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
    recordInfo.setEnforceLength(true);

    Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP);
    if (whoisServerIP != null) {
        recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString());
    }

    ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
    try {
        // Attach the stream inside the try so it is closed on every path once opened.
        recordInfo.setContentStream(ris);
        w.writeRecord(recordInfo);
    } finally {
        IOUtils.closeQuietly(ris);
    }
}
/**
 * Writes one WARC {@code response} record for the given URI from its recorded input.
 *
 * <p>The record carries the URI, timestamp, content type and record id supplied here,
 * with an enforced content length equal to the recorded input size. When the URI's data
 * map holds a non-empty string under {@code A_DNS_SERVER_IP_LABEL}, it is added as the
 * {@code HEADER_KEY_IP} extra header.
 *
 * @param curi URI whose recorded input supplies the record content
 * @param w writer the record is written to
 * @param baseid record id assigned to the record
 * @param timestamp 14-digit creation date for the record
 * @throws IOException if obtaining the replay stream or writing the record fails
 */
protected void writeDnsRecords(final CrawlURI curi, WARCWriter w,
        final URI baseid, final String timestamp) throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.response);
    recordInfo.setUrl(curi.toString());
    recordInfo.setCreate14DigitDate(timestamp);
    recordInfo.setMimetype(curi.getContentType());
    recordInfo.setRecordId(baseid);
    recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
    recordInfo.setEnforceLength(true);

    String ip = (String) curi.getData().get(A_DNS_SERVER_IP_LABEL);
    if (ip != null && !ip.isEmpty()) {
        recordInfo.addExtraHeader(HEADER_KEY_IP, ip);
    }

    ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
    try {
        // Attach the stream inside the try so it is closed on every path once opened.
        recordInfo.setContentStream(ris);
        w.writeRecord(recordInfo);
    } finally {
        IOUtils.closeQuietly(ris);
    }
}
+ " " + curi.getUURI().toString() + " " + response.getStatusLine().getStatusCode() + " " + rec.getRecordedInput().getSize() + " " + curi.getContentType());
/**
 * Writes one WARC {@code resource} record for the given URI from its recorded input.
 *
 * <p>The record's content length is set to the recorded input size and is enforced.
 * The replay stream is closed quietly whether or not the write succeeds.
 *
 * @param w writer the record is written to
 * @param timestamp 14-digit creation date for the record
 * @param mimetype mimetype stored on the record
 * @param baseid record id assigned to the record
 * @param curi URI whose recorded input supplies the record content
 * @param namedFields extra headers stored on the record
 * @return the record id read back from the record info after writing
 * @throws IOException if obtaining the replay stream or writing the record fails
 */
protected URI writeResource(final WARCWriter w,
        final String timestamp, final String mimetype,
        final URI baseid, final CrawlURI curi,
        final ANVLRecord namedFields)
        throws IOException {
    final WARCRecordInfo info = new WARCRecordInfo();

    // Identity and descriptive fields.
    info.setType(WARCRecordType.resource);
    info.setUrl(curi.toString());
    info.setCreate14DigitDate(timestamp);
    info.setMimetype(mimetype);
    info.setRecordId(baseid);
    info.setExtraHeaders(namedFields);

    // Length taken from the recorded input, and enforced.
    info.setContentLength(curi.getRecorder().getRecordedInput().getSize());
    info.setEnforceLength(true);

    final ReplayInputStream replay =
            curi.getRecorder().getRecordedInput().getReplayInputStream();
    info.setContentStream(replay);
    try {
        w.writeRecord(info);
    } finally {
        IOUtils.closeQuietly(replay);
    }

    return info.getRecordId();
}
/**
 * Calculate a recommended size for an in-memory decoded-character buffer
 * of this content.
 *
 * <p>The result is the smaller of two values: half of the stream's recorded
 * buffer length, and the stream's recorded size. The smaller value is then
 * narrowed to {@code int}.
 *
 * @param inStream stream whose buffer length and recorded size are consulted
 * @return int length for in-memory decoded-character buffer
 */
static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
    final long halfBufferLength = inStream.getRecordedBufferLength() / 2;
    final long recordedSize = inStream.getSize();
    final long recommended = Math.min(halfBufferLength, recordedSize);
    return (int) recommended;
}
/**
 * Calculate a recommended size for an in-memory decoded-character buffer
 * of this content.
 *
 * <p>The result is the smaller of two values: half of the stream's recorded
 * buffer length, and the stream's recorded size. The smaller value is then
 * narrowed to {@code int}.
 *
 * @param inStream stream whose buffer length and recorded size are consulted
 * @return int length for in-memory decoded-character buffer
 */
static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
    final long halfBufferLength = inStream.getRecordedBufferLength() / 2;
    final long recordedSize = inStream.getSize();
    final long recommended = Math.min(halfBufferLength, recordedSize);
    return (int) recommended;
}
/**
 * Calculate a recommended size for an in-memory decoded-character buffer
 * of this content.
 *
 * <p>The result is the smaller of two values: half of the stream's recorded
 * buffer length, and the stream's recorded size. The smaller value is then
 * narrowed to {@code int}.
 *
 * @param inStream stream whose buffer length and recorded size are consulted
 * @return int length for in-memory decoded-character buffer
 */
static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
    final long halfBufferLength = inStream.getRecordedBufferLength() / 2;
    final long recordedSize = inStream.getSize();
    final long recommended = Math.min(halfBufferLength, recordedSize);
    return (int) recommended;
}
/**
 * Decides whether the given URI's recorded content should be handed to this extractor.
 *
 * <p>The URI is rejected when its recorded input size is greater than
 * {@code getMaxSizeToParse()}. Otherwise it is accepted only when its content type is
 * non-null and starts with {@code application/pdf}.
 *
 * @param uri the URI under consideration
 * @return {@code true} if the content is within the size limit and has a PDF content type
 */
@Override
protected boolean shouldExtract(CrawlURI uri) {
    final long sizeLimit = getMaxSizeToParse();
    final long recordedSize = uri.getRecorder().getRecordedInput().getSize();
    if (recordedSize > sizeLimit) {
        // Too large to parse.
        return false;
    }

    final String contentType = uri.getContentType();
    if (contentType == null) {
        return false;
    }
    return contentType.startsWith("application/pdf");
}
/** * Get size of data recorded (transferred) * * @return recorded data size */ public long getRecordedSize() { return (getRecorder() != null) ? getRecorder() .getRecordedInput().getSize() // if unavailable fall back on content-size : getContentSize(); }
/**
 * Returns the size of the data recorded for the given URI.
 *
 * <p>When the URI has a recorder, the size of its recorded input is returned;
 * otherwise the URI's content size is used as a fallback.
 *
 * @param puri the URI to inspect
 * @return recorded input size, or the content size when no recorder is attached
 */
public static long getRecordedSize(CrawlURI puri) {
    return (puri.getRecorder() != null)
            ? puri.getRecorder().getRecordedInput().getSize()
            : puri.getContentSize();
}
protected URI writeRevisit(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord headers) throws IOException { long revisedLength = 0; // By default, truncate all data if (curi.getRevisitProfile().getProfileName().equals(PROFILE_REVISIT_IDENTICAL_DIGEST) ) { // Save response from identical digest matches revisedLength = curi.getRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getRecorder().getRecordedInput().getSize(); } return writeRevisit(w, timestamp, mimetype, baseid, curi, headers, revisedLength); }
/**
 * Writes one WARC {@code response} record for the given URI from its recorded input.
 *
 * <p>The record carries the URI, timestamp, content type and record id supplied here,
 * with an enforced content length equal to the recorded input size. When the URI's data
 * map holds a value under {@code CoreAttributeConstants.A_WHOIS_SERVER_IP}, its string
 * form is added as the {@code HEADER_KEY_IP} extra header.
 *
 * @param w writer the record is written to
 * @param curi URI whose recorded input supplies the record content
 * @param baseid record id assigned to the record
 * @param timestamp 14-digit creation date for the record
 * @throws IOException if obtaining the replay stream or writing the record fails
 */
protected void writeWhoisRecords(WARCWriter w, CrawlURI curi, URI baseid,
        String timestamp) throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setType(WARCRecordType.response);
    recordInfo.setUrl(curi.toString());
    recordInfo.setCreate14DigitDate(timestamp);
    recordInfo.setMimetype(curi.getContentType());
    recordInfo.setRecordId(baseid);
    recordInfo.setContentLength(curi.getRecorder().getRecordedInput().getSize());
    recordInfo.setEnforceLength(true);

    Object whoisServerIP = curi.getData().get(CoreAttributeConstants.A_WHOIS_SERVER_IP);
    if (whoisServerIP != null) {
        recordInfo.addExtraHeader(HEADER_KEY_IP, whoisServerIP.toString());
    }

    ReplayInputStream ris = curi.getRecorder().getRecordedInput().getReplayInputStream();
    try {
        // Attach the stream inside the try so it is closed on every path once opened.
        recordInfo.setContentStream(ris);
        w.writeRecord(recordInfo);
    } finally {
        IOUtils.closeQuietly(ris);
    }
}