firstRecord.dump(baos); ANVLRecord ar = new ANVLRecord(); ar.addLabelValue("Filedesc", baos.toString()); List<String> metadata = new ArrayList<String>(1); metadata.add(ar.toString());
ANVLRecord r = new ANVLRecord(); if (curi.isSeed()) { r.addLabel("seed"); } else { if (curi.forceFetch()) { r.addLabel("force-fetch"); r.addLabelValue("via", flattenVia(curi)); r.addLabelValue("hopsFromSeed", curi.getPathFromSeed()); r.addLabelValue("sourceTag", (String)curi.getData().get(A_SOURCE_TAG)); r.addLabelValue("fetchTimeMs", Long.toString(duration)); r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString()); r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name()); if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) { String[] kv = annotation.split(":", 2); r.addLabelValue(kv[0], kv[1]); if (links != null && links.size() > 0) { for (CrawlURI link: links) { r.addLabelValue("outlink", link.getURI()+" "+link.getLastHop()+" "+link.getViaContext()); byte [] b = r.getUTF8Bytes();
protected void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid, final String timestamp) throws IOException { ANVLRecord headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); String controlConversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString(); URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation); headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); baseid, curi, headers, 0); } else { headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_TRUNCATED, value); headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers); headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); writeMetadata(w, timestamp, baseid, curi, headers);
ANVLRecord record = new ANVLRecord(); boolean inValue = false, inLabel = false, inComment = false, inNewLine = false; if (inNewLine && isLF(c)) { continue; if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) { break; inNewLine = isCR(c) && isLF(s.charAt(i + 1)); } else if (label != null && !inValue) { record.addLabel(label); label = null; sb.setLength(0); + "(2):\n" + s.substring(i)); if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3)) && Character.isWhitespace(s.charAt(i + 2))) { record.addLabelValue(label, sb.toString()); sb.setLength(0); label = null;
HttpRequestMessage requestMessage = new HttpRequestMessage( GET,url.getFile().getBytes(),HTTP11); ANVLRecord headers = new ANVLRecord(); headers.addLabelValue("Host", hostname); headers.addLabelValue(RANGE_HTTP_HEADER, HEADER_BYTES_PREFIX + String.valueOf(offset) + HEADER_BYTES_SUFFIX); InputStream socketIn = socket.getInputStream(); socketOut.write(requestMessage.getBytes(true)); socketOut.write(headers.getUTF8Bytes()); socketOut.flush(); HttpResponse response = HttpResponse.load(socketIn); String contentType = response.getHeaders().asMap().get("Content-Type"); if(contentType == null) { contentType = "application/unknown"; String xferEncoding = response.getHeaders().asMap().get("Transfer-Encoding");
/**
 * Adds a label/value pair to the given record, but only when the value
 * passes {@code StringUtils.isNotBlank}; otherwise the record is left
 * untouched.
 *
 * @param record record that receives the field
 * @param label label of the field to add
 * @param value candidate value for the field
 */
protected void addIfNotBlank(ANVLRecord record, String label, String value) {
    if (!StringUtils.isNotBlank(value)) {
        return;
    }
    record.addLabelValue(label, value);
}
/**
 * Opens a WARC writer on {@code target} and writes a warcinfo record to it.
 * The content of {@code fieldsSrc} is loaded as an ANVL record and its
 * string form is handed to the writer settings as the only metadata entry.
 *
 * @param target file the WARC output is written to
 * @param fieldsSrc file whose content is loaded as an ANVL record
 * @param id identifier mentioned in the warcinfo description
 * @throws IOException if either file cannot be opened, or if loading the
 *     fields or writing the record fails
 */
private void writeHeaderRecord(File target, File fieldsSrc, String id)
throws IOException {
    BufferedOutputStream bos =
        new BufferedOutputStream(new FileOutputStream(target));
    // The fields file is only needed while it is being parsed, so release
    // the handle as soon as loading is done (it used to be left open).
    ANVLRecord ar;
    FileInputStream is = new FileInputStream(fieldsSrc);
    try {
        ar = ANVLRecord.load(is);
    } finally {
        is.close();
    }
    List<String> metadata = new ArrayList<String>(1);
    metadata.add(ar.toString());
    // NOTE(review): the writer (and the stream it wraps) is not closed in
    // this method -- confirm whether that is intentional.
    WARCWriter writer = new WARCWriter(new AtomicInteger(), bos, target,
        getSettings(true, null, null, metadata));
    // Write a warcinfo record with description about how this WARC
    // was made.
    writer.writeWarcinfoRecord(target.getName(),
        "Made from " + id + " by " + this.getClass().getName());
}

private WARCWriterPoolSettings getSettings(final boolean isCompressed,
/**
 * @return The string form of this ANVLRecord encoded as UTF8 bytes.
 */
public byte [] getUTF8Bytes()
throws UnsupportedEncodingException {
    final String text = toString();
    return text.getBytes(UTF8);
}
ANVLRecord namedFields = suppliedFields; if(curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) { namedFields = namedFields.clone(); for (Object headerObj : curi.getDataList(A_WARC_RESPONSE_HEADERS)) { String[] kv = StringUtils.split(((String)headerObj),":",2); namedFields.addLabelValue(kv[0].trim(), kv[1].trim());
/**
 * Counts the bytes of this record in its UTF8 form.
 *
 * Expensive: the record is rendered to a String and encoded just so the
 * bytes can be counted. Note that even an empty record has a non-zero
 * length because of the record terminator (earlier documentation stated
 * CRLFCRLF, i.e. 4 bytes -- TODO confirm against the string form).
 *
 * @return Count of ANVLRecord bytes.
 */
public synchronized int getLength() {
    try {
        return getUTF8Bytes().length;
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Creates a request from its request message and the raw header bytes.
 * The raw bytes are kept as given and are also loaded into an ANVL record.
 *
 * @param message the request message
 * @param originalHeaders raw header bytes as received
 * @throws IOException if the headers cannot be loaded
 */
public HttpRequest(HttpRequestMessage message, byte[] originalHeaders)
        throws IOException {
    this.message = message;
    this.originalHeaders = originalHeaders;
    // If we want to keep the headers - we're not using them:
    headers = ANVLRecord.load(new ByteArrayInputStream(originalHeaders));
}
@Override public String toString() { // TODO: What to emit for empty ANVLRecord? StringBuilder sb = new StringBuilder(); for (final Iterator<Element> i = iterator(); i.hasNext();) { sb.append(i.next()); sb.append(CRLF); } // 'ANVL Records end in a blank line'. sb.append(CRLF); return sb.toString(); }
/**
 * Rejects a character that is a carriage return or line feed.
 *
 * @param c character to check
 * @param srcStr string the character came from; used in the error message
 * @param index position of the character (not used by this check)
 * @throws IllegalArgumentException if {@code c} is a CR or LF
 */
protected void checkCRLF(final char c, final String srcStr,
        final int index) {
    if (!ANVLRecord.isCROrLF(c)) {
        return;
    }
    final String hex = Integer.toHexString(c);
    throw new IllegalArgumentException(srcStr
        + " contains disallowed CRLF control character(s): 0x" + hex);
}
ANVLRecord headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); } else { if (curi.getContentDigest() != null) { headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); headers.addLabelValue(HEADER_KEY_TRUNCATED, value); headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
ANVLRecord record = new ANVLRecord(); boolean inValue = false, inLabel = false, inComment = false, inNewLine = false; if (inNewLine && isLF(c)) { continue; if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) { break; inNewLine = isCR(c) && isLF(s.charAt(i + 1)); } else if (label != null && !inValue) { record.addLabel(label); label = null; sb.setLength(0); + "(2):\n" + s.substring(i)); if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3)) && Character.isWhitespace(s.charAt(i + 2))) { record.addLabelValue(label, sb.toString()); sb.setLength(0); label = null;
HttpRequestMessage requestMessage = new HttpRequestMessage( GET,url.getFile().getBytes(),HTTP11); ANVLRecord headers = new ANVLRecord(); headers.addLabelValue("Host", hostname); headers.addLabelValue(RANGE_HTTP_HEADER, HEADER_BYTES_PREFIX + String.valueOf(offset) + HEADER_BYTES_SUFFIX); InputStream socketIn = socket.getInputStream(); socketOut.write(requestMessage.getBytes(true)); socketOut.write(headers.getUTF8Bytes()); socketOut.flush(); HttpResponse response = HttpResponse.load(socketIn); String contentType = response.getHeaders().asMap().get("Content-Type"); if(contentType == null) { contentType = "application/unknown"; String xferEncoding = response.getHeaders().asMap().get("Transfer-Encoding");
/**
 * Copies one HTTP response header of the given URI into the supplied
 * headers under a different name. Nothing is added when the response
 * carries no such header.
 *
 * @param curi URI whose HTTP response header is read
 * @param warcHeaders record that receives the header
 * @param origName name of the header in the HTTP response
 * @param newName name to store the value under
 */
protected void saveHeader(CrawlURI curi, ANVLRecord warcHeaders,
        String origName, String newName) {
    final String headerValue = curi.getHttpResponseHeader(origName);
    if (headerValue == null) {
        return;
    }
    warcHeaders.addLabelValue(newName, headerValue);
}