/**
 * Adds a label/value pair to the given record, but only when the value
 * passes {@code StringUtils.isNotBlank}; otherwise the record is left
 * untouched.
 *
 * @param record record that receives the pair
 * @param label  label to store the value under
 * @param value  candidate value; skipped when it is not "not blank"
 */
protected void addIfNotBlank(ANVLRecord record, String label, String value) {
    if (!StringUtils.isNotBlank(value)) {
        // Nothing worth recording.
        return;
    }
    record.addLabelValue(label, value);
}
ANVLRecord headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); } else { if (curi.getContentDigest() != null) { headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); headers.addLabelValue(HEADER_KEY_TRUNCATED, value); headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
return cachedMetadata; ANVLRecord record = new ANVLRecord(); record.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org"); try { InetAddress host = InetAddress.getLocalHost(); record.addLabelValue("ip", host.getHostAddress()); record.addLabelValue("hostname", host.getCanonicalHostName()); } catch (UnknownHostException e) { logger.log(Level.WARNING,"unable top obtain local crawl engine host",e); record.addLabelValue("format","WARC File Format 1.0"); record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"); return Collections.singletonList(record.toString());
ANVLRecord r = new ANVLRecord(); if (curi.isSeed()) { r.addLabel("seed"); } else { if (curi.forceFetch()) { r.addLabel("force-fetch"); r.addLabelValue("via", flattenVia(curi)); r.addLabelValue("hopsFromSeed", curi.getPathFromSeed()); r.addLabelValue("sourceTag", (String)curi.getData().get(A_SOURCE_TAG)); r.addLabelValue("fetchTimeMs", Long.toString(duration)); r.addLabelValue("ftpFetchStatus", curi.getData().get(A_FTP_FETCH_STATUS).toString()); r.addLabelValue("charsetForLinkExtraction", curi.getRecorder().getCharset().name()); if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) { String[] kv = annotation.split(":", 2); r.addLabelValue(kv[0], kv[1]); if (links != null && links.size() > 0) { for (CrawlURI link: links) { r.addLabelValue("outlink", link.getURI()+" "+link.getLastHop()+" "+link.getViaContext()); byte [] b = r.getUTF8Bytes();
ANVLRecord namedFields = suppliedFields; if(curi.getData().containsKey(A_WARC_RESPONSE_HEADERS)) { namedFields = namedFields.clone(); for (Object headerObj : curi.getDataList(A_WARC_RESPONSE_HEADERS)) { String[] kv = StringUtils.split(((String)headerObj),":",2); namedFields.addLabelValue(kv[0].trim(), kv[1].trim());
/**
 * Runs the per-character checks on {@code c}: first
 * {@code checkControlCharacter}, then {@code checkCRLF}, both with the
 * same arguments. What each check rejects, and how it reports it, is
 * defined by those helpers.
 *
 * @param c      character being checked
 * @param srcStr passed through to both checks (presumably the string
 *               {@code c} was taken from -- confirm against callers)
 * @param index  passed through to both checks (presumably the position of
 *               {@code c} within {@code srcStr} -- confirm against callers)
 */
protected void checkCharacter(final char c,
                              final String srcStr,
                              final int index) {
    // Order matters: the control-character check always runs first.
    checkControlCharacter(c, srcStr, index);
    checkCRLF(c, srcStr, index);
}
/**
 * Returns the second sub-element as a {@code Value}.
 *
 * @return {@code subElements[1]} cast to {@code Value} when
 *         {@code isValue()} is true, otherwise {@code null}
 */
public Value getValue() {
    return isValue() ? (Value) this.subElements[1] : null;
}
protected void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid, final String timestamp) throws IOException { ANVLRecord headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi)); String controlConversation = curi.getData().get(A_FTP_CONTROL_CONVERSATION).toString(); URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation); headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); baseid, curi, headers, 0); } else { headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_TRUNCATED, value); headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST, curi.getContentDigestSchemeString()); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers); headers = new ANVLRecord(); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); writeMetadata(w, timestamp, baseid, curi, headers);
firstRecord.dump(baos); ANVLRecord ar = new ANVLRecord(); ar.addLabelValue("Filedesc", baos.toString()); List<String> metadata = new ArrayList<String>(1); metadata.add(ar.toString());
/**
 * Copies one HTTP response header of the given URI into the supplied
 * header record, storing it under a different label.
 *
 * <p>The header is looked up with
 * {@code curi.getHttpResponseHeader(origName)}. When that lookup yields
 * {@code null} nothing is added; any non-null value (including an empty
 * string) is added as-is.
 *
 * @param curi        URI whose HTTP response header is read
 * @param warcHeaders record that receives the renamed header
 * @param origName    name of the header to look up on the response
 * @param newName     label to store the value under in {@code warcHeaders}
 */
protected void saveHeader(CrawlURI curi, ANVLRecord warcHeaders,
        String origName, String newName) {
    final String headerValue = curi.getHttpResponseHeader(origName);
    if (headerValue == null) {
        // Header absent on the response: leave warcHeaders unchanged.
        return;
    }
    warcHeaders.addLabelValue(newName, headerValue);
}
/**
 * Runs the per-character checks on {@code c}: first
 * {@code checkControlCharacter}, then {@code checkCRLF}, both with the
 * same arguments. What each check rejects, and how it reports it, is
 * defined by those helpers.
 *
 * @param c      character being checked
 * @param srcStr passed through to both checks (presumably the string
 *               {@code c} was taken from -- confirm against callers)
 * @param index  passed through to both checks (presumably the position of
 *               {@code c} within {@code srcStr} -- confirm against callers)
 */
protected void checkCharacter(final char c,
                              final String srcStr,
                              final int index) {
    // Order matters: the control-character check always runs first.
    checkControlCharacter(c, srcStr, index);
    checkCRLF(c, srcStr, index);
}
/**
 * Returns the second sub-element as a {@code Value}.
 *
 * @return {@code subElements[1]} cast to {@code Value} when
 *         {@code isValue()} is true, otherwise {@code null}
 */
public Value getValue() {
    return isValue() ? (Value) this.subElements[1] : null;
}
recordInfo.setCreate14DigitDate(warcDateString); ANVLRecord ar = new ANVLRecord(); String ip = (String)r.getHeader() .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY)); if (ip != null && ip.length() > 0) { ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip); r.getMetaData();
headers.addLabelValue(HEADER_KEY_PROFILE, revisitProfile.getProfileName()); headers.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); recordInfo.setExtraHeaders(headers); for ( String key : revisitHeaders.keySet()) { headers.addLabelValue(key, revisitHeaders.get(key));
/**
 * Runs the per-character checks on {@code c}: first
 * {@code checkControlCharacter}, then {@code checkCRLF}, both with the
 * same arguments. What each check rejects, and how it reports it, is
 * defined by those helpers.
 *
 * @param c      character being checked
 * @param srcStr passed through to both checks (presumably the string
 *               {@code c} was taken from -- confirm against callers)
 * @param index  passed through to both checks (presumably the position of
 *               {@code c} within {@code srcStr} -- confirm against callers)
 */
protected void checkCharacter(final char c,
                              final String srcStr,
                              final int index) {
    // Order matters: the control-character check always runs first.
    checkControlCharacter(c, srcStr, index);
    checkCRLF(c, srcStr, index);
}
/**
 * Adds a label/value pair to this object's extra headers.
 *
 * <p>The {@code extraHeaders} record is created on first use: when the
 * field is still {@code null} a new, empty {@code ANVLRecord} is assigned
 * to it before the pair is added.
 *
 * @param label label of the header to add
 * @param value value of the header to add
 */
public void addExtraHeader(String label, String value) {
    if (this.extraHeaders == null) {
        // First extra header: allocate the backing record.
        this.extraHeaders = new ANVLRecord();
    }
    this.extraHeaders.addLabelValue(label, value);
}
/**
 * Adds a label/value pair to this object's extra headers.
 *
 * <p>The {@code extraHeaders} record is created on first use: when the
 * field is still {@code null} a new, empty {@code ANVLRecord} is assigned
 * to it before the pair is added.
 *
 * @param label label of the header to add
 * @param value value of the header to add
 */
public void addExtraHeader(String label, String value) {
    if (this.extraHeaders == null) {
        // First extra header: allocate the backing record.
        this.extraHeaders = new ANVLRecord();
    }
    this.extraHeaders.addLabelValue(label, value);
}