// Re-write an ARC record via ARCWriter.write(uri, contentType, hostIP,
// fetchBeginTimeStamp, recordLength, in). The mimetype, ip, time, length,
// and offset variables come from the surrounding context of this snippet.
writer.write(r.getHeader().getUrl(), mimetype, ip, time,
        (int) (length - offset), r);
protected void write(final WARCWriter writer, final ARCRecord r) throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setUrl(r.getHeader().getUrl());
    recordInfo.setContentStream(r);
    recordInfo.setContentLength(r.getHeader().getLength());
    // (snippet truncated in the original: the rest of the record setup,
    // such as type, mimetype, and record ID, plus the final
    // writer.writeRecord(recordInfo) call, is not shown)
}
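For context, a minimal sketch of the conversion loop that a method like the one above would sit inside: read each record from an ARC file and hand it to write(...). The file name and the warcWriter variable are illustrative assumptions, not the tool's actual wiring.

// Hedged sketch: iterate an ARC file and convert each record.
// warcWriter is assumed to be an already-configured WARCWriter.
ARCReader reader = ARCReaderFactory.get(new File("input.arc.gz"));
for (ArchiveRecord record : reader) {
    write(warcWriter, (ARCRecord) record);
}
reader.close();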
System.out.println(r.getHeader().getUrl());
System.out.println();
public void map(LongWritable key, WarcRecordWritable record, Context context)
        throws IOException, InterruptedException {
    context.getCounter(Records.TOTAL).increment(1);
    ArchiveRecordHeader header = record.getRecord().getHeader();
    // Skip everything that is not a WARC response record. (The original
    // snippet had the test inverted, which would have dropped all responses.)
    if (!"response".equals(header.getHeaderValue("WARC-Type"))) {
        return;
    }
    String url = header.getUrl();
    if ((url != null) && url.startsWith("http://")) {
        KEY.set(url);
        context.write(KEY, VALUE);
    }
}
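A hedged sketch of the driver that would run a mapper like this as a map-only Hadoop job. WarcInputFormat and ExtractUrlsMapper are illustrative names, not confirmed API; standard org.apache.hadoop.mapreduce imports are assumed.

// Hypothetical job wiring for the map() above.
Job job = Job.getInstance(new Configuration(), "extract-urls");
job.setJarByClass(ExtractUrlsMapper.class);       // assumed mapper class name
job.setInputFormatClass(WarcInputFormat.class);   // assumed WARC-aware InputFormat
job.setMapperClass(ExtractUrlsMapper.class);
job.setNumReduceTasks(0);                         // map-only: emit (url, value) pairs
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);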
protected String outputCdx(final String strippedFileName) throws IOException {
    // Read the whole record so we get out a hash. Should be safe calling
    // close on an already-closed record.
    close();
    ArchiveRecordHeader h = getHeader();
    StringBuilder buffer =
            new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
    buffer.append(h.getDate());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getIp4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getUrl());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getMimetype4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getStatusCode4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getDigest4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getOffset());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getLength());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(strippedFileName != null ? strippedFileName : '-');
    return buffer.toString();
}
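Since outputCdx(...) is protected, external callers cannot invoke it directly; a minimal sketch of equivalent usage is to iterate an archive and print the same header fields it assembles. The file name is illustrative; ArchiveReaderFactory and the header accessors are part of the webarchive-commons API.

// Hedged sketch: print CDX-style fields for every record in a WARC file.
ArchiveReader reader = ArchiveReaderFactory.get(new File("example.warc.gz"));
for (ArchiveRecord record : reader) {
    ArchiveRecordHeader h = record.getHeader();
    System.out.println(h.getDate() + " " + h.getUrl() + " "
            + h.getOffset() + " " + h.getLength());
}
reader.close();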
/**
 * Is it likely that this record contains headers?
 * This method returns true if the body is an HTTP response that includes
 * HTTP response headers, or an HTTP request that includes request headers,
 * etc. Be aware that headers in content are distinct from
 * {@link ArchiveRecordHeader} 'headers'.
 * @return True if this record's content has headers.
 */
public boolean hasContentHeaders() {
    final String url = getHeader().getUrl();
    if (url == null) {
        return false;
    }
    if (!url.toLowerCase().startsWith("http")) {
        return false;
    }
    if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return false;
    }
    return true;
}
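A hedged sketch of how this check is typically used: as a cheap guard before attempting to parse HTTP headers out of a record body. The parseHttpHeaders helper is hypothetical, named only for illustration.

// Only http(s) captures whose declared length exceeds
// MIN_HTTP_HEADER_LENGTH pass the guard.
if (record.hasContentHeaders()) {
    parseHttpHeaders(record);  // hypothetical helper, not part of the API
}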
// Build a timestamp/URL key from the record header
// (snippet truncated in the original).
if (!header.getHeaderFields().isEmpty()) {
    newKey = header.getDate().replaceAll("[^0-9]", "") + "/" + header.getUrl();
}
@Override
public void analyse(String source, ArchiveRecordHeader header,
        InputStream tikainput, SolrRecord solr) {
    final String url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
    log.debug("Analysing " + url);
    final long start = System.nanoTime();
    // Analyse with Tika:
    try {
        if (passUriToFormatTools) {
            solr = this.extract(source, solr, tikainput, url);
        } else {
            solr = this.extract(source, solr, tikainput, null);
        }
    } catch (Exception i) {
        log.error(i + ": " + i.getMessage() + ";tika; " + url
                + "@" + header.getOffset());
    }
    Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
            "WARCPayloadAnalyzers.analyze#tikasolrextract", start);
}
System.out.println("URL: " + r.getHeader().getUrl()); System.out.println();
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths,
        String filename, ArchiveRecordHeader header) {
    defaultMax = defaultMaxFieldLength;
    maxLengths = maxFieldLengths;
    setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset());
    setField(SolrFields.SOURCE_FILE, filename);
    setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
    setField(SolrFields.SOLR_URL,
            Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
    setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN);
}
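This constructor is the "placeholder on failure" path (note the "exception-at-" ID), so a hedged usage sketch would build such a record when payload extraction throws. The field-length limit and empty map below are arbitrary illustrative values.

// Illustrative only: placeholder SolrRecord for a record whose
// payload processing failed.
SolrRecord solr = new SolrRecord(2048, new HashMap<String, Integer>(),
        warcFilename, record.getHeader());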
@Test
public void shouldStoreAndIterateOverData() throws IOException {
    String folder = tempFolder.newFolder().toString();
    Page target = new Page(new URL(url), html, responseHeaders);
    target.setTargetRelevance(TargetRelevance.RELEVANT);
    target.setFetchTime(System.currentTimeMillis());
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target);
    repository.close();
    File testFolder = new File(folder);
    if (testFolder.isDirectory()) {
        File[] allFiles = testFolder.listFiles();
        assertTrue(allFiles[0].getName().startsWith("crawl_data"));
    }
    Iterator<WARCRecord> it = repository.iterator();
    // then
    assertThat(it.hasNext(), is(true));
    WARCRecord page = it.next();
    assertThat(it.hasNext(), is(false));
    assertThat(page.getHeader().getUrl(), is(url));
    assertThat(page.getHeader().getHeaderValue("Content-Type"),
            is(WARCConstants.HTTP_RESPONSE_MIMETYPE));
    assertThat(page.getHeader().getHeaderValue("ACHE-IsRelevant"),
            is(target.getTargetRelevance().isRelevant() + ""));
    assertThat(Double.valueOf(page.getHeader().getHeaderValue("ACHE-Relevance").toString()),
            is(Double.valueOf(target.getTargetRelevance().getRelevance())));
}
@Test
public void testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String url1 = "http://a.com";
    Map<String, List<String>> headers = new HashMap<>();
    Character invalidChar = new Character((char) 0x80);
    String headerValue = "inline; filename=\"Invalid_" + invalidChar + "\"";
    headers.put("Content-Disposition", asList(headerValue));
    Page target1 = new Page(new URL(url1), html, headers);
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target1);
    repository.close();
    RepositoryIterator repositoryIterator = repository.iterator();
    // then
    assertTrue(repositoryIterator.hasNext());
    WARCRecord record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url1));
    String recordData = IOUtils.toString(record);
    assertThat(recordData, containsString(html));
    assertThat(recordData, containsString(headerValue));
    assertFalse(repositoryIterator.hasNext());
}
@Test
public void testReadingMultipleWarcRecordsUsingIterator() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String url1 = "http://a.com";
    String url2 = "http://b.com";
    Page target1 = new Page(new URL(url1), html);
    Page target2 = new Page(new URL(url2), html);
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();
    RepositoryIterator repositoryIterator = repository.iterator();
    // then
    assertTrue(repositoryIterator.hasNext());
    WARCRecord record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url1));
    assertTrue(repositoryIterator.hasNext());
    record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url2));
    assertFalse(repositoryIterator.hasNext());
}
/**
 * Create a test revisit record referring to Resource {@code revisited}.
 * @param timestamp CDX-style 14-digit timestamp
 * @param revisited Capture being revisited (must be a {@link WarcResource}
 *        or {@code ClassCastException} will be the result)
 * @param withHeader {@code true} unless you want to emulate the old
 *        implementation where the revisit record had no HTTP headers.
 * @return new Resource object
 * @throws IOException for unexpected I/O error building payload
 */
public static Resource createTestRevisitResource(String timestamp,
        Resource revisited, boolean withHeader) throws IOException {
    String clen = revisited.getHttpHeaders().get("Content-Length");
    int len = clen != null ? Integer.parseInt(clen) : -1;
    TestWARCRecordInfo recinfo = TestWARCRecordInfo
            .createRevisitHttpResponse("text/html", len, withHeader);
    recinfo.setCreate14DigitDateFromDT14(timestamp);
    ArchiveRecordHeader warcHeader = ((WarcResource) revisited).getWarcHeaders();
    recinfo.addExtraHeader("WARC-Refers-To-Target-URI", warcHeader.getUrl());
    recinfo.addExtraHeader("WARC-Refers-To-Date", warcHeader.getDate());
    recinfo.setUrl(warcHeader.getUrl());
    TestWARCReader ar = new TestWARCReader(recinfo);
    WARCRecord rec = ar.get(0);
    WarcResource resource = new WarcResource(rec, ar);
    resource.parseHeaders();
    return resource;
}
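A hedged sketch of how a test might use this factory: build a revisit of an existing capture and check that it points back at the original URI. The "original" resource is assumed to be a WarcResource created elsewhere in the fixture; JUnit's assertEquals is assumed.

// Hypothetical test usage of createTestRevisitResource(...).
WarcResource revisit = (WarcResource) createTestRevisitResource(
        "20140101000000", original, true);
assertEquals(original.getWarcHeaders().getUrl(),
        revisit.getWarcHeaders().getHeaderValue("WARC-Refers-To-Target-URI"));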