// Re-write an ARC record via ARCWriter.write(uri, contentType, hostIP,
// fetchBeginTimeStamp, recordLength, in). The mimetype, ip, time, length,
// and offset variables come from the surrounding context of this snippet.
writer.write(r.getHeader().getUrl(), mimetype, ip, time,
        (int) (length - offset), r);
protected void write(final WARCWriter writer, final ARCRecord r) throws IOException {
    WARCRecordInfo recordInfo = new WARCRecordInfo();
    recordInfo.setUrl(r.getHeader().getUrl());
    recordInfo.setContentStream(r);
    recordInfo.setContentLength(r.getHeader().getLength());
    // (snippet truncated in the original: the rest of the record setup,
    // such as type, mimetype, and record ID, plus the final
    // writer.writeRecord(recordInfo) call, is not shown)
}
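For context, a minimal sketch of the conversion loop that a method like the one above would sit inside: read each record from an ARC file and hand it to write(...). The file name and the warcWriter variable are illustrative assumptions, not the tool's actual wiring.

// Hedged sketch: iterate an ARC file and convert each record.
// warcWriter is assumed to be an already-configured WARCWriter.
ARCReader reader = ARCReaderFactory.get(new File("input.arc.gz"));
for (ArchiveRecord record : reader) {
    write(warcWriter, (ARCRecord) record);
}
reader.close();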
System.out.println(r.getHeader().getUrl());
System.out.println();
public void map(LongWritable key, WarcRecordWritable record, Context context)
        throws IOException, InterruptedException {
    context.getCounter(Records.TOTAL).increment(1);
    ArchiveRecordHeader header = record.getRecord().getHeader();
    // Skip everything that is not a WARC response record. (The original
    // snippet had the test inverted, which would have dropped all responses.)
    if (!"response".equals(header.getHeaderValue("WARC-Type"))) {
        return;
    }
    String url = header.getUrl();
    if ((url != null) && url.startsWith("http://")) {
        KEY.set(url);
        context.write(KEY, VALUE);
    }
}
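A hedged sketch of the driver that would run a mapper like this as a map-only Hadoop job. WarcInputFormat and ExtractUrlsMapper are illustrative names, not confirmed API; standard org.apache.hadoop.mapreduce imports are assumed.

// Hypothetical job wiring for the map() above.
Job job = Job.getInstance(new Configuration(), "extract-urls");
job.setJarByClass(ExtractUrlsMapper.class);       // assumed mapper class name
job.setInputFormatClass(WarcInputFormat.class);   // assumed WARC-aware InputFormat
job.setMapperClass(ExtractUrlsMapper.class);
job.setNumReduceTasks(0);                         // map-only: emit (url, value) pairs
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);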
protected String outputCdx(final String strippedFileName) throws IOException {
    // Read the whole record so we get out a hash. Should be safe calling
    // close on an already-closed record.
    close();
    ArchiveRecordHeader h = getHeader();
    StringBuilder buffer =
            new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
    buffer.append(h.getDate());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getIp4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getUrl());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getMimetype4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getStatusCode4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(getDigest4Cdx(h));
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getOffset());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(h.getLength());
    buffer.append(ArchiveFileConstants.SINGLE_SPACE);
    buffer.append(strippedFileName != null ? strippedFileName : '-');
    return buffer.toString();
}
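Since outputCdx(...) is protected, external callers cannot invoke it directly; a minimal sketch of equivalent usage is to iterate an archive and print the same header fields it assembles. The file name is illustrative; ArchiveReaderFactory and the header accessors are part of the webarchive-commons API.

// Hedged sketch: print CDX-style fields for every record in a WARC file.
ArchiveReader reader = ArchiveReaderFactory.get(new File("example.warc.gz"));
for (ArchiveRecord record : reader) {
    ArchiveRecordHeader h = record.getHeader();
    System.out.println(h.getDate() + " " + h.getUrl() + " "
            + h.getOffset() + " " + h.getLength());
}
reader.close();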
/**
 * Is it likely that this record contains headers?
 * This method returns true if the body is an HTTP response that includes
 * HTTP response headers, or an HTTP request that includes request headers,
 * etc. Be aware that headers in content are distinct from
 * {@link ArchiveRecordHeader} 'headers'.
 * @return True if this record's content has headers.
 */
public boolean hasContentHeaders() {
    final String url = getHeader().getUrl();
    if (url == null) {
        return false;
    }
    if (!url.toLowerCase().startsWith("http")) {
        return false;
    }
    if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
        return false;
    }
    return true;
}
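A hedged sketch of how this check is typically used: as a cheap guard before attempting to parse HTTP headers out of a record body. The parseHttpHeaders helper is hypothetical, named only for illustration.

// Only http(s) captures whose declared length exceeds
// MIN_HTTP_HEADER_LENGTH pass the guard.
if (record.hasContentHeaders()) {
    parseHttpHeaders(record);  // hypothetical helper, not part of the API
}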
// Build a timestamp/URL key from the record header
// (snippet truncated in the original).
if (!header.getHeaderFields().isEmpty()) {
    newKey = header.getDate().replaceAll("[^0-9]", "") + "/" + header.getUrl();
}
@Override
public void analyse(String source, ArchiveRecordHeader header,
        InputStream tikainput, SolrRecord solr) {
    final String url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
    log.debug("Analysing " + url);
    final long start = System.nanoTime();
    // Analyse with Tika:
    try {
        if (passUriToFormatTools) {
            solr = this.extract(source, solr, tikainput, url);
        } else {
            solr = this.extract(source, solr, tikainput, null);
        }
    } catch (Exception i) {
        log.error(i + ": " + i.getMessage() + ";tika; " + url
                + "@" + header.getOffset());
    }
    Instrument.timeRel("WARCPayloadAnalyzers.analyze#total",
            "WARCPayloadAnalyzers.analyze#tikasolrextract", start);
}
System.out.println("URL: " + r.getHeader().getUrl()); System.out.println();
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths,
        String filename, ArchiveRecordHeader header) {
    defaultMax = defaultMaxFieldLength;
    maxLengths = maxFieldLengths;
    setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset());
    setField(SolrFields.SOURCE_FILE, filename);
    setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset());
    setField(SolrFields.SOLR_URL,
            Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
    setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN);
}
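This constructor is the "placeholder on failure" path (note the "exception-at-" ID), so a hedged usage sketch would build such a record when payload extraction throws. The field-length limit and empty map below are arbitrary illustrative values.

// Illustrative only: placeholder SolrRecord for a record whose
// payload processing failed.
SolrRecord solr = new SolrRecord(2048, new HashMap<String, Integer>(),
        warcFilename, record.getHeader());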
@Test
public void shouldStoreAndIterateOverData() throws IOException {
    String folder = tempFolder.newFolder().toString();
    Page target = new Page(new URL(url), html, responseHeaders);
    target.setTargetRelevance(TargetRelevance.RELEVANT);
    target.setFetchTime(System.currentTimeMillis());
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target);
    repository.close();
    File testFolder = new File(folder);
    if (testFolder.isDirectory()) {
        File[] allFiles = testFolder.listFiles();
        assertTrue(allFiles[0].getName().startsWith("crawl_data"));
    }
    Iterator<WARCRecord> it = repository.iterator();
    // then
    assertThat(it.hasNext(), is(true));
    WARCRecord page = it.next();
    assertThat(it.hasNext(), is(false));
    assertThat(page.getHeader().getUrl(), is(url));
    assertThat(page.getHeader().getHeaderValue("Content-Type"),
            is(WARCConstants.HTTP_RESPONSE_MIMETYPE));
    assertThat(page.getHeader().getHeaderValue("ACHE-IsRelevant"),
            is(target.getTargetRelevance().isRelevant() + ""));
    assertThat(Double.valueOf(page.getHeader().getHeaderValue("ACHE-Relevance").toString()),
            is(Double.valueOf(target.getTargetRelevance().getRelevance())));
}
@Test
public void testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String url1 = "http://a.com";
    Map<String, List<String>> headers = new HashMap<>();
    Character invalidChar = new Character((char) 0x80);
    String headerValue = "inline; filename=\"Invalid_" + invalidChar + "\"";
    headers.put("Content-Disposition", asList(headerValue));
    Page target1 = new Page(new URL(url1), html, headers);
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target1);
    repository.close();
    RepositoryIterator repositoryIterator = repository.iterator();
    // then
    assertTrue(repositoryIterator.hasNext());
    WARCRecord record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url1));
    String recordData = IOUtils.toString(record);
    assertThat(recordData, containsString(html));
    assertThat(recordData, containsString(headerValue));
    assertFalse(repositoryIterator.hasNext());
}
@Test
public void testReadingMultipleWarcRecordsUsingIterator() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();
    String url1 = "http://a.com";
    String url2 = "http://b.com";
    Page target1 = new Page(new URL(url1), html);
    Page target2 = new Page(new URL(url2), html);
    WarcTargetRepository repository = new WarcTargetRepository(folder);
    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();
    RepositoryIterator repositoryIterator = repository.iterator();
    // then
    assertTrue(repositoryIterator.hasNext());
    WARCRecord record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url1));
    assertTrue(repositoryIterator.hasNext());
    record = repositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url2));
    assertFalse(repositoryIterator.hasNext());
}
/**
 * Create a test revisit record referring to Resource {@code revisited}.
 * @param timestamp CDX-style 14-digit timestamp
 * @param revisited Capture being revisited (must be a {@link WarcResource}
 *        or {@code ClassCastException} will be the result)
 * @param withHeader {@code true} unless you want to emulate the old
 *        implementation where the revisit record had no HTTP headers.
 * @return new Resource object
 * @throws IOException for unexpected I/O error building payload
 */
public static Resource createTestRevisitResource(String timestamp,
        Resource revisited, boolean withHeader) throws IOException {
    String clen = revisited.getHttpHeaders().get("Content-Length");
    int len = clen != null ? Integer.parseInt(clen) : -1;
    TestWARCRecordInfo recinfo = TestWARCRecordInfo
            .createRevisitHttpResponse("text/html", len, withHeader);
    recinfo.setCreate14DigitDateFromDT14(timestamp);
    ArchiveRecordHeader warcHeader = ((WarcResource) revisited).getWarcHeaders();
    recinfo.addExtraHeader("WARC-Refers-To-Target-URI", warcHeader.getUrl());
    recinfo.addExtraHeader("WARC-Refers-To-Date", warcHeader.getDate());
    recinfo.setUrl(warcHeader.getUrl());
    TestWARCReader ar = new TestWARCReader(recinfo);
    WARCRecord rec = ar.get(0);
    WarcResource resource = new WarcResource(rec, ar);
    resource.parseHeaders();
    return resource;
}
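A hedged sketch of how a test might use this factory: build a revisit of an existing capture and check that it points back at the original URI. The "original" resource is assumed to be a WarcResource created elsewhere in the fixture; JUnit's assertEquals is assumed.

// Hypothetical test usage of createTestRevisitResource(...).
WarcResource revisit = (WarcResource) createTestRevisitResource(
        "20140101000000", original, true);
assertEquals(original.getWarcHeaders().getUrl(),
        revisit.getWarcHeaders().getHeaderValue("WARC-Refers-To-Target-URI"));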