for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) { WARCRecord r = (WARCRecord)i.next(); if (!isARCType(r.getHeader().getMimetype())) { continue; String mimetype = r.getHeader().getMimetype();
} else { recordInfo.setType(WARCRecordType.resource); recordInfo.setMimetype(r.getHeader().getMimetype()); recordInfo.setRecordId(((WARCWriterPoolSettings)writer.settings).getRecordIDGenerator().getRecordID());
/**
 * Returns the mimetype reported by the given record header.
 *
 * <p>This is a plain pass-through to {@code h.getMimetype()}; judging by the method name the
 * value is presumably destined for a CDX line, but that use is not visible here.</p>
 *
 * @param h header of the archive record; dereferenced without a null check
 * @return exactly what {@code h.getMimetype()} returns, unmodified
 */
protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
    final String headerMimetype = h.getMimetype();
    return headerMimetype;
}
/**
 * Returns the mimetype reported by the given record header.
 *
 * <p>This is a plain pass-through to {@code h.getMimetype()}; judging by the method name the
 * value is presumably destined for a CDX line, but that use is not visible here.</p>
 *
 * @param h header of the archive record; dereferenced without a null check
 * @return exactly what {@code h.getMimetype()} returns, unmodified
 */
protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
    final String headerMimetype = h.getMimetype();
    return headerMimetype;
}
/**
 * Returns the mimetype reported by the given record header.
 *
 * <p>This is a plain pass-through to {@code h.getMimetype()}; judging by the method name the
 * value is presumably destined for a CDX line, but that use is not visible here.</p>
 *
 * @param h header of the archive record; dereferenced without a null check
 * @return exactly what {@code h.getMimetype()} returns, unmodified
 */
protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
    final String headerMimetype = h.getMimetype();
    return headerMimetype;
}
@Override public void map(Text key, ArchiveReader value, Context context) throws IOException { for (ArchiveRecord r : value) { // Skip any records that are not JSON if (!r.getHeader().getMimetype().equals("application/json")) { continue; } try { context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1); // Convenience function that reads the full message into a raw byte array byte[] rawData = IOUtils.toByteArray(r, r.available()); String content = new String(rawData); JSONObject json = new JSONObject(content); try { String server = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata").getJSONObject("HTTP-Response-Metadata").getJSONObject("Headers").getString("Server"); outKey.set(server); context.write(outKey, outVal); } catch (JSONException ex) { // If we reach here, the JSON object didn't have the header we were looking for // There are likely better ways to check for json["Envelope"]["Payload-Metadata"][...] but this is concise } } catch (Exception ex) { LOG.error("Caught Exception", ex); context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1); } } } }
LOG.debug(r.getHeader().getUrl() + " -- " + r.available()); if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {
ArchiveRecord r = i.next(); if (r.getHeader().getLength() <= 0 && r.getHeader().getMimetype(). equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { throw new IOException("record content is empty.");
ArchiveRecord r = i.next(); if (r.getHeader().getLength() <= 0 && r.getHeader().getMimetype(). equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { throw new IOException("record content is empty.");
ArchiveRecord r = i.next(); if (r.getHeader().getLength() <= 0 && r.getHeader().getMimetype(). equals(MimetypeUtils.NO_TYPE_MIMETYPE)) { throw new IOException("record content is empty.");
output.collect( new Text("content-types"), new Text("CONTENT-TYPE\t"+header.getMimetype()) ); String date = header.getDate(); if( date != null && date.length() > 4 ) {
for (ArchiveRecord r : value) { try { if (r.getHeader().getMimetype().equals("text/plain")) { context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1); LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
output.collect( new Text("content-types"), new Text("CONTENT-TYPE\t"+header.getMimetype()) ); String date = header.getDate(); if( date != null && date.length() > 4 ) {
for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) { WARCRecord r = (WARCRecord)i.next(); if (!isARCType(r.getHeader().getMimetype())) { continue; String mimetype = r.getHeader().getMimetype();
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec) throws IOException { ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. byte [] statusBytes = LaxHttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + ((statusBytes == null) ? "(null)" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); annotater.annotateHTTPContent(result,rec,headers,header.getMimetype()); return result; }
private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result, WARCRecord rec) throws IOException { ArchiveRecordHeader header = rec.getHeader(); // need to parse the documents HTTP message and headers here: WARCReader // does not implement this... yet.. byte [] statusBytes = LaxHttpParser.readRawLine(rec); int eolCharCount = getEolCharsCount(statusBytes); if (eolCharCount <= 0) { throw new RecoverableIOException("Failed to read http status where one " + " was expected: " + ((statusBytes == null) ? "(null)" : new String(statusBytes))); } String statusLine = EncodingUtil.getString(statusBytes, 0, statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING); if ((statusLine == null) || !StatusLine.startsWithHTTP(statusLine)) { throw new RecoverableIOException("Failed parse of http status line."); } StatusLine status = new StatusLine(statusLine); result.setHttpCode(String.valueOf(status.getStatusCode())); Header[] headers = LaxHttpParser.parseHeaders(rec, ARCConstants.DEFAULT_ENCODING); annotater.annotateHTTPContent(result,rec,headers,header.getMimetype()); return result; }
.get("WARC-Identified-Payload-Type")); } else { contentType = header.getMimetype();
String mime = annotater.transformHTTPMime(header.getMimetype()); if(mime != null && mime.equals("text/dns")) {
String mime = annotater.transformHTTPMime(header.getMimetype()); if(mime != null && mime.equals("text/dns")) {
} else { recordInfo.setType(WARCRecordType.resource); recordInfo.setMimetype(r.getHeader().getMimetype()); recordInfo.setRecordId(((WARCWriterPoolSettings)writer.settings).getRecordIDGenerator().getRecordID());