/**
 * Sniff the MIME type of a single file and print one tab-separated report
 * line to standard output: file path, the {@code content-type} header of the
 * resource (parameters after {@code ';'} stripped, {@code "-"} if the header
 * is absent), and the type returned by {@code detector} ({@code "-"} if it
 * returns {@code null}).
 * <p>Any exception is reported on the same output as
 * {@code path \t - \t ERROR message}; nothing is thrown to the caller.</p>
 * <p>The input stream opened on {@code file} is always closed before the
 * report line is printed, so exactly one line is printed per call.</p>
 * @param detector detector used to sniff the resource content
 * @param file file to read the resource from
 */
public static void detectFile(MimeTypeDetector detector, File file) {
    try {
        String contentType;
        String mimeType;
        InputStream is = new FileInputStream(file);
        try {
            // second argument is presumably the offset of the record in the stream - TODO confirm
            Resource resource = JWATResource.getResource(is, 0);
            contentType = resource.getHeader("content-type");
            if (contentType == null) {
                contentType = "-";
            } else {
                // drop parameters such as "; charset=..."
                int p = contentType.indexOf(';');
                if (p >= 0) {
                    contentType = contentType.substring(0, p).trim();
                }
            }
            mimeType = detector.sniff(resource);
            if (mimeType == null) {
                mimeType = "-";
            }
        } finally {
            // release the file handle whether or not sniffing succeeded
            is.close();
        }
        System.out.println(file.getPath() + "\t" + contentType + "\t" + mimeType);
    } catch (Exception ex) {
        System.out.println(file.getPath() + "\t" + "-" + "\tERROR " + ex.getMessage());
    }
}
}
mime = result.getDuplicatePayload().getMimeType(); } else { mime = payloadResource.getHeader("Content-Type");
mime = result.getDuplicatePayload().getMimeType(); } else { mime = payloadResource.getHeader("Content-Type");
/** * Read first {@code sniffLength} bytes of {@code resource}'s payload, * decoding {@code Content-Encoding} if any. Reset {@code resource}'s * read position back to zero. * @param resource Resource to load bytes from * @return bytes, zero-padded if payload is shorter. * @throws IOException */ protected byte[] peekContent(Resource resource) throws IOException { byte[] bbuffer = new byte[Math.max(sniffLength, MINIMUM_SNIFF_BUFFER_SIZE)]; String encoding = resource.getHeader("content-encoding"); if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) { // use larger readlimit, because gzip-ed data can be larger than the original // at low compression level. resource.mark(sniffLength + 100); @SuppressWarnings("resource") Resource z = new GzipDecodingResource(resource); z.read(bbuffer, 0, sniffLength); resource.reset(); } else { resource.mark(sniffLength); resource.read(bbuffer, 0, sniffLength); resource.reset(); } return bbuffer; }
/** * Read first {@code sniffLength} bytes of {@code resource}'s payload, * decoding {@code Content-Encoding} if any. Reset {@code resource}'s * read position back to zero. * @param resource Resource to load bytes from * @return bytes, zero-padded if payload is shorter. * @throws IOException */ protected byte[] peekContent(Resource resource) throws IOException { byte[] bbuffer = new byte[Math.max(sniffLength, MINIMUM_SNIFF_BUFFER_SIZE)]; String encoding = resource.getHeader("content-encoding"); if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) { // use larger readlimit, because gzip-ed data can be larger than the original // at low compression level. resource.mark(sniffLength + 100); @SuppressWarnings("resource") Resource z = new GzipDecodingResource(resource); z.read(bbuffer, 0, sniffLength); resource.reset(); } else { resource.mark(sniffLength); resource.read(bbuffer, 0, sniffLength); resource.reset(); } return bbuffer; }
mimeType = resource.getHeader("Content-Type");
mimeType = resource.getHeader("Content-Type");
String location = resource.getHeader("Location");
String location = resource.getHeader("Location");
/** * resource record, typically used for archiving ftp fetches. * @throws Exception */ public void testResourceRecord() throws Exception { final String ct = "text/plain"; final byte[] block = "blahblahblah\n".getBytes(); WARCRecordInfo recinfo = new TestWARCRecordInfo(block); recinfo.setType(WARCRecordType.resource); recinfo.setUrl("ftp://ftp.example.com/afile.txt"); recinfo.setMimetype(ct); Resource res = createResource(recinfo); res.parseHeaders(); int scode = res.getStatusCode(); assertEquals("statusCode", 200, scode); Map<String, String> headers = res.getHttpHeaders(); assertNotNull("headers", headers); assertEquals("content-type", ct, res.getHeader("Content-Type")); // must have Date header, in HTTP Date format. String date = res.getHeader("Date"); assertNotNull("has date header", date); new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH).parse(date); res.close(); }
/** * plain HTTP response (without any transfer/content-encoding) * @throws Exception */ public void testPlainHttpRecord() throws Exception { String payload = "hogehogehogehogehoge"; WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(payload); Resource res = createResource(recinfo); res.parseHeaders(); assertEquals("statusCode", 200, res.getStatusCode()); assertEquals("content-type", "text/plain", res.getHeader("Content-Type")); byte[] buf = new byte[payload.getBytes().length + 1]; int n = res.read(buf); assertEquals("content length", buf.length - 1, n); res.close(); } /**
/**
 * HTTP response that is chunked-encoded but not compressed: expects status
 * code 200, {@code text/plain} content type, and a single {@code read} into
 * a buffer one byte larger than the payload to return exactly the payload
 * length.
 * @throws Exception
 */
public void testPlainChunkedHttpRecord() throws Exception {
    final String body = "hogehogehogehogehoge";
    final byte[] bodyBytes = body.getBytes("UTF-8");
    WARCRecordInfo recordInfo = new TestWARCRecordInfo(
            TestWARCRecordInfo.buildHttpResponseBlock(
                    "200 OK", "text/plain", bodyBytes, true));

    Resource resource = createResource(recordInfo);
    resource.parseHeaders();

    assertEquals("statusCode", 200, resource.getStatusCode());
    assertEquals("content-type", "text/plain", resource.getHeader("Content-Type"));

    // buffer has one spare byte so that an over-long payload would be noticed
    final int expectedLength = body.getBytes().length;
    byte[] readBuffer = new byte[expectedLength + 1];
    int bytesRead = resource.read(readBuffer);
    assertEquals("content length", expectedLength, bytesRead);

    resource.close();
}
/**
 * HTTP response with gzip-compressed body: expects status code 200, the
 * declared content type, {@code TextReplayRenderer.decodeResource} to wrap
 * the resource in a {@link GzipDecodingResource}, and a single {@code read}
 * from the wrapper to return exactly the uncompressed payload length.
 * @throws Exception
 */
public void testCompressedHttpRecord() throws Exception {
    final String body = "hogehogehogehogehoge";
    final String contentType = "text/plain";
    WARCRecordInfo recordInfo = new TestWARCRecordInfo(
            TestWARCRecordInfo.buildCompressedHttpResponseBlock(
                    contentType, body.getBytes()));

    Resource resource = createResource(recordInfo);
    resource.parseHeaders();

    assertEquals("statusCode", 200, resource.getStatusCode());
    assertEquals("content-type", contentType, resource.getHeader("Content-Type"));

    Resource decoded = TextReplayRenderer.decodeResource(resource);
    boolean isGzipWrapper = decoded instanceof GzipDecodingResource;
    assertTrue("wrapped with GzipDecodingResource", isGzipWrapper);

    // buffer has one spare byte so that an over-long payload would be noticed
    final int expectedLength = body.getBytes().length;
    byte[] readBuffer = new byte[expectedLength + 1];
    int bytesRead = decoded.read(readBuffer);
    assertEquals("content length", expectedLength, bytesRead);

    resource.close();
}
/**
 * HTTP response that is both gzip-compressed and chunked-encoded: expects
 * status code 200, the declared content type,
 * {@code TextReplayRenderer.decodeResource} to wrap the resource in a
 * {@link GzipDecodingResource}, and a single {@code read} from the wrapper
 * to return exactly the uncompressed payload length.
 * @throws Exception
 */
public void testCompressedChunkedHttpRecord() throws Exception {
    final String body = "hogehogehogehogehoge";
    final String contentType = "text/plain";
    WARCRecordInfo recordInfo = new TestWARCRecordInfo(
            TestWARCRecordInfo.buildCompressedHttpResponseBlock(
                    contentType, body.getBytes(), true));

    Resource resource = createResource(recordInfo);
    resource.parseHeaders();

    assertEquals("statusCode", 200, resource.getStatusCode());
    assertEquals("content-type", contentType, resource.getHeader("Content-Type"));

    Resource decoded = TextReplayRenderer.decodeResource(resource);
    boolean isGzipWrapper = decoded instanceof GzipDecodingResource;
    assertTrue("wrapped with GzipDecodingResource", isGzipWrapper);

    // buffer has one spare byte so that an over-long payload would be noticed
    final int expectedLength = body.getBytes().length;
    byte[] readBuffer = new byte[expectedLength + 1];
    int bytesRead = decoded.read(readBuffer);
    assertEquals("content length", expectedLength, bytesRead);

    resource.close();
}
/** * new, current revisit record, which has just HTTP response line and * headers part of the capture. * <p>Expectations: * TextReplayRender receives revisit WarcResource as {@code httpHeaderResource}, * and calls following methods on it:</p> * <ul> * <li>{@link WarcResource#getStatusCode()}</li> * <li>{@link WarcResource#getHttpHeaders()} (ok to return null)</li> * </ul> * @throws Exception */ public void testRevisitRecord() throws Exception { final String ct = "text/html"; WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(ct, 1345); Resource res = createResource(recinfo); res.parseHeaders(); // these are from this record. assertEquals("statusCode", 200, res.getStatusCode()); assertEquals("content-type", ct, res.getHeader("Content-Type")); StandardCharsetDetector csd = new StandardCharsetDetector(); // assuming WaybackRequest (3rd parameter) is not used in getCharset() csd.getCharset(res, res, null); res.close(); }
public void testUrlAgnosticRevisitRecord() throws Exception { final String ctype = "text/html"; WARCRecordInfo recinfo = TestWARCRecordInfo .createUrlAgnosticRevisitHttpResponse(ctype, 1345); Resource res = createResource(recinfo); res.parseHeaders(); // these are from this record. assertEquals("statusCode", 200, res.getStatusCode()); assertEquals("content-type", ctype, res.getHeader("Content-Type")); assertEquals("http://example.com/", res.getRefersToTargetURI()); assertEquals("20140101101010", res.getRefersToDate()); StandardCharsetDetector csd = new StandardCharsetDetector(); // assuming WaybackRequest (3rd parameter) is not used in getCharset() csd.getCharset(res, res, null); res.close(); }