/**
 * Creates a resource with an explicit fetch timestamp and status.
 *
 * <p>Delegates to the four-argument constructor and then stores the string
 * form of {@code status}.
 *
 * @param url            passed through to the four-argument constructor
 * @param group          passed through to the four-argument constructor
 * @param sparklerJob    passed through to the four-argument constructor
 * @param fetchTimestamp passed through to the four-argument constructor
 * @param numTries       accepted but not stored by this constructor
 * @param numFetches     accepted but not stored by this constructor
 * @param status         status whose {@code toString()} value is stored; must not be {@code null}
 */
public Resource(String url, String group, JobContext sparklerJob, Date fetchTimestamp, Integer numTries,
                Integer numFetches, ResourceStatus status) {
    this(url, group, sparklerJob, fetchTimestamp);
    // numTries and numFetches are not assigned here; the former
    // "this.numFetches = numFetches" assignment was already disabled.
    // The status assignment must stay on its own line so that no line
    // comment can swallow it together with the closing brace.
    this.status = status.toString();
}
/**
 * Creates a resource whose id is derived from its URL, job and fetch timestamp.
 *
 * <p>Delegates to the three-argument constructor, then assigns the id computed
 * by {@code resourceId(url, sparklerJob, fetchTimestamp)} and records the
 * fetch timestamp.
 *
 * @param url            resource URL; also an input to the id
 * @param group          passed through to the three-argument constructor
 * @param sparklerJob    job context; also an input to the id
 * @param fetchTimestamp timestamp stored on the resource and used as an input to the id
 */
public Resource(
        String url,
        String group,
        JobContext sparklerJob,
        Date fetchTimestamp) {
    this(url, group, sparklerJob);

    // The id is computed from the constructor arguments, before the field is stored.
    this.id = resourceId(url, sparklerJob, fetchTimestamp);
    this.fetchTimestamp = fetchTimestamp;
}
/**
 * Creates a resource grouped by the host of its URL, with a parent reference.
 *
 * <p>The group handed to the three-argument constructor is
 * {@code new URL(url).getHost()}. The id is computed by
 * {@code resourceId(url, sparklerJob, fetchTimestamp)}.
 *
 * <p>NOTE(review): {@code fetchTimestamp} only contributes to the id here; this
 * constructor does not assign the {@code fetchTimestamp} field. Confirm that
 * this is intended.
 *
 * @param url            resource URL; must be parseable by {@link URL}
 * @param discoverDepth  value stored as the discover depth
 * @param sparklerJob    job context; also an input to the id
 * @param status         status whose {@code toString()} value is stored; must not be {@code null}
 * @param fetchTimestamp timestamp used as an input to the id
 * @param parent         value stored as the parent
 * @throws MalformedURLException if {@code url} cannot be parsed as a URL
 */
public Resource(
        String url,
        Integer discoverDepth,
        JobContext sparklerJob,
        ResourceStatus status,
        Date fetchTimestamp,
        String parent) throws MalformedURLException {
    this(url, new URL(url).getHost(), sparklerJob);

    this.id = resourceId(url, sparklerJob, fetchTimestamp);

    // Plain field assignments taken straight from the arguments.
    this.parent = parent;
    this.discoverDepth = discoverDepth;
    this.status = status.toString();
}
/**
 * Fetches the given resource and converts any failure into an error result
 * instead of propagating the exception.
 *
 * @param resource resource to fetch; on failure its status is set to
 *                 {@code ResourceStatus.ERROR}
 * @return the result of {@code fetch(resource)}; or, when that call throws,
 *         a {@code FetchedData} with zero-length content, an empty content type
 *         and status code 404 for a {@link FileNotFoundException} or
 *         {@code DEFAULT_ERROR_CODE} for anything else
 */
@Override
public FetchedData apply(Resource resource) {
    try {
        return this.fetch(resource);
    } catch (Exception e) {
        // A missing file is reported as 404; every other failure uses the default code.
        final int errorCode = (e instanceof FileNotFoundException) ? 404 : DEFAULT_ERROR_CODE;

        // Short line at WARN, full stack trace only at DEBUG.
        LOG.warn("FETCH-ERROR {}", resource.getUrl());
        LOG.debug(e.getMessage(), e);

        final FetchedData failure = new FetchedData(new byte[0], "", errorCode);
        resource.setStatus(ResourceStatus.ERROR.toString());
        failure.setResource(resource);
        return failure;
    }
}
}
/**
 * Fetches a resource through the HtmlUnit {@code driver}.
 *
 * <p>Visible steps: logs the URL, loads the page with
 * {@code driver.getPage(resource.getUrl())}, copies {@code boundedStream} into an
 * in-memory buffer and builds a {@code FetchedData} from the buffered bytes plus
 * the response's content type and status code. When the web response reports a
 * content length greater than 0 and below {@code Integer.MAX_VALUE}, that length
 * is set on the fetched data. The resource status is set to {@code FETCHED} and
 * {@code headers} are attached. A separate path builds an empty
 * {@code "unknown/unknown"} result with status code 0 and marks the resource
 * {@code ERROR}.
 *
 * <p>NOTE(review): {@code truncated} is computed as
 * {@code contentLength > fetchedData.getContentLength()} right after
 * {@code setContentLength((int) contentLength)}; presumably that comparison is
 * always false if the getter returns the value just set. Verify.
 *
 * <p>NOTE(review): the declarations of {@code boundedStream}, {@code response},
 * {@code truncated}, {@code headers} and {@code respHeaders}, and several closing
 * braces, are not present in this text. Confirm against the complete method
 * before changing any code here.
 *
 * @param resource resource whose URL is fetched and whose status is updated
 * @return the fetched data
 * @throws Exception declared by the signature; the visible code does not show which failures propagate
 */
@Override public FetchedData fetch(Resource resource) throws Exception { LOG.info("HtmlUnit FETCHER {}", resource.getUrl()); FetchedData fetchedData; try { Page page = driver.getPage(resource.getUrl()); try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { IOUtils.copy(boundedStream, out); fetchedData = new FetchedData(out.toByteArray(), response.getContentType(), response.getStatusCode()); long contentLength = page.getWebResponse().getContentLength(); if (contentLength > 0 && contentLength < Integer.MAX_VALUE) { fetchedData.setContentLength((int) contentLength); truncated = (contentLength > fetchedData.getContentLength()); if (truncated) { LOG.info("Content Truncated: {}, TotalSize={}", resource.getUrl(), contentLength); resource.setStatus(ResourceStatus.FETCHED.toString()); fetchedData.setHeaders(headers); if (respHeaders != null && !respHeaders.isEmpty()){ respHeaders.forEach(item -> { fetchedData = new FetchedData(new byte[0], "unknown/unknown", 0); // fixme: use proper status code resource.setStatus(ResourceStatus.ERROR.toString()); fetchedData.setResource(resource); return fetchedData;
/**
 * Fetches a resource with a plain {@code URLConnection}.
 *
 * <p>Visible steps: logs the URL, opens a connection to it, applies every entry
 * of {@code httpHeaders} as a request property when that map is non-null, sets
 * the read timeout to {@code READ_TIMEOUT} and reads the response code through a
 * cast to {@code HttpURLConnection}. The input stream is opened in a
 * try-with-resources block. Once {@code bufferOutStream.size()} reaches
 * {@code CONTENT_LIMIT}, the content is flagged as truncated and a log line is
 * written. The buffered bytes, the connection's content type and the response
 * code form the {@code FetchedData}; the resource status becomes {@code FETCHED};
 * the connection's header fields are attached; and a truncated fetch adds the
 * {@code TRUNCATED} header with the value {@code "true"}.
 *
 * <p>NOTE(review): the unconditional cast to {@code HttpURLConnection} presumably
 * fails with a {@code ClassCastException} for non-HTTP URLs such as
 * {@code file:}. Verify which URL schemes reach this method.
 *
 * <p>NOTE(review): {@code URLConnection.getHeaderFields()} is documented to
 * return an unmodifiable map; unless {@code setHeaders} copies it, the
 * {@code put(TRUNCATED, ...)} call may throw. Verify.
 *
 * <p>NOTE(review): the read loop, the declaration of {@code bufferOutStream} and
 * several closing braces are not present in this text. Confirm against the
 * complete method before changing any code here.
 *
 * @param resource resource whose URL is fetched and whose status is updated
 * @return the fetched data
 * @throws Exception declared by the signature; the visible code does not show which failures propagate
 */
public FetchedData fetch(Resource resource) throws Exception { LOG.info("DEFAULT FETCHER {}", resource.getUrl()); URLConnection urlConn = new URL(resource.getUrl()).openConnection(); if (httpHeaders != null){ httpHeaders.forEach(urlConn::setRequestProperty); urlConn.setReadTimeout(READ_TIMEOUT); int responseCode = ((HttpURLConnection)urlConn).getResponseCode(); LOG.debug("STATUS CODE : " + responseCode + " " + resource.getUrl()); boolean truncated = false; try (InputStream inStream = urlConn.getInputStream()) { if (bufferOutStream.size() >= CONTENT_LIMIT) { truncated = true; LOG.info("Content Truncated: {}, TotalSize={}, TruncatedSize={}", resource.getUrl(), urlConn.getContentLength(), bufferOutStream.size()); break; byte[] rawData = bufferOutStream.toByteArray(); IOUtils.closeQuietly(bufferOutStream); FetchedData fetchedData = new FetchedData(rawData, urlConn.getContentType(), responseCode); resource.setStatus(ResourceStatus.FETCHED.toString()); fetchedData.setResource(resource); fetchedData.setHeaders(urlConn.getHeaderFields()); if (truncated) { fetchedData.getHeaders().put(TRUNCATED, Collections.singletonList(Boolean.TRUE.toString()));
/**
 * Appends {@code value} to the list of values stored under {@code key},
 * creating that list first when the key has none yet.
 *
 * @param key   key whose value list receives the new element
 * @param value element appended to the end of that list
 */
public void add(K key, V value) {
    // computeIfAbsent performs a single lookup and also installs a fresh list
    // when the key is present but mapped to null, a case in which the previous
    // containsKey/put/get sequence threw a NullPointerException.
    this.computeIfAbsent(key, ignored -> new ArrayList<>()).add(value);
}
}
/**
 * Fetches a resource through the JBrowser {@code driver}, delegating to the
 * superclass fetcher in the cases shown below.
 *
 * <p>Visible steps: logs the URL; checks {@code isWebPage(resource.getUrl())}
 * and logs a fall-back message when it is false; loads the URL with
 * {@code driver.get}; logs the elapsed load time; logs a failure message and
 * returns {@code super.fetch(resource)}; otherwise builds a {@code FetchedData}
 * from {@code html.getBytes()}, the content type {@code "application/html"} and
 * {@code status}, marks the resource {@code FETCHED} and returns the data.
 *
 * <p>NOTE(review): {@code html.getBytes()} is called without a charset, so the
 * encoding presumably follows the platform default. Verify that this is intended.
 *
 * <p>NOTE(review): the declarations of {@code start}, {@code html} and
 * {@code status}, the conditions guarding the fall-back paths and several
 * closing braces are not present in this text. Confirm against the complete
 * method before changing any code here.
 *
 * @param resource resource whose URL is fetched and whose status is updated
 * @return the fetched data, or the result of {@code super.fetch(resource)} on a fall-back path
 * @throws Exception declared by the signature; the visible code does not show which failures propagate
 */
@Override public FetchedData fetch(Resource resource) throws Exception { LOG.info("JBrowser FETCHER {}", resource.getUrl()); FetchedData fetchedData; if (!isWebPage(resource.getUrl())) { LOG.debug("{} not a html. Falling back to default fetcher.", resource.getUrl()); driver.get(resource.getUrl()); LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start)); LOG.info("{} Failed to fetch the page. Falling back to default fetcher.", resource.getUrl()); return super.fetch(resource); fetchedData = new FetchedData(html.getBytes(), "application/html", status); resource.setStatus(ResourceStatus.FETCHED.toString()); fetchedData.setResource(resource); return fetchedData;
public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status) throws MalformedURLException { this(url, new URL(url).getHost(), sparklerJob); this.indexedAt = new Date(); this.id = resourceId(url, sparklerJob, this.indexedAt); this.discoverDepth = discoverDepth; this.status = status.toString(); }
public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status, String parent, Map<String, Double> score) throws MalformedURLException { this(url, new URL(url).getHost(), sparklerJob); this.indexedAt = new Date(); this.id = resourceId(url, sparklerJob, this.indexedAt); this.discoverDepth = discoverDepth; this.status = status.toString(); this.parent = parent; this.score = score; }