this.parser = parser == null ? new Parser(config, tldList) : parser; this.robotstxtServer = robotstxtServer;
parser.parse(page, curURL.getURL());
/**
 * Convenience constructor that builds a default {@link Parser} from the given
 * configuration and delegates to the full constructor.
 *
 * @param config          crawler configuration used both here and for the default parser
 * @param pageFetcher     fetcher used to retrieve pages
 * @param robotstxtServer robots.txt policy server consulted before fetching
 * @throws Exception propagated from the delegated constructor
 */
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer)
        throws Exception {
    this(config, pageFetcher, new Parser(config), robotstxtServer);
}
/**
 * Fetches the page at {@code url}, and on an HTTP 200 response downloads and
 * parses its content.
 *
 * @param url the URL to fetch
 * @return the parsed {@link Page}, or {@code null} if the fetch failed, the
 *         status code was not 200, or any exception occurred
 */
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page, pageFetcher.getConfig().getMaxDownloadSize());
            parser.parse(page, curURL.getURL());
            return page;
        }
    } catch (Exception e) {
        // Parameterized SLF4J logging instead of string concatenation; passing the
        // exception as the final argument preserves the full stack trace.
        logger.error("Error occurred while fetching url: {}", curURL.getURL(), e);
    } finally {
        // Release the underlying HTTP resources even when the content was never read.
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
}
/**
 * Creates a downloader wired with a default crawl configuration, a parser, and
 * a page fetcher all sharing that configuration.
 *
 * @throws InstantiationException if a component cannot be instantiated
 * @throws IllegalAccessException if a component constructor is inaccessible
 */
public Downloader() throws InstantiationException, IllegalAccessException {
    CrawlConfig crawlConfig = new CrawlConfig();
    parser = new Parser(crawlConfig);
    pageFetcher = new PageFetcher(crawlConfig);
}
if (fetched && parser.parse(page, curURL.getURL()))
@Override public void prepare(Map map, TopologyContext context, OutputCollector collector) { this.stormConfig = (Map<String,String>) map; this.collector = collector; stayOnDomain = Boolean.valueOf(stormConfig.get("stay.on.domain")); throttlePauseMs = Integer.valueOf(stormConfig.get("throttle.pause.ms")); //Setup crawler4j config = new CrawlConfig(); config.setIncludeHttpsPages(true); pageFetcher = new PageFetcher(config); robotstxtConfig = new RobotstxtConfig(); robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); parser = new Parser(config); }
parser.parse(page, curURL.getURL());