// Wire the repo/user processors in as sub-pipelines of the composite pipeline,
// then start crawling GitHub from code4craft's profile with 5 worker threads
// without blocking the caller (runAsync).
// NOTE(review): `pipeline`, `githubRepoProcessor`, `githubUserProcessor` and
// `pageProcessor` are declared outside this fragment — assumed initialized
// before these statements run; confirm against the enclosing method.
pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
/**
 * Entry point: crawls GitHub's explore page into {@code GithubRepo} models,
 * persisting each as a JSON file and resuming URLs from a file-backed cache.
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(0).setRetryTimes(3);
    OOSpider spider = OOSpider.create(site, new JsonFilePageModelPipeline(), GithubRepo.class);
    spider.addUrl("https://github.com/explore");
    // File-backed queue lets an interrupted crawl pick up where it left off.
    spider.setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/"));
    spider.thread(15).run();
}
/**
 * Crawls Douban book-tag listing pages for a fixed slice of tags and feeds
 * the results through {@code simpleBookInfoPipeline} using 8 worker threads.
 */
public void crawl() {
    //RedisUtil.init();
    // NOTE(review): hard-coded subList(145,173) looks like a manual resume
    // window over the full tag list — confirm whether this range should be
    // configurable instead of baked in.
    List<BookTag> bookTag = bookTagMapper.selectAll().subList(145,173);
    Request[] requests = new Request[bookTag.size()];
    int i = 0;
    for(BookTag tag : bookTag) {
        // Appends a fixed query string to each tag URL; the "RedisSuffix"
        // extra is presumably consumed downstream — verify against the
        // processor/pipeline that reads it.
        requests[i++] = new Request(tag.getUrl() + "?start=0&Type=S").setPriority(0).putExtra("RedisSuffix","www.douban.com");
    }
    Spider.create(new SimpleBookInfoProcessor())
            .addRequest(requests)
            //.addUrl("http://book.douban.com/tag/古龙?start=0&Type=T")
            .addPipeline(simpleBookInfoPipeline)
            .setDownloader(new HttpClientDownloader())
            //.scheduler(new RedisScheduler(pool,0, QueueNameConstant.QUEUE_SIMPLE_BOOK_INFO))
            .thread(8).run();
}

public static void main(String[] args)
/**
 * Crawls all Douban book-tag URLs with a single worker thread, deduplicating
 * and queueing URLs in Redis so the crawl survives restarts.
 */
public void crawl() {
    RedisUtil.init();
    List<BookTag> bookTag = bookTagMapper.selectAll();
    Request[] requests = new Request[bookTag.size()];
    int i = 0;
    for(BookTag tag : bookTag) {
        requests[i++] = new Request(tag.getUrl()).setPriority(0);
    }
    Spider.create(new BookInfoProcessor())
            .addRequest(requests)
            // .addUrl("http://www.ip138.com/")
            .addPipeline(bookInfoPipeline)
            // Redis DB index comes from the "redis" properties file; the
            // queue name is shared project-wide via QueueNameConstant.
            .scheduler(new RedisScheduler(pool,Integer.parseInt(ConfigUtil.getProperty("redis", "redis.index")),QueueNameConstant.QUEUE_BOOK_INFO))
            .thread(1).run();
}

public static void main(String[] args)
/**
 * Entry point: crawls jokeji.cn (a GBK-encoded site) into {@code JokejiModel}
 * instances, printing each to the console, with URLs queued in local Redis.
 */
public static void main(String[] args) {
    // Explicit charset is required here: the site is GBK, not UTF-8.
    Site site = Site.me()
            .setDomain("www.jokeji.cn")
            .setCharset("gbk")
            .setSleepTime(100)
            .setTimeOut(3000)
            .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
    OOSpider.create(site, new ConsolePageModelPipeline(), JokejiModel.class)
            .addUrl("http://www.jokeji.cn/")
            .thread(2)
            .scheduler(new RedisScheduler("127.0.0.1"))
            .run();
}
/**
 * Entry point: extracts a multi-page NetEase news article into
 * {@code News163} models, stitching pages together via MultiPagePipeline
 * and printing the combined result; URLs are queued in local Redis.
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), News163.class);
    spider.addUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html");
    spider.scheduler(new RedisScheduler("localhost"))
            .addPipeline(new MultiPagePipeline())
            .addPipeline(new ConsolePipeline())
            .run();
}
public static void main(String[] args) { //single download Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2); String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电")); System.out.println(resultItems); //multidownload List<String> list = new ArrayList<String>(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List<ResultItems> resultItemses = spider.<ResultItems>getAll(list); for (ResultItems resultItemse : resultItemses) { System.out.println(resultItemse.getAll()); } spider.close(); } }
/**
 * Benchmark-style entry point: crawls an OSChina blog into
 * {@code OschinaBlog} models with 10 threads, discarding every result.
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setSleepTime(0)
            .setRetryTimes(3);
    // No-op pipeline: only the crawl itself matters here, not the output.
    PageModelPipeline discardingPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
        }
    };
    OOSpider.create(site, discardingPipeline, OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}
/**
 * Registers two independent spiders with the JMX {@code SpiderMonitor},
 * then starts both asynchronously.
 */
public static void main(String[] args) throws Exception {
    Spider blogSpider = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://my.oschina.net/flashsword/blog");
    Spider repoSpider = Spider.create(new GithubRepoPageProcessor())
            .addUrl("https://github.com/code4craft");
    // Register before starting so both crawls are visible over JMX
    // from the very first request.
    SpiderMonitor.instance().register(blogSpider);
    SpiderMonitor.instance().register(repoSpider);
    blogSpider.start();
    repoSpider.start();
}
}
/**
 * Minimal OOSpider demo: extracts a single OSChina Q&amp;A page into
 * {@code OschinaAnswer} and runs synchronously.
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), OschinaAnswer.class);
    spider.addUrl("http://www.oschina.net/question/567527_120597").run();
}
// Fragment of a spider configuration chain; the receiver is declared above
// this view.
// Feed URLs from a blocking-queue-based scheduler built over `model`,
.setScheduler(new BlockingQueueScheduler(model))
// hand each result page to process() together with virtualArticles —
// presumably an output accumulator; verify against process()'s definition,
.addPipeline((resultItems, task) -> process(resultItems, virtualArticles, spider))
// and size the worker pool from the model's configured thread count.
.thread(model.getThreadCount());
/**
 * Downloads the given urls synchronously and returns the collected results.
 * <p>
 * Link discovery ({@code spawnUrl}) and auto-destroy are disabled for the
 * duration of the call so the spider processes exactly the supplied urls and
 * remains reusable afterwards; both flags — and the pipeline list — are
 * restored even if {@code run()} throws.
 *
 * @param urls urls to download
 * @param <T>  type of process result
 * @return list of results, one per processed url
 */
@SuppressWarnings("unchecked")
public <T> List<T> getAll(Collection<String> urls) {
    destroyWhenExit = false;
    spawnUrl = false;
    // Drop any previously queued start requests; only `urls` should run.
    if (startRequests != null) {
        startRequests.clear();
    }
    for (Request request : UrlUtils.convertToRequests(urls)) {
        addRequest(request);
    }
    CollectorPipeline collectorPipeline = getCollectorPipeline();
    pipelines.add(collectorPipeline);
    try {
        run();
    } finally {
        // FIX: the original left the temporary collector in `pipelines`
        // (accumulating one per call) and skipped flag restoration when
        // run() threw; clean up unconditionally.
        pipelines.remove(collectorPipeline);
        spawnUrl = true;
        destroyWhenExit = true;
    }
    return collectorPipeline.getCollected();
}
public static void main(String[] args) throws IOException, JMException { //Just for benchmark Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/"); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(thread); }
public static void main(String[] args) { init(); String key = null; key = readKey(key); System.out.println("The demo started and will last 20 seconds..."); //Start spider OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync(); try { Thread.sleep(20000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("The demo stopped!"); System.out.println("To more usage, try to customize your own Spider!"); System.exit(0); }
/**
 * Adds a pipeline for the Spider.
 *
 * @param pipeline pipeline to add
 * @return this
 * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
 * @deprecated Use {@link #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)} instead.
 */
@Deprecated
public Spider pipeline(Pipeline pipeline) {
    // FIX: the Javadoc @deprecated tag alone produces no compiler warning;
    // the @Deprecated annotation makes callers see the deprecation.
    return addPipeline(pipeline);
}