目标 URL 抓取示例:
package com.spider.test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic {@link PageProcessor} that extracts query data from a chinatax
 * "InitCredit" result page.
 *
 * <p>For every processed page the following result fields are stored, in this order:
 * {@code year}, {@code areaSample}, {@code areaCode}, {@code end}, {@code result}
 * and {@code title}.
 */
public class chinataxProcessor implements PageProcessor {

    /** Entry URL queued by {@link #main(String[])}. */
    private static final String START_URL =
            "http://hd.chinatax.gov.cn/fagui/action/InitCredit.do?articleField03=2015&taxCode=110000&randCode=jovd8967&flag=1&cPage=3";

    /** Option texts of the year drop-down. */
    private static final String YEAR_XPATH = "//select[@class='sv_fgk_input']/option/text()";

    /** {@code onclick} attributes of the area links; each embeds a {@code changeParam('taxCode', ...)} call. */
    private static final String AREA_ONCLICK_XPATH = "//td[@class='sv_hei']/a/@onclick";

    /** {@code onclick} attribute of the "last page" link; embeds a {@code changeParam('cPage', ...)} call. */
    private static final String LAST_PAGE_ONCLICK_XPATH = "//td/a[@title='末页']/@onclick";

    /** Text of the grey-background cells of the result table. */
    private static final String RESULT_CELL_XPATH =
            "//td[@class='sv_hei']/table/tbody/tr/td[@bgcolor='#F0F0F0']/text()";

    /** Page title element. */
    private static final String TITLE_XPATH = "//title";

    private Site site;

    /** Builds the crawl configuration: 3 retries, a sleep time of 1000 and gzip enabled. */
    public chinataxProcessor() {
        Site config = Site.me();
        config.setRetryTimes(3);
        config.setSleepTime(1000);
        config.setUseGzip(true);
        this.site = config;
    }

    /**
     * Extracts the fields of interest from {@code page} and stores them as result fields.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Years: option texts with the "年度" suffix stripped.
        page.putField(
                "year",
                page.getHtml()
                        .xpath(YEAR_XPATH)
                        .replace("年度", "")
                        .all());

        // Raw onclick attribute of the area links, kept unprocessed as a sample.
        page.putField("areaSample", page.getHtml().xpath(AREA_ONCLICK_XPATH));

        // Area codes: peel the changeParam('taxCode', ...) wrapper and the quotes off each onclick.
        page.putField(
                "areaCode",
                page.getHtml()
                        .xpath(AREA_ONCLICK_XPATH)
                        .replace("changeParam", "")
                        .replace("\\('taxCode',", "")
                        .replace("\\)", "")
                        .replace("'", "")
                        .all());

        // Last page number: same unwrapping, applied to the "last page" link's onclick.
        page.putField(
                "end",
                page.getHtml()
                        .xpath(LAST_PAGE_ONCLICK_XPATH)
                        .replace("changeParam", "")
                        .replace("\\('cPage',", "")
                        .replace("\\)", "")
                        .replace("'", ""));

        // Query results: text of every matching result-table cell.
        page.putField("result", page.getHtml().xpath(RESULT_CELL_XPATH).all());

        page.putField("title", page.getHtml().xpath(TITLE_XPATH));

        // To also emit the raw HTML, enable:
        // page.putField("html", page.getHtml().toString());
    }

    /**
     * @return the site configuration created in the constructor
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Runs a spider with this processor against {@link #START_URL} using 5 threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new chinataxProcessor());
        // Seed the crawl with the chinatax query URL.
        spider.addUrl(START_URL);
        // Crawl with 5 threads.
        spider.thread(5);
        // Start the spider.
        spider.run();
    }
}
考虑断点续抓(中断后从上次的位置继续抓取),示例如下:
// Directory handed to the file-cache scheduler: one folder per class name.
String fileCachePath = "/data/webmagic/" + this.getClass().getName();

Spider spider = Spider.create(new ChinataxPayerProcessor());
// Results go to the console pipeline first, then to the project pipeline.
spider.addPipeline(new ConsolePipeline());
spider.addPipeline(new ChinataxPayerPipeline());
// Queue URLs through a FileCacheQueueScheduler rooted at fileCachePath
// (this is what the surrounding note relies on for resuming a crawl).
spider.scheduler(new FileCacheQueueScheduler(fileCachePath));
spider.addUrl(URLDefine.CHINA_TAXPAYER_URL);
spider.thread(5);
spider.run();
本人测试的初步结论是:
- urls.txt:记录 URL 的文件;中断后续抓时,要等全部 URL 处理完之后才会更新该文件。
- cursor.txt:记录处理进度,每次处理都会更新,记录从起始位置到结束位置;续抓的开始位置是上次断点的 n*Thread,例如 10 个线程、上次断点是 300,那么续抓时会从第 290~300 个 URL 开始继续处理。