从 http://webmagic.io/download.html 下载的所有依赖 jar 包比较多,全部导入已有项目后容易出现冲突,可以只导入下面几个 jar 包:
然后写一个公用的方法供调用
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic {@link PageProcessor} that pulls two pieces of text out of a page and exposes
 * them through the static helpers {@link #issue(String)} and {@link #jczq(String)}.
 *
 * <p>Each helper call builds its own processor instance, so the value returned always
 * comes from the crawl that call started. A crawl that extracts nothing returns
 * {@code null} rather than a value left over from an earlier call.
 */
public class WebMagic implements PageProcessor {

    /** Text of the span whose id is {@code bet_period}. */
    private static final String ISSUE_XPATH = "//span[@id='bet_period']/text()";

    /**
     * Text of the dt elements under dl elements under div elements whose class is
     * {@code dataBody unAttention}.
     */
    private static final String JCZQ_TITLE_XPATH =
            "//div[@class='dataBody unAttention']/dl/dt/text()";

    /** Text of the span nested inside the dt element selected by {@link #JCZQ_TITLE_XPATH}. */
    private static final String JCZQ_DETAIL_XPATH =
            "//div[@class='dataBody unAttention']/dl/dt/span/text()";

    /** Number of spider threads used for each crawl. */
    private static final int THREAD_COUNT = 5;

    /** Crawl settings: retry a failed download 3 times, sleep time 100 between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    // Written by spider worker threads inside process() and read by the thread that
    // started the crawl, hence volatile. Both stay null until a page yields a value.
    private volatile String issue;
    private volatile String jczq;

    /**
     * Extracts the values of interest from a downloaded page.
     *
     * <p>The {@code bet_period} text is stored in the result items under the key
     * {@code "content"}; when it is missing, the page is marked as skipped. The other
     * extraction still runs in that case, because the page may carry only that content.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Evaluate the XPath once and reuse the result for both the pipeline field
        // and the value returned by issue(String).
        String period = page.getHtml().xpath(ISSUE_XPATH).toString();
        page.putField("content", period);
        if (period == null) {
            // skip this page
            page.setSkip(true);
        }

        // Keep the extracted content for the static helpers.
        issue = period;
        jczq = joinNonNull(
                page.getHtml().xpath(JCZQ_TITLE_XPATH).toString(),
                page.getHtml().xpath(JCZQ_DETAIL_XPATH).toString());
    }

    /**
     * Returns the crawl settings used by the spider.
     *
     * @return the site configuration of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Crawls {@code url} and returns the text of its {@code bet_period} span.
     *
     * @param url address of the page to crawl
     * @return the extracted text, or {@code null} when the page has no such span or the
     *         crawl produced no page
     */
    public static String issue(String url) {
        return crawl(url).issue;
    }

    /**
     * Crawls {@code url} and returns the dt text of the {@code dataBody unAttention}
     * block followed by the text of its nested span.
     *
     * @param url address of the page to crawl
     * @return the combined text, or {@code null} when neither part was found or the
     *         crawl produced no page
     */
    public static String jczq(String url) {
        return crawl(url).jczq;
    }

    /**
     * Runs a spider for {@code url} on the calling thread with a fresh processor and
     * hands that processor back so the caller can read what it captured.
     */
    private static WebMagic crawl(String url) {
        WebMagic processor = new WebMagic();
        Spider.create(processor)
                .addUrl(url)
                .addPipeline(new ConsolePipeline())
                .thread(THREAD_COUNT)
                .run();
        return processor;
    }

    /**
     * Concatenates two optional strings without letting a missing part turn into the
     * literal text "null"; returns {@code null} only when both parts are missing.
     */
    private static String joinNonNull(String first, String second) {
        if (first == null) {
            return second;
        }
        if (second == null) {
            return first;
        }
        return first + second;
    }
}
在其他类中通过
// Each call starts a crawl of the given URL and returns the string captured by WebMagic.process.
String issue = WebMagic.issue("http://caipiao.163.com/order/dlt/");
// Same pattern for the second page; this call returns the text captured into the jczq field.
String jczq = WebMagic.jczq("http://caipiao.163.com/order/preBet_jczqspfmixp.html");
可调用爬虫,得到爬取的值