I have been using the WebCollector framework for quite a while, so here is a summary of some experience to share.
Let's start with a hello world example:
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;

public class SinaCrawler extends BreadthCrawler {

    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        Element element = page.select("meta[name=keywords]").get(0);
        String name = element.attr("content");
        System.out.println(name);
    }

    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
WebCollector ships with two sets of plugins, berkeley and ram; this article only covers berkeley. To use WebCollector you first pick one of the two, which is as simple as extending the corresponding crawler class, BreadthCrawler in this example. At the entry point of the task, the main method here, you can set crawler configuration parameters such as the number of threads, the maximum number of retries, and the execution interval, as sketched below.
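A minimal sketch of that kind of configuration: setThreads, addSeed and start appear in the example above, while setRetry, setExecuteInterval and setResumable are assumptions based on the WebCollector 2.x Crawler API, so check which setters exist in the version you use.

public static void main(String[] args) throws Exception {
    SinaCrawler crawler = new SinaCrawler("crawle", false);
    crawler.setThreads(10);           // number of fetcher threads
    crawler.setRetry(3);              // assumed setter: max retries for a failed fetch
    crawler.setExecuteInterval(1000); // assumed setter: pause in milliseconds between fetches
    crawler.setResumable(true);       // assumed setter: resume from the existing crawl path after a restart
    crawler.addSeed("http://www.sina.com.cn/");
    crawler.start(2);                 // crawl 2 levels starting from the seed
}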
Now let's handle a few common problems:
1. Customizing request headers
Use case: in some crawling tasks the target site has fairly strong anti-crawling measures and requires a login or other request details, so you need a custom request header that carries the Cookie obtained after logging in. The approach is simple: override the getResponse(CrawlDatum crawlDatum) method and you can customize the request headers easily.
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;

public class SinaCrawler extends BreadthCrawler {

    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        Element element = page.select("meta[name=keywords]").get(0);
        String name = element.attr("content");
        System.out.println(name);
    }

    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36");
        request.addHeader("Cookie", "pac_uid=1_1178798573; tvfe_boss_uuid=24f7ae8931c6686c; gaduid=57e4e6dece34f; mobileUV=1_15808e77088_87611; ts_refer=ADTAGCLIENT.QQ.5503_.0; _gscu_661903259=81099449jmjs2c11; luin=o1178798573; lskey=000100007075d231f7fe323869be9daeef1f95a12844fa1bcd61d037f485cfe4a7b530f0a4ea248737f22ad4; RK=kydHWeiWM2; pgv_pvi=2492743680; gj_mpvid=26756599; qzone_check=1178798573_1482388052; uid=191154303; dsp_cookiemapping0=1482394928148; dsp_cookiemapping1=1482394928150; pt2gguin=o1178798573; uin=o1178798573; skey=@kkOQfsLja; ptisp=cnc; ptcz=a92fe6d6eb2bee50c8bdfcfe2eb9f6cdbdd56def2af4313513b57d642f1b5612; ad_play_index=63; ptag=aio2015|; pgv_info=ssid=s4382823580; ts_last=news.qq.com/a/20161222/017379.htm; pgv_pvid=5809492510; o_cookie=0000000000; ts_uid=1771653839");
        /*
         * Alternatively, call request.setCookie("xxxx") directly.
         * Other request header fields can be customized the same way; they are not listed one by one here.
         */
        return request.getResponse();
    }

    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
2. Setting a proxy (Proxy)
Use case: on sites with strong anti-crawling measures, once the site notices the same IP visiting very frequently it may respond with countermeasures such as showing a CAPTCHA or simply blacklisting that IP, after which the crawl cannot continue. This is when you need to set a proxy.
Below is an excerpt from the source of WebCollector's HttpRequest.java:
Proxy proxy = null;
protected Map<String, List<String>> headerMap = null;
protected CrawlDatum crawlDatum = null;

public HttpRequest(String url) throws Exception {
    this.crawlDatum = new CrawlDatum(url);
    setUserAgent(Config.DEFAULT_USER_AGENT);
}

public HttpRequest(String url, Proxy proxy) throws Exception {
    this(url);
    this.proxy = proxy;
}

public HttpRequest(CrawlDatum crawlDatum) throws Exception {
    this.crawlDatum = crawlDatum;
    setUserAgent(Config.DEFAULT_USER_AGENT);
}

public HttpRequest(CrawlDatum crawlDatum, Proxy proxy) throws Exception {
    this(crawlDatum);
    this.proxy = proxy;
}

public void setProxy(Proxy proxy) {
    this.proxy = proxy;
}
Below is the source of WebCollector's Proxys.java class:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Random;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 * @author hu
 */
public class Proxys extends ArrayList<Proxy> {

    public static final Logger LOG = LoggerFactory.getLogger(Proxys.class);
    public static Random random = new Random();

    public Proxy nextRandom() {
        int r = random.nextInt(this.size());
        return this.get(r);
    }

    public void addEmpty() {
        Proxy nullProxy = null;
        this.add(nullProxy);
    }

    public void add(String ip, int port) {
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
        this.add(proxy);
    }

    public void add(String proxyStr) throws Exception {
        try {
            String[] infos = proxyStr.split(":");
            String ip = infos[0];
            int port = Integer.valueOf(infos[1]);
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
            this.add(proxy);
        } catch (Exception ex) {
            LOG.info("Exception", ex);
        }
    }

    public void addAllFromFile(File file) throws Exception {
        FileInputStream fis = new FileInputStream(file);
        BufferedReader br = new BufferedReader(new InputStreamReader(fis));
        String line = null;
        while ((line = br.readLine()) != null) {
            line = line.trim();
            if (line.startsWith("#") || line.isEmpty()) {
                continue;
            } else {
                this.add(line);
            }
        }
    }
}
This class is a simple wrapper around adding and retrieving proxy objects. Proxies can be added one at a time, or a large number of ip:port entries can be put into a file and added in batch; to retrieve one, just call nextRandom(). A batch-loading sketch follows.
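For example, assuming a plain-text file named proxies.txt (the file name and the ProxyLoading class are placeholders) with one ip:port entry per line, batch loading could look like this:

import cn.edu.hfut.dmic.webcollector.net.Proxys;
import java.io.File;
import java.net.Proxy;

public class ProxyLoading {
    public static void main(String[] args) throws Exception {
        Proxys proxys = new Proxys();
        proxys.addAllFromFile(new File("proxies.txt")); // placeholder file: one ip:port per line, blank lines and # lines are skipped
        proxys.addEmpty();                              // optional: a null entry means "no proxy", so some requests go out directly
        Proxy proxy = proxys.nextRandom();              // pick a random entry (possibly null) for the next request
        System.out.println(proxy);
    }
}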
Given this source, we can set a proxy like this:
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.net.Proxys;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;

public class SinaCrawler extends BreadthCrawler {

    // must be initialized, otherwise the add() calls below throw a NullPointerException
    private static Proxys proxys = new Proxys();

    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        proxys.add("x.x.x.x", 8080);
        proxys.add("y.y.y.y", 3306);
    }

    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        // or simply: new HttpRequest(crawlDatum, proxys.nextRandom());
        request.setProxy(proxys.nextRandom()); // pick a random proxy for this request
        return request.getResponse();
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        Element element = page.select("meta[name=keywords]").get(0);
        String name = element.attr("content");
        System.out.println(name);
    }

    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
3. Crawling data generated by JavaScript
WebCollector's built-in support for js-generated data is not very strong; for pages whose data is generated by js, you can simply define a custom Executor (the imports below assume WebCollector 2.x plus the selenium-htmlunit-driver dependency):
import cn.edu.hfut.dmic.webcollector.crawldb.DBManager;
import cn.edu.hfut.dmic.webcollector.crawler.Crawler;
import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BerkeleyDBManager;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import java.util.List;

public class JsCrawler {

    public static void main(String[] args) throws Exception {
        Executor executor = new Executor() {
            @Override
            public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
                HtmlUnitDriver driver = new HtmlUnitDriver();
                // JavaScript is disabled here; set this to true when the data you need is only produced after scripts run
                driver.setJavascriptEnabled(false);
                driver.get(datum.getUrl());
                List<WebElement> elementList = driver.findElementsByCssSelector("h3.vrTitle a");
                for (WebElement element : elementList) {
                    System.out.println("title:" + element.getText());
                }
            }
        };
        // create a DBManager backed by Berkeley DB
        DBManager manager = new BerkeleyDBManager("crawl");
        // a Crawler needs a DBManager and an Executor
        Crawler crawler = new Crawler(manager, executor);
        crawler.addSeed("https://www.sogou.com/web?query=%E6%B7%98%E5%AE%9D");
        crawler.setThreads(1);
        crawler.start(1);
    }
}
Crawling js-generated data arguably has little to do with WebCollector itself anymore; the real work is done by Selenium, so look up Selenium if you are interested.
How do you tell whether data is generated by js in the first place? Pick any web page and view its source: if the data you want to crawl is not in there, it is almost certainly generated by js. A quick way to check this programmatically is sketched below.
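A minimal sketch of such a check using Jsoup, which WebCollector already depends on; the URL, the CSS selector, and the JsCheck class name are placeholders for whatever you actually want to crawl:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class JsCheck {
    public static void main(String[] args) throws Exception {
        // Jsoup fetches the raw HTML and does not execute any JavaScript.
        Document doc = Jsoup.connect("http://www.example.com/").get(); // placeholder URL
        Elements hits = doc.select("div.target-data");                 // placeholder selector
        if (hits.isEmpty()) {
            // visible in the browser but missing from the raw HTML: most likely generated by js
            System.out.println("Not in the static HTML, probably rendered by JavaScript.");
        } else {
            System.out.println("Found in the static HTML: " + hits.first().text());
        }
    }
}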
The notes above are just my personal experience, aimed at WebCollector beginners; experts will probably find nothing new in them.