Webcollector爬虫

最新推荐文章于 2021-02-13 15:28:08 发布

原创最新推荐文章于 2021-02-13 15:28:08 发布

· 3.7k 阅读

2 ·

版权

文章标签：

#爬虫 #自定义请求头 #设置代理 #爬取js生成的数据

爬虫专栏收录该内容

1 篇文章

订阅专栏

用了好久webcollector框架，总结一些经验和大家分享。
首先来一个hello world示例。

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley
      .BreadthCrawler;
import org.jsoup.nodes.Element;

/**
 * Minimal WebCollector example: fetches the Sina home page and prints the
 * content of its {@code <meta name="keywords">} tag.
 */
public class SinaCrawler extends BreadthCrawler {

    /**
     * Creates the crawler; both arguments are forwarded unchanged to
     * {@link BreadthCrawler}.
     *
     * @param crawlPath crawl path handed to the Berkeley-backed crawler
     * @param autoParse auto-parse flag handed to the crawler
     */
    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    /**
     * Prints the {@code content} attribute of the page's keywords meta tag.
     * Pages without such a tag are skipped.
     *
     * @param page the fetched page
     * @param next follow-up tasks; not used by this example
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // first() returns null when nothing matches, whereas get(0) would
        // throw IndexOutOfBoundsException on a page without the tag.
        Element element = page.select("meta[name=keywords]").first();
        if (element == null) {
            return;
        }
        String name = element.attr("content");
        System.out.println(name);
    }

    /**
     * Entry point: configures the crawler (thread count, seed URL) and runs it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

webcollector内置两套插件：berkeley和ram，本示例只讲述berkeley。想要使用webcollector，首先需要选择一套插件，方法很简单：继承对应插件包下的BreadthCrawler即可（本示例继承的是berkeley包下的BreadthCrawler）。您可以在任务的入口（本示例中为main方法）设置爬虫的一些配置参数，如线程数、最大重试次数、执行间隔等。
下面来处理一些常见的问题：

1.自定义请求头

应用场景：在一些爬取任务中，有些网站的反爬机制比较强，需要登录，或者还有其他要求，这个时候就需要自定义请求头，将登录后的Cookie信息封装在内。方法很简单，重写getResponse(CrawlDatum crawlDatum)方法即可轻松自定义请求头。

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;

/**
 * WebCollector example that customizes the outgoing request headers
 * (User-Agent and Cookie) by overriding {@code getResponse}.
 */
public class SinaCrawler extends BreadthCrawler {

    /**
     * Creates the crawler; both arguments are forwarded unchanged to
     * {@link BreadthCrawler}.
     *
     * @param crawlPath crawl path handed to the Berkeley-backed crawler
     * @param autoParse auto-parse flag handed to the crawler
     */
    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    /**
     * Prints the {@code content} attribute of the page's keywords meta tag.
     * Pages without such a tag are skipped.
     *
     * @param page the fetched page
     * @param next follow-up tasks; not used by this example
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // first() returns null when nothing matches, whereas get(0) would
        // throw IndexOutOfBoundsException on a page without the tag.
        Element element = page.select("meta[name=keywords]").first();
        if (element == null) {
            return;
        }
        String name = element.attr("content");
        System.out.println(name);
    }

    /**
     * Builds the HTTP request for a crawl task with custom User-Agent and
     * Cookie headers, then executes it.
     *
     * @param crawlDatum the crawl task to request
     * @return the response produced by the customized request
     * @throws Exception propagated from building or executing the request
     */
    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36");
        // NOTE(review): a logged-in session cookie hard-coded in source is a
        // credential leak; real code should load it from configuration.
        request.addHeader("Cookie", "pac_uid=1_1178798573; tvfe_boss_uuid=24f7ae8931c6686c; gaduid=57e4e6dece34f; mobileUV=1_15808e77088_87611; ts_refer=ADTAGCLIENT.QQ.5503_.0; _gscu_661903259=81099449jmjs2c11; luin=o1178798573; lskey=000100007075d231f7fe323869be9daeef1f95a12844fa1bcd61d037f485cfe4a7b530f0a4ea248737f22ad4; RK=kydHWeiWM2; pgv_pvi=2492743680; gj_mpvid=26756599; qzone_check=1178798573_1482388052; uid=191154303; dsp_cookiemapping0=1482394928148; dsp_cookiemapping1=1482394928150; pt2gguin=o1178798573; uin=o1178798573; skey=@kkOQfsLja; ptisp=cnc; ptcz=a92fe6d6eb2bee50c8bdfcfe2eb9f6cdbdd56def2af4313513b57d642f1b5612; ad_play_index=63; ptag=aio2015|; pgv_info=ssid=s4382823580; ts_last=news.qq.com/a/20161222/017379.htm; pgv_pvid=5809492510; o_cookie=0000000000; ts_uid=1771653839");
        /*
            Alternatively, call request.setCookie("xxxx") directly.
            Other request headers can be customized the same way; they are
            not listed one by one here.
         */
        return request.getResponse();
    }

    /**
     * Entry point: configures the crawler (thread count, seed URL) and runs it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

2.设置代理(Proxy)

应用场景：在一些反爬机制比较强的网站，当发现同一个ip访问特别频繁时，会采取一些机制来应对，比如弹验证码，或者直接将此ip拉黑，导致爬取任务无法继续，这个时候就需要设置代理。
下面是webcollector HttpRequest.java中的源码

    // Proxy for this request; stays null unless a proxy-taking constructor
    // or setProxy assigns one.
    // NOTE(review): presumably null means "connect directly" - confirm in the
    // request-sending code, which is not part of this excerpt.
    Proxy proxy = null;

    // Header name -> list of values, judging by the type and name; it is
    // populated outside this excerpt.
    protected Map<String, List<String>> headerMap = null;

    // The crawl task this request is built for; every constructor sets it.
    protected CrawlDatum crawlDatum = null;

    /**
     * Creates a request for a plain URL: wraps it in a new CrawlDatum and
     * applies the default User-Agent from Config.
     *
     * @param url the URL to request
     * @throws Exception declared by the signature; the visible body does not
     *         show which call raises it
     */
    public HttpRequest(String url) throws Exception {
        this.crawlDatum = new CrawlDatum(url);
        setUserAgent(Config.DEFAULT_USER_AGENT);
    }

    /**
     * Creates a request for a plain URL and assigns the proxy to use.
     * Delegates to the single-argument URL constructor first.
     *
     * @param url   the URL to request
     * @param proxy the proxy to store on this request
     * @throws Exception propagated from the delegated constructor
     */
    public HttpRequest(String url, Proxy proxy) throws Exception {
        this(url);
        this.proxy = proxy;
    }

    /**
     * Creates a request for an existing crawl task and applies the default
     * User-Agent from Config.
     *
     * @param crawlDatum the crawl task to request
     * @throws Exception declared by the signature; the visible body does not
     *         show which call raises it
     */
    public HttpRequest(CrawlDatum crawlDatum) throws Exception {
        this.crawlDatum = crawlDatum;
        setUserAgent(Config.DEFAULT_USER_AGENT);
    }

    /**
     * Creates a request for an existing crawl task and assigns the proxy to
     * use. Delegates to the single-argument CrawlDatum constructor first.
     *
     * @param crawlDatum the crawl task to request
     * @param proxy      the proxy to store on this request
     * @throws Exception propagated from the delegated constructor
     */
    public HttpRequest(CrawlDatum crawlDatum, Proxy proxy) throws Exception {
        this(crawlDatum);
        this.proxy = proxy;
    }
    /**
     * Replaces the proxy stored on this request.
     *
     * @param proxy the proxy to store
     */
    public void setProxy(Proxy proxy) {
        this.proxy = proxy;
    }

下面是webcollector中 Proxys.java类的源码

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.Random;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A list of {@link Proxy} objects with helpers for adding entries one by
 * one, from a {@code host:port} string, or in bulk from a file, and for
 * picking an entry at random.
 *
 * @author hu
 */
public class Proxys extends ArrayList<Proxy> {
    public static final Logger LOG=LoggerFactory.getLogger(Proxys.class);

    public static Random random = new Random();

    /**
     * Returns a randomly chosen element of this list.
     *
     * @return the chosen proxy; may be {@code null} if {@link #addEmpty()}
     *         was used
     * @throws IllegalArgumentException if the list is empty, because
     *         {@code Random.nextInt} rejects a bound of 0
     */
    public Proxy nextRandom(){
        int r=random.nextInt(this.size());
        return this.get(r);
    }

    /**
     * Adds a {@code null} entry to the list.
     */
    public void addEmpty(){
        Proxy nullProxy=null;
        this.add(nullProxy);
    }

    /**
     * Adds an HTTP proxy for the given host and port.
     *
     * @param ip   proxy host name or IP address
     * @param port proxy port
     */
    public void add(String ip, int port) {
        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
        this.add(proxy);
    }

    /**
     * Adds an HTTP proxy described as {@code host:port}. Malformed entries
     * are logged and skipped rather than propagated, so one bad line does
     * not abort a bulk load.
     *
     * @param proxyStr proxy description in {@code host:port} form
     * @throws Exception kept in the signature for compatibility; parse
     *         failures are caught and logged
     */
    public void add(String proxyStr) throws Exception {
        try {
            String[] infos = proxyStr.split(":");
            String ip = infos[0];
            // parseInt avoids the needless boxing of Integer.valueOf.
            int port = Integer.parseInt(infos[1]);

            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port));
            this.add(proxy);
        } catch (Exception ex) {
            // Include the offending entry so the bad line can be located.
            LOG.warn("Skipping malformed proxy entry: " + proxyStr, ex);
        }

    }


    /**
     * Adds every proxy listed in a file, one {@code host:port} entry per
     * line. Blank lines and lines starting with {@code #} are ignored.
     *
     * @param file the file to read
     * @throws Exception if the file cannot be opened or read
     */
    public void addAllFromFile(File file) throws Exception {
        // try-with-resources closes the stream and reader even when reading
        // fails; the original never closed them.
        try (FileInputStream fis = new FileInputStream(file);
             BufferedReader br = new BufferedReader(new InputStreamReader(fis))) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.startsWith("#") || line.isEmpty()) {
                    continue;
                }
                this.add(line);
            }
        }
    }
}

这个类对添加代理对象和获取代理对象进行了简单的封装，添加代理对象的时候我们可以逐个添加，甚至可以把大量的ip和端口号放入到文件中进行批量添加，获取的时候调用nextRandom()方法即可。

通过源码我们就可以这样设置代理

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.net.Proxys;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Element;

/**
 * WebCollector example that sends every request through a randomly chosen
 * proxy by overriding {@code getResponse}.
 */
public class SinaCrawler extends BreadthCrawler {

    // The pool must be instantiated before use: the original initialized it
    // to null, so the constructor's proxys.add(...) calls threw
    // NullPointerException. It is an instance field so that constructing
    // several crawlers does not append duplicate entries to a shared list.
    private final Proxys proxys = new Proxys();

    /**
     * Creates the crawler and registers the proxies to rotate through.
     *
     * @param crawlPath crawl path handed to the Berkeley-backed crawler
     * @param autoParse auto-parse flag handed to the crawler
     */
    public SinaCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // Placeholder addresses: replace them with real proxy hosts and ports.
        proxys.add("x.x.x.x" , 8080);
        proxys.add("y.y.y.y" , 3306);
    }

    /**
     * Builds the HTTP request for a crawl task, assigns a randomly chosen
     * proxy from the pool, and executes it.
     *
     * @param crawlDatum the crawl task to request
     * @return the response obtained through the chosen proxy
     * @throws Exception propagated from building or executing the request
     */
    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        HttpRequest request = new HttpRequest(crawlDatum);
        // Alternatively: new HttpRequest(crawlDatum, proxys.nextRandom());
        request.setProxy(proxys.nextRandom()); // pick a random proxy
        return request.getResponse();
    }

    /**
     * Prints the {@code content} attribute of the page's keywords meta tag.
     * Pages without such a tag are skipped.
     *
     * @param page the fetched page
     * @param next follow-up tasks; not used by this example
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        // first() returns null when nothing matches, whereas get(0) would
        // throw IndexOutOfBoundsException on a page without the tag.
        Element element = page.select("meta[name=keywords]").first();
        if (element == null) {
            return;
        }
        String name = element.attr("content");
        System.out.println(name);
    }

    /**
     * Entry point: configures the crawler (thread count, seed URL) and runs it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SinaCrawler code = new SinaCrawler("crawle", false);
        code.setThreads(1);
        code.addSeed("http://www.sina.com.cn/");
        try {
            code.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

3.爬取js生成的数据

webcollector对js生成的数据抓取的功能并不是很强大，抓取js生成数据的页面可以简单自定义一个执行器

/**
 * Entry point: crawls a Sogou result page with a custom Executor that
 * renders the page in HtmlUnit (JavaScript enabled) and prints the result
 * titles.
 *
 * @param args unused
 * @throws Exception propagated from crawler setup or execution
 */
public static void main(String[] args) throws Exception {
        Executor executor = new Executor() {
            @Override
            public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
                HtmlUnitDriver driver = new HtmlUnitDriver();
                try {
                    // JavaScript must be ON to see JS-generated content; the
                    // original passed false, which defeats this example.
                    driver.setJavascriptEnabled(true);
                    driver.get(datum.getUrl());
                    List<WebElement> elementList = driver.findElementsByCssSelector("h3.vrTitle a");
                    for (WebElement element : elementList) {
                        System.out.println("title:" + element.getText());
                    }
                } finally {
                    // Release the driver after every task; the original never
                    // closed it.
                    driver.quit();
                }
            }
        };
        // Create a DBManager backed by Berkeley DB.
        DBManager manager = new BerkeleyDBManager("crawl");
        // A Crawler needs both a DBManager and an Executor.
        Crawler crawler = new Crawler(manager, executor);
        crawler.addSeed("https://www.sogou.com/web?query=%E6%B7%98%E5%AE%9D");
        crawler.setThreads(1);
        crawler.start(1);
    }

抓取js生成的数据可以说和Webcollector没有什么关系了，而是使用selenium，感兴趣的话可以百度一下selenium。
那怎么判断数据是不是js生成的呢？随便选一个网页，查看它的源代码，如果你想要爬取的数据不在源代码里面，基本上就是js生成的。
以上的总结是个人的一些经验，适用于webcollector新手，入不了方家法眼。