BreadthCrawler是WebCollector最常用的爬取器之一，依赖文件系统进行爬取信息的存储。这里以BreadthCrawler为例，对WebCollector的爬取配置进行描述：
import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.model.Page;
import java.net.InetSocketAddress;
import java.net.Proxy;
public class MyCrawler extends BreadthCrawler{
/**
 * Defines the custom action to run on a visited page.
 *
 * <p>Prints the page URL, the response content type and the response code
 * to standard output, followed by a separator line.
 *
 * @param page the page handed to this crawler; its response is read via
 *             {@code page.getResponse()}
 */
@Override
public void visit(Page page) {
    System.out.println("URL:" + page.getUrl());
    System.out.println("Content-Type:" + page.getResponse().getContentType());
    // The "Code:" line must report the response code; the original printed
    // the content type a second time (copy-paste of the line above).
    System.out.println("Code:" + page.getResponse().getCode());
    System.out.println("-----------------------------");
}
public static void main(String[] args) throws Exception{
MyCrawler crawler=new MyCrawler();
/*配置爬取合肥工业大学网站*/
crawler.addSeed("http://www.hfut.edu.cn/ch/");