A First Look at the Lightweight Crawler Crawler4j
Crawler4j: a lightweight, multi-threaded web crawler
GitHub: https://github.com/yasserg/crawler4j
This post is a set of study notes on the example that ships with the Crawler4j source code.
The original comments from the source are kept as well, to avoid misreading the API.
package code;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class crawler4FstTry {

    private static void myCrawlerTry() throws Exception {
        // Folder where intermediate crawl data is stored
        String crawlStorageFolder = "./tmp";
        // Number of concurrent crawler threads
        int numberOfCrawlers = 5;

        // Initialize the crawl configuration
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Be polite: Make sure that we don't send more than 1 request per
         * second (1000 milliseconds between requests).
         */
        config.setPolitenessDelay(1000);

        /*
         * You can set the maximum crawl depth here. The default value is -1 for
         * unlimited depth.
         *
         * For example, if seed page A links to page B, and B links to page C,
         * then B is one level below A and C is two levels below A; this setting
         * caps how many levels deep the crawler will follow links.
         */
        config.setMaxDepthOfCrawling(2);

        /*
         * You can set the maximum number of pages to crawl. The default value
         * is -1 for unlimited number of pages.
         */
        config.setMaxPagesToFetch(1000);

        /*
         * If you need to crawl Google or other sites that are only reachable
         * through a proxy, you can configure the proxy here.
         */
        // config.setProxyHost("");
        // config.setProxyPort(0);

        /*
         * This flag controls whether the crawl is resumable. For example, when
         * it is set to true and the crawler is interrupted or crashes,
         * restarting it will resume crawling from where it left off.
         */
        config.setResumableCrawling(false);

        /*
         * Instantiate the crawl controller.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * Add at least one seed URL. The crawler starts from the seeds, extracts
         * the URLs found on each page, and keeps following the newly found URLs.
         */
        controller.addSeed("http://www.ics.uci.edu/");
        // controller.addSeed("http://www.ics.uci.edu/~lopes/");
        // controller.addSeed("http://www.ics.uci.edu/~welling/");

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will reach the line after this only when crawling is finished.
         */
        controller.start(BasicCrawler.class, numberOfCrawlers);
    }

    public static void main(String[] args) throws Exception {
        myCrawlerTry();
    }
}
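One thing to note: controller.start(...) blocks until the whole crawl is done. The crawler4j versions I have seen also expose a non-blocking variant on CrawlController (startNonBlocking plus waitUntilFinish); if your version has it, the end of myCrawlerTry() could be rewritten as in the sketch below. This is only a sketch of that alternative, not part of the original example.

// Sketch: non-blocking alternative to controller.start(...), assuming
// startNonBlocking/waitUntilFinish are available in your crawler4j version.
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

// The main thread is free to do other work here while the crawl runs.

// Block until every crawler thread has finished.
controller.waitUntilFinish();
System.out.println("Crawl finished.");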
The source code of BasicCrawler:
package code;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.List;
import java.util.regex.Pattern;

import org.apache.http.Header;

/**
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class BasicCrawler extends WebCrawler {

    // Regular expression used to filter out binary and media URLs
    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program. Here it simply prints some information about the page.
     */
    @Override
    public void visit(Page page) {
        // Taking www.baidu.com as an example:
        // document id assigned by the crawler
        int docid = page.getWebURL().getDocid();
        // full URL, e.g. https://www.baidu.com/
        String url = page.getWebURL().getURL();
        // domain, e.g. baidu.com
        String domain = page.getWebURL().getDomain();
        // path without query parameters, e.g. "/"
        String path = page.getWebURL().getPath();
        // sub-domain, e.g. www
        String subDomain = page.getWebURL().getSubDomain();
        // URL of the parent page (the page on which this URL was found)
        String parentUrl = page.getWebURL().getParentUrl();
        // anchor text, i.e. the visible text of the link, as in <a href="***">anchor</a>
        String anchor = page.getWebURL().getAnchor();

        System.out.println("Docid: " + docid);
        System.out.println("URL: " + url);
        System.out.println("Domain: '" + domain + "'");
        System.out.println("Sub-domain: '" + subDomain + "'");
        System.out.println("Path: '" + path + "'");
        System.out.println("Parent page: " + parentUrl);
        System.out.println("Anchor text: " + anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            // visible text of the page
            String text = htmlParseData.getText();
            // full HTML source
            String html = htmlParseData.getHtml();
            // all outgoing URLs found on this page
            List<WebURL> links = htmlParseData.getOutgoingUrls();

            System.out.println("Text length: " + text.length());
            System.out.println("Html length: " + html.length());
            System.out.println("Number of outgoing links: " + links.size());
        }

        // HTTP response headers returned by the server
        Header[] responseHeaders = page.getFetchResponseHeaders();
        if (responseHeaders != null) {
            System.out.println("Response headers:");
            for (Header header : responseHeaders) {
                System.out.println("\t" + header.getName() + ": " + header.getValue());
            }
        }
        System.out.println("=============");
    }
}
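BasicCrawler only prints what it sees. If you want to aggregate results across all crawler threads, the crawler4j examples use a pattern where each crawler keeps its own state and hands it back via getMyLocalData(), and the controller collects everything with getCrawlersLocalData() once the crawl is over. The class below is my own minimal sketch of that pattern (CountingCrawler and its page counter are illustrative names, not part of the original example), assuming those two hooks exist in your crawler4j version.

package code;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class CountingCrawler extends WebCrawler {

    // Each crawler thread gets its own instance, so no synchronization is needed here.
    private int visitedPages = 0;

    @Override
    public void visit(Page page) {
        // Count every page this particular crawler instance processes.
        visitedPages++;
    }

    // Returned to the controller after the crawl so the results can be aggregated.
    @Override
    public Object getMyLocalData() {
        return visitedPages;
    }
}

After controller.start(CountingCrawler.class, numberOfCrawlers) returns, the per-thread counts can then be summed up like this:

// Sum the counters returned by every crawler instance.
int total = 0;
for (Object localData : controller.getCrawlersLocalData()) {
    total += (Integer) localData;
}
System.out.println("Total pages visited: " + total);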