1. What is crawler4j?
crawler4j is an open-source Java crawler library that you can use to build multi-threaded web crawlers and fetch page content.
2. How do I get crawler4j?
The official crawler4j project page is here; the current version is 4.1. If you use Maven, declare the dependency in your pom as shown below; to download the jar directly, click here.
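For reference, the dependency declaration for the 4.1 release looks roughly like this (coordinates as published on Maven Central; adjust the version if you use a newer release):

<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.1</version>
</dependency>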
3. How do I use crawler4j?
Using crawler4j takes two steps: first, implement a crawler class that extends edu.uci.ics.crawler4j.crawler.WebCrawler; second, run that crawler class through a CrawlController.
package com.favccxx.favsoft.favcrawler;

import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class FavWebCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);

    // Skip URLs that point to static resources such as stylesheets, scripts and media files.
    private final static Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * Called when a fetched page is ready to be processed.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();

        logger.debug("Docid: {}", docid);
        logger.info("URL: {}", url);
        logger.debug("Domain: '{}'", domain);
        logger.debug("Sub-domain: '{}'", subDomain);
        logger.debug("Path: '{}'", path);
        logger.debug("Parent page: {}", parentUrl);
        logger.debug("Anchor text: {}", anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.debug("Text length: " + text.length());
            logger.debug("Html length: " + html.length());
            logger.debug("Number of outgoing links: " + links.size());
        }
    }
}
package com.favccxx.favsoft.favcrawler;

import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(WebCrawler.class);

    // Skip URLs that point to static resources such as stylesheets, scripts and media files.
    private final static Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * This function is called when a page is fetched and ready
     * to be processed by your program.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();

        logger.debug("Docid: {}", docid);
        logger.info("URL: {}", url);
        logger.debug("Domain: '{}'", domain);
        logger.debug("Sub-domain: '{}'", subDomain);
        logger.debug("Path: '{}'", path);
        logger.debug("Parent page: {}", parentUrl);
        logger.debug("Anchor text: {}", anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.debug("Text length: " + text.length());
            logger.debug("Html length: " + html.length());
            logger.debug("Number of outgoing links: " + links.size());
        }
    }

    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 7;

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs that are fetched; the crawler then follows the links found in
         * those pages.
         */
        controller.addSeed("http://www.oschina.net/");

        /*
         * Start the crawl. This is a blocking operation: the code after this
         * line runs only after crawling has finished.
         */
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
4. Common crawler4j configuration
All crawler4j configuration options live in edu.uci.ics.crawler4j.crawler.CrawlConfig. The individual properties are described below, and a short configuration sketch follows the list.

crawlStorageFolder: intermediate storage folder for crawled data; effectively a staging area for fetched files.
resumableCrawling: whether to resume a crawl that previously stopped abnormally or whose data was corrupted. Off by default; enabling it inevitably reduces crawl throughput.
maxDepthOfCrawling: maximum crawl depth. Default -1, i.e. unlimited depth.
maxPagesToFetch: maximum number of pages to fetch. Default -1, i.e. unlimited.
userAgentString: the user agent string presented to web servers. Default "crawler4j (http://code.google.com/p/crawler4j/)".
politenessDelay: delay in milliseconds between two requests to the same host. Default 200.
includeHttpsPages: whether to include HTTPS pages. Included by default.
includeBinaryContentInCrawling: whether to fetch binary content such as images and audio. Default false.
maxConnectionsPerHost: maximum number of connections per host. Default 100.
maxTotalConnections: maximum total number of connections. Default 100.
socketTimeout: socket timeout in milliseconds. Default 20000.
connectionTimeout: connection timeout in milliseconds. Default 30000.
maxOutgoingLinksToFollow: maximum number of outgoing links to follow per page. Default 5000.
maxDownloadSize: maximum download size per page, in bytes. Default 1048576 (1 MB); content beyond this limit is not downloaded.
followRedirects: whether to follow redirected pages. Default true.
proxyHost: proxy host address, only needed when crawling through a proxy.
proxyPort: proxy port.
proxyUsername: proxy username.
proxyPassword: proxy password.
authInfos: authentication information for sites that require login.
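As a quick illustration, here is a minimal sketch of setting a few of these properties on a CrawlConfig before handing it to a CrawlController; the concrete values (storage folder, delay, depth, page limit, proxy host) are placeholders chosen for the example, not recommendations.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class CrawlConfigExample {

    public static void main(String[] args) {
        CrawlConfig config = new CrawlConfig();

        // Intermediate storage for crawl data (placeholder path).
        config.setCrawlStorageFolder("/data/crawl/root");

        // Wait 1000 ms between two requests to the same host.
        config.setPolitenessDelay(1000);

        // Follow links at most 3 hops from the seeds, and stop after 1000 pages.
        config.setMaxDepthOfCrawling(3);
        config.setMaxPagesToFetch(1000);

        // Skip binary content such as images and audio.
        config.setIncludeBinaryContentInCrawling(false);

        // Allow the crawl to be resumed after an abnormal shutdown.
        config.setResumableCrawling(true);

        // Only needed when crawling through a proxy (hypothetical host and port).
        // config.setProxyHost("proxy.example.com");
        // config.setProxyPort(8080);

        // The configured instance would then be passed to a CrawlController,
        // as in the MyCrawler example above.
        System.out.println(config);
    }
}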