A web crawler, also called a web spider (some projects call it a "walker"), is defined by Wikipedia as a program that systematically scans the Internet for the purpose of building an index. There are many open-source crawler projects; the best known are Heritrix and Apache Nutch.

Sometimes information has to be collected from the web. When that information is simple to fetch mechanically but tedious to gather by hand, for example counting how many posts a site publishes each month and which tags it uses, collecting corpora for a natural language processing project, or gathering images for a pattern recognition project, a crawler is the right tool for the job. A web crawler is also an indispensable component of any search engine.

Many crawlers are written in Python, Java or C#. The program given here is a Java crawler. To save time and space, it is restricted to pages under this blog's own address (that is, http://johnhany.net/, excluding everything under http://johnhany.net/wp-content/), and it collects all the tags used on the site from the URLs it finds. With small changes that remove the restriction in the code, it could scan the whole web; with small changes to the output format, it could serve as a sitemap generator for the blog.

The code can also be downloaded here: johnhany/WPCrawler.

Requirements

My development environment is Windows 7 + Eclipse. XAMPP is needed to provide the port through which the MySQL database is accessed by URL. Three open-source Java libraries are also used:

Apache HttpComponents 4.3, which provides the HTTP interface used to send requests to the target URLs and fetch the page content;
HTML Parser 2.0, which parses the pages and extracts links from the DOM nodes;
MySQL Connector/J 5.1.27, which connects the Java program to MySQL so the database can be manipulated from Java code.

Code

The code lives in three files, crawler.java, httpGet.java and parsePage.java, all in the package net.johnhany.wpcrawler.

crawler.java

package net.johnhany.wpcrawler;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class crawler {

    public static void main(String args[]) throws Exception {
        String frontpage = "http://johnhany.net/";
        Connection conn = null;

        //connect the MySQL database
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String dburl = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(dburl, "root", "");
            System.out.println("connection built");
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }

        String sql = null;
        String url = frontpage;
        Statement stmt = null;
        ResultSet rs = null;
        int count = 0;

        if(conn != null) {
            //create database and table that will be needed
            try {
                sql = "CREATE DATABASE IF NOT EXISTS crawler";
                stmt = conn.createStatement();
                stmt.executeUpdate(sql);

                sql = "USE crawler";
                stmt = conn.createStatement();
                stmt.executeUpdate(sql);

                sql = "create table if not exists record (recordID int(5) not null auto_increment, URL text not null, crawled tinyint(1) not null, primary key (recordID)) engine=InnoDB DEFAULT CHARSET=utf8";
                stmt = conn.createStatement();
                stmt.executeUpdate(sql);

                sql = "create table if not exists tags (tagnum int(4) not null auto_increment, tagname text not null, primary key (tagnum)) engine=InnoDB DEFAULT CHARSET=utf8";
                stmt = conn.createStatement();
                stmt.executeUpdate(sql);
            } catch (SQLException e) {
                e.printStackTrace();
            }

            //crawl every link in the database
            while(true) {
                //get page content of link "url"
                httpGet.getByString(url, conn);
                count++;

                //set boolean value "crawled" to true after crawling this page
                sql = "UPDATE record SET crawled = 1 WHERE URL = '" + url + "'";
                stmt = conn.createStatement();

                if(stmt.executeUpdate(sql) > 0) {
                    //get the next page that has not been crawled yet
                    sql = "SELECT * FROM record WHERE crawled = 0";
                    stmt = conn.createStatement();
                    rs = stmt.executeQuery(sql);

                    if(rs.next()) {
                        url = rs.getString(2);
                    }else {
                        //stop crawling if reach the bottom of the list
                        break;
                    }

                    //set a limit of crawling count
                    if(count > 1000 || url == null) {
                        break;
                    }
                }
            }

            conn.close();
            conn = null;
            System.out.println("Done.");
            System.out.println(count);
        }
    }
}

httpGet.java

package net.johnhany.wpcrawler;

import java.io.IOException;
import java.sql.Connection;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class httpGet {

    public final static void getByString(String url, Connection conn) throws Exception {
        CloseableHttpClient httpclient = HttpClients.createDefault();

        try {
            HttpGet httpget = new HttpGet(url);
            System.out.println("executing request " + httpget.getURI());

            ResponseHandler<String> responseHandler = new ResponseHandler<String>() {
                public String handleResponse(
                        final HttpResponse response) throws ClientProtocolException, IOException {
                    int status = response.getStatusLine().getStatusCode();
                    if (status >= 200 && status < 300) {
                        HttpEntity entity = response.getEntity();
                        return entity != null ? EntityUtils.toString(entity) : null;
                    } else {
                        throw new ClientProtocolException("Unexpected response status: " + status);
                    }
                }
            };
            String responseBody = httpclient.execute(httpget, responseHandler);

            /*
            //print the content of the page
            System.out.println("----------------------------------------");
            System.out.println(responseBody);
            System.out.println("----------------------------------------");
            */

            parsePage.parseFromString(responseBody, conn);

        } finally {
            httpclient.close();
        }
    }
}

parsePage.java

package net.johnhany.wpcrawler;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import java.net.URLDecoder;

public class parsePage {

    public static void parseFromString(String content, Connection conn) throws Exception {
        Parser parser = new Parser(content);
        HasAttributeFilter filter = new HasAttributeFilter("href");

        try {
            NodeList list = parser.parse(filter);
            int count = list.size();

            //process every link on this page
            for(int i=0; i<count; i++) {
                Node node = list.elementAt(i);

                if(node instanceof LinkTag) {
                    LinkTag link = (LinkTag) node;
                    String nextlink = link.extractLink();
                    String mainurl = "http://johnhany.net/";
                    String wpurl = mainurl + "wp-content/";

                    //only save page from "http://johnhany.net"
                    if(nextlink.startsWith(mainurl)) {
                        String sql = null;
                        ResultSet rs = null;
                        PreparedStatement pstmt = null;
                        Statement stmt = null;
                        String tag = null;

                        //do not save any page from "wp-content"
                        if(nextlink.startsWith(wpurl)) {
                            continue;
                        }

                        try {
                            //check if the link already exists in the database
                            sql = "SELECT * FROM record WHERE URL = '" + nextlink + "'";
                            stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE);
                            rs = stmt.executeQuery(sql);

                            if(rs.next()) {

                            }else {
                                //if the link does not exist in the database, insert it
                                sql = "INSERT INTO record (URL, crawled) VALUES ('" + nextlink + "',0)";
                                pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
                                pstmt.execute();
                                System.out.println(nextlink);

                                //use substring for better comparison performance
                                nextlink = nextlink.substring(mainurl.length());
                                //System.out.println(nextlink);

                                if(nextlink.startsWith("tag/")) {
                                    tag = nextlink.substring(4, nextlink.length()-1);
                                    //decode in UTF-8 for Chinese characters
                                    tag = URLDecoder.decode(tag, "UTF-8");
                                    sql = "INSERT INTO tags (tagname) VALUES ('" + tag + "')";
                                    pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
                                    //if the links are different from each other, the tags must be different
                                    //so there is no need to check if the tag already exists
                                    pstmt.execute();
                                }
                            }
                        } catch (SQLException e) {
                            //handle the exceptions
                            System.out.println("SQLException: " + e.getMessage());
                            System.out.println("SQLState: " + e.getSQLState());
                            System.out.println("VendorError: " + e.getErrorCode());
                        } finally {
                            //close and release the resources of PreparedStatement, ResultSet and Statement
                            if(pstmt != null) {
                                try {
                                    pstmt.close();
                                } catch (SQLException e2) {}
                            }
                            pstmt = null;

                            if(rs != null) {
                                try {
                                    rs.close();
                                } catch (SQLException e1) {}
                            }
                            rs = null;

                            if(stmt != null) {
                                try {
                                    stmt.close();
                                } catch (SQLException e3) {}
                            }
                            stmt = null;
                        }
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }
}

How the program works
The Internet is, in essence, a mesh: between any two nodes there may be a path. From the standpoint of graph theory, crawling the web is a traversal of a directed graph (a link points from one page to another, so the edges are directed). The two common traversal strategies are depth-first and breadth-first; the relevant theory can be found under tree traversal, here and here. My program uses breadth-first traversal.
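To illustrate the idea only (this is not the crawler described below, which keeps its frontier and its "visited" flag in the record table), a breadth-first traversal can be sketched with an in-memory queue and a visited set; the class and the fetchLinks() placeholder are invented for the example:

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

//illustrative sketch only; the class and fetchLinks() are invented, not part of the crawler below
public class BfsSketch {

    //stands in for "download the page and extract its links"
    static Set<String> fetchLinks(String url) {
        return new HashSet<String>();
    }

    public static void main(String[] args) {
        Queue<String> frontier = new ArrayDeque<String>();
        Set<String> visited = new HashSet<String>();
        frontier.add("http://johnhany.net/");

        while (!frontier.isEmpty()) {
            String url = frontier.poll();      //oldest URL first, which gives breadth-first order
            if (!visited.add(url)) {
                continue;                      //already seen, skip it
            }
            for (String next : fetchLinks(url)) {
                if (!visited.contains(next)) {
                    frontier.add(next);
                }
            }
        }
    }
}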
The program starts from main() in crawler.java.
Class.forName("com.mysql.jdbc.Driver");
String dburl = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=utf8";
conn = DriverManager.getConnection(dburl, "root", "");
System.out.println("connection built");
First, DriverManager is called to connect to the MySQL service. The default XAMPP MySQL port 3306 is used; the port number can be checked in the XAMPP control panel.
Once Apache and MySQL are both running, open "http://localhost/phpmyadmin/" in a browser to see the database. After the program has finished, this is a convenient place to verify that it ran correctly.
sql = "CREATE DATABASE IF NOT EXISTS crawler";
stmt = conn.createStatement();
stmt.executeUpdate(sql);
sql = "USE crawler";
stmt = conn.createStatement();
stmt.executeUpdate(sql);
sql = "create table if not exists record (recordID int(5) not null auto_increment, URL text not null, crawled tinyint(1) not null, primary key (recordID)) engine=InnoDB DEFAULT CHARSET=utf8";
stmt = conn.createStatement();
stmt.executeUpdate(sql);
sql = "create table if not exists tags (tagnum int(4) not null auto_increment, tagname text not null, primary key (tagnum)) engine=InnoDB DEFAULT CHARSET=utf8";
stmt = conn.createStatement();
stmt.executeUpdate(sql);
With the connection established, a database named "crawler" is created, containing two tables. The first, "record", has the columns "recordID", "URL" and "crawled", which store the row number, the link address, and whether that address has been crawled. The second, "tags", has the columns "tagnum" and "tagname", which store the tag number and the tag name.
while(true) {
    httpGet.getByString(url, conn);
    count++;

    sql = "UPDATE record SET crawled = 1 WHERE URL = '" + url + "'";
    stmt = conn.createStatement();

    if(stmt.executeUpdate(sql) > 0) {
        sql = "SELECT * FROM record WHERE crawled = 0";
        stmt = conn.createStatement();
        rs = stmt.executeQuery(sql);

        if(rs.next()) {
            url = rs.getString(2);
        }else {
            break;
        }
    }
}
Then a while loop processes every address in the record table in turn. For each one, the URL is passed to httpGet.getByString(), after which the row's crawled flag is set to true to mark the address as processed. The program then looks for the next address whose crawled flag is still false and carries on until it reaches the end of the table.
One detail to note: after executeQuery() returns, the ResultSet rs holds all the rows returned by the query plus a cursor, which initially points just before the first row. A first call to rs.next() moves the cursor onto the first result and returns true; each subsequent call moves it to the next result and returns true, until no results remain, at which point rs.next() returns false.
Another detail: statements that create databases or tables, as well as INSERT and UPDATE, must be run with executeUpdate(), while SELECT must be run with executeQuery(). executeQuery() always returns a ResultSet; executeUpdate() returns the number of rows affected.
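As a minimal sketch of the two calls, assuming an open Connection to the "crawler" database created above (the class and method names here are made up for illustration):

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

//illustrative sketch; assumes conn is already connected to the "crawler" database created above
public class JdbcCallsDemo {

    static void demo(Connection conn) throws SQLException {
        Statement stmt = conn.createStatement();

        //DDL, INSERT and UPDATE go through executeUpdate(), which returns the number of affected rows
        int affected = stmt.executeUpdate("UPDATE record SET crawled = 1 WHERE recordID = 1");
        System.out.println(affected + " row(s) updated");

        //SELECT goes through executeQuery(); the cursor starts before the first row
        ResultSet rs = stmt.executeQuery("SELECT URL FROM record WHERE crawled = 0");
        while (rs.next()) {                    //returns false once no rows are left
            System.out.println(rs.getString("URL"));
        }
        rs.close();
        stmt.close();
    }
}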
The getByString() method of httpGet.java sends a request to the given URL and downloads the page content.
HttpGet httpget = new HttpGet(url);
System.out.println("executing request " + httpget.getURI());

ResponseHandler<String> responseHandler = new ResponseHandler<String>() {
    public String handleResponse(
            final HttpResponse response) throws ClientProtocolException, IOException {
        int status = response.getStatusLine().getStatusCode();
        if (status >= 200 && status < 300) {
            HttpEntity entity = response.getEntity();
            return entity != null ? EntityUtils.toString(entity) : null;
        } else {
            throw new ClientProtocolException("Unexpected response status: " + status);
        }
    }
};
String responseBody = httpclient.execute(httpget, responseHandler);
This code is taken from the sample shipped with the HttpComponents HttpClient module and can be used as-is in many situations. It yields a string, responseBody, that holds the entire text of the page.
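For comparison, the same download can be written without a ResponseHandler; this is only a sketch against the HttpClient 4.3 API, not part of the program above:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

//illustrative sketch against HttpClient 4.3; not part of the crawler itself
public class FetchDemo {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient client = HttpClients.createDefault();
        CloseableHttpResponse response = client.execute(new HttpGet("http://johnhany.net/"));
        try {
            if (response.getStatusLine().getStatusCode() == 200) {
                //read the whole body into a String
                String body = EntityUtils.toString(response.getEntity());
                System.out.println(body.length() + " characters downloaded");
            }
        } finally {
            response.close();
            client.close();
        }
    }
}

The ResponseHandler form used by the program has the advantage that the underlying connection is released automatically even when an exception is thrown.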
Next, responseBody is handed to the parseFromString() method of parsePage.java to extract the links.
Parser parser = new Parser(content);
HasAttributeFilter filter = new HasAttributeFilter("href");

try {
    NodeList list = parser.parse(filter);
    int count = list.size();

    //process every link on this page
    for(int i=0; i<count; i++) {
        Node node = list.elementAt(i);
        if(node instanceof LinkTag) {
In an HTML document, links normally sit in the href attribute of <a> tags, so an attribute filter for href is created. The NodeList returned by parser.parse(filter) holds the matching DOM nodes; looping over it and keeping the nodes that are LinkTag instances extracts every link on the page.
nextlink.startsWith() then narrows the selection further: only links beginning with "http://johnhany.net/" are processed, and links beginning with "http://johnhany.net/wp-content/" are skipped.
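If other href-bearing elements such as <link> ever get in the way, the filter could be narrowed to <a> tags. The following is only a sketch against the HTML Parser 2.0 API (the class name and the inline HTML string are invented for the example), not what the program above does:

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

//illustrative sketch; the class name and the inline HTML are invented for the example
public class LinkFilterDemo {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><a href=\"http://johnhany.net/tag/opencv/\">OpenCV</a></body></html>";
        Parser parser = Parser.createParser(html, "UTF-8");

        //keep only <a> elements that actually carry an href attribute
        NodeList links = parser.extractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href")));

        for (int i = 0; i < links.size(); i++) {
            Node node = links.elementAt(i);
            if (node instanceof LinkTag) {
                System.out.println(((LinkTag) node).extractLink());
            }
        }
    }
}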
sql = "SELECT * FROM record WHERE URL = '" + nextlink + "'";
stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE);
rs = stmt.executeQuery(sql);

if(rs.next()) {

}else {
    //if the link does not exist in the database, insert it
    sql = "INSERT INTO record (URL, crawled) VALUES ('" + nextlink + "',0)";
    pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
    pstmt.execute();
The record table is queried to see whether the link is already present. If it is (rs.next() returns true), nothing is done; if it is not (rs.next() returns false), the address is inserted with crawled set to false. Because recordID was declared AUTO_INCREMENT, Statement.RETURN_GENERATED_KEYS is passed so that the generated key can be retrieved.
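Because the URL is spliced directly into the SQL string, a link containing a single quote would break the statement. A safer variant, shown here only as a sketch and not what the code above does, binds the value through a PreparedStatement placeholder:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

//illustrative sketch; assumes conn is connected to the "crawler" database, the method name is made up
public class RecordInsertDemo {

    static void saveIfNew(Connection conn, String nextlink) throws SQLException {
        PreparedStatement check = conn.prepareStatement("SELECT recordID FROM record WHERE URL = ?");
        check.setString(1, nextlink);          //the driver escapes the value for us
        ResultSet rs = check.executeQuery();

        if (!rs.next()) {
            PreparedStatement insert =
                    conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?, 0)");
            insert.setString(1, nextlink);
            insert.executeUpdate();
            insert.close();
        }

        rs.close();
        check.close();
    }
}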
nextlink = nextlink.substring(mainurl.length());

if(nextlink.startsWith("tag/")) {
    tag = nextlink.substring(4, nextlink.length()-1);
    tag = URLDecoder.decode(tag, "UTF-8");
    sql = "INSERT INTO tags (tagname) VALUES ('" + tag + "')";
    pstmt = conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
    pstmt.execute();
The leading "http://johnhany.net/" is stripped from the link to speed up the string comparisons. If the remainder begins with "tag/", whatever follows is the name of a tag; that name is extracted, decoded as UTF-8 so that Chinese characters come out correctly, and stored in the tags table. Checks for "article/", "author/" or "2013/11/" could be added in the same way to classify other kinds of links.
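As a concrete illustration of the substring and decode steps (the tag link below is made up):

import java.net.URLDecoder;

//illustrative sketch; the tag link is invented to show the substring and decode steps
public class TagDecodeDemo {
    public static void main(String[] args) throws Exception {
        String mainurl = "http://johnhany.net/";
        String nextlink = "http://johnhany.net/tag/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/";

        String rest = nextlink.substring(mainurl.length());   //"tag/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/"
        String tag = rest.substring(4, rest.length() - 1);    //drop "tag/" and the trailing "/"
        System.out.println(URLDecoder.decode(tag, "UTF-8"));  //prints the decoded Chinese tag name
    }
}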
Results
Two screenshots of the database show part of the program's output.
The complete output can be obtained here. Comparing it with this blog's sitemap shows what changes would still be needed to turn the program into a sitemap generator.
Appendix: web excerpt: a simple crawler implementation
A web crawler is a program that automatically fetches web pages; it downloads pages from the World Wide Web on behalf of a search engine and is one of a search engine's essential components. Its basic architecture is shown below:
(figure: basic architecture of a web crawler)
A traditional crawler starts from the URLs of one or more seed pages, collects the URLs found on them, and, as it crawls, keeps extracting new URLs from the current page and adding them to a queue until some stopping condition is met. For vertical search, a focused crawler, one that selectively fetches pages on a particular topic, is the better fit.
The core code of this crawler is as follows:
Java code
public void crawl() throws Throwable {
    while (continueCrawling()) {
        CrawlerUrl url = getNextUrl(); //get the next URL from the queue of URLs waiting to be crawled
        if (url != null) {
            printCrawlInfo();
            String content = getContent(url); //download the text content of the URL

            //a focused crawler only fetches pages relevant to the topic; a simple regex match is used here
            if (isContentRelevant(content, this.regexpSearchPattern)) {
                saveContent(url, content); //save the page locally

                //extract the links in the page content and add them to the crawl queue
                Collection urlStrings = extractUrls(content, url);
                addUrlsToUrlQueue(url, urlStrings);
            } else {
                System.out.println(url + " is not relevant ignoring ...");
            }

            //pause between requests so the target site does not block us
            Thread.sleep(this.delayBetweenUrls);
        }
    }
    closeOutputStream();
}
The function is built from a handful of core methods: getNextUrl, getContent, isContentRelevant, extractUrls and addUrlsToUrlQueue, introduced one by one below. First, getNextUrl:
Java code
private CrawlerUrl getNextUrl() throws Throwable {
    CrawlerUrl nextUrl = null;
    while ((nextUrl == null) && (!urlQueue.isEmpty())) {
        CrawlerUrl crawlerUrl = this.urlQueue.remove();

        //doWeHavePermissionToVisit: may we visit this URL? A polite crawler follows the rules in the site's robots.txt
        //isUrlAlreadyVisited: has the URL been visited before? Large search engines often deduplicate with a Bloom filter; a HashMap is used here for simplicity
        //isDepthAcceptable: has the depth limit been reached? Crawlers usually work breadth-first; some sites build crawler traps (automatically generated dead-end links that lock a crawler into an endless loop), and a depth limit avoids them
        if (doWeHavePermissionToVisit(crawlerUrl)
            && (!isUrlAlreadyVisited(crawlerUrl))
            && isDepthAcceptable(crawlerUrl)) {
            nextUrl = crawlerUrl;
            // System.out.println("Next url to be visited is " + nextUrl);
        }
    }
    return nextUrl;
}
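The deduplication mentioned in the comments can be as small as a set of URL strings that have already been seen; a minimal sketch, with names invented for the example rather than taken from the excerpt:

import java.util.HashSet;
import java.util.Set;

//illustrative sketch; names are invented and not taken from the excerpt
public class VisitedSet {
    private final Set<String> seen = new HashSet<String>();

    //returns true exactly once per URL; later calls report it as already visited
    public synchronized boolean markIfUnseen(String url) {
        return seen.add(url);
    }
}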
For more detail on how to write a robots.txt file, see the following article:
http://www.bloghuman.com/post/67/
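As a rough illustration of the doWeHavePermissionToVisit idea, the sketch below fetches /robots.txt and rejects URLs whose path starts with any Disallow prefix. It ignores User-agent groups, Allow rules and wildcards, so it is far cruder than a real robots.txt parser; the class and method names are invented for the example:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

//illustrative, deliberately naive sketch; names are invented for the example
public class NaiveRobotsCheck {

    //collects every "Disallow:" prefix in robots.txt and rejects matching paths
    public static boolean mayVisit(String urlString) throws Exception {
        URL url = new URL(urlString);
        URL robots = new URL(url.getProtocol() + "://" + url.getHost() + "/robots.txt");

        List<String> disallowed = new ArrayList<String>();
        BufferedReader in = new BufferedReader(new InputStreamReader(robots.openStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
            line = line.trim();
            if (line.toLowerCase().startsWith("disallow:")) {
                String path = line.substring("disallow:".length()).trim();
                if (path.length() > 0) {
                    disallowed.add(path);
                }
            }
        }
        in.close();

        for (String path : disallowed) {
            if (url.getPath().startsWith(path)) {
                return false;
            }
        }
        return true;
    }
}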
Internally, getContent uses Apache HttpClient 4.1 to download the page content:
Java code
private String getContent(CrawlerUrl url) throws Throwable {
    //the HttpClient 4.1 API differs from earlier versions
    HttpClient client = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(url.getUrlString());
    StringBuffer strBuf = new StringBuffer();
    HttpResponse response = client.execute(httpGet);
    if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            BufferedReader reader = new BufferedReader(
                new InputStreamReader(entity.getContent(), "UTF-8"));
            String line = null;
            if (entity.getContentLength() > 0) {
                strBuf = new StringBuffer((int) entity.getContentLength());
                while ((line = reader.readLine()) != null) {
                    strBuf.append(line);
                }
            }
        }
        if (entity != null) {
            entity.consumeContent();
        }
    }
    //mark the URL as visited
    markUrlAsVisited(url);
    return strBuf.toString();
}
For vertical applications, accuracy of the data usually matters more. The defining trait of a focused crawler is that it only collects data relevant to its topic, and that is the job of isContentRelevant. Classification or prediction techniques could be used here; for simplicity, a regular-expression match stands in for them. The main code:
Java code
public static boolean isContentRelevant(String content,
        Pattern regexpPattern) {
    boolean retValue = false;
    if (content != null) {
        //does the page text match the regular expression?
        Matcher m = regexpPattern.matcher(content.toLowerCase());
        retValue = m.find();
    }
    return retValue;
}
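A small usage sketch, assuming the methods above live in the NaiveCrawler class used by the test program further down:

import java.util.regex.Pattern;

//usage sketch; assumes isContentRelevant belongs to the NaiveCrawler class shown in the test program below
public class RelevanceDemo {
    public static void main(String[] args) {
        Pattern topic = Pattern.compile("java");
        //matching is done on content.toLowerCase(), so "Java" in the page still matches
        boolean relevant = NaiveCrawler.isContentRelevant("<html>Learn Java crawling</html>", topic);
        System.out.println(relevant);   //true
    }
}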
The job of extractUrls is to collect more URLs from the page, both internal and external links:
Java code
public List extractUrls(String text, CrawlerUrl crawlerUrl) {
    Map urlMap = new HashMap();
    extractHttpUrls(urlMap, text);
    extractRelativeUrls(urlMap, text, crawlerUrl);
    return new ArrayList(urlMap.keySet());
}

//handle absolute (external) links
private void extractHttpUrls(Map urlMap, String text) {
    Matcher m = httpRegexp.matcher(text);
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            // System.out.println("Term = " + term);
            if (term.startsWith("http")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                urlMap.put(term, term);
                System.out.println("Hyperlink: " + term);
            }
        }
    }
}

//handle relative (internal) links
private void extractRelativeUrls(Map urlMap, String text,
        CrawlerUrl crawlerUrl) {
    Matcher m = relativeRegexp.matcher(text);
    URL textURL = crawlerUrl.getURL();
    String host = textURL.getHost();
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            if (term.startsWith("/")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                String s = "http://" + host + term;
                urlMap.put(s, s);
                System.out.println("Relative url: " + s);
            }
        }
    }
}
With that, a simple web crawler is complete. It can be tested with the following program:
Java code
public static void main(String[] args) {
    try {
        String url = "http://www.amazon.com";
        Queue urlQueue = new LinkedList();
        String regexp = "java";
        urlQueue.add(new CrawlerUrl(url, 0));
        NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
                regexp);
        // boolean allowCrawl = crawler.areWeAllowedToVisit(url);
        // System.out.println("Allowed to crawl: " + url + " " +
        // allowCrawl);
        crawler.crawl();
    } catch (Throwable t) {
        System.out.println(t.toString());
        t.printStackTrace();
    }
}
Of course, it can be given more advanced capabilities, such as multithreading, smarter focusing, or building an index with Lucene. For more demanding cases, consider one of the open-source spiders such as Nutch or Heritrix; those are beyond the scope of this article.