Libraries used by the program (the HttpClient 4.1/HttpCore 4.1 pair from Apache HttpComponents, the commons-logging dependency they require, and HtmlParser for parsing pages):
commons-logging-1.1.1.jar
htmlparser.jar
httpclient-4.1.jar
httpcore-4.1.jar
package 简易网络爬虫;

import java.io.IOException;
import java.util.Set;
import org.apache.http.client.ClientProtocolException;
import org.htmlparser.util.ParserException;

/**
 * Entry point of the crawler
 * @author Administrator
 *
 */
public class Crawer {
    public static void crawlering(String rootUrl) throws ClientProtocolException, ParserException, IOException
    {
        // seed the crawl with the root URL
        initCrawlerUrl(rootUrl);
    }

    public static void initCrawlerUrl(String rootUrl) throws ClientProtocolException, IOException, ParserException
    {
        LinkDB.enQueue(rootUrl);
        // accept-all filter: every extracted URL is kept
        LinkFilter linkFilter = new LinkFilter()
        {
            public boolean accept(String url)
            {
                return true;
            }
        };
        // breadth-first crawl, capped at 50 downloaded pages
        while(!LinkDB.isQueueEmpty() && LinkDB.getVisitedUrlCount() <= 50)
        {
            String url = LinkDB.deQueue();
            FileDownLoader downLoader = new FileDownLoader();
            // set the downloader's content type
            downLoader.setDownType("html");
            downLoader.downloadFile(url);
            LinkDB.addVisitedUrl(url);
            Set<String> links = ExtractLink.extractLinks(url, linkFilter);
            for(String link : links)
            {
                // skip URLs that were already visited or are already queued
                if(!LinkDB.isVisitedUrl(link) && !LinkDB.isQueueContains(link))
                    LinkDB.enQueue(link);
            }
        }
    }

    public static void main(String[] args) throws ClientProtocolException, ParserException, IOException
    {
        Crawer.crawlering("http://dblp.uni-trier.de/");
    }
}
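The filter above accepts every URL it sees, so the crawl can wander off the seed site almost immediately. A minimal sketch of a host-restricted filter that could be passed to extractLinks instead (the variable name dblpFilter is illustrative, not part of the original code):

LinkFilter dblpFilter = new LinkFilter()
{
    public boolean accept(String url)
    {
        // keep the crawl on the seed host only
        return url.startsWith("http://dblp.uni-trier.de/");
    }
};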
package 简易网络爬虫;

import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Extracts the URLs contained in a page
 * @author Administrator
 *
 */
public class ExtractLink {
    // Pull the value of the src attribute out of a frame tag's text,
    // e.g. frame src="test.html" name="left" yields test.html
    public static String getFrameSrc(String text)
    {
        int start = text.indexOf("src=\"") + 5;  // position right after src="
        int end = text.indexOf("\"", start);     // closing quote of the value
        return text.substring(start, end);
    }

    public static Set<String> extractLinks(String url, LinkFilter linkFilter) throws ParserException
    {
        Set<String> links = new HashSet<String>();
        Parser parser = new Parser(url);
        // matches <frame src=...> tags, which htmlparser does not expose as LinkTag
        NodeFilter frameFilter = new NodeFilter(){
            public boolean accept(Node node)
            {
                return node.getText().contains("frame src=");
            }
        };
        // collect both ordinary <a href=...> links and frame sources
        OrFilter orFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList nodeList = parser.extractAllNodesThatMatch(orFilter);
        int size = nodeList.size();
        String text = "";
        for(int i = 0; i < size; i++)
        {
            Node node = nodeList.elementAt(i);
            if(node instanceof LinkTag)
            {
                LinkTag linkTag = (LinkTag) node;
                text = linkTag.getLink();
                if(linkFilter.accept(text))
                {
                    links.add(text);
                }
            }
            else // a frame tag matched by frameFilter
            {
                text = getFrameSrc(node.getText());
                links.add(text);
            }
        }
        return links;
    }
}
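For a quick standalone check of the extractor, a main method like the following could be added to ExtractLink (a sketch; the test URL is only an example):

public static void main(String[] args) throws ParserException
{
    Set<String> links = ExtractLink.extractLinks("http://www.baidu.com", new LinkFilter()
    {
        public boolean accept(String url)
        {
            return true; // accept everything for the test
        }
    });
    for (String link : links)
        System.out.println(link);
}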
package 简易网络爬虫;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * File downloader
 * @author Administrator
 *
 */
public class FileDownLoader {
    // content type of the file being downloaded
    private String downType;

    public String getDownType() {
        return downType;
    }

    public void setDownType(String downType) {
        this.downType = downType;
    }

    // Build the local file name from the URL and the content type,
    // replacing characters that are illegal in file names.
    // Example: getFileNameByUrl("http://www.baidu.com/index.html", "text/html")
    // yields "www.baidu.com_index.html.html".
    public String getFileNameByUrl(String url, String contentType)
    {
        url = url.substring(7); // strip the leading "http://"
        if(contentType.indexOf("html") != -1)
        {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        }
        else
        {
            // use the content-type subtype (e.g. "zip" from "application/zip") as the extension
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                    + contentType.substring(contentType.lastIndexOf("/") + 1);
        }
    }

    // Save textual content to disk, copying line by line with the platform charset
    public void saveToLocal(InputStream is, String filePath) throws IOException
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath))));
        String str = "";
        while((str = br.readLine()) != null)
        {
            bw.write(str + "\n");
        }
        bw.close();
        br.close();
        is.close();
    }

    // Save binary content to disk, copying raw bytes
    public void saveToLocal2(InputStream is, String filePath) throws IOException
    {
        byte[] buffer = new byte[1024];
        int offset = 0;
        FileOutputStream fos = new FileOutputStream(new File(filePath));
        while((offset = is.read(buffer)) != -1)
        {
            fos.write(buffer, 0, offset);
        }
        fos.close();
        is.close();
    }

    // Download the page the URL points to
    public void downloadFile(String url) throws ClientProtocolException, IOException
    {
        String filePath = null;
        // create the HttpClient instance and configure the request
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet getMethod = new HttpGet(url);
        System.out.println(url);
        getMethod.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        HttpResponse response = httpClient.execute(getMethod);
        if(response.getStatusLine().getStatusCode() == 200)
        {
            HttpEntity entity = response.getEntity();
            InputStream is = entity.getContent();
            if(downType != null)
            {
                filePath = "F:\\download\\" + this.getFileNameByUrl(url, entity.getContentType().getValue());
                if(downType.equals("zip"))
                    this.saveToLocal2(is, filePath); // binary: raw byte copy
                else
                    this.saveToLocal(is, filePath);  // text: line-by-line copy
            }
        }
        else
        {
            getMethod.abort(); // return the connection on a failed request
        }
        httpClient.getConnectionManager().shutdown(); // release the connection manager
    }

    public static void main(String[] args) throws ClientProtocolException, IOException
    {
        FileDownLoader fdl = new FileDownLoader();
        fdl.setDownType("html");
        String url = "http://www.baidu.com";
        fdl.downloadFile(url);
    }
}
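downloadFile chooses the save routine from the manually set downType field. A sketch of deriving the choice from the response's Content-Type header instead (an alternative, not the original design):

// inside downloadFile, after obtaining the entity:
String contentType = entity.getContentType().getValue();
if (contentType.startsWith("text/"))
    this.saveToLocal(is, filePath);   // textual content: line-by-line copy
else
    this.saveToLocal2(is, filePath);  // anything else: raw byte copy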
package 简易网络爬虫;

/**
 * URL filter
 * @author Administrator
 *
 */
public interface LinkFilter {
    // decide whether an extracted URL should be kept
    public boolean accept(String url);
}
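Any URL policy can be plugged in through this interface. A minimal named implementation, for contrast with the anonymous classes used above (the class name BinarySkippingFilter is illustrative, not part of the original code):

public class BinarySkippingFilter implements LinkFilter
{
    // skip links that point at archives rather than pages
    public boolean accept(String url)
    {
        return !url.endsWith(".zip") && !url.endsWith(".rar");
    }
}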
package 简易网络爬虫;

import java.util.HashSet;
import java.util.Set;

/**
 * Holds the downloaded and not-yet-downloaded link information
 * @author Administrator
 *
 */
public class LinkDB {
    // URLs that have already been visited
    private static Set<String> visitedUrl = new HashSet<String>();
    // URLs waiting to be visited
    private static Queue<String> unvisitedUrl = new Queue<String>();

    public static void addVisitedUrl(String url)
    {
        visitedUrl.add(url);
    }

    public static boolean isVisitedUrl(String url)
    {
        return visitedUrl.contains(url);
    }

    public static void enQueue(String url)
    {
        unvisitedUrl.enQueue(url);
    }

    public static String deQueue()
    {
        return unvisitedUrl.deQueue();
    }

    public static boolean isQueueEmpty()
    {
        return unvisitedUrl.isEmpty();
    }

    public static boolean isQueueContains(String url)
    {
        return unvisitedUrl.contains(url);
    }

    public static int getVisitedUrlCount()
    {
        return visitedUrl.size();
    }
}
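Together the two structures implement the crawl frontier: a URL moves from the unvisited queue to the visited set exactly once. The expected life cycle of a single URL, using only the methods above (a sketch):

LinkDB.enQueue("http://example.com/");   // discovered
String next = LinkDB.deQueue();          // taken for download
LinkDB.addVisitedUrl(next);              // marked as done
// later sightings of the same URL can now be filtered out:
boolean skip = LinkDB.isVisitedUrl(next) || LinkDB.isQueueContains(next);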
package 简易网络爬虫;

import java.util.LinkedList;

/**
 * FIFO queue of URLs that have not been processed yet
 * @author Administrator
 *
 */
public class Queue<T> {
    private LinkedList<T> queue = new LinkedList<T>();

    // remove and return the head of the queue
    public T deQueue()
    {
        return queue.removeFirst();
    }

    // append an element to the tail of the queue
    public void enQueue(T t)
    {
        queue.addLast(t);
    }

    public boolean contains(T t)
    {
        return queue.contains(t);
    }

    public boolean isEmpty()
    {
        return queue.isEmpty();
    }
}
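Note that java.util.LinkedList already implements the standard java.util.Queue interface, so this wrapper mainly narrows the API to the operations the crawler needs. The equivalent using the standard interface would look like this (a sketch, not part of the original code):

java.util.Queue<String> frontier = new java.util.LinkedList<String>();
frontier.offer("http://dblp.uni-trier.de/"); // enQueue
String next = frontier.poll();               // deQueue, returns null when empty
boolean empty = frontier.isEmpty();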