------- <a href="http://www.itheima.com" target="_blank">android培训</a>、<a href="http://www.itheima.com" target="_blank">java培训</a>、期待与您交流! ----------
传统爬虫从一个或若干初始网页的URL开始,获得初始网页上的URL,在抓取网页的过程中,不断从当前页面上抽取新的URL放入队列,直到满足系统的一定停止条件。对于垂直搜索来说,聚焦爬虫,即有针对性地爬取特定主题网页的爬虫,更为适合。
本文的爬虫程序从某论坛的网页中抓取邮箱地址,代码如下:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal "web spider" demo: an application that pulls text matching a given
 * rule (here, e-mail addresses) out of a web page or a local file.
 *
 * <p>Text is scanned one line at a time, so a match can never span a line break.
 */
public class NetSpidet {

    /**
     * Downloads one hard-coded forum page, extracts every substring that matches
     * the e-mail pattern and prints each match on its own line prefixed with
     * {@code List:}.
     *
     * @param args unused
     * @throws IOException if the page cannot be opened or read
     * @throws ClassNotFoundException never thrown by this body; kept only so the
     *         declared signature stays unchanged
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        // One or more word characters, '@', then a domain with at least one ".label" part.
        String regex = "\\w+@\\w+(\\.\\w+)+";
        String str_url = "http://bbs.tianya.cn/post-enterprise-401802-2.shtml";
        // Alternative page to try: http://tieba.baidu.com/p/2088202080
        // To scan a local file instead, call getMails(file, regex).
        List<String> list = getMailsByNet(str_url, regex);
        for (String mail : list) {
            System.out.println("List:" + mail);
        }
    }

    /**
     * Network-based variant: reads the resource at {@code str_url} line by line and
     * collects every match of {@code regex}.
     *
     * <p>The response bytes are decoded with the platform default charset because
     * no charset is passed to {@link InputStreamReader}.
     *
     * @param str_url the URL to read, in any form accepted by {@link URL#URL(String)}
     * @param regex   the regular expression to search for
     * @return all matches in order of appearance (duplicates kept); empty if none
     * @throws IOException if the URL is malformed or the resource cannot be read
     */
    public static List<String> getMailsByNet(String str_url, String regex) throws IOException {
        // 1. Wrap the string in a URL object.
        URL url = new URL(str_url);
        // 2. Open the connection.
        URLConnection conn = url.openConnection();
        // 3. Obtain the input stream and wrap it in a buffered character reader.
        InputStream in = conn.getInputStream();
        BufferedReader bufIn = new BufferedReader(new InputStreamReader(in));
        try {
            // 4. Compile the regex and scan every line.
            return collectMatches(bufIn, regex);
        } finally {
            // Always release the connection's stream, even when reading or
            // pattern compilation fails.
            bufIn.close();
        }
    }

    /**
     * Local-file variant: reads {@code file} line by line and collects every match
     * of {@code regex}.
     *
     * <p>The file is decoded with the platform default charset because
     * {@link FileReader} is constructed without one.
     *
     * @param file  the text file to scan
     * @param regex the regular expression to search for
     * @return all matches in order of appearance (duplicates kept); empty if none
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> getMails(File file, String regex) throws IOException {
        // 1. Open the file for reading.
        BufferedReader bufr = new BufferedReader(new FileReader(file));
        try {
            // 2. Compile the regex and scan every line.
            return collectMatches(bufr, regex);
        } finally {
            // Always release the file handle, even when reading or pattern
            // compilation fails.
            bufr.close();
        }
    }

    /**
     * Compiles {@code regex} and returns every match found in the lines supplied by
     * {@code reader}. The reader is not closed here; the caller owns it.
     */
    private static List<String> collectMatches(BufferedReader reader, String regex)
            throws IOException {
        List<String> found = new ArrayList<String>();
        Pattern pattern = Pattern.compile(regex);
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = pattern.matcher(line);
            while (matcher.find()) {
                found.add(matcher.group());
            }
        }
        return found;
    }
}
运行结果:
List:apoky@163.com
List:517776304@qq.com
List:ruan138248@163.com
List:471643738@qq.com
List:184696525@qq.com
List:pual9580@163.com
List:yzharold@gmail.com
List:921841757@qq.com
List:1142132167@qq.com
List:1421587505@qq.com
List:32607178@qq.com
List:58293633@qq.com
List:644439645@qq.com
List:116578974@qq.com
List:517025703@qq.com
List:844690025@qq.com
List:wangxingminmin@163.com
List:191115195@qq.com
List:dxd_mail@126.com
List:daxiangtouzi@126.com
List:yuandeliang23@sohu.com
List:zjyjqiqi@163.com
List:79281451@qq.com
List:792814518@qq.com
List:hakkaworld@gmail.com
List:potterjackloco@126.com
List:yuemanxilu@163.com
List:605567813@qq.com
List:xujun888888@sina.com
List:clnyc@163.com
List:874073717@qq.com
List:fight@126.com
List:545361574@qq.com
List:haoldx@126.com
List:huangzhihui19@sohu.com
List:zhuxinhua790507@163.com
List:guanghai810823@21cn.com
List:acerjm1@126.com
List:54381814@163.com
List:63395671@qq.com
List:63395671@qq.com
List:34990269@qq.com
List:632973900@qq.com
List:8329899@163.com
List:916199012@qq.com
List:vipaso@126.com
List:121267817@qq.com
List:lichenghe112@163.com
List:wanglijun37@163.com
List:bosimao911@sohu.com
List:yuanyuanbsh@126.com
List:452342704@qq.com
List:1130017589@139.com
List:648716579@qq.com
List:wozhai8hao@163.com
List:my1070@sina.com
List:522409718@qq.com
List:972139804@qq.com
List:84sha@163.com
List:wqbi@163.com
List:skqjx2008@163.com
List:alxiaowei8141@163.com
List:wxf8555@126.com
List:winin@163.com
List:314675229@qq.com
List:linxinv@yeah.net
List:277544847@qq.com
List:yahoo_lem@sina.com
List:234349573@qq.com
List:yyh5882@126.com
List:hmywem@163.com
List:34262383@qq.com
List:742830028@qq.com
List:215678385@QQ.COM
List:ang1123112@163.com
List:42830028@qq.com
List:gaoge1999@yeah.net
List:641844924@qq.comLZ
List:276499154@qq.com
List:1521682085@qq.com
List:mengjian141@sina.com
List:mengjian141@sina.com
List:330403811@qq.com
List:445810139@qq.com
List:690431828@qq.com
List:enylong@163.com
List:jww_007@sina.com
List:952761708@qq.com
List:297085112@qq.com
List:651397875@qq.com
List:253306560@qq.com
List:fkeipkr@tom.com
List:zzpei14@126.com