package 我的虫子;
import java.net.*;
import java.util.regex.*;
import java.io.*;
/**
 * Downloads the numbered ranking pages of a Baidu Tieba forum, extracts the text of the
 * matching links on each page and appends the names, each prefixed with {@code @}, to a
 * text file with at most five names per line.
 */
public class Chongzi {
    /** File that the generated lines are appended to. */
    private static final String OUTPUT_PATH = "f:\\1.txt";

    /**
     * Ranking page URL without the page number. To target another forum: follow that forum,
     * open its ranking page and paste that page's URL here.
     */
    private static final String RANK_URL_PREFIX = "http://tieba.baidu.com/f/like/furank?kw=java&pn=";

    /** First page number requested. */
    private static final int FIRST_PAGE = 1;

    /** Page numbers strictly below this value are requested. */
    private static final int PAGE_LIMIT = 1000;

    /** Matches the tail of an anchor tag: {@code ">}, 3 to 20 characters, {@code </a>} and one more character. */
    private static final Pattern NAME_LINK = Pattern.compile("[\"][\\>](.){3,20}[\\<][//][a][/>][/<]");

    /** Matches the {@code </a><">} seam produced when consecutive link matches are concatenated. */
    private static final Pattern LINK_SEPARATOR = Pattern.compile("[\\<][/][a][\\>][\\<][\"][\\>]");

    /** Label of the "search whole forum" link, which the link pattern also matches. */
    private static final String SEARCH_LABEL = "全吧搜索";

    /** The first entries of every page are the forum moderators; they are skipped. */
    private static final int LEADING_ENTRIES_TO_SKIP = 3;

    /** Tieba limits how many users can be mentioned at once, so each line holds five names. */
    private static final int NAMES_PER_LINE = 5;

    /**
     * Fetches every ranking page and appends the extracted names to {@link #OUTPUT_PATH}.
     *
     * @param args ignored
     * @throws Exception if a page cannot be downloaded or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        File target = new File(OUTPUT_PATH);
        // NOTE(review): FileWriter and the page reader both use the platform default charset;
        // confirm this matches the encoding served by the site.
        // try-with-resources closes (and flushes) the writer even when a page fetch fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(target, true))) {
            StringBuilder pendingLine = new StringBuilder();
            // Counted across pages so that a line never exceeds NAMES_PER_LINE, even when a
            // page ends in the middle of a line.
            int namesInLine = 0;
            for (int page = FIRST_PAGE; page < PAGE_LIMIT; page++) {
                String[] entries = LINK_SEPARATOR.split(fetchLinkText(page));
                for (int j = LEADING_ENTRIES_TO_SKIP; j < entries.length; j++) {
                    pendingLine.append(" @").append(stripMarkup(entries[j]));
                    namesInLine++;
                    if (namesInLine == NAMES_PER_LINE) {
                        writeLine(out, pendingLine);
                        namesInLine = 0;
                    }
                }
            }
            // Names left over after the last page would otherwise be lost.
            if (namesInLine > 0) {
                writeLine(out, pendingLine);
            }
        }
    }

    /** Downloads one ranking page and returns all link matches found on it, concatenated. */
    private static String fetchLinkText(int page) throws IOException {
        URL url = new URL(RANK_URL_PREFIX + page);
        StringBuilder collected = new StringBuilder();
        // The reader (and the underlying connection stream) is closed after every page.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                Matcher matcher = NAME_LINK.matcher(line);
                while (matcher.find()) {
                    collected.append(matcher.group());
                    // The search label still gets through the pattern; whenever it is present,
                    // drop the first six characters and the last one of the collected text.
                    if (collected.indexOf(SEARCH_LABEL) >= 0) {
                        collected.setLength(collected.length() - 1);
                        collected.delete(0, 6);
                    }
                }
            }
        }
        return collected.toString();
    }

    /** Removes the markup remnants that splitting can leave at the start or end of an entry. */
    private static String stripMarkup(String entry) {
        String name = entry;
        if (name.contains("</a>\">")) {
            // Drops the first six characters (the length of the marker).
            name = name.substring(6);
        }
        if (name.contains("</a><")) {
            // Drops the last five characters (the length of the marker).
            name = name.substring(0, name.length() - 5);
        }
        return name;
    }

    /** Writes the buffered line followed by CRLF, flushes it to disk and empties the buffer. */
    private static void writeLine(BufferedWriter out, StringBuilder pendingLine) throws IOException {
        out.write(pendingLine + "\r\n");
        out.flush();
        pendingLine.setLength(0);
    }
}