这是我写的第一条爬虫,代码结构还有待改进。它使用正则表达式爬取知乎某个页面下的问题标题及其地址,后续需要再重构一下。
知乎收集器单独作为一个类:
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Collects Zhihu questions (title plus address) from a page's HTML text by
 * matching anchors that carry the {@code question_link} class.
 *
 * <p>Example of an anchor this collector targets:
 * {@code <a class="question_link" href="/question/39349640/answer/80899763">title</a>}
 */
public class ZCollector implements Collector {
    /** Prefix prepended to the (relative) href captured from each anchor. */
    private static final String BASE_URL = "http://www.zhihu.com";

    /**
     * Matches one question anchor in a single pass so that the address and the
     * title are always taken from the same tag:
     * group 1 is the href value, group 2 is the link text.
     *
     * <p>{@code [^>]} keeps the href search inside the tag that contains
     * {@code question_link}; opening tags nested directly inside the anchor
     * (for example {@code <b>}) and whitespace around the text are skipped.
     * Character classes are used instead of {@code .} so a title placed on
     * its own line is still found.
     */
    private static final Pattern QUESTION_LINK_PATTERN = Pattern.compile(
            "question_link[^>]*?href=\"([^\"]+)\"[^>]*>(?:\\s*<[^/>][^>]*>)*\\s*([^<]+?)\\s*<");

    /** Questions collected so far, in the order they were found. */
    final ArrayList<Zhihu> ls = new ArrayList<>();

    /** Creates an empty collector. */
    public ZCollector() {
    }

    /**
     * Scans {@code target} for question anchors and appends every question
     * found to this collector. Results accumulate across calls.
     *
     * <p>An anchor whose href does not form a valid URL is reported on
     * standard output and skipped, so no entry without an address is stored.
     * A {@code null} or empty {@code target} adds nothing.
     *
     * @param target HTML text to scan; may be {@code null}
     */
    public void collectInfoFrom(String target) {
        if (target == null || target.isEmpty()) {
            return;
        }
        Matcher matcher = QUESTION_LINK_PATTERN.matcher(target);
        while (matcher.find()) {
            String title = matcher.group(2).trim();
            if (title.isEmpty()) {
                // Whitespace-only link text carries no question; ignore it.
                continue;
            }
            URL url;
            try {
                url = new URL(BASE_URL + matcher.group(1));
            } catch (MalformedURLException e) {
                System.out.println("error url");
                e.printStackTrace();
                // Skip this anchor rather than storing a question with a null address.
                continue;
            }
            ls.add(new Zhihu(url, title));
        }
    }

    /**
     * Returns the string forms of all collected questions, concatenated in
     * collection order; the empty string when nothing has been collected.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (Zhihu z : ls) {
            sb.append(z.toString());
        }
        return sb.toString();
    }
}
Zhihu类:
import java.net.URL;
import java.util.ArrayList;
/**
 * Simple value object for one Zhihu question: its title and its address.
 */
public class Zhihu {
    /** Address of the question; may be {@code null} if the caller had no valid URL. */
    private final URL url;
    /** Title of the question. */
    private final String title;

    /**
     * @param url   address of the question; {@code null} is tolerated
     * @param title title of the question
     */
    public Zhihu(URL url, String title) {
        this.url = url;
        this.title = title;
    }

    /**
     * Returns one line of text containing the title and the address,
     * terminated by a newline.
     */
    @Override
    public String toString() {
        // Plain concatenation renders a null url as "null" instead of
        // throwing NullPointerException the way url.toString() would.
        return "问题 : " + title + " " + "地址: " + url + "\n";
    }
}
最后是客户端:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line client: downloads one Zhihu page, hands the HTML to a
 * {@link ZCollector} and prints what the collector found.
 */
public class SpiderClient {
    /** Collector that accumulates the questions extracted from the downloaded page. */
    static ZCollector zc = new ZCollector();

    /**
     * Downloads the resource at {@code source} and returns its content as text.
     *
     * <p>The response is decoded as UTF-8 and read line by line; every line
     * is terminated with {@code "\n"} in the result. Failures are reported on
     * standard output and are not rethrown: whatever was read before the
     * failure (possibly the empty string) is returned.
     *
     * @param source address of the resource to download
     * @return the downloaded text, or the part read before a failure occurred
     */
    public static String getXmlInfo(String source) {
        StringBuilder sb = new StringBuilder();
        try {
            URLConnection connection = new URL(source).openConnection();
            connection.connect(); // open the connection to the site
            // try-with-resources closes the reader (and the underlying stream)
            // on both the success path and the failure path.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            // Best effort: report the failure and fall through to return what we have.
            System.out.println("URL请求失败");
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Fetches the Zhihu recommendations page, collects its questions and
     * prints them to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String source = "http://www.zhihu.com/explore/recommendations";
        String result = getXmlInfo(source);
        zc.collectInfoFrom(result);
        System.out.println(zc);
    }
}