抽取豆瓣电影信息的例子:
我个人主要使用的类有 NodeFilter、TagNameFilter、AndFilter、HasAttributeFilter、NodeList 等。
public void extract() {
BufferedWriter bw = null;
NodeFilter title_filter = new TagNameFilter("h1");
NodeFilter info_filter = new AndFilter(new TagNameFilter("div"),
new HasAttributeFilter("id","info"));
NodeFilter detail_filter = new AndFilter(new TagNameFilter("span"),
new HasAttributeFilter("class","pl"));
NodeFilter plot_filter = new AndFilter(new TagNameFilter("span"),
new HasAttributeFilter("property","v:summary"));
try {
//解析电影标题,创建写入文件
NodeList nodeList = this.getParser().parse(title_filter);
BufferedWriter disabledPages = new BufferedWriter(new FileWriter(new File("E:\\disabledPages.txt"), true));
if(nodeList.size() == 0 || nodeList.elementAt(0).getChildren().size() < 2){
//判断网页文件是否包含正常的title
//System.out.println("Disabled page:" + this.getInputFilePath());
String url = "http://" + this.getInputFilePath().substring(10, 42).replaceAll("\\\\", "/");
disabledPages.write(url + NEWLINE);
this.disabledCount++;
if(disabledPages != null) {
disabledPages.close();
}
return;
}
NodeIterator it1 = nodeList.elementAt(0).getChildren().elements();
StringBuffer title = new StringBuffer();
while (it1.hasMoreNodes()) {
Node tmpNode = it1.nextNode();
if(tmpNode instanceof Span) {
title.append(tmpNode.toPlainTextString()).append("_");
//System.out.println(tmpNode.toPlainTextString());
//String txt = tmpNode.toPlainTextString();
//title.append(txt).append("_");
}
}
title.deleteCharAt(title.length() - 1);
String subject = title.toString().replaceAll("_", NEWLINE);
/**
* 特殊字符写入文件名的问题,有些字符GBK编码不支持,改用utf-8 。eg:
* 哭泣的草原 (2004) http://movie.douban.com/subject/1388180/
* "/"不能用作文件名字符,换成" ",eg:变脸 Face\Off (1997)
*/
String GBKTitle = title.toString();
//String GBKTitle = new String(title.toString().getBytes("GBK"));
String writableTitle = GBKTitle.replaceAll(":|/|\\\\|\\*|\\?|\\||(")|\\n|(<)|(>)", " ").trim();
String path = this.getOutputPath();
String writeFile = path + writableTitle + ".txt";
if(writeFile.length() >= 260) {
//windows文件名长度限制为260字符,文件夹长度限制248字符
writeFile = writeFile.substring(0, 255) + ".txt";
}
FileOutputStream fos = new FileOutputStream(writeFile);
bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"));
int startPos = this.getInputFilePath().indexOf("mirror") + 6;
String url_seg = this.getInputFilePath().substring(startPos);
url_seg = url_seg.replaceAll("\\\\", "/");
//按路径生成URL,并去掉最后的“index.html”
String url = "http:/" + url_seg.substring(0, url_seg.length() - 10);
//System.out.println(url);
bw.write(url + NEWLINE);
bw.write(subject + NEWLINE);
this.getParser().reset();
//解析影片相关信息
NodeList infoList = this.getParser().parse(info_filter);
if(infoList.elementAt(0).getChildren() != null) {
NodeList detailList = infoList.elementAt(0).getChildren();
NodeIterator it2 = detailList.elements();
while(it2.hasMoreNodes()) {
Node tmpNode = (Node) it2.nextNode();
if(tmpNode instanceof Span)
bw.write(tmpNode.toPlainTextString());
//System.out.print(tmpNode.toPlainTextString());
else if(tmpNode instanceof LinkTag) {
bw.write(((LinkTag) tmpNode).extractLink());
//System.out.print(((LinkTag) tmpNode).extractLink());
}
else if(tmpNode instanceof TagNode) {
String str = ((TagNode) tmpNode).getTagName();
if(str.equals("BR"))
bw.newLine();
//System.out.println();
}
else if(tmpNode instanceof TextNode) {
String str = tmpNode.getText();
if(!str.trim().equals(""))
bw.write(tmpNode.getText());
//System.out.print(tmpNode.getText());
}
}
}
this.getParser().reset();
//解析电影剧情简介
//System.out.println("剧情简介:");
NodeList plotList = this.getParser().parse(plot_filter);
if(plotList.elementAt(0) == null) {
if(bw != null) {
bw.close();
}
return;
}
bw.write("剧情简介:" + NEWLINE);
NodeList summary = plotList.elementAt(0).getChildren();
NodeIterator it3 = summary.elements();
while(it3.hasMoreNodes()) {
Node tmpNode = (Node) it3.nextNode();
if(tmpNode instanceof TextNode) {
String str = tmpNode.getText();
int i=0;
while( i < str.length() && Character.isWhitespace(str.charAt(i))){
i++;
}
bw.write(str.substring(i));
//System.out.print(str.substring(i));
}
else if(tmpNode instanceof TagNode) {
String str = ((TagNode) tmpNode).getTagName();
if(str.equals("BR"))
bw.newLine();
//System.out.println();
}
}
if(bw != null) {
bw.close();
}
}catch(Exception e) {
e.printStackTrace();
System.out.println(this.getInputFilePath()+ ":Something is wrong!!!!!!!!");
}
}