public List<String[]> extractItems(String url, String encoding, String gp,
String itp) throws MalformedURLException,
UnsupportedEncodingException, IOException ...{
gp = sanifyPattern(gp);
itp = sanifyPattern(itp);
Pattern globalPattern = Pattern.compile(gp, Pattern.CASE_INSENSITIVE
| Pattern.DOTALL | Pattern.UNIX_LINES);
Pattern itemPattern = Pattern.compile(itp, Pattern.CASE_INSENSITIVE
| Pattern.DOTALL | Pattern.UNIX_LINES);
String html = source(url, encoding);
Matcher matcher = globalPattern.matcher(html);
List<String[]> items = new ArrayList<String[]>();
if (matcher != null && matcher.find()) ...{
String global = matcher.group(1);
Matcher itm = itemPattern.matcher(global);
while (itm != null && itm.find()) ...{
List<String> groups = new ArrayList<String>();
for (int i = 1; i <= itm.groupCount(); i++) ...{
groups.add(itm.group(i));
}
items.add(groups.toArray(new String[groups.size()]));
}
}
return items;
}
为了方便地抽取网页中的某些信息,我用 Java 的正则表达式写了一个程序:它可以抽取网页中的部分信息,并通过 dom4j 将其写成 XML。该程序实现了对新闻、MP3 等模板比较固定的网页的信息抽取。
先用一个全局的 pattern(globalPattern)选出包含有用信息的块,然后用 itemPattern 在该块中反复匹配,逐个抽取信息单元(item),最后将这些提取出来的 items 写成一个 XML 文件。用 dom4j 生成 XML 的程序如下:
public String asXml(String fmt) ...{
Document doc = DocumentHelper.createDocument();
// <list>
Element root = doc.addElement("list");
List<Song> songs = getSongs();
for (Song song : songs) ...{
Element element = DocumentHelper.createElement("song");
element.addElement("title").setText(song.getTitle());
element.addElement("album").setText(song.getAlbum());
element.addElement("singer").setText(song.getSinger());
element.addElement("link").setText(song.getLink());
element.addElement("source").setText(song.getSource());
element.addElement("format").setText(song.getFormat());
element.addElement("megabyte").setText(
String.valueOf(song.getMegabyte()));
element.addElement("speed")
.setText(String.valueOf(song.getSpeed()));
root.add(element);
}
root.addAttribute("total", String.valueOf(songs.size()));
return doc.asXML();
}