以下程序依赖 htmlparser.jar,可以直接从
http://umn.dl.sourceforge.net/sourceforge/htmlparser/htmlparser1_5_20040728.zip
下载。htmlparser 的主页是 http://htmlparser.sourceforge.net 。
//copy from here.
/*******************************************************************************
* $Header$
* $Revision$
* $Date$
*
*==============================================================================
*
* Copyright (c) 2001-2004 XXX Technologies, Ltd.
* All rights reserved.
*
* Created on 2004-12-3
*******************************************************************************/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.Div;
import org.htmlparser.util.ParserException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
*
* @author 晏斐 (mailto:mr_yanfei&yahoo.com)
*/
/*
* 修改历史
* $Log$
*/
public final class BlogBackupTool {
private static final String RSS_URL = "http://blog.youkuaiyun.com/mr_yanfei/Rss.aspx";
private static final String SAVE_PATH = "d://temp";
private static final String CHANNEL = "channel";
private static final String CHANNEL_ITEM = "item";
private static final String ITEM_TITLE = "title";
private static final String ITEM_LINK = "link";
private static final boolean FILTER = true;
class Blog {
private String fTitle;
private String fLink;
public Blog(String title, String link) {
fTitle = title;
fLink = link;
}
public String getTitle() {
return fTitle;
}
public String getLink() {
return fLink;
}
}
private Blog[] getBlogs(String rssUrl) {
DocumentBuilderFactory factory =
DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
List result = new ArrayList();
try {
URL url = new URL(rssUrl);
DocumentBuilder builder = factory.newDocumentBuilder();
Document document = builder.parse(url.openStream());
Element channel = document.getDocumentElement();
channel = (Element)document.getElementsByTagName(CHANNEL).item(0);
if(CHANNEL.equals(channel.getLocalName())) {
NodeList nodes = channel.getChildNodes();
for(int i = 0; i < nodes.getLength(); i ++) {
org.w3c.dom.Node item = nodes.item(i);
if (CHANNEL_ITEM.equals(item.getLocalName())) {
String title = getChildNodeText(item, ITEM_TITLE);
String link = getChildNodeText(item, ITEM_LINK);
result.add(new Blog(title, link));
}
}
}
} catch (Exception ex){
ex.printStackTrace();
}
return (Blog[])result.toArray(new Blog[result.size()]);
}
private String getChildNodeText(org.w3c.dom.Node item, String nodeName) {
NodeList nodes = item.getChildNodes();
for(int i = 0; i < nodes.getLength(); i++) {
org.w3c.dom.Node node = nodes.item(i);
if (nodeName.equals(node.getLocalName())) {
return node.getFirstChild().getNodeValue();
}
}
return null;
}
private String validFilename(String name) {
String result = name.replace(':', '_');
result = result.replace('/', '_');
result = result.replace('//', '_');
result = result.replace('?', '?');
result = result.replace('*', '_');
result = result.replace('<', '_');
result = result.replace('>', '_');
result = result.replace('|', '_');
result = result.replace('"', '_');
return result;
}
private void saveBlogs(Blog[] blogs) throws Exception{
String title, link;
for (int i = 0; i < blogs.length; i++) {
title = blogs[i].getTitle();
link = blogs[i].getLink();
System.out.println("Get Blog " + title);
System.out.println("URL : " + link);
if (FILTER) {
Parser parser = null;
try {
parser = new Parser(link);
} catch (ParserException ex) {
continue;
}
Page page = parser.getLexer().getPage();
String pageUrl = page.getUrl();
Node[] bases = parser.extractAllNodesThatAre(Div.class);
for (int j = 0; j < bases.length; j++) {
String attr = ((Div)bases[j]).getAttribute("class");
if (attr == null)
attr = "";
if (attr.equals("post")) {
String content = ((Div)bases[j]).getChildrenHTML();
saveBlogToFile(title + ".html", content);
break;
}
}
parser.reset();
}
else {
StringBuffer buffer = getHtmlFromURL(link);
saveBlogToFile(title + ".html", buffer.toString());
}
}
}
private StringBuffer getHtmlFromURL(String url) {
StringBuffer buffer = new StringBuffer();
try {
URL pageUrl = new URL(url);
BufferedReader in = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
String str;
while ((str = in.readLine()) != null) {
buffer.append(str);
}
in.close();
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return buffer;
}
private void saveBlogToFile(String filename, String content) {
try {
filename = validFilename(filename);
File file = new File(SAVE_PATH, filename);
OutputStream out = new FileOutputStream(file);
OutputStreamWriter writer = new OutputStreamWriter(out);
writer.write(content);
writer.close();
} catch (IOException ex) {
}
}
public static void main(String[] args) throws Exception{
BlogBackupTool reader = new BlogBackupTool();
Blog[] blogs = reader.getBlogs(RSS_URL);
reader.saveBlogs(blogs);
String msg = MessageFormat.format("Totle {0} blogs saved.", new String[]{Integer.toString(blogs.length)});
System.out.println(msg);
}
}
//end