public class simpleRegex {
public static void main(String[] args) throws IOException {
processFile("C:\\Users\\tianjun\\Desktop\\search\\ch10\\heritrixproject\\jobs\\xiaoxue-20150314050416274\\mirror\\");
}
public static void processFile(String path) throws IOException {
File[] files = new File(path).listFiles();
for (int i = 0; i < files.length; i++) {
if (files[i].isDirectory() == true) {
processFile(files[i].getAbsolutePath());
} else {
try {
BufferedReader reader = new BufferedReader(new FileReader(
files[i].getAbsoluteFile()));
StringBuffer sb = new StringBuffer();
String line = null;
while ((line = reader.readLine()) != null) {
sb.append(line);
sb.append("\n");
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
Extractor extractor = new Extractor();
extractor.extract(files[i].getAbsolutePath());
}
}
}
}
}
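// The code above handles the directories and files. The function below is an example of how the per-file processing itself is implemented.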
/**
 * Loads one saved page, pulls several fragments out of it with regular expressions,
 * derives a path key from the file's location, and then calls {@code run()}.
 *
 * <p>Fields assigned, each from the result of {@code MatchStr(pattern, pageText)}
 * (presumably the first capture group of the first match; confirm against MatchStr):
 * <ul>
 *   <li>{@code title}: content of {@code <h1 id="detail_article_title">}</li>
 *   <li>{@code yd}: content of {@code <span class="view_count">}</li>
 *   <li>{@code bjdp}: text between a {@code <strong>...</strong>} element and the next
 *       {@code <br />}</li>
 *   <li>{@code zw}: text between {@code <div id="article">} and the next
 *       {@code <div class="clear">}</li>
 * </ul>
 * {@code lj} is set to the file's directory relative to the "mirror" folder, using
 * {@code /} as the separator, and {@code md5} to {@code MD5.MD5EnCode(lj)}.
 *
 * @param filename absolute path of the page file to process
 * @throws IOException declared for the file read; the actual source is
 *                     {@code readFileByLines}, which is not visible here
 */
public void extract(String filename) throws IOException {
    String web = readFileByLines(filename);

    title = MatchStr("<h1 id=\"detail_article_title\">([\\w\\W]*?)</h1>", web);
    yd = MatchStr("<span class=\"view_count\">([\\w\\W]*?)</span>", web);
    bjdp = MatchStr("<strong>[\\w\\W]*?</strong>([\\w\\W]*?)<br />", web);
    zw = MatchStr("<div id=\"article\">([\\s\\S]*?)<div class=\"clear\">", web);

    lj = mirrorRelativeDir(filename);
    md5 = MD5.MD5EnCode(lj);
    run();
}

/**
 * Returns the directory part of {@code filename} that follows the first "mirror"
 * occurrence, with every backslash turned into a forward slash.
 *
 * <p>Returns an empty string when the file lies directly in the mirror directory or the
 * path has no separator, instead of failing with an index error. When "mirror" does not
 * occur at all, the whole parent directory is returned (a fallback choice; adjust if
 * callers need something else).
 */
private static String mirrorRelativeDir(String filename) {
    // Normalising first lets one code path serve both Windows and Unix separators; the
    // replacement is one-for-one, so indices are unaffected.
    String normalized = filename.replace('\\', '/');
    int marker = normalized.indexOf("mirror");
    // Skip "mirror" plus the separator that follows it.
    int begin = (marker < 0) ? 0 : marker + "mirror".length() + 1;
    int end = normalized.lastIndexOf('/');
    if (end < begin) {
        return "";
    }
    return normalized.substring(begin, end);
}