/**
 * Heuristic extractor for the main content ("body text") of an HTML page.
 *
 * <p>The page is parsed into a DOM, then {@link #loop(Node)} walks down from the
 * children of {@code <body>} looking for the element whose children carry the
 * largest amount of evenly distributed text. That element becomes the target
 * node; link-heavy children are pruned from it and the image addresses found
 * inside it are collected into {@link #imgSrcList}.
 *
 * <p>Instances hold mutable state and are not designed for concurrent use.
 */
public class CopyOfContentExtractor {
    /** Minimum total text length the children of a node must exceed for the node to be considered. */
    private static final int MIN_NODE_TEXT_LENGTH = 20;
    /**
     * Threshold for {@code variance / sum} of the children's text lengths. Below it the node is
     * accepted as a content candidate; at or above it the search descends into one child.
     * With link analysis in place this can be set fairly high (coarse).
     */
    private static final int MIN_K = 30;
    /** Maximum link rate: a child whose link-text / text ratio reaches this value is removed. */
    private static final double MAX_LINK_RATE = 0.5;

    /** Largest children-text total seen so far among accepted candidates. */
    private double maxTextLength = 0;
    /** Best content candidate found so far, or {@code null} if none. */
    private Node targetNode = null;

    /** Page title; not assigned anywhere in this class. */
    public String title = "";
    /** Image addresses collected from the target node; accumulates across parses. */
    public List<String> imgSrcList = new ArrayList<String>();
    /** Address of the page being parsed; used as the base for relative image addresses. */
    public String address;

    /**
     * Parses a local HTML file.
     *
     * <p>Parse and I/O errors are printed to standard error and otherwise ignored.
     *
     * @param location path of the HTML file on disk
     * @param address  address the page is considered to come from (base for relative images)
     */
    public void parseHTM(String location, String address) {
        this.address = address;
        try {
            InputStream in = new FileInputStream(new File(location));
            try {
                parse(in);
            } finally {
                // Always release the file handle, even when parsing fails.
                in.close();
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads and parses the page at the given URL.
     *
     * <p>Parse and I/O errors are printed to standard error and otherwise ignored.
     *
     * @param address URL of the page
     */
    public void parseURL(String address) {
        this.address = address;
        try {
            InputStream in = new URL(address).openConnection().getInputStream();
            try {
                parse(in);
            } finally {
                // Always release the connection's stream, even when parsing fails.
                in.close();
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the stream into a DOM, selects the target node, prunes link-heavy children from it
     * and collects its image addresses.
     *
     * <p>The candidate selection is reset first so that a reused instance never reports a node
     * belonging to a previously parsed document. {@link #imgSrcList} is left untouched.
     *
     * @param ips HTML input; the caller is responsible for closing it
     * @throws SAXException if the parser rejects the input
     * @throws IOException  if reading the input fails
     */
    private void parse(InputStream ips) throws SAXException, IOException {
        targetNode = null;
        maxTextLength = 0;

        DOMParser parser = new DOMParser();
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
        parser.parse(new InputSource(ips));
        Document doc = parser.getDocument();
        Node body = doc.getElementsByTagName("body").item(0);
        if (body == null) {
            return;
        }

        // Recursive search for the content node, starting from each child of <body>.
        NodeList list = body.getChildNodes();
        for (int i = 0; i < list.getLength(); i++) {
            loop(list.item(i));
        }

        // targetNode has been assigned by now if any candidate qualified.
        if (targetNode != null) {
            // Link analysis: filter the DOM below the target node.
            linkNodeFilter(targetNode);
            // Collect the image addresses inside the target node.
            imgExtractor(targetNode);
        } else {
            System.out.println("extractor is NULL!!! ");
        }
    }

    /**
     * Recursive search step. For an element that is not a script or style element:
     * <ul>
     *   <li>a single child is descended into directly;</li>
     *   <li>otherwise the text length of every child is measured; if the total exceeds
     *       {@link #MIN_NODE_TEXT_LENGTH}, the ratio {@code variance / total} decides whether the
     *       node is recorded as a candidate (ratio below {@link #MIN_K}, and total larger than any
     *       earlier candidate) or whether the search continues in the child with the most
     *       non-link text.</li>
     * </ul>
     *
     * @param node node to examine; non-element nodes are ignored
     */
    public void loop(Node node) {
        if (node.getNodeType() != Node.ELEMENT_NODE || isScriptOrStyle((Element) node)) {
            return;
        }
        NodeList list = node.getChildNodes();
        int childCount = list.getLength();
        if (childCount == 1) {
            loop(list.item(0));
            return;
        }

        // Text length of every child, and their total.
        double[] lengthArr = new double[childCount];
        double sum = 0.0;
        for (int i = 0; i < childCount; i++) {
            lengthArr[i] = textExtractor(list.item(i)).length();
            sum += lengthArr[i];
        }
        if (sum <= MIN_NODE_TEXT_LENGTH) {
            return;
        }

        double mean = sum / childCount;
        double varianceSum = 0.0;
        for (double d : lengthArr) {
            varianceSum += (d - mean) * (d - mean);
        }
        double variance = varianceSum / childCount;
        double k = variance / sum;

        if (k < MIN_K) {
            // Text is spread evenly enough: keep the candidate with the most text.
            if (sum > maxTextLength) {
                maxTextLength = sum;
                targetNode = node;
            }
            return;
        }

        // Text is concentrated in few children: discount link text and follow the largest child.
        for (int i = 0; i < childCount; i++) {
            lengthArr[i] -= linkAnalysis(list.item(i));
        }
        loop(list.item(getMaxIndex(lengthArr)));
    }

    /**
     * Recursively extracts the text below a node.
     *
     * <p>A text node yields its trimmed value. An element yields the concatenation of those child
     * results that are longer than 5 characters, each followed by a single space; script and
     * style elements yield the empty string. Every other node type yields the empty string.
     *
     * @param root node to extract text from
     * @return the extracted text, never {@code null}
     */
    public String textExtractor(Node root) {
        short type = root.getNodeType();
        if (type == Node.TEXT_NODE) {
            return root.getNodeValue().trim();
        }
        if (type != Node.ELEMENT_NODE) {
            return "";
        }
        Element elmt = (Element) root;
        if (isScriptOrStyle(elmt)) {
            return "";
        }
        NodeList children = elmt.getChildNodes();
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < children.getLength(); i++) {
            String innerText = textExtractor(children.item(i));
            // Very short fragments are dropped.
            if (innerText.length() > 5) {
                text.append(innerText).append(' ');
            }
        }
        return text.toString();
    }

    /**
     * Recursively collects image addresses below a node into {@link #imgSrcList}.
     *
     * <p>The {@code real_src} attribute takes precedence over {@code src}. Images with neither
     * attribute are skipped. Relative addresses are resolved against {@link #address}.
     *
     * @param root node to scan
     */
    private void imgExtractor(Node root) {
        if (root.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        Element elmt = (Element) root;
        if (elmt.getTagName().equalsIgnoreCase("IMG")) {
            String src = elmt.getAttribute("real_src");
            if (src.isEmpty()) {
                src = elmt.getAttribute("src");
            }
            src = src.trim();
            if (!src.isEmpty()) {
                imgSrcList.add(toAbsoluteUrl(src));
            }
        }
        NodeList children = elmt.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            imgExtractor(children.item(i));
        }
    }

    /**
     * Turns an image address into an absolute one.
     *
     * <p>Addresses already starting with {@code http://} or {@code https://} are returned as is.
     * Anything else is resolved against {@link #address}; when that is missing or the resolution
     * fails (e.g. unsupported scheme), the address is returned unchanged rather than failing.
     */
    private String toAbsoluteUrl(String src) {
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        if (address == null) {
            return src;
        }
        try {
            return new URL(new URL(address), src).toString();
        } catch (IOException e) {
            // MalformedURLException: base or spec is unusable; keep the raw value (best effort).
            return src;
        }
    }

    /** Returns whether the element is a {@code STYLE} or {@code SCRIPT} element (any letter case). */
    private static boolean isScriptOrStyle(Element elmt) {
        String tag = elmt.getTagName();
        return tag.equalsIgnoreCase("STYLE") || tag.equalsIgnoreCase("SCRIPT");
    }

    /**
     * Computes the length of the link text below a node without modifying the DOM.
     *
     * <p>Only text nodes whose direct parent is an {@code A} element are counted (trimmed length);
     * script and style subtrees contribute nothing.
     *
     * @param root node to measure
     * @return total length of the link text
     */
    private double linkAnalysis(Node root) {
        if (root.getNodeType() == Node.TEXT_NODE) {
            Node parent = root.getParentNode();
            // Exact tag match: a suffix test would also hit TEXTAREA, AREA, META, ...
            if (parent != null && "a".equalsIgnoreCase(parent.getNodeName())) {
                return root.getNodeValue().trim().length();
            }
            return 0.0;
        }
        if (root.getNodeType() != Node.ELEMENT_NODE) {
            return 0.0;
        }
        Element elmt = (Element) root;
        if (isScriptOrStyle(elmt)) {
            return 0.0;
        }
        NodeList children = elmt.getChildNodes();
        double total = 0.0;
        for (int i = 0; i < children.getLength(); i++) {
            total += linkAnalysis(children.item(i));
        }
        return total;
    }

    /**
     * Returns the index of the largest value; the first one wins on ties.
     *
     * <p>Works for arrays whose values are all zero or negative as well. An empty array yields 0.
     */
    private int getMaxIndex(double[] input) {
        int index = 0;
        for (int i = 1; i < input.length; i++) {
            if (input[i] > input[index]) {
                index = i;
            }
        }
        return index;
    }

    /**
     * Link analysis in "remove" style: every child with text whose link rate reaches
     * {@link #MAX_LINK_RATE} is detached from the tree; the remaining children with text are
     * filtered recursively. Children without any extracted text are left alone.
     *
     * @param node subtree root to filter; {@code null} is ignored
     */
    private void linkNodeFilter(Node node) {
        if (node == null) {
            return;
        }
        NodeList children = node.getChildNodes();
        int i = 0;
        while (i < children.getLength()) {
            Node subNode = children.item(i);
            double textLength = textExtractor(subNode).length();
            if (textLength > 0) {
                double linkRate = linkAnalysis(subNode) / textLength;
                if (linkRate >= MAX_LINK_RATE) {
                    node.removeChild(subNode);
                    // The child list is live: the next sibling has moved to index i,
                    // so the index must not advance here.
                    continue;
                }
                linkNodeFilter(subNode);
            }
            i++;
        }
    }

    /**
     * Returns the text of the target node, or the literal string {@code "NULL"} when no target
     * node was found.
     */
    private String getContent() {
        if (targetNode == null) {
            return "NULL";
        }
        return textExtractor(targetNode);
    }

    /**
     * Batch driver: extracts every file of a fixed input directory and writes each result longer
     * than 200 characters to a fixed output directory, printing the running count after each file.
     *
     * @param argv unused
     * @throws IOException if an output file cannot be written
     */
    public static void main(String[] argv) throws IOException {
        File inputDir = new File("D:/fxreader/data/2012-09-29-00-13");
        File[] files = inputDir.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println("cannot list directory: " + inputDir);
            return;
        }
        int count = 0;
        for (File f : files) {
            CopyOfContentExtractor extractor = new CopyOfContentExtractor();
            extractor.parseHTM(f.getAbsolutePath(), Options.rules_startsWith);
            String content = extractor.getContent();
            if (content.length() > 200) {
                count++;
                File out = new File("D:/fxreader/content/2012-09-29-00-13/" + f.getName() + "_" + content.length() + "_" + ".txt");
                PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(out), "utf-8"));
                try {
                    pw.append(content);
                    pw.flush();
                } finally {
                    pw.close();
                }
                System.out.println("missing: " + f.getName());
            }
            System.out.println(count);
        }
    }
}
// Experimental result: error rate 42/500. Most of those 42 pages are ones whose original HTML contains no "main content" at all.