基于文本标签方差分析和链接分析的网页正文、正文图片地址抽取算法

本文介绍了一个用于从HTML文档中提取正文内容及图片地址的Java类。该类通过分析节点文本长度、链接比率等特征来确定正文节点,并提供了解析本地HTML文件和URL地址的方法。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >



/**
 * Extracts the main-content node of an HTML page, and the addresses of the
 * images inside that node, by combining a per-child text-length variance test
 * with link-text analysis.
 *
 * <p>Usage: call {@link #parseHTM(String, String)} or {@link #parseURL(String)},
 * then read {@link #getContent()} and {@link #imgSrcList}. An instance keeps
 * mutable per-document state without any synchronization.
 */
public class CopyOfContentExtractor {

    /** A node is only considered when its children carry more text than this. */
    private static final int MIN_NODE_TEXT_LENGTH = 20;

    /**
     * Upper bound of variance/sum under which a node counts as evenly spread
     * body text. Link analysis runs afterwards, so this may be fairly coarse.
     */
    private static final int MIN_K = 30;

    /** Children whose link-text share reaches this rate are removed from the target. */
    private static final double MAX_LINK_RATE = 0.5;

    /** Text fragments of at most this many characters are dropped as noise. */
    private static final int MIN_FRAGMENT_LENGTH = 5;

    /** Text length of the best candidate found so far in the current document. */
    private double maxTextLength = 0;

    /** Best candidate for the main-content node, or null if none was found. */
    private Node targetNode = null;

    /** Never assigned by this class; kept for callers that read or set it. */
    public String title = "";

    /** Image addresses found inside the target node of the last parsed document. */
    public List<String> imgSrcList = new ArrayList<String>();

    /** Base address of the document, used to resolve relative image addresses. */
    public String address;

    /**
     * Parses a local HTML file.
     *
     * <p>Parse and I/O failures are reported on standard error and otherwise
     * ignored, in which case no content is available afterwards.
     *
     * @param location path of the HTML file on disk
     * @param address  address the page was fetched from, used as the base for
     *                 relative image addresses
     */
    public void parseHTM(String location, String address) {
        this.address = address;
        try {
            parseAndClose(new FileInputStream(new File(location)));
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads and parses the page at the given address.
     *
     * <p>Parse and I/O failures are reported on standard error and otherwise
     * ignored, in which case no content is available afterwards.
     *
     * @param address address of the page; also the base for relative image addresses
     */
    public void parseURL(String address) {
        this.address = address;
        try {
            URL url = new URL(address);
            parseAndClose(url.openConnection().getInputStream());
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Parses the stream and closes it afterwards, whether or not parsing succeeded. */
    private void parseAndClose(InputStream ips) throws SAXException, IOException {
        try {
            parse(ips);
        } finally {
            ips.close();
        }
    }

    /**
     * Builds the DOM, selects the target node, strips link-heavy children from
     * it and collects the image addresses it contains.
     */
    private void parse(InputStream ips) throws SAXException, IOException {
        // Forget the previous document so a reused instance cannot report stale results.
        targetNode = null;
        maxTextLength = 0;
        imgSrcList.clear();

        DOMParser parser = new DOMParser();
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
        parser.parse(new InputSource(ips));
        Document doc = parser.getDocument();

        Node body = doc.getElementsByTagName("body").item(0);
        if (body == null) {
            return;
        }

        NodeList list = body.getChildNodes();
        for (int i = 0; i < list.getLength(); i++) {
            loop(list.item(i)); // recursive search; assigns targetNode as a side effect
        }

        if (targetNode != null) {
            // Remove link-dominated children first so their images are not collected.
            linkNodeFilter(targetNode);
            imgExtractor(targetNode);
        } else {
            System.out.println("extractor is NULL!!! ");
        }
    }

    /**
     * Searches the subtree rooted at {@code node} for the main-content node.
     *
     * <p>For an element with several children, the text length of each child is
     * measured. If the lengths are spread evenly enough (variance / sum below
     * {@code MIN_K}) the element itself becomes a candidate and replaces the
     * current target when it carries more text. Otherwise the search continues
     * in the child with the most non-link text.
     *
     * @param node subtree root; null and non-element nodes are ignored
     */
    public void loop(Node node) {
        if (node == null || node.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        if (isIgnoredTag((Element) node)) {
            return;
        }

        NodeList list = node.getChildNodes();
        if (list.getLength() == 1) {
            // A pure wrapper: look straight through it.
            loop(list.item(0));
            return;
        }

        double[] lengthArr = new double[list.getLength()];
        double sum = 0.0;
        for (int i = 0; i < lengthArr.length; i++) {
            lengthArr[i] = textExtractor(list.item(i)).length();
            sum += lengthArr[i];
        }
        if (sum <= MIN_NODE_TEXT_LENGTH) {
            return; // too little text to be the body (also covers "no children")
        }

        double mean = sum / lengthArr.length;
        double varianceSum = 0.0;
        for (double d : lengthArr) {
            varianceSum += (d - mean) * (d - mean);
        }
        double variance = varianceSum / lengthArr.length;
        double k = variance / sum;

        if (k < MIN_K) {
            if (sum > maxTextLength) {
                maxTextLength = sum;
                targetNode = node;
            }
        } else {
            // One child dominates: discount link text, then descend into the largest child.
            for (int i = 0; i < lengthArr.length; i++) {
                lengthArr[i] -= linkAnalysis(list.item(i));
            }
            loop(list.item(getMaxIndex(lengthArr)));
        }
    }

    /**
     * Returns the visible text below {@code root}.
     *
     * <p>A text node yields its trimmed value. An element yields the texts of
     * its children, each followed by a single space, skipping children whose
     * text is at most {@code MIN_FRAGMENT_LENGTH} characters long. STYLE and
     * SCRIPT elements and all other node types yield the empty string.
     *
     * @param root node to read
     * @return extracted text, never null
     */
    public String textExtractor(Node root) {
        if (root.getNodeType() == Node.TEXT_NODE) {
            return root.getNodeValue().trim();
        }
        if (root.getNodeType() == Node.ELEMENT_NODE) {
            Element elmt = (Element) root;
            if (isIgnoredTag(elmt)) {
                return "";
            }

            NodeList children = elmt.getChildNodes();
            StringBuilder text = new StringBuilder();
            for (int i = 0; i < children.getLength(); i++) {
                String innerText = textExtractor(children.item(i));
                if (innerText.length() > MIN_FRAGMENT_LENGTH) {
                    text.append(innerText).append(' ');
                }
            }
            return text.toString();
        }
        return "";
    }

    /**
     * Collects the address of every IMG element below {@code root} into
     * {@link #imgSrcList}. The {@code real_src} attribute is preferred over
     * {@code src} when it is present; images without an address are skipped.
     */
    private void imgExtractor(Node root) {
        if (root.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }
        Element elmt = (Element) root;

        if ("img".equalsIgnoreCase(elmt.getTagName())) {
            String realSrc = elmt.getAttribute("real_src");
            String src = (realSrc.isEmpty() ? elmt.getAttribute("src") : realSrc).trim();
            if (!src.isEmpty()) {
                imgSrcList.add(resolveImageAddress(src));
            }
        }

        NodeList children = elmt.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            imgExtractor(children.item(i));
        }
    }

    /**
     * Turns an image address into an absolute one using {@link #address} as base.
     *
     * <p>Addresses that already start with http:// or https:// are returned
     * unchanged. If the base is missing or malformed, or the address uses a
     * scheme that cannot be resolved (for example {@code data:}), the address
     * is returned as found rather than being mangled.
     */
    private String resolveImageAddress(String src) {
        if (src.startsWith("http://") || src.startsWith("https://")) {
            return src;
        }
        if (address == null || address.isEmpty()) {
            return src;
        }
        try {
            return new URL(new URL(address), src).toString();
        } catch (java.net.MalformedURLException e) {
            return src;
        }
    }

    /**
     * Computes the length of the link text below {@code root} without modifying
     * the tree. Text counts as link text when any ancestor is an A element,
     * including ancestors above {@code root}.
     *
     * @param root subtree to measure
     * @return total trimmed length of all text nodes that sit inside a link
     */
    private double linkAnalysis(Node root) {
        return linkTextLength(root, hasAnchorAncestor(root));
    }

    /** Recursive part of {@link #linkAnalysis(Node)}; carries the "inside a link" state down. */
    private double linkTextLength(Node node, boolean insideAnchor) {
        short type = node.getNodeType();
        if (type == Node.TEXT_NODE) {
            return insideAnchor ? node.getNodeValue().trim().length() : 0.0;
        }
        if (type != Node.ELEMENT_NODE) {
            return 0.0;
        }

        Element elmt = (Element) node;
        if (isIgnoredTag(elmt)) {
            return 0.0;
        }

        boolean inLink = insideAnchor || isAnchor(elmt);
        NodeList children = elmt.getChildNodes();
        double total = 0.0;
        for (int i = 0; i < children.getLength(); i++) {
            total += linkTextLength(children.item(i), inLink);
        }
        return total;
    }

    /** Returns true if any proper ancestor of {@code node} is an A element. */
    private static boolean hasAnchorAncestor(Node node) {
        for (Node p = node.getParentNode(); p != null; p = p.getParentNode()) {
            if (p.getNodeType() == Node.ELEMENT_NODE && isAnchor((Element) p)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true for an A element, whatever the case of its tag name. */
    private static boolean isAnchor(Element elmt) {
        return "a".equalsIgnoreCase(elmt.getTagName());
    }

    /** Returns true for STYLE and SCRIPT elements, whose content is never page text. */
    private static boolean isIgnoredTag(Element elmt) {
        String tag = elmt.getTagName();
        return "style".equalsIgnoreCase(tag) || "script".equalsIgnoreCase(tag);
    }

    /**
     * Returns the index of the largest value, the first one on ties.
     *
     * <p>The scan is seeded with the first element rather than zero, so arrays
     * whose values are all zero or negative still yield their true maximum.
     *
     * @param input values to scan
     * @return index of the maximum, or 0 for an empty array
     */
    private int getMaxIndex(double[] input) {
        int index = 0;
        for (int i = 1; i < input.length; i++) {
            if (input[i] > input[index]) {
                index = i;
            }
        }
        return index;
    }

    /**
     * Removes, in place, every descendant of {@code node} whose share of link
     * text reaches {@code MAX_LINK_RATE}. Children without text are left alone.
     */
    private void linkNodeFilter(Node node) {
        if (node == null) {
            return;
        }
        NodeList children = node.getChildNodes();

        // The child list is live: iterating backwards keeps the indices of the
        // not-yet-visited children stable when one is removed.
        for (int i = children.getLength() - 1; i >= 0; i--) {
            Node subNode = children.item(i);
            double textLength = textExtractor(subNode).length();
            if (textLength <= 0) {
                continue;
            }

            double linkRate = linkAnalysis(subNode) / textLength;
            if (linkRate < MAX_LINK_RATE) {
                linkNodeFilter(subNode);
            } else {
                node.removeChild(subNode);
            }
        }
    }

    /**
     * Returns the text of the extracted main-content node.
     *
     * @return the content text, or the empty string when no node was selected
     */
    public String getContent() {
        return targetNode != null ? textExtractor(targetNode) : "";
    }

    /**
     * Batch driver: extracts every file of a directory and writes each result
     * longer than 200 characters to a UTF-8 text file.
     *
     * @param argv optional: {@code argv[0]} input directory, {@code argv[1]}
     *             output directory; the original hard-coded paths are the defaults
     * @throws IOException if a result file cannot be written
     */
    public static void main(String[] argv) throws IOException {
        String inputDir = argv.length > 0 ? argv[0] : "D:/fxreader/data/2012-09-29-00-13";
        String outputDir = argv.length > 1 ? argv[1] : "D:/fxreader/content/2012-09-29-00-13/";

        File[] files = new File(inputDir).listFiles();
        if (files == null) {
            System.err.println("not a readable directory: " + inputDir);
            return;
        }
        new File(outputDir).mkdirs();

        int count = 0;
        for (File f : files) {
            if (!f.isFile()) {
                continue;
            }
            CopyOfContentExtractor extractor = new CopyOfContentExtractor();
            extractor.parseHTM(f.getAbsolutePath(), Options.rules_startsWith);

            String content = extractor.getContent();

            if (content.length() > 200) {
                count++;
                File target = new File(outputDir, f.getName() + "_" + content.length() + "_" + ".txt");
                PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(target), "utf-8"));
                try {
                    pw.append(content);
                    pw.flush();
                } finally {
                    pw.close();
                }

                // NOTE(review): this prints "missing" for files that DID yield
                // content; the label or the condition looks inverted - confirm intent.
                System.out.println("missing: " + f.getName());
            }
            System.out.println(count);
        }
    }

}


实验测试:在 500 篇网页上的错误率为 42/500。其中,出错的 42 篇大多是原网页本身就不含“正文”的页面。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值