dom4j 生成 UTF-8 编码格式的 XML 文件

本文介绍了一个用于抓取和处理RSS新闻源的Java程序。该程序能够从指定URL读取RSS源,解析并转换为自定义XML格式,并保存到本地文件中。处理过程中包括了去除HTML标签、截断过长描述等功能。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package com.telecomjs.uec.home.ehousekeeper.newspaper.common;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;


/**
* @author liujy
*
*/
public class ManageRss {
Log log = LogFactory.getLog(ManageRss.class);

PropertiesManage propertiesManage=PropertiesManage.getInstance();
PropertiesConfiguration properties;

Document document;

SAXReader sax=new SAXReader();

@SuppressWarnings("unchecked")
public Document getRss(String url) throws Exception{


Document createdocument=DocumentHelper.createDocument();
Element root= createdocument.addElement("DataObject");

properties=propertiesManage.getProperties();


try {
document=sax.read(url);

List<Element> list=document.selectNodes("rss/channel/item");
// log.info("++++++++++++++++++++++=list.size="+list.size());
for(int i=0;i<list.size();i++){
Element node=list.get(i);
// log.info("==========================node=node"+i);
Element newsElement=root.addElement("news");
newsElement.addElement("title").addCDATA(node.elementText("title").toString());
newsElement.addElement("link").addCDATA(node.elementText("link").toString());

String description =node.elementText("description").toString();
description= description.replaceAll(
"\\&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");
if(description.length()>90){
description=description.substring(0, 90)+"...";
}


newsElement.addElement("description").addCDATA(description);
}
}catch(Exception e){
log.error("errerrroeroeerrroeerrroeerrroeerrroeerrroeerrroeerrroeerrroe") ;
}


return createdocument;
}

public void savaFile(Document createdocument,String fileName) throws Exception{

if(!new File(properties.getProperty("newspaper.url").toString()).isDirectory()){
new File(properties.getProperty("newspaper.url").toString()).mkdirs();

}


[color=orange] FileOutputStream outputStream = new FileOutputStream(properties.getProperty("newspaper.url").toString()+fileName+".xml");
OutputFormat format = OutputFormat.createPrettyPrint();
format.setEncoding("utf-8");
XMLWriter writer = new XMLWriter(format);
writer.setOutputStream(outputStream);
writer.write(createdocument);
writer.close();[/color]


}

public String getDocement(String url) throws Exception{



document=sax.read(url);

log.info("-------"+document.asXML());


return document.asXML();
}

public static void main(String[] args) throws Exception {
// ManageRss manageRss=new ManageRss();
// manageRss.getRss("http://www.people.com.cn/rss/politics.xml");
// List<NewsPaper> list=manageRss.getNewsList("C:\\apache-tomcat-6.0.14\\webapps\\NGS_EHouseKeeper\\rss\\b5c8190a-489b-486c-bbbd-40726053d351.txt");
// System.out.println(list.size());
}




}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值