使用Jsoup解析XML抓取新浪新闻文章

<?xml version="1.0" encoding="UTF-8"?>
<result>
    <status>
        <code>0</code>
    </status>
    <encoding>utf-8</encoding>
    <serverSeconds>1420343599</serverSeconds>
    <total>298076</total>
    <count>22</count>
    <last_time>1420334026</last_time>
    <data>
        <item>
            <id>1-1-31356907</id>
            <column>tpxw</column>
            <title>组图:武汉举办“女神相亲会”3000多名媛报名</title>
            <url>http://slide.news.sina.com.cn/s/slide_1_2841_79556.html</url>
            <keywords>武汉,女神相亲会</keywords>
            <comment_channel />
            <img>http://www.sinaimg.cn/dy/slidenews/1_t500/2015_01/2841_532839_164977.jpg</img>
            <level>0</level>
            <createtime>1420334026</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新浪图片</media_name>
        </item>
        <item>
            <id>1-1-31356801</id>
            <column>tpxw</column>
            <title>组图:郑州一火锅店墙上挂百万现金作举报奖</title>
            <url>http://slide.news.sina.com.cn/s/slide_1_2841_79546.html</url>
            <keywords>现金,火锅店,百万,地沟油,食品安全</keywords>
            <comment_channel />
            <img>http://www.sinaimg.cn/dy/slidenews/1_t500/2015_01/2841_532748_181307.jpg</img>
            <level>0</level>
            <createtime>1420331615</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新浪图片</media_name>
        </item>
        <item>
            <id>1-1-31356788</id>
            <column>spxw</column>
            <title>视频:实拍两男贩9公斤冰毒被抓现场互推脱指控</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/082764468119.html</url>
            <keywords>毒贩,冰毒,反目</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136905268_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420331230</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>齐鲁网</media_name>
        </item>
        <item>
            <id>1-1-31356783</id>
            <column>shwx</column>
            <title>男子在3600米海拔雪地里半裸求婚(图)</title>
            <url>http://news.sina.com.cn/s/p/2015-01-04/082231356783.shtml</url>
            <keywords>半裸,求婚</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/p/2015-01-04/U10856P1T1D31356783F21DT20150104082241.jpg</img>
            <level>1</level>
            <createtime>1420330961</createtime>
            <old_level>1</old_level>
            <media_type>tw</media_type>
            <media_name>中国网</media_name>
        </item>
        <item>
            <id>1-1-31356712</id>
            <column>spxw</column>
            <title>视频:监拍救护车来迟医生遭家属暴打 护士大哭</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/080764468075.html</url>
            <keywords>救护车,来迟,家属,暴打</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136904998_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420330051</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>齐鲁网</media_name>
        </item>
        <item>
            <id>1-1-31356710</id>
            <column>spxw</column>
            <title>视频:湖北交警曝光男女车内热吻亲热照引争议</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/080564468065.html</url>
            <keywords>交警,热吻,亲热</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136885413_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420329921</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>广西台</media_name>
        </item>
        <item>
            <id>1-1-31356697</id>
            <column>shwx</column>
            <title>男子元旦过后上班突然晕倒不幸离世</title>
            <url>http://news.sina.com.cn/s/2015-01-04/075031356697.shtml</url>
            <keywords>突发疾病,工伤</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420329018</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>四川在线-华西都市报</media_name>
        </item>
        <item>
            <id>1-1-31356690</id>
            <column>spxw</column>
            <title>视频:哈尔滨大火致楼体3次坍塌 前后画面对比</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/074264468023.html</url>
            <keywords>哈尔滨,画面,大火</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136885015_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420328533</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>东方卫视《看东方》</media_name>
        </item>
        <item>
            <id>1-1-31356715</id>
            <column>spxw</column>
            <title>视频:5名牺牲消防员名单公布 年龄最小仅18岁</title>
            <url>http://video.sina.com.cn/p/news/s/v/2015-01-04/071564468095.html</url>
            <keywords>消防员,火灾,年龄</keywords>
            <comment_channel>vblog</comment_channel>
            <img>http://www.sinaimg.cn/dy/http/video.sina.com.cn/p/news/s/v/2015-01-04/136904368_2_s160x120.jpg</img>
            <level>0</level>
            <createtime>1420326920</createtime>
            <old_level>2</old_level>
            <media_type>sp</media_type>
            <media_name>东方卫视《看东方》</media_name>
        </item>
        <item>
            <id>1-1-31356567</id>
            <column>shwx</column>
            <title>80后女子放弃高薪回乡创业卖鱼面 年赚20多万</title>
            <url>http://news.sina.com.cn/s/2015-01-04/070231356567.shtml</url>
            <keywords>创业</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420326165</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>重庆晨报</media_name>
        </item>
        <item>
            <id>1-1-31356566</id>
            <column>shwx</column>
            <title>民工偷床单御寒 警察接警后送其两床被子</title>
            <url>http://news.sina.com.cn/s/2015-01-04/070131356566.shtml</url>
            <keywords>偷床单</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420326090</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>扬子晚报</media_name>
        </item>
        <item>
            <id>1-1-31356563</id>
            <column>shwx</column>
            <title>1岁半小孩过马路遭汽车齐腰碾压无大碍(图)</title>
            <url>http://news.sina.com.cn/s/2015-01-04/065831356563.shtml</url>
            <keywords>碾压,汽车碾压</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/2015-01-04/U11556P1T1D31356563F21DT20150104065850.jpg</img>
            <level>1</level>
            <createtime>1420325930</createtime>
            <old_level>1</old_level>
            <media_type>tw</media_type>
            <media_name>扬子晚报</media_name>
        </item>
        <item>
            <id>1-1-31356554</id>
            <column>qwys</column>
            <title>男子被甩后盗女友家得600元赃款</title>
            <url>http://news.sina.com.cn/s/2015-01-04/064631356554.shtml</url>
            <keywords>偷窃</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420325219</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>南方都市报</media_name>
        </item>
        <item>
            <id>1-1-31356551</id>
            <column>shwx</column>
            <title>外籍男子打车忘拿包报警后20分钟找回</title>
            <url>http://news.sina.com.cn/s/2015-01-04/064331356551.shtml</url>
            <keywords>外籍男子</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420325030</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>南方都市报</media_name>
        </item>
        <item>
            <id>1-1-31356520</id>
            <column>shwx</column>
            <title>4人野外挖洞烧烤时塌方致3人身亡</title>
            <url>http://news.sina.com.cn/s/2015-01-04/062931356520.shtml</url>
            <keywords>塌方</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>1</level>
            <createtime>1420324163</createtime>
            <old_level>1</old_level>
            <media_type />
            <media_name>南方都市报</media_name>
        </item>
        <item>
            <id>1-1-31356399</id>
            <column>shwx</column>
            <title>主人花300英镑为便秘小金鱼做手术(图)</title>
            <url>http://news.sina.com.cn/s/p/2015-01-04/061031356399.shtml</url>
            <keywords>小金鱼</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/p/2015-01-04/U11556P1T1D31356399F21DT20150104061042.jpg</img>
            <level>0</level>
            <createtime>1420323042</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>现代快报</media_name>
        </item>
        <item>
            <id>1-1-31356398</id>
            <column>shwx</column>
            <title>男实习医生以看病为由施暴女网友被刑拘</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060931356398.shtml</url>
            <keywords>施暴</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322948</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>现代快报</media_name>
        </item>
        <item>
            <id>1-1-31356397</id>
            <column>shwx</column>
            <title>男子上楼取物车被人开跑 次日接电话被骂乱停车</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060731356397.shtml</url>
            <keywords>乱停车</keywords>
            <comment_channel>sh</comment_channel>
            <img>http://www.sinaimg.cn/dy/s/2015-01-04/U10608P1T1D31356397F21DT20150104060808.jpg</img>
            <level>0</level>
            <createtime>1420322833</createtime>
            <old_level>2</old_level>
            <media_type>tw</media_type>
            <media_name>新文化报</media_name>
        </item>
        <item>
            <id>1-1-31356396</id>
            <column>shwx</column>
            <title>女子为使皮肤好连啃3天猪蹄下巴脱臼</title>
            <url>http://news.sina.com.cn/s/2015-01-04/060431356396.shtml</url>
            <keywords>脱臼</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>1</level>
            <createtime>1420322648</createtime>
            <old_level>1</old_level>
            <media_type />
            <media_name>中国网</media_name>
        </item>
        <item>
            <id>1-1-31356391</id>
            <column>shwx</column>
            <title>女孩与父亲争吵后失联半个月 被找到时已身亡</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055931356391.shtml</url>
            <keywords>遇难,失联</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322345</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>钱江晚报</media_name>
        </item>
        <item>
            <id>1-1-31356389</id>
            <column>shwx</column>
            <title>男子因与女儿争吵在高速上赌气下车后迷路</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055931356389.shtml</url>
            <keywords>高速公路,争吵</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322345</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>钱江晚报</media_name>
        </item>
        <item>
            <id>1-1-31356338</id>
            <column>shwx</column>
            <title>女子爬栏杆要跳河被6旬老人拉住</title>
            <url>http://news.sina.com.cn/s/2015-01-04/055831356338.shtml</url>
            <keywords>跳河</keywords>
            <comment_channel>sh</comment_channel>
            <img />
            <level>0</level>
            <createtime>1420322308</createtime>
            <old_level>2</old_level>
            <media_type />
            <media_name>现代快报</media_name>
        </item>
    </data>
</result>



package ivyy.taobao.com.domain.xml;

import ivyy.taobao.com.utils.GlobalConstants;

import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 *@Author:liangjilong
 *@Date:2015-1-4
 *@Email:jilongliang@sina.com
 *@Version:1.0
 *@Description这个是通过jsoup处理的
 */
public class SinaNew {
	
	public static void main(String[] args)throws Exception {
		String requestURL = GlobalConstants.getUrl(2, "xml");
	    org.jsoup.nodes.Document doc=Jsoup.parse(new URL(requestURL), 3000);
		// String html=doc.html();
	    Elements items=doc.select("item");//获取item(item具有多个节点)
	    
	    String title = "", url = "", keywords = "", img = "", media_name = "";
	    int i=1;
	    for (Element its : items) {
			
	    	title=its.select("title").html();
	    	url=its.select("url").html();
	    	keywords=its.select("keywords").html();
	    	img=its.select("img").html();
	    	media_name=its.select("media_name").html();
	    	
	    	String newsText=GlobalConstants.getNewsContent(url);//处理新闻内容
	    	
	    	//System.out.println(title + "\n" + url + "\n" + keywords + "\n"+ url + "\n" + media_name);
	    	
	    	System.out.println("==================第"+i+"篇=================="+newsText);
	    	i++;
		}
	}
}

package ivyy.taobao.com.utils;

import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 *@Author:liangjilong
 *@Date:2015-1-4
 *@Email:jilongliang@sina.com
 *@Version:1.0
 *@Description
 */
public class GlobalConstants {
	
	/***
	 * 获取url连接
	 * @param page第几页
	 * @param format格式(XML、JSON)
	 * @return
	 */
	public static String getUrl(Integer page,String format){
		StringBuffer buffer=new StringBuffer("http://api.roll.news.sina.com.cn/zt_list?channel=news");
		String url="";
		buffer.append("&cat_1=shxw");//显示新闻
		buffer.append("&cat_2==zqsk||=qwys||=shwx||=fz-shyf");
		buffer.append("&level==1||=2");//级别
		buffer.append("&show_ext=1");
		buffer.append("&show_all=1");//显示所有
		buffer.append("&show_num=22");//显示多少条
		buffer.append("&tag=1");
		buffer.append("&format="+format);
		buffer.append("&page="+page);
		buffer.append("&callback=newsloader");
		url=buffer.toString();
		return url;
	}
	
	
	/***
	 * 获取文章的内容
	 * 从新浪的网页分析,通过文章body的id就可以拿到相应的文章内容..
	 * @param url
	 * @return
	 */
	public static String getNewsContent(String url) throws Exception{
		Document doc=Jsoup.parse(new URL(url), 3000);
		if(doc!=null){
			String artibody=doc.getElementById("artibody").html();//通过网页的html的id去拿到新闻内容artibody
			return artibody;
		}else{
			return "网络异常";
		}
	}
}

package ivyy.taobao.com.utils;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 *@Author:liangjilong
 *@Date:2015-1-4
 *@Email:jilongliang@sina.com
 *@Version:1.0
 *@Description Minimal HTTP helper built on HttpURLConnection.
 */

public class HttpRequestUtils {
	/**
	 * Sends an HTTP request and returns the response body decoded as UTF-8.
	 * Lines of the response are concatenated without their line terminators.
	 * Failures are not propagated: the stack trace is printed and whatever was read
	 * so far (an empty string if nothing was read) is returned.
	 *
	 * @param requestUrl address to request
	 * @param method HTTP method passed to {@code setRequestMethod}, e.g. GET or POST
	 * @return the response body, possibly empty or partial when an error occurred
	 */
	public static String HttpURLConnRequest(String requestUrl,String method) {
		StringBuilder buffer = new StringBuilder();
		HttpURLConnection httpUrlConn = null;
		BufferedReader bufferedReader = null;
		try {
			URL url = new URL(requestUrl);
			httpUrlConn = (HttpURLConnection) url.openConnection();
			httpUrlConn.setDoInput(true);
			httpUrlConn.setRequestMethod(method);
			httpUrlConn.setUseCaches(false);
			httpUrlConn.setInstanceFollowRedirects(true); // follow redirects
			httpUrlConn.connect();
			// Convert the response stream into a string.
			bufferedReader = new BufferedReader(
					new InputStreamReader(httpUrlConn.getInputStream(), "utf-8"));

			String line;
			while ((line = bufferedReader.readLine()) != null) {
				buffer.append(line);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Release resources on every path, not only after a successful read.
			if (bufferedReader != null) {
				try {
					// Closing the reader also closes the wrapped stream.
					bufferedReader.close();
				} catch (Exception ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
			if (httpUrlConn != null) {
				httpUrlConn.disconnect();
			}
		}
		return buffer.toString();
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

爱学习的蹭蹭

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值