最近学习了一下Jsoup,用它来解析HTML非常好用。
今天来演示一下如何爬取极客公园的文章列表:http://www.geekpark.net/articles_list
一、工程目录
Jsoup去官网下载就行了,下载地址:https://jsoup.org/download
二、Article.java
一个简单的POJO类,用来装载一篇文章的内容(需要和下面的JsoupTest放在同一个包com.ydalien下,因为JsoupTest没有import它)。
public class Article {
private String tag;
private String title;
private String description;
private String author;
private String comments;
/**
* @return the tag
*/
public String getTag() {
return tag;
}
/**
* @param tag
* the tag to set
*/
public void setTag(String tag) {
this.tag = tag;
}
/**
* @return the title
*/
public String getTitle() {
return title;
}
/**
* @param title
* the title to set
*/
public void setTitle(String title) {
this.title = title;
}
/**
* @return the description
*/
public String getDescription() {
return description;
}
/**
* @param description
* the description to set
*/
public void setDescription(String description) {
this.description = description;
}
/**
* @return the author
*/
public String getAuthor() {
return author;
}
/**
* @param author
* the author to set
*/
public void setAuthor(String author) {
this.author = author;
}
/**
* @return the comments
*/
public String getComments() {
return comments;
}
/**
* @param comments
* the comments to set
*/
public void setComments(String comments) {
this.comments = comments;
}
}
三、JsoupTest.java
package com.ydalien;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jsoup.Jsoup;
/**
 * Small demo that downloads one page of an article list with Jsoup, copies
 * selected text of every matched entry into an {@link Article}, and prints the
 * result to standard output.
 *
 * <p>All of the work is triggered from the constructor, so creating an
 * instance is enough to run the demo.
 */
public class JsoupTest {

    /** Article list URL without the page number, which is appended per request. */
    private static final String BASE_URL = "http://www.geekpark.net/articles_list?page=";

    /** Value of the User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0";

    /** Page number appended to {@link #BASE_URL}. */
    private final int page = 1;

    /** Articles read from the page; stays empty (never null) when the request fails. */
    private List<Article> datas = new ArrayList<Article>();

    /**
     * Fetches the configured page and prints every article found on it.
     *
     * <p>An {@link IOException} raised while fetching is reported on standard
     * error and leaves the article list empty; it is not rethrown.
     */
    public JsoupTest() {
        String url = BASE_URL + page;
        try {
            datas = fetchArticles(url);
        } catch (IOException e) {
            // Say which URL failed before dumping the trace.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return;
        }
        printArticles(datas);
    }

    /**
     * Downloads {@code url} with a GET request and converts every element
     * matching {@code .article-item} into an {@link Article}.
     *
     * @param url the page to download
     * @return the extracted articles, in document order; possibly empty
     * @throws IOException if the request or the parsing of the response fails
     */
    private static List<Article> fetchArticles(String url) throws IOException {
        Response response = Jsoup.connect(url)
                .header("User-Agent", USER_AGENT)
                .method(Method.GET)
                .execute();
        // Response.parse() keeps the response URL as the document's base URI.
        Document doc = response.parse();

        // NOTE(review): the selectors depend on the site's current markup - confirm
        // they still match if the output comes back empty.
        Elements items = doc.select(".article-item");
        List<Article> articles = new ArrayList<Article>(items.size());
        for (Element item : items) {
            articles.add(toArticle(item));
        }
        return articles;
    }

    /** Copies the text of the relevant child elements of {@code item} into an {@link Article}. */
    private static Article toArticle(Element item) {
        Article article = new Article();
        article.setTag(item.select("a.category-tag").text());
        article.setTitle(item.select("a.article-title").text());
        article.setAuthor(item.select("a.dib-middle.article-author").text());
        article.setDescription(item.select("p.article-description").text());
        article.setComments(item.select("a.source-right").text());
        return article;
    }

    /** Prints each article as a short text block followed by a separator line. */
    private static void printArticles(List<Article> articles) {
        for (Article article : articles) {
            System.out.println(article.getTag() + "---" + article.getTitle());
            System.out.println(article.getDescription());
            System.out.println("作者:" + article.getAuthor() + "----回复数:" + article.getComments());
            System.out.println("================================================");
        }
    }

    /**
     * Runs the demo.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The constructor does all the work; the instance itself is not needed.
        new JsoupTest();
    }
}
四、结果