Java Crawler Code

Project layout: the three classes shown below (Article, First, Main) plus the jsoup library.

The only required jar is: jsoup-1.10.2.jar
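If the project is built with Maven rather than a hand-copied jar, the same library can be declared as a dependency. A build-setup sketch (these are jsoup's published coordinates; the Maven build itself is an assumption, the original uses a plain jar):

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>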

 

/**
 * Created by wangzheng on 2017/2/19.
 */
public class Article {

    /** Relative URL of the article */
    private String address;

    /** Article title */
    private String title;

    /** Article summary */
    private String description;

    /** Publication time */
    private String time;

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }
}
/**
 * Created by wangzheng on 2017/2/19.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class First {

    // Blog page to crawl
    // private static final String URL = "http://blog.youkuaiyun.com/guolin_blog";
    private static final String URL = "http://blog.youkuaiyun.com/qq_33599520/article/list/1";

    public static void main(String[] args) throws IOException {
        // Build the http Connection for the URL
        Connection conn = Jsoup.connect(URL)    // URL of the blog page
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0") // browser User-Agent for the request
                .timeout(5000)                  // connection timeout in milliseconds
                .method(Connection.Method.GET); // request method: GET (POST, DELETE, etc. also possible)

        // Fetch the page's html document
        Document doc = conn.get();
        Element body = doc.body();

        // Wrap each crawled entry in an Article and collect them in a list
        List<Article> resultList = new ArrayList<Article>();
        Element articleListDiv = body.getElementById("article_list");
        Elements articleList = articleListDiv.getElementsByClass("list_item");
        for (Element article : articleList) {
            Article articleEntity = new Article();
            Element linkNode = article.select("div h1 a").get(0);
            Element descriptionNode = article.getElementsByClass("article_description").get(0);
            Element articleManageNode = article.getElementsByClass("article_manage").get(0);
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDescription(descriptionNode.text());
            articleEntity.setTime(articleManageNode.select("span:eq(0)").text());
            resultList.add(articleEntity);
        }

        // Print the crawled articles
        System.out.println("Total articles: " + resultList.size());
        for (Article article : resultList) {
            System.out.println("Absolute article URL: http://blog.youkuaiyun.com" + article.getAddress());
        }
    }
}
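One fragile spot in the code above: body.getElementById("article_list") returns null whenever the markup differs from this 2017-era CSDN layout, and the following getElementsByClass call would then throw a NullPointerException. A minimal defensive sketch (the SafeLookup class and findArticleItems helper are illustrative names, not part of the original code):

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SafeLookup {

    // Returns the article entries, or an empty Elements when the container div is missing.
    public static Elements findArticleItems(Document doc) {
        Element articleListDiv = doc.body().getElementById("article_list");
        if (articleListDiv == null) {
            // Selector no longer matches: the layout changed or the request was blocked.
            return new Elements();
        }
        return articleListDiv.getElementsByClass("list_item");
    }
}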
/**
 * Created by wangzheng on 2017/2/19.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;

public class Main {

    private static final String URL = "http://blog.youkuaiyun.com/qq_33599520";

    public static void main(String[] args) throws IOException {
        Connection conn = Jsoup.connect(URL)
                .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element body = doc.body();

        // Read the total page count from the pager; its text contains a fragment like "共15页" ("15 pages in total")
        String totalPageStr = body.getElementById("papelist").select("span:eq(0)").text();
        String regex = ".+共(\\d+)页";
        totalPageStr = totalPageStr.replaceAll(regex, "$1");
        int totalPage = Integer.parseInt(totalPageStr);

        // Crawl every page and collect all articles
        List<Article> articleList = new ArrayList<Article>();
        for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArticlesByPage(pageNow));
        }

        // Print all of the blogger's articles
        for (Article article : articleList) {
            System.out.println("Title: " + article.getTitle());
            System.out.println("Absolute article URL: http://blog.youkuaiyun.com" + article.getAddress());
            System.out.println("Summary: " + article.getDescription());
            System.out.println("Published: " + article.getTime());
        }
    }

    public static List<Article> getArticlesByPage(int pageNow) throws IOException {
        Connection conn = Jsoup.connect(URL + "/article/list/" + pageNow)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element body = doc.body();

        List<Article> resultList = new ArrayList<Article>();
        Element articleListDiv = body.getElementById("article_list");
        Elements articleList = articleListDiv.getElementsByClass("list_item");
        for (Element article : articleList) {
            Article articleEntity = new Article();
            Element linkNode = article.select("div h1 a").get(0);
            Element descriptionNode = article.getElementsByClass("article_description").get(0);
            Element articleManageNode = article.getElementsByClass("article_manage").get(0);
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDescription(descriptionNode.text());
            articleEntity.setTime(articleManageNode.select("span:eq(0)").text());
            resultList.add(articleEntity);
        }
        return resultList;
    }
}
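A note on the page-count parsing in Main: String.replaceAll returns its input unchanged when the regex does not match, so Integer.parseInt would fail on unexpected pager text. A standalone sketch of a safer version of that one step, using Pattern/Matcher and a hypothetical sample string:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PageCountDemo {

    public static void main(String[] args) {
        // Hypothetical sample of the pager text as CSDN rendered it at the time.
        String pagerText = "100条数据  共5页";
        Matcher m = Pattern.compile("共(\\d+)页").matcher(pagerText);
        // find() fails cleanly on unexpected input, so we can fall back to a single page.
        int totalPage = m.find() ? Integer.parseInt(m.group(1)) : 1;
        System.out.println("totalPage = " + totalPage); // prints: totalPage = 5
    }
}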

Reprinted from: https://www.cnblogs.com/Jansens520/p/6419713.html
