数据库使用的是 MySQL:先新建 test 数据库,然后在其中创建 csdnblog 表:
-- Holds one crawled CSDN blog article per row; filled by CsdnBlogDao.add().
CREATE TABLE `csdnblog` (
  `id` int(11) unsigned NOT NULL auto_increment,  -- surrogate primary key
  `keyes` int(11) unsigned NOT NULL,              -- article number parsed from the article URL
  `titles` varchar(255) NOT NULL,                 -- article title
  -- MEDIUMTEXT: the crawler stores the whole article text, which can easily
  -- exceed a 10240-character varchar and would then be rejected or truncated.
  `content` mediumtext NOT NULL,
  `dates` varchar(16) default NULL,               -- publish date, kept as the scraped string
  `tags` varchar(255) default NULL,               -- comma-separated tags
  `category` varchar(255) default NULL,           -- comma-separated categories
  `views` int(11) unsigned default NULL,          -- number of readers
  `comments` int(11) unsigned default NULL,       -- number of comments
  `copyright` int(1) unsigned default NULL,       -- 1 = original article, 0 = otherwise
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
接着新建实体类对应博客文章:
CsdnBlog.java
/**
 * Mutable data holder describing a single crawled CSDN blog article.
 * Every property has a plain getter/setter pair; no validation is performed.
 */
public class CsdnBlog {
    /** Article number. */
    private int key;
    /** Article title. */
    private String title;
    /** Text content of the article. */
    private String content;
    /** Publish date, kept as a string. */
    private String date;
    /** Tags. */
    private String tags;
    /** Category. */
    private String category;
    /** Number of readers. */
    private int view;
    /** Number of comments. */
    private int comments;
    /** Whether the article is original. */
    private int copyright;

    public int getKey() {
        return this.key;
    }

    public void setKey(int key) {
        this.key = key;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getDate() {
        return this.date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getTags() {
        return this.tags;
    }

    public void setTags(String tags) {
        this.tags = tags;
    }

    public String getCategory() {
        return this.category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public int getView() {
        return this.view;
    }

    public void setView(int view) {
        this.view = view;
    }

    public int getComments() {
        return this.comments;
    }

    public void setComments(int comments) {
        this.comments = comments;
    }

    public int getCopyright() {
        return this.copyright;
    }

    public void setCopyright(int copyright) {
        this.copyright = copyright;
    }

    /**
     * Renders all properties in a single bracketed line.
     *
     * @return the textual form of this article
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CsdnBlog [key=");
        text.append(key)
            .append(", title=").append(title)
            .append(", content=").append(content)
            .append(",date=").append(date)
            .append(", tags=").append(tags)
            .append(", category=").append(category)
            .append(", view=").append(view)
            .append(", comments=").append(comments)
            .append(", copyright=").append(copyright)
            .append("]");
        return text.toString();
    }
}
Dao类
CsdnBlogDao.java:
- public class CsdnBlogDao {
- private Connection conn = null;
- private Statement stmt = null;
- public CsdnBlogDao() {
- try {
- Class.forName("com.mysql.jdbc.Driver");
- String url = "jdbc:mysql://localhost:3306/test?"
- + "user=root&password=123&useUnicode=true&characterEncoding=UTF8";
- conn = DriverManager.getConnection(url);
- stmt = conn.createStatement();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
- } catch (SQLException e) {
- e.printStackTrace();
- }
- }
- public int add(CsdnBlog csdnBlog) {
- try {
- String sql = "INSERT INTO `test`.`csdnblog` (`keyes`, `titles`, `content` , `dates`, `tags`, `category`, `views`, `comments`, `copyright`) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?);";
- PreparedStatement ps = conn.prepareStatement(sql);
- ps.setInt(1, csdnBlog.getKey());
- ps.setString(2, csdnBlog.getTitle());
- ps.setString(3,csdnBlog.getContent());
- ps.setString(4, csdnBlog.getDate());
- ps.setString(5, csdnBlog.getTags());
- ps.setString(6, csdnBlog.getCategory());
- ps.setInt(7, csdnBlog.getView());
- ps.setInt(8, csdnBlog.getComments());
- ps.setInt(9, csdnBlog.getCopyright());
- return ps.executeUpdate();
- } catch (SQLException e) {
- e.printStackTrace();
- }
- return -1;
- }
- }
测试类:
CsdnBlogPageProcessor.java
- public class CsdnBlogPageProcessor implements PageProcessor {
- private static String username="CHENYUFENG1991"; // 设置csdn用户名
- private static int size = 0;// 共抓取到的文章数量
- // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
- private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
- public Site getSite() {
- return site;
- }
- // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
- public void process(Page page) {
- // 列表页
- if (!page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/\\d+").match()) {
- // 添加所有文章页
- page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()// 限定文章列表获取区域
- .regex("/" + username + "/article/details/\\d+")
- .replace("/" + username + "/", "http://blog.youkuaiyun.com/" + username + "/")// 巧用替换给把相对url转换成绝对url
- .all());
- // 添加其他列表页
- page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()// 限定其他列表页获取区域
- .regex("/" + username + "/article/list/\\d+")
- .replace("/" + username + "/", "http://blog.youkuaiyun.com/" + username + "/")// 巧用替换给把相对url转换成绝对url
- .all());
- // 文章页
- } else {
- size++;// 文章数量加1
- // 用CsdnBlog类来存抓取到的数据,方便存入数据库
- CsdnBlog csdnBlog = new CsdnBlog();
- // 设置编号
- csdnBlog.setKey(Integer.parseInt(
- page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/(\\d+)").get()));
- // 设置标题
- csdnBlog.setTitle(
- page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
- //设置内容
- csdnBlog.setContent(
- page.getHtml().xpath("//div[@class='article_content']/allText()").get());
- // 设置日期
- csdnBlog.setDate(
- page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
- // 设置标签(可以有多个,用,来分割)
- csdnBlog.setTags(listToString(page.getHtml()
- .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
- // 设置类别(可以有多个,用,来分割)
- csdnBlog.setCategory(
- listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
- // 设置阅读人数
- csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
- .regex("(\\d+)人阅读").get()));
- // 设置评论人数
- csdnBlog.setComments(Integer.parseInt(page.getHtml()
- .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
- // 设置是否原创
- csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
- // 把对象存入数据库
- new CsdnBlogDao().add(csdnBlog);
- // 把对象输出控制台
- System.out.println(csdnBlog);
- }
- }
- // 把list转换为string,用,分割
- public static String listToString(List<String> stringList) {
- if (stringList == null) {
- return null;
- }
- StringBuilder result = new StringBuilder();
- boolean flag = false;
- for (String string : stringList) {
- if (flag) {
- result.append(",");
- } else {
- flag = true;
- }
- result.append(string);
- }
- return result.toString();
- }
- public static void main(String[] args) {
- long startTime, endTime;
- System.out.println("【爬虫开始】...");
- startTime = System.currentTimeMillis();
- // 从用户博客首页开始抓,开启5个线程,启动爬虫
- Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.youkuaiyun.com/" + username).thread(5).run();
- endTime = System.currentTimeMillis();
- System.out.println("【爬虫结束】共抓取" + size + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!");
- }
- }
运行main方法数据就保存到数据库中去了。
完整的demo: http://download.csdn.net/detail/u011781521/9672531