Scraping car review data from Autohome (autohome.com.cn) with Java

I. Environment setup

1. Technology stack

  • Java 8
  • Spring Boot 2.2.2.RELEASE
  • MyBatis-Plus 3.2.0
  • HttpClient 4.5.10
  • Jsoup 1.11.3
  • Quartz (version managed by Spring Boot) + spring-context-support 5.1.9.RELEASE

2. Create a Spring Boot project and add the dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.2.2.RELEASE</version>
    <relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>cn.helin9s.crawler.autohome</groupId>
<artifactId>crawler-autohome</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>crawler-autohome</name>
<description>Demo project for Spring Boot</description>

<properties>
    <java.version>1.8</java.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!--lombok-->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
        <exclusions>
            <exclusion>
                <groupId>org.junit.vintage</groupId>
                <artifactId>junit-vintage-engine</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- Common utility libraries -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.8.1</version>
    </dependency>
    <!-- Quartz scheduling -->
    <dependency>
        <groupId>org.quartz-scheduler</groupId>
        <artifactId>quartz</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context-support</artifactId>
        <version>5.1.9.RELEASE</version>
    </dependency>
    <!--HttpClient-->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
    </dependency>
    <!--jsoup-->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <!-- MyBatis-Plus -->
    <dependency>
        <groupId>com.baomidou</groupId>
        <artifactId>mybatis-plus-boot-starter</artifactId>
        <version>3.2.0</version>
        <exclusions>
            <exclusion>
                <artifactId>jsqlparser</artifactId>
                <groupId>com.github.jsqlparser</groupId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>com.github.pagehelper</groupId>
        <artifactId>pagehelper-spring-boot-starter</artifactId>
        <version>1.2.12</version>
        <exclusions>
            <exclusion>
                <groupId>org.mybatis</groupId>
                <artifactId>mybatis</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.mybatis</groupId>
                <artifactId>mybatis-spring</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
        </plugin>
    </plugins>
</build>

</project>


3. Configure the YAML files

  • application.yml
spring:
  profiles:
    active: dev
server:
  port: 8080
mybatis-plus:
  global-config:
    db-config:
      id-type: auto
  mapper-locations: classpath:mapper/*.xml
  configuration:
    map-underscore-to-camel-case: true
pagehelper:
  helper-dialect: mysql
  params: count=countSql
  support-methods-arguments: true
  • application-dev.yml
logging:
  level:
    cn.helin9s.crawler.autohome.crawlerautohome.dao: trace

spring:
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai
    username: root
    password: 123456
    hikari:
      auto-commit: true
      connection-test-query: select 1

4. Test the database connection

Create a mapper interface and its XML

package cn.helin9s.crawler.autohome.crawlerautohome.dao;

import org.springframework.stereotype.Repository;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.dao
 * @description @TODO
 * @create 2020-01-01 21:26
 */
@Repository
public interface TestDao {
    public String getNow();
}

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.helin9s.crawler.autohome.crawlerautohome.dao.TestDao">
    <select id="getNow" resultType="java.lang.String">
        select NOW();
    </select>
</mapper>

Test it

package cn.helin9s.crawler.autohome.crawlerautohome;

import cn.helin9s.crawler.autohome.crawlerautohome.dao.TestDao;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

@SpringBootTest
class CrawlerAutohomeApplicationTests {

    @Autowired
    private TestDao testDao;

    @Test
    public void test(){
        String now = testDao.getNow();
        System.out.println(now);
    }
}

Once this test passes, the development environment is ready.

II. Development analysis

1. Crawl flow

The URL of the review listing page is https://www.autohome.com.cn/bestauto/1

The last path segment is the page number, so we simply crawl every page in order starting from page 1 (see the loop sketch after the list below).


  1. Fetch the HTML page for the URL.
  2. Parse the HTML and extract every review on the page.
  3. Iterate over the extracted reviews.
  4. If a review has already been saved, skip it and move on to the next one.
  5. Otherwise, save it.
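
As a rough sketch, the whole flow boils down to the loop below; CrawlSketch and its helper methods are hypothetical stand-ins for the ApiService, the Jsoup parsing, the dedup filter and the persistence code built in the rest of this article:

public class CrawlSketch {

    public void crawlAll(int totalPages) {
        for (int page = 1; page <= totalPages; page++) {
            //1. fetch the HTML of one listing page
            String html = fetchPage("https://www.autohome.com.cn/bestauto/" + page);
            //2-3. parse the page and iterate over its reviews
            for (String title : parseReviewTitles(html)) {
                //4. skip reviews that are already stored
                if (isAlreadySaved(title)) {
                    continue;
                }
                //5. save the new review
                save(title);
            }
        }
    }

    //hypothetical placeholders for the real components (ApiService, Jsoup parsing, TitleFilter, ICarTestService)
    private String fetchPage(String url) { return ""; }
    private java.util.List<String> parseReviewTitles(String html) { return java.util.Collections.emptyList(); }
    private boolean isAlreadySaved(String title) { return false; }
    private void save(String title) { }
}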

2. Database table design

Create the crawler database

CREATE DATABASE crawler;

Create the review table car_test

CREATE TABLE `car_test` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'name of the tested car',
  `test_speed` int(10) unsigned DEFAULT '0' COMMENT 'acceleration 0-100 km/h, in milliseconds',
  `test_brake` int(10) unsigned DEFAULT '0' COMMENT 'braking 100-0 km/h, in millimeters',
  `test_oil` int(10) unsigned DEFAULT '0' COMMENT 'measured fuel consumption (L/100 km), in milliliters',
  `editor_name1` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'reviewer 1',
  `editor_remark1` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'review comment 1',
  `editor_name2` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'reviewer 2',
  `editor_remark2` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'review comment 2',
  `editor_name3` varchar(10) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'reviewer 3',
  `editor_remark3` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'review comment 3',
  `image` varchar(1000) COLLATE utf8mb4_unicode_ci DEFAULT '' COMMENT 'review images: 5 image file names, separated by commas',
  `created` datetime DEFAULT NULL COMMENT 'created time',
  `updated` datetime DEFAULT NULL COMMENT 'updated time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='Autohome review table';
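
For example, a measured 0–100 km/h time of 7.2 s would be stored as 7200 (ms), a braking distance of 38.5 m as 38500 (mm), and a fuel consumption of 8.6 L/100 km as 8600 (mL); the conversion (strip the trailing unit character and multiply by 1000) is handled by the changeStr2Num helper shown later in the article.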

Connect to the database with IDEA's Database tool


Right-click the newly created car_test table and generate a POJO from it


Adjust the generated class

package cn.helin9s.crawler.autohome.crawlerautohome.pojo.model;

import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;

import java.util.Date;

@Data
@NoArgsConstructor
@AllArgsConstructor
@ToString
@EqualsAndHashCode
@TableName("car_test")
public class CarTestModel {

  private Long id;
  private String title;
  private Long testSpeed;
  private Long testBrake;
  private Long testOil;
  private String editorName1;
  private String editorRemark1;
  private String editorName2;
  private String editorRemark2;
  private String editorName3;
  private String editorRemark3;
  private String image;
  private Date created;
  private Date updated;
}

III. Implementing the crawler: ApiService and friends

1. Connection pool management

  • The connection pool manager
package cn.helin9s.crawler.autohome.crawlerautohome.config;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config
 * @description @TODO
 * @create 2020-01-01 22:35
 */
@Configuration
public class HttpClientConnectManagerCfg {

    @Bean
    public PoolingHttpClientConnectionManager poolingHttpClientConnectionManager(){
        //create the pooled connection manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        //maximum total connections
        cm.setMaxTotal(200);
        //maximum connections per route (host)
        cm.setDefaultMaxPerRoute(20);

        return cm;
    }
}

  • The Quartz scheduler configuration
package cn.helin9s.crawler.autohome.crawlerautohome.config;

import cn.helin9s.crawler.autohome.crawlerautohome.config.job.CloseConnectJob;
import org.quartz.Trigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config
 * @description @TODO
 * @create 2020-01-01 22:42
 */
@Configuration
public class SchedledCfg {

    /**
     * Job definition: close expired connections.
     * @return
     */
    @Bean("closeConnectJobBean")
    public JobDetailFactoryBean jobDetailFactoryBean(){
        //factory bean that describes the job
        JobDetailFactoryBean jb = new JobDetailFactoryBean();
        //key under which the Spring context is exposed; the job can look the context up with it
        jb.setApplicationContextJobDataKey("context");
        //the job class to run
        jb.setJobClass(CloseConnectJob.class);
        //keep the job even when no trigger is bound to it
        jb.setDurability(true);
        return jb;
    }

    /**
     * Trigger for the close-expired-connections job.
     * @Qualifier injects the bean by name.
     * @return
     */
    @Bean
    public CronTriggerFactoryBean cronTriggerFactoryBean(
            @Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean){
        //factory bean for a cron trigger
        CronTriggerFactoryBean cronTriggerFactoryBean = new CronTriggerFactoryBean();
        //bind the job detail to the trigger
        cronTriggerFactoryBean.setJobDetail(itemJobBean.getObject());
        //cron expression: run every 5 seconds, starting at second 0
        cronTriggerFactoryBean.setCronExpression("0/5 * * * * ? ");
        return cronTriggerFactoryBean;
    }

    /**
     * The scheduler.
     * @return
     */
    @Bean
    public SchedulerFactoryBean schedulerFactoryBean(Trigger cronTriggerImpl){
        //factory bean for the Quartz scheduler
        SchedulerFactoryBean bean = new SchedulerFactoryBean();
        //register the trigger with the scheduler
        bean.setTriggers(cronTriggerImpl);
        return bean;
    }
}

  • The scheduled job that closes expired connections
package cn.helin9s.crawler.autohome.crawlerautohome.config.job;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config
 * @description Periodically close expired HttpClient connections
 * @create 2020-01-01 23:18
 */
//do not start a new run while the previous one is still executing
@DisallowConcurrentExecution
public class CloseConnectJob extends QuartzJobBean {
    @Override
    protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
        //System.out.println("CloseConnectJob");

        //get the Spring application context
        ApplicationContext applicationContext =(ApplicationContext)context.getJobDetail().getJobDataMap().get("context");

        //get the HttpClient connection manager from the Spring context and close expired connections
        applicationContext.getBean(PoolingHttpClientConnectionManager.class).closeExpiredConnections();
    }
}

2. Define and implement the ApiService interface

  • Interface layer
package cn.helin9s.crawler.autohome.crawlerautohome.service;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.service
 * @description 1. GET a page's HTML  2. GET (download) an image
 * @create 2020-01-02 11:01
 */
public interface ApiService {

    /**
     * GET the HTML of a page.
     * @param url
     * @return
     */
    public String getHtml(String url);

    /**
     * GET (download) an image and save it locally.
     * @param url
     * @return
     */
    public String getImage(String url);
}
  • Implementation layer
package cn.helin9s.crawler.autohome.crawlerautohome.service.impl;

import cn.helin9s.crawler.autohome.crawlerautohome.service.ApiService;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Optional;
import java.util.UUID;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.service.impl
 * @description @TODO
 * @create 2020-01-02 11:04
 */
@Service
public class ApiServiceImpl implements ApiService {

    @Autowired
    private PoolingHttpClientConnectionManager cm;

    /**
     * Build the request configuration.
     * @return
     */
    private RequestConfig getConfig(){
        return RequestConfig.custom()
                    .setConnectTimeout(1000)            //timeout for establishing the connection
                    .setConnectionRequestTimeout(500)   //timeout for obtaining a connection from the pool
                    .setSocketTimeout(10000)            //socket (data read) timeout
                    .build();
    }

    @Override
    public String getHtml(String url) {
        //1. obtain an HttpClient backed by the pooled connection manager
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        //2. create the HttpGet request
        HttpGet httpGet = new HttpGet(url);

        //3. set the User-Agent header (fill in a real browser UA string here)
        httpGet.setHeader("User-Agent", "");

        //4. apply the RequestConfig
        httpGet.setConfig(getConfig());

        //5. execute the request
        String html="";
        try(CloseableHttpResponse response = httpClient.execute(httpGet)) {
            //6. parse the response body
            html = Optional.ofNullable(response).map(resp -> {
                if (resp.getStatusLine().getStatusCode() == 200) {
                    return resp.getEntity();
                } else {
                    return null;
                }
            }).map(entity -> {
                try {
                    return EntityUtils.toString(entity, "UTF-8");
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return "";
            }).orElse("");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return html;
    }

    @Override
    public String getImage(String url) {
        //1. obtain an HttpClient backed by the pooled connection manager
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();

        //2. create the HttpGet request
        HttpGet httpGet = new HttpGet(url);

        //3. set the User-Agent header (fill in a real browser UA string here)
        httpGet.setHeader("User-Agent", "");

        //4. apply the RequestConfig
        httpGet.setConfig(getConfig());

        //5. execute the request
        String imageName="";
        try(CloseableHttpResponse response = httpClient.execute(httpGet)) {
            //6. parse the response body
            imageName = Optional.ofNullable(response).map(resp -> {
                if (resp.getStatusLine().getStatusCode() == 200) {
                    return resp.getEntity();
                } else {
                    return null;
                }
            }).map(entity -> {
                //derive the file extension from the Content-Type (image/gif, image/png, ...)
                String extName = "." + entity.getContentType().getValue().split("/")[1];
                //generate a unique file name with a UUID
                String imgName = UUID.randomUUID().toString()+extName;
                //output directory for downloaded images
                String path="F:/images/";
                try (OutputStream outputStream = new FileOutputStream(new File(path + imgName))) {
                    //write the response body to the file; try-with-resources closes the stream
                    entity.writeTo(outputStream);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                return imgName;
            }).orElse("");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return imageName;
    }
}

  • Tests
package cn.helin9s.crawler.autohome.crawlerautohome;

import cn.helin9s.crawler.autohome.crawlerautohome.service.ApiService;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

@SpringBootTest
class CrawlerAutohomeApplicationTests {

    @Autowired
    private ApiService apiService;
    public static final String URL="https://www.autohome.com.cn/bestauto/1";

    public static final String IMGURL="https://car2.autoimg.cn/cardfs/product/g27/M06/0B/BD/1024x0_1_q95_autohomecar__ChcCQF4B36qALfbDAAcdSW2oROc338.jpg";

    @Test
    public void getHtml(){
        String html = apiService.getHtml(URL);
        Document dom = Jsoup.parse(html);
        System.out.println(dom.select("title").first().text());
    }

    @Test
    public void getImage(){
        String image = apiService.getImage(IMGURL);
        System.out.println(image);
    }

}

3. Deduplication filter

  • Bloom filter

    A Bloom filter is used to tell whether an element is in a set, representing the set compactly with a bit array. Its space efficiency and query time far exceed those of ordinary approaches, but it has a certain false-positive probability, so it suits scenarios that can tolerate the occasional false positive. If a Bloom filter says an element is in the set, it most likely is; if it says the element is not in the set, it definitely is not. Bloom filters are therefore widely used for large-scale deduplication.

    Algorithm idea

    The core idea is to compute k different hash values for an element with k hash functions, map them to positions in a bit array, and set those positions to 1. To test whether an element is in the set, compute the same k hashes and check whether every corresponding bit is 1; if even one bit is 0, the element is definitely not in the set. The filter can, however, report an element as present when it is not, because all of that element's bit positions may have been set by other elements; this is what produces the false-positive rate.
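
    For reference, with a bit array of m bits, k hash functions and n inserted elements, the false-positive probability is approximately p ≈ (1 − e^(−kn/m))^k, and the optimal number of hash functions is k = (m/n)·ln 2; this is the trade-off behind the bit-array size and the number of seeds chosen below.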


    A simple Bloom filter in Java (hash + BitSet):

    import java.util.ArrayList;
    import java.util.BitSet;
    import java.util.List;
     
    public class BloomFilter {
        /* the BitSet is initially sized to 2 << 24 (= 2^25) bits */
        private static final int DEFAULT_SIZE = 2 << 24;
        /* seeds for the different hash functions; primes are usually chosen */
        private static final int[] seeds = new int[] { 5, 7, 11, 13, 31, 37, 61 };
        private BitSet bits = new BitSet(DEFAULT_SIZE);
        /* hash function objects */
        private SimpleHash[] func = new SimpleHash[seeds.length];
     
        public BloomFilter() {
            for (int i = 0; i < seeds.length; i++) {
                func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
            }
        }
    
        /* mark an element in the bit set */
        public void add(String value) {
            for (SimpleHash f : func) {
                bits.set(f.hash(value), true);
            }
        }
     
        public boolean contains(String value) {
            if (value == null) {
                return false;
            }
            boolean ret = true;
            for (SimpleHash f : func) {
                ret = ret && bits.get(f.hash(value));
            }
            return ret;
        }
     
        /* hash function inner class */
        public static class SimpleHash {
            private int cap;
            private int seed;
     
            public SimpleHash(int cap, int seed) {
                this.cap = cap;
                this.seed = seed;
            }
     
            public int hash(String value) {
                int result = 0;
                int len = value.length();
                for (int i = 0; i < len; i++) {
                    result = seed * result + value.charAt(i);
                }
                return (cap - 1) & result;
            }
        }
     
        public static void main(String[] args) {
            BloomFilter bf = new BloomFilter();
            List<String> strs = new ArrayList<String>();
            strs.add("123456");
            strs.add("hello word");
            strs.add("transDocId");
            strs.add("123456");
            strs.add("transDocId");
            strs.add("hello word");
            strs.add("test");
            for (int i=0;i<strs.size();i++) {
                String s = strs.get(i);
                boolean bl = bf.contains(s);
                if(bl){
                    System.out.println(i+","+s);
                }else{
                    bf.add(s);
                }
            }
        }
     
    }
    
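    The TitleFilter utility referenced by the configuration below can be a thin wrapper around the same hash + BitSet idea; a minimal sketch (class layout is illustrative) could look like this:

    package cn.helin9s.crawler.autohome.crawlerautohome.util;

    import java.util.BitSet;

    /**
     * Deduplication filter for review titles.
     * A minimal sketch reusing the hash + BitSet approach shown above.
     */
    public class TitleFilter {

        /* initial BitSet size */
        private static final int DEFAULT_SIZE = 2 << 24;
        /* seeds for the hash functions */
        private static final int[] SEEDS = new int[]{5, 7, 11, 13, 31, 37, 61};
        private final BitSet bits = new BitSet(DEFAULT_SIZE);

        /* mark a title as seen */
        public void add(String title) {
            for (int seed : SEEDS) {
                bits.set(hash(title, seed), true);
            }
        }

        /* true if the title has (probably) been seen before */
        public boolean contains(String title) {
            if (title == null) {
                return false;
            }
            for (int seed : SEEDS) {
                if (!bits.get(hash(title, seed))) {
                    return false;
                }
            }
            return true;
        }

        private int hash(String value, int seed) {
            int result = 0;
            for (int i = 0; i < value.length(); i++) {
                result = seed * result + value.charAt(i);
            }
            return (DEFAULT_SIZE - 1) & result;
        }
    }
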
  • Filter configuration

package cn.helin9s.crawler.autohome.crawlerautohome.config;

import cn.helin9s.crawler.autohome.crawlerautohome.dao.CarTestDao;
import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import cn.helin9s.crawler.autohome.crawlerautohome.util.TitleFilter;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.util.List;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config
 * @description @TODO
 * @create 2020-01-02 16:08
 */
@Configuration
public class TitleFilterCfg {

    @Autowired
    private CarTestDao carTestDao;

    @Bean
    public TitleFilter titleFilter(){
        //create the filter once, then seed it with every title already in the database
        TitleFilter titleFilter = new TitleFilter();
        //current page number
        Integer page = 1;
        //page size
        Integer pageSize = 100;
        for (;;){
            //query the stored titles; the data set can be large, so read it page by page
            //(the first argument is used as the LIMIT offset, hence (page - 1) * pageSize)
            List<CarTestModel> list = carTestDao.getList((page - 1) * pageSize, pageSize);
            for (CarTestModel carTestModel : list) {
                //seed the dedup filter with the titles that are already stored
                titleFilter.add(carTestModel.getTitle());
            }

            //move on to the next page
            page++;
            //stop after the last (partial) page
            if (!pageSize.equals(list.size())){
                break;
            }
        }

        return titleFilter;
    }
}
  • Data access interface
package cn.helin9s.crawler.autohome.crawlerautohome.dao;

import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Param;
import org.springframework.stereotype.Repository;

import java.util.List;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.dao
 * @description @TODO
 * @create 2020-01-01 22:32
 */
@Repository
public interface CarTestDao extends BaseMapper<CarTestModel> {
    List<CarTestModel> getList(@Param("pageNum") Integer pageNum,@Param("pageSize") Integer pageSize);
}
  • MyBatis mapper XML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.helin9s.crawler.autohome.crawlerautohome.dao.CarTestDao">

    <select id="getList" resultType="cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel">
        select * from  car_test limit #{pageNum},#{pageSize}
    </select>

</mapper>
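
Note that #{pageNum} is plugged straight into LIMIT as the row offset, which is why the filter configuration above passes (page - 1) * pageSize rather than the page number itself.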

4. Test the crawl logic against a saved page

  • First, fetch one page with a test method and save it locally
package cn.helin9s.crawler.autohome.crawlerautohome;

import cn.helin9s.crawler.autohome.crawlerautohome.service.ApiService;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

//@SpringBootTest
class CrawlerAutohomeApplicationTests {

    @Test
    public void testJsoupGetHtml() throws Exception {
        //parse the URL; the second argument is the connection timeout in milliseconds
        Document dom = Jsoup.parse(new URL("https://www.autohome.com.cn/bestauto/1"), 10000);

        //write the fetched page out to a local HTML file
        FileUtils.writeStringToFile(new File("C:/Users/he/Desktop/test1.html"), dom.html(), "UTF-8");

        //use the DOM to read the content of the title tag
        String title = dom.getElementsByTag("title").first().text();
        System.out.println(title);
    }
}
  • Then write the parsing logic against that saved page in a Spring Boot test
package cn.helin9s.crawler.autohome.crawlerautohome;

import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import cn.helin9s.crawler.autohome.crawlerautohome.service.ApiService;
import cn.helin9s.crawler.autohome.crawlerautohome.service.ICarTestService;
import cn.helin9s.crawler.autohome.crawlerautohome.util.TitleFilter;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome
 * @description @TODO
 * @create 2020-01-03 17:06
 */
@SpringBootTest
public class AutohomeCrawlerTest {

    @Autowired
    private ApiService apiService;
    @Autowired
    private ICarTestService carTestService;
    @Autowired
    private TitleFilter titleFilter;

    @Test
    public void testCrawler() throws IOException {

        //parse the locally saved page with Jsoup
        Document dom = Jsoup.parse(new File("C:/Users/he/Desktop/test1.html"), "UTF-8");

        //select every review div
        Elements divs = dom.select("#bestautocontent div.uibox");

        //iterate over every review on the page
        for (Element div : divs) {
            //dedup filter: reviews we have already stored need no further work
            String title = div.select("div.uibox-title").first().text();
            if (titleFilter.contains(title)){
                //already present, move on to the next review
                continue;
            }
            //parse the element into a review object
            CarTestModel carTestModel = getCarTestModel(div);

            //parse the element and download the review images
            String carImage = getCarImage(div);
            //save the review to the database
            carTestModel.setImage(carImage);
            carTestService.save(carTestModel);
        }
    }

    /**
     * Parse the given element into a car review object.
     * @param div
     * @return
     */
    private CarTestModel getCarTestModel(Element div){
        CarTestModel carTestModel = new CarTestModel(null,
                div.select("div.uibox-title").first().text(),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(2) div.dd-div2").first().text()),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(3) div.dd-div2").first().text()),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(4) div.dd-div2").first().text()),
                div.select(".tabbox2 dd:nth-child(2) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(2) > div.dd-div3").first().text(),
                div.select(".tabbox2 dd:nth-child(3) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(3) > div.dd-div3").first().text(),
                div.select(".tabbox2 dd:nth-child(4) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(4) > div.dd-div3").first().text(),
                null,
                new Date(),
                new Date());

        return carTestModel;
    }

    /**
     * Parse the given element and download the review images.
     * @param div
     * @return the downloaded image file names
     */
    private String getCarImage(Element div){
        //collected image file names
        List<String> images = new ArrayList<>();
        //the list items that hold the image links
        Elements page = div.select("ul.piclist02 li");
        //iterate over the image elements
        for (Element element : page) {
            //URL of the image detail page
            String imagePage = "https:" + element.select("a").attr("href");
            //fetch the image detail page
            String html = apiService.getHtml(imagePage);
            //parse the detail page
            Document dom = Jsoup.parse(html);
            //the actual image URL
            String imageUrl ="https:" + dom.getElementById("img").attr("src");
            //download the image and record its file name
            String imageName = apiService.getImage(imageUrl);
            images.add(imageName);
        }

        return ArrayUtils.toString(images);
    }

    /**
     * Strip the trailing unit character, parse the rest as a number and multiply by 1000 (e.g. "7.2秒" -> 7200).
     * @param str
     */
    private long changeStr2Num(String str){
        String substring = StringUtils.substring(str, 0, str.length() - 1);
        substring = isNumeric(substring.trim())?substring.trim():"0";
        //parse as a double and keep it as a Number so the *1000 result converts to long without precision issues
        Number num = substring.isEmpty() ? 0 : Double.parseDouble(substring.trim())*1000;
        return num.longValue();
    }

    /**
     * Check whether a string is numeric.
     * @param str
     * @return
     */
    public boolean isNumeric(String str) {
        //try to parse it as a double
        try {
            double db=Double.parseDouble(str);
        } catch (Exception e) {
            return false;
        }
        return true;
    }
}

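The ICarTestService injected above can be a plain MyBatis-Plus service wrapper around CarTestDao (its save method comes from MyBatis-Plus's IService); a minimal sketch, with an illustrative implementation class name:

package cn.helin9s.crawler.autohome.crawlerautohome.service;

import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import com.baomidou.mybatisplus.extension.service.IService;

public interface ICarTestService extends IService<CarTestModel> {
}

package cn.helin9s.crawler.autohome.crawlerautohome.service.impl;

import cn.helin9s.crawler.autohome.crawlerautohome.dao.CarTestDao;
import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import cn.helin9s.crawler.autohome.crawlerautohome.service.ICarTestService;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.springframework.stereotype.Service;

@Service
public class CarTestServiceImpl extends ServiceImpl<CarTestDao, CarTestModel> implements ICarTestService {
}
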
Once this test passes, the same logic can be driven by a scheduled job.

5. A scheduled job that crawls the data automatically

  • Add a new job and trigger to the scheduler configuration
package cn.helin9s.crawler.autohome.crawlerautohome.config;

import cn.helin9s.crawler.autohome.crawlerautohome.config.job.CloseConnectJob;
import cn.helin9s.crawler.autohome.crawlerautohome.config.job.CrawlerAutohomeJobBean;
import org.quartz.Trigger;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.quartz.CronTriggerFactoryBean;
import org.springframework.scheduling.quartz.JobDetailFactoryBean;
import org.springframework.scheduling.quartz.SchedulerFactoryBean;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config
 * @description @TODO
 * @create 2020-01-01 22:42
 */
@Configuration
public class SchedledCfg {


    /**
     * Job definition: crawl the Autohome review data.
     * @return
     */
    @Bean("crawlerAutohomeJobBean")
    public JobDetailFactoryBean crawlerAutohomeJobBean(){
        //factory bean that describes the job
        JobDetailFactoryBean jb = new JobDetailFactoryBean();
        //key under which the Spring context is exposed; the job can look the context up with it
        jb.setApplicationContextJobDataKey("context");
        //the job class to run
        jb.setJobClass(CrawlerAutohomeJobBean.class);
        //keep the job even when no trigger is bound to it
        jb.setDurability(true);
        return jb;
    }

    /**
     * Trigger for the crawl job.
     * @Qualifier injects the bean by name.
     * @return
     */
    @Bean("crawlerAutohomeJobTrigger")
    public CronTriggerFactoryBean crawlerAutohomeJobTrigger(
            @Qualifier(value = "crawlerAutohomeJobBean") JobDetailFactoryBean itemJobBean){
        //factory bean for a cron trigger
        CronTriggerFactoryBean cronTriggerFactoryBean = new CronTriggerFactoryBean();
        //bind the job detail to the trigger
        cronTriggerFactoryBean.setJobDetail(itemJobBean.getObject());
        //cron expression: fire every 5 seconds; with @DisallowConcurrentExecution a new crawl only starts after the previous run finishes
        cronTriggerFactoryBean.setCronExpression("0/5 * * * * ? ");
        return cronTriggerFactoryBean;
    }



    /**
     * Job definition: close expired connections.
     * @return
     */
    @Bean("closeConnectJobBean")
    public JobDetailFactoryBean jobDetailFactoryBean(){
        //factory bean that describes the job
        JobDetailFactoryBean jb = new JobDetailFactoryBean();
        //key under which the Spring context is exposed; the job can look the context up with it
        jb.setApplicationContextJobDataKey("context");
        //the job class to run
        jb.setJobClass(CloseConnectJob.class);
        //keep the job even when no trigger is bound to it
        jb.setDurability(true);
        return jb;
    }

    /**
     * Trigger for the close-expired-connections job.
     * @Qualifier injects the bean by name.
     * @return
     */
    @Bean("closeConnectJobTrigger")
    public CronTriggerFactoryBean cronTriggerFactoryBean(
            @Qualifier(value = "closeConnectJobBean") JobDetailFactoryBean itemJobBean){
        //factory bean for a cron trigger
        CronTriggerFactoryBean cronTriggerFactoryBean = new CronTriggerFactoryBean();
        //bind the job detail to the trigger
        cronTriggerFactoryBean.setJobDetail(itemJobBean.getObject());
        //cron expression: run every 5 seconds, starting at second 0
        cronTriggerFactoryBean.setCronExpression("0/5 * * * * ? ");
        return cronTriggerFactoryBean;
    }

    /**
     * The scheduler, wired with all triggers.
     * @return
     */
    @Bean
    public SchedulerFactoryBean schedulerFactoryBean(Trigger[] cronTriggerImpl){
        //factory bean for the Quartz scheduler
        SchedulerFactoryBean bean = new SchedulerFactoryBean();
        //register all triggers with the scheduler
        bean.setTriggers(cronTriggerImpl);
        return bean;
    }
}

  • Write a new job that loops over every page and crawls it
package cn.helin9s.crawler.autohome.crawlerautohome.config.job;

import cn.helin9s.crawler.autohome.crawlerautohome.pojo.model.CarTestModel;
import cn.helin9s.crawler.autohome.crawlerautohome.service.ApiService;
import cn.helin9s.crawler.autohome.crawlerautohome.service.ICarTestService;
import cn.helin9s.crawler.autohome.crawlerautohome.util.TitleFilter;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.DisallowConcurrentExecution;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.scheduling.quartz.QuartzJobBean;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * @author helin
 * @project crawler-autohome
 * @package cn.helin9s.crawler.autohome.crawlerautohome.config.job
 * @description @TODO
 * @create 2020-01-04 15:51
 */
//do not start a new run while the previous one is still executing
@DisallowConcurrentExecution
public class CrawlerAutohomeJobBean extends QuartzJobBean {

    @Autowired
    private ApiService apiService;
    @Autowired
    private ICarTestService carTestService;
    @Autowired
    private TitleFilter titleFilter;

    public static final String URL="https://www.autohome.com.cn/bestauto/";

    @Override
    protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
        //get the Spring application context
        ApplicationContext applicationContext =(ApplicationContext) context.getJobDetail().getJobDataMap().get("context");

        this.apiService = applicationContext.getBean(ApiService.class);
        this.carTestService= applicationContext.getBean(ICarTestService.class);
        this.titleFilter= applicationContext.getBean(TitleFilter.class);
		
        //loop over the 165 list pages (the page count at the time of writing) and crawl each one
        for (int i = 1; i <= 165; i++) {
            try {
                crawler(URL+i);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
    
	/**
     * Crawl one page: parse it and save the data.
     * @param url
     * @throws IOException
     */
    private void crawler(String url) throws IOException {

        //1. fetch the page HTML with apiService
        String html=apiService.getHtml(url);
        //2. parse the page with Jsoup
        Document dom = Jsoup.parse(html);

        //3. select every review div
        Elements divs = dom.select("#bestautocontent div.uibox");

        //4. iterate over every review on the page
        for (Element div : divs) {
            //dedup filter: reviews we have already stored need no further work
            String title = div.select("div.uibox-title").first().text();
            if (titleFilter.contains(title)){
                //already present, move on to the next review
                continue;
            }
            //parse the element into a review object
            CarTestModel carTestModel = getCarTestModel(div);

            //parse the element and download the review images
            String carImage = getCarImage(div);
            //save the review to the database
            carTestModel.setImage(carImage);
            carTestService.save(carTestModel);
        }
    }

    /**
     * Parse the given element into a car review object.
     * @param div
     * @return
     */
    private CarTestModel getCarTestModel(Element div){
        CarTestModel carTestModel = new CarTestModel(null,
                div.select("div.uibox-title").first().text(),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(2) div.dd-div2").first().text()),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(3) div.dd-div2").first().text()),
                changeStr2Num(div.select(".tabbox1 dd:nth-child(4) div.dd-div2").first().text()),
                div.select(".tabbox2 dd:nth-child(2) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(2) > div.dd-div3").first().text(),
                div.select(".tabbox2 dd:nth-child(3) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(3) > div.dd-div3").first().text(),
                div.select(".tabbox2 dd:nth-child(4) > div.dd-div1").first().text(),
                div.select(".tabbox2 dd:nth-child(4) > div.dd-div3").first().text(),
                null,
                new Date(),
                new Date());

        return carTestModel;
    }

    /**
     * Parse the given element and download the review images.
     * @param div
     * @return the downloaded image file names
     */
    private String getCarImage(Element div){
        //collected image file names
        List<String> images = new ArrayList<>();
        //the list items that hold the image links
        Elements page = div.select("ul.piclist02 li");
        //iterate over the image elements
        for (Element element : page) {
            //URL of the image detail page
            String imagePage = "https:" + element.select("a").attr("href");
            //fetch the image detail page
            String html = apiService.getHtml(imagePage);
            //parse the detail page
            Document dom = Jsoup.parse(html);
            //the actual image URL
            String imageUrl ="https:" + dom.getElementById("img").attr("src");
            //download the image and record its file name
            String imageName = apiService.getImage(imageUrl);
            images.add(imageName);
        }

        return ArrayUtils.toString(images);
    }

    /**
     * Strip the trailing unit character, parse the rest as a number and multiply by 1000 (e.g. "7.2秒" -> 7200).
     * @param str
     */
    private long changeStr2Num(String str){
        String substring = StringUtils.substring(str, 0, str.length() - 1);
        substring = isNumeric(substring.trim())?substring.trim():"0";
        //parse as a double and keep it as a Number so the *1000 result converts to long without precision issues
        Number num = substring.isEmpty() ? 0 : Double.parseDouble(substring.trim())*1000;
        return num.longValue();
    }

    /**
     * Check whether a string is numeric.
     * @param str
     * @return
     */
    public boolean isNumeric(String str) {
        //try to parse it as a double
        try {
            double db=Double.parseDouble(str);
        } catch (Exception e) {
            return false;
        }
        return true;
    }
}

At this point, the crawler is complete.
