Python_JAVA第一章

最新推荐文章于 2024-06-08 16:55:51 发布

石国旺

最新推荐文章于 2024-06-08 16:55:51 发布

阅读量1.1k

点赞数

分类专栏： python

本文链接：https://blog.youkuaiyun.com/qq_41821006/article/details/104715826

版权

python 专栏收录该内容

5 篇文章

订阅专栏

文章目录

python第一章

python第一章

一、什么是爬虫

爬虫是一段程序，抓取互联网上的数据，保存到本地。
抓取过程：
- 1、使用程序模拟浏览器
- 2、向服务器发送请求。
- 3、服务器响应html
- 4、把页面中的有用的数据解析出来。
  解析页面中的链接地址。
  把链接地址添加到url队列中。
- 5、爬虫从url队列中取出下一个url，回到第2步重复上述过程。

二、爬虫的抓取环节

抓取页面。
可以使用java api中提供的URLConnection类发送请求。
推荐使用工具包HttpClient。
是apache旗下的一个开源项目。
可以模拟浏览器。
对页面进行解析。
使用Jsoup工具包。
可以像使用jQuery一样解析html。

三、HttpClient

可以使用HttpClient模拟浏览器。

使用HttpClient发送get请求

步骤：
1）创建一个HttpClient对象，使用CloseableHttpClient，使用HttpClients工具类创建。
2）创建一个HttpGet对象，get对象封装请求的url
3）使用HttpClient执行请求
4）接收服务端响应的内容。
响应的内容包含响应头
包含响应的内容（html）
5）关闭连接

/**
 * Sends a GET request with HttpClient and prints the status line, the status code
 * and the response body.
 *
 * @throws Exception if the request cannot be executed or the response cannot be read
 */
@Test
public void testGet() throws Exception {
    // The GET object wraps the request URL.
    HttpGet get = new HttpGet("https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=6c76113fd91343dbaf0cc9037e3433c3");
    // try-with-resources closes the response first and then the client, even when
    // executing the request or reading the body throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(get)) {
        // Status line of the response, e.g. "HTTP/1.1 200 OK".
        StatusLine statusLine = response.getStatusLine();
        System.out.println(statusLine);
        // Numeric status code, e.g. 200.
        int statusCode = statusLine.getStatusCode();
        System.out.println(statusCode);
        // Response body (html); "utf-8" is used when the response declares no charset.
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, "utf-8");
        System.out.println(html);
    }
}

使用HttpClient发送Post请求

步骤：

1）创建一个HttpClient对象
2）创建HttpPost对象，封装一个url
3）如果有参数就应该把参数封装到表单中。
4）使用HttpClient执行请求。
5）接收服务端响应html
6）关闭连接

/**
 * Sends a POST request with HttpClient, passing the parameters as a url-encoded
 * form, and prints the response body.
 *
 * @throws Exception if the form cannot be encoded, the request cannot be executed
 *                   or the response cannot be read
 */
@Test
public void testPost() throws Exception {
    // The POST object wraps the request URL.
    HttpPost post = new HttpPost("https://search.jd.com/");
    // Request parameters are collected as form fields.
    List<NameValuePair> form = new ArrayList<>();
    form.add(new BasicNameValuePair("keyword", "手机"));
    form.add(new BasicNameValuePair("enc", "utf-8"));
    form.add(new BasicNameValuePair("wq", "手机"));
    form.add(new BasicNameValuePair("pvid", "929b7523b89642b2a92b655a111424d0"));
    // Encode the form explicitly as UTF-8: the single-argument constructor falls back
    // to ISO-8859-1, which cannot represent the Chinese keyword values.
    UrlEncodedFormEntity entity = new UrlEncodedFormEntity(form, "utf-8");
    post.setEntity(entity);
    // try-with-resources closes the response first and then the client, even when
    // executing the request or reading the body throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(post)) {
        // Response body (html); "utf-8" is used when the response declares no charset.
        String html = EntityUtils.toString(response.getEntity(), "utf-8");
        System.out.println(html);
    }
}

使用连接池创建HttpClient对象

1）创建一个连接池对象。在系统中应是单例的。
2）使用HttpClients工具类，设置使用的连接池对象。基于连接池创建HttpClient对象。
3）使用HttpClient发送请求。
4）接收服务端响应的数据。
5）关闭Response对象，HttpClient对象不需要关闭。

 /**
  * Builds an HttpClient on top of a pooling connection manager, sends a GET request
  * and prints the response body.
  *
  * @throws Exception if the request cannot be executed or the response cannot be read
  */
@Test
public void createHttpClientUserPool() throws Exception {
    // Connection pool; in a real application there should be a single instance of it.
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    // Build the client on top of the pool.
    CloseableHttpClient httpClient = HttpClients.custom()
        .setConnectionManager(cm)
        .build();
    HttpGet get = new HttpGet("https://www.jd.com/");
    // Only the response is closed (also when reading the body throws); the client is
    // deliberately left open, as in the original snippet.
    try (CloseableHttpResponse response = httpClient.execute(get)) {
        // The second argument is the charset used when the response declares none.
        String html = EntityUtils.toString(response.getEntity(),"utf-8");
        System.out.println(html);
    }
}

使用自定义工具类 HttpsUtils 访问被拦截的请求

package cn.sgwks.crawler;

import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;

/**
 * Factory for HttpClient instances that all share one pooling connection manager.
 * <p>
 * The HTTPS socket factory registered with the pool accepts every certificate chain
 * and uses {@link NoopHostnameVerifier}.
 * NOTE(review): this turns off server certificate and hostname checks - confirm that
 * this is acceptable wherever the class is used.
 */
public class HttpsUtils {
    private static final String HTTP = "http";
    private static final String HTTPS = "https";
    private static final SSLConnectionSocketFactory sslsf;
    private static final PoolingHttpClientConnectionManager cm;

    static {
        try {
            SSLContextBuilder builder = new SSLContextBuilder();
            // Trust every certificate chain; no identity check is performed.
            builder.loadTrustMaterial(null, new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                    return true;
                }
            });
            SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(builder.build(), new String[]{"SSLv2Hello", "SSLv3", "TLSv1", "TLSv1.2"}, null, NoopHostnameVerifier.INSTANCE);
            Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                    .register(HTTP, new PlainConnectionSocketFactory())
                    .register(HTTPS, socketFactory)
                    .build();
            PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(registry);
            manager.setMaxTotal(200);// max connection
            sslsf = socketFactory;
            cm = manager;
        } catch (Exception e) {
            // Fail fast: with the setup swallowed, getHttpClient() would quietly build a
            // client without the socket factory and pool configured above.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Builds a client backed by the shared connection pool.
     * <p>
     * The connection manager is marked as shared, so closing a returned client does
     * not shut the pool down.
     *
     * @return a new client using the shared pool
     * @throws Exception declared for callers; nothing in this method throws a checked exception
     */
    public static CloseableHttpClient getHttpClient() throws Exception {
        return HttpClients.custom()
                .setSSLSocketFactory(sslsf)
                .setConnectionManager(cm)
                .setConnectionManagerShared(true)
                .build();
    }
}

 /**
  * Fetches a JD search page through the HttpsUtils client with a browser User-Agent.
  * Only the initially rendered html is returned; data that the page loads later via
  * ajax is not part of this response.
  *
  * @throws Exception if the request cannot be executed or the response cannot be read
  */
@Test
public void testHttps2() throws Exception {
    // 1. Client backed by the shared pool of HttpsUtils.
    CloseableHttpClient httpClient = HttpsUtils.getHttpClient();
    // 2. The HttpGet object wraps the url.
    HttpGet get = new HttpGet("https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=d57afafc61054075b2819ab21b6e55b1");
    // 3. Request header: Firefox User-Agent.
    get.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0");
    // Alternative Chrome User-Agent:
    // "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    // 4. Execute the request; the response is closed even when reading the body throws.
    try (CloseableHttpResponse response = httpClient.execute(get)) {
        // 5. Read and print the body.
        String html = EntityUtils.toString(response.getEntity(),"utf-8");
        System.out.println(html);
    }
}

四、jsoup

jsoup就是一个java的工具包。解析html的工具包。

使用方法

1）使用Jsoup工具类提供的方法parse，解析html
parse的参数可以是url、本地文件、String（html）、InputStream
解析之后得到一个Document对象。

/**
 * Loads a page with jsoup and selects nodes by tag name and by id.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void parseHtml() throws Exception {
    // Parse directly from a URL; the second argument is the timeout in milliseconds.
    Document document = Jsoup.parse(new URL("https://www.jd.com"), 3000);
    // Alternative: parse a local file instead, e.g.
    // Jsoup.parse(new File("src/main/java/cn/sgwks/crawler/login.html"), "utf-8")
    // Select the title element(s).
    Elements elements = document.getElementsByTag("title");
    for (Element element : elements) {
        System.out.println("初始html:"+element);
        System.out.println("去除标签html:"+element.text());
    }
    // Print the href of every link.
    System.out.println("取链接地址-----------------------");
    Elements elements1 = document.getElementsByTag("a");
    for (Element element : elements1) {
        System.out.println(element.attr("href"));
    }
    System.out.println("获取id为index值得标签-------------------");
    Element element = document.getElementById("index");
    // getElementById returns null when the page has no such id; guard against the NPE.
    if (element != null) {
        System.out.println(element.text());
    }
    System.out.println("获取包含id的属性标签-------------------");
}

2）可以使用Document对象的方法对页面进行解析

1、常用的方法
根据id选择
根据节点名称选择
根据属性选择
根据属性名称选择
根据class名称选择

// Every element that carries an "id" attribute.
Elements withIdAttribute = document.getElementsByAttribute("id");
withIdAttribute.forEach(System.out::println);
System.out.println("获取属性为target得值为_blank的标签--------------");
// Every element whose "target" attribute equals "_blank".
Elements blankTargets = document.getElementsByAttributeValue("target", "_blank");
blankTargets.forEach(System.out::println);
System.out.println("获取class标签-------------");
// Every element with the class "setting".
Elements settingNodes = document.getElementsByClass("setting");
settingNodes.forEach(System.out::println);

2、使用css选择器
和jQuery相同的css选择器。
使用select方法执行css选择器。

Document的常用方法

根据标签名称选择节点：
getElementsByTag
根据id选择节点：
getElementById
根据属性选择节点：判断节点是否包含此属性。
getElementsByAttribute
根据属性的值选择节点：
getElementsByAttributeValue
根据class选择节点：
getElementsByClass
取标签内的文本信息：
text()
取标签的属性：
attr("属性名称")

/**
 * Runs several css selectors against a page and prints every matching element.
 *
 * @throws Exception if the page cannot be fetched or parsed
 */
@Test
public void testCssSelector() throws Exception {
    Document document = Jsoup.parse(new URL("http://www.jd.com"), 3000);
    // Selectors are evaluated in this order: by id, by tag name, by class,
    // by attribute value, and finally a combined selector.
    String[] selectors = {
        "#video",
        "li",
        ".s_name",
        "[class='slogan']",
        ".box_know > div:nth-child(1) > div:nth-child(1) > h2:nth-child(1)"
    };
    for (String selector : selectors) {
        for (Element match : document.select(selector)) {
            System.out.println(match);
        }
    }
}

Document对象的css选择器
select方法解析css选择器。

五、案例

需求
- 抓取京东商城的数据，把商品数据保存到数据库。
功能分析
- 1）使用HttpClient发送一个get请求，请求搜索url，得到商品列表
- 2）使用jsoup解析搜索结果页面。
- 3）把商品信息封装一个对象中。
- 4）把商品数据保存到数据库。
  - ```
  https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid=6c76113fd91343dbaf0cc9037e3433c3
```
- 京东商城每次只展示30条数据，后30条数据是ajax动态加载的。取10页数据。
- 保存到数据库：
  创建一个数据库。
  需要的字段都是商品列表中可以解析出来的字段。
  持久层框架可以使用springDataJpa,使用springboot搭建工程。

工程搭建

1）创建一个springboot工程

2）添加父工程及依赖的jar包
SpringDataJpa的起步依赖
mysql的数据库驱动
添加spring-boot-starter-web模块
HttpClient
jsoup

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the crawler-jd project: Spring Boot web + Spring Data JPA + MySQL, HttpClient and jsoup. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.sgwks</groupId>
    <artifactId>crawler-jd</artifactId>
    <version>1.0-SNAPSHOT</version>
    <!-- Spring Boot parent; dependencies below that declare no version presumably take it from here - verify. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>
    <!-- UTF-8 sources, compiled for Java 8. -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <dependencies>
        <!-- Spring MVC (web starter) -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

        <!-- jsoup (html parsing); the only dependency with an explicit version -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

        <!-- Utility library (commons-lang3) -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
    </dependencies>
</project>

3）创建一个application.yml（下面的配置使用的是YAML语法，不能直接写在application.properties中）
其中配置数据库的连接信息

#DB Configuration:
spring:
  datasource:
    driver-class-name: com.mysql.jdbc.Driver
    url: jdbc:mysql://127.0.0.1:3306/crawler-sgw?useUnicode=true&characterEncoding=utf8
    username: root
    password: root
#JPA Configuration:
  jpa:
    database: mysql
    show-sql: true
    generate-ddl: true
    hibernate:
      ddl-auto: update

4）创建实体类、dao

package cn.sgwks.crawlerjd.entity;

import javax.persistence.*;
import java.util.Date;

/**
 * JPA entity for one crawled product, mapped to the table {@code jd_item}.
 */
@Entity
@Table(name = "jd_item")
public class Item {
    /** Primary key; generated with the IDENTITY strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    /** Numeric value of the product node's {@code data-spu} attribute. */
    private Long spu;
    /** Numeric value of the product node's {@code data-sku} attribute. */
    private Long sku;
    /** Product title text. */
    private String title;
    /** Product price as parsed from the page. */
    private Float price;
    /** File name of the downloaded product image. */
    private String pic;
    /** Link to the product page. */
    private String url;
    /** Time the record was created. */
    private Date created;
    /** Time the record was last updated. */
    private Date updated;
    public Long getId() {
        return id;
    }
    public void setId(Long id) {
        this.id = id;
    }
    public Long getSpu() {
        return spu;
    }
    public void setSpu(Long spu) {
        this.spu = spu;
    }
    public Long getSku() {
        return sku;
    }
    public void setSku(Long sku) {
        this.sku = sku;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public Float getPrice() {
        return price;
    }
    public void setPrice(Float price) {
        this.price = price;
    }
    public String getPic() {
        return pic;
    }
    public void setPic(String pic) {
        this.pic = pic;
    }
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url = url;
    }
    public Date getCreated() {
        return created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }
    public Date getUpdated() {
        return updated;
    }
    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}

package cn.sgwks.crawlerjd.dao;

import cn.sgwks.crawlerjd.entity.Item;
import org.springframework.data.jpa.repository.JpaRepository;

/**
 * Spring Data JPA repository for {@link Item} entities with {@code Long} ids.
 * Declares no methods of its own; everything comes from {@link JpaRepository}.
 */
public interface ItemDao extends JpaRepository<Item,Long> {
}

5）创建引导类。

package cn.sgwks.crawlerjd;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Spring Boot entry point of the crawler application.
 */
@SpringBootApplication
public class Application {
    /**
     * Starts the Spring application.
     *
     * @param args command line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class , args);
    }
}
--启动类
package cn.sgwks.crawlerjd.controller;

import cn.sgwks.crawlerjd.component.Crawler;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST endpoint that kicks off the crawler.
 */
@RestController
public class CrawlerController {
    @Autowired
    private Crawler crawler;

    /**
     * Starts the crawl on a new thread and returns immediately.
     *
     * @return the literal "OK"; it does not wait for the crawl to finish
     */
    @RequestMapping("/start")
    public String startCrawler() {
        Thread worker = new Thread(() -> {
            System.out.println("新线程已经启动。。。。。");
            crawler.doCrawler();
        });
        worker.start();
        return "OK";
    }
}

6）编写爬虫的业务逻辑

1、使用工具类创建一个HttpClient对象。

/**
 * Crawler component; only its fields are shown in this snippet.
 */
@Component
public class Crawler {
    // Repository used to store the crawled items.
@Autowired
    private ItemDao itemDao;
    // Search URL without the page number; the value of the trailing "page=" parameter is appended per request.
    private String startUrl = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&s=61&click=0&page=";
}

2、使用HttpClient发送请求，请求就是搜索的url+页码

/**
 * Requests five search result pages and passes the html of each one to
 * {@code parseHtml}.
 * <p>
 * Any exception ends the crawl; it is printed and not rethrown.
 */
public void doCrawler() {
    try {
        // Client backed by the shared connection pool of HttpsUtils.
        CloseableHttpClient httpClient = HttpsUtils.getHttpClient();
        for (int i = 0; i < 5; i++) {
            // Paging: the page parameter takes the values 1, 3, 5, 7, 9.
            HttpGet get = new HttpGet(startUrl + (i * 2 + 1));
            // Request header: Firefox User-Agent.
            get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0");
            String html;
            // Close every response so its connection always goes back to the pool,
            // also when reading the body throws.
            try (CloseableHttpResponse response = httpClient.execute(get)) {
                html = EntityUtils.toString(response.getEntity(), "utf-8");
            }
            // Parse the page and store the products it contains.
            parseHtml(html);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

3、接收服务端响应html
4、使用Jsoup解析html
5、把解析的商品数据封装成Item对象

6、使用dao把商品写入数据库。

/**
 * Parses one search result page and saves every product node
 * ({@code li.gl-item}) as an {@link Item}.
 * <p>
 * Fields whose source text is blank are left unset instead of failing the page.
 *
 * @param html html of a search result page
 * @throws Exception if a non-blank id or price is not numeric, or if downloading
 *                   the image or saving the item fails
 */
private void parseHtml(String html) throws Exception {
    Document document = Jsoup.parse(html);
    Elements elements = document.select("li.gl-item");
    for (Element element : elements) {
        // Raw values taken from the product node.
        String spu = element.attr("data-spu");
        String sku = element.attr("data-sku");
        String title = element.select("div.p-name em").text();
        String price = element.select("div.p-price i").text();
        String imgUrl = element.select("div.p-img img").attr("source-data-lazy-img");
        String url = element.select("div.p-img a").attr("href");

        Item item = new Item();
        // attr() returns "" for a missing attribute and Long.parseLong("") throws,
        // so the ids are only parsed when present.
        if (StringUtils.isNotBlank(spu)) {
            item.setSpu(Long.parseLong(spu));
        }
        if (StringUtils.isNotBlank(sku)) {
            item.setSku(Long.parseLong(sku));
        }
        item.setTitle(title);
        if (StringUtils.isNotBlank(price)) {
            item.setPrice(Float.parseFloat(price));
        }
        // Download the image only when the node actually carries an image url.
        if (StringUtils.isNotBlank(imgUrl)) {
            item.setPic(downloadImage(imgUrl, title));
        }
        item.setUrl(url);
        // Creation and update time carry the same instant.
        Date now = new Date();
        item.setCreated(now);
        item.setUpdated(new Date(now.getTime()));
        // Persist the item.
        itemDao.save(item);
    }
}

7、需要翻页。并且指定下载图片

/**
     * Downloads an image and stores it in the local image directory.
     * <p>
     * The file name is built from up to 15 characters of the title (special
     * characters removed), 5 characters of a random UUID and the extension of the
     * image url.
     *
     * @param imgUrl image url without the scheme; "https:" is prepended
     * @param title  product title used as the readable part of the file name
     * @return the generated file name (without directory)
     * @throws Exception if the download or writing the file fails
     */
    private String downloadImage(String imgUrl,String title) throws Exception {
        CloseableHttpClient httpClient = HttpsUtils.getHttpClient();
        HttpGet get = new HttpGet("https:" + imgUrl);
        // Request header: Firefox User-Agent.
        get.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0");
        // Extension including the dot; empty when the url contains no dot
        // (substring(-1) would throw).
        int dot = imgUrl.lastIndexOf(".");
        String extName = dot >= 0 ? imgUrl.substring(dot) : "";
        // Characters removed from the title before it is used in the file name.
        String regEx="[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。， 、？]";
        String uuid = UUID.randomUUID().toString().substring(0, 5);
        String prefix = title.replaceAll(regEx, "");
        // Cap at 15 characters; a fixed substring(0, 15) throws for shorter titles.
        String fileName = prefix.substring(0, Math.min(15, prefix.length())) + uuid + extName;
        // Target directory: C:\Users\acer\Desktop\jdPhone
        // Response and output stream are closed even when the transfer fails.
        try (CloseableHttpResponse response = httpClient.execute(get);
             FileOutputStream fos = new FileOutputStream("C:\\Users\\acer\\Desktop\\jdPhone\\" + fileName)) {
            // Stream the response body straight into the file.
            response.getEntity().writeTo(fos);
        }
        return fileName;
    }