Spring Boot + JSoup 抓取京东商品信息

本博客介绍了一个使用Spring Boot实现的京东商品信息爬取项目,通过读取Excel中的商品URL,抓取商品详情页,提取商品信息,并将信息输出到新的Excel文件中。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

需求分析

  1. 导入京东商品URL列表
  2. 生成京东商品信息并输出到excel表

思路

  1. 读取excel获取URL列表
  2. 访问url并获得HTML源码
  3. 提取对应的商品信息字段
  4. 输出到excel

搭建框架

  1. 创建Spring Boot工程

    这里写图片描述

    这里写图片描述

  2. 选择依赖

    这里写图片描述

  3. 配置pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.landawang</groupId>
        <artifactId>operation-web</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>operation-web</name>
        <description>Demo project for Spring Boot</description>
    
        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>2.0.3.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
            <java.version>10</java.version>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
    
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-devtools</artifactId>
                <scope>runtime</scope>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
            </dependency>
    
            <!-- thymeleaf -->
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-thymeleaf</artifactId>
            </dependency>
    
    
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.10-FINAL</version>
            </dependency>
    
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.1</version>
            </dependency>
    
            <dependency>
                <!-- jsoup HTML parser library @ https://jsoup.org/ -->
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.11.3</version>
            </dependency>
        </dependencies>
    
        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                </plugin>
            </plugins>
        </build>
    
    </project>
    
  4. 重命名application.properties为application.yml,修改内容

    server:
      port: 8080 # 配置端口
    
    spring:
      thymeleaf:
        # 开发时关闭缓存,不然无法看到实时页面
        cache: false
        prefix: classpath:/templates/
        suffix: .html
    
  5. 创建resources/templates/index.html

    <!DOCTYPE html>
    <html lang="en" xmlns:th="http://www.w3.org/1999/xhtml">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
        <title>生成商品信息</title>
    </head>
    <body>
    
    <h3>请选择excel文件</h3>
    
    <form action="/excel/upload" enctype="multipart/form-data" method="post">
        <span style="white-space:pre"></span>
        <input type="file" name="file"/>
        <span style="white-space:pre"></span>
        <input type="submit" value="导入"/>
        <span style="white-space:pre"></span>
    </form>
    </body>
    </html>
    
  6. 创建PageController并访问页面

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 16:46
     * Copyright © 2018年 黄旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.springframework.stereotype.Controller;
    import org.springframework.web.bind.annotation.RequestMapping;
    
    @Controller
    public class PageController {
    
        @RequestMapping("/")
        public String index() {
            return "index";
        }
    }
    

提取商品信息

  1. ExcelController.java,编写实现逻辑

    /**
     * Created with IntelliJ IDEA.
     * Project: operation-web
     * Package: com.landawang.operationweb.controller
     * User: Wangxin
     * Date: 2018/7/12
     * Time: 17:26
     * Copyright © 2018年 黄旺鑫. All rights reserved.
     */
    
    
    package com.landawang.operationweb.controller;
    
    import org.apache.http.Consts;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.apache.poi.xssf.usermodel.*;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType;
    import org.springframework.web.bind.annotation.PostMapping;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.RequestParam;
    import org.springframework.web.bind.annotation.RestController;
    import org.springframework.web.multipart.MultipartFile;
    
    import javax.servlet.http.HttpServletResponse;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.Map;
    
    
    @RestController
    @RequestMapping("/excel")
    public class ExcelController {
    
        @PostMapping("/upload")
        public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) {
    
            /**
             * 1. 获取上传的url列表
             * 2. 遍历获取url对应页面的HTML源码
             * 3. 提取对应的商品信息字段
             * 4. 输出的excel
             * */
    
            try {
                // 读取Excel文件
                XSSFWorkbook workbook = new XSSFWorkbook(file.getInputStream());
                // 读取Excel工作表
                XSSFSheet sheet = workbook.getSheetAt(0);
    
    
                // 创建输出Excel文件
                XSSFWorkbook newWorkbook = new XSSFWorkbook();
                // 创建Sheet
                XSSFSheet newSheet = newWorkbook.createSheet();
                // 创建标题行
                XSSFRow titleRow = newSheet.createRow(0);
                // 设置标题行
                XSSFCell cell1 = titleRow.createCell(0, STCellType.INT_STR);
                cell1.setCellValue("商品编码");
                XSSFCell cell2 = titleRow.createCell(1, STCellType.INT_STR);
                cell2.setCellValue("商品名称");
                XSSFCell cell3 = titleRow.createCell(2, STCellType.INT_STR);
                cell3.setCellValue("商品分类");
                // 设置宽度
                newSheet.setColumnWidth(0, 2560);
                newSheet.setColumnWidth(1, 25600);
                newSheet.setColumnWidth(2, 5120);
    
                // 遍历获取HTML源码,提取信息
                for (int i = 0; i < sheet.getLastRowNum(); i++) {
                    // 获取行
                    XSSFRow row = sheet.getRow(i);
                    // 获取列
                    XSSFCell cell = row.getCell(0);
                    // 获取url
                    String url = cell.getStringCellValue();
    
                    // 输出的Excel创建行
                    XSSFRow newRow = newSheet.createRow(i + 1);
    
                    // 判断url不为空并且包含http
                    if (!url.isEmpty() && url.contains("http")) {
                        // 获取商品信息集合
                        Map<String, String> data = getProductInfo(url);
    
                        // 输出商品信息到Excel表
                        if (data != null) {
                            XSSFCell cellOne = newRow.createCell(0, STCellType.INT_STR);
                            cellOne.setCellValue(data.get("sku"));
                            XSSFCell cellTwo = newRow.createCell(1, STCellType.INT_STR);
                            cellTwo.setCellValue(data.get("name"));
                            XSSFCell cellThree = newRow.createCell(2, STCellType.INT_STR);
                            cellThree.setCellValue(data.get("cat"));
                        }
                    }
    
                    // 打印调试
                    System.out.println("\n内容是:" + url);
                }
    
                // 下载excel
                response.setContentType("application/octet-stream");
                // 以时间戳命名
                String fileName = String.valueOf(new Date().getTime()) + ".xlsx";
                response.setHeader("Content-disposition", "attachment;filename=" + fileName);
                response.flushBuffer();
    
                // 输出excel
                newWorkbook.write(response.getOutputStream());
    
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
    
        /**
         * 提取商品信息
         * */
        private Map<String, String> getProductInfo(String url) throws Exception {
    
    
            CloseableHttpClient httpclient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(url);
    
            // 模拟浏览器浏览
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0");
            CloseableHttpResponse response1 = httpclient.execute(httpGet);
            // The underlying HTTP connection is still held by the response object
            // to allow the response content to be streamed directly from the network socket.
            // In order to ensure correct deallocation of system resources
            // the user MUST call CloseableHttpResponse#close() from a finally clause.
            // Please note that if response content is not fully consumed the underlying
            // connection cannot be safely re-used and will be shut down and discarded
            // by the connection manager.
    
            // 结果集合
            Map<String, String> reslut = null;
    
            //获取响应状态码
            int StatusCode = response1.getStatusLine().getStatusCode();
    
            try {
    
                HttpEntity entity1 = response1.getEntity();
    
                //如果状态响应码为200,则获取html实体内容或者json文件
                if(StatusCode == 200){
                    String html = EntityUtils.toString(entity1, Consts.UTF_8);
    
                    // 提取HTML得到商品信息结果
                    reslut = getData(html);
                    
                    // 消耗掉实体
                    EntityUtils.consume(response1.getEntity());
                }else {
                    //否则,消耗掉实体
                    EntityUtils.consume(response1.getEntity());
                }
            } finally {
                response1.close();
            }
    
            return reslut;
        }
    
        private static Map<String, String> getData (String html) throws Exception{
            //获取的数据,存放在集合中
            Map<String, String> data = new HashMap<String,String>();
            
            //采用Jsoup解析
            Document doc = Jsoup.parse(html);
            
            //获取html标签中的内容
            // 标题
            String name = doc.select("div[class=sku-name]").text();
            if (name != null) {
                data.put("name", name);
            }
    
            // sku
            String sku = "";
            Elements elements = doc.select("a[data-sku]");
            for (Element ele: elements) {
                if (ele.hasAttr("data-sku")) {
                    sku = ele.attr("data-sku");
                    break;
                }
            }
    
            if (sku != null) {
                data.put("sku", sku);
            }
    
            String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text();
    
            if (cat != null) {
                data.put("cat", cat);
            }
    
            System.out.print(sku + "---------" + cat + "---------" + name);
            
            //返回数据
            return data;
        }
    
    }
    
  2. 解析HTML参考:https://jsoup.org

  3. 网络请求参考:http://hc.apache.org/httpcomponents-client-4.5.x/quickstart.html

测试

  1. 准备excel表
    这里写图片描述

  2. 访问http://localhost:8080,选取excel表
    这里写图片描述

  3. 获取时间戳命名的excel表
    这里写图片描述

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

黄旺鑫

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值