需求分析
- 导入京东商品URL列表
- 生成京东商品信息并输出到excel表
思路
- 读取excel获取URL列表
- 访问url并获得HTML源码
- 提取对应的商品信息字段
- 输出到excel
搭建框架
-
创建Spring Boot工程
-
选择依赖
-
配置pom.xml
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>com.landawang</groupId> <artifactId>operation-web</artifactId> <version>0.0.1-SNAPSHOT</version> <packaging>jar</packaging> <name>operation-web</name> <description>Demo project for Spring Boot</description> <parent> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-parent</artifactId> <version>2.0.3.RELEASE</version> <relativePath/> <!-- lookup parent from repository --> </parent> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <java.version>10</java.version> </properties> <dependencies> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-web</artifactId> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-devtools</artifactId> <scope>runtime</scope> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-test</artifactId> <scope>test</scope> </dependency> <!-- thymeleaf --> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-thymeleaf</artifactId> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.10-FINAL</version> </dependency> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> <version>4.5.1</version> </dependency> <dependency> <!-- jsoup HTML parser library @ https://jsoup.org/ --> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency> </dependencies> <build> <plugins> <plugin> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-maven-plugin</artifactId> </plugin> </plugins> </build> </project>
-
重命名application.properties为application.yml,修改内容
server: port: 8080 # 配置端口 spring: thymeleaf: # 开发时关闭缓存,不然无法看到实时页面 cache: false prefix: classpath:/templates/ suffix: .html
-
创建resources/templates/index.html
<!DOCTYPE html> <html lang="en" xmlns:th="http://www.w3.org/1999/xhtml"> <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"/> <title>生成商品信息</title> </head> <body> <h3>请选择excel文件</h3> <form action="/excel/upload" enctype="multipart/form-data" method="post"> <span style="white-space:pre"></span> <input type="file" name="file"/> <span style="white-space:pre"></span> <input type="submit" value="导入"/> <span style="white-space:pre"></span> </form> </body> </html>
-
创建PageController并访问页面
/** * Created with IntelliJ IDEA. * Project: operation-web * Package: com.landawang.operationweb.controller * User: Wangxin * Date: 2018/7/12 * Time: 16:46 * Copyright © 2018年 黄旺鑫. All rights reserved. */ package com.landawang.operationweb.controller; import org.springframework.stereotype.Controller; import org.springframework.web.bind.annotation.RequestMapping; @Controller public class PageController { @RequestMapping("/") public String index() { return "index"; } }
提取商品信息
-
ExcelController.java,编写实现逻辑
/** * Created with IntelliJ IDEA. * Project: operation-web * Package: com.landawang.operationweb.controller * User: Wangxin * Date: 2018/7/12 * Time: 17:26 * Copyright © 2018年 黄旺鑫. All rights reserved. */ package com.landawang.operationweb.controller; import org.apache.http.Consts; import org.apache.http.HttpEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.apache.poi.xssf.usermodel.*; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.openxmlformats.schemas.spreadsheetml.x2006.main.STCellType; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; import javax.servlet.http.HttpServletResponse; import java.util.Date; import java.util.HashMap; import java.util.Map; @RestController @RequestMapping("/excel") public class ExcelController { @PostMapping("/upload") public void uploadExcel(HttpServletResponse response, @RequestParam("file") MultipartFile file) { /** * 1. 获取上传的url列表 * 2. 遍历获取url对应页面的HTML源码 * 3. 提取对应的商品信息字段 * 4. 输出的excel * */ try { // 读取Excel文件 XSSFWorkbook workbook = new XSSFWorkbook(file.getInputStream()); // 读取Excel工作表 XSSFSheet sheet = workbook.getSheetAt(0); // 创建输出Excel文件 XSSFWorkbook newWorkbook = new XSSFWorkbook(); // 创建Sheet XSSFSheet newSheet = newWorkbook.createSheet(); // 创建标题行 XSSFRow titleRow = newSheet.createRow(0); // 设置标题行 XSSFCell cell1 = titleRow.createCell(0, STCellType.INT_STR); cell1.setCellValue("商品编码"); XSSFCell cell2 = titleRow.createCell(1, STCellType.INT_STR); cell2.setCellValue("商品名称"); XSSFCell cell3 = titleRow.createCell(2, STCellType.INT_STR); cell3.setCellValue("商品分类"); // 设置宽度 newSheet.setColumnWidth(0, 2560); newSheet.setColumnWidth(1, 25600); newSheet.setColumnWidth(2, 5120); // 遍历获取HTML源码,提取信息 for (int i = 0; i < sheet.getLastRowNum(); i++) { // 获取行 XSSFRow row = sheet.getRow(i); // 获取列 XSSFCell cell = row.getCell(0); // 获取url String url = cell.getStringCellValue(); // 输出的Excel创建行 XSSFRow newRow = newSheet.createRow(i + 1); // 判断url不为空并且包含http if (!url.isEmpty() && url.contains("http")) { // 获取商品信息集合 Map<String, String> data = getProductInfo(url); // 输出商品信息到Excel表 if (data != null) { XSSFCell cellOne = newRow.createCell(0, STCellType.INT_STR); cellOne.setCellValue(data.get("sku")); XSSFCell cellTwo = newRow.createCell(1, STCellType.INT_STR); cellTwo.setCellValue(data.get("name")); XSSFCell cellThree = newRow.createCell(2, STCellType.INT_STR); cellThree.setCellValue(data.get("cat")); } } // 打印调试 System.out.println("\n内容是:" + url); } // 下载excel response.setContentType("application/octet-stream"); // 以时间戳命名 String fileName = String.valueOf(new Date().getTime()) + ".xlsx"; response.setHeader("Content-disposition", "attachment;filename=" + fileName); response.flushBuffer(); // 输出excel newWorkbook.write(response.getOutputStream()); } catch (Exception e) { e.printStackTrace(); } } /** * 提取商品信息 * */ private Map<String, String> getProductInfo(String url) throws Exception { CloseableHttpClient httpclient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet(url); // 模拟浏览器浏览 httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:60.0) Gecko/20100101 Firefox/60.0"); CloseableHttpResponse response1 = httpclient.execute(httpGet); // The underlying HTTP connection is still held by the response object // to allow the response content to be streamed directly from the network socket. // In order to ensure correct deallocation of system resources // the user MUST call CloseableHttpResponse#close() from a finally clause. // Please note that if response content is not fully consumed the underlying // connection cannot be safely re-used and will be shut down and discarded // by the connection manager. // 结果集合 Map<String, String> reslut = null; //获取响应状态码 int StatusCode = response1.getStatusLine().getStatusCode(); try { HttpEntity entity1 = response1.getEntity(); //如果状态响应码为200,则获取html实体内容或者json文件 if(StatusCode == 200){ String html = EntityUtils.toString(entity1, Consts.UTF_8); // 提取HTML得到商品信息结果 reslut = getData(html); // 消耗掉实体 EntityUtils.consume(response1.getEntity()); }else { //否则,消耗掉实体 EntityUtils.consume(response1.getEntity()); } } finally { response1.close(); } return reslut; } private static Map<String, String> getData (String html) throws Exception{ //获取的数据,存放在集合中 Map<String, String> data = new HashMap<String,String>(); //采用Jsoup解析 Document doc = Jsoup.parse(html); //获取html标签中的内容 // 标题 String name = doc.select("div[class=sku-name]").text(); if (name != null) { data.put("name", name); } // sku String sku = ""; Elements elements = doc.select("a[data-sku]"); for (Element ele: elements) { if (ele.hasAttr("data-sku")) { sku = ele.attr("data-sku"); break; } } if (sku != null) { data.put("sku", sku); } String cat = doc.select("a[clstag=shangpin|keycount|product|mbNav-1]").text(); if (cat != null) { data.put("cat", cat); } System.out.print(sku + "---------" + cat + "---------" + name); //返回数据 return data; } }
-
解析HTML参考:https://jsoup.org
-
网络请求参考:http://hc.apache.org/httpcomponents-client-4.5.x/quickstart.html
测试
-
准备excel表
-
访问http://localhost:8080,选取excel表
-
获取时间戳命名的excel表