基于SpringBoot + MyBatis + WebMagic的爬虫
1、爬虫功能模块介绍
1)项目结构总览
- 数据的存储
- 数据导出为excel格式进行查看
- WebMagic爬虫逻辑编写
- 后端与前端建立通信掌握爬虫进度
- Web前端逻辑编写
- SpringBoot整合项目启动类
下面是各个模块代码的详细介绍
2、数据的存储
1)实体类对象
-
每个页面的存储对象都要相应的建立一个实体类对象
-
这里我拿
institutioninfo
举例
/**
 * @author 90934
 * @create 2020/2/2
 * @since 1.0.0
 *
 * Entity for one crawled institution record, mapped by JPA to the
 * "institution_info" table. Per the original note, only field declarations
 * are listed — getters, setters and toString() are IDE-generated and
 * omitted from this listing.
 */
@Entity(name = "institution_info")
public class InstitutionInfo {

    /** Surrogate primary key, auto-incremented by the database. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    // NOTE(review): all remaining columns are kept as String, presumably
    // mirroring the raw text scraped from the page — confirm before
    // changing any of them to a stronger type.
    private String name;
    private String rnumber;
    private String oname;
    private String cperson;
    private String cnumber;
    private String pcode;
    private String fnumber;
    private String weburl;
    private String email;
    private String address;
    private String start;
    private String end;
    private String abasis;
    private String parameter;
    private String baseinfoid;
}
2) 构造存储数据的方法
-
构造dao层中的数据库接口
/**
 * @author 90934
 *
 * DAO for InstitutionInfo. Extends JpaRepository so the standard CRUD and
 * query methods (save, findAll, findAll(Example), ...) are inherited and no
 * boilerplate needs to be written here.
 */
public interface InstitutionInfoDao extends JpaRepository<InstitutionInfo, Long> {
}
3)构造存储的接口和实现方法
-
构造存储的service接口
/**
 * Service contract for storing and querying crawled institution records.
 *
 * NOTE(review): the original declared {@code @Component} on this interface;
 * Spring cannot instantiate an interface, so the annotation was a no-op and
 * has been removed — the bean is supplied by the {@code @Service}
 * implementation class. The copy-pasted "fetch data by rule id" javadoc has
 * been replaced with descriptions matching what the implementation does.
 *
 * @author 90934
 */
public interface InstitutionInfoService {

    /**
     * Persists one institution record (the implementation skips records whose
     * name is already stored).
     *
     * @param institutionInfo the record to store
     */
    void save(InstitutionInfo institutionInfo);

    /**
     * Queries records matching the populated fields of the given probe object
     * (query-by-example).
     *
     * @param institutionInfo probe whose non-null fields form the condition
     * @return all matching records; empty list when none match
     */
    List<InstitutionInfo> findInstitutionInfo(InstitutionInfo institutionInfo);

    /**
     * Loads every record stored in the table.
     *
     * @return all stored records
     */
    List<InstitutionInfo> findAll();
}
-
构造service接口的实现类impl
/** * @author 90934 */ @Service public class InstitutionInfoServiceImpl implements InstitutionInfoService { private InstitutionInfoDao institutionInfoDao; @Autowired public void setInstitutionInfoDao(InstitutionInfoDao institutionInfoDao) { this.institutionInfoDao = institutionInfoDao; } @Override @Transactional(rollbackFor = Exception.class) public void save(InstitutionInfo institutionInfo) { //根据机构名称查询数据 InstitutionInfo param = new InstitutionInfo(); param.setName(institutionInfo.getName()); //执行查询 List<InstitutionInfo> list = this.findInstitutionInfo(param); //判断查询结果是否为空 if (list.size() == 0) { //如果结果为空,表示机构基本信息不存在,需要更新数据库 this.institutionInfoDao.save(institutionInfo); } //打开注释,将爬取的数据显示到web端页面进行查看,注意当爬虫数据过快已造成页面崩溃 try { ProductWebSocket.sendInfo("已成功采集 1 条数据!"); } catch (IOException e) { e.printStackTrace(); } } @Override public List<InstitutionInfo> findInstitutionInfo(InstitutionInfo institutionInfo) { //设置查询条件 Example<InstitutionInfo> example = Example.of(institutionInfo); //执行查询 return this.institutionInfoDao.findAll(example); } @Override public List<InstitutionInfo> findAll() { return this.institutionInfoDao.findAll(); } }
3、数据导出为excel格式进行查看
根据此博客内容进行修改:https://www.cnblogs.com/wlxslsb/p/10931130.html
1)构造实体类
构造两个实体类对象,分别存储表的信息和表中的字段名称
-
构造数据库中表信息的实体
-
/**
 * @author 90934
 * @date 2020/2/29 23:47
 * @description Export-topic table row: one exportable sheet definition plus
 *              its field list. Per the original note, getters, setters and
 *              toString() are IDE-generated and omitted from this listing.
 * @since 0.1.0
 */
public class ExportBean {
    private Integer id;
    /** English key of the export topic (matched against exportKey at runtime). */
    private String exportCode;
    /** Human-readable (Chinese) display name of the export topic. */
    private String exportName;
    /** Columns belonging to this topic, loaded via the MyBatis nested select. */
    private List<ExportFieldBean> fieldBeanList;
}
-
-
构造数据库中表信息的实体
-
-- Export-topic table: one row per exportable sheet definition.
-- exportCode is the lookup key used by ExportMapper.getExportByExportKey.
DROP TABLE IF EXISTS `export`;
CREATE TABLE `export` (
  `id` int(32) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '主键',
  `exportCode` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '导出主题英文名',
  `exportName` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '导出主题中文名',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 4 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '导出主题表' ROW_FORMAT = Dynamic;
-
-
构造表中的字段信息的实体对象
-
/** * @author 90934 * @date 2020/2/29 23:49 * @description 导出字段表 * @since 0.1.0 * 只写变量名称,get、set和tostring()全部都直接自动生成 */ public class ExportFieldBean { private Integer id; private Integer exportId; private String fieldCode; private String fieldName; private Integer sort; private ExportBean exportBean;
-
-
构造表中字段信息需要的sql语句
-
-- Export-field table: one row per column of an export topic.
-- exportId references export.id (no FK constraint declared — enforced in code).
DROP TABLE IF EXISTS `export_field`;
CREATE TABLE `export_field` (
  `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT '主键',
  `exportId` int(11) UNSIGNED NULL DEFAULT NULL COMMENT '导出主表ID',
  `fieldCode` varchar(55) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '字段英文名',
  `fieldName` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '字段中文名',
  `sort` int(11) UNSIGNED NULL DEFAULT 1 COMMENT '排序字段',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 40 CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '导出字段表' ROW_FORMAT = Dynamic;
-
2)使用MyBatis构造mapper对象
使用级联操作查询数据库中相应表和表中字段的信息
-
构造ExportMapper接口查询数据库中表信息,级联在查询当前表信息的时候同时查询其中的表中的字段信息
-
** * @author 90934 * @date 2020/2/29 23:51 * @description 导出字段的方法接口 * @since 0.1.0 */ public interface ExportMapper { /** * 获取各个字段的名称 * @param exportKey 字段名称 * @return ExportBean */ @Select("select * from export where exportCode = #{exportKey}") @Results({ @Result(property="fieldBeanList",column="id",one=@One(select="com.hellof.crawler.mapper.ExportFieldMapper" + ".getExportFieldBeanByExportid")) }) ExportBean getExportByExportKey(String exportKey); }
-
-
构造ExportFieldMapper接口查询相应表中的字段信息
-
/**
 * @author 90934
 * @date 2020/3/1 19:33
 * @description Loads the field definitions belonging to one export topic;
 *              used as the nested-select target of ExportMapper.
 * @since 0.1.0
 */
public interface ExportFieldMapper {

    /**
     * Fetches the export_field row(s) whose exportId matches.
     *
     * @param exportid id of the owning export topic (export.id)
     * @return the matching field definition.
     *         NOTE(review): exportId is not unique, so this query can match
     *         several rows; as a nested-select target the rows are collected
     *         into the parent's list, but calling this method directly with
     *         multiple matches would fail — consider returning
     *         {@code List<ExportFieldBean>}.
     */
    @Select("select * from export_field where exportId = #{exportid}")
    ExportFieldBean getExportFieldBeanByExportid(String exportid);
}
-
3)构造导出为Excel格式的Service接口及其impl实现类
构造IExportExcelService接口和与其相对应的ExportExcelServiceImpl实现类
-
IExportExcelService方法接口
-
/**
 * Excel-export service: turns a list of entities into an .xlsx file whose
 * columns are configured in the export / export_field tables.
 *
 * Fix: dropped the redundant {@code public} modifier — interface methods are
 * implicitly public.
 */
public interface IExportExcelService {

    /**
     * Looks up the exportable columns by exportKey, matches them against the
     * properties of each element of {@code list}, and writes the result to an
     * Excel workbook.
     *
     * @param exportKey export topic key stored in the database (export.exportCode)
     * @param fileName  sheet / output file name (without extension)
     * @param list      rows to export
     * @param req       current HTTP request
     * @param resp      current HTTP response
     */
    void exportExcelWithDispose(String exportKey, String fileName, List<?> list,
                                HttpServletRequest req, HttpServletResponse resp);
}
-
-
ExportExcelServiceImpl方法接口实现类
-
@Service public class ExportExcelServiceImpl implements IExportExcelService { private ExportMapper exportMapper; @Resource public void setExportMapper(ExportMapper exportMapper) { this.exportMapper = exportMapper; } @Override public void exportExcelWithDispose(String exportKey, String fileName, List<?> list, HttpServletRequest req, HttpServletResponse resp) { List<ExportFieldBean> fieldBeans = this.exportMapper.getExportByExportKey(exportKey).getFieldBeanList(); try { SXSSFWorkbook sxssfWorkbook = new SXSSFWorkbook(); SXSSFSheet sheet1 = sxssfWorkbook.createSheet(fileName); SXSSFRow headRow = sheet1.createRow(0); headRow.createCell(0).setCellValue("序号"); for (ExportFieldBean fieldBean: fieldBeans){ headRow.createCell(headRow.getLastCellNum()).setCellValue(fieldBean.getFieldName()); } int index = 0; SXSSFRow bodyRow = null; JSONArray jsonArray = JSONArray.fromObject(list); for (Object obj:jsonArray){ bodyRow = sheet1.createRow(sheet1.getLastRowNum() + 1); bodyRow.createCell(0).setCellValue(index++); int flag = 0; for (ExportFieldBean fieldBean: fieldBeans){ if (flag == 0){ bodyRow.createCell(bodyRow.getLastCellNum()).setCellValue((Integer) ((JSONObject)obj).get(fieldBean.getFieldCode())); flag = 1; }else { bodyRow.createCell(bodyRow.getLastCellNum()).setCellValue((String) ((JSONObject)obj).get(fieldBean.getFieldCode())); } } } FileOutputStream outputStream = new FileOutputStream(fileName + ".xlsx"); sxssfWorkbook.write(outputStream); outputStream.close(); sxssfWorkbook.close(); //打开注释,将爬取的数据显示到web端页面进行查看,注意当爬虫数据过快已造成页面崩溃 ProductWebSocket.sendInfo("已成功导出 " + list.size() + " 条数据!"); }catch (Exception e){ e.printStackTrace(); } } }
-
4、WebMagic爬虫逻辑代码
WebMagic是一个优秀的可二次开发的爬虫框架,此代码逻辑就是采用这个框架进行编写的
WebMagic框架对于爬虫中的大多数方法都有一个包装好的方法,使用者不必重复进行代码的编写
在使用WebMagic框架的时候,只需要实现PageProcessor接口就可以完成一个爬虫的构建;如果有额外的需求,只需要完成相应方法的重写,就可以构建一个优秀的爬虫项目。
官方中文说明文档:http://webmagic.io/docs/zh/posts/ch1-overview/
1)重写PageProcessor方法完成代码逻辑
-
/**
 * @author 90934
 * @date 2020/3/2 18:25
 * @description WebMagic entry point: skeleton of the PageProcessor
 *              implementation (process() is filled in with the real crawling
 *              logic later in the article).
 * @since 0.1.0
 */
@Component
public class TestProcessor implements PageProcessor {

    /**
     * Crawler-wide settings. Fix: the original listing returned a field named
     * "site" that was never declared, so it did not compile; it is declared
     * here with WebMagic's default settings via Site.me().
     */
    private final Site site = Site.me();

    @Override
    public void process(Page page) {
    }

    @Override
    public Site getSite() {
        return site;
    }
}
-
重写process()方法发现链接,将相应链接加入到爬虫队列中等待处理;判断爬虫队列中链接的类型,调用相应的爬取规则。因为现在大多数网页的页面内容是使用ajax技术异步加载出来的,所以必须使用一些自动化测试工具包来加载网页,通过分析网页的组成来发现链接(本人使用的是Selenium)
@Override public void process(Page page) { //判断url的类型 String queryData = page.getUrl().regex("query\\w+").toString(); String institutionInfo = "queryOrgInfo1"; String domainInfo = "queryPublishSignatory"; String scopeInfo = "queryPublishIBAbilityQuery"; if (queryData.equals(institutionInfo)) { //判断url目的地址是不是queryOrgInfo1 this.saveInstitutionInfo(page); } else if (queryData.equals(domainInfo)) { //判断url目的地址是不是queryPublishSignatory this.saveDomainInfo(page); } else if (queryData.equals(scopeInfo)) { //判断url目的地址是不是queryPublishIBAbilityQuery this.saveScopeInfo(page); } else { List<String> urls = new ArrayList<>(); System.setProperty("webdriver.chrome.driver", "src/main/resources/static/chromedriver.exe"); WebDriver driver = new ChromeDriver(); String url = page.getUrl().toString(); driver.get(url); List<String> addressList = GetAddress.resultData(); //方便测试 // List<String> addressList = new ArrayList<>(); // addressList.add("北京"); // addressList.add("天津"); for (String address : addressList) { WebElement orgAddress = driver.findElement(By.id("orgAddress")); orgAddress.clear(); orgAddress.sendKeys(address); WebElement btn = driver.findElement(By.className("btn")); btn.click(); try { Thread.sleep(3000); } catch (InterruptedException e) { e.printStackTrace(); } boolean accept = true; while (accept) { try { WebElement pirbutton1 = driver.findElement(By.xpath("//*[@id=\"pirlbutton1\"]")); pirbutton1.click(); Thread.sleep(5000); boolean flagStr = driver.findElement(By.id("pirlAuthInterceptDiv_c")).isDisplayed(); if (!flagStr) { accept = false; } } catch (InterruptedException e) { e.printStackTrace(); } } boolean flagStr = driver.findElement(By.xpath("//*[@id=\"pirlAuthInterceptDiv_c\"]")).isDisplayed(); if (!flagStr) { int maxPage = Integer.parseInt(driver.findElement(By.id("yui-pg0-0-totalPages-span")).getText()); for (int num = 0; num < maxPage; num++) { Html html = Html.create(driver.findElement(By.xpath("//*")).getAttribute("outerHTML")); List<Selectable> list = 
html.css("div.yui-dt-liner a").nodes(); if (list.size() != 0) { for (Selectable selectable : list) { //获取id String urlStr = selectable.regex("id\\=\\w+").toString(); //组装url放入待爬取队列 urls.add("https://las.cnas.org.cn/LAS/publish/queryOrgInfo1.action?" + urlStr); } } if