一.Puppeteer的使用与注意事项
(1).环境准备
npm install -g koa-generator
koa2 crawler
npm install
npm i puppeteer -S
(2).初始化项目
2.删除app.js中对users路由的引入,并注释掉其对应的app.use调用
(3).爬虫初始
const router = require('koa-router')(),
  pt = require('puppeteer');

/**
 * GET / — crawls the banner list of the target page with a headless browser
 * and prints the scraped items to the server console.
 *
 * The handler does not set a response body; the result is only logged.
 */
router.get('/', async (ctx, next) => {
  const url = 'https://msiwei.ke.qq.com/#category=-1&tab=1';
  const bs = await pt.launch();
  try {
    const pg = await bs.newPage();
    await pg.goto(url, {
      timeout: 30 * 1000,
      // The option key is `waitUntil`; a misspelled key is silently ignored
      // and navigation would resolve before the page's requests settle.
      waitUntil: 'networkidle2'
    });
    // The callback is serialized and executed inside the page, so it may
    // only rely on page globals (here: the page's own `window.$`).
    const result = await pg.evaluate(() => {
      const $ = window.$,
        $item = $('.agency-big-banner-ul .agency-big-banner-li');
      const data = [];
      $item.each((index, item) => {
        const $el = $(item),
          $elLink = $el.find('.js-banner-btnqq');
        data.push({
          cid: $elLink.attr('data-id'),
          href: $elLink.prop('href'),
          imgUrl: $elLink.find('img').prop('src'),
          title: $elLink.prop('title')
        });
      });
      return data;
    });
    console.log(result);
  } finally {
    // `close` must be invoked (not merely referenced) or every request
    // leaves a browser process behind; `finally` also covers failures.
    await bs.close();
  }
});
module.exports = router
访问 http://localhost:3000/ 即可触发爬取
二.启动子进程运行爬虫脚本
1.根目录下建立puppeteer文件夹,建立crawler.js文件
const pt = require('puppeteer');

/**
 * Stand-alone crawler script. It scrapes the banner list of the target page,
 * sends the resulting array to the parent process through `process.send`,
 * and then exits. `process.send` is only available when this script is
 * started with an IPC channel (for example via `child_process.fork`).
 */
;(async () => {
  const url = 'https://msiwei.ke.qq.com/#category=-1&tab=1';
  const bs = await pt.launch();
  let result;
  try {
    const pg = await bs.newPage();
    await pg.goto(url, {
      timeout: 30 * 1000,
      // Correct option name is `waitUntil`; the misspelled key was ignored.
      waitUntil: 'networkidle2'
    });
    // Runs inside the page context; only page globals are reachable.
    result = await pg.evaluate(() => {
      const $ = window.$,
        $item = $('.agency-big-banner-ul .agency-big-banner-li');
      const data = [];
      $item.each((index, item) => {
        const $el = $(item),
          $elLink = $el.find('.js-banner-btnqq');
        data.push({
          cid: $elLink.attr('data-id'),
          href: $elLink.prop('href'),
          imgUrl: $elLink.find('img').prop('src'),
          title: $elLink.prop('title')
        });
      });
      return data;
    });
  } finally {
    // Actually call close() so the browser process is not leaked,
    // including when navigation or evaluation throws.
    await bs.close();
  }
  process.send(result);
  // Exit on a later tick rather than synchronously after send().
  setTimeout(() => {
    process.exit(0)
  })
})().catch((err) => {
  // Surface the failure and report it to the parent via a non-zero exit code.
  console.error(err);
  process.exit(1);
});
2.更改routes文件夹下的index.js调用子进程,且处理进程返回值
const router = require('koa-router')();
const cp = require('child_process');
const path = require('path');

/**
 * GET / — forks the crawler script as a child process and logs what it
 * reports: every message it sends, plus the first `exit` or `error` event
 * argument (whichever fires first; the other is ignored).
 */
router.get('/', async (ctx, next) => {
  const scriptPath = path.resolve(__dirname, '../puppeteer/crawler.js');
  const worker = cp.fork(scriptPath, []);

  // Shared latch so that only one of `exit` / `error` gets logged.
  let settled = false;
  const reportOnce = (payload) => {
    if (!settled) {
      settled = true;
      console.log(payload);
    }
  };

  worker.on('message', (data) => {
    console.log(data);
  });
  worker.on('exit', reportOnce);
  worker.on('error', reportOnce);
});
module.exports = router
三.改造Koa2以及封装爬虫和开启子进程程序
1.建立lib文件夹,文件夹下建立crawler.js文件做爬取操作
const pt = require('puppeteer');
module.exports= async (options)=>{
const bs = await pt.launch(),
pg = await bs.newPage(),
url = options.url;
await pg.goto(url,{
waitUtil:'networkidle2'
})
const result = await pg.evaluate(options.callback);
await bs.close();
process.send(result);
setTimeout(() => {
process.exit(0)
}, 1000);
}
2.再新建一个utils.js,做信息处理操作
const cp = require('child_process'),
{resolve} = require('path');
module.exports={
startProcess(options){
const script = resolve(__dirname,options.path),
child = cp.fork(script,[]);
let invoked = false;
child.on('message',(data)=>{
options.message(data);
})
child.on('exit',(code)=>{
if(invoked){
return
}
invoked = true;
console.log(code);
})
child.on('error',(code)=>{
if(invoked){
return
}
invoked = true;
console.log(code);
})
}
}
3.建立crawler文件夹,再建立slider.js,做传参调用爬取方法
const crawler = require('../lib/crawler');

// Crawl the banner slider: the callback below is handed to the crawler and
// collects one record per banner list item from the page's DOM.
crawler({
  url: 'https://msiwei.ke.qq.com/#category=-1&tab=1',
  callback() {
    const jq = window.$;
    const slides = [];
    jq('.agency-big-banner-ul .agency-big-banner-li').each((_, node) => {
      const $link = jq(node).find('.js-banner-btnqq');
      slides.push({
        cid: $link.attr('data-id'),
        href: $link.prop('href'),
        imgUrl: $link.find('img').prop('src'),
        title: $link.prop('title')
      });
    });
    return slides;
  }
})
4.将routes下的index.js改为crawler.js,且更改app.js里的配置
const createRouter = require('koa-router');
const router = createRouter();
const crawlerController = require('../controller/crawler');

// All routes in this file are mounted under /crawler.
router.prefix('/crawler');

// GET /crawler/crawl_slider_data is handled by the crawler controller.
router.get('/crawl_slider_data', crawlerController.crawlSliderData);

module.exports = router