const puppeteer = require('puppeteer');
const async = require('async');

/**
 * Build the options object handed to `puppeteer.launch`.
 *
 * @param {*} flag - When truthy, a headless configuration with a 200 ms
 *   `slowMo` delay is returned.
 * @returns {{headless: boolean, slowMo: number}|undefined} Launch options, or
 *   `undefined` (Puppeteer defaults) when `flag` is falsy.
 */
const isdev = function (flag) {
  if (flag) {
    return { headless: true, slowMo: 200 };
  }
  return undefined;
};

/**
 * Launch a browser, open `url` in a new page, run `task(page)` and always
 * close the browser afterwards, even when navigation or the task throws.
 *
 * @param {string} url - Address to navigate to.
 * @param {function(object): Promise<*>} task - Receives the Puppeteer page.
 * @returns {Promise<*>} Whatever `task` resolves to.
 */
async function withPage(url, task) {
  const browser = await puppeteer.launch(isdev(true));
  try {
    const page = await browser.newPage();
    await page.goto(url);
    return await task(page);
  } finally {
    // Without this a failed navigation would leave a Chromium process behind.
    await browser.close();
  }
}

/**
 * Collect the product ids shown on one search-result page.
 *
 * Works both promise-style and callback-style: when `callback` is supplied,
 * success and failure are both reported through it (node convention) and the
 * returned promise does not reject; without a callback the promise rejects
 * on failure.
 *
 * @param {string} query - Search keyword.
 * @param {number} pageNo - Value sent as the `s` query parameter.
 *   NOTE(review): the value is forwarded unchanged; confirm whether the site
 *   expects a page number or an item offset here.
 * @param {function(?Error, number[]=)} [callback] - Optional node-style callback.
 * @returns {Promise<number[]|undefined>} The ids found on the page.
 */
const scrape = async (query, pageNo, callback) => {
  // encodeURIComponent (not encodeURI) so `&`, `=`, `#` and `+` inside the
  // keyword cannot break out of the `q` parameter.
  const url = 'https://s.taobao.com/search?q=' + encodeURIComponent(query) + '&s=' + pageNo;
  console.log('[searchproduct] url:', url);

  let result;
  try {
    result = await withPage(url, (page) => page.evaluate(() => {
      // Runs inside the browser page. The first 15 characters of each element
      // id are dropped and the rest is read as a number - presumably a fixed
      // prefix precedes the numeric product id; verify against live markup.
      return Array.from(
        document.querySelectorAll('.J_ItemPic'),
        (good) => Number(good.id.slice(15))
      );
    }));
  } catch (err) {
    if (callback) {
      callback(err);
      return undefined;
    }
    throw err;
  }

  // Invoked outside the try block so an exception thrown by the callback
  // itself is never mistaken for a scraping failure.
  callback && callback(null, result);
  return result;
};

/**
 * Crawl search pages 1..`pages` one after another and gather the distinct
 * product ids.
 *
 * A failing page is logged and whatever ids were collected so far are still
 * delivered, so one bad page does not stall the pipeline.
 *
 * @param {string} query - Search keyword.
 * @param {number|string} pages - Number of pages to crawl (coerced with Number).
 * @param {function(number[])} [callback] - Receives the de-duplicated ids.
 * @returns {Promise<number[]>} Resolves to the same de-duplicated ids.
 */
async function reptileIds(query, pages, callback) {
  const pageArr = [];
  for (let i = 1; i <= Number(pages); i++) {
    pageArr.push(i);
  }

  const outcome = await new Promise((resolve) => {
    async.mapSeries(pageArr, function (item, next) {
      console.log('正在爬取第' + item + '页');
      scrape(query, item, next);
    }, function (err, result) {
      resolve({ err, result });
    });
  });

  if (outcome.err) {
    console.error('[searchproduct] failed:', outcome.err);
  }

  // After an error the result list may be missing or contain holes.
  const ids = [];
  (outcome.result || []).forEach((item) => {
    if (Array.isArray(item)) {
      ids.push(...item);
    }
  });
  const uniqueIds = Array.from(new Set(ids));

  callback && callback(uniqueIds);
  return uniqueIds;
}

/**
 * Fetch price, sales count and title for one product.
 *
 * Same dual contract as `scrape`: with a callback, errors are reported through
 * it; without one, the returned promise rejects.
 *
 * @param {number|string} id - Product id appended to the detail-page URL.
 * @param {function(?Error, object=)} [callback] - Optional node-style callback.
 * @returns {Promise<{price: string, sales: string, title: string, id: *}|undefined>}
 *   Fields are the `innerHTML` of the matched elements, or '' when an element
 *   is not found.
 */
async function getGoodsDetail(id, callback) {
  const url = 'https://detail.tmall.com/item.htm?id=' + id;
  console.log('[searchgood] url:', url);

  let result;
  try {
    // NOTE(review): the original carried a commented-out fixed delay before
    // reading the page; if fields come back empty the content may be rendered
    // late and an explicit wait may be needed.
    result = await withPage(url, (page) => page.evaluate(() => {
      const price = document.querySelector('#J_PromoPrice > dd > div > span');
      const sales = document.querySelector('#J_DetailMeta > div.tm-clear > div.tb-property > div > ul > li.tm-ind-item.tm-ind-sellCount > div > span.tm-count');
      const title = document.querySelector('#J_DetailMeta > div.tm-clear > div.tb-property > div > div.tb-detail-hd > h1');
      return {
        price: price ? price.innerHTML : '',
        sales: sales ? sales.innerHTML : '',
        title: title ? title.innerHTML : '',
      };
    }));
  } catch (err) {
    if (callback) {
      callback(err);
      return undefined;
    }
    throw err;
  }

  result.id = id;
  callback && callback(null, result);
  return result;
}

// Entry point: collect ids -> fetch the details of each product -> print them.
reptileIds('袜子', 2, function (ids) {
  // At most 5 detail pages are fetched concurrently.
  async.mapLimit(ids, 5, function (id, done) {
    getGoodsDetail(id).then(
      (detail) => done(null, detail),
      (err) => {
        // Best effort: log the failure and keep going. Always call `done`,
        // otherwise mapLimit would wait forever.
        console.log('未获取到数据,', id, err);
        done(null, null);
      }
    );
  }, function (err, result) {
    if (err) {
      console.error(err);
      return;
    }
    console.log("result:");
    // Drop the placeholders left by products that could not be fetched.
    console.log(result.filter(Boolean));
  });
}).catch((err) => {
  console.error(err);
});
被异步搞炸了:
1. 传递异步数据时要注意,需要使用 callback 来传递结果
https://blog.csdn.net/fangjian1204/article/details/50585073
也可以使用 ES2017 的语法
async 和 await 来解决异步问题 -- 暂时没有找到更好的方法
https://juejin.im/entry/58523b908e450a006c4d0c5b
2.async模块的使用
控制并发,可以很好地控制 Node.js 带来的并发问题
https://blog.csdn.net/dai_jing/article/details/47058579