参考:https://zhuanlan.zhihu.com/p/26379515
1、获取商品分类
用scrapy进行爬取,mongodb进行存储,开始爬取的网址为https://www.jd.com/allSort.aspx
首先要获取的是21个大类的信息(如图片所示,图书,音像,电子书刊),然后在每个大类下面还有细分的类别(例如电子书,网络原创,数字杂志等),这些细分类别的url会含有不同的关键字,例如:
<a href="//e.jd.com/ebook.html" target="_blank">电子书</a> <a href="//mvd.jd.com/music.html" target="_blank">音乐</a> <a href="//list.jd.com/list.html?cat=1319,1523,7052" target="_blank">婴幼奶粉</a>
这里可以从含有list的url页面(第三个)获得分类信息并把url传给下一个函数,如果是不包含list的页面则再次执行函数搜索,代码如下:
def parse_category(self, response):
    """Parse the all-categories page and follow every usable category link.

    Yields a ``FirstCategoriesItem`` for each link text found in the
    top-level categories container. Then, for every sub-category anchor
    whose host prefix is contained in ``key_word``:

    * a non-``list`` host is requested again with this same callback, so
      the list links on that page are discovered in turn;
    * a ``list`` host yields a ``CategoriesItem`` (name, url, ``_id``)
      followed by a request handled by ``parse_list``.

    A list link whose URL carries no ``=`` is logged and skipped instead of
    aborting the remaining links on the page.
    """
    selector = Selector(response)

    top_level_names = selector.xpath(
        '//div[@class="categories-container clearfix"]//a/text()').extract()
    for top_level_name in top_level_names:
        first_category = FirstCategoriesItem()
        first_category['name'] = top_level_name
        yield first_category

    anchors = selector.xpath(
        '//div[@class="category-item m"]/div[@class="mc"]'
        '/div[@class="items"]/dl/dd/a').extract()
    for anchor_html in anchors:
        links = re.findall(r'<a href="(.*?)" target="_blank">(.*?)</a>', anchor_html)
        for href, label in links:
            # href is protocol-relative ("//list.jd.com/..."): take the text
            # before the first dot and drop the two leading slashes.
            host_prefix = href.split('.')[0][2:]
            if host_prefix not in key_word:
                continue

            url = 'https:' + href
            if host_prefix != 'list':
                # Not a product list yet: search that page for list links.
                yield Request(url=url, callback=self.parse_category)
                continue

            # The category id is the value of the first query parameter,
            # e.g. "cat=1319,1523,7052" -> "1319,1523,7052".
            query_parts = href.split('=')
            if len(query_parts) < 2:
                self.logger.warning('list url without category id: %s', url)
                continue

            category = CategoriesItem()
            category['name'] = label
            category['url'] = url
            category['_id'] = query_parts[1].split('&')[0]
            yield category
            yield Request(url=url, callback=self.parse_list)
2、获得商品url并翻页
将每一页的所有商品url传给parse_product函数并且翻页重复执行
def parse_list(self, response):
    """Request every product on a listing page, then follow the next page.

    The category id is taken from the first query-parameter value of
    ``response.url`` and passed to ``parse_product`` through
    ``meta['category']``. Nothing is yielded when the URL has no ``=``.
    """
    url_parts = response.url.split('=')
    if len(url_parts) < 2:
        self.logger.warning('list url without category id: %s', response.url)
        return
    # meta travels with each product request so the product callback can
    # read the category back.
    meta = {'category': url_parts[1].split('&')[0]}

    selector = Selector(response)
    for li in selector.xpath('//div[@id="plist"]//li[@class="gl-item"]'):
        # The leading "." keeps the query inside this <li>; a bare "//" would
        # search the whole document and return the first product every time.
        hrefs = li.xpath('.//div[@class="p-img"]/a[@target="_blank"]/@href').extract()
        if not hrefs:
            continue
        yield Request(url='https:' + hrefs[0], callback=self.parse_product, meta=meta)

    next_list = response.xpath('//a[@class="pn-next"]/@href').extract()
    if next_list:
        yield Request(url=Base_url + next_list[0], callback=self.parse_list)
3、获取店铺信息和商品信息
进入到商品页面后用正则表达式搜索venderId和shopId,搜索出来的结果是这样的[('47969', '44950')],店铺url也跟shopId有关,所以就很容易构造出来了,而店铺名称可能处于不同位置,需要逐个查找。
然后就是商品信息了,商品价钱,商品优惠,商品评论都是异步加载,需要动态获取,可以看到其实返回结果都是json格式,优惠信息有两种,一种是优惠券,另一种是促销信息。
获取需要的商品信息后再把评论url传递给parse_comments函数。
def parse_product(self, response):
    """Parse a product page into a shop item, a product item and a comment request.

    Reads ``response.meta['category']`` (the category ``_id``). Yields, in
    order: a ``ShopItem``, a ``ProductsItem`` and a request for the first
    comment page, handled by ``parse_comments``. Nothing is yielded when the
    page contains no venderId/shopId pair.

    Price and promotion data come from separate JSON endpoints and are
    fetched with ``requests``. A failure in either lookup is logged and the
    product is still yielded without those fields, rather than being lost.

    NOTE(review): ``requests.get`` is synchronous, so each call stalls this
    callback until it returns or times out; consider moving these lookups
    into chained Scrapy requests.
    """
    category = response.meta['category']  # same value as the category '_id'

    # "\s" lets the pattern step over the line break between the two fields.
    ids = re.findall(r"venderId:(.*?),\s.*?shopId:'(.*?)'", response.text)
    if not ids:
        # Some pages write shopId without the surrounding quotes.
        ids = re.findall(r"venderId:(.*?),\s.*?shopId:(.*?),", response.text)
    if not ids:
        self.logger.warning('venderId/shopId not found, skipping %s', response.url)
        return
    vender_id, shop_id = ids[0]  # e.g. ('47969', '44950')

    shop_item = ShopItem()
    shop_item['shopId'] = shop_id
    shop_item['venderId'] = vender_id
    # The shop address is built from the shop id.
    shop_item['url1'] = 'http://mall.jd.com/index-%s.html' % (shop_id)
    shop_links = response.xpath(
        '//ul[@class="parameter2 p-parameter-list"]/li/a/@href').extract()
    # Fall back to the constructed address when the page links no shop.
    shop_item['url2'] = ('https:' + shop_links[0]) if shop_links else shop_item['url1']
    shop_name = self._shop_name(response, shop_id)
    shop_item['name'] = shop_name
    shop_item['_id'] = shop_name
    yield shop_item

    product_item = ProductsItem()
    product_item['shopId'] = shop_id
    product_item['category'] = category

    titles = response.xpath('//div[@class="sku-name"]/text()').extract()
    if titles:
        # Drop non-breaking spaces and surrounding whitespace.
        title = titles[0].replace(u"\xa0", "").strip()
    else:
        fallback_titles = response.xpath('//div[@id="name"]/h1/text()').extract()
        title = fallback_titles[0] if fallback_titles else ''
    product_item['name'] = title

    # "https://item.jd.com/10194567708.html" -> "10194567708"
    product_id = response.url.split('/')[-1][:-5]
    product_item['_id'] = product_id
    product_item['url'] = response.url

    # Description: every text node of the parameter list, joined with ';'.
    desc = response.xpath('//ul[@class="parameter2 p-parameter-list"]//text()').extract()
    product_item['description'] = ';'.join(i.strip() for i in desc)

    prices = self._fetch_price(product_id)
    if prices is not None:
        product_item['reallyPrice'], product_item['originalPrice'] = prices

    promotions = self._fetch_promotions(product_id, shop_id, vender_id, category)
    if promotions:
        # NOTE(review): coupons and promotions are kept together in this one
        # field so neither overwrites the other; split them if the item
        # declares a second description field.
        product_item['favourableDesc1'] = ';'.join(promotions)

    yield product_item

    # Hand the first comment page (page index 0) to parse_comments.
    yield Request(url=comment_url % (product_id, '0'),
                  callback=self.parse_comments,
                  meta={'product_id': product_id, 'page': 0})


def _shop_name(self, response, shop_id):
    """Return the shop name, trying each known location on the page in turn."""
    if shop_id == '0':
        return '京东自营'
    # (xpath, strip-result) pairs, checked in order; the first hit wins.
    candidates = (
        ('//ul[@class="parameter2 p-parameter-list"]/li/a//text()', False),
        ('//div[@class="name"]/a//text()', True),
        ('//div[@class="shopName"]/strong/span/a//text()', True),
        ('//div[@class="seller-infor"]/a//text()', True),
    )
    for xpath, strip in candidates:
        found = response.xpath(xpath).extract()
        if found:
            return found[0].strip() if strip else found[0]
    return u'京东自营'


def _fetch_price(self, product_id):
    """Return ``(current_price, original_price)`` or ``None`` if unavailable."""
    # The endpoint answers like
    # [{"op":"99.80","m":"199.00","id":"J_10194567708","p":"84.00"}]
    try:
        price_json = requests.get(url=price_url + product_id, timeout=10).json()
        return price_json[0]['p'], price_json[0]['m']
    except (requests.RequestException, ValueError, LookupError, TypeError) as exc:
        self.logger.warning('no price for product %s: %s', product_id, exc)
        return None


def _fetch_promotions(self, product_id, shop_id, vender_id, category):
    """Return coupon and promotion descriptions as a list, coupons first."""
    res_url = favourable_url % (product_id, shop_id, vender_id,
                                category.replace(',', '%2c'))
    try:
        fav_data = requests.get(res_url, timeout=10).json()
    except (requests.RequestException, ValueError) as exc:
        self.logger.warning('no promotion data for product %s: %s', product_id, exc)
        return []
    if not isinstance(fav_data, dict):
        return []

    descriptions = []
    # Coupons: validity period plus "spend X, save Y".
    for coupon in fav_data.get('skuCoupon') or []:
        descriptions.append(u'有效期%s至%s,满%s减%s' % (
            coupon.get('beginTime'), coupon.get('endTime'),
            coupon.get('quota'), coupon.get('discount')))
    # Promotion tags.
    prom = fav_data.get('prom') or {}
    for tag in prom.get('pickOneTag') or []:
        if tag.get('content'):
            descriptions.append(tag['content'])
    return descriptions
4、获取热评标签,评论图片,具体评论信息
获得json数据后进行存储,这里要注意的是每一页只显示10条评论,需要翻页获取,设定了最多只抓取60页的评论,并且利用meta参数记录抓取页数,翻页递归执行这个函数
def parse_comments(self, response):
    """Parse one page of a product's comment JSON and request the next page.

    Reads ``response.meta['product_id']`` and ``response.meta['page']``
    (0 is the first page). Yields, when present in the payload:

    * one ``CommentSummaryItem`` built from ``productCommentSummary``;
    * one ``HotCommentTagItem`` per entry of ``hotCommentTagStatistics``;
    * one ``CommentItem`` per entry of ``comments``, each followed by a
      ``CommentImageItem`` per attached image;
    * a request for the next page, until ``maxPage`` or 60 pages is reached.

    Missing sections are skipped rather than raising, because some products
    have no comment data at all.
    """
    try:
        data = json.loads(response.text)
    except ValueError as exc:
        self.logger.warning('get comment failed: %s', exc)
        return
    if not isinstance(data, dict):
        return

    product_id = response.meta['product_id']

    # Keys copied verbatim from the payload into the item of the same name.
    summary_fields = (
        'goodRateShow', 'poorRateShow', 'poorCountStr', 'averageScore',
        'generalCountStr', 'showCountStr', 'generalRate', 'skuId',
        'goodCountStr', 'poorRate', 'goodRateStyle', 'skuIds',
        'poorRateStyle', 'generalRateStyle', 'commentCountStr', 'productId',
        'afterCountStr', 'generalRateShow',
    )
    page_fields = ('jwotestProduct', 'maxPage', 'score', 'soType', 'imageListCount')
    hot_tag_fields = (
        'name', 'status', 'rid', 'productId', 'count', 'created',
        'modified', 'type', 'canBeFiltered',
    )
    comment_fields = (
        'guid', 'content', 'creationTime', 'isTop', 'referenceId',
        'referenceName', 'referenceType', 'referenceTypeId', 'firstCategory',
        'secondCategory', 'thirdCategory', 'replyCount', 'score', 'status',
        'title', 'usefulVoteCount', 'uselessVoteCount', 'userLevelId',
        'userProvince', 'viewCount', 'orderId', 'isReplyGrade', 'nickname',
        'userClient', 'mergeOrderStatus', 'discussionId', 'productColor',
        'productSize', 'imageCount', 'integral', 'userImgFlag',
        'anonymousFlag', 'userLevelName', 'plusAvailable', 'recommend',
        'userLevelColor', 'userClientShow', 'isMobile', 'days', 'afterDays',
    )
    image_fields = (
        'associateId', 'productId', 'available', 'pin', 'dealt',
        'imgTitle', 'isMain',
    )

    def with_prefix(prefix, value):
        # Leave a missing or empty value untouched instead of raising.
        return prefix + value if value else value

    summary = data.get('productCommentSummary')
    if summary:
        summary_item = CommentSummaryItem()
        # The summary '_id' is the product id, matching ProductsItem.
        summary_item['_id'] = summary.get('productId')
        for field in summary_fields:
            summary_item[field] = summary.get(field)
        for field in page_fields:
            summary_item[field] = data.get(field)
        yield summary_item

    for hot_tag in data.get('hotCommentTagStatistics') or []:
        hot_tag_item = HotCommentTagItem()
        hot_tag_item['_id'] = hot_tag.get('id')
        for field in hot_tag_fields:
            hot_tag_item[field] = hot_tag.get(field)
        yield hot_tag_item

    for comment_data in data.get('comments') or []:
        comment = CommentItem()
        comment['_id'] = comment_data.get('id')
        comment['productId'] = product_id
        for field in comment_fields:
            comment[field] = comment_data.get(field)
        comment['userImage'] = with_prefix('http://', comment_data.get('userImage'))
        comment['userImageUrl'] = with_prefix('http://', comment_data.get('userImageUrl'))
        yield comment

        for image in comment_data.get('images') or []:
            image_item = CommentImageItem()
            image_item['_id'] = image.get('id')
            # 'productId' here is the image's own field, not the product '_id'.
            for field in image_fields:
                image_item[field] = image.get(field)
            image_item['imgUrl'] = with_prefix('http:', image.get('imgUrl'))
            yield image_item

    # Paging: meta carries the page index; stop at maxPage, capped at 60.
    max_page = min(int(data.get('maxPage') or 1), 60)
    comment_page = response.meta['page'] + 1
    if comment_page < max_page:
        yield Request(url=comment_url % (product_id, comment_page),
                      callback=self.parse_comments,
                      meta={'product_id': product_id, 'page': comment_page})
5、结果
21个大类
1183个小类
45066间店铺
37412件商品
20973个热评标签
37403条评论统计
657119张评论图片
1557612条评论
看最后的输出日志,一共请求了62万+次,异常最多的是KeyError,主要是某些商品没有评论数据,所以报错。另外一个问题是店铺数量比商品数量还要多,原因可能是有些商品没有价钱,代码在yield productsItem前报错,导致商品没有被存储。
关注公众号获取个人分享的资源以及了解更多详情