Python实现通过ASIN爬取亚马逊产品评论
一、最近一直在研究如何爬取亚马逊的产品评论。亚马逊的反爬机制比较严格,时不时就会封cookie、封IP;而且他们的网页结构也不太规整,给我们写爬虫带来了不少困扰。经过一天的研究,现在把成果及心得分享给大家。
1.首先导入所需要的库。这里用xpath进行内容匹配,并将爬取的内容存入MySQL,所以需要以下这些库:
import requests
import lxml.html
import pandas as pd
import pymysql
import random
import time
2.接下来是根据ASIN和请求头的cookie来获取网页
def get_response(ASIN, p, headers):
    """Fetch one page of product reviews and parse it into an lxml tree.

    Args:
        ASIN: Product identifier; converted with ``str`` and inserted into
            the review-listing URL.
        p: Page number of the review listing, appended as ``pageNumber``.
        headers: Header mapping forwarded to ``requests.get`` (the
            accompanying article supplies the cookie through it).

    Returns:
        The element returned by ``lxml.html.fromstring`` for the response
        body, ready for XPath queries.
    """
    # The URL is assembled piecewise; the resulting string is the
    # "most recent first" review listing for the given product and page.
    url = (
        'https://www.amazon.com/dp/product-reviews/' + str(ASIN)
        + '/ref=cm_cr_arp_d_paging_btm_prev_13?ie=UTF8&reviewerType=all_reviews&sortBy=recent'
        + '&pageNumber=%s' % str(p)
    )
    # timeout=8 keeps a stalled request from hanging the crawl indefinitely.
    html = requests.get(url, headers=headers, timeout=8).text
    response = lxml.html.fromstring(html)
    return response
3.接下来就是正文,用requests获取网页,lxml解析,xpath匹配
def Spider(response, j):
info = []
new_date = response.xpath('//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/span/text()' % str(j))
# 爬取评论者名称
new_name = response.xpath(
'//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/div[1]/a/div[2]/span/text()' % str(j))
# 爬取评论星级
new_star = \
response.xpath('//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/div[2]/a/i/span/text()' % str(j))[0]
# 爬取评论者购买size
new_size = response.xpath('//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/div[3]/a/text()' % str(j))
# 爬取评论者购买颜色分类
new_color = response.xpath('//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/div[3]/a/text()[2]' % str(j))
# 匹配评论者评论
new_content = response.xpath('//*[@id="cm_cr-review_list"]/div[%s]/div[1]/div[1]/div[4]/span/span/text()' % str(j))
# print(new_size, new_color)
if len