# 转载:
import requests
from lxml import etree
import time
import csv
# Every crawled half-page appends its rows to this CSV file.
_OUTPUT_CSV = 'JD_Phone1-1.csv'

# Comment-summary endpoint; the comma-separated data-sku values are appended,
# e.g. ...?referenceIds=100007926792,
_COMMENT_SUMMARY_URL = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds="

# Selects every product <li> element of a result page.
_ITEM_XPATH = '//li[contains(@class,"gl-item")]'

# Keys read from each comment summary, in CSV column order: total comments,
# average score, good count, default-good count, good rate, follow-up count,
# video count, poor count, neutral count.
_COMMENT_FIELDS = ("CommentCount", "AverageScore", "GoodCount", "DefaultGoodCount",
                   "GoodRate", "AfterCount", "VideoCount", "PoorCount", "GeneralCount")

# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole crawl indefinitely.
_REQUEST_TIMEOUT = 10


def _fetch_comment_stats(pids):
    """Return one list of comment statistics per summary in the response.

    pids: the data-sku strings of the products on the page.
    Raises requests.HTTPError when the endpoint answers with an error status.
    """
    # The trailing comma after every id (including the last) is kept on
    # purpose so the request is identical to the documented URL form.
    comment_url = _COMMENT_SUMMARY_URL + "".join(pid + "," for pid in pids)
    comment_r = requests.get(comment_url, timeout=_REQUEST_TIMEOUT)
    comment_r.raise_for_status()
    return [[comment[field] for field in _COMMENT_FIELDS]
            for comment in comment_r.json()["CommentsCount"]]


def _save_products(datas, p_comment):
    """Append one CSV row (name, price, comment statistics) per product item.

    datas: the product <li> elements; p_comment: statistics lists, matched to
    the items by position.
    """
    blank_stats = [''] * len(_COMMENT_FIELDS)
    with open(_OUTPUT_CSV, 'a', newline='', encoding='gb18030') as f:
        write = csv.writer(f)
        for i, data in enumerate(datas):
            p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em')
            p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
            if not p_price:
                # Products whose price can switch dynamically (e.g. Xiaomi
                # MIX2) keep their lowest price in the data-price attribute.
                p_price = data.xpath('div/div[@class="p-price"]/strong/@data-price')
            if not p_name or not p_price:
                # One malformed item must not abort the rest of the page.
                print('skip item %d: name or price not found' % i)
                continue
            # A short comment response leaves the statistic columns blank
            # instead of raising IndexError.
            stats = p_comment[i] if i < len(p_comment) else blank_stats
            # string(.) flattens text that is spread over several nested tags.
            write.writerow([p_name[0].xpath('string(.)'), p_price[0]] + list(stats))


def _crawl_page(url, head):
    """Fetch one result URL, look up its comment statistics and save the rows."""
    r = requests.get(url, headers=head, timeout=_REQUEST_TIMEOUT)
    r.raise_for_status()
    # Set the encoding explicitly, otherwise the text comes out garbled.
    r.encoding = 'utf-8'
    html1 = etree.HTML(r.text)
    datas = html1.xpath(_ITEM_XPATH)
    # attribute::data-sku selects the data-sku attribute of every product <li>.
    pids = html1.xpath(_ITEM_XPATH + '/attribute::data-sku')
    print(pids)
    _save_products(datas, _fetch_comment_stats(pids))


def crow_first(n):
    """Crawl the first 30 products of search page ``n`` and append them to the CSV.

    n: 1-based page number as shown to the user; the site numbers the first
    half of that page as ``2 * n - 1``.
    Raises whatever the network, JSON or parsing layers raise; the caller
    decides how to report it.
    """
    url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=' + str(
        2 * n - 1)
    # NOTE(review): 'authority', 'method', 'path' and 'scheme' look like HTTP/2
    # pseudo-headers copied from a browser capture; they are kept unchanged so
    # the request stays identical to the original one.
    head = {'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
            'scheme': 'https',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'Cookie': ''
            }
    _crawl_page(url, head)


def crow_last(n):
    """Crawl the last 30 products of search page ``n`` and append them to the CSV.

    n: 1-based page number as shown to the user; the site numbers the second
    half of that page as ``2 * n``.
    Raises whatever the network, JSON or parsing layers raise; the caller
    decides how to report it.
    """
    # Current Unix timestamp with five decimals, sent as the log_id parameter.
    log_id = '%.5f' % time.time()
    url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=' + str(
        2 * n) + '&s=' + str(48 * n - 20) + '&scrolling=y&log_id=' + log_id
    head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest', }
    _crawl_page(url, head)
if __name__ == '__main__':
    # Every search page is loaded in two halves; crawl both halves of pages
    # 1..29. A failure in one half is printed and the crawl carries on.
    halves = (
        (' First_Page: ', crow_first),
        (' Last_Page: ', crow_last),
    )
    for page in range(1, 30):
        print('***************************************************')
        for position, (label, crawl_half) in enumerate(halves):
            if position:
                # Divider between the first and the second half of a page.
                print('------------------')
            try:
                print(label + str(page))
                crawl_half(page)
                print(' Finish')
            except Exception as err:
                print(err)
# 实测可用。
# 博主:six66667
# 原文链接:https://blog.youkuaiyun.com/six66667/article/details/93487869