京东商城图书评论爬虫-CSDN博客

本文链接：https://blog.csdn.net/m0_60255954/article/details/128212884

京东商城图书评分

点击评论区的分页页码时，会发现浏览器地址栏中的 URL 没有变化，说明评论数据是通过异步请求加载的，需要直接抓取评论接口。

import requests
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
}
# Maximum number of comments to print before the crawl stops.
fetch_comment_count = 15
# Optional proxy. It is not used by default; pass ``proxies=proxies`` to
# ``requests.get`` in ``fetch_comments`` to route traffic through it.
proxies = {
    "https": "https://118.31.2.38:8999",
}

# Comment endpoint; ``{}`` is filled with the zero-based page number.
COMMENT_URL = (
    'https://club.jd.com/comment/productPageComments.action?callback='
    'fetchJSON_comment98&productId=13168380&score=0&sortType=5&page={}'
    '&pageSize=10&isShadowSku=0&rid=0&fold=1'
)
# The original script re-decoded every field as GB18030, so the response body
# is decoded with that encoding once, before it is parsed.
RESPONSE_ENCODING = 'GB18030'
# Placeholder text the site stores when a user submits no comment text.
EMPTY_COMMENT = '此用户未填写评价内容'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def parse_jsonp(text):
    """Return the JSON object wrapped inside a JSONP response.

    The endpoint answers with ``callback({...});`` rather than plain JSON.
    Only the span from the first ``{`` to the last ``}`` is parsed, so
    parentheses, semicolons and the words ``true``/``false``/``null`` inside
    comment text are left untouched.

    Args:
        text: The decoded response body.

    Returns:
        The parsed JSON value (a dict for a well-formed response).

    Raises:
        ValueError: If no ``{...}`` span exists or the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end < start:
        raise ValueError('no JSON object found in response')
    return json.loads(text[start:end + 1])


def fetch_comments(page):
    """Download one page of comments and return it as a list of dicts.

    Args:
        page: Zero-based page number to request.

    Returns:
        The value stored under ``'comments'`` in the response, or an empty
        list when that key is missing or empty.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        ValueError: If the body cannot be parsed (see ``parse_jsonp``).
    """
    response = requests.get(
        COMMENT_URL.format(page), headers=headers, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    # Decode the raw bytes before parsing: parsing them as iso-8859-1 would
    # let a multi-byte character containing byte 0x5C look like a JSON escape.
    text = response.content.decode(RESPONSE_ENCODING, errors='replace')
    return parse_jsonp(text).get('comments') or []


def main():
    """Print comments page by page until ``fetch_comment_count`` is reached."""
    index = 0
    page_index = 0
    while index < fetch_comment_count:
        comments = fetch_comments(page_index)
        page_index += 1
        if not comments:
            # No more pages: stop instead of requesting forever.
            break
        for item in comments:
            comment = item.get('content', '')
            # Skip entries where the user left no comment text.
            if comment == EMPTY_COMMENT:
                continue
            print('<', index + 1, '>', comment)
            print(item.get('creationTime', ''))
            print(item.get('nickname', ''))
            print('--------------')
            index += 1
            if index >= fetch_comment_count:
                break


if __name__ == '__main__':
    main()

爬虫笔记_异步数据02