爬取京东（JD）销量前三商品的名称和评论数

最新推荐文章于 2025-07-27 09:53:28 发布

wenqiang su

最新推荐文章于 2025-07-27 09:53:28 发布

阅读量825

点赞数

CC 4.0 BY-SA版权

分类专栏：爬虫

本文链接：https://blog.youkuaiyun.com/weixin_42681868/article/details/88056704

爬虫专栏收录该内容

2 篇文章

订阅专栏

该博客介绍了如何爬取京东网站上销量最高的商品名称及其对应的评论数量，通过详细步骤和代码实例，展示了网络爬虫在电商数据分析中的应用。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

参考博客：

https://blog.youkuaiyun.com/menghuannvxia/article/details/51333689

代码：

import requests
import re
from urllib.parse import quote
from lxml import etree
import sys


def _fetch_text(url, headers, timeout, encoding='utf-8', errors='strict'):
    """Download ``url`` and return the response body decoded as text."""
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode(encoding, errors)


def get_jd_data(keyword, top_n=3, timeout=10):
    """Scrape the name and comment count of the first ``top_n`` JD search hits.

    The search page for ``keyword`` is fetched, the first ``top_n`` product
    ids are read from the comment anchors on that page, and for each id the
    product page (for the name) and the comment endpoint (for the comment
    count string) are fetched.

    Args:
        keyword: Search term; it is percent-encoded before being placed in
            the search URL.
        top_n: Maximum number of products to collect (default 3). If the
            search page lists fewer products, only those are returned.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A list of ``[name, comment_count]`` lists, one per product, in the
        order the products appear on the search page. ``comment_count`` is
        the raw ``commentCountStr`` string from the comment endpoint.

    Raises:
        IndexError: If the comment endpoint response for a product does not
            contain a ``commentCountStr`` field.
        requests.RequestException: On network failure or timeout.
    """
    keyword = quote(keyword)  # percent-encode so non-ASCII keywords are URL-safe
    # NOTE(review): psort=3 presumably sorts the results by sales volume -- confirm.
    url = "https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq={}&stock=1&psort=3&click=0".format(keyword,keyword)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        # Hard-coded Cookie header. NOTE(review): likely stale -- confirm it is still required.
        'Cookie':'__jda=122270672.1551510490895255034850.1551510491.1551510491.1551510491.1; __jdb=122270672.12.1551510490895255034850|1.1551510491; __jdc=122270672; __jdv=122270672|direct|-|none|-|1551510490916; __jdu=1551510490895255034850; ipLoc-djd=1-72-2799-0; shshshfp=f21d0cc732d79f86048c96127c37924b; shshshfpa=20192d5f-27f0-cfdd-6be7-6b8564d8956b-1551510516; shshshsID=48476bb6cedc7b2515036bd5adb494f7_9_1551511402939; shshshfpb=fWfTtVViJK0Phfe8j4vlqfw%3D%3D; PCSYCityID=2; 3AB9D23F7A4B3C9B=4QG3GNC2A4EH3Q5AOYTTJ6N644PJZZWSXEJVD5A4FHRPXFN2KSIQJ4WRIICWOY2ON4UV2A2HXWCWHZGYSVUGDSBT64',
        # The standard HTTP header name is "Referer"; the former key "Refer"
        # was not a recognised header, so no referrer was actually sent.
        'Referer':'https://www.jd.com/'
    }

    # Fetch and parse the search result page.
    search_html = _fetch_text(url, headers, timeout)
    tree = etree.HTML(search_html)

    # The id attribute of each comment anchor carries the product id.
    anchor_ids = tree.xpath(".//li[@class='gl-item']//div[@class='p-commit']//a/@id")

    # Keep only the part after the first 10 characters of each anchor id.
    # NOTE(review): presumably the ids look like "J_comment_<sku>" -- confirm.
    # Slicing (instead of range(3) indexing) tolerates pages with fewer hits.
    ids = [anchor_id[10:] for anchor_id in anchor_ids[:max(top_n, 0)]]

    result_jddata = []    # final result: one [name, comment_count] per product

    for sku in ids:
        # Product name: concatenate the stripped text nodes of the title div.
        item_url = 'https://item.jd.com/{}.html'.format(sku)
        item_html = _fetch_text(item_url, headers, timeout, 'gbk', 'ignore')
        name_parts = etree.HTML(item_html).xpath("//div[@class='sku-name']/text()")
        # strip() already removes newline-only fragments, so no separate
        # newline filter is needed.
        name_sku = ''.join(part.strip() for part in name_parts)

        # Comment count: read the commentCountStr field from the raw response.
        comment_url = 'http://club.jd.com/productpage/p-{}-s-0-t-3-p-0.html'.format(sku)
        comment_body = _fetch_text(comment_url, headers, timeout, 'gbk', 'ignore')
        match = re.search('commentCountStr":"(.*?)","', comment_body)
        if match is None:
            # Same exception type as the former findall(...)[0], with context.
            raise IndexError(
                'commentCountStr not found in comment response for product {}'.format(sku)
            )

        result_jddata.append([name_sku, match.group(1)])

    print(result_jddata)
    return result_jddata

if __name__ == '__main__':
    # Script entry point: run one demo search; get_jd_data prints the
    # collected [name, comment_count] pairs itself.
    keyword = '茶'
    result = get_jd_data(keyword)