爬虫初学6—爬取京东手机列表2（价格及评论）

最新推荐文章于 2024-02-23 13:38:03 发布

mr_xinL

最新推荐文章于 2024-02-23 13:38:03 发布

阅读量1.1k

点赞数 1

分类专栏：爬虫　文章标签：python

原文链接：https://blog.youkuaiyun.com/six66667/article/details/93487869

版权

爬虫专栏收录该内容

13 篇文章

订阅专栏

本文详细介绍了使用Python和相关库抓取京东网站上手机商品信息的方法，包括商品名称、价格及评论数据，通过实例展示了如何构造请求头、解析网页结构并抓取数据，最后将数据保存为CSV文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

转载：

import requests

from lxml import etree

import time

import csv


# 定义函数抓取每页前30条商品信息

def crow_first(n):
    """Scrape the first half of JD phone-search page ``n`` into the CSV file.

    Requests search page ``2 * n - 1``, collects every ``li`` element whose
    class contains ``gl-item``, fetches the comment summaries for their
    ``data-sku`` ids with a single request, and appends one row per product
    to ``JD_Phone1-1.csv`` (gb18030-encoded).

    Each row is: name, price, total comments, average score, good count,
    default-good count, good rate, follow-up count, video count, poor count,
    general count.

    Args:
        n: Page number as used by the caller; mapped to the search
            parameter ``page = 2 * n - 1``.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        KeyError: If the comment JSON lacks one of the expected keys.
    """
    url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=' + str(
        2 * n - 1)

    # NOTE(review): 'authority', 'method', 'path' and 'scheme' look like
    # values copied from a browser's request view; they are kept unchanged
    # here -- confirm whether the server actually needs them.
    head = {'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
            'scheme': 'https',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'Cookie': ''
            }

    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 10

    r = requests.get(url, headers=head, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    # Set the encoding explicitly, otherwise the text comes out garbled.
    r.encoding = 'utf-8'
    html1 = etree.HTML(r.text)

    # One <li> per product.
    datas = html1.xpath('//li[contains(@class,"gl-item")]')

    # attribute:: selects an attribute of the current node; data-sku is the
    # product id that the comment-summary endpoint expects.
    pids = html1.xpath('//li[contains(@class,"gl-item")]/attribute::data-sku')
    print(pids)

    # The ids are sent comma-terminated, e.g. "...referenceIds=100007926792,".
    comment_url = ("https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
                   + "".join(pid + "," for pid in pids))
    comment_r = requests.get(comment_url, timeout=timeout)
    comment_r.raise_for_status()

    # Column order of the comment statistics written after name and price.
    fields = ("CommentCount", "AverageScore", "GoodCount", "DefaultGoodCount",
              "GoodRate", "AfterCount", "VideoCount", "PoorCount",
              "GeneralCount")
    p_comment = [[comment[key] for key in fields]
                 for comment in comment_r.json()["CommentsCount"]]
    blank = [''] * len(fields)

    # Append the results to the local CSV file; `with` closes it for us.
    with open('JD_Phone1-1.csv', 'a', newline='', encoding='gb18030') as f:
        write = csv.writer(f)
        for i, data in enumerate(datas):
            p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
            p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em')

            # Products whose price can switch dynamically (e.g. the Xiaomi
            # MIX2) keep their lowest price in the data-price attribute.
            if not p_price:
                p_price = data.xpath('div/div[@class="p-price"]/strong/@data-price')

            # Skip an item without a name or price instead of letting an
            # IndexError discard the rest of the page.
            if not p_name or not p_price:
                continue

            # Comment rows are matched to products by position
            # (NOTE(review): assumes the endpoint answers in request order).
            # A missing entry leaves the statistics columns blank.
            stats = p_comment[i] if i < len(p_comment) else blank

            # string(.) flattens text that is spread over nested tags.
            write.writerow([p_name[0].xpath('string(.)'), p_price[0]] + stats)


# 定义函数抓取每页后30条商品信息

def crow_last(n):
    """Scrape the second half of JD phone-search page ``n`` into the CSV file.

    Requests search page ``2 * n`` with ``scrolling=y`` (the part the site
    loads on scroll), collects every ``li`` element whose class contains
    ``gl-item``, fetches the comment summaries for their ``data-sku`` ids
    with a single request, and appends one row per product to
    ``JD_Phone1-1.csv`` (gb18030-encoded).

    Each row is: name, price, total comments, average score, good count,
    default-good count, good rate, follow-up count, video count, poor count,
    general count.

    Args:
        n: Page number as used by the caller; mapped to the search
            parameters ``page = 2 * n`` and ``s = 48 * n - 20``.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        KeyError: If the comment JSON lacks one of the expected keys.
    """
    # log_id is the current Unix timestamp with five decimal places.
    log_id = '%.5f' % time.time()

    # NOTE(review): the item offset 48 * n - 20 is taken over unchanged;
    # confirm it against the site's current paging scheme.
    url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=' + str(
        2 * n) + '&s=' + str(48 * n - 20) + '&scrolling=y&log_id=' + str(log_id)

    head = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest', }

    # Without a timeout a stalled connection would block the crawl forever.
    timeout = 10

    r = requests.get(url, headers=head, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    # Set the encoding explicitly, otherwise the text comes out garbled.
    r.encoding = 'utf-8'
    html1 = etree.HTML(r.text)

    # One <li> per product.
    datas = html1.xpath('//li[contains(@class,"gl-item")]')

    # data-sku is the product id that the comment-summary endpoint expects.
    pids = html1.xpath('//li[contains(@class,"gl-item")]/attribute::data-sku')
    print(pids)

    # The ids are sent comma-terminated, one after another.
    comment_url = ("https://club.jd.com/comment/productCommentSummaries.action?referenceIds="
                   + "".join(pid + "," for pid in pids))
    comment_r = requests.get(comment_url, timeout=timeout)
    comment_r.raise_for_status()

    # Column order of the comment statistics written after name and price.
    fields = ("CommentCount", "AverageScore", "GoodCount", "DefaultGoodCount",
              "GoodRate", "AfterCount", "VideoCount", "PoorCount",
              "GeneralCount")
    p_comment = [[comment[key] for key in fields]
                 for comment in comment_r.json()["CommentsCount"]]
    blank = [''] * len(fields)

    # Append the results to the local CSV file; `with` closes it for us.
    with open('JD_Phone1-1.csv', 'a', newline='', encoding='gb18030') as f:
        write = csv.writer(f)
        for i, data in enumerate(datas):
            p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
            p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em')

            # Fall back to the data-price attribute when the price text
            # node is absent.
            if not p_price:
                p_price = data.xpath('div/div[@class="p-price"]/strong/@data-price')

            # Skip an item without a name or price instead of letting an
            # IndexError discard the rest of the page.
            if not p_name or not p_price:
                continue

            # Comment rows are matched to products by position
            # (NOTE(review): assumes the endpoint answers in request order).
            # A missing entry leaves the statistics columns blank.
            stats = p_comment[i] if i < len(p_comment) else blank

            # string(.) flattens text that is spread over nested tags.
            write.writerow([p_name[0].xpath('string(.)'), p_price[0]] + stats)


if __name__ == '__main__':
    # Every result page is scraped in two halves; each half runs in its own
    # try block so a failure in one does not stop the other or later pages.
    halves = (('   First_Page:   ', crow_first),
              ('   Last_Page:   ', crow_last))

    # Pages 1 through 29.
    for page in range(1, 30):
        print('***************************************************')

        for position, (label, crawl) in enumerate(halves):
            # Divider between the first and the second half of a page.
            if position:
                print('------------------')

            try:
                print(label + str(page))
                crawl(page)
                print('   Finish')
            except Exception as err:
                # Report the problem and carry on with the next step.
                print(err)

实测可用。
博主：six66667
原文链接：https://blog.youkuaiyun.com/six66667/article/details/93487869