Amazon Scraper

This article demonstrates how to use Python's requests and lxml libraries to scrape laptop listings from Amazon, extracting each product's name, rating, and price. A browser-like User-Agent header is set to avoid being blocked by the site. The scraped values are collected into lists, assembled into a pandas DataFrame, and exported to a CSV file.
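The core of the pipeline fits in a few lines. Here is a minimal sketch, assuming a placeholder search URL and a placeholder XPath (the real values used in the full script appear below):

import requests
import pandas as pd
from lxml import etree

resp = requests.get('https://www.amazon.cn/s?k=laptop',      # placeholder search URL
                    headers={'User-Agent': 'Mozilla/5.0'},   # mimic a browser
                    timeout=6)
tree = etree.HTML(resp.text)
names = tree.xpath('//h2//span/text()')                      # placeholder XPath for product titles
pd.DataFrame({'goods': names}).to_csv('test.csv', index=False, encoding='utf-8-sig')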




Preface


A lightweight scraper for Amazon product listings: the laptop search pages.



1. Using the requests library

The code is as follows:

import requests
from lxml import etree
import time
from fake_useragent import UserAgent
import random
import pandas as pd
import os


# Globals: random User-Agent generator, candidate delays, and result lists
ua = UserAgent()
list_time = [1.5, 2.0, 1.9, 2.1, 1.7, 1.8]  # delays between requests, in seconds

laptops = []  # product names
rates = []    # ratings
price = []    # prices



def get_value_by_url(url):
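    """Fetch one Amazon search-results page and append product names, ratings, and prices to the global lists."""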

    headers = {
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua.random,  # rotate the User-Agent on every request
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,fr;q=0.4,zh-TW;q=0.2',
        # Session cookie copied from a logged-in browser; it is tied to that session and will expire.
        'Cookie': 'session-id=458-3361992-1195835; ubid-acbcn=460-3998372-8065142; ld=AZCNAGSTopnav; s_pers=%20s_fid%3D64783603F3EAFFA8-3A48637D04C10D2B%7C1838200629471%3B%20s_dl%3D1%7C1680349629472%3B%20gpv_page%3DCN%253AAZ%253ASOA-sell%7C1680349629474%3B%20s_ev15%3D%255B%255B%2527AZCNAGSTopnav%2527%252C%25271680347829475%2527%255D%255D%7C1838200629475%3B; s_nr=1680354391299-New; s_vnum=2112354391299%26vn%3D1; s_dslv=1680354391300; sst-main=Sst1|PQG2p1DGUgBji2VU6z-Nvi1YBp2Vgb-rKyCGMOIeeXOrxpgZb21wtPlT7kX0hIfGgXPe_qIvxd-8IR7sqecWV0vtV5BACC8OflqgCHDFX44rQ1ZTSgND8HT6NqipaRwn47InKqDNoFoovR5wy2xXVW-nmYynhaLwDtL8Xy7QD5T7cwMJD-3bYXcwZMVXX5Kvt0Hkr5b3A76oK7qH88Tv3peA41P-_xDtPcwE02SVBVcnN-w; i18n-prefs=CNY; session-id-time=2082787201l; session-token="qoP1yVtNPlkTmFpKuONdl0Qy2j5cvrAU7eboEIIHFv3pw1BPPM4WH97JTsOveFHNVNey15eEjpOHIrVqMmbUnK3e8PzWl7v+NKTHEOyTHzhIE3SQbRcZx9UU3DKppApx7KVdktsQknnegYMTrPAECGMRI1TQA7SBXFD+yGMDahfaLQWFmdyNzY7rR1p2CJV5v8oovz35pPXhUkKgVE4sKTXKbXCc8VvwzSUmRAzXt1U="; csm-hit=tb:48JTPK9V2N9GRFDWVR2X+s-48JTPK9V2N9GRFDWVR2X|1680426195393&t:1680426195393&adb:adblk_no'
    }

    response = requests.get(url=url, headers=headers, timeout=6)
    response.raise_for_status()  # raise HTTPError on non-2xx responses (was printed uncalled before)

    response.encoding = response.apparent_encoding  # let requests sniff the correct text encoding


    page_text = response.text
    tree = etree.HTML(page_text)

    # Each product card on the search-results page
    commodities = tree.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div')
    print(len(commodities))

    # Dump the raw HTML to disk for debugging the XPath expressions
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(page_text)



    # Scrape product names; the card layout varies, so fall back to an alternate XPath
    for good in commodities:
        a = good.xpath('./div/div/div/div/div[2]/div[1]/h2/a/span/text()')
        if a:
            laptops.append(a[0])
            print(a[0])
        else:
            a = good.xpath('./div/div/div/div/div[3]/div[1]/h2/a/span/text()')
            if a:
                laptops.append(a[0])
            else:
                laptops.append(None)  # placeholder keeps the three lists the same length


        # Scrape the rating; the absolute XPaths below were copied from DevTools for the two layouts
        r = good.xpath('./div/div/div/div/div[2]/div[2]/div/span[1]/span[1]/text()')
        #//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[2]/div/div/div/div/div[2]/div[2]/div/span[1]/span[1]
        #//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[17]/div/div/div/div/div[3]/div[2]/div/span[1]/span[1]
        print(r)
        if r:
            rates.append(r[0])
        else:
            r = good.xpath('./div/div/div/div/div[3]/div[2]/div/span[1]/span[1]/text()')
            if r:
                rates.append(r[0])
            else:
                rates.append(None)  # no rating found in either layout


        # Scrape the price; three layout variants observed on the results page
        p = good.xpath('./div/div/div/div/div[2]/div[3]/div/a/span/span[2]/span[2]/text()')
        #//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[3]/div/div/div/div/div[2]/div[2]/div/a/span/span[2]/span[2]
        #//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[2]/div/div/div/div/div[2]/div[3]/div/a/span/span[2]/span[2]
        #//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div[17]/div/div/div/div/div[3]/div[3]/div/a/span/span[2]/span[2]
        if p:
            price.append(p[0])
        else:
            p = good.xpath('./div/div/div/div/div[3]/div[3]/div/a/span/span[2]/span[2]/text()')
            if p:
                price.append(p[0])
            else:
                p = good.xpath('./div/div/div/div/div[2]/div[2]/div/a/span/span[2]/span[2]/text()')
                if p:
                    price.append(p[0])  # was price.append(p), which appended the whole list
                else:
                    price.append(None)  # no price found in any layout


    # Sanity check: the three lists should now be equal in length
    print(len(laptops), len(rates), len(price))




    # Random delay between requests to look less like a bot
    time.sleep(random.choice(list_time))

if __name__ == '__main__':
    # Scrape result pages 1-9 of the laptop ("笔记本电脑") search
    for page in range(1, 10):
        url = f'https://www.amazon.cn/s?k=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&rh=n%3A106200071&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss&page={page}'
        get_value_by_url(url=url)

    data = pd.DataFrame({'goods': laptops, 'rate': rates, 'price': price})
    print(data)

    # utf-8-sig writes a BOM so Excel displays the Chinese text correctly;
    # note that mode='a' appends, so rerunning the script duplicates the header row
    data.to_csv(os.path.join(os.getcwd(), 'test.csv'), index=False, mode='a', encoding='utf-8-sig')

    print('finished')
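A possible refactor (a sketch, not part of the original script): each field above is extracted by trying several XPaths in turn, so the three fallback chains could be collapsed into one helper that returns the first non-empty match, with a default that keeps the three lists aligned:

def first_match(node, xpaths, default=None):
    # Return the first non-empty text() result among the candidate XPaths.
    for xp in xpaths:
        hit = node.xpath(xp)
        if hit:
            return hit[0]
    return default

Inside the loop over product cards, the name extraction then reduces to a single call:

laptops.append(first_match(good, [
    './div/div/div/div/div[2]/div[1]/h2/a/span/text()',
    './div/div/div/div/div[3]/div[1]/h2/a/span/text()',
]))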

