提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
文章目录
前言
提示:这里可以添加本文要记录的大概内容:
本文记录一个针对亚马逊(amazon.cn)笔记本电脑搜索结果页的轻量级爬虫,用于抓取商品名称、评分和价格。
提示:以下是本篇文章正文内容,下面案例可供参考
一、基于 requests 库
代码如下:
import requests
from lxml import etree
import time
from fake_useragent import UserAgent
import random
import pandas as pd
import os
# Module-wide state shared by the crawler.
ua = UserAgent()  # supplies a random User-Agent string for each request
# Candidate pauses (seconds); one is picked at random after each page fetch.
list_time = [1.5, 2.0, 1.9, 2.1, 1.7, 1.8]
# Result accumulators, filled in place by get_value_by_url().
laptops, rates, price = [], [], []
# XPath candidates, evaluated relative to one search-result card and tried in
# order. The cards do not all share the same nesting (the content column is
# sometimes div[2], sometimes div[3]), hence the fallbacks.
_NAME_XPATHS = (
    './div/div/div/div/div[2]/div[1]/h2/a/span/text()',
    './div/div/div/div/div[3]/div[1]/h2/a/span/text()',
)
_RATE_XPATHS = (
    './div/div/div/div/div[2]/div[2]/div/span[1]/span[1]/text()',
    'div/div/div/div/div[3]/div[2]/div/span[1]/span[1]/text()',
)
_PRICE_XPATHS = (
    './div/div/div/div/div[2]/div[3]/div/a/span/span[2]/span[2]/text()',
    'div/div/div/div/div[3]/div[3]/div/a/span/span[2]/span[2]/text()',
    './div/div/div/div/div[2]/div[2]/div/a/span/span[2]/span[2]/text()',
)


def _first_text(node, xpaths):
    """Return the first text hit of the first matching xpath, or None."""
    for path in xpaths:
        hits = node.xpath(path)
        if hits:
            return hits[0]
    return None


def get_value_by_url(url):
    """Fetch one search-result page and collect name, rating and price.

    Results are appended in place to the module-level lists ``laptops``,
    ``rates`` and ``price``. A card contributes exactly one entry to each
    list (``None`` where the rating or price is missing) so the three lists
    always stay the same length; cards without a product name are skipped.

    Side effects: overwrites ``a.html`` with the raw page for debugging,
    prints progress to stdout, and sleeps for a random interval taken from
    ``list_time`` before returning.

    Args:
        url: Address of the search-result page to download.

    Raises:
        requests.RequestException: on connection errors or timeouts.
            HTTP error statuses are reported on stdout and the page is
            skipped instead of raising.
    """
    headers = {
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,fr;q=0.4,zh-TW;q=0.2',
        # NOTE(review): this is a hard-coded browser session cookie (includes a
        # session token). It will expire and should not live in source control;
        # consider loading it from an environment variable or config file.
        'Cookie': 'session-id=458-3361992-1195835; ubid-acbcn=460-3998372-8065142; ld=AZCNAGSTopnav; s_pers=%20s_fid%3D64783603F3EAFFA8-3A48637D04C10D2B%7C1838200629471%3B%20s_dl%3D1%7C1680349629472%3B%20gpv_page%3DCN%253AAZ%253ASOA-sell%7C1680349629474%3B%20s_ev15%3D%255B%255B%2527AZCNAGSTopnav%2527%252C%25271680347829475%2527%255D%255D%7C1838200629475%3B; s_nr=1680354391299-New; s_vnum=2112354391299%26vn%3D1; s_dslv=1680354391300; sst-main=Sst1|PQG2p1DGUgBji2VU6z-Nvi1YBp2Vgb-rKyCGMOIeeXOrxpgZb21wtPlT7kX0hIfGgXPe_qIvxd-8IR7sqecWV0vtV5BACC8OflqgCHDFX44rQ1ZTSgND8HT6NqipaRwn47InKqDNoFoovR5wy2xXVW-nmYynhaLwDtL8Xy7QD5T7cwMJD-3bYXcwZMVXX5Kvt0Hkr5b3A76oK7qH88Tv3peA41P-_xDtPcwE02SVBVcnN-w; i18n-prefs=CNY; session-id-time=2082787201l; session-token="qoP1yVtNPlkTmFpKuONdl0Qy2j5cvrAU7eboEIIHFv3pw1BPPM4WH97JTsOveFHNVNey15eEjpOHIrVqMmbUnK3e8PzWl7v+NKTHEOyTHzhIE3SQbRcZx9UU3DKppApx7KVdktsQknnegYMTrPAECGMRI1TQA7SBXFD+yGMDahfaLQWFmdyNzY7rR1p2CJV5v8oovz35pPXhUkKgVE4sKTXKbXCc8VvwzSUmRAzXt1U="; csm-hit=tb:48JTPK9V2N9GRFDWVR2X+s-48JTPK9V2N9GRFDWVR2X|1680426195393&t:1680426195393&adb:adblk_no'
    }
    response = requests.get(url=url, headers=headers, timeout=6)
    response.encoding = response.apparent_encoding  # guess the page encoding
    page_text = response.text

    # Keep the last downloaded page on disk, even for error responses, so the
    # xpaths can be checked by hand when nothing is extracted.
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(page_text)

    try:
        # The original printed the bound method without calling it, so HTTP
        # errors were never detected.
        response.raise_for_status()
    except requests.HTTPError as err:
        print(err)
    else:
        tree = etree.HTML(page_text)
        cards = tree.xpath('//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]/div')
        print(len(cards))
        for card in cards:
            name = _first_text(card, _NAME_XPATHS)
            if name is None:
                # Not a product card (no title found); skip it so the three
                # lists stay aligned row by row.
                continue
            print(name)
            laptops.append(name)
            rates.append(_first_text(card, _RATE_XPATHS))
            price.append(_first_text(card, _PRICE_XPATHS))

    print(len(laptops), len(rates), len(price))
    # Random pause between page fetches.
    time.sleep(random.choice(list_time))
if __name__ == '__main__':
    # Crawl search-result pages 1 through 9, then dump everything to test.csv.
    for page in range(1, 10):
        url = f'https://www.amazon.cn/s?k=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&rh=n%3A106200071&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss&page={page}'
        try:
            get_value_by_url(url=url)
        except requests.RequestException as err:
            # One failed page (timeout, connection reset) should not discard
            # the pages already collected.
            print(f'page {page} failed: {err}')

    data = pd.DataFrame({'goods': laptops, 'rate': rates, 'price': price})
    print(data)

    csv_path = os.path.join(os.getcwd(), 'test.csv')
    # When appending to an existing non-empty file, do not repeat the header
    # row and do not emit a second BOM in the middle of the file.
    appending = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    data.to_csv(
        csv_path,
        index=False,
        mode='a',
        header=not appending,
        encoding='utf-8' if appending else 'utf-8_sig',
    )
    print('finished')