Using fake_useragent to fake the request User-Agent
The fake-useragent module needs to be installed first (pip install fake-useragent).
from fake_useragent import UserAgent

ua = UserAgent()
# ua.random returns a random browser User-Agent string on every call
print(ua.random)
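A minimal sketch of actually using the random UA in a request (assuming the requests library; httpbin.org is just a test endpoint that echoes the request headers back):

import requests
from fake_useragent import UserAgent

ua = UserAgent()
# each call to ua.random gives a different browser User-Agent string
headers = {'User-Agent': ua.random}
resp = requests.get('https://httpbin.org/get', headers=headers)
print(resp.json()['headers']['User-Agent'])  # confirm the fake UA was sent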
Introduction to Ajax
With Ajax, a page loads data asynchronously through XHR requests after the initial HTML has arrived, which is why content you can see on the page may not show up in the page source.
Case study on analyzing an Ajax data interface: scraping images from Baidu Tieba
Open the Tieba image page.
Find the URL of one of the images.
Copy that address and search for it in the page source: if it is there, request the URL directly; if not, the data interface has to be analyzed.
Here the address is not in the page source.
So open the Network panel and switch to the XHR tab.
Look at the Response of the XHR request, copy it out, and parse it as JSON.
Parse it with json.loads.
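To see what json.loads produces here, this is a sketch of the response shape the script below relies on (the sample string and image address are made up; only the data / pic_list / murl field names come from the real interface):

import json

sample = '{"data": {"pic_list": [{"murl": "https://imgsa.baidu.com/xxx.jpg"}]}}'
d = json.loads(sample)               # str -> dict
for pic in d['data']['pic_list']:
    print(pic['murl'])               # full-size image url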
import requests
import json
import os

# Target url (the Ajax/XHR data interface found in the Network panel)
url = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E8%BF%AA%E5%8D%A2%E6%9C%A8%E5%A4%9A&alt=jview&rn=200&tid=2126299747&pn=1&ps=1&pe=40&info=1&_=1620904625372'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc8 in position 98: invalid continuation byte
# 'utf-8' cannot decode this response, so try a different encoding
# req = req.content.decode('utf-8')  # fails, use 'gb2312' instead
req = req.content.decode('gb2312')
# print(req)
dictreq = json.loads(req)            # str -> dict
# print(dictreq, type(dictreq))
r = dictreq['data']['pic_list']      # list of picture entries
for i in r:
    dicturl = i['murl']              # full-size image url
    print(dicturl)
Regex approach
Saving the images
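Before dealing with paging, here is a minimal single-page sketch of these two steps: grab every murl straight out of the raw response text with a regex instead of json.loads, then write one image to disk (same interface URL and headers as above; demo.jpg is just an illustrative filename and the 百度图片 folder must already exist):

import re
import requests

url = 'https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E8%BF%AA%E5%8D%A2%E6%9C%A8%E5%A4%9A&alt=jview&rn=200&tid=2126299747&pn=1&ps=1&pe=40&info=1&_=1620904625372'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
}
res = requests.get(url=url, headers=headers)
# regex approach: pull every "murl" value out of the response text
img_urls = re.findall('"murl":"(.*?)"', res.text)
# save the first image as a quick test
img_resp = requests.get(img_urls[0])
with open('百度图片/demo.jpg', 'wb') as f:
    f.write(img_resp.content)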
A problem appears: only 40 images were scraped, but the set has 167 images in total.
It turns out there are 5 such URLs, one per batch of 40 images.
A for loop over 5 iterations handles this: for i in range(1, 162, 40), and pe can simply be set to i + 39.
import re
import requests
import os

# Make sure the output folder exists before writing into it
os.makedirs('百度图片', exist_ok=True)

name = 1
for i in range(1, 162, 40):
    # ps/pe pick one batch of 40 pictures: ps = i, pe = i + 39
    url = ('https://tieba.baidu.com/photo/g/bw/picture/list?kw=%E8%BF%AA%E5%8D%A2%E6%9C%A8%E5%A4%9A'
           '&alt=jview&rn=200&tid=2126299747&pn=1'
           '&ps=' + str(i) + '&pe=' + str(i + 39) + '&wall_type=v&_=1620911658660')
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
    }
    res = requests.get(url=url, headers=headers)
    # Match the image urls with a regular expression
    img_url = re.findall('"murl":"(.*?)"', res.text)
    for img in img_url:
        img_resp = requests.get(img)
        # Save the image
        with open('百度图片/%d.jpg' % name, 'wb') as f:
            f.write(img_resp.content)
        print('Crawling image %d' % name)
        name += 1
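As a design note, the query string does not have to be glued together by hand: requests can build it from a params dict, which keeps ps and pe readable. A sketch under the same assumptions (same endpoint and fields; kw is the decoded keyword from the original URL, and requests percent-encodes it automatically):

import requests

base = 'https://tieba.baidu.com/photo/g/bw/picture/list'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'
}
for i in range(1, 162, 40):
    params = {
        'kw': '迪卢木多',         # keyword; requests URL-encodes it to the %E8... form
        'alt': 'jview',
        'rn': 200,
        'tid': 2126299747,
        'pn': 1,
        'ps': i,                 # start index of this batch
        'pe': i + 39,            # end index of this batch
        'wall_type': 'v',
        '_': 1620911658660,      # timestamp carried over from the original URL
    }
    res = requests.get(base, params=params, headers=headers)
    print(res.url)               # requests assembles the full query string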