1. 先分析目标:本文爬取今日头条“街拍”搜索结果中的图片(URL 中 keyword=%E8%A1%97%E6%8B%8D 即“街拍”的 URL 编码)
先观察翻页时各请求 URL 的区别:
Request URL: https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
Request URL: https://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
Request URL: https://www.toutiao.com/search_content/?offset=40&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab
每次offset自增20
2.直奔主题
import requests
import json
from urllib import request
import os
import re
# Query-URL template: the API pages results in steps of 20 via `offset`.
SEARCH_URL = ('https://www.toutiao.com/search_content/?offset={}&format=json'
              '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20'
              '&cur_tab=3&from=gallery')

# Shared request headers — built once instead of on every loop iteration.
HEADERS = {
    'cookie': 'tt_webid=6590212337033758215; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6590212337033758215; UM_distinctid=1654191755a1aa-0d94b51d3b9332-6e1f147a-15f900-1654191755b646; csrftoken=95eeb3d7da9d09f5db6da3ca727b524c; uuid="w:1ec3fc683cdf45688a188e1a31f4555a"; __tasessionId=ba981y7rs1534424611024; CNZZDATA1259612802=1046781080-1534398512-https%253A%252F%252Fwww.baidu.com%252F%7C1534425512',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
}

# Pattern matching the gallery payload embedded in the article page as
# `gallery: JSON.parse(...)`.  Compiled once, reused for every article.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')


def fetch_image_urls(article_url):
    """Return the list of sub-image URLs embedded in an article page.

    Returns an empty list when the page has no gallery payload.
    """
    page = requests.get(article_url, headers=HEADERS, timeout=10)
    match = GALLERY_PATTERN.search(page.text)
    if match is None:
        return []
    # The page embeds JSON.parse("<json-encoded string>"):
    # the first loads() yields the inner JSON string, the second the dict.
    gallery = json.loads(json.loads(match.group(1)))
    return [image['url'] for image in gallery.get('sub_images', [])]


def download_images(image_urls, dest_dir='img'):
    """Download every URL in image_urls into dest_dir as <basename>.jpg.

    A single failed download is reported and skipped instead of aborting
    the whole crawl.
    """
    for image_url in image_urls:
        filename = os.path.join(dest_dir, image_url.split('/')[-1] + '.jpg')
        try:
            request.urlretrieve(image_url, filename)
        except OSError:
            # Best effort: one broken image must not stop the crawl.
            print('下载失败')


def crawl(pages=3, dest_dir='img'):
    """Crawl `pages` search-result pages (20 items each) and save all images.

    pages    -- number of result pages to fetch (original script used 3).
    dest_dir -- directory to save images into; created if missing.
    """
    # makedirs with exist_ok avoids the exists()/mkdir race and only needs
    # to run once, not on every loop iteration.
    os.makedirs(dest_dir, exist_ok=True)
    for page_index in range(pages):
        offset = page_index * 20  # offset advances by 20 per page
        response = requests.get(SEARCH_URL.format(offset),
                                headers=HEADERS, timeout=10)
        # A requests.Response is falsy on HTTP error status codes.
        if not response:
            print('下载失败')
            continue
        # 'data' may be absent or null on an empty/odd response.
        for item in response.json().get('data') or []:
            article_url = item.get('article_url')
            if not article_url:
                continue
            download_images(fetch_image_urls(article_url), dest_dir)


if __name__ == '__main__':
    crawl()