Tested several image categories and they all worked; there may still be other bugs.
import os
import random
import requests
from bs4 import BeautifulSoup
import aiohttp
import aiofiles
import asyncio
import re


# Get the URLs of all listing pages in a category
async def get_second_class_url(get_second_url, head):
    tasks = []
    # Regular expression that extracts the total page count
    obj = re.compile('.*?class="shenlue">.*?<b>(?P<page>.*?)</b>', re.S)
    # Holds the URLs of every listing page
    all_page_url_list = []
    resp = requests.get(get_second_url, headers=head)
    resp.encoding = 'UTF-8'
    # The value is unique in the page, so re.match is enough; a for loop would also work
    page = re.match(obj, resp.text)
    # Total number of pages in this category
    all_page_num = page.group('page')
    # Build the URL of every page
    for page in range(1, 3):  # For testing only the first 2 pages are crawled; enable the line below to crawl them all
    # for page in range(1, int(all_page_num) + 1):  # crawl every page of this category
        if page == 1:
            all_page_url_list.append(get_second_url)
        else:
            # Combine a new URL to get the address of each following page
            all_url = get_second_url.split('.html')[0] + '_' + str(page) + '.html'
            all_page_url_list.append(all_url)
    # Start the asynchronous tasks
    print('There are ' + str(len(all_page_url_list)) + ' pages of data waiting to be loaded...............')
    for p in all_page_url_list:
        print(p + ' fetching the image page URLs on this page.......\n\n')
        tasks.append(asyncio.create_task(get_img_url(p, head)))
    await asyncio.wait(tasks)


# Get the detail-page URL of every image on one listing page
async def get_img_url(url, head):
    task = []
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=head) as resp:
            soup = BeautifulSoup(await resp.text(), 'html.parser')
            # Get the divs that hold all the images
            divs = soup.find_all('div', class_='bot-div')
            for div in divs:
                # Build the link to the image's detail page
                img_url = url.split('/tupian')[0] + div.a.get('href')
                # Fetch the download address of each image asynchronously
                print(f'{img_url} image page URL fetched //// waiting for the download link............')
                task.append(asyncio.create_task(get_all_img_url(session, img_url, head)))
            if task:  # asyncio.wait raises ValueError on an empty set
                await asyncio.wait(task)


# Get the real download address of an image (session is passed in, so it is not re-created on every call)
async def get_all_img_url(session, url, head):
    task = []
    async with session.get(url, headers=head) as resp1:
        soup1 = BeautifulSoup(await resp1.text(), 'html.parser')
        p_label = soup1.find_all('p', class_='bg-bull btn-p com-right-down-btn')
        for p in p_label:
            # The real download address of the image
            img_url_big = p.a.get('href')
            # Download and save the image asynchronously
            task.append(asyncio.create_task(img_download(session, img_url_big, head)))
        if task:  # asyncio.wait raises ValueError on an empty set
            await asyncio.wait(task)


# Download one image
async def img_download(session, url, head):
    print(url + ' download address fetched, ready to download...')
    name = url.rsplit('/')[-1]
    # Request the download address; the session is passed in by the caller
    try:
        async with session.get(url, headers=head) as resp2:
            async with aiofiles.open(f'img/{name}', 'wb') as f:
                await f.write(await resp2.content.read())
        print(name + ' downloaded......')
        await asyncio.sleep(0.1)
    except Exception as e:
        print(url + ' failed to download: ' + str(e))


async def main():
    # Pool of User-Agent request headers
    user_agent_pool = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 '
        'Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.38',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.44',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 '
        'Safari/537.36 Edg/93.0.961.52',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 '
        'Safari/537.36 Edg/94.0.992.31',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 '
        'Safari/537.36 Edg/94.0.992.37',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 '
        'Safari/537.36 Edg/94.0.992.38',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 '
        'Safari/537.36 Edg/94.0.992.47',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 '
        'Safari/537.36 Edg/94.0.992.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 '
        'Safari/537.36 Edg/95.0.1020.30',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 '
        'Safari/537.36 Edg/95.0.1020.40',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 '
        'Safari/537.36 Edg/95.0.1020.44',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
    ]
    headers = {'User-Agent': random.choice(user_agent_pool)}
    # main_url = 'https://sc.chinaz.com/tupian/dongwutupian.html'  # animal pictures
    main_url = 'https://sc.chinaz.com/tupian/renwutupian.html'  # people pictures
    # main_url = 'https://sc.chinaz.com/tupian/taikongkexuetupian.html'  # space pictures
    # Make sure the output directory exists before any download starts
    os.makedirs('img', exist_ok=True)
    # Get the addresses of all image pages in the chosen category
    await get_second_class_url(main_url, headers)
    print('All images downloaded')


if __name__ == '__main__':
    # Required on Windows when used together with aiohttp, otherwise an error is raised
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())
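
A side note on the event-loop policy line: asyncio.WindowsSelectorEventLoopPolicy only exists on Windows, so the script as written raises AttributeError on Linux or macOS. If it might run elsewhere, a minimal guarded sketch looks like this:

import asyncio
import sys

# Only switch the event-loop policy on Windows; other platforms keep the
# default policy, which already works with aiohttp.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())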
This article presents a Python crawler script that uses asyncio together with a randomized User-Agent to scrape images from web pages, adding a short delay between downloads to lower the risk of being blocked. The example targets image categories on the ChinaZ site (sc.chinaz.com), such as animal, people, and space pictures.
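
If you need stricter rate limiting than the fixed asyncio.sleep(0.1) in img_download, a common alternative is to cap the number of concurrent downloads with an asyncio.Semaphore. Below is a minimal, self-contained sketch of that pattern; the example.com URLs and the limit of 5 are placeholder assumptions, not part of the script above:

import asyncio
import aiohttp


async def polite_download(sem, session, url):
    # The semaphore is acquired before the request and released afterwards,
    # so extra tasks simply wait their turn instead of hammering the server.
    async with sem:
        async with session.get(url) as resp:
            data = await resp.read()
        await asyncio.sleep(0.1)  # extra spacing between requests
        return data


async def demo():
    # At most 5 downloads run at the same time, however many tasks exist
    sem = asyncio.Semaphore(5)
    # Hypothetical URLs, for illustration only
    urls = [f'https://example.com/img/{i}.jpg' for i in range(20)]
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(polite_download(sem, session, u) for u in urls))
    print(f'{len(results)} downloads finished')


if __name__ == '__main__':
    asyncio.run(demo())

Unlike a plain sleep, the semaphore bounds the server load even when thousands of tasks are created at once.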