爬虫异步协程

最新推荐文章于 2025-04-14 10:49:19 发布

猿～阿峰

最新推荐文章于 2025-04-14 10:49:19 发布

阅读量613

点赞数 4

文章标签：爬虫 python 开发语言

本文链接：https://blog.youkuaiyun.com/2401_84049017/article/details/139327796

版权

爬虫异步协程

在爬虫中使用异步协程爬取内容，可以大大节省时间、提高效率：用异步协程爬取一部小说只需几秒钟就能完成。下面是几个用异步协程爬取内容的案例。

import asyncio

async def func1():
    """Print a greeting, pause for three seconds, then print it again.

    The pause uses ``await asyncio.sleep`` instead of ``time.sleep``: a
    blocking (synchronous) call inside a coroutine stalls the event loop, so
    the other coroutines could no longer run concurrently.
    """
    print("你好啊，我叫潘金莲")
    await asyncio.sleep(3)  # suspend this coroutine so the others can run
    print("你好啊，我叫潘金莲")
	
async def func2():
    """Print a greeting, pause for two seconds, then print it again.

    ``await asyncio.sleep`` hands control back to the event loop during the
    pause so that other coroutines can make progress.
    """
    print("你好啊，我叫王建国")
    await asyncio.sleep(2)
    # The closing message repeats the opening one; the name had been garbled.
    print("你好啊，我叫王建国")
	
async def func3():
    """Announce the speaker, yield to the event loop for four seconds, then announce again."""
    greeting = "你好啊，我叫李雪琴"
    pause_seconds = 4
    print(greeting)
    await asyncio.sleep(pause_seconds)
    print(greeting)

if __name__ == '__main__':
    # Imported here because this snippet's import block only brings in asyncio.
    import time

    async def _run_all():
        """Run the three demo coroutines concurrently and wait for all of them."""
        # gather() schedules the coroutines on the running loop. Passing bare
        # coroutine objects to asyncio.wait() is rejected on Python 3.11+.
        await asyncio.gather(func1(), func2(), func3())

    t1 = time.time()
    asyncio.run(_run_all())  # start all tasks (coroutines) in one go
    t2 = time.time()
    # Elapsed wall-clock time: roughly the longest sleep, not the sum of all.
    print(t2 - t1)

另一种写法

import asyncio

async def func1():
    """Print a greeting, pause for three seconds, then print it again.

    A synchronous call such as ``time.sleep`` would block the event loop and
    break concurrency; ``await asyncio.sleep`` suspends only this coroutine.
    """
    print("你好啊，我叫潘金莲")
    await asyncio.sleep(3)  # asynchronous pause: ``await`` suspends the coroutine here
    print("你好啊，我叫潘金莲")
	
async def func2():
    """Print a greeting, pause for two seconds, then print it again.

    The pause is awaited so the event loop can run other coroutines meanwhile.
    """
    print("你好啊，我叫王建国")
    await asyncio.sleep(2)
    # The closing message repeats the opening one; the name had been garbled.
    print("你好啊，我叫王建国")
	
async def func3():
    """Print the same greeting before and after a four-second awaited pause."""
    message = "你好啊，我叫李雪琴"
    print(message)
    delay = 4
    await asyncio.sleep(delay)
    print(message)
	
async def main():
    """Run ``func1``, ``func2`` and ``func3`` concurrently and wait for all of them.

    Each coroutine is wrapped with ``asyncio.create_task`` so it is scheduled
    on the running event loop. ``asyncio.wait`` then suspends ``main`` until
    every task has finished.
    """
    # No stand-alone coroutine objects are created here: a coroutine that is
    # created but never awaited only triggers a "never awaited" RuntimeWarning.
    tasks = [
        asyncio.create_task(func1()),
        asyncio.create_task(func2()),
        asyncio.create_task(func3()),
    ]

    await asyncio.wait(tasks)

if __name__ == "__main__":
    # Script entry point: build the top-level coroutine and let asyncio.run
    # create an event loop and drive it to completion.
    entry = main()
    asyncio.run(entry)

协程爬取几张图片

#requests.get()是同步的代码->异步操作要用aiohttp
#pip install aiohttp

import asyncio
import aiohttp

# Image URLs downloaded by the demo. All three live on the same host and are
# JPEG files (the ".jpg" suffix and the host name had been garbled).
urls = [
    "http://kr.shanghai-jiuxin.com/file/2020/1031/191468637cab2f0206f7d1d9b175ac81.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/1031/563337d07af599a9ea64e620729f367e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/1031/774218be86d832f359637ab120eba52d.jpg",
]

async def aiodownload(url):
    """Download the resource at ``url`` and save it in the current directory.

    The output file name is the last path segment of ``url``. When the
    download has been written, the file name is printed followed by "搞定".

    Args:
        url: Address of the file to download.
    """
    # Third-party package (pip install aiofiles); imported locally because
    # this snippet's import block does not bring it in.
    import aiofiles

    # Split once from the right and keep what follows the last "/".
    name = url.rsplit("/", 1)[1]
    # ClientSession plays the role of the ``requests`` module here, and
    # session.get() is the asynchronous counterpart of requests.get().
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # The response has arrived: write it to the file.
            async with aiofiles.open(name, mode="wb") as f:
                # Reading the body is asynchronous too, so it must be awaited.
                await f.write(await resp.content.read())
    print(name, "搞定")

async def main():
    """Download every address in ``urls`` concurrently and wait for completion."""
    # Wrap each coroutine in a Task: asyncio.wait() rejects bare coroutine
    # objects on Python 3.11+.
    tasks = [asyncio.create_task(aiodownload(url)) for url in urls]

    # asyncio.wait() raises ValueError on an empty collection.
    if tasks:
        await asyncio.wait(tasks)


if __name__ == "__main__":
    # Script entry point: hand the top-level coroutine to asyncio.run, which
    # creates the event loop and runs it until the downloads are finished.
    top_level = main()
    asyncio.run(top_level)

异步爬取一部小说

import requests
import asyncio
import aiohttp
import aiofiles
import json

#1.同步操作：访问getCatalog拿到所有章节的cid和名称
#2.异步操作：访问getChapterContent下载所有的文章内容

async def aiodownload(cid, b_id, title):
    """Fetch one chapter and write its text to a file named ``title``.

    Args:
        cid: Chapter id as listed in the catalog.
        b_id: Id of the book the chapter belongs to.
        title: Chapter title; used verbatim as the output file name.
    """
    data = {
        "book_id": b_id,
        # NOTE(review): the original line was garbled; "<book id>|<chapter id>"
        # is the reconstructed format. Confirm against the API.
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1,
    }
    data = json.dumps(data)
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(title, mode="w", encoding="utf-8") as f:
                # Write the chapter text out.
                await f.write(dic['data']['novel']['content'])

async def getcatalog(url):
    """Fetch the catalog at ``url`` and download all listed chapters concurrently.

    The catalog itself is fetched synchronously with ``requests``; only the
    per-chapter downloads run as concurrent tasks.

    Reads the module-level name ``b_id`` (the book id) when building the
    download tasks.

    Args:
        url: Catalog endpoint, including its query string.
    """
    resp = requests.get(url)
    dic = resp.json()
    # Each item carries one chapter's title and cid. The coroutines are
    # wrapped in Tasks because asyncio.wait() rejects bare coroutine objects
    # on Python 3.11+.
    tasks = [
        asyncio.create_task(aiodownload(item['cid'], b_id, item['title']))
        for item in dic['data']['novel']['items']
    ]
    # asyncio.wait() raises ValueError on an empty collection.
    if tasks:
        await asyncio.wait(tasks)
    
if __name__ == '__main__':
    # b_id has to be a module-level name: getcatalog() reads it as a global.
    b_id = '4306063500'
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'

    # asyncio.run(getcatalog(url)) was reported to end with
    # "RuntimeError: Event loop is closed", so the loop is driven by hand and
    # left open. new_event_loop() replaces get_event_loop(), which is
    # deprecated when no loop is running.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(getcatalog(url))

import requests
import asyncio
import aiohttp
import aiofiles
from bs4 import BeautifulSoup

async def parser(url, title):
    """Download one chapter page and save its text to ``novel/<title>``.

    The text is taken from the ``div`` with class ``noveContent``. If the page
    has no such element, the chapter is skipped and a notice is printed. The
    ``novel`` directory is created when it does not exist yet.

    Args:
        url: Address of the chapter page.
        title: Chapter title; used as the file name inside ``novel/``.
    """
    import os  # local import: this snippet's import block does not include it

    # The session plays the role of the ``requests`` module.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as rep:
            soup = BeautifulSoup(await rep.text(), 'html.parser')

    content = soup.find('div', class_='noveContent')
    if content is None:
        # Without this guard, .get_text() on None raises AttributeError.
        print(f"skipped {title}: no chapter content found at {url}")
        return

    # Opening a file inside a missing directory raises FileNotFoundError.
    os.makedirs('novel', exist_ok=True)
    async with aiofiles.open(f'novel/{title}', 'w', encoding='utf-8') as f:
        await f.write(content.get_text())


async def craw_text(url):
    """Fetch the chapter index at ``url`` and download every chapter concurrently.

    The index page is fetched synchronously with ``requests``; each chapter
    link found on it is handed to ``parser`` as a concurrent task.

    Args:
        url: Address of the book's chapter-index page.
    """
    rep = requests.get(url)
    soup = BeautifulSoup(rep.text, 'html.parser')
    articles = (
        soup.find('div', class_='c_con_mainbody')
        .find('div', class_='c_con_list')
        .find_all('div', class_='c_con_li_detail_p')
    )
    tasks = []
    for article in articles:
        link = article.find('a')
        # The href has no scheme, so "https:" is prepended. A separate name
        # keeps the ``url`` parameter from being overwritten.
        chapter_url = 'https:' + link['href']
        title = link.get_text().replace('让你写益智剧，你写大明王朝？:', '')
        # Wrapped in a Task because asyncio.wait() rejects bare coroutine
        # objects on Python 3.11+.
        tasks.append(asyncio.create_task(parser(chapter_url, title)))
    # asyncio.wait() raises ValueError on an empty collection.
    if tasks:
        await asyncio.wait(tasks)

if __name__ == '__main__':
    url = 'https://b.faloo.com/html_1409_1409469/'
    # asyncio.run(craw_text(url)) was reported to end with
    # "RuntimeError: Event loop is closed", so the loop is driven by hand and
    # left open. new_event_loop() replaces get_event_loop(), which is
    # deprecated when no loop is running.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(craw_text(url))