Web scraping with asynchronous coroutines
Using asynchronous coroutines to fetch content in a crawler saves a great deal of time and greatly improves efficiency: with the async approach an entire novel can be scraped in just a few seconds. Below are several examples of scraping content with asynchronous coroutines.
import asyncio
import time

async def func1():
    print("你好啊,我叫潘金莲")
    # time.sleep(3)  # a synchronous call here would interrupt the async flow (it blocks the event loop)
    await asyncio.sleep(3)  # asynchronous version; await suspends the coroutine here
    print("你好啊,我叫潘金莲")

async def func2():
    print("你好啊,我叫王建国")
    # time.sleep(2)
    await asyncio.sleep(2)
    print("你好啊,我叫王建国")

async def func3():
    print("你好啊,我叫李雪琴")
    await asyncio.sleep(4)
    print("你好啊,我叫李雪琴")

if __name__ == '__main__':
    f1 = func1()
    f2 = func2()
    f3 = func3()
    tasks = [f1, f2, f3]
    t1 = time.time()
    asyncio.run(asyncio.wait(tasks))  # start multiple tasks (coroutines) at once
    t2 = time.time()
    print(t2 - t1)
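
Running this, the three coroutines start almost at the same time and their sleeps overlap, so the printed elapsed time is roughly 4 seconds (the longest single sleep) rather than the 3 + 2 + 4 = 9 seconds a sequential version would need. Also note that passing bare coroutine objects to asyncio.wait() has been deprecated since Python 3.8 and is rejected on 3.11+, which is part of why the task-based version below is the recommended one.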
An alternative way to write it
import asyncio

async def func1():
    print("你好啊,我叫潘金莲")
    # time.sleep(3)  # a synchronous call here would interrupt the async flow (it blocks the event loop)
    await asyncio.sleep(3)  # asynchronous version; await suspends the coroutine here
    print("你好啊,我叫潘金莲")

async def func2():
    print("你好啊,我叫王建国")
    # time.sleep(2)
    await asyncio.sleep(2)
    print("你好啊,我叫王建国")

async def func3():
    print("你好啊,我叫李雪琴")
    await asyncio.sleep(4)
    print("你好啊,我叫李雪琴")

async def main():
    # First way
    f1 = func1()
    f2 = func2()
    f3 = func3()
    # await f1  # await (the suspend operation) is generally placed in front of a coroutine object

    # Second way (recommended)
    tasks = [
        asyncio.create_task(func1()),
        asyncio.create_task(func2()),
        asyncio.create_task(func3()),
    ]
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
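
As a minimal alternative sketch (not part of the original notes), asyncio.gather() can be used instead of asyncio.wait(): it accepts coroutine objects directly, wraps them in tasks itself, and returns their results in order, which also sidesteps the Python 3.11+ restriction mentioned above. It assumes the func1/func2/func3 defined in the snippet above:

import asyncio

async def main():
    # gather schedules all three coroutines as tasks and waits until every one of them finishes
    await asyncio.gather(func1(), func2(), func3())

if __name__ == '__main__':
    asyncio.run(main())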
- Downloading a few images with coroutines
# requests.get() is synchronous code -> the asynchronous counterpart is aiohttp
# pip install aiohttp (plus aiofiles for async file I/O)
import asyncio
import aiohttp
import aiofiles

urls = [
    "http://kr.shanghai-jiuxin.com/file/2020/1031/191468637cab2f0206f7d1d9b175ac81.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/1031/563337d07af599a9ea64e620729f367e.jpg",
    "http://kr.shanghai-jiuxin.com/file/2020/1031/774218be86d832f359637ab120eba52d.jpg"
]

async def aiodownload(url):
    # send the request
    # get the image content
    # save it to a file
    name = url.rsplit("/", 1)[1]  # split once from the right and take the element at index [1] (the filename)
    async with aiohttp.ClientSession() as session:  # aiohttp.ClientSession() plays the role of requests
        async with session.get(url) as resp:  # equivalent to resp = requests.get()
            # the response is back, write it to a file
            # aiofiles is a separate module worth looking into for async file I/O
            async with aiofiles.open(name, mode="wb") as f:  # create the file
                await f.write(await resp.content.read())  # reading the body is asynchronous, so it needs await; for text use resp.text()
            print(name, "搞定")

async def main():
    tasks = []
    for url in urls:
        tasks.append(aiodownload(url))  # on Python 3.11+ wrap this in asyncio.create_task(...)
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
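
When the URL list grows from three images to hundreds, it is usually worth capping how many requests run at the same time so the target site is not flooded. A minimal sketch (not from the original notes) using asyncio.Semaphore together with one shared ClientSession; the function name aiodownload_limited and the limit of 10 are illustrative choices:

import asyncio
import aiohttp
import aiofiles

async def aiodownload_limited(sem, session, url):
    name = url.rsplit("/", 1)[1]
    async with sem:  # wait here if the maximum number of downloads is already in flight
        async with session.get(url) as resp:
            async with aiofiles.open(name, mode="wb") as f:
                await f.write(await resp.content.read())

async def main(urls):
    sem = asyncio.Semaphore(10)  # at most 10 downloads at once (example value)
    # one shared ClientSession is cheaper than opening a new session per request
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(aiodownload_limited(sem, session, url)) for url in urls]
        await asyncio.wait(tasks)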
Asynchronously scraping a novel
import requests
import asyncio
import aiohttp
import aiofiles
import json

# 1. Synchronous step: request getCatalog to get every chapter's cid and title
# 2. Asynchronous step: request getChapterContent to download every chapter's content

async def aiodownload(cid, b_id, title):
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f"http://dushu.baidu.com/api/pc/getChapterContent?data={data}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(title, mode="w", encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])  # write out the chapter content

async def getCatalog(url):
    resp = requests.get(url)
    dic = resp.json()
    tasks = []
    for item in dic['data']['novel']['items']:  # each item holds one chapter's title and cid
        title = item['title']
        cid = item['cid']
        # queue up the asynchronous tasks
        tasks.append(aiodownload(cid, b_id, title))  # aiodownload() downloads one chapter's content
    await asyncio.wait(tasks)

if __name__ == '__main__':
    b_id = '4306063500'
    url = 'http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    # asyncio.run(getCatalog(url))  # may raise RuntimeError: Event loop is closed; the two lines below avoid it
    loop = asyncio.get_event_loop()
    loop.run_until_complete(getCatalog(url))
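
The notes above avoid the "RuntimeError: Event loop is closed" message by keeping the loop alive with run_until_complete. On Windows another commonly suggested workaround (an assumption here, not something from the notes) is to switch asyncio to the selector event loop policy before calling asyncio.run(), since the error typically comes from aiohttp connections being cleaned up after the default proactor loop has already closed:

import sys
import asyncio

if sys.platform == "win32":
    # use the selector loop instead of the default proactor loop on Windows
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

asyncio.run(getCatalog(url))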
Another example: asynchronously scraping a novel from b.faloo.com, parsing the pages with BeautifulSoup

import requests
import asyncio
import aiohttp
import aiofiles
from bs4 import BeautifulSoup
async def parser(url, title):  # parse one chapter url and grab its content
    async with aiohttp.ClientSession() as session:  # the session plays the role of requests
        async with session.get(url) as rep:
            soup = BeautifulSoup(await rep.text(), 'html.parser')
            text = soup.find('div', class_='noveContent').get_text()
            async with aiofiles.open(f'novel/{title}', 'w', encoding='utf-8') as f:
                await f.write(text)

async def craw_text(url):  # fetch the chapter list and schedule the downloads
    rep = requests.get(url)
    soup = BeautifulSoup(rep.text, 'html.parser')
    articles = soup.find('div', class_='c_con_mainbody').find('div', class_='c_con_list').find_all('div', class_='c_con_li_detail_p')
    tasks = []
    for article in articles:
        href = article.find('a')['href']
        url = 'https:' + href
        title = article.find('a').get_text().replace('让你写益智剧,你写大明王朝?:', '')
        # queue up the asynchronous tasks
        tasks.append(parser(url, title))  # parser() parses one chapter's url and grabs its content
    await asyncio.wait(tasks)
if __name__ == '__main__':
    url = 'https://b.faloo.com/html_1409_1409469/'
    # asyncio.run(craw_text(url))  # may raise RuntimeError: Event loop is closed; the two lines below avoid it
    loop = asyncio.get_event_loop()
    loop.run_until_complete(craw_text(url))
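
One practical detail not covered in the notes: parser() writes each chapter to novel/{title}, and aiofiles.open() will not create that folder for you, so the crawl fails with FileNotFoundError unless the directory already exists. Creating it up front avoids that, for example:

import os

# make sure the output directory exists before the download tasks start writing into it
os.makedirs('novel', exist_ok=True)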