How to Write Asynchronous Crawlers with aiohttp

aiohttp is an asynchronous module for Python 3, split into a server side and a client side. Liao Xuefeng's Python 3 tutorial covers the server side; this article (by Junyi) focuses on the client side, which is what you use to write crawlers. Writing a crawler with asynchronous coroutines makes the program run much more efficiently.

1. Installation

Python
pip install aiohttp

2. A single request

Python
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main(url):
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        print(html)

url = 'http://junyiseo.com'
loop = asyncio.get_event_loop()
loop.run_until_complete(main(url))
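On Python 3.7 and newer, asyncio.run can replace the event-loop boilerplate above; a minimal equivalent sketch:

Python
import aiohttp
import asyncio

async def main(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            print(await response.text())

# asyncio.run creates and closes the event loop for you
asyncio.run(main('http://junyiseo.com'))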

3. Requesting multiple URLs

Python
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main(url):
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        print(html)

loop = asyncio.get_event_loop()

# Build several request coroutines
url = "http://junyiseo.com"
tasks = [main(url), main(url)]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
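asyncio.wait returns unordered sets of done/pending tasks; if you want the page sources back in the same order as the URLs, asyncio.gather is often more convenient. A minimal self-contained sketch using the same fetch pattern:

Python
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def crawl(urls):
    async with aiohttp.ClientSession() as session:
        # gather returns the results in the same order as the input
        return await asyncio.gather(*(fetch(session, u) for u in urls))

urls = ["http://junyiseo.com", "http://junyiseo.com"]
loop = asyncio.get_event_loop()
pages = loop.run_until_complete(crawl(urls))
print(len(pages))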

4. Other request methods

In the code above, we created a ClientSession object named session, then called its get method to obtain a ClientResponse object named response. The get method takes one required parameter, url, the HTTP URL whose source we want to fetch. With that, a coroutine has completed an asynchronous GET request.
aiohttp supports the other HTTP request methods as well:

Python
session.post('http://httpbin.org/post', data=b'data')
session.put('http://httpbin.org/put', data=b'data')
session.delete('http://httpbin.org/delete')
session.head('http://httpbin.org/get')
session.options('http://httpbin.org/get')
session.patch('http://httpbin.org/patch', data=b'data')
 

5. Passing parameters with a request

GET with parameters

Python
params = {'key1': 'value1', 'key2': 'value2'}
async with session.get('http://httpbin.org/get',
                       params=params) as resp:
    expect = 'http://httpbin.org/get?key2=value2&key1=value1'
    assert str(resp.url) == expect
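params also accepts a sequence of (key, value) tuples, which is the way to send the same key more than once; a minimal sketch:

Python
params = [('key', 'value1'), ('key', 'value2')]
async with session.get('http://httpbin.org/get',
                       params=params) as resp:
    # Tuples preserve the order the pairs were given in
    assert str(resp.url) == 'http://httpbin.org/get?key=value1&key=value2'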

POST with parameters

Python
payload = {'key1': 'value1', 'key2': 'value2'}
async with session.post('http://httpbin.org/post',
                        data=payload) as resp:
    print(await resp.text())

6. Reading the response

resp.status is the HTTP status code;
resp.text() returns the page content.

Python
async with session.get('https://api.github.com/events') as resp:
    print(resp.status)
    print(await resp.text())

The gzip and deflate transfer encodings are automatically decoded for you.

7. Working with JSON

Python
async with aiohttp.ClientSession() as session:
    # json= serializes the dict and sets the Content-Type header
    async with session.post(url, json={'test': 'object'}) as resp:
        print(resp.status)

Handling a JSON response

Python
async with session.get('https://api.github.com/events') as resp:
    print(await resp.json())
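Note that resp.json() checks that the response's Content-Type is application/json and raises an error otherwise; if a server sends JSON under a different content type, you can relax the check:

Python
async with session.get('https://api.github.com/events') as resp:
    # content_type=None disables the strict Content-Type check
    data = await resp.json(content_type=None)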

8. Reading the body as a byte stream (useful for downloads)

Python
async with session.get('https://api.github.com/events') as resp:
    await resp.content.read(10)  # read the first 10 bytes

Downloading and saving a file

Python
with open(filename, 'wb') as fd:
    while True:
        chunk = await resp.content.read(chunk_size)
        if not chunk:
            break
        fd.write(chunk)
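The snippet above assumes resp, filename, and chunk_size are already defined; a complete sketch with placeholder values (the URL and filename here are just examples):

Python
import aiohttp
import asyncio

async def download(url, filename, chunk_size=1024):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(filename, 'wb') as fd:
                # Stream the body to disk chunk by chunk
                while True:
                    chunk = await resp.content.read(chunk_size)
                    if not chunk:
                        break
                    fd.write(chunk)

asyncio.get_event_loop().run_until_complete(
    download('http://junyiseo.com', 'page.html'))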

9. Uploading files

Python
url = 'http://httpbin.org/post'
files = {'file': open('report.xls', 'rb')}

await session.post(url, data=files)

You can also set the filename and content type explicitly:

Python
url = 'http://httpbin.org/post'
data = aiohttp.FormData()
data.add_field('file',
               open('report.xls', 'rb'),
               filename='report.xls',
               content_type='application/vnd.ms-excel')

await session.post(url, data=data)

10. Timeouts

By default every I/O operation times out after five minutes. You can override this with the timeout parameter; timeout=None or timeout=0 disables the timeout check entirely.

Python
async with session.get('https://github.com', timeout=60) as r:
    ...
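Since aiohttp 3.3 you can also pass an aiohttp.ClientTimeout object, which allows separate limits (total, connect, and so on); a minimal sketch:

Python
timeout = aiohttp.ClientTimeout(total=60)
async with aiohttp.ClientSession(timeout=timeout) as session:
    async with session.get('https://github.com') as r:
        print(r.status)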

11. Custom request headers

Python
url = 'http://example.com/image'
payload = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
           b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
headers = {'content-type': 'image/gif'}

await session.post(url,
                   data=payload,
                   headers=headers)

Setting request headers on the session

Python
headers = {"Authorization": "Basic bG9naW46cGFzcw=="}
async with aiohttp.ClientSession(headers=headers) as session:
    async with session.get("http://httpbin.org/headers") as r:
        json_body = await r.json()
        assert json_body['headers']['Authorization'] == \
            'Basic bG9naW46cGFzcw=='

12. Custom cookies

Python
url = 'http://httpbin.org/cookies'
cookies = {'cookies_are': 'working'}
async with aiohttp.ClientSession(cookies=cookies) as session:
    async with session.get(url) as resp:
        assert await resp.json() == {
            "cookies": {"cookies_are": "working"}}

Sharing cookies across multiple requests

Python
async with aiohttp.ClientSession() as session:
    await session.get(
        'http://httpbin.org/cookies/set?my_cookie=my_value')
    filtered = session.cookie_jar.filter_cookies(
        'http://httpbin.org')
    assert filtered['my_cookie'].value == 'my_value'
    async with session.get('http://httpbin.org/cookies') as r:
        json_body = await r.json()
        assert json_body['cookies']['my_cookie'] == 'my_value'

13. Limiting the number of simultaneous connections

limit defaults to 100; limit=0 means no limit.

Python
conn = aiohttp.TCPConnector(limit=30)
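The connector takes effect once it is passed to the session; a minimal sketch (the URL is just a placeholder):

Python
conn = aiohttp.TCPConnector(limit=30)
async with aiohttp.ClientSession(connector=conn) as session:
    # At most 30 connections will be open at the same time
    async with session.get('http://junyiseo.com') as resp:
        print(resp.status)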

14. SSL requests

Some requests fail certificate verification; you can pass ssl=False to disable the check:

Python
r = await session.get('https://example.com', ssl=False)

Supplying your own certificate

Python
import ssl

sslcontext = ssl.create_default_context(
    cafile='/path/to/ca-bundle.crt')
r = await session.get('https://example.com', ssl=sslcontext)

15. Proxy requests

Python
async with aiohttp.ClientSession() as session:
    async with session.get("http://python.org",
                           proxy="http://proxy.com") as resp:
        print(resp.status)

Proxy authentication

Python
async with aiohttp.ClientSession() as session:
    proxy_auth = aiohttp.BasicAuth('user', 'pass')
    async with session.get("http://python.org",
                           proxy="http://proxy.com",
                           proxy_auth=proxy_auth) as resp:
        print(resp.status)

Or authenticate through the proxy URL:

Python
session.get("http://python.org",
            proxy="http://user:pass@some.proxy.com")

16. Shutting down gracefully

For plain (non-SSL) connections, add a zero-length sleep, await asyncio.sleep(0), before closing the event loop:

Python
async def read_website():
    async with aiohttp.ClientSession() as session:
        async with session.get('http://example.org/') as resp:
            await resp.read()

loop = asyncio.get_event_loop()
loop.run_until_complete(read_website())
# Zero-sleep to allow underlying connections to close
loop.run_until_complete(asyncio.sleep(0))
loop.close()

For SSL requests, wait a short moment before closing:

Python
loop.run_until_complete(asyncio.sleep(0.250))
loop.close()

*** Reposted from Junyi's blog
