aiohttp是python3的一个异步模块,分为服务器端和客户端。廖雪峰的python3教程中,讲的是服务器端的使用方法。均益这里主要讲的是客户端的方法,用来写爬虫。使用异步协程的方式写爬虫,能提高程序的运行效率。
1、安装
pip install aiohttp
2、单一请求方法
import aiohttp
import asyncio


async def fetch(session, url):
    """Fetch *url* with the given aiohttp session and return the body text."""
    async with session.get(url) as response:
        return await response.text()


async def main(url):
    """Open a client session, fetch *url* once and print the HTML."""
    # A ClientSession is meant to be reused across requests; here only one
    # request is made, so it lives just for this coroutine.
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        print(html)


url = 'http://junyiseo.com'
loop = asyncio.get_event_loop()
loop.run_until_complete(main(url))
# Fix: close the event loop when done (the multi-URL example already does
# this) so its resources are released cleanly.
loop.close()
3、多url请求方法
import aiohttp
import asyncio


async def fetch(session, url):
    """Fetch *url* with the given aiohttp session and return the body text."""
    async with session.get(url) as response:
        return await response.text()


async def main(url):
    """Open a client session, fetch *url* once and print the HTML."""
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        print(html)


loop = asyncio.get_event_loop()

# Build several request coroutines (the same URL twice, for the demo).
url = "http://junyiseo.com"
tasks = [main(url), main(url)]

# Fix: use asyncio.gather() instead of asyncio.wait() — passing bare
# coroutines to wait() is deprecated, and gather() propagates exceptions
# instead of leaving them hidden in the "done" set.
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
4、其他的请求方式
上面的代码中,我们创建了一个 ClientSession 对象命名为session,然后通过session的get方法得到一个 ClientResponse 对象,命名为resp,get方法中传入了一个必须的参数url,就是要获得源码的http url。至此便通过协程完成了一个异步IO的get请求。
aiohttp也支持其他的请求方式
# Besides GET, ClientSession exposes the other HTTP verbs with the same
# call shape (shown against httpbin.org test endpoints).
# NOTE(review): fragment — assumes an existing `session`; in real code each
# call is awaited / used as an async context manager like the examples above.
session.post('http://httpbin.org/post', data=b'data')
session.put('http://httpbin.org/put', data=b'data')
session.delete('http://httpbin.org/delete')
session.head('http://httpbin.org/get')
session.options('http://httpbin.org/get')
session.patch('http://httpbin.org/patch', data=b'data')
5、请求方法中携带参数
GET方法带参数
# GET with query-string parameters: aiohttp URL-encodes the dict for you.
params = {'key1': 'value1', 'key2': 'value2'}
async with session.get('http://httpbin.org/get', params=params) as resp:
    # NOTE(review): this hard-codes one parameter order; the order actually
    # produced follows dict iteration order — confirm before relying on the
    # exact string.
    expect = 'http://httpbin.org/get?key2=value2&key1=value1'
    assert str(resp.url) == expect
POST方法带参数
# POST with a form body: a dict passed as `data` is sent
# application/x-www-form-urlencoded.
payload = {'key1': 'value1', 'key2': 'value2'}
async with session.post('http://httpbin.org/post', data=payload) as resp:
    print(await resp.text())
6、获取响应内容
resp.status 是http状态码,
resp.text() 是网页内容
# Reading a response: resp.status is the HTTP status code and
# resp.text() (a coroutine) returns the decoded body.
async with session.get('https://api.github.com/events') as resp:
    print(resp.status)
    print(await resp.text())
gzip和deflate传输编码(Transfer-Encoding)已经自动为你解码。
7、JSON请求处理
1
2
3
|
async
with
aiohttp
.
ClientSession
(
)
as
session
:
async
with
session
.
post
(
url
,
json
=
{
'test'
:
'object'
}
)
|
返回json数据的处理
# Parsing a JSON response: resp.json() (a coroutine) decodes the body.
async with session.get('https://api.github.com/events') as resp:
    print(await resp.json())
8、以字节流的方式读取文件,可以用来下载
# Streaming the body: resp.content is a byte stream, useful for downloads.
async with session.get('https://api.github.com/events') as resp:
    await resp.content.read(10)  # read the first 10 bytes
下载保存文件
# Download to a file in chunks so the whole body is never held in memory.
# NOTE(review): fragment — assumes `filename`, `resp` and `chunk_size`
# are defined by the surrounding code.
with open(filename, 'wb') as fd:
    while True:
        chunk = await resp.content.read(chunk_size)
        if not chunk:  # empty read: end of stream
            break
        fd.write(chunk)
9、上传文件
1
2
3
4
5
|
url
=
'http://httpbin.org/post'
files
=
{
'file'
:
open
(
'report.xls'
,
'rb'
)
}
await
session
.
post
(
url
,
data
=
files
)
|
可以设置好文件名和content-type:
# Upload with an explicit filename and content type via FormData.
# NOTE(review): assumes `FormData` (aiohttp.FormData) is in scope; also the
# open() handle is never closed here — wrap it in `with` in real code.
url = 'http://httpbin.org/post'
data = FormData()
data.add_field('file',
               open('report.xls', 'rb'),
               filename='report.xls',
               content_type='application/vnd.ms-excel')

await session.post(url, data=data)
10、超时处理
默认的IO操作都有5分钟的超时时间。我们可以通过 timeout 参数进行重写;如果 timeout=None 或者 timeout=0,将不进行超时检查,也就是不限时长。
# Per-request timeout override, in seconds.
# NOTE(review): newer aiohttp releases prefer an aiohttp.ClientTimeout
# object for `timeout=` — confirm against the installed version.
async with session.get('https://github.com', timeout=60) as r:
    ...
11、自定义请求头
# Custom headers on a single request (here declaring an image body).
url = 'http://example.com/image'
# Adjacent bytes literals concatenate: together they form a tiny GIF.
payload = b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00' b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;'
headers = {'content-type': 'image/gif'}

await session.post(url, data=payload, headers=headers)
设置session的请求头
# Default headers applied to every request made through the session.
headers = {"Authorization": "Basic bG9naW46cGFzcw=="}
async with aiohttp.ClientSession(headers=headers) as session:
    async with session.get("http://httpbin.org/headers") as r:
        json_body = await r.json()
        # httpbin echoes the request headers back, so the session-level
        # Authorization header must appear in the response body.
        assert json_body['headers']['Authorization'] == \
            'Basic bG9naW46cGFzcw=='
12、自定义cookie
# Cookies sent with every request made through the session.
# NOTE(review): uses bare ClientSession — assumes
# `from aiohttp import ClientSession` somewhere above.
url = 'http://httpbin.org/cookies'
cookies = {'cookies_are': 'working'}
async with ClientSession(cookies=cookies) as session:
    async with session.get(url) as resp:
        assert await resp.json() == {"cookies": {"cookies_are": "working"}}
在多个请求中共享cookie
# The session's cookie jar keeps cookies across requests automatically.
async with aiohttp.ClientSession() as session:
    await session.get('http://httpbin.org/cookies/set?my_cookie=my_value')
    # Cookies the jar would send for this origin:
    filtered = session.cookie_jar.filter_cookies('http://httpbin.org')
    assert filtered['my_cookie'].value == 'my_value'
    # A later request through the same session carries the cookie along.
    async with session.get('http://httpbin.org/cookies') as r:
        json_body = await r.json()
        assert json_body['cookies']['my_cookie'] == 'my_value'
13、限制同时请求数量
limit默认是100,limit=0的时候是无限制
# Cap the number of simultaneous connections (default is 100; 0 = unlimited).
conn = aiohttp.TCPConnector(limit=30)
14、SSL加密请求
有的请求需要验证加密证书,可以设置ssl=False,取消验证
# Disable TLS certificate verification for this request.
r = await session.get('https://example.com', ssl=False)
|
加入证书
# Verify the server against a custom CA bundle.
# NOTE(review): assumes the stdlib `ssl` module is imported above.
sslcontext = ssl.create_default_context(cafile='/path/to/ca-bundle.crt')
r = await session.get('https://example.com', ssl=sslcontext)
15、代理请求
# Route a request through an HTTP proxy.
async with aiohttp.ClientSession() as session:
    async with session.get("http://python.org",
                           proxy="http://proxy.com") as resp:
        print(resp.status)
代理认证
# Proxy with basic authentication supplied explicitly.
async with aiohttp.ClientSession() as session:
    proxy_auth = aiohttp.BasicAuth('user', 'pass')
    async with session.get("http://python.org",
                           proxy="http://proxy.com",
                           proxy_auth=proxy_auth) as resp:
        print(resp.status)
或者通过URL认证
# Or embed the proxy credentials directly in the proxy URL.
session.get("http://python.org",
            proxy="http://user:pass@some.proxy.com")
16、优雅的关闭程序
没有ssl的情况,在关闭事件循环前加入语句 await asyncio.sleep(0),让底层连接得以关闭。
# NOTE(review): assumes the `import aiohttp` / `import asyncio` from the
# earlier examples are in scope.
async def read_website():
    """Fetch example.org once and discard the body."""
    async with aiohttp.ClientSession() as session:
        async with session.get('http://example.org/') as resp:
            await resp.read()


loop = asyncio.get_event_loop()
loop.run_until_complete(read_website())
# Zero-sleep to allow underlying connections to close
loop.run_until_complete(asyncio.sleep(0))
loop.close()
如果是ssl请求,在关闭前需要等待一会
# For TLS connections, give the SSL transports a short moment to finish
# shutting down before closing the loop.
loop.run_until_complete(asyncio.sleep(0.250))
loop.close()
*** 转自均益博客