requests库是基于urllib3封装的第三方HTTP库,使用起来比标准库urllib更加简洁易用。进行爬虫时我们一般更多地会使用requests库。
安装requests
Python3
pip3 install requests
Python2
pip install requests
请求方式
有get、post、put、delete、head、options几种请求方式,常用的就是get和post请求
import requests
# One module-level helper per HTTP verb; each call takes the target URL.
requests.get('http://httpbin.org/get')
requests.post('http://httpbin.org/post')
requests.put('http://httpbin.org/put')
requests.delete('http://httpbin.org/delete')
requests.head('http://httpbin.org/get')
requests.options('http://httpbin.org/get')
Get请求
基本用法
import requests

# Issue a plain GET request and show the response body.
url = 'http://httpbin.org/get'
req = requests.get(url)
print(req.text)  # body decoded as text
带参数的
import requests

# Option 1: write the query string directly into the URL.
req = requests.get('http://httpbin.org/get?name=**&age=**')
print(req.text)

# Option 2: hand the query parameters over as a dict through ``params``.
endpoint = 'http://httpbin.org/get'
params = dict(name='albert', age=1)
req = requests.get(endpoint, params=params)
print(req.text)
添加headers
import requests

# Send a custom User-Agent header along with the GET request.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
headers = {'User-Agent': user_agent}
req = requests.get('http://httpbin.org/get', headers=headers)
print(req.text)
Post请求
基本用法
import requests

# Submit the fields in ``data`` with a POST request and show the reply.
endpoint = 'http://httpbin.org/post'
data = dict(name='albert', age=1)
req = requests.post(endpoint, data=data)
print(req.text)
添加headers
import requests

# POST the fields in ``data`` together with a custom User-Agent header.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
headers = {'User-Agent': user_agent}
data = dict(name='albert', age=1)
req = requests.post('http://httpbin.org/post', data=data, headers=headers)
print(req.text)
获取二进制数据(图片、视频等)并保存
import requests

# Download a binary resource (image, video, ...) and store it on disk.
req = requests.get('https://avatar.youkuaiyun.com/9/5/4/1_a564126786.jpg?1546960150')
req.raise_for_status()  # stop here instead of saving an error page as the image
print(req.content)  # raw response body as bytes
with open('img.jpg', 'wb') as f:
    f.write(req.content)
JSON响应内容
import requests

req = requests.get('http://httpbin.org/get')
# Parse the response body as JSON; same result as json.loads(req.text).
parsed = req.json()
print(parsed)
若是JSON解码失败,会抛出异常;但是没有抛出异常不表示响应成功。服务器可能会在失败响应中包含JSON对象,这种JSON会被解码返回,要检查请求是否成功,使用req.raise_for_status()或者req.status_code来判断
import requests

# Only decode the JSON body after confirming the request succeeded.
try:
    req = requests.get('http://httpbin.org/get')
    req.raise_for_status()  # raises HTTPError for 4xx/5xx status codes
except requests.RequestException as e:
    print('error:', e)
else:
    print(req.json())
响应属性
import requests

# Tour of the most commonly used attributes of a Response object.
req = requests.get('http://www.baidu.com')
print(type(req))  # <class 'requests.models.Response'>
print(req.status_code)  # 200
print(req.headers)  # response headers
print(req.headers.get('xxx'))  # a single header by name; None when it is absent
# print(req.headers['xxx'])  # alternative lookup; raises KeyError when it is absent
print(req.cookies)  # <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
print(req.url)  # http://www.baidu.com/
print(req.history)  # [] request history, as a list
文件上传
import requests

# Upload a file as multipart form data; the ``with`` block closes the
# file handle once the request has been sent.
with open('img.jpg', 'rb') as fh:
    files = {'file': fh}
    # To also set the filename, content type and extra headers for the part:
    # files = {'file': ('img.jpg', fh, 'image/jpeg', {'Expires': '0'})}
    req = requests.post('http://httpbin.org/post', files=files)
print(req.text)
获取cookie
import requests

# Read the cookies the server set on the response.
req = requests.get('http://www.baidu.com')
print(req.cookies)
for k, v in req.cookies.items():
    print(k + '=' + v)
会话维持(一般用于保存登录状态)
import requests

# A Session keeps cookies between requests, so the cookie set by the first
# call is sent back on the second. The ``with`` block closes the session
# when the requests are done.
with requests.Session() as s:
    s.get('http://httpbin.org/cookies/set/number/123456')
    req = s.get('http://httpbin.org/cookies')
print(req.text)
证书验证
import requests
from requests.packages import urllib3
urllib3.disable_warnings() #suppresses the warnings emitted once certificate verification is turned off
# NOTE(review): verify=False skips the certificate check; fine for a demo, confirm before reusing elsewhere.
req = requests.get('https://www.12306.cn', verify = False) #verify controls whether the certificate is verified
print(req.status_code) #200
代理设置
import requests

# Route traffic through a proxy, with one entry per URL scheme of the target.
# NOTE(review): the 'https' entry uses an https:// proxy URL - confirm the
# proxy itself accepts TLS connections; many local proxies only speak http://.
proxies = dict(
    http='http://user:password@127.0.0.1:1087',
    https='https://user:password@127.0.0.1:1087',
)
target = 'https://www.taobao.com'
req = requests.get(target, proxies=proxies)
print(req.text)
认证设置
有些网址需要进行身份认证,就要进行认证设置
import requests
from requests.auth import HTTPBasicAuth
# Send the user name and password with the request via HTTPBasicAuth.
req = requests.get('http://httpbin.org/hidden-basic-auth/user/passwd', auth=HTTPBasicAuth('user', 'passwd'))
#req = requests.get('http://httpbin.org/hidden-basic-auth/user/passwd', auth = ('user' , 'passwd')) #shorthand: pass a (user, password) tuple
异常处理
requests抛出的异常都继承自requests.exceptions.RequestException,包括ConnectionError、HTTPError、Timeout、TooManyRedirects
import requests
from requests.exceptions import (
    ConnectionError,
    HTTPError,
    RequestException,
    Timeout,
    TooManyRedirects,
)

# Handle the specific requests exceptions first and fall back to the
# RequestException base class last.
try:
    req = requests.get('http://www.baidu.com', timeout=0.01)
    req.raise_for_status()
except Timeout as e:  # the request timed out
    print('Timeout:', e)
except HTTPError as e:  # raise_for_status() saw a 4xx/5xx status code
    print('HTTPError:', e)
except ConnectionError as e:  # DNS lookup failure, connection refused, etc.
    print('ConnectionError:', e)
except TooManyRedirects as e:  # exceeded the configured maximum number of redirects
    print('TooManyRedirects:', e)
except RequestException as e:  # any other requests error
    print('RequestException:', e)
else:
    # NOTE: json() itself raises if the body is not valid JSON.
    print(req.json())