How the Requests Library Handles Requests: A Detailed Look
1. Overall Architecture
┌─────────────────┐    ┌───────────────────┐    ┌────────────────┐
│    User Code    │    │   requests API    │    │  HTTPAdapter   │
│                 │    │                   │    │                │
│ requests.get()  │───▶│ Session.request() │───▶│  HTTP / HTTPS  │
│ requests.post() │    │                   │    │                │
└─────────────────┘    └───────────────────┘    └────────────────┘
                                                        │
                                                        ▼
                                                ┌────────────────┐
                                                │    urllib3     │
                                                │  PoolManager   │
                                                └────────────────┘
                                                        │
                                                        ▼
                                                ┌────────────────┐
                                                │     socket     │
                                                │  TCP/IP stack  │
                                                └────────────────┘
2. Request Processing Flow
2.1 High-Level API Calls
import requests
# A simple call
response = requests.get('https://api.example.com/data')
What happens behind the scenes:
- A temporary Session object is created
- A Request object is built
- The request is sent and the response is read
- The connection is closed (since no explicit Session is kept around)
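In code, the four steps above boil down to a thin wrapper around a throwaway Session; a minimal sketch of what the module-level API does (the URL is a placeholder):
import requests

# requests.get(url) is essentially:
with requests.Session() as session:
    response = session.request(method='GET', url='https://api.example.com/data')
# Leaving the `with` block closes the Session, so the pooled TCP connection
# cannot be reused by the next requests.get() call.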
2.2 The Detailed Flow with a Session
import requests
from requests.adapters import HTTPAdapter

# Create a session
session = requests.Session()

# Mount an adapter (the transport layer for a URL scheme)
adapter = HTTPAdapter(
    pool_connections=10,   # number of per-host pools to keep
    pool_maxsize=10,       # connections kept per pool
    max_retries=3
)
session.mount('https://', adapter)
session.mount('http://', adapter)

# Send a request
response = session.get(
    'https://api.example.com/data',
    params={'key': 'value'},
    headers={'User-Agent': 'my-app'},
    timeout=5
)
3. Core Components in Detail
3.1 The Session Object
The Session object is the heart of requests. It maintains:
- Cookie persistence: cookies are carried across requests
- A connection pool: TCP connections are reused
- Default configuration: headers, authentication, proxies, and so on
# The Session's main internal attributes (simplified)
class Session:
    def __init__(self):
        self.headers = default_headers()   # default request headers
        self.auth = None                   # authentication credentials
        self.proxies = {}                  # proxy configuration
        self.cookies = {}                  # cookie jar (a RequestsCookieJar in the real code)
        self.adapters = {}                 # scheme prefix -> transport adapter
        self.mount('https://', HTTPAdapter())
        self.mount('http://', HTTPAdapter())
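A quick way to see these Session-level defaults in action; a sketch that uses httpbin.org purely as a public echo service for illustration:
import requests

session = requests.Session()
session.headers.update({'X-App': 'demo'})     # merged into every request this session sends
session.get('https://httpbin.org/cookies/set/token/abc123')   # server sets a cookie
print(session.cookies.get('token'))           # 'abc123' -- kept in the session's cookie jar
print(session.headers['User-Agent'])          # the default python-requests/<version> header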
3.2 The Request Preparation Step
# Pseudocode: preparing a request
def prepare_request(request, session=None):
    # 1. Merge headers (request-level values override the session defaults)
    headers = merge_headers(session.headers, request.headers)
    # 2. Apply authentication
    if request.auth or session.auth:
        auth = request.auth or session.auth
        headers = apply_auth(headers, auth)
    # 3. Merge cookies
    cookies = merge_cookies(session.cookies, request.cookies)
    # 4. Build the final request object (a PreparedRequest in the real library)
    prepared_request = Request(
        method=request.method,
        url=request.url,
        headers=headers,
        data=request.data,
        cookies=cookies
    )
    return prepared_request
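The public API exposes this step directly through Request.prepare() and Session.prepare_request(); a small sketch with a placeholder URL:
import requests

session = requests.Session()
req = requests.Request('POST', 'https://api.example.com/data',
                       headers={'X-Trace': '1'}, json={'key': 'value'})
prepped = session.prepare_request(req)      # merges session headers, cookies and auth
print(prepped.method, prepped.url)
print(prepped.headers['Content-Type'])      # 'application/json', set during preparation
response = session.send(prepped, timeout=5)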
3.3 The Adapter System
The adapter does the actual HTTP transport work:
class HTTPAdapter(BaseAdapter):
    def __init__(self, pool_connections=10, pool_maxsize=10, max_retries=0):
        # Connection pooling, delegated to urllib3
        self.poolmanager = PoolManager(
            num_pools=pool_connections,
            maxsize=pool_maxsize
        )
        # Retry policy
        self.max_retries = max_retries

    def send(self, request, **kwargs):
        # 1. Pick the connection (pool) for this URL
        conn = self.get_connection(request.url)
        # 2. Hand the request to urllib3
        response = conn.urlopen(
            method=request.method,
            url=request.url,
            body=request.body,
            headers=request.headers
        )
        # 3. Wrap urllib3's answer in a requests Response
        return self.build_response(request, response)
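Adapters are looked up by the longest matching URL prefix, which is what Session.mount() and Session.get_adapter() implement; a short illustration with placeholder hosts:
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
custom = HTTPAdapter(pool_connections=5, pool_maxsize=20)
session.mount('https://api.example.com', custom)   # the most specific prefix wins

print(session.get_adapter('https://api.example.com/data') is custom)   # True
print(session.get_adapter('https://other.example.com/') is custom)     # False: default adapter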
3.4 Connection Pool Management
urllib3's PoolManager takes care of TCP connection reuse:
# How the pool manager works (simplified)
class PoolManager:
    def __init__(self, num_pools=10, maxsize=10):
        self.pools = RecentlyUsedContainer(num_pools)   # LRU cache of per-host pools
        self.maxsize = maxsize

    def connection_from_url(self, url):
        # Derive the pool key from the URL (scheme + host + port)
        scheme, host, port = split_url(url)   # simplified helper
        pool_key = (scheme, host, port)
        # Look up an existing pool for this host, or create one
        if pool_key not in self.pools:
            self.pools[pool_key] = HTTPConnectionPool(
                host, port, maxsize=self.maxsize
            )
        return self.pools[pool_key]
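The pooling layer can also be used on its own through urllib3, which makes the per-host pools visible; a sketch with placeholder URLs:
import urllib3

http = urllib3.PoolManager(num_pools=10, maxsize=10)
r1 = http.request('GET', 'https://api.example.com/data')    # creates a pool for this host
r2 = http.request('GET', 'https://api.example.com/other')   # reuses the same pool
print(r1.status, len(http.pools))   # one cached HTTPSConnectionPool so far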
4. The Full Request Lifecycle
4.1 DNS Resolution
# DNS resolution, conceptually
def resolve_hostname(hostname):
    # 1. Check the local hosts file
    # 2. Check the system DNS cache
    # 3. Query the configured DNS servers
    # 4. Cache the result
    return ip_address
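requests leaves name resolution to the operating system via the socket module; a minimal look at that step:
import socket

# Ask the OS resolver (hosts file, cache, configured DNS servers) for addresses
infos = socket.getaddrinfo('example.com', 443, type=socket.SOCK_STREAM)
family, socktype, proto, _, sockaddr = infos[0]
print(sockaddr[0])   # an IP address for example.com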
4.2 Establishing the TCP Connection
# TCP three-way handshake, conceptually
def establish_tcp_connection(host, port):
    # 1. SYN      -> server
    # 2. SYN-ACK  <- server
    # 3. ACK      -> server
    # The connection is now established
    return socket_connection
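In the standard library this handshake happens inside socket.create_connection():
import socket

# create_connection() resolves the host and performs the SYN / SYN-ACK / ACK
# exchange before returning a connected socket
sock = socket.create_connection(('example.com', 80), timeout=5)
print(sock.getpeername())   # (resolved IP, 80)
sock.close()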
4.3 The SSL/TLS Handshake (HTTPS)
# TLS handshake, conceptually
def ssl_handshake(sock, hostname):
    # 1. ClientHello -> server
    # 2. ServerHello <- server
    # 3. Certificate verification
    # 4. Key exchange
    # 5. Handshake finished
    return ssl_socket
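The same handshake with the ssl module, including certificate and hostname verification:
import socket
import ssl

context = ssl.create_default_context()      # loads the CA bundle, enables hostname checks
sock = socket.create_connection(('example.com', 443), timeout=5)
tls_sock = context.wrap_socket(sock, server_hostname='example.com')  # runs the handshake
print(tls_sock.version())                   # e.g. 'TLSv1.3'
print(tls_sock.getpeercert()['subject'])    # the verified server certificate
tls_sock.close()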
4.4 Sending the HTTP Request
# Building and sending the HTTP message
def send_http_request(connection, request):
    # Assemble the request text: request line, headers, blank line, body
    request_line = f"{request.method} {request.path} HTTP/1.1\r\n"
    headers = "\r\n".join(f"{k}: {v}" for k, v in request.headers.items())
    body = request.body or ""
    http_message = request_line + headers + "\r\n\r\n" + body
    # Send it over the (possibly TLS-wrapped) socket
    connection.send(http_message.encode())
    # Read the response
    response = receive_http_response(connection)
    return response
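A hand-rolled version of the same exchange over a plain socket, enough to see the wire format (real clients also handle keep-alive, chunked encoding, and so on):
import socket

raw_request = (
    "GET / HTTP/1.1\r\n"
    "Host: example.com\r\n"
    "Connection: close\r\n"
    "\r\n"
)
sock = socket.create_connection(('example.com', 80), timeout=5)
sock.sendall(raw_request.encode('ascii'))

raw_response = b""
while chunk := sock.recv(4096):     # read until the server closes the connection
    raw_response += chunk
sock.close()
print(raw_response.split(b"\r\n", 1)[0])   # b'HTTP/1.1 200 OK'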
5. Response Handling
5.1 Parsing the Response
def parse_http_response(raw_response):
    # Split the status line and headers from the body
    header_part, body = raw_response.split(b"\r\n\r\n", 1)
    header_lines = header_part.split(b"\r\n")
    # Parse the status line
    status_line = header_lines[0].decode()
    protocol, status_code, reason = status_line.split(' ', 2)
    # Parse the headers
    headers = {}
    for line in header_lines[1:]:
        if b':' in line:
            key, value = line.split(b':', 1)
            headers[key.strip().decode().lower()] = value.strip().decode()
    # Pick the text encoding from the Content-Type charset
    # (Content-Encoding describes compression such as gzip, not the charset)
    content_type = headers.get('content-type', '')
    charset = 'utf-8'
    if 'charset=' in content_type:
        charset = content_type.split('charset=')[-1].split(';')[0].strip()
    decoded_body = body.decode(charset, errors='replace')
    return {
        'status_code': int(status_code),
        'headers': headers,
        'content': decoded_body
    }
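In requests the same decision surfaces on the Response object: r.encoding is derived from the Content-Type charset and drives r.text (placeholder URL):
import requests

r = requests.get('https://api.example.com/data')
print(r.headers.get('Content-Type'))   # e.g. 'application/json; charset=utf-8'
print(r.encoding)                      # the charset requests picked from the headers
print(r.content[:40])                  # raw bytes
print(r.text[:40])                     # bytes decoded with r.encoding
r.encoding = 'utf-8'                   # override if the server declared the wrong charset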
5.2 Building the Response Object
# A simplified Response object
class Response:
    def __init__(self, raw_response, request):
        self.request = request
        self.status_code = raw_response['status_code']
        self.headers = raw_response['headers']
        self.content = raw_response['content']
        self.text = self._decode_content()      # helper omitted here
        self.cookies = self._extract_cookies()  # helper omitted here
        self.url = request.url
        self.history = []  # redirect history

    def json(self):
        import json
        return json.loads(self.text)

    def raise_for_status(self):
        if 400 <= self.status_code < 600:
            raise HTTPError(f"HTTP Error: {self.status_code}")  # requests.exceptions.HTTPError
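The real Response offers the same interface; a typical usage pattern (placeholder URL):
import requests

r = requests.get('https://api.example.com/items/42')
r.raise_for_status()        # raises requests.exceptions.HTTPError on 4xx/5xx
data = r.json()             # parse the body as JSON
print(r.status_code, r.url, len(r.history))   # history lists any redirect responses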
6. How the Advanced Features Work
6.1 Redirect Handling
def handle_redirects(response, request, session):
    history = []
    while response.status_code in (301, 302, 303, 307, 308):
        redirect_url = response.headers.get('Location')
        if not redirect_url:
            break
        history.append(response)   # remember the response that redirected us
        # Point the request at the new URL
        request.url = redirect_url
        # A POST redirected with 301/302/303 is conventionally resent as GET;
        # 307 and 308 preserve the original method and body
        if response.status_code in (301, 302, 303) and request.method == 'POST':
            request.method = 'GET'
            request.data = None
        # Send the follow-up request
        response = session.send(request.prepare())
    response.history = history     # exposed as Response.history
    return response
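From the outside, redirect handling in the real library looks like this (github.com serves a permanent redirect from HTTP to HTTPS):
import requests

r = requests.get('http://github.com/', allow_redirects=True)
print(r.status_code, r.url)                        # 200 https://github.com/
print([resp.status_code for resp in r.history])    # e.g. [301]

# Or ask requests not to follow redirects at all:
r = requests.get('http://github.com/', allow_redirects=False)
print(r.status_code, r.headers['Location'])        # e.g. 301 https://github.com/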
6.2 Timeout Handling
import socket
from requests.exceptions import Timeout

def send_with_timeout(request, adapter, timeout):
    # requests accepts a single number, or a (connect, read) tuple
    if isinstance(timeout, (int, float)):
        timeout = (timeout, timeout)
    # The adapter hands these values to urllib3, which applies them to the
    # socket: the connect timeout bounds the TCP/TLS handshake, the read
    # timeout bounds each wait for response data.
    try:
        return adapter.send(request, timeout=timeout)
    except socket.timeout:
        raise Timeout("Request timed out")
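From the caller's side, connect and read timeouts can be set separately and caught as distinct exceptions (placeholder URL):
import requests

try:
    # 3.05 s to establish the connection, 27 s for each wait on response data
    r = requests.get('https://api.example.com/slow', timeout=(3.05, 27))
except requests.exceptions.ConnectTimeout:
    print('could not connect in time')
except requests.exceptions.ReadTimeout:
    print('connected, but the server was too slow to respond')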
6.3 The Retry Mechanism
import time
from requests.exceptions import ConnectionError, Timeout

def send_with_retries(request, adapter, max_retries):
    for attempt in range(max_retries + 1):
        try:
            return adapter.send(request)
        except (ConnectionError, Timeout):
            if attempt == max_retries:
                raise   # out of retries, surface the error
            # Exponential backoff before the next attempt
            time.sleep(2 ** attempt * 0.1)
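In practice, retries are usually configured declaratively with urllib3's Retry and mounted on the session rather than hand-rolled; a sketch with a placeholder URL:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=0.2,
              status_forcelist=[500, 502, 503, 504])   # also retry on these status codes
session = requests.Session()
session.mount('https://', HTTPAdapter(max_retries=retry))
response = session.get('https://api.example.com/data', timeout=5)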
7. Performance Optimizations
7.1 Connection Reuse
# A simplified connection pool
class HTTPConnectionPool:
    def __init__(self, host, port, maxsize=10):
        self.host = host
        self.port = port
        self.connections = []   # idle connections
        self.in_use = set()     # connections currently handed out
        self.maxsize = maxsize

    def get_connection(self):
        # Reuse an idle connection if one is available
        if self.connections:
            conn = self.connections.pop()
            self.in_use.add(conn)
            return conn
        # Otherwise create a new one, as long as we are under the limit
        if len(self.in_use) < self.maxsize:
            conn = create_connection(self.host, self.port)
            self.in_use.add(conn)
            return conn
        # Pool exhausted: wait or give up
        raise ConnectionPoolTimeout("No available connections")

    def release_connection(self, conn):
        if conn in self.in_use:
            self.in_use.remove(conn)
            # Put healthy connections back into the pool, drop broken ones
            if conn.is_healthy():
                self.connections.append(conn)
            else:
                conn.close()
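The effect is easy to observe: with a Session, only the first request pays for DNS, TCP and TLS setup (placeholder URL):
import time
import requests

session = requests.Session()
for i in range(3):
    start = time.perf_counter()
    session.get('https://api.example.com/data')
    print(f'request {i}: {time.perf_counter() - start:.3f}s')
# The first request is typically the slowest; the later ones reuse the pooled
# connection and skip the handshakes entirely.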
7.2 TLS Session Reuse
# TLS session resumption: reuse a cached SSLSession to skip the full handshake
ssl_session_cache = {}

def cache_ssl_session(tls_sock, hostname):
    # SSLSocket.session holds the negotiated session after the handshake
    if tls_sock.session is not None:
        ssl_session_cache[hostname] = tls_sock.session

def wrap_with_reused_session(context, sock, hostname):
    # Offering a cached session lets the server resume it instead of doing a
    # full handshake (the server may still decline and fall back)
    session = ssl_session_cache.get(hostname)
    return context.wrap_socket(sock, server_hostname=hostname, session=session)
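Whether resumption actually happened can be checked with the ssl module directly (with TLS 1.3 the first socket may not expose a reusable session immediately, so this is only a sketch):
import socket
import ssl

context = ssl.create_default_context()
# First connection: full handshake; remember the negotiated session
s1 = context.wrap_socket(socket.create_connection(('example.com', 443)),
                         server_hostname='example.com')
saved = s1.session
s1.close()
# Second connection: offer the saved session for resumption
s2 = context.wrap_socket(socket.create_connection(('example.com', 443)),
                         server_hostname='example.com', session=saved)
print(s2.session_reused)   # True if the server agreed to resume
s2.close()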
8. The Complete Request Flow
The request lifecycle, end to end:

 1. User calls requests.get()
        ↓
 2. Build the Request object → 3. Prepare it (headers, auth, cookies, ...)
        ↓
 4. Pick the adapter for the URL scheme (HTTP/HTTPS)
        ↓
 5. Get a connection (reused from the pool or newly created)
        ↓
 6. DNS lookup (if needed) → 7. TCP handshake → 8. TLS handshake (HTTPS)
        ↓
 9. Send the HTTP request → 10. Receive the response
        ↓
11. Handle redirects (if any) → back to step 4
        ↓
12. Parse the response → 13. Process cookies → 14. Decode the body
        ↓
15. Build the Response object → 16. Return it to the caller
        ↓
17. Release the connection back to the pool (if it can be reused)