requests File Upload Progress: A Complete Guide to Chunked Upload and Resumable Upload
Introduction: Still Struggling with Large File Uploads?
When you try to upload a multi-gigabyte backup file or move important data over an unstable network, do you keep running into these pain points:
- Interrupted uploads have to restart from the very beginning
- There is no way to watch upload progress in real time
- Out-of-memory errors crash the program
- The server times out and drops the connection
This article explains, step by step, how to implement chunked upload and resumable upload on top of Python's requests library to solve these problems. After reading it you will know how to:
- Stream uploads and keep memory usage flat
- Show upload status with a custom progress bar
- Implement the core resumable-upload algorithm
- Handle errors and recover interrupted uploads
- Tune performance for production deployments
1. File Upload Basics: From Simple to Complex
1.1 Limitations of Traditional Form Uploads
The most basic way to upload a file with requests is the multipart/form-data encoding:
import requests

with open('large_file.iso', 'rb') as f:
    files = {'file': ('archive.iso', f, 'application/octet-stream')}
    response = requests.post('https://api.example.com/upload', files=files)
This approach has three major drawbacks:
- The entire file is read into memory while requests builds the multipart body
- Upload progress cannot be tracked
- After an interruption the whole file must be uploaded again
1.2 Streaming Basics
Passing the open file object directly as the request body lets requests read and send it piece by piece instead of loading it all at once (the stream=True argument only controls how the response is downloaded):
import requests

def stream_upload(file_path, url):
    with open(file_path, 'rb') as f:
        response = requests.post(
            url,
            data=f,  # the file object itself is used as the request body stream
            headers={'Content-Type': 'application/octet-stream'},
            stream=True  # affects the response only: do not download it eagerly
        )
    return response
How it works: when the body is a file object, requests never holds the whole file in memory; it reads and sends it incrementally, setting Content-Length from the file size when it can determine it and falling back to chunked transfer encoding otherwise.
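To make the streaming behaviour explicit, you can also hand requests a generator as the body; because a generator has no known length, the request is sent with Transfer-Encoding: chunked. A minimal sketch (the URL is a placeholder):

import requests

def generate_chunks(file_path, chunk_size=1024 * 1024):
    # yield the file one chunk at a time so it never sits in memory as a whole
    with open(file_path, 'rb') as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                return
            yield data

# a generator body has no length, so requests uses chunked transfer encoding
response = requests.post('https://api.example.com/upload',
                         data=generate_chunks('large_file.iso'))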
2. Chunked Upload: Core Implementation
2.1 A Custom Chunk Iterator
Create a file chunk iterator that tracks progress as it reads:
import os

class ProgressFile:
    def __init__(self, file_path, chunk_size=1024*1024):
        self.file = open(file_path, 'rb')
        self.chunk_size = chunk_size
        self.total_size = os.path.getsize(file_path)
        self.uploaded = 0

    def __iter__(self):
        return self

    def __next__(self):
        data = self.file.read(self.chunk_size)
        if not data:
            self.file.close()
            raise StopIteration
        self.uploaded += len(data)
        # compute and print the upload percentage
        progress = (self.uploaded / self.total_size) * 100
        print(f"\rUploading: {progress:.2f}%", end="")
        return data
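Since ProgressFile is an iterable with no known length, it can be passed straight to requests as the request body; requests iterates over it and sends the data with chunked transfer encoding. A minimal usage sketch (the URL is a placeholder and the server must accept a raw request body):

import requests

body = ProgressFile('large_file.iso')
response = requests.post('https://api.example.com/upload', data=body)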
2.2 Integrating a tqdm Progress Bar
Use the tqdm library to render a visual progress bar:
import os
import requests
from tqdm import tqdm

def tqdm_upload(file_path, url, chunk_size=1024*1024):
    file_size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f, tqdm(
        total=file_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
        desc='Uploading'
    ) as pbar:
        def read_chunk():
            while True:
                data = f.read(chunk_size)
                if not data:
                    return
                pbar.update(len(data))
                yield data

        response = requests.post(url, data=read_chunk())
    return response
The progress bar output looks like this:
Uploading: 45%|██████████████████▌ | 4.50G/10.0G [01:23<01:45, 52.3MB/s]
2.3 Designing a Chunked Upload Protocol
Implement an application-level chunked upload protocol on top of ordinary HTTP requests:
import hashlib
import os
import requests

def chunked_upload(file_path, url, chunk_size=1024*1024):
    file_size = os.path.getsize(file_path)
    file_id = hashlib.md5(file_path.encode()).hexdigest()  # derive a unique file ID
    with open(file_path, 'rb') as f:
        chunk_number = 0
        while True:
            data = f.read(chunk_size)
            if not data:
                break
            # build the per-chunk request
            response = requests.post(
                f"{url}/chunk",
                data={
                    'file_id': file_id,
                    'chunk_number': chunk_number,
                    'total_chunks': (file_size + chunk_size - 1) // chunk_size,
                    'total_size': file_size
                },
                files={'chunk_data': data}
            )
            if response.status_code != 200:
                raise Exception(f"Chunk {chunk_number} upload failed")
            chunk_number += 1
    # tell the server to merge the chunks
    response = requests.post(f"{url}/merge", data={'file_id': file_id})
    return response
The flow of a chunked upload: split the file into fixed-size chunks, POST each chunk to /chunk together with its index and the total chunk count, then POST to /merge so the server reassembles the original file.
3. Implementing Resumable Uploads
3.1 The Core Resumable-Upload Algorithm
import hashlib
import os
import requests

def resume_upload(file_path, url, chunk_size=1024*1024):
    file_size = os.path.getsize(file_path)
    file_id = hashlib.md5(file_path.encode()).hexdigest()
    # ask the server which chunks it already has
    response = requests.get(f"{url}/status", params={'file_id': file_id})
    uploaded_chunks = set(response.json().get('uploaded_chunks', []))
    total_chunks = (file_size + chunk_size - 1) // chunk_size
    # resume from the first chunk that is still missing
    resume_from = min(set(range(total_chunks)) - uploaded_chunks, default=total_chunks)
    print(f"Resuming upload from chunk {resume_from}/{total_chunks}")
    with open(file_path, 'rb') as f:
        # seek to the resume position
        f.seek(resume_from * chunk_size)
        for chunk_number in range(resume_from, total_chunks):
            data = f.read(chunk_size)
            if not data:
                break
            # skip chunks the server already has (the read above keeps the file position in sync)
            if chunk_number in uploaded_chunks:
                continue
            # upload the current chunk
            response = requests.post(
                f"{url}/chunk",
                data={
                    'file_id': file_id,
                    'chunk_number': chunk_number,
                    'total_chunks': total_chunks
                },
                files={'chunk_data': data}
            )
            if response.status_code != 200:
                raise Exception(f"Chunk {chunk_number} failed")
    # ask the server to merge the chunks
    return requests.post(f"{url}/merge", data={'file_id': file_id})
3.2 Recording Progress Locally
Persist a resume point on the client side:
import hashlib
import json
import os

def upload_with_resume(file_path, url, state_file="upload_state.json"):
    # load any previously saved upload state
    try:
        with open(state_file, 'r') as f:
            state = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        state = {'uploaded_chunks': [], 'file_id': None}
    # initialise the file ID
    if not state['file_id']:
        state['file_id'] = hashlib.md5(file_path.encode()).hexdigest()
    # ... upload logic: append each finished chunk to state['uploaded_chunks']
    #     and set upload_complete once every chunk has been accepted ...
    # persist the upload state
    with open(state_file, 'w') as f:
        json.dump(state, f)
    # delete the state file once the upload has completed
    if upload_complete:
        os.remove(state_file)
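The important detail is to flush the state to disk after every successful chunk rather than only at the end, so a crash loses at most one chunk's worth of work. A minimal sketch of that pattern (reusing the state dictionary and /chunk endpoint assumed above):

import json

def record_chunk(state, state_file, chunk_number):
    # mark one chunk as finished and rewrite the state file immediately
    state['uploaded_chunks'].append(chunk_number)
    with open(state_file, 'w') as f:
        json.dump(state, f)

# inside the upload loop, after a chunk returns HTTP 200:
#     record_chunk(state, state_file, chunk_number)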
3.3 A Resumable-Upload State Machine
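One plausible way to model the uploader's lifecycle (the state names below are an assumption rather than a standard) is a small enum plus the transitions an uploader moves through:

from enum import Enum, auto

class UploadState(Enum):
    INIT = auto()       # state file loaded, file ID computed
    UPLOADING = auto()  # chunks are being sent
    PAUSED = auto()     # interrupted; resume point kept locally and on the server
    MERGING = auto()    # all chunks uploaded, /merge requested
    DONE = auto()       # server confirmed the merge; state file removed
    FAILED = auto()     # retries exhausted; manual intervention needed

# Typical transitions:
#   INIT -> UPLOADING -> MERGING -> DONE
#   UPLOADING -> PAUSED -> UPLOADING      (network drop, then resume)
#   UPLOADING or MERGING -> FAILED        (retries exhausted)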
4. Advanced Features and Error Handling
4.1 Upload Verification
import hashlib
import requests

def verify_upload(file_path, url):
    """Verify the integrity of the uploaded file."""
    file_id = hashlib.md5(file_path.encode()).hexdigest()
    # compute the local file hash
    local_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        while chunk := f.read(1024*1024):
            local_hash.update(chunk)
    # fetch the hash computed on the server
    response = requests.get(f"{url}/verify", params={'file_id': file_id})
    remote_hash = response.text
    return local_hash.hexdigest() == remote_hash
4.2 Parallel Chunk Uploads
Use a thread pool to upload several chunks concurrently:
from concurrent.futures import ThreadPoolExecutor, as_completed

def parallel_upload(file_path, url, chunk_size=1024*1024, max_workers=4):
    # ... query the already-uploaded chunks (sets total_chunks, uploaded_chunks, file_id) ...
    # build the list of chunks that still need uploading
    pending_chunks = [i for i in range(total_chunks) if i not in uploaded_chunks]
    # upload the chunks in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for chunk_number in pending_chunks:
            future = executor.submit(
                upload_single_chunk,  # function that uploads one chunk
                file_path, url, chunk_number, chunk_size, file_id
            )
            futures.append(future)
        # wait for every task to finish
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Chunk failed: {e}")
4.3 Error-Handling Strategy
import time
import requests

def robust_upload(file_path, url, max_retries=3, backoff_factor=0.3):
    retry_count = 0
    while retry_count < max_retries:
        try:
            return resume_upload(file_path, url)
        except requests.exceptions.RequestException as e:
            retry_count += 1
            if retry_count >= max_retries:
                raise
            # exponential backoff before the next attempt
            sleep_time = backoff_factor * (2 ** (retry_count - 1))
            print(f"Upload failed: {str(e)}. Retrying in {sleep_time:.1f}s...")
            time.sleep(sleep_time)
5. Production Tuning
5.1 Performance Tuning Parameters
| Parameter | Recommended value | Effect |
|---|---|---|
| chunk_size | 1-10 MB | Small chunks adapt better to flaky networks but add per-request overhead; large chunks are more efficient but use more memory |
| max_workers | 2-4 | Number of parallel upload threads; keep it at or below the CPU core count |
| timeout | (30, 120) | 30 s connect timeout, 120 s read timeout |
| buffer_size | 8 KB | File read buffer size |
| max_retries | 3-5 | Number of retries after a failure |
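As a sketch of how these values fit together on the client side (the retry policy here uses urllib3's Retry through an HTTPAdapter, which retries at the HTTP level and complements the application-level retries shown earlier):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

CHUNK_SIZE = 4 * 1024 * 1024   # 4 MB, inside the 1-10 MB range above
TIMEOUT = (30, 120)            # 30 s connect, 120 s read

session = requests.Session()
retry = Retry(total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retry))

# every chunk request then reuses the tuned session, e.g.:
# session.post(f"{url}/chunk", data=..., files=..., timeout=TIMEOUT)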
5.2 What the Server Needs to Provide
# Example FastAPI server that receives chunks
import os
import shutil

import aiofiles
from fastapi import FastAPI, UploadFile, File, Form

app = FastAPI()
UPLOAD_DIR = "./upload_chunks"

@app.post("/chunk")
async def upload_chunk(
    file_id: str = Form(...),
    chunk_number: int = Form(...),
    total_chunks: int = Form(...),
    chunk_data: UploadFile = File(...)
):
    # create one directory per file ID
    os.makedirs(f"{UPLOAD_DIR}/{file_id}", exist_ok=True)
    # save the chunk
    chunk_path = f"{UPLOAD_DIR}/{file_id}/{chunk_number}"
    async with aiofiles.open(chunk_path, 'wb') as f:
        await f.write(await chunk_data.read())
    return {"status": "ok"}

@app.post("/merge")
async def merge_chunks(file_id: str = Form(...)):
    # concatenate all chunks in numeric order
    chunk_dir = f"{UPLOAD_DIR}/{file_id}"
    chunks = sorted(os.listdir(chunk_dir), key=lambda x: int(x))
    os.makedirs("./uploads", exist_ok=True)
    with open(f"./uploads/{file_id}", 'wb') as outfile:
        for chunk in chunks:
            with open(f"{chunk_dir}/{chunk}", 'rb') as infile:
                outfile.write(infile.read())
    # clean up the chunk directory
    shutil.rmtree(chunk_dir)
    return {"status": "merged", "file_id": file_id}
5.3 A Complete Uploader Class
import json
import os

class ResumableUploader:
    def __init__(self, url, chunk_size=1024*1024, max_workers=4, state_file=None):
        self.url = url
        self.chunk_size = chunk_size
        self.max_workers = max_workers
        self.state_file = state_file or "upload_state.json"
        self.state = self._load_state()

    def _load_state(self):
        try:
            with open(self.state_file, 'r') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return {'file_id': None, 'uploaded_chunks': []}

    def _save_state(self):
        with open(self.state_file, 'w') as f:
            json.dump(self.state, f)

    def upload(self, file_path):
        # full implementation: see the core algorithm in section 3.1
        pass

    def verify(self, file_path):
        # integrity check: see section 4.1
        pass

    def cancel(self):
        """Cancel the upload and clean up temporary files."""
        if os.path.exists(self.state_file):
            os.remove(self.state_file)
        # optionally tell the server to delete the chunks uploaded so far
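Intended usage, assuming upload() and verify() are filled in from the algorithms above (the URL is a placeholder):

uploader = ResumableUploader('https://api.example.com/upload', chunk_size=4*1024*1024)
try:
    uploader.upload('large_file.iso')
    assert uploader.verify('large_file.iso')
except KeyboardInterrupt:
    # the state file is kept on interruption, so the next run resumes from the last chunk
    pass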
6. Summary and Outlook
This article traced the evolution of file uploads with the requests library:
- From simple form uploads to streaming
- Chunked uploads with progress tracking
- The core resumable-upload algorithm and state management
- Error handling and performance tuning
Directions worth exploring next:
- Real-time upload status feedback over WebSocket
- Content-based resumption (independent of chunk numbers)
- Integration with distributed file systems
- Upload task queues and priority management
With these techniques you can build a reliable large-file upload system that copes with unstable networks. Tune the chunk size and retry strategy to your own workload to balance reliability against throughput.
The complete example code can be fetched with:
git clone https://gitcode.com/GitHub_Trending/re/requests
cd requests/examples/upload
Disclosure: parts of this article were produced with AI assistance (AIGC) and are provided for reference only.



