一、分块下载
特点:可以实现断点续传
import os
import requests
def download_file_with_resume(url, dest_path, chunk_size=1024 * 1024):
    """Download *url* to *dest_path* in ranged chunks, resuming if interrupted.

    If *dest_path* already exists, the download continues from its current
    size using HTTP ``Range`` requests (the file is opened in append mode).

    :param url: direct URL of the file to fetch
    :param dest_path: local destination path
    :param chunk_size: bytes requested per Range request (default 1 MiB)
    :raises Exception: when the server answers a chunk with an error status
    """
    # Total size from a HEAD request; 0 if the server omits Content-Length.
    response = requests.head(url)
    total_size = int(response.headers.get('content-length', 0))
    # Resume point: whatever is already on disk.
    downloaded_size = os.path.getsize(dest_path) if os.path.exists(dest_path) else 0
    with open(dest_path, 'ab') as f:
        # Continue from where the previous run stopped.
        for start in range(downloaded_size, total_size, chunk_size):
            end = min(start + chunk_size - 1, total_size - 1)
            headers = {'Range': f'bytes={start}-{end}'}
            part_response = requests.get(url, headers=headers, stream=True)
            # BUG FIX: the original overwrote total_size with this chunk's
            # Content-Length here, corrupting the range math for later chunks.
            if part_response.status_code == 206:  # Partial Content
                f.write(part_response.content)
            elif part_response.status_code == 200 and start == 0:
                # Server ignored the Range header and sent the whole file;
                # write it once and stop.
                f.write(part_response.content)
                break
            else:
                # BUG FIX: a 200 at a non-zero offset would append the whole
                # file again; treat anything but a valid partial as failure.
                raise Exception(f"Failed to download chunk: {part_response.status_code}")
    print(f"Download completed: {dest_path}")
# Example usage
url = 'https://xxxx.com//video.mp4'
dest_path = 'abc.mp4'
chunk_size = 1 * 1024 * 1024  # one megabyte per ranged request
download_file_with_resume(url, dest_path, chunk_size)
二、线程池分块下载(大幅提高下载速度)
特点:下载速度极快,1GB几十秒内下完,可实现断点续传
#coding=utf8
import os
import random
import requests
import math
import threadpool
import threading
import time
# Serializes seek+write pairs on the shared output file across worker threads.
write_lock = threading.Lock()
class BlockDownload:
    """HTTP downloader with single-threaded and thread-pool chunked modes.

    Both modes split the target into byte ranges, fetch each range with an
    HTTP ``Range`` header, and support resuming a partially finished
    download. ``thread_download`` pre-allocates the file and fetches chunks
    concurrently; ``block_download`` appends chunks sequentially.
    """

    def __init__(self):
        self.error_try = 3      # retries per failed chunk request
        self.wait_time = 1      # seconds to wait between retries
        self.proxies = None     # optional proxy mapping for requests
        self.timeout = 5        # per-request timeout in seconds
        self.finish = False     # set True once every chunk succeeded
        self.success_list = []  # one bool per chunk, in completion order

    def download_chunk(self, url, start_byte, end_byte, chunk_num, file_path):
        """Fetch bytes ``start_byte``..``end_byte`` of *url* into *file_path*.

        Intended for ``thread_download``: the destination must already exist
        at its final size (zero-filled). Appends True/False to
        ``self.success_list``.

        :param url: download URL
        :param start_byte: first byte of the range (inclusive)
        :param end_byte: last byte of the range (inclusive)
        :param chunk_num: chunk index, used only in log messages
        :param file_path: pre-allocated destination file
        :return: None
        """
        # Resume filter: the file is pre-allocated with NUL bytes, so any
        # non-zero byte in this region means the chunk was already written.
        # NOTE(review): a chunk whose real content is all zeros is fetched
        # again on every run — redundant but harmless.
        with open(file_path, 'rb') as f:
            f.seek(start_byte)
            chunk_byte = f.read(end_byte - start_byte + 1)
        if not all(i == 0 for i in chunk_byte):
            self.success_list.append(True)
            return
        headers = {
            'user-agent': self.get_random_useragent(),
            'Range': f'bytes={start_byte}-{end_byte}'
        }
        for i in range(self.error_try):
            try:
                response = requests.get(url, headers=headers, timeout=self.timeout,
                                        proxies=self.proxies, stream=True)
                # Heuristic: a tiny body is assumed to be an error page.
                if len(response.content) <= 280:
                    print(f"retry {i} response.content.size to : {len(response.content)},and response.status_code {response.status_code}")
                    continue
                if response.status_code == 206:  # Partial Content
                    # Serialize writes; each worker seeks to its own offset.
                    with write_lock:
                        with open(file_path, 'rb+') as file:
                            file.seek(start_byte)
                            file.write(response.content)
                    self.success_list.append(True)
                    print(f"block {chunk_num} download finish!")
                    return
                else:
                    print(f"retry {i} block {chunk_num} download failed code: {response.status_code}")
                    continue
            except Exception as e:
                print(f"retry {i} block {chunk_num} download error. " + str(e))
                time.sleep(self.wait_time)
                continue
        print(f"block {chunk_num} download failed,process end!")
        self.success_list.append(False)

    def get_video_total_size(self, url):
        """Return the remote file size in bytes (from Content-Length).

        Returns 0 when the size cannot be determined and False on HTTP 403;
        both are falsy, which is all the callers check.

        :param url: download URL
        :return: size in bytes, 0 on failure, False on 403
        """
        total_size = 0
        headers = {
            'user-agent': self.get_random_useragent()
        }
        for i in range(self.error_try):
            try:
                response = requests.get(url, headers=headers, stream=True,
                                        timeout=self.timeout, proxies=self.proxies)
                # requests follows redirects itself; this extra hop only fires
                # if the final response still carries a Location header.
                if response.headers.get('Location', ""):
                    response = requests.head(response.headers.get('Location', url),
                                             headers=headers, timeout=self.timeout,
                                             proxies=self.proxies)
                if response.status_code == 403:
                    print(f"Failed Download for response.status_code of head is {response.status_code}")
                    return False
                if response.status_code == 200:
                    total_size = int(response.headers.get('content-length', 0))
                    if not total_size:
                        print(f"retry {i} for total_size {total_size}")
                        time.sleep(self.wait_time)
                        continue
                break
            except Exception as e:
                print(f"retry {i} for {str(e)}")
                time.sleep(self.wait_time)
        return total_size

    def get_optimal_chunk_size(self, total_size):
        """Pick a chunk size scaled to the total file size.

        (FIX: parameter was misspelled ``slef``; positional binding hid the
        typo because the method is always called through an instance.)

        :param total_size: file size in bytes
        :return: chunk size in bytes (1/2/4/8 MiB)
        """
        if total_size < 64 * 1024 * 1024:       # under 64 MB
            return 1 * 1024 * 1024              # 1 MB
        elif total_size < 512 * 1024 * 1024:    # under 512 MB
            return 2 * 1024 * 1024              # 2 MB
        elif total_size < 1024 * 1024 * 1024:   # under 1024 MB
            return 4 * 1024 * 1024              # 4 MB
        else:
            return 8 * 1024 * 1024              # 8 MB

    def thread_download(self, url, file_path, thread_num):
        """Download *url* to *file_path* with a pool of *thread_num* workers.

        The file is pre-allocated at full size (NUL-filled); each worker
        writes its chunk at the proper offset, so a re-run skips chunks that
        already contain data (resume).

        :param url: download URL
        :param file_path: destination path
        :param thread_num: number of worker threads
        :return: None
        """
        total_size = self.get_video_total_size(url)
        if not total_size:
            print(f"Failed Download for total_size:{total_size}")
            return
        if not os.path.exists(file_path):
            # Pre-allocate so workers can seek+write concurrently.
            # NOTE(review): an existing file of a different size is reused
            # as-is — delete it manually when switching URLs.
            with open(file_path, 'wb+') as f:
                f.write(b'\0' * total_size)
        print(f"file size:{total_size},start download...")
        chunk_size = self.get_optimal_chunk_size(total_size)
        block_num = math.ceil(total_size / chunk_size)  # total number of chunks
        # FIX: reset the per-chunk flags so repeated calls on one instance
        # don't mix results from earlier runs (block_download already does this).
        self.success_list = []
        arguments_list = []  # task argument list for the thread pool
        for i in range(block_num):
            start_byte = i * chunk_size
            end_byte = start_byte + chunk_size - 1
            if i == block_num - 1:
                end_byte = total_size - 1  # last chunk takes the remainder
            arguments_list.append(([url, start_byte, end_byte, i, file_path], None))
        pool = threadpool.ThreadPool(thread_num)
        tasks_list = threadpool.makeRequests(self.download_chunk, arguments_list)
        for task in tasks_list:
            pool.putRequest(task)  # queue every chunk task
        pool.wait()
        self.finish = all(self.success_list)
        if not self.finish:
            # NOTE(review): deleting here discards the partial file and with
            # it the ability to resume; consider keeping it instead.
            os.remove(file_path)

    def get_random_useragent(self):
        """Return a random desktop-Chrome User-Agent string."""
        ua = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/527.31 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/527.31',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/517.26 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/517.26',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.33 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.33',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/547.34 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/547.34',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/597.56 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/597.56',
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/587.86 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/587.86",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/581.81 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/581.81",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/582.82 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/582.82",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/583.83 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/583.83",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/584.84 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/584.84",
        ]
        return random.choice(ua)

    def block_download(self, url, file_path):
        """Single-threaded chunked download with append-based resume.

        Resumes from the current size of *file_path*; chunks are fetched
        sequentially and appended in order.

        :param url: download URL
        :param file_path: destination path
        :return: None
        """
        total_size = self.get_video_total_size(url)
        if not total_size:
            print("Failed Download for total_size:0")
            return
        chunk_size = self.get_optimal_chunk_size(total_size)
        # Resume point: whatever was already appended by a previous run.
        downloaded_size = 0
        if os.path.exists(file_path):
            downloaded_size = os.path.getsize(file_path)
        with open(file_path, 'ab') as f:
            self.success_list = []
            for start in range(downloaded_size, total_size, chunk_size):
                end = min(start + chunk_size - 1, total_size - 1)
                headers = {
                    'user-agent': self.get_random_useragent(),
                    'Range': f'bytes={start}-{end}'
                }
                success = False
                for i in range(self.error_try):
                    try:
                        response = requests.get(url, headers=headers, timeout=self.timeout,
                                                proxies=self.proxies, stream=True)
                        if len(response.content) <= 280:
                            print(f"retry {i} response.content.size to : {len(response.content)} and response.status_code:{response.status_code}")
                            continue
                        if response.status_code == 206:
                            f.write(response.content)
                            success = True
                            break
                        else:
                            print(f"retry {i} response.status_code to: {response.status_code}")
                            time.sleep(self.wait_time)
                    except Exception as e:
                        print(f"retry {i} to : {e}")
                        time.sleep(self.wait_time)
                self.success_list.append(success)
                if not success:
                    # BUG FIX: the file is append-only, so after a failed
                    # chunk every later chunk would land at the wrong offset
                    # and corrupt the download — stop at the first failure.
                    break
        self.finish = all(self.success_list)
        if not self.finish:
            os.remove(file_path)

    def run(self):
        """Demo entry point: download one hard-coded URL via the thread pool."""
        download_url = 'https://xxxxx.com/video.mp4'
        file_path = r'D:\Video\video.mp4'
        thread_num = 32  # tune to file size and bandwidth
        self.thread_download(download_url, file_path, thread_num)  # pooled fast download, resumable (skips finished chunks)
        # self.block_download(download_url, file_path)  # single-threaded download, resumable
# Script entry point: run the demo download when executed directly.
if __name__ == '__main__':
    BlockDownload().run()