一、分块下载
特点:可以实现断点续传
import os
import requests
def download_file_with_resume(url, dest_path, chunk_size=1024 * 1024):
    """Download *url* to *dest_path* in ranged chunks, resuming if interrupted.

    If *dest_path* already exists, the download continues from its current
    size using HTTP ``Range`` requests (the file is opened in append mode).

    :param url: direct URL of the file to fetch
    :param dest_path: local destination path
    :param chunk_size: bytes requested per Range request (default 1 MiB)
    :raises Exception: when the server answers a chunk with an error status
    """
    # Total size from a HEAD request; 0 if the server omits Content-Length.
    response = requests.head(url)
    total_size = int(response.headers.get('content-length', 0))
    # Resume point: whatever is already on disk.
    downloaded_size = os.path.getsize(dest_path) if os.path.exists(dest_path) else 0
    with open(dest_path, 'ab') as f:
        # Continue from where the previous run stopped.
        for start in range(downloaded_size, total_size, chunk_size):
            end = min(start + chunk_size - 1, total_size - 1)
            headers = {'Range': f'bytes={start}-{end}'}
            part_response = requests.get(url, headers=headers, stream=True)
            # BUG FIX: the original overwrote total_size with this chunk's
            # Content-Length here, corrupting the range math for later chunks.
            if part_response.status_code == 206:  # Partial Content
                f.write(part_response.content)
            elif part_response.status_code == 200 and start == 0:
                # Server ignored the Range header and sent the whole file;
                # write it once and stop.
                f.write(part_response.content)
                break
            else:
                # BUG FIX: a 200 at a non-zero offset would append the whole
                # file again; treat anything but a valid partial as failure.
                raise Exception(f"Failed to download chunk: {part_response.status_code}")
    print(f"Download completed: {dest_path}")
# Example usage
url = 'https://xxxx.com//video.mp4'
dest_path = 'abc.mp4'
chunk_size = 1 * 1024 * 1024  # one megabyte per ranged request
download_file_with_resume(url, dest_path, chunk_size)
二、线程池分块下载(大幅提高下载速度)
特点:下载速度极快,1GB几十秒内下完,可实现断点续传
#coding=utf8
import os
import random
import requests
import math
import threadpool
import threading
import time
# Serializes seek+write pairs on the shared output file across worker threads.
write_lock = threading.Lock()
class BlockDownload:
    """HTTP downloader with single-threaded and thread-pool chunked modes.

    Both modes split the target into byte ranges, fetch each range with an
    HTTP ``Range`` header, and support resuming a partially finished
    download. ``thread_download`` pre-allocates the file and fetches chunks
    concurrently; ``block_download`` appends chunks sequentially.
    """

    def __init__(self):
        self.error_try = 3      # retries per failed chunk request
        self.wait_time = 1      # seconds to wait between retries
        self.proxies = None     # optional proxy mapping for requests
        self.timeout = 5        # per-request timeout in seconds
        self.finish = False     # set True once every chunk succeeded
        self.success_list = []  # one bool per chunk, in completion order

    def download_chunk(self, url, start_byte, end_byte, chunk_num, file_path):
        """Fetch bytes ``start_byte``..``end_byte`` of *url* into *file_path*.

        Intended for ``thread_download``: the destination must already exist
        at its final size (zero-filled). Appends True/False to
        ``self.success_list``.

        :param url: download URL
        :param start_byte: first byte of the range (inclusive)
        :param end_byte: last byte of the range (inclusive)
        :param chunk_num: chunk index, used only in log messages
        :param file_path: pre-allocated destination file
        :return: None
        """
        # Resume filter: the file is pre-allocated with NUL bytes, so any
        # non-zero byte in this region means the chunk was already written.
        # NOTE(review): a chunk whose real content is all zeros is fetched
        # again on every run — redundant but harmless.
        with open(file_path, 'rb') as f:
            f.seek(start_byte)
            chunk_byte = f.read(end_byte - start_byte + 1)
        if not all(i == 0 for i in chunk_byte):
            self.success_list.append(True)
            return
        headers = {
            'user-agent': self.get_random_useragent(),
            'Range': f'bytes={start_byte}-{end_byte}'
        }
        for i in range(self.error_try):
            try:
                response = requests.get(url, headers=headers, timeout=self.timeout,
                                        proxies=self.proxies, stream=True)
                # Heuristic: a tiny body is assumed to be an error page.
                if len(response.content) <= 280:
                    print(f"retry {i} response.content.size to : {len(response.content)},and response.status_code {response.status_code}")
                    continue
                if response.status_code == 206:  # Partial Content
                    # Serialize writes; each worker seeks to its own offset.
                    with write_lock:
                        with open(file_path, 'rb+') as file:
                            file.seek(start_byte)
                            file.write(response.content)
                    self.success_list.append(True)
                    print(f"block {chunk_num} download finish!")
                    return
                else:
                    print(f"retry {i} block {chunk_num} download failed code: {response.status_code}")
                    continue
            except Exception as e:
                print(f"retry {i} block {chunk_num} download error. " + str(e))
                time.sleep(self.wait_time)
                continue
        print(f"block {chunk_num} download failed,process end!")
        self.success_list.append(False)

    def get_video_total_size(self, url):
        """Return the remote file size in bytes (from Content-Length).

        Returns 0 when the size cannot be determined and False on HTTP 403;
        both are falsy, which is all the callers check.

        :param url: download URL
        :return: size in bytes, 0 on failure, False on 403
        """
        total_size = 0
        headers = {
            'user-agent': self.get_random_useragent()
        }
        for i in range(self.error_try):
            try:
                response = requests.get(url, headers=headers, stream=True,
                                        timeout=self.timeout, proxies=self.proxies)
                # requests follows redirects itself; this extra hop only fires
                # if the final response still carries a Location header.
                if response.headers.get('Location', ""):
                    response = requests.head(response.headers.get('Location', url),
                                             headers=headers, timeout=self.timeout,
                                             proxies=self.proxies)
                if response.status_code == 403:
                    print(f"Failed Download for response.status_code of head is {response.status_code}")
                    return False
                if response.status_code == 200:
                    total_size = int(response.headers.get('content-length', 0))
                    if not total_size:
                        print(f"retry {i} for total_size {total_size}")
                        time.sleep(self.wait_time)
                        continue
                break
            except Exception as e:
                print(f"retry {i} for {str(e)}")
                time.sleep(self.wait_time)
        return total_size

    def get_optimal_chunk_size(self, total_size):
        """Pick a chunk size scaled to the total file size.

        (FIX: parameter was misspelled ``slef``; positional binding hid the
        typo because the method is always called through an instance.)

        :param total_size: file size in bytes
        :return: chunk size in bytes (1/2/4/8 MiB)
        """
        if total_size < 64 * 1024 * 1024:       # under 64 MB
            return 1 * 1024 * 1024              # 1 MB
        elif total_size < 512 * 1024 * 1024:    # under 512 MB
            return 2 * 1024 * 1024              # 2 MB
        elif total_size < 1024 * 1024 * 1024:   # under 1024 MB
            return 4 * 1024 * 1024              # 4 MB
        else:
            return 8 * 1024 * 1024              # 8 MB

    def thread_download(self, url, file_path, thread_num):
        """Download *url* to *file_path* with a pool of *thread_num* workers.

        The file is pre-allocated at full size (NUL-filled); each worker
        writes its chunk at the proper offset, so a re-run skips chunks that
        already contain data (resume).

        :param url: download URL
        :param file_path: destination path
        :param thread_num: number of worker threads
        :return: None
        """
        total_size = self.get_video_total_size(url)
        if not total_size:
            print(f"Failed Download for total_size:{total_size}")
            return
        if not os.path.exists(file_path):
            # Pre-allocate so workers can seek+write concurrently.
            # NOTE(review): an existing file of a different size is reused
            # as-is — delete it manually when switching URLs.
            with open(file_path, 'wb+') as f:
                f.write(b'\0' * total_size)
        print(f"file size:{total_size},start download...")
        chunk_size = self.get_optimal_chunk_size(total_size)
        block_num = math.ceil(total_size / chunk_size)  # total number of chunks
        # FIX: reset the per-chunk flags so repeated calls on one instance
        # don't mix results from earlier runs (block_download already does this).
        self.success_list = []
        arguments_list = []  # task argument list for the thread pool
        for i in range(block_num):
            start_byte = i * chunk_size
            end_byte = start_byte + chunk_size - 1
            if i == block_num - 1:
                end_byte = total_size - 1  # last chunk takes the remainder
            arguments_list.append(([url, start_byte, end_byte, i, file_path], None))
        pool = threadpool.ThreadPool(thread_num)
        tasks_list = threadpool.makeRequests(self.download_chunk, arguments_list)
        for task in tasks_list:
            pool.putRequest(task)  # queue every chunk task
        pool.wait()
        self.finish = all(self.success_list)
        if not self.finish:
            # NOTE(review): deleting here discards the partial file and with
            # it the ability to resume; consider keeping it instead.
            os.remove(file_path)

    def get_random_useragent(self):
        """Return a random desktop-Chrome User-Agent string."""
        ua = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/527.31 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/527.31',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/517.26 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/517.26',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.33 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.33',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/547.34 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/547.34',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/597.56 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/597.56',
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/587.86 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/587.86",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/581.81 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/581.81",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/582.82 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/582.82",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/583.83 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/583.83",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/584.84 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/584.84",
        ]
        return random.choice(ua)

    def block_download(self, url, file_path):
        """Single-threaded chunked download with append-based resume.

        Resumes from the current size of *file_path*; chunks are fetched
        sequentially and appended in order.

        :param url: download URL
        :param file_path: destination path
        :return: None
        """
        total_size = self.get_video_total_size(url)
        if not total_size:
            print("Failed Download for total_size:0")
            return
        chunk_size = self.get_optimal_chunk_size(total_size)
        # Resume point: whatever was already appended by a previous run.
        downloaded_size = 0
        if os.path.exists(file_path):
            downloaded_size = os.path.getsize(file_path)
        with open(file_path, 'ab') as f:
            self.success_list = []
            for start in range(downloaded_size, total_size, chunk_size):
                end = min(start + chunk_size - 1, total_size - 1)
                headers = {
                    'user-agent': self.get_random_useragent(),
                    'Range': f'bytes={start}-{end}'
                }
                success = False
                for i in range(self.error_try):
                    try:
                        response = requests.get(url, headers=headers, timeout=self.timeout,
                                                proxies=self.proxies, stream=True)
                        if len(response.content) <= 280:
                            print(f"retry {i} response.content.size to : {len(response.content)} and response.status_code:{response.status_code}")
                            continue
                        if response.status_code == 206:
                            f.write(response.content)
                            success = True
                            break
                        else:
                            print(f"retry {i} response.status_code to: {response.status_code}")
                            time.sleep(self.wait_time)
                    except Exception as e:
                        print(f"retry {i} to : {e}")
                        time.sleep(self.wait_time)
                self.success_list.append(success)
                if not success:
                    # BUG FIX: the file is append-only, so after a failed
                    # chunk every later chunk would land at the wrong offset
                    # and corrupt the download — stop at the first failure.
                    break
        self.finish = all(self.success_list)
        if not self.finish:
            os.remove(file_path)

    def run(self):
        """Demo entry point: download one hard-coded URL via the thread pool."""
        download_url = 'https://xxxxx.com/video.mp4'
        file_path = r'D:\Video\video.mp4'
        thread_num = 32  # tune to file size and bandwidth
        self.thread_download(download_url, file_path, thread_num)  # pooled fast download, resumable (skips finished chunks)
        # self.block_download(download_url, file_path)  # single-threaded download, resumable
# Script entry point: run the demo download when executed directly.
if __name__ == '__main__':
    BlockDownload().run()