Downloading with urllib.request
First, use the most basic urllib request to download the list of video files:
import os
import time
import urllib.request

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

start_time = time.time()
for url in urls:
    # Read the whole response into memory, then write it to a file named after
    # the last path segment of the URL.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    file_name = os.path.split(url)[1]
    with open(file_name, 'wb') as f:
        f.write(content)
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 547.8745946884155
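For large videos, reading the whole response into memory before writing it out is wasteful. A minimal streaming variant (not part of the original timing test) copies the response to disk in chunks with shutil.copyfileobj:

import shutil
import urllib.request

url = 'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4'
file_name = url.rsplit('/', 1)[1]

# Stream the response straight to disk instead of buffering it all in memory.
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as f:
    shutil.copyfileobj(response, f)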
Downloading with requests
Use the higher-level requests library to download the video files:
import os
import time
import requests

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

start_time = time.time()
for url in urls:
    # response.content holds the entire body in memory before it is written out.
    response = requests.get(url)
    content = response.content
    file_name = os.path.split(url)[1].strip()
    with open(file_name, 'wb') as f:
        f.write(content)
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 454.0097427368164
Multithreaded download
If there are many video files, they can be downloaded concurrently with multiple threads, which makes the download much faster.
import os
import queue
import threading
import time

import requests

urls = [
    'http://xx0K.mp4',
    'http://xx00K.mp4'
]

def download(url):
    file_name = os.path.split(url)[1].strip()
    retry_times = 0
    # The network can be flaky, so retry up to 5 times.
    while retry_times <= 5:
        try:
            # Stream the download so a large file never sits in memory all at once.
            response = requests.get(url, stream=True)
            with open(file_name, 'wb') as fh:
                for chunk in response.iter_content(chunk_size=1024):
                    fh.write(chunk)
            break
        except Exception:
            retry_times += 1
    else:
        # Every attempt failed: delete the partial file and report the failure.
        try:
            os.remove(file_name)
        except OSError:
            pass
        print("Failed to download %s to %s.\n" % (url, file_name))

class Work(threading.Thread):
    def __init__(self, q, name):
        threading.Thread.__init__(self)
        self.q = q
        self.name = name

    def run(self):
        while True:
            # empty() is a safe stop condition only because the queue is filled
            # completely before the workers start. If producers were slower than
            # consumers, the queue could be momentarily empty; in that case put a
            # sentinel value (such as None) on the queue to signal the end instead.
            if self.q.empty():
                break
            url = self.q.get()
            download(url)
            self.q.task_done()

start_time = time.time()
work_queue = queue.Queue(len(urls))
threads = []
for url in urls:
    work_queue.put(url)
for i in range(3):
    thread = Work(work_queue, 'Thread-%s' % (i + 1))
    # Daemon threads die with the main thread, so a stuck worker cannot keep the
    # program running forever.
    thread.daemon = True
    thread.start()
    threads.append(thread)
# Block until every queued URL has been marked done via task_done().
work_queue.join()
for t in threads:
    t.join()
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 20.341402292251587
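A shorter equivalent of the queue-plus-threads setup above, assuming the same download() function, is the standard library thread pool. This sketch is not from the original article but does the same fan-out with less boilerplate:

from concurrent.futures import ThreadPoolExecutor

# Three worker threads pull URLs from the pool; the with-block waits for all of
# them to finish before exiting.
with ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download, urls)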
Another variant limits how many threads run at once with a semaphore instead of a fixed pool of workers:
import queue
import random
import threading
from time import sleep

# Scenario: a large batch of inputs must all go through the same function (for
# example, password brute forcing). Starting every thread at once would mean far
# too many running in parallel, so a Semaphore lets only a few of them execute
# while the rest wait.

# Subclass Thread and put the repeated single-item operation in run().
class Test(threading.Thread):
    def __init__(self, queue, lock, num):
        # Receives the work queue, a lock for clean printing, and the semaphore
        # that caps how many threads run at the same time.
        threading.Thread.__init__(self)
        self.queue = queue
        self.lock = lock
        self.num = num

    def run(self):
        # Without the Semaphore, all threads would start working immediately and,
        # as the final print(threading.enumerate()) shows, stay alive to the end.
        with self.num:  # only the allowed number of threads run at once; each exits when done
            # The single unit of work starts here.
            n = self.queue.get()  # wait for an item to arrive on the queue
            self.lock.acquire()  # serialize the prints so the output is not interleaved
            print('Thread started:', self.name, 'simulated run time:', n)
            print('Items left in queue:', self.queue.qsize())
            print(threading.enumerate())
            self.lock.release()
            sleep(n)  # sleep stands in for the real work
            self.queue.task_done()  # signal that this queue item is finished

threads = []
work_queue = queue.Queue()
lock = threading.Lock()
num = threading.Semaphore(3)  # at most 3 threads execute at the same time; the rest wait

# Start all the threads and feed the queue.
for i in range(10):  # total number of work items
    t = Test(work_queue, lock, num)
    t.start()
    threads.append(t)
    # Putting an item on the queue lets one waiting run() proceed; this could
    # also be done in a separate loop over `threads` afterwards.
    n = random.randint(1, 10)
    work_queue.put(n)  # each item simulates a different input to the work function

# Wait for every thread to finish.
for t in threads:
    t.join()
work_queue.join()  # also wait until every queue item has been marked done
print('All done')
print(threading.active_count())
print(threading.enumerate())
Multiprocessing
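Downloads are IO-bound, so a process pool rarely beats threads here, but the same job can be spread over processes. This is a minimal sketch built on multiprocessing.Pool, reusing the example URLs from above rather than code from the original article:

import os
import time
from multiprocessing import Pool

import requests

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

def download(url):
    # Stream each file to disk so a worker process never holds a whole video in memory.
    file_name = os.path.split(url)[1].strip()
    with requests.get(url, stream=True) as response:
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

if __name__ == '__main__':
    start_time = time.time()
    with Pool(processes=2) as pool:
        pool.map(download, urls)
    print("Elapsed: %s" % (time.time() - start_time))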
asyncio
import asyncio
import json
import os
import time

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}

def get_page():
    # Build the list of hero data URLs (League of Legends skin metadata).
    page_urls = []
    for i in range(1, 21):
        url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(i)
        print(url)
        page_urls.append(url)
    return page_urls

def get_img():
    # Collect the skin image URLs from every hero page.
    img_urls = []
    page_urls = get_page()
    for page_url in page_urls:
        res = requests.get(page_url, headers=headers)
        result = res.content.decode('utf-8')
        res_dict = json.loads(result)
        skins = res_dict["skins"]
        for hero in skins:
            item = {}
            item['name'] = hero["heroName"]
            item['skin_name'] = hero["name"]
            if hero["mainImg"] == '':
                continue
            item['imgLink'] = hero["mainImg"]
            print(item)
            img_urls.append(item)
    return img_urls

async def save_img(index, img_url):
    # Note: requests.get is a blocking call, so these coroutines do not actually
    # overlap; the aiohttp example below shows a truly asynchronous client.
    path = "皮肤/" + img_url['name']
    if not os.path.exists(path):
        os.makedirs(path)
    content = requests.get(img_url['imgLink'], headers=headers).content
    with open('./皮肤/' + img_url['name'] + '/' + img_url['skin_name'] + str(index) + '.jpg', 'wb') as f:
        f.write(content)

def main():
    loop = asyncio.get_event_loop()
    img_urls = get_img()
    print(len(img_urls))
    # Wrap the coroutines in tasks; newer Python versions no longer accept bare
    # coroutines in asyncio.wait().
    tasks = [loop.create_task(save_img(index, img_url)) for index, img_url in enumerate(img_urls)]
    try:
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(end - start)
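Because requests.get blocks, the coroutines above effectively run one after another. One fix, sketched here with hypothetical helper names, is to push the blocking call into the default thread pool with loop.run_in_executor; the sections below show fully asynchronous clients instead:

import asyncio

import requests

async def fetch(url):
    loop = asyncio.get_running_loop()
    # Run the blocking requests.get call in the default thread pool so the event
    # loop can schedule other downloads while this one waits on the network.
    response = await loop.run_in_executor(None, requests.get, url)
    return response.content

async def main(urls):
    return await asyncio.gather(*(fetch(url) for url in urls))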
Coroutines and asynchronous IO
The weakness of both multithreading and multiprocessing is that threads and processes sit idle while blocked on IO, so asynchronous IO is usually the better choice.
There are several options for asynchronous IO:
- asyncio + aiohttp + requests
- gevent + requests + grequests
- twisted
- tornado (a minimal sketch follows this list)
- asyncio
- gevent + requests
- grequests
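tornado is not demonstrated elsewhere in this article, so here is a minimal sketch, assuming Tornado 5+ where AsyncHTTPClient.fetch can be awaited directly on the asyncio event loop:

import asyncio

from tornado.httpclient import AsyncHTTPClient

async def fetch(url):
    # fetch() resolves to an HTTPResponse whose body holds the raw bytes.
    response = await AsyncHTTPClient().fetch(url)
    print(url, len(response.body))

async def main():
    urls = ['https://www.python.org/', 'https://github.com/']
    await asyncio.gather(*(fetch(url) for url in urls))

if __name__ == '__main__':
    asyncio.run(main())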
asyncio + aiohttp + requests
import os
import re

import aiofiles
import aiohttp
import asyncio
from lxml import etree

# Fetch a URL and return the HTML text.
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

# Parse the HTML and yield the link and target directory of each gallery's listing page.
async def parser(html):
    tree = etree.HTML(html)
    pic_href_list = tree.xpath('//*[@class="listbox"]/a/@href')
    pic_title_list = tree.xpath('//*[@class="listbox"]/a/@title')
    for href, title in zip(pic_href_list, pic_title_list):
        path_id = re.findall(r'\d+', href)[0]
        dir_path = os.path.join(os.getcwd(), 'zdqx', f"{path_id}_{title}")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        yield 'http://' + href[2:], dir_path

# Collect every image link in a gallery.
async def detail_parser(html):
    tree = etree.HTML(html)
    src_list = tree.xpath('//div[@class="img-box"]/div/a/img/@src')
    return src_list[:-1]

# Download one image and store it with the asynchronous file library aiofiles.
async def content(session, url, dir_path):
    async with session.get(url) as response:
        img = await response.read()
        async with aiofiles.open(dir_path, mode='wb') as f:
            await f.write(img)

async def download(url):
    async with aiohttp.ClientSession() as session:
        html_text = await fetch(session, url)
        async for detail_url, dir_path in parser(html_text):
            detail_text = await fetch(session, detail_url)
            src_list = await detail_parser(detail_text)
            for index, src in enumerate(src_list):
                file_path = os.path.join(dir_path, f"{index}.jpg")
                if not os.path.exists(file_path):
                    try:
                        await content(session, src, file_path)
                    except AssertionError as e:
                        print(e)
                    finally:
                        print(src)

if __name__ == '__main__':
    urls = ['http://www.zdqx.com/qingchun/index.html'] + \
           [f'http://www.zdqx.com/qingchun/index_{i}.html' for i in range(2, 41)]
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(download(url)) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
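The gevent example further down uses a Pool to cap concurrency; the asyncio counterpart, sketched here as an illustration rather than code from the original article, is an asyncio.Semaphore wrapped around each request:

import asyncio

import aiohttp

async def download_all(urls):
    # At most 3 requests are in flight at any time, mirroring the gevent Pool(3) below.
    sem = asyncio.Semaphore(3)

    async def limited_fetch(session, url):
        async with sem:
            async with session.get(url) as response:
                return await response.read()

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(limited_fetch(session, url) for url in urls))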
gevent+requests
from gevent import monkey

# Patch the standard library sockets to be non-blocking before any other network
# libraries are imported, as gevent recommends.
monkey.patch_all()

import gevent
import requests

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# Send the requests; each spawn below runs as its own greenlet (coroutine).
gevent.joinall([
    gevent.spawn(fetch_async, method="get", url="https://www.python.org/", req_kwargs={}),
    gevent.spawn(fetch_async, method="get", url="https://www.yahoo.com/", req_kwargs={}),
    gevent.spawn(fetch_async, method="get", url="https://github.com/", req_kwargs={}),
])
gevent+urllib
from gevent import monkey

# Patch the standard library sockets to be non-blocking before urllib is used.
monkey.patch_all()

import urllib.request

import gevent

def run_task(url):
    print("Visit --> %s" % url)
    try:
        response = urllib.request.urlopen(url)
        data = response.read()
        print("%d bytes received from %s." % (len(data), url))
    except Exception as e:
        print(e)

if __name__ == '__main__':
    urls = ['https://www.baidu.com',
            'https://docs.python.org/3/library/urllib.html',
            'https://www.cnblogs.com/wangmo/p/7784867.html']
    greenlets = [gevent.spawn(run_task, url) for url in urls]
    gevent.joinall(greenlets)
gevent pool: limiting the maximum number of greenlets
from gevent import monkey

# Patch the standard library sockets to be non-blocking before requests is imported.
monkey.patch_all()

import gevent
import requests
from gevent.pool import Pool

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# Send the requests through a pool that caps the number of concurrent greenlets at 3.
pool = Pool(3)
gevent.joinall([
    pool.spawn(fetch_async, method="get", url="https://www.python.org/", req_kwargs={}),
    pool.spawn(fetch_async, method="get", url="https://www.yahoo.com/", req_kwargs={}),
    pool.spawn(fetch_async, method="get", url="https://github.com/", req_kwargs={}),
])
grequests (gevent.joinall built in)
import grequests

request_list = [
    # The first two requests are meant to fail: an impossibly short timeout and a
    # domain that does not exist.
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]

# Send them all and collect the responses (failed requests come back as None).
response_list = grequests.map(request_list)
print(response_list)
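Because two of the requests above are built to fail, their slots in response_list are None. grequests.map also accepts an exception_handler callback that receives the failed request and the exception it raised; a small sketch:

def exception_handler(request, exception):
    # Called once for each request that raised instead of returning a response.
    print("Request failed:", request.url, exception)

response_list = grequests.map(request_list, exception_handler=exception_handler)
print(response_list)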
twisted
The event loop keeps iterating, waiting for the responses to the outstanding requests. It would keep looping even after every request has returned, so the code counts the responses and, once that count equals the number of requests, calls reactor.stop() to shut the event loop down:
# Send HTTP requests. getPage is deprecated in newer Twisted releases but is kept
# here to match the original example.
from twisted.web.client import getPage
# The event loop (reactor).
from twisted.internet import reactor

REV_COUNTER = 0
REQ_COUNTER = 0

def callback(contents):
    print(contents)
    global REV_COUNTER
    REV_COUNTER += 1
    if REV_COUNTER == REQ_COUNTER:
        # Every request has been answered, so stop the event loop.
        reactor.stop()

url_list = ['http://www.bing.com', 'http://www.baidu.com']
REQ_COUNTER = len(url_list)
for url in url_list:
    deferred = getPage(bytes(url, encoding="utf8"))
    deferred.addCallback(callback)

# Run the event loop and wait for the responses to come back.
reactor.run()
References
https://juejin.im/post/5d073e6ce51d45777621bb82
https://www.cnblogs.com/venvive/p/11657228.html