Downloading with urllib.request
First, use the most basic urllib request to download the list of video files:
import os
import time
import urllib.request

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

start_time = time.time()
for url in urls:
    # Read the whole response into memory, then write it to a file named after
    # the last path segment of the URL.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    file_name = os.path.split(url)[1]
    with open(file_name, 'wb') as f:
        f.write(content)
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 547.8745946884155
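For large videos, reading the whole response into memory before writing it out is wasteful. A minimal streaming variant (not part of the original timing test) copies the response to disk in chunks with shutil.copyfileobj:

import shutil
import urllib.request

url = 'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4'
file_name = url.rsplit('/', 1)[1]

# Stream the response straight to disk instead of buffering it all in memory.
with urllib.request.urlopen(url) as response, open(file_name, 'wb') as f:
    shutil.copyfileobj(response, f)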
Downloading with requests
Use the higher-level requests library to download the video files:
import os
import time
import requests

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

start_time = time.time()
for url in urls:
    # response.content holds the entire body in memory before it is written out.
    response = requests.get(url)
    content = response.content
    file_name = os.path.split(url)[1].strip()
    with open(file_name, 'wb') as f:
        f.write(content)
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 454.0097427368164
Multithreaded download
If there are many video files, they can be downloaded concurrently with multiple threads, which makes the download much faster.
import os
import queue
import threading
import time

import requests

urls = [
    'http://xx0K.mp4',
    'http://xx00K.mp4'
]

def download(url):
    file_name = os.path.split(url)[1].strip()
    retry_times = 0
    # The network can be flaky, so retry up to 5 times.
    while retry_times <= 5:
        try:
            # Stream the download so a large file never sits in memory all at once.
            response = requests.get(url, stream=True)
            with open(file_name, 'wb') as fh:
                for chunk in response.iter_content(chunk_size=1024):
                    fh.write(chunk)
            break
        except Exception:
            retry_times += 1
    else:
        # Every attempt failed: delete the partial file and report the failure.
        try:
            os.remove(file_name)
        except OSError:
            pass
        print("Failed to download %s to %s.\n" % (url, file_name))

class Work(threading.Thread):
    def __init__(self, q, name):
        threading.Thread.__init__(self)
        self.q = q
        self.name = name

    def run(self):
        while True:
            # empty() is a safe stop condition only because the queue is filled
            # completely before the workers start. If producers were slower than
            # consumers, the queue could be momentarily empty; in that case put a
            # sentinel value (such as None) on the queue to signal the end instead.
            if self.q.empty():
                break
            url = self.q.get()
            download(url)
            self.q.task_done()

start_time = time.time()
work_queue = queue.Queue(len(urls))
threads = []
for url in urls:
    work_queue.put(url)
for i in range(3):
    thread = Work(work_queue, 'Thread-%s' % (i + 1))
    # Daemon threads die with the main thread, so a stuck worker cannot keep the
    # program running forever.
    thread.daemon = True
    thread.start()
    threads.append(thread)
# Block until every queued URL has been marked done via task_done().
work_queue.join()
for t in threads:
    t.join()
end_time = time.time()
print("Elapsed: %s" % (end_time - start_time))
Output:
Elapsed: 20.341402292251587
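A shorter equivalent of the queue-plus-threads setup above, assuming the same download() function, is the standard library thread pool. This sketch is not from the original article but does the same fan-out with less boilerplate:

from concurrent.futures import ThreadPoolExecutor

# Three worker threads pull URLs from the pool; the with-block waits for all of
# them to finish before exiting.
with ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(download, urls)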
Another variant limits how many threads run at once with a semaphore instead of a fixed pool of workers:
import queue
import random
import threading
from time import sleep

# Scenario: a large batch of inputs must all go through the same function (for
# example, password brute forcing). Starting every thread at once would mean far
# too many running in parallel, so a Semaphore lets only a few of them execute
# while the rest wait.

# Subclass Thread and put the repeated single-item operation in run().
class Test(threading.Thread):
    def __init__(self, queue, lock, num):
        # Receives the work queue, a lock for clean printing, and the semaphore
        # that caps how many threads run at the same time.
        threading.Thread.__init__(self)
        self.queue = queue
        self.lock = lock
        self.num = num

    def run(self):
        # Without the Semaphore, all threads would start working immediately and,
        # as the final print(threading.enumerate()) shows, stay alive to the end.
        with self.num:  # only the allowed number of threads run at once; each exits when done
            # The single unit of work starts here.
            n = self.queue.get()  # wait for an item to arrive on the queue
            self.lock.acquire()  # serialize the prints so the output is not interleaved
            print('Thread started:', self.name, 'simulated run time:', n)
            print('Items left in queue:', self.queue.qsize())
            print(threading.enumerate())
            self.lock.release()
            sleep(n)  # sleep stands in for the real work
            self.queue.task_done()  # signal that this queue item is finished

threads = []
work_queue = queue.Queue()
lock = threading.Lock()
num = threading.Semaphore(3)  # at most 3 threads execute at the same time; the rest wait

# Start all the threads and feed the queue.
for i in range(10):  # total number of work items
    t = Test(work_queue, lock, num)
    t.start()
    threads.append(t)
    # Putting an item on the queue lets one waiting run() proceed; this could
    # also be done in a separate loop over `threads` afterwards.
    n = random.randint(1, 10)
    work_queue.put(n)  # each item simulates a different input to the work function

# Wait for every thread to finish.
for t in threads:
    t.join()
work_queue.join()  # also wait until every queue item has been marked done
print('All done')
print(threading.active_count())
print(threading.enumerate())
Multiprocessing
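Downloads are IO-bound, so a process pool rarely beats threads here, but the same job can be spread over processes. This is a minimal sketch built on multiprocessing.Pool, reusing the example URLs from above rather than code from the original article:

import os
import time
from multiprocessing import Pool

import requests

urls = [
    'http://vod.146.xx.com/ebe8ebb801f4417c91967cc3c3e0f934.mp4',
    'http://vod.146.xx.com/cd278025ef3646c1aa332d970bccb686.mp4'
]

def download(url):
    # Stream each file to disk so a worker process never holds a whole video in memory.
    file_name = os.path.split(url)[1].strip()
    with requests.get(url, stream=True) as response:
        with open(file_name, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

if __name__ == '__main__':
    start_time = time.time()
    with Pool(processes=2) as pool:
        pool.map(download, urls)
    print("Elapsed: %s" % (time.time() - start_time))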
asyncio
import asyncio
import json
import os
import time

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}

def get_page():
    # Build the list of hero data URLs (League of Legends skin metadata).
    page_urls = []
    for i in range(1, 21):
        url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(i)
        print(url)
        page_urls.append(url)
    return page_urls

def get_img():
    # Collect the skin image URLs from every hero page.
    img_urls = []
    page_urls = get_page()
    for page_url in page_urls:
        res = requests.get(page_url, headers=headers)
        result = res.content.decode('utf-8')
        res_dict = json.loads(result)
        skins = res_dict["skins"]
        for hero in skins:
            item = {}
            item['name'] = hero["heroName"]
            item['skin_name'] = hero["name"]
            if hero["mainImg"] == '':
                continue
            item['imgLink'] = hero["mainImg"]
            print(item)
            img_urls.append(item)
    return img_urls

async def save_img(index, img_url):
    # Note: requests.get is a blocking call, so these coroutines do not actually
    # overlap; the aiohttp example below shows a truly asynchronous client.
    path = "皮肤/" + img_url['name']
    if not os.path.exists(path):
        os.makedirs(path)
    content = requests.get(img_url['imgLink'], headers=headers).content
    with open('./皮肤/' + img_url['name'] + '/' + img_url['skin_name'] + str(index) + '.jpg', 'wb') as f:
        f.write(content)

def main():
    loop = asyncio.get_event_loop()
    img_urls = get_img()
    print(len(img_urls))
    # Wrap the coroutines in tasks; newer Python versions no longer accept bare
    # coroutines in asyncio.wait().
    tasks = [loop.create_task(save_img(index, img_url)) for index, img_url in enumerate(img_urls)]
    try:
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()

if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print(end - start)
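Because requests.get blocks, the coroutines above effectively run one after another. One fix, sketched here with hypothetical helper names, is to push the blocking call into the default thread pool with loop.run_in_executor; the sections below show fully asynchronous clients instead:

import asyncio

import requests

async def fetch(url):
    loop = asyncio.get_running_loop()
    # Run the blocking requests.get call in the default thread pool so the event
    # loop can schedule other downloads while this one waits on the network.
    response = await loop.run_in_executor(None, requests.get, url)
    return response.content

async def main(urls):
    return await asyncio.gather(*(fetch(url) for url in urls))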
Coroutines and asynchronous IO
The weakness of both multithreading and multiprocessing is that threads and processes sit idle while blocked on IO, so asynchronous IO is usually the better choice.
There are several options for asynchronous IO:
- asyncio + aiohttp + requests
- gevent + requests + grequests
- twisted
- tornado (a minimal sketch follows this list)
- asyncio
- gevent + requests
- grequests
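tornado is not demonstrated elsewhere in this article, so here is a minimal sketch, assuming Tornado 5+ where AsyncHTTPClient.fetch can be awaited directly on the asyncio event loop:

import asyncio

from tornado.httpclient import AsyncHTTPClient

async def fetch(url):
    # fetch() resolves to an HTTPResponse whose body holds the raw bytes.
    response = await AsyncHTTPClient().fetch(url)
    print(url, len(response.body))

async def main():
    urls = ['https://www.python.org/', 'https://github.com/']
    await asyncio.gather(*(fetch(url) for url in urls))

if __name__ == '__main__':
    asyncio.run(main())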
asyncio + aiohttp + requests
import os
import re

import aiofiles
import aiohttp
import asyncio
from lxml import etree

# Fetch a URL and return the HTML text.
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

# Parse the HTML and yield the link and target directory of each gallery's listing page.
async def parser(html):
    tree = etree.HTML(html)
    pic_href_list = tree.xpath('//*[@class="listbox"]/a/@href')
    pic_title_list = tree.xpath('//*[@class="listbox"]/a/@title')
    for href, title in zip(pic_href_list, pic_title_list):
        path_id = re.findall(r'\d+', href)[0]
        dir_path = os.path.join(os.getcwd(), 'zdqx', f"{path_id}_{title}")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        yield 'http://' + href[2:], dir_path

# Collect every image link in a gallery.
async def detail_parser(html):
    tree = etree.HTML(html)
    src_list = tree.xpath('//div[@class="img-box"]/div/a/img/@src')
    return src_list[:-1]

# Download one image and store it with the asynchronous file library aiofiles.
async def content(session, url, dir_path):
    async with session.get(url) as response:
        img = await response.read()
        async with aiofiles.open(dir_path, mode='wb') as f:
            await f.write(img)

async def download(url):
    async with aiohttp.ClientSession() as session:
        html_text = await fetch(session, url)
        async for detail_url, dir_path in parser(html_text):
            detail_text = await fetch(session, detail_url)
            src_list = await detail_parser(detail_text)
            for index, src in enumerate(src_list):
                file_path = os.path.join(dir_path, f"{index}.jpg")
                if not os.path.exists(file_path):
                    try:
                        await content(session, src, file_path)
                    except AssertionError as e:
                        print(e)
                    finally:
                        print(src)

if __name__ == '__main__':
    urls = ['http://www.zdqx.com/qingchun/index.html'] + \
           [f'http://www.zdqx.com/qingchun/index_{i}.html' for i in range(2, 41)]
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(download(url)) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
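The gevent example further down uses a Pool to cap concurrency; the asyncio counterpart, sketched here as an illustration rather than code from the original article, is an asyncio.Semaphore wrapped around each request:

import asyncio

import aiohttp

async def download_all(urls):
    # At most 3 requests are in flight at any time, mirroring the gevent Pool(3) below.
    sem = asyncio.Semaphore(3)

    async def limited_fetch(session, url):
        async with sem:
            async with session.get(url) as response:
                return await response.read()

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(limited_fetch(session, url) for url in urls))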
gevent+requests
from gevent import monkey

# Patch the standard library sockets to be non-blocking before any other network
# libraries are imported, as gevent recommends.
monkey.patch_all()

import gevent
import requests

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# Send the requests; each spawn below runs as its own greenlet (coroutine).
gevent.joinall([
    gevent.spawn(fetch_async, method="get", url="https://www.python.org/", req_kwargs={}),
    gevent.spawn(fetch_async, method="get", url="https://www.yahoo.com/", req_kwargs={}),
    gevent.spawn(fetch_async, method="get", url="https://github.com/", req_kwargs={}),
])
gevent+urllib
from gevent import monkey

# Patch the standard library sockets to be non-blocking before urllib is used.
monkey.patch_all()

import urllib.request

import gevent

def run_task(url):
    print("Visit --> %s" % url)
    try:
        response = urllib.request.urlopen(url)
        data = response.read()
        print("%d bytes received from %s." % (len(data), url))
    except Exception as e:
        print(e)

if __name__ == '__main__':
    urls = ['https://www.baidu.com',
            'https://docs.python.org/3/library/urllib.html',
            'https://www.cnblogs.com/wangmo/p/7784867.html']
    greenlets = [gevent.spawn(run_task, url) for url in urls]
    gevent.joinall(greenlets)
gevent pool: limiting the maximum number of greenlets
from gevent import monkey

# Patch the standard library sockets to be non-blocking before requests is imported.
monkey.patch_all()

import gevent
import requests
from gevent.pool import Pool

def fetch_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# Send the requests through a pool that caps the number of concurrent greenlets at 3.
pool = Pool(3)
gevent.joinall([
    pool.spawn(fetch_async, method="get", url="https://www.python.org/", req_kwargs={}),
    pool.spawn(fetch_async, method="get", url="https://www.yahoo.com/", req_kwargs={}),
    pool.spawn(fetch_async, method="get", url="https://github.com/", req_kwargs={}),
])
grequests (gevent.joinall built in)
import grequests

request_list = [
    # The first two requests are meant to fail: an impossibly short timeout and a
    # domain that does not exist.
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]

# Send them all and collect the responses (failed requests come back as None).
response_list = grequests.map(request_list)
print(response_list)
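Because two of the requests above are built to fail, their slots in response_list are None. grequests.map also accepts an exception_handler callback that receives the failed request and the exception it raised; a small sketch:

def exception_handler(request, exception):
    # Called once for each request that raised instead of returning a response.
    print("Request failed:", request.url, exception)

response_list = grequests.map(request_list, exception_handler=exception_handler)
print(response_list)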
twisted
The event loop keeps iterating, waiting for the responses to the outstanding requests. It would keep looping even after every request has returned, so the code counts the responses and, once that count equals the number of requests, calls reactor.stop() to shut the event loop down:
# Send HTTP requests. getPage is deprecated in newer Twisted releases but is kept
# here to match the original example.
from twisted.web.client import getPage
# The event loop (reactor).
from twisted.internet import reactor

REV_COUNTER = 0
REQ_COUNTER = 0

def callback(contents):
    print(contents)
    global REV_COUNTER
    REV_COUNTER += 1
    if REV_COUNTER == REQ_COUNTER:
        # Every request has been answered, so stop the event loop.
        reactor.stop()

url_list = ['http://www.bing.com', 'http://www.baidu.com']
REQ_COUNTER = len(url_list)
for url in url_list:
    deferred = getPage(bytes(url, encoding="utf8"))
    deferred.addCallback(callback)

# Run the event loop and wait for the responses to come back.
reactor.run()
References
https://juejin.im/post/5d073e6ce51d45777621bb82
https://www.cnblogs.com/venvive/p/11657228.html