"""
https://blog.youkuaiyun.com/qq_45695453/article/details/105757919
https://github.com/inspurer/PythonSpider
"""import requests
import re
import json
from contextlib import closing
from pyquery import PyQuery as pq
from requests import RequestException
class bilibili():
    """Download a Bilibili video from its page URL.

    Pipeline (see ``run``): fetch the page HTML, pull the title and the
    first DASH video stream URL out of it, then stream that URL to
    ``<title>.flv`` in the current working directory.
    """

    # Characters replaced when building a file name. The backslash is
    # included so a title can never be interpreted as a path.
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')
    # The page embeds its player configuration as JSON inside a script tag.
    _PLAYINFO_PATTERN = re.compile(
        r'<script>window\.__playinfo__=(.*?)</script>', re.DOTALL)
    # Number of bytes written to disk per iteration while downloading.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self):
        # Headers sent when requesting the HTML page.
        # NOTE(review): 'br' is advertised in Accept-Encoding; confirm that
        # the installed HTTP stack can actually decode brotli responses.
        self.getHtmlHeaders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q = 0.9'
        }
        # Headers sent when requesting the media file itself.
        self.downloadVideoHeaders = {
            'Origin': 'https://www.bilibili.com',
            'Referer': 'https://www.bilibili.com/video/av26522634',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }

    @staticmethod
    def _safe_filename(title):
        """Return ``<title>.flv`` with characters illegal in file names replaced by '-'."""
        cleaned = bilibili._ILLEGAL_FILENAME_CHARS.sub('-', title or '')
        # An empty title would otherwise produce the hidden file ".flv".
        return (cleaned or 'video') + '.flv'

    @classmethod
    def _extract_video_url(cls, html):
        """Return the ``base_url`` of the first DASH video stream found in *html*.

        Raises:
            ValueError: if *html* is empty or carries no ``window.__playinfo__`` script.
            KeyError / IndexError: if the play info JSON lacks the
                ``data.dash.video[0].base_url`` entry.
        """
        if not html:
            raise ValueError('empty HTML document')
        match = cls._PLAYINFO_PATTERN.search(html)
        if match is None:
            raise ValueError('window.__playinfo__ script not found in page')
        playinfo = json.loads(match.group(1))
        return playinfo['data']['dash']['video'][0]['base_url']

    def getHtml(self, url):
        """Fetch *url* and return the response text, or None on failure.

        The HTML obtained here usually differs from what the browser's F12
        developer tools show, because that view has already been processed
        by the browser.

        The HTTP status code is always printed. None is returned when the
        status is not 200 or when the request raises ``RequestException``.
        """
        try:
            response = requests.get(url=url, headers=self.getHtmlHeaders)
            print(response.status_code)
            if response.status_code == 200:
                return response.text
        except RequestException:
            print('请求Html错误:')
        return None

    def parseHtml(self, html):
        """Extract the video title and media URL from the page *html*.

        Returns:
            dict with keys ``'title'`` and ``'url'``.

        Raises:
            ValueError: if *html* is empty or has no play info script.
        """
        if not html:
            raise ValueError('empty HTML document')
        # The title is read with pyquery.
        doc = pq(html)
        video_title = doc('#viewbox_report > h1 > span').text()
        # The media URL is read with regex + json, because extracting it
        # with pyquery did not work.
        video_url = self._extract_video_url(html)
        return {
            'title': video_title,
            'url': video_url
        }

    def download_video(self, video):
        """Stream ``video['url']`` into a ``.flv`` file named after ``video['title']``.

        Raises:
            requests.HTTPError: if the media server answers with an error status.
        """
        filename = self._safe_filename(video['title'])
        # closing() works for any object that implements close(); it makes
        # sure the streamed connection is released even if writing fails.
        # NOTE(review): verify=False disables TLS certificate checks; it is
        # kept from the original code - confirm it is still required.
        with closing(requests.get(video['url'],
                                  headers=self.downloadVideoHeaders,
                                  stream=True, verify=False)) as res:
            # Fail before creating the file so an error page is never saved as video.
            res.raise_for_status()
            with open(filename, "wb") as f:
                # Write chunk by chunk instead of holding the whole video in memory.
                for chunk in res.iter_content(chunk_size=self._CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)

    def run(self, url):
        """Download the video behind the page *url*; do nothing if the page fetch failed."""
        html = self.getHtml(url)
        if html is None:
            # getHtml has already printed the status code or the error message.
            return
        self.download_video(self.parseHtml(html))


if __name__ == '__main__':
    url = 'https://www.bilibili.com/video/av18100312'
    bilibili().run(url)
# 下载视频(简化版) / Download video (simplified version)
import requests
import os
from bs4 import BeautifulSoup
import json
import re
import warnings
# Headers sent when requesting the media file itself.
downloadVideoHeaders = {
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av26522634',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}


def _extract_base_url(page_source):
    """Return the first ``"baseUrl"`` string value found in *page_source*, or None."""
    # Capture the complete JSON string literal, including escaped characters.
    match = re.search(r'"baseUrl"\s*:\s*("(?:[^"\\]|\\.)*")', page_source)
    if match is None:
        return None
    # The value is a JSON string; json.loads decodes it without executing
    # scraped text the way eval() would.
    return json.loads(match.group(1))


def download_video(url):
    """Download the video behind the Bilibili page *url*.

    The file is written to the current working directory as
    ``<last path component of url>.flv``.

    Raises:
        SystemExit: with code -1 (after printing "error") when the page
            request does not return HTTP 200.
        ValueError: when no ``baseUrl`` entry is found in the page.
        requests.HTTPError: when the media server answers with an error status.
    """
    r = requests.get(url)
    if r.status_code != 200:
        print("error")
        # Same effect as exit(-1), without relying on the site-provided builtin.
        raise SystemExit(-1)

    # Search the raw page source: the URL lives inside a <script> tag, and
    # text extracted from the parsed document may omit script contents.
    video = _extract_base_url(r.text)
    if video is None:
        raise ValueError('no "baseUrl" entry found in page: %s' % url)

    filename = '%s.flv' % os.path.basename(url)
    # NOTE(review): verify=False disables TLS certificate checks; it is kept
    # from the original code - confirm it is still required.
    with requests.get(url=video, headers=downloadVideoHeaders,
                      stream=True, verify=False) as res:
        # Fail before creating the file so an error page is never saved as video.
        res.raise_for_status()
        with open(filename, "wb") as f:
            # Write chunk by chunk instead of holding the whole video in memory.
            for chunk in res.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
# Run the sample download only when executed as a script, so that merely
# importing this module does not start a network download.
if __name__ == '__main__':
    url = 'https://www.bilibili.com/video/av18100312'
    download_video(url)
# 下载简化版(详细信息) / Simplified download (with detailed info)
import requests
import os
from bs4 import BeautifulSoup
import json
import re
import warnings
# Headers sent when requesting the media file itself.
downloadVideoHeaders = {
    'Origin': 'https://www.bilibili.com',
    'Referer': 'https://www.bilibili.com/video/av26522634',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}


def _first_match(pattern, text):
    """Return the first substring of *text* matching *pattern*, or None."""
    match = re.search(pattern, text)
    return match.group(0) if match else None


def _tag_text(tag):
    """Return the text of a parsed tag, or None when the tag was not found."""
    return tag.text if tag is not None else None


def _playinfo_base_url(script_text):
    """Return the first ``"baseUrl"`` string value in *script_text*, or None."""
    # Capture the complete JSON string literal, including escaped characters.
    match = re.search(r'"baseUrl"\s*:\s*("(?:[^"\\]|\\.)*")', script_text)
    if match is None:
        return None
    # The value is a JSON string; json.loads decodes it without executing
    # scraped text the way eval() would.
    return json.loads(match.group(1))


def download_video(url):
    """Download the video behind the Bilibili page *url* and collect page details.

    The file is written to the current working directory as ``<title>.flv``,
    with characters that are illegal in file names replaced by '-'.

    Returns:
        dict with keys ``'title'``, ``'image'``, ``'uploadDate'``, ``'view'``
        (play count), ``'dm'`` (danmaku count) and ``'filename'``. Details
        that cannot be found on the page are None.

    Raises:
        SystemExit: with code -1 (after printing "error") when the page
            request does not return HTTP 200.
        ValueError: when no play info script with a ``baseUrl`` is found.
        requests.HTTPError: when the media server answers with an error status.
    """
    r = requests.get(url)
    if r.status_code != 200:
        print("error")
        # Same effect as exit(-1), without relying on the site-provided builtin.
        raise SystemExit(-1)

    soup = BeautifulSoup(r.text, 'lxml')

    # Page details. Every lookup tolerates a missing tag so that a page
    # layout change does not abort the download itself.
    title = _tag_text(soup.find('title')) or os.path.basename(url)
    image = _first_match(r"http:.+\.jpg", str(soup.find('meta', itemprop="image")))
    uploadDate = _first_match(r"\d+.+\d+", str(soup.find('meta', itemprop="uploadDate")))
    view = _tag_text(soup.find('span', 'view'))  # play count
    dm = _tag_text(soup.find('span', 'dm'))  # danmaku (bullet comment) count

    # The media URL is embedded in the script that assigns window.__playinfo__.
    video = None
    for script in soup.find_all('script'):
        script_text = script.text
        if "window.__playinfo__=" in script_text:
            video = _playinfo_base_url(script_text)
            if video is not None:
                break
    if video is None:
        raise ValueError('no play info with "baseUrl" found in page: %s' % url)

    # Remove characters that are illegal when creating a file.
    filename = '%s.flv' % re.sub(r'[\\/:*?"<>|]', '-', title)
    # NOTE(review): verify=False disables TLS certificate checks; it is kept
    # from the original code - confirm it is still required.
    with requests.get(url=video, headers=downloadVideoHeaders,
                      stream=True, verify=False) as res:
        # Fail before creating the file so an error page is never saved as video.
        res.raise_for_status()
        with open(filename, "wb") as f:
            # Write chunk by chunk instead of holding the whole video in memory.
            for chunk in res.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)

    return {
        'title': title,
        'image': image,
        'uploadDate': uploadDate,
        'view': view,
        'dm': dm,
        'filename': filename,
    }
# Run the sample download only when executed as a script, so that merely
# importing this module does not start a network download.
if __name__ == '__main__':
    url = 'https://www.bilibili.com/video/av18100312'
    download_video(url)