import requests
import re # 正则模块
import uuid #uuid.uuid(4) 可以根据时间戳生成一段世界上唯一的随机字符串
# 爬虫三部曲
# 1、发送请求
# 2、解析数据
# 3、保存数据
#下面逐一编写并用函数封装,实现代码复用
# 1、发送请求
def get_page(url):
    """Issue an HTTP GET for *url* and hand back the raw Response object."""
    return requests.get(url)
# 2、解析数据
#解析主页获取视频详情页ID
def parse_index(text):
    """Parse the home-page HTML and return the video detail-page URLs.

    :param text: HTML source of https://www.pearvideo.com/
    :return: list like ['https://www.pearvideo.com/video_<id>', ...]
    """
    # BUG FIX 1: the regex previously lacked the closing '"', so the lazy
    # group (.*?) always matched the empty string and every id was ''.
    # BUG FIX 2: the function read the global ``response`` instead of its
    # ``text`` parameter, so it only worked when called from __main__.
    ids = re.findall('<a href="video_(.*?)"', text, re.S)
    return ['https://www.pearvideo.com/video_' + m_id for m_id in ids]
##解析详情页获取视频url
def parse_detail(text):
    """Extract the real mp4 URL from a detail-page's HTML.

    The <video> tag is injected by JavaScript at runtime, so scraping
    ``<video ... src="...">`` from the raw HTML finds nothing.  Instead we
    scrape the inline <script> that defines it, which looks like::

        var contId = "1566073", ...,
        srcUrl = "https://video.pearvideo.com/mp4/.../cont-...-hd.mp4",
        vdoUrl = srcUrl, ...;

    :param text: HTML source of a https://www.pearvideo.com/video_<id> page
    :return: the mp4 URL string
    :raises ValueError: if no ``srcUrl`` assignment is found
    """
    # BUG FIX 1: re.findall was called with a single tuple argument
    # (``re.findall((pattern, text, re.S))``), which raises TypeError.
    # BUG FIX 2: \s* around '=' tolerates the spacing actually present in
    # the page source ('srcUrl = "https://..."'), which the old literal
    # pattern 'srcUrl ="' did not match.
    matches = re.findall(r'srcUrl\s*=\s*"(.*?)"', text, re.S)
    if not matches:
        raise ValueError('srcUrl not found in detail page')
    return matches[0]
#3、保存数据
def save_movie(movie_url):
    """Download the video at *movie_url* and write it to the working dir."""
    movie_res = requests.get(movie_url)
    # uuid4 yields a practically collision-free name for every clip.
    filename = f'{uuid.uuid4()}.mp4'
    with open(filename, 'wb') as fp:
        fp.write(movie_res.content)
        fp.flush()
#测试:调用函数并实现爬取
# Smoke test: drive the whole crawl pipeline end to end.
if __name__ == '__main__':
    # 1. Request the home page.  The extra ``response`` alias is kept on
    #    purpose: parse_index above reads the global name ``response``.
    index_res = response = get_page(url='https://www.pearvideo.com/')
    # 2. Parse the home page into detail-page URLs.
    detail_url_list = parse_index(index_res.text)
    print(detail_url_list)
    # 3. Request each detail page (roughly 70 of them).
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # print(detail_res.text)
        # 4. Extract the video URL from the detail page.
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # 5. Save the video locally.
        save_movie(movie_url)
# Request url: https://www.pearvideo.com/
# Request method: GET
import requests
import re # 正则模块
import uuid #uuid.uuid(4) 可以根据时间戳生成一段世界上唯一的随机字符串
# 爬虫三部曲
# 1、发送请求
# 2、解析数据
# 3、保存数据
#下面逐一编写并用函数封装,实现代码复用
# 1、发送请求
def get_page(url):
    """Send a GET request to *url*; return the requests Response."""
    resp = requests.get(url)
    return resp
# 2、解析数据
#解析主页获取视频详情页ID
def parse_index(text):
    """Parse the home-page HTML into a list of detail-page URLs.

    :param text: HTML of the pearvideo home page
    :return: list like ['https://www.pearvideo.com/video_<id>', ...]
    """
    # BUG FIX: this previously matched against the global ``response``
    # instead of the ``text`` parameter, so the argument was ignored and the
    # function broke outside __main__.  Debug prints removed.
    ids = re.findall('<a href="video_(.*?)"', text, re.S)
    return ['https://www.pearvideo.com/video_' + m_id for m_id in ids]
##解析详情页获取视频url
def parse_detail(text):
    """Return the mp4 URL embedded as ``srcUrl = "..."`` in the page's JS.

    :param text: HTML source of a detail page
    :return: the mp4 URL string
    :raises ValueError: if the page contains no ``srcUrl`` assignment
    """
    # BUG FIX: the page source assigns with spaces ('srcUrl = "https://..."'),
    # which the old literal pattern 'srcUrl="' never matched; \s* accepts
    # both spellings.  Also replaces findall(...)[0] — a bare IndexError on a
    # miss — with an explicit, informative error.  Debug print removed.
    match = re.search(r'srcUrl\s*=\s*"(.*?)"', text, re.S)
    if match is None:
        raise ValueError('srcUrl not found in detail page')
    return match.group(1)
#3、保存数据
def save_movie(movie_url):
    """Fetch the clip at *movie_url* and persist it as '<uuid4>.mp4'."""
    data = requests.get(movie_url).content
    # A fresh uuid4 per call keeps downloads from overwriting each other.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as out:
        out.write(data)
        out.flush()
#测试:调用函数并实现爬取
# Entry point: run the three crawl stages (request, parse, save) in order.
if __name__ == '__main__':
    # 1. Fetch the home page; the ``response`` alias stays because
    #    parse_index above looks the name up globally.
    index_res = response = get_page(url='https://www.pearvideo.com/')
    # 2. Parse out every detail-page URL.
    detail_url_list = parse_index(index_res.text)
    print(detail_url_list)
    # 3. Visit each of the ~70 detail pages.
    for detail_url in detail_url_list:
        print(detail_url)
        detail_res = get_page(url=detail_url)
        # print(detail_res.text)
        # 4. Extract the mp4 URL from the detail page.
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # 5. Download the clip to disk.
        save_movie(movie_url)
# Root cause of the earlier failure: the regular expression was wrong.
# Lesson: after an error, step through the program stage by stage to localize it.