1 Observing the website
Open the mini-video page of the 六间房 site (v.6.cn) and open the browser developer tools, as shown below:
There you can see the request for the video list:
https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page=3&pagesize=25
The Preview tab shows the video resource links returned by the server.
From this, the crawler approach is:
- Request the list API
- Read the download links out of the response data
- Create a folder
- Request each download link and save the result into the folder
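Before writing the full script it is worth confirming the shape of the JSON the endpoint returns; the field names used in the code below ('content' → 'list', then 'playurl' and 'alias' per item) come from that response. A minimal check, assuming the endpoint still returns the same layout:

import requests

# Fetch one page of the recommendation list and print the first item,
# just to verify the JSON layout before building the downloader.
api = "https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page=1&pagesize=25"
resp = requests.get(api, headers={'user-agent': 'Mozilla/5.0'})
items = resp.json()['content']['list']
print(items[0]['playurl'])  # direct video link returned by the server
print(items[0]['alias'])    # video title, later used as the file name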
2 Code
Version 0.1
import requests
import re
import os

# Strip characters that are not allowed in file names
# (keeps only the Chinese characters of the title)
def flash_chart(name):
    new_filename = re.sub('[^\u4e00-\u9fa5]+', '', name)
    return new_filename

# 六间房 mini-video list API; no login required
url = "https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page=2&pagesize=25"
header = {
    'referer': 'https://v.6.cn/minivideo/',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="90", "Microsoft Edge";v="90"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.42',
    'x-requested-with': 'XMLHttpRequest'
}

response = requests.get(url=url, headers=header)
json_data = response.json()
data = json_data['content']['list']

# Create the download directory (os.mkdir returns None, so keeping its
# return value is pointless; exist_ok avoids an error on a second run)
os.makedirs('2', exist_ok=True)

for i in data:
    video_url = i['playurl']
    video_title = flash_chart(i['alias'])
    video = requests.get(video_url)
    with open(os.path.join('2', video_title + '.mp4'), 'wb') as f:
        f.write(video.content)
Version 0.2
import requests
import re
import os
import time

# 六间房 mini-video list API; no login required.
# The page number is appended per request; pagesize stays at 25.
url = "https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page="

# Build the list of page URLs to crawl
def page_select(num):
    url_list = []
    for x in range(1, num + 1):
        url_list.append(url + str(x) + '&pagesize=25')
    return url_list

header = {
    'referer': 'https://v.6.cn/minivideo/',
    'sec-ch-ua': '"Not A;Brand";v="99", "Chromium";v="90", "Microsoft Edge";v="90"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.42',
    'x-requested-with': 'XMLHttpRequest'
}

# Strip characters that are not allowed in file names
# (keeps only the Chinese characters of the title)
def flash_chart(name):
    new_filename = re.sub('[^\u4e00-\u9fa5]+', '', name)
    return new_filename

# Collect the download links and titles from every page
def get_url_list(page_list, header):
    video_url, video_title = [], []
    for x in page_list:
        response = requests.get(url=x, headers=header)
        json_data = response.json()
        data = json_data['content']['list']
        for i in data:
            video_url.append(i['playurl'])
            video_title.append(flash_chart(i['alias']))
    return video_url, video_title

# Create the download directory; keep asking until the name is not taken
def create_dir(dir_name):
    while os.path.isdir(dir_name):
        dir_name = input('Directory already exists, enter another name: ')
    os.mkdir(dir_name)
    return dir_name

# Download every video into the directory
def download_video(url, title, dir_name):
    for i, j in zip(url, title):
        time.sleep(0.1)  # small pause between requests
        video = requests.get(i)
        with open(os.path.join(dir_name, j + '.mp4'), 'wb') as f:
            f.write(video.content)

def main():
    print('-------- 六间房 video crawler --------')
    print('--------------------------------------')
    page = int(input('Number of pages to crawl: '))
    page_num = page_select(page)
    data = get_url_list(page_num, header)
    video_url = data[0]
    video_title = data[1]
    dir_name = input('Directory to save the files in: ')
    dir_n = create_dir(dir_name)
    print('Crawling, please wait...')
    download_video(video_url, video_title, dir_n)
    print('Done')

if __name__ == "__main__":
    main()
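One possible refinement, not part of either version above: stream each video to disk in chunks instead of loading the whole file into memory, and skip links that fail to download. This is only a sketch using requests' stream=True and iter_content; the signature matches download_video, so main() could call it unchanged.

import os
import requests

# Streaming variant of download_video (a sketch, not the original code):
# writes each video chunk by chunk and skips links that cannot be fetched.
def download_video_streamed(urls, titles, dir_name):
    for link, title in zip(urls, titles):
        path = os.path.join(dir_name, title + '.mp4')
        try:
            with requests.get(link, stream=True, timeout=30) as video:
                video.raise_for_status()
                with open(path, 'wb') as f:
                    for chunk in video.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
        except requests.RequestException as err:
            print('Skipping', title, '-', err)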