六间房视频爬取

1 观察网站

打开六间房网站的小视频页面(https://v.6.cn/minivideo/),再打开浏览器的开发者工具,如下图所示:
[图片]

可以看到请求资源链接:

https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page=3&pagesize=25

查看预览(Preview)可以看到服务器返回的资源链接:
[图片]

由此可以得到爬虫思路:

  1. 请求资源链接
  2. 读取链接的响应数据
  3. 创建文件夹
  4. 请求下载链接资源,将其放入文件夹中

2 代码

版本 0.1

import requests
import re
import os

# Reduce a video title to characters that are safe in a file name
def flash_chart(str):
    """Return ``str`` with every run of non-Chinese characters removed.

    Only characters in the range U+4E00..U+9FA5 are kept; letters, digits,
    punctuation and whitespace are all dropped, so the result can be an
    empty string.
    """
    non_chinese = re.compile('[^\u4e00-\u9fa5]+')
    return non_chinese.sub('', str)


# Six Rooms (v.6.cn) mini-video list endpoint; no login required
url = "https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page=2&pagesize=25"

# Headers that make the request look like the browser's own XHR call.
# NOTE(review): the 'sec-ch-ua' value looks like it lost its outer double
# quotes when it was copied from the browser - confirm before relying on it.
header ={
'referer': 'https://v.6.cn/minivideo/',
'sec-ch-ua': 'Not A;Brand";v="99", "Chromium";v="90", "Microsoft Edge";v="90',
'sec-ch-ua-mobile': '?0',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.42',
'x-requested-with': 'XMLHttpRequest'
}

# Fetch one page of the video list; fail loudly on an HTTP error instead of
# trying to parse an error page as JSON, and never wait forever.
response = requests.get(url = url, headers = header, timeout = 10)
response.raise_for_status()
json_data = response.json()
data = json_data['content']['list']

# Create the output directory; exist_ok lets the script be run again
directory = '2'
os.makedirs(directory, exist_ok=True)


for index, i in enumerate(data):
    video_url = i['playurl']
    # flash_chart() drops every non-Chinese character, so a title may come
    # back empty; fall back to a numbered name so files do not overwrite
    # each other as ".mp4".
    video_title = flash_chart(i['alias']) or 'video_' + str(index)
    video = requests.get(video_url, timeout = 30)
    if not video.ok:
        # Do not save an HTTP error body as if it were a video
        print('下载失败,已跳过:' + video_url)
        continue
    # Open the file only after the download succeeded, and let the context
    # manager close it; os.path.join keeps the path portable.
    with open(os.path.join(directory, video_title + '.mp4'), 'wb') as f:
        f.write(video.content)

版本 0.2

import requests
import re
import os
import time



# Six Rooms (v.6.cn) mini-video list endpoint; no login required.
# page_select() appends the page number and the page size to this prefix.
url = "https://v.6.cn/minivideo/getMiniVideoList.php?act=recommend&page="


# Choose how many pages to crawl
def page_select(num, pagesize=25):
    """Build the list-endpoint URLs for pages 1..``num``.

    Args:
        num: Number of pages to crawl; 0 or a negative value yields ``[]``.
        pagesize: Value sent as the ``pagesize`` query parameter
            (defaults to 25, the value that used to be hard-coded).

    Returns:
        A list of URL strings, one per page, in ascending page order.
    """
    suffix = '&pagesize=' + str(pagesize)
    return [url + str(page) + suffix for page in range(1, num + 1)]


# Headers sent with every list request so that it resembles the XHR call
# the site's own page makes from a browser.
# NOTE(review): the 'sec-ch-ua' value looks like it lost its outer double
# quotes when it was copied from the browser - confirm before relying on it.
header = {
    'referer': 'https://v.6.cn/minivideo/',
    'sec-ch-ua': 'Not A;Brand";v="99", "Chromium";v="90", "Microsoft Edge";v="90',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.42'
    ),
    'x-requested-with': 'XMLHttpRequest',
}

# Filter out characters that are not wanted in a file name
def flash_chart(str):
    """Return ``str`` with every run of non-Chinese characters removed.

    Only characters in the range U+4E00..U+9FA5 are kept; letters, digits,
    punctuation and whitespace are all dropped, so the result can be an
    empty string.
    """
    non_chinese = re.compile('[^\u4e00-\u9fa5]+')
    return non_chinese.sub('', str)

# Collect the download links and titles from the list pages
def get_url_list(page_list,header):
    """Fetch every list page and gather the video URLs and their titles.

    Args:
        page_list: Iterable of list-endpoint URLs to request.
        header: Dict of HTTP headers sent with each request.

    Returns:
        A tuple ``(video_url, video_title)`` of two parallel lists: the
        ``playurl`` of each entry, and its ``alias`` after it has been
        passed through ``flash_chart``.

    Raises:
        requests.HTTPError: If a list page answers with an error status.
        requests.Timeout: If a list page does not answer in time.
    """
    video_url, video_title = [], []
    for page_url in page_list:
        # A timeout keeps one stalled connection from hanging the crawl
        response = requests.get(url=page_url, headers=header, timeout=10)
        # Report an HTTP error as such rather than as a JSON decode failure
        response.raise_for_status()
        for item in response.json()['content']['list']:
            video_url.append(item['playurl'])
            video_title.append(flash_chart(item['alias']))
    return video_url, video_title


# Create the output directory
def create_dir(dir_name):
    """Create a new directory and return the name that was actually used.

    If ``dir_name`` is empty or already names an existing directory, the
    user is asked for another name until a usable one is given. The
    original code asked once but then discarded the answer and returned the
    old name without creating anything.

    Args:
        dir_name: Desired directory path; missing parent directories are
            created as well.

    Returns:
        The path of the directory that was created.
    """
    while not dir_name or os.path.isdir(dir_name):
        dir_name = input("重新输入目录名字")
    os.makedirs(dir_name)
    return dir_name

# Download the videos
def download_video(url,title,dir_name):
    """Download each video and save it as ``<title>.mp4`` in ``dir_name``.

    Args:
        url: Iterable of video URLs.
        title: Iterable of file-name stems, parallel to ``url``. An empty
            stem is replaced by ``video_<index>`` so that such files do not
            all overwrite each other as ``.mp4``.
        dir_name: Existing directory that receives the files.

    A download that answers with an HTTP error status is reported on
    stdout and skipped, so no error page is saved as a video.
    """
    for index, (video_link, name) in enumerate(zip(url, title)):
        time.sleep(0.1)  # short pause between requests
        video = requests.get(video_link, timeout=30)
        if not video.ok:
            print('下载失败,已跳过:' + video_link)
            continue
        file_name = (name or 'video_' + str(index)) + '.mp4'
        # Open the file only after the download succeeded and let the
        # context manager close it; os.path.join replaces the hard-coded
        # backslash, which only worked on Windows.
        with open(os.path.join(dir_name, file_name), 'wb') as f:
            f.write(video.content)


def main():
    """Run the interactive crawl.

    Asks for the number of pages, collects the video links and titles,
    then asks for the output directory and downloads everything into it.
    """
    for banner in ('--------六间房视频爬取--------',
                   '------------------------------'):
        print(banner)
    page_count = int(input('输入爬取页数:'))
    links, titles = get_url_list(page_select(page_count), header)
    target_dir = create_dir(input('输入存取文件路径:'))
    print('正在爬取请稍后!!!')
    download_video(links, titles, target_dir)
    print('完成')
    

# Run the crawler only when the file is executed as a script
if __name__ == "__main__":
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值