YouTube 爬虫搭建

本文介绍了一个使用 Scrapy 框架搭建的 YouTube 视频爬虫项目。爬虫能够抓取视频列表、解析视频详情(包括视频标题、播放时长、上传者、发布时间、缩略图、视频源链接等信息),并下载视频。此外,还提供了获取用户头像和搜索相关视频的功能。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

YouTube 爬虫搭建
原创置顶 张小竟 最后发布于2019-09-05 19:32:23 阅读数 2035  收藏
展开
一、scrapy代码

 

# encoding=utf-8
import json
import re
from urlparse import urljoin
from pytube import YouTube
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from videos2.items import VideoItem
from videos2.util import getImage,getVideo
class VideoSiper(scrapy.Spider):
    """Spider that walks YouTube search results and yields one VideoItem per video.

    Request flow::

        start_requests -> parse -> parse_video_list -> parse_video_url -> parse_avatar

    Every XPath below targets specific id/class names in the served HTML
    (``item-section``, ``yt-uix-tile-link``, ``watch7-content``, ...). Pages
    that do not contain those nodes simply produce no requests or items.
    """

    name = 'video-youtube'

    def parse(self, response):
        """Handle the first result page and follow its pagination links.

        Yields the detail-page requests found on this page, then one request
        per pagination link, each handled by :meth:`parse_video_list`.
        """
        sel = response.selector
        # parse_video_list is a generator: calling it without iterating
        # creates the generator object and throws it away, so its requests
        # must be re-yielded here to reach the scheduler.
        for request in self.parse_video_list(response):
            yield request
        for href in sel.xpath('//div[contains(@class,"branded-page-box")]/a/@href').extract():
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_list)

    def start_requests(self):
        """Seed the crawl with a single hard-coded search-results URL."""
        url = 'https://www.youtube.com/results?search_query=%E6%97%85%E8%A1%8C'
        yield Request(url=url)

    # List page
    def parse_video_list(self, response):
        """Yield one detail-page request per (link, duration) pair on a list page.

        The duration text is forwarded to the detail callback through
        ``meta['length']``.
        """
        sel = response.selector
        hrefs = sel.xpath('//ol[contains(@class,"item-section")]/li//a[contains(@class,"yt-uix-tile-link")]/@href').extract()
        lengths = sel.xpath('//ol[contains(@class,"item-section")]/li//div[contains(@class,"yt-thumb")]//span[contains(@class,"video-time")]/text()').extract()
        # NOTE(review): the two lists are paired purely by position and zip()
        # stops at the shorter one; an entry that has a link but no duration
        # shifts every later pairing. Confirm that this is acceptable.
        for href, length in zip(hrefs, lengths):
            yield Request(url=urljoin(response.url, href),
                          callback=self.parse_video_url,
                          meta={"length": length})

    # Detail page
    def parse_video_url(self, response):
        """Build a VideoItem from a watch page and queue the follow-up requests.

        Yields a request for the ``get_endscreen`` URL (carrying the partially
        filled item in ``meta['item']``, completed by :meth:`parse_avatar`),
        followed by one request per related-video link on the page.
        Pages without a title inside ``watch7-content`` are logged and skipped.
        """
        sel = response.selector
        url = response.url

        container = sel.xpath('//div[contains(@id,"watch7-content")]')
        # './/' keeps the lookup inside the container; a leading '//' on a
        # sub-selector searches the whole document instead.
        content = container.xpath('.//meta[contains(@itemprop,"name")]/@content').extract_first()
        if content is None:
            # XPath lookups return empty results rather than raising, so a
            # missing node has to be detected explicitly.
            self.logger.warning('Invalid response: %s', url)
            self.logger.debug(response.body)
            return

        # Requests for related videos carry no 'length' in meta, so a plain
        # meta['length'] would raise KeyError for every one of them.
        videoPlayTimes = response.meta.get('length')

        user = sel.xpath('//div[contains(@id,"watch7-user-header")]//span[contains(@class,"yt-thumb-clip")]//img/@alt').extract_first()
        time = sel.xpath('//meta[contains(@itemprop,"datePublished")]/@content').extract_first()
        ShowImg = sel.xpath('//link[contains(@itemprop,"thumbnailUrl")]/@href').extract()
        # NOTE(review): getVideo is a project helper called synchronously
        # inside the callback; if it downloads the file it blocks the crawl
        # for that long. Confirm this is intended.
        realvideo1 = getVideo(url)

        # Assemble the item
        videoItem = VideoItem()
        videoItem['content'] = content
        videoItem['user'] = user
        videoItem['source'] = 'youtube'
        videoItem['types'] = 'video'
        videoItem['time'] = time
        videoItem['ShowImg'] = ShowImg
        videoItem['realvideo1'] = realvideo1
        videoItem['videoPlayTimes'] = videoPlayTimes
        videoItem['url'] = url

        # The original replaced the misspelling 'wacth', which never matches,
        # so the URL was left unchanged. Only the first occurrence (the path
        # segment) is rewritten, so a video id containing 'watch' is untouched.
        tmpUrl = url.replace('watch', 'get_endscreen', 1)
        yield Request(url=tmpUrl, callback=self.parse_avatar, meta={'item': videoItem})

        # Follow related videos listed on the page
        for href in sel.xpath('//li[contains(@class,"video-list-item")]//a/@href').extract():
            yield Request(url=urljoin(url, href), callback=self.parse_video_url)

    def parse_avatar(self, response):
        """Add the uploader's avatar to the item from ``meta['item']`` and yield it.

        If the payload cannot be decoded or lacks the expected keys, the item
        is still yielded with both avatar fields set to ``None`` so the
        already-scraped video data is not lost.
        """
        videoItem = response.meta.get('item', VideoItem())
        try:
            # The first four bytes are skipped before JSON decoding --
            # presumably an anti-hijacking prefix such as ")]}'"; confirm.
            html_text = json.loads(response.body[4:])['payload']['list_html']
            user_avatar_old = html_text['elements'][0]['endscreenElementRenderer']['image']['thumbnails'][0]['url']
        except (ValueError, KeyError, IndexError, TypeError):
            self.logger.warning('Could not read avatar from: %s', response.url)
            user_avatar_old = None

        if user_avatar_old is not None:
            user_avatar = getImage(user_avatar_old)
        else:
            user_avatar = None
        videoItem['user_avatar'] = user_avatar
        videoItem['user_avatar_old'] = user_avatar_old
        yield videoItem
 

 

二、util.py

 

 

 

def getVideo(url):
    """Download the video at *url* and pass it to ``qiniu_video`` under a random key.

    The key is the SHA-1 hex digest of 24 random bytes plus ``".mp4"``.

    Returns the key on success so the caller can record it, or ``None`` when
    any step fails. Failures are reported on stdout and never propagate
    (best-effort, as in the original).

    NOTE(review): the original never returned the key, so callers always
    received ``None``; returning it assumes the caller wants the storage key.
    Confirm against ``qiniu_video``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Downloading. video..%s' % url)
    key = hashlib.sha1(os.urandom(24)).hexdigest() + ".mp4"
    try:
        yt = YouTube(url)
        # Takes the last entry of the 'mp4' filter result.
        video = yt.filter('mp4')[-1]
        video.download(base_url)
        qiniu_video(key, video.filename + '.mp4')
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit can still stop the process.
        print('Downloading. video. error.%s' % url)
        return None
    return key
 

 
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值