Building a YouTube crawler
1. Scrapy spider code
# encoding=utf-8
import json
from urlparse import urljoin

import scrapy
from scrapy.http import Request

from videos2.items import VideoItem
from videos2.util import getImage, getVideo


class VideoSiper(scrapy.Spider):
    name = 'video-youtube'

    def start_requests(self):
        # Search-result page for the keyword "旅行" (travel), URL-encoded
        url = 'https://www.youtube.com/results?search_query=%E6%97%85%E8%A1%8C'
        yield Request(url=url)

    def parse(self, response):
        sel = response.selector
        # Parse the videos on the first result page, then follow the
        # pagination/channel links in the branded-page box
        for request in self.parse_video_list(response):
            yield request
        for href in sel.xpath('//div[contains(@class,"branded-page-box")]/a/@href').extract():
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_list)

    # List page: collect video links and their durations
    def parse_video_list(self, response):
        sel = response.selector
        hrefs = sel.xpath('//ol[contains(@class,"item-section")]/li'
                          '//a[contains(@class,"yt-uix-tile-link")]/@href').extract()
        lengths = sel.xpath('//ol[contains(@class,"item-section")]/li'
                            '//div[contains(@class,"yt-thumb")]'
                            '//span[contains(@class,"video-time")]/text()').extract()
        for href, length in zip(hrefs, lengths):
            yield Request(url=urljoin(response.url, href),
                          callback=self.parse_video_url,
                          meta={'length': length})

    # Detail page: extract metadata, download the video, then fetch the avatar
    def parse_video_url(self, response):
        sel = response.selector
        meta = response.meta
        url = response.url
        tmp = sel.xpath('//div[contains(@id,"watch7-content")]')
        if not tmp:
            self.logger.warning('Invalid response: %s' % response.url)
            self.logger.warning(response.body)
            return
        content = tmp.xpath('//meta[contains(@itemprop,"name")]/@content').extract()[0]
        videoPlayTimes = meta['length']
        user = sel.xpath('//div[contains(@id,"watch7-user-header")]'
                         '//span[contains(@class,"yt-thumb-clip")]//img/@alt').extract()[0]
        time = sel.xpath('//meta[contains(@itemprop,"datePublished")]/@content').extract()[0]
        ShowImg = sel.xpath('//link[contains(@itemprop,"thumbnailUrl")]/@href').extract()
        realvideo1 = getVideo(url)
        # Assemble the item
        videoItem = VideoItem()
        videoItem['content'] = content
        videoItem['user'] = user
        videoItem['source'] = 'youtube'
        videoItem['types'] = 'video'
        videoItem['time'] = time
        videoItem['ShowImg'] = ShowImg
        videoItem['realvideo1'] = realvideo1
        videoItem['videoPlayTimes'] = videoPlayTimes
        videoItem['url'] = response.url
        # The matching get_endscreen endpoint carries the uploader's avatar
        tmpUrl = url.replace('watch', 'get_endscreen')
        yield Request(url=tmpUrl, callback=self.parse_avatar, meta={'item': videoItem})
        # Also follow the related videos listed on the detail page
        for href in sel.xpath('//li[contains(@class,"video-list-item")]//a/@href').extract():
            yield Request(url=urljoin(response.url, href), callback=self.parse_video_url)
    # Avatar endpoint: the response starts with an anti-hijacking prefix,
    # so strip the first 4 bytes before parsing the JSON payload
    def parse_avatar(self, response):
        payload = json.loads(response.body[4:])['payload']
        videoItem = response.meta.get('item', VideoItem())
        user_avatar_old = payload['elements'][0]['endscreenElementRenderer']['image']['thumbnails'][0]['url']
        user_avatar = getImage(user_avatar_old)
        videoItem['user_avatar'] = user_avatar
        videoItem['user_avatar_old'] = user_avatar_old
        yield videoItem
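The spider imports VideoItem from videos2.items, which the post does not show. Below is a minimal sketch of items.py using only the fields the spider actually assigns; the field names are taken from the code above, while the comments are my interpretation of them.

import scrapy

class VideoItem(scrapy.Item):
    content = scrapy.Field()          # video title
    user = scrapy.Field()             # uploader name
    source = scrapy.Field()           # fixed to 'youtube'
    types = scrapy.Field()            # fixed to 'video'
    time = scrapy.Field()             # publish date
    ShowImg = scrapy.Field()          # thumbnail URL(s)
    realvideo1 = scrapy.Field()       # downloaded/re-hosted video key
    videoPlayTimes = scrapy.Field()   # duration string from the list page
    url = scrapy.Field()              # YouTube watch URL
    user_avatar = scrapy.Field()      # re-hosted avatar
    user_avatar_old = scrapy.Field()  # original avatar URL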
2. util.py
import hashlib
import os

from pytube import YouTube

# base_url (local download dir) and qiniu_video (Qiniu upload helper) are defined elsewhere in util.py.

def getVideo(url):
    print 'Downloading video... %s' % url
    key = hashlib.sha1(os.urandom(24)).hexdigest() + '.mp4'  # object name used on Qiniu
    try:
        yt = YouTube(url)
        video = yt.filter('mp4')[-1]  # highest-resolution mp4 (legacy pytube API)
        video.download(base_url)
        qiniu_video(key, video.filename + '.mp4')
        return key  # no return in the original snippet; the spider stores this as realvideo1
    except Exception:
        print 'Downloading video error: %s' % url
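getImage, qiniu_video and base_url are referenced here but not shown in the post. The following is a rough sketch of what they could look like with the official Qiniu Python SDK; the access keys, bucket name and base_url value are placeholders, and the whole block is an assumption rather than the original implementation.

import hashlib
import os

import requests
from qiniu import Auth, put_file

# Placeholder values -- the real ones live in the project's configuration
base_url = '/tmp/videos/'
QINIU_AK, QINIU_SK, BUCKET = 'your-access-key', 'your-secret-key', 'your-bucket'

def _upload(key, local_path):
    # Upload a local file to Qiniu under the given object key
    q = Auth(QINIU_AK, QINIU_SK)
    token = q.upload_token(BUCKET, key, 3600)
    put_file(token, key, local_path)

def qiniu_video(key, filename):
    _upload(key, os.path.join(base_url, filename))

def getImage(url):
    # Download the avatar, re-host it on Qiniu, and return its object key
    key = hashlib.sha1(url.encode('utf-8')).hexdigest() + '.jpg'
    local_path = os.path.join(base_url, key)
    with open(local_path, 'wb') as f:
        f.write(requests.get(url, timeout=10).content)
    _upload(key, local_path)
    return key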
This article describes a YouTube video crawler built with the Scrapy framework. The spider crawls video list pages and parses video detail pages, extracting the title, duration, uploader, publish date, thumbnail and source video link, and downloads the videos. It also fetches the uploader's avatar and follows related videos from the detail page.
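Assuming a standard Scrapy project named videos2, the spider can be launched with "scrapy crawl video-youtube" from the project root, or programmatically, for example:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the spider defined above using the project's settings
process = CrawlerProcess(get_project_settings())
process.crawl('video-youtube')
process.start()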