import requests
from lxml import etree
import os
import time
import random
def get_html(url):
    """Fetch *url* and return its decoded HTML text, or None on failure.

    Fixes in this revision:
    - Original returned ``html.text`` after the try/except, so a failed
      ``requests.get`` left ``html`` unbound and raised UnboundLocalError.
    - A non-200 status was silently treated as success; now returns None.
    - Added a timeout so a stalled connection cannot hang the script.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5918.400 LBBROWSER/10.1.3090.400'}
    try:
        html = requests.get(url, headers=headers, timeout=10)
        # apparent_encoding guesses the real charset from the body bytes.
        html.encoding = html.apparent_encoding
        if html.status_code == 200:
            print('获取源代码成功')
            return html.text
    except Exception as e:
        print('获取源代码失败:%s' % e)
    # Reached on non-200 status or on exception.
    return None
def parse_html(html):
    """Parse a listing page and return ``(videourls, names)`` as parallel lists.

    Each ``li`` under ``div.j-r-list`` is one post; the title comes from the
    description anchor and the video URL from the '下载视频' link.

    Fixes in this revision: the original indexed ``xpath(...)[0]`` blindly,
    so any post missing a title or a download link raised IndexError and
    killed the whole page — such posts are now skipped.
    """
    tree = etree.HTML(html)
    lis = tree.xpath('//div[@class="j-r-list"]/ul/li')  # one li per post
    videourls = []
    names = []
    for li in lis:
        titles = li.xpath(".//div[@class='j-r-list-c-desc']/a/text()")
        hrefs = li.xpath(".//li[@title='下载视频']/a/@href")
        if not titles or not hrefs:
            # Post without a title or a downloadable video — skip it.
            continue
        videourls.append(hrefs[0])
        names.append(titles[0])
    return (videourls, names)
def downloadvideo(url, name, save_dir=r'E:\pycharm项目\爬虫初战\百思不得姐video'):
    """Download the video at *url* and save it as ``<name>.mp4`` in *save_dir*.

    Parameters:
        url: direct link to the video file.
        name: filename stem for the saved .mp4 (assumed to contain no
              characters invalid on the target filesystem — TODO confirm).
        save_dir: destination directory; defaults to the original hard-coded
                  path so existing callers keep their behavior.

    Fixes in this revision:
    - ``os.makedirs(..., exist_ok=True)`` replaces the racy listdir-check +
      ``os.mkdir`` pair and no longer hard-codes the parent path twice.
    - Writes to an explicit joined path instead of calling ``os.chdir``,
      which leaked a working-directory change to the rest of the process.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5918.400 LBBROWSER/10.1.3090.400'}
    os.makedirs(save_dir, exist_ok=True)
    # .content gives raw bytes — video/audio/images must be saved in binary.
    video = requests.get(url, headers=headers).content
    target = os.path.join(save_dir, name + '.mp4')
    with open(target, 'wb') as f:
        print('正在下载%s' % url)
        f.write(video)
if __name__ == "__main__":
    # Ask how many listing pages to scrape; pages are 1-based in the URL.
    page = int(input('你要下载几页:'))
    for page_num in range(page):  # distinct name: the original reused `i` in both loops
        url = 'http://www.budejie.com/video/' + str(page_num + 1)  # range starts at 0, so +1
        html = get_html(url)
        if html is None:
            # Fetch failed for this page — skip it instead of crashing.
            continue
        # Parse once per page (the original called parse_html twice).
        videourls, names = parse_html(html)
        # zip() walks exactly the items found, instead of a hard-coded
        # range(20) that raised IndexError on shorter pages.
        for videourl, name in zip(videourls, names):
            # Randomized delay between requests to look less bot-like.
            time.sleep(random.randint(1, 4) + random.random())
            downloadvideo(videourl, name)
# 对于爬虫的掌握还不太好,没有太深入,对于大型网站的反爬措施束手无策。
# 望各位大神指点迷津。