更新了GUI版本的下载功能,相较于GitHub版本的代码,在GUI版本弃用了一部分资源,可能对检索结果产生部分影响。
下图为2020-08-07更新后的效果图,可直接跳到后面查看。
这是非gui版的爬虫源码,功能更强大,支持视频下载,逻辑也更加完整,感兴趣可以看看
https://github.com/iskangkanga/PiracyVideo
简单实现GUI版本的视频爬虫
效果图:
代码:
import re
from service.geter import get_response
import requests
import traceback
from tkinter import *
def get_source():
    """Handle the first search button: query the site and print the results.

    Reads the keyword from the ``inp1`` entry widget, POSTs it to the site's
    search endpoint, and writes each matched title (with a human-readable
    category label) into the ``txt`` text widget.  Returns None.
    """
    search_url = 'https://www.yszxwang.com/search.php'
    data = {
        'searchword': inp1.get()
    }
    resp = requests.post(search_url, data=data, timeout=15)
    # If the site reports no hits, show an error line and stop early.
    if '对不起,没有找到' in resp.text:
        txt.insert(END, 'ERROR, 网站检索无此结果!' + '\n')
        return
    # Pull the result titles and their category slugs out of the result page.
    # (Raw strings for the regexes: '\d' in a plain literal is a deprecated
    # escape sequence in modern Python.)
    titles = re.findall(r'<h3><a.*?>(.*?)</a></h3>', resp.text)
    types = re.findall(r'<h3><a href="/(.*?)/.*?\d+/">.*?</a></h3>', resp.text)
    # Known category slugs so far: tv=series, dm=anime, dy=movie, zy=variety.
    # Unknown slugs fall through unchanged, matching the original if/elif
    # chain's behavior.
    type_map = {'tv': '电视剧', 'dm': '动漫', 'dy': '电影', 'zy': '综艺'}
    type_list = [type_map.get(t, t) for t in types]
    # NOTE: the original also extracted and de-duplicated detail-page URLs
    # here but never used them (get_source1 redoes that work); the dead
    # computation — which also mutated a list while iterating it — is removed.
    txt.insert(END, '已为你检索到结果如下:' + '\n')
    for i, title in enumerate(titles):
        txt.insert(END, str(i + 1) + ' ' + title + ' ' + type_list[i] + '\n')
def get_source1():
    """Re-run the search and return the deduplicated detail-page paths.

    Issues the same POST as get_source(), but instead of printing, returns
    the list of relative detail-page URLs — first-seen order preserved,
    duplicates and over-long (>60 chars) matches dropped — so begin_parse()
    can index into it with the user's 1-based choice.
    """
    search_url = 'https://www.yszxwang.com/search.php'
    data = {
        'searchword': inp1.get()
    }
    resp = requests.post(search_url, data=data, timeout=15)
    urls = re.findall(r'href="(/[a-z]+?/.*?\d+/)"', resp.text)
    # Filter with a comprehension instead of list.remove() inside the loop:
    # removing while iterating skips the element right after each removal,
    # so consecutive over-long URLs could survive (original bug).
    urls = [u for u in urls if len(u) <= 60]
    # De-duplicate while preserving first-seen order.
    r_urls = []
    for u in urls:
        if u not in r_urls:
            r_urls.append(u)
    return r_urls
def begin_parse():
    """Handle the second button: dispatch parsing for the chosen result.

    Reads the 1-based result index from ``inp2`` and the format flag from
    ``inp3`` ('1' = single-episode title, anything else = multi-episode),
    rebuilds the result list via get_source1(), and hands the absolute
    detail-page URL to the matching parser.
    """
    selection = int(inp2.get())
    candidates = get_source1()
    detail_url = 'https://www.yszxwang.com' + candidates[selection - 1]
    if inp3.get() == '1':
        # Single-episode (e.g. a movie).
        parse_alone(detail_url)
    else:
        # Multi-episode (series / anime / variety show).
        parse_many(detail_url)
def parse_alone(detail_url):
    """Resolve the playable URL for a single-episode title and display it.

    Fetches the detail page, scrapes the candidate play-page links, asks
    get_real_url() for the first working stream URL, and writes the outcome
    into the ``txt2`` text widget.
    """
    page = get_response(detail_url)
    if not page:
        txt2.insert(END, '失败' + '\n')
        return
    # Candidate play-page links embedded in the detail page.
    candidates = re.findall('a title=.*? href=\'(.*?)\' target="_self"', page)
    playable = get_real_url(candidates)
    # get_real_url returns 'defeat' on failure, so 'http' marks success.
    if 'http' not in playable:
        txt2.insert(END, "解析失败" + '\n')
        return
    txt2.insert(END, "复制到浏览器播放" + '\n')
    txt2.insert(END, playable + '\n')
def get_real_url(play_urls):
    """Probe each play-page path and return the first working stream URL.

    For every relative play-page path: fetch the page, extract the stream
    URL from the inline ``var now="http..."`` script, and verify the stream
    URL itself responds.  Returns the first verified URL, or the sentinel
    string 'defeat' when none works (callers test for 'http' in the result).
    """
    for play_page_url in play_urls:
        play_page_url = 'https://www.yszxwang.com' + play_page_url
        resp1 = get_response(play_page_url)
        if not resp1:
            continue
        # Guard against pages missing the 'var now' script: the original
        # called .group(1) directly on re.search()'s result, which is None
        # in that case and crashed with AttributeError.
        match = re.search(r'var now="(http.*?)"', resp1)
        if not match:
            continue
        data_url = match.group(1).strip()
        # Confirm the stream URL actually responds before returning it.
        if get_response(data_url):
            return data_url
    return 'defeat'
def parse_many(detail_url):
    """Resolve and display playable URLs for every episode of a title.

    Collects the per-episode play-page lists via get_all_source(), resolves
    them through get_many_real_url(), and writes one numbered line per
    episode into the ``txt2`` text widget (or a failure message when
    resolution did not yield a list).
    """
    all_play_list, play_num = get_all_source(detail_url)
    resolved = get_many_real_url(all_play_list, play_num)
    # A non-list result signals that resolution failed.
    if not isinstance(resolved, list):
        txt2.insert(END, "解析失败" + '\n')
        return
    txt2.insert(END, '已为你检索到结果如下:' + '\n')
    txt2.insert(END, "复制到浏览器播放" + '\n')
    for idx, link in enumerate(resolved):
        txt2.insert(END, "第" + str(idx + 1) + '集' + ' ' + link + '\n')
def get_many_real_url(all_play_list, play_num):
for i, play_list in enumerate(all_play_list):
many_data_url = []
for j, play_page_url in enumerate(play_list):
play_page_url = 'https://www.yszxwang.com' + play_page_url
resp1 = get_response(play_page_url)
if resp1:
data_url = re.search('var now="(http.*?)"', resp1).group(1).strip()