This follows on from "Implementing automatic download of crosstalk and pingshu with Python (Part 1)".
The previous code managed to scrape Guo Degang's crosstalk from that pingshu site, but I still had to cut the download links out by hand every time, which was tedious. That site also had relatively few pingshu resources and playback failed now and then, so I switched to a different site.
This time the bar is higher: the script should scrape the download links automatically, and when a show spans multiple pages it should jump to the next page on its own and keep scraping.
Implementation approach:
1. Manually open the first page (this will be automated later) and fetch the page source first;
driver = webdriver.Chrome()
targetUrl = 'https://www.pingshu8.com/MusicList/mmc_63_5764_1.Htm'
# targetUrl = 'https://www.pingshu8.com/Musiclist/mmc_63_5764_3.htm'
headUrl = 'https://www.pingshu8.com'
session = HTMLSession()
def openDriver(url):
    driver.get(url)  # open the page in the browser
def getAllSession(url):  # all links on the current page
    downLink = []
    r = session.get(url)
    link = r.html.absolute_links  # a set of every absolute URL on the page
    downLink.append(link)  # put the whole set into the list
    if downLink != None:
        return downLink
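For reference, requests_html's absolute_links is a set of every absolute URL found on the page, so getAllSession() returns a one-element list holding that set; the functions below stringify the set and run regular expressions over it. A minimal sketch of what the return value looks like, using the same targetUrl:
links = getAllSession(targetUrl)
print(type(links[0]))         # <class 'set'> -- absolute_links is an unordered set
for url in sorted(links[0]):  # sorted only to make the printout stable
    print(url)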
2. Extract the largest page number from the page source. There are a few pitfalls here. The jump link is a string whose trailing number is the page index (for example, \xxx\xx_7899_7979_10, where the final 10 means there are 10 pages in total). Also, the first jump link points to page 2 (page 1 was already opened by hand, so the automatic crawl effectively starts from page 2). So three cases need handling:
1) if there is only one page, there is no next-page link at all; without a check, the empty result makes the code blow up;
2) call the next page (page 2) num1 and the last page num2; if there happen to be exactly two pages, num2 is empty and needs its own check;
3) after grabbing the jump links, the largest page *number* still has to be cut out with a regex (in the example above, num2 = 10), compared, and then used to build every jump URL.
(Some of this only occurred to me later; this v1.1 version does not implement all of these checks. A more defensive sketch follows the jump_link() function below.)
def jump_link():  # get the page-jump links from the first page
    for i in getAllSession(targetUrl):  # i is the set of links on the page
        pattern = re.compile(r'\/Musiclist\/mmc_\w+\.htm')  # regex for the page-jump URLs
        m = pattern.findall(str(i))  # find every page link, i.e. how many pages there are
        # print(m)
    '''
    Cut the final page number out with split
    '''
    pagenum_list = []
    for n in m:
        pagename = re.split(r'\/Musiclist\/mmc_\d{2}_\d{0,}_', n, maxsplit=0, flags=0)  # strip the prefix before the page number
        # print('pagename:',pagename)
        pagenum = pagename[1].split('.htm')  # strip the suffix after the page number
        pagenum_list.append(pagenum)
    def end_num():  # the final (largest) page number
        num1 = pagenum_list[0][0]
        num2 = pagenum_list[1][0]
        if int(num1) < int(num2):  # compare numerically: as strings, '2' > '10'
            # print('num1:',num1,type(num1))
            return num2
        else:
            # print('num2:',num2,type(num2))
            return num1
    '''
    With the total page count known, build the jump link for every page
    '''
    pagejump = []
    pages = 2
    # print(end_num())
    num = end_num()
    num = int(num)
    result1 = re.match(r'(\/Musiclist\/mmc_\d{2}_\d{0,}_)', n)  # reuse the last matched link as the URL template
    t = result1.group(1)
    result2 = '.htm'
    while pages <= num:
        result = headUrl + t + str(pages) + result2
        pages += 1
        pagejump.append(result)
        # print('result:',result)
    if pagejump != None:
        return pagejump
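For comparison, a rough and more defensive sketch of the page-count step (not the code used in this v1.1): it assumes the same getAllSession() helper and the same /Musiclist/mmc_..._<page>.htm link shape, takes the maximum page number over every jump link found, and falls back to a single page when there are no jump links at all, which covers the one-page and two-page cases from the list above.
def max_page(url):
    # Hedged sketch: derive the total page count from all jump links on the page.
    page_pattern = re.compile(r'/Musiclist/mmc_\d+_\d+_(\d+)\.htm')
    nums = []
    for link_set in getAllSession(url):
        nums += [int(p) for p in page_pattern.findall(str(link_set))]
    return max(nums) if nums else 1  # no jump links means there is only one page
The jump URLs could then be built with range(2, max_page(targetUrl) + 1), though the URL prefix itself still has to come from an actual matched link, as jump_link() does above.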
3. Grab the download links on each page. This part is easy; the annoying part is that the scraped list comes back unordered, because absolute_links is a set. If the download links are [1,2,3,4,5], they may land in the list as [4,3,5,2,1], so everything has to be sorted after all the links have been collected (a small sorting sketch follows the function below);
def down_link(url):  # download links on one page
    save_down_link = []
    for i in getAllSession(url):  # the set of all links on the page
        pattern = re.compile(r'\/down_\d{5,6}\.html')  # regex for the download addresses
        re_link = pattern.findall(str(i))  # find every down_ address
        for j in re_link:
            links = f'{headUrl}'+j  # build the full download URL
            save_down_link.append(links)
    if save_down_link != None:
        return save_down_link
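getAllLinks() below sorts these lists with a plain links.sort(), which compares strings; that orders same-length IDs correctly but would place a six-digit down_ ID after a five-digit one even when it is numerically smaller. A hedged alternative is to sort on the numeric ID itself; down_id() is a hypothetical helper, not part of the original code:
def down_id(link):
    # Hypothetical helper: pull the numeric ID out of a .../down_123456.html URL.
    return int(re.search(r'down_(\d+)\.html', link).group(1))
links = down_link(targetUrl)
links.sort(key=down_id)  # numeric order instead of string order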
4. Grab all the titles on each page;
def mp3Name(url):  # all the pingshu titles on the current page
    num = 5  # in the page source the titles start at li[5]
    pstitle = []
    numlist = []
    while num < 33:  # title anchors sit at li[5], li[8], ..., li[32], every third li
        numlist.append(num)
        num += 3
    maxfor = len(down_link(url))  # only as many titles as there are download links on this page
    for nums in numlist[0:maxfor]:
        sel = driver.find_element_by_xpath(
            f'/html/body/div[2]/div[13]/div[1]/ul[2]/div[2]/ul/form/li[{nums}]/a').text
        pstitle.append(sel)
    if pstitle != None:
        return pstitle
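The while loop above only builds the arithmetic sequence 5, 8, 11, ..., 32 and then trims it to the number of download links on the page. A shorter sketch of the same idea, under the same assumption that the title anchors sit at every third li starting from li[5]:
def mp3_names(url):
    # Sketch only: relies on the same fixed XPath layout as mp3Name() above.
    count = len(down_link(url))  # one title per download link on the page
    titles = []
    for li_index in range(5, 5 + 3 * count, 3):
        titles.append(driver.find_element_by_xpath(
            f'/html/body/div[2]/div[13]/div[1]/ul[2]/div[2]/ul/form/li[{li_index}]/a').text)
    return titles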
5. With all the building blocks in place, the next step is to crawl the download links from every page. There is a pitfall here too: each page holds at most 10 entries, and the only page, or the last page, may hold fewer than 10, so both cases need handling (a simpler pairing sketch follows the function below);
def getAllLinks(url):  # open every page and collect its download links
    ps_link_list = []
    ps_txt_list = []
    mylist = []
    ps_link_list.append(down_link(targetUrl))
    ps_txt_list.append(mp3Name(targetUrl))
    for i in jump_link():
        openDriver(i)  # open the next page (page 2 onwards)
        try:
            driver.find_element_by_link_text('首页')  # the '首页' (home) link shows the page has loaded
            ps_link_list.append(down_link(i))
            ps_txt_list.append(mp3Name(i))
        except NoSuchElementException:
            time.sleep(50)  # page not ready yet: wait, then scrape anyway
            ps_link_list.append(down_link(i))
            ps_txt_list.append(mp3Name(i))
    for links in ps_link_list:
        links.sort()  # the links come out of a set, so sort each page's list
    print(ps_link_list)
    x = 10  # normally a page holds 10 entries
    y = len(ps_link_list)  # number of pages crawled
    z = len(ps_link_list[-1])  # number of entries on the last page
    m = 0
    while m < y-1:  # every full page except the last
        for n in range(x):
            mytext = ps_txt_list[m][n]
            mylink = ps_link_list[m][n]
            mylist.append((mytext, mylink))
        m += 1
    l = 0
    while l < z:  # the last page may hold fewer than 10 entries, so use its real length
        mytext = ps_txt_list[-1][l]
        mylink = ps_link_list[-1][l]
        mylist.append((mytext, mylink))
        l += 1
    if mylist != None:
        return mylist
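All the bookkeeping with x, y, z and m above exists only to pair each title with its link without running past the end of a short last page. A hedged sketch of the same pairing with zip(), which simply stops at the shorter of the two lists on every page:
def pair_titles_and_links(ps_txt_list, ps_link_list):
    # Sketch: zip() stops at the shorter list, so a short last page needs no special case.
    pairs = []
    for titles, links in zip(ps_txt_list, ps_link_list):
        pairs.extend(zip(titles, links))
    return pairs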
6. Once the titles and their matching links are collected, save them to a CSV so they are easy to download from;
def csv(a):
    df = pd.DataFrame(a)
    df.columns = ['title','link']
    df.to_csv('zgtAllLinks.csv', encoding='gbk', index=False)
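Because the file is written with encoding='gbk', it has to be read back with the same encoding. A quick sketch of loading it again as the input for the actual download step:
# Sketch: read the saved CSV back with the same gbk encoding used above.
saved = pd.read_csv('zgtAllLinks.csv', encoding='gbk')
for title, link in zip(saved['title'], saved['link']):
    print(title, link)  # each link is a /down_xxxxx.html page on pingshu8.com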
7. Run it.
def running():
    openDriver(targetUrl)
    jump_link()
    # down_link(targetUrl)
    # mp3Name(targetUrl)
    a = getAllLinks(targetUrl)  # crawl once and reuse the result
    print('getAllLinks:', a)
    # csv(a)
running()
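One detail worth noting: the Chrome window opened at the top of the script is never closed. A small hedged sketch of wrapping the run so the browser is cleaned up even if the crawl raises:
# Sketch: close the browser no matter how running() ends.
try:
    running()
finally:
    driver.quit()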
Complete code:
from selenium import webdriver
from requests_html import HTMLSession
import requests
import time
import re
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
'''
try:
element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, "myDynamicElement")))
finally:
browser.quit()
'''
'''
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_driver = "C:\Program Files\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(chrome_driver, chrome_options=chrome_options)
print(driver.title)
'''
driver = webdriver.Chrome()
targetUrl = 'https://www.pingshu8.com/MusicList/mmc_63_5764_1.Htm'
# targetUrl = 'https://www.pingshu8.com/Musiclist/mmc_63_5764_3.htm'
headUrl = 'https://www.pingshu8.com'
session = HTMLSession()
def openDriver(url):
    driver.get(url)  # open the page in the browser
def getAllSession(url):  # all links on the current page
    downLink = []
    r = session.get(url)
    link = r.html.absolute_links  # a set of every absolute URL on the page
    downLink.append(link)  # put the whole set into the list
    if downLink != None:
        return downLink
def jump_link():  # get the page-jump links from the first page
    for i in getAllSession(targetUrl):  # i is the set of links on the page
        pattern = re.compile(r'\/Musiclist\/mmc_\w+\.htm')  # regex for the page-jump URLs
        m = pattern.findall(str(i))  # find every page link, i.e. how many pages there are
        # print(m)
    '''
    Cut the final page number out with split
    '''
    pagenum_list = []
    for n in m:
        pagename = re.split(r'\/Musiclist\/mmc_\d{2}_\d{0,}_', n, maxsplit=0, flags=0)  # strip the prefix before the page number
        # print('pagename:',pagename)
        pagenum = pagename[1].split('.htm')  # strip the suffix after the page number
        pagenum_list.append(pagenum)
    def end_num():  # the final (largest) page number
        num1 = pagenum_list[0][0]
        num2 = pagenum_list[1][0]
        if int(num1) < int(num2):  # compare numerically: as strings, '2' > '10'
            # print('num1:',num1,type(num1))
            return num2
        else:
            # print('num2:',num2,type(num2))
            return num1
    '''
    With the total page count known, build the jump link for every page
    '''
    pagejump = []
    pages = 2
    # print(end_num())
    num = end_num()
    num = int(num)
    result1 = re.match(r'(\/Musiclist\/mmc_\d{2}_\d{0,}_)', n)  # reuse the last matched link as the URL template
    t = result1.group(1)
    result2 = '.htm'
    while pages <= num:
        result = headUrl + t + str(pages) + result2
        pages += 1
        pagejump.append(result)
        # print('result:',result)
    if pagejump != None:
        return pagejump
def down_link(url):  # download links on one page
    save_down_link = []
    for i in getAllSession(url):  # the set of all links on the page
        pattern = re.compile(r'\/down_\d{5,6}\.html')  # regex for the download addresses
        # print('pattern:',pattern)
        re_link = pattern.findall(str(i))  # find every down_ address
        for j in re_link:
            links = f'{headUrl}'+j  # build the full download URL
            # print(links,type(links))
            save_down_link.append(links)
    if save_down_link != None:
        return save_down_link
def mp3Name(url):  # all the pingshu titles on the current page
    num = 5  # in the page source the titles start at li[5]
    pstitle = []
    numlist = []
    while num < 33:  # title anchors sit at li[5], li[8], ..., li[32], every third li
        numlist.append(num)
        num += 3
    maxfor = len(down_link(url))  # only as many titles as there are download links on this page
    for nums in numlist[0:maxfor]:
        sel = driver.find_element_by_xpath(
            f'/html/body/div[2]/div[13]/div[1]/ul[2]/div[2]/ul/form/li[{nums}]/a').text
        pstitle.append(sel)
    if pstitle != None:
        return pstitle
def getAllLinks(url):  # open every page and collect its download links
    ps_link_list = []
    ps_txt_list = []
    mylist = []
    ps_link_list.append(down_link(targetUrl))
    ps_txt_list.append(mp3Name(targetUrl))
    for i in jump_link():
        openDriver(i)  # open the next page (page 2 onwards)
        try:
            driver.find_element_by_link_text('首页')  # the '首页' (home) link shows the page has loaded
            ps_link_list.append(down_link(i))
            ps_txt_list.append(mp3Name(i))
        except NoSuchElementException:
            time.sleep(50)  # page not ready yet: wait, then scrape anyway
            ps_link_list.append(down_link(i))
            ps_txt_list.append(mp3Name(i))
    for links in ps_link_list:
        links.sort()  # the links come out of a set, so sort each page's list
    print(ps_link_list)
    x = 10  # normally a page holds 10 entries
    y = len(ps_link_list)  # number of pages crawled
    z = len(ps_link_list[-1])  # number of entries on the last page
    m = 0
    while m < y-1:  # every full page except the last
        for n in range(x):
            mytext = ps_txt_list[m][n]
            mylink = ps_link_list[m][n]
            mylist.append((mytext, mylink))
        m += 1
    l = 0
    while l < z:  # the last page may hold fewer than 10 entries, so use its real length
        mytext = ps_txt_list[-1][l]
        mylink = ps_link_list[-1][l]
        mylist.append((mytext, mylink))
        l += 1
    if mylist != None:
        return mylist
def csv(a):
    df = pd.DataFrame(a)
    df.columns = ['title','link']
    df.to_csv('zgtAllLinks.csv', encoding='gbk', index=False)
def running():
    openDriver(targetUrl)
    jump_link()
    # down_link(targetUrl)
    # mp3Name(targetUrl)
    a = getAllLinks(targetUrl)  # crawl once and reuse the result
    print('getAllLinks:', a)
    # csv(a)
running()