# Scrape the first ten chapters of the "Three Kingdoms" audiobook from Ximalaya (喜马拉雅).
#导入requests模块
import requests
#导入正则表达式
import re
# Browser User-Agent sent with requests to get past the site's anti-scraping check.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    ),
}
# Track IDs of the first ten chapters, taken from the page source.
sound_ids = (
    64686514,
    64689648,
    64695831,
    64695832,
    3218935,
    3822581,
    3419626,
    3513844,
    3593277,
    3773655,
)
def get_find_url(page_text):
    """Extract (track id, audio URL) pairs from a track's JSON text.

    Args:
        page_text: Body text of a ``/tracks/<id>.json`` response.

    Returns:
        A list of ``(id, url)`` string tuples, one per occurrence of an
        ``"id"`` field immediately followed by ``"play_path_64"``; an empty
        list when nothing matches.
    """
    return re.findall(r'"id":(.*?),"play_path_64":"(.*?)"', page_text)


def download_chapters(track_ids, headers, timeout=30):
    """Download the audio of every track in *track_ids* to the current directory.

    Each track's metadata JSON is requested exactly once, the audio URL is
    pulled out with :func:`get_find_url`, and the audio bytes are written to
    ``第<N>节<ext>`` where ``N`` is the 1-based position of the track in
    *track_ids* and ``<ext>`` is the last four characters of the audio URL.

    Args:
        track_ids: Iterable of track IDs, in chapter order.
        headers: HTTP headers sent with every request (e.g. a User-Agent).
        timeout: Seconds to wait on each request before giving up.
    """
    for chapter, track_id in enumerate(track_ids, start=1):
        meta_url = 'http://www.ximalaya.com/tracks/' + str(track_id) + '.json'
        try:
            meta = requests.get(meta_url, headers=headers, timeout=timeout)
            # Fail early instead of regex-scanning an HTTP error page.
            meta.raise_for_status()
        except requests.RequestException as err:
            print('<第', chapter, '节下载失败> ', err)
            continue

        for _track_id, audio_url in get_find_url(meta.text):
            # Announce before the (slow) download so progress is visible.
            print('<正在下载第', chapter, '节> ', audio_url)
            try:
                audio = requests.get(audio_url, headers=headers, timeout=timeout)
                # Never save an error response body as an audio file.
                audio.raise_for_status()
            except requests.RequestException as err:
                print('<第', chapter, '节下载失败> ', err)
                continue

            # The last four characters of the URL (".m4a") become the suffix.
            suffix = audio_url[-4:]
            with open('第' + str(chapter) + '节' + suffix, 'wb') as f:
                f.write(audio.content)


if __name__ == '__main__':
    download_chapters(sound_ids, header)