最近开车总感觉车里的车载音乐非常老土,现在都流行网抑云了。抽空看了一下它的页面,发现音乐好听,还可以下载。闲来无事,写了个爬虫将榜单上所有的歌曲信息提取并分类下载了一下,效果还不错,比原来的车载音乐丰富多了!
import requests
from bs4 import BeautifulSoup
import os
class DownloadMusic(object):
    """
    Scrape every toplist of a certain cloud-music site and download its songs.
    For learning purposes only; commercial use is strictly forbidden.

    Workflow:
        parse_html()   -> collect {toplist title: relative toplist URL}
        ranking_list() -> per toplist, collect {song name: mp3 URL} and download
        download()     -> stream each song into <base_dir>/<toplist title>/
    """

    def __init__(self, base_dir='E:\\'):
        """
        :param base_dir: root directory under which one folder per toplist is
            created; default keeps the original behavior of writing to E:\\.
        """
        # Desktop-browser User-Agent so the site serves the normal HTML pages.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
        }
        self.url = "https://music.163.com/discover/toplist"  # toplist overview page
        self.title_url = {}  # {toplist title: relative toplist URL}
        self.base_dir = base_dir

    def parse_html(self):
        """
        Fetch the toplist overview page and fill self.title_url with every
        toplist title and its relative URL.
        :return: None
        """
        # timeout so a stalled connection cannot hang the scraper forever
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        html = BeautifulSoup(response.text, "html.parser")
        for anchor in html.select("a"):  # scan every <a> tag
            if not anchor.has_attr("href"):
                continue
            href = anchor.attrs["href"]
            # only toplist links look like /discover/toplist?id=...
            if str(href).startswith("/discover/toplist?id="):
                # anchors without text yield a None key (duplicate URL)
                self.title_url[anchor.string] = href
        # Drop the text-less duplicate safely: the original `del` raised
        # KeyError whenever no None key happened to be present.
        self.title_url.pop(None, None)

    def ranking_list(self):
        """
        For each collected toplist, extract its songs' download URLs and
        hand them to download(), one toplist (folder) at a time.
        :return: None
        """
        for title_dir, toplist_path in self.title_url.items():
            song_url = {}  # {song name: direct mp3 URL}, fresh per toplist
            target_dir = os.path.join(self.base_dir, title_dir)
            # exist_ok avoids the isdir/makedirs race of the original
            os.makedirs(target_dir, exist_ok=True)
            ranking_url = "https://music.163.com" + toplist_path
            response = requests.get(url=ranking_url, headers=self.headers, timeout=10)
            html = BeautifulSoup(response.text, "html.parser")
            for anchor in html.select("a"):
                if not anchor.has_attr("href"):
                    continue
                href = anchor.attrs["href"]
                if not str(href).startswith("/song?id"):
                    continue
                # skip anchors without a usable title (template strings like "$...")
                if anchor.string is None or anchor.string.startswith("$"):
                    continue
                # Full download URL format:
                # http://music.163.com/song/media/outer/url?id=1491137515.mp3
                download_url = ("http://music.163.com/song/media/outer/url"
                                + href[5:] + ".mp3")
                song_url[anchor.string] = download_url
            self.download(title_dir, song_url)  # download one toplist per call

    def download(self, title_dir, song_url):
        """
        Download every song of one toplist into its folder, printing progress.

        :param title_dir: toplist title, also used as the folder name
        :param song_url: dict mapping song name -> direct mp3 URL
        :return: None
        """
        total = len(song_url)
        print("{}共有{}首音乐,开始下载!".format(title_dir, total))
        for count, (song_name, url) in enumerate(song_url.items(), start=1):
            try:
                # The outer URL answers with a redirect to the real file;
                # resolve it manually so a missing 'Location' header is
                # caught below instead of silently following a dead link.
                response = requests.get(url, headers=self.headers,
                                        allow_redirects=False, timeout=10)
                # `with` guarantees the connection is released even on error
                # (the original r.close() was skipped when an exception hit).
                with requests.get(response.headers['Location'], stream=True,
                                  timeout=10) as r:
                    size = int(r.headers['content-length'])
                    print('\033[0;31m' + str(count) + "/" + str(total) + " 正在下载-" + song_name + " 文件大小:" + str(
                        size) + "字节" + "\033[0m")
                    schedule = 0  # bytes received so far
                    path = os.path.join(self.base_dir, title_dir, song_name + ".mp3")
                    with open(path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=512 * 1024):
                            if chunk:
                                f.write(chunk)
                                schedule += len(chunk)
                                # `or 1` guards the corner case of a
                                # zero content-length (ZeroDivisionError)
                                print("\r" + song_name + "--下载进度:" + '%3s' % (str(schedule * 100 // (size or 1))) + "%", end='')
                print("\n第{}首音乐《{}》下载成功!".format(count, song_name))
            except Exception as e:
                # best-effort: report the failure and move on to the next song
                print(song_name + " 下载出错!" + " 错误信息" + str(e.args))

    def run(self):
        """
        Entry point: discover all toplists, then download each one.
        :return: None
        """
        self.parse_html()
        self.ranking_list()
if __name__ == "__main__":
    # Guard the crawl behind the script entry point so importing this
    # module no longer triggers a full network download as a side effect.
    down_music = DownloadMusic()
    down_music.run()
效果如下(网速太慢了):每个榜单会单独生成一个文件夹,上一个榜单下载完成之后,下一个榜单才会生成文件夹并开始下载。