不过多解释,直接上代码。
编程语言:Python
系统环境:Ubuntu
时间:2014/4/5
注意:在Windows下运行会报错,原因可能是Windows终端的默认编码与脚本使用的UTF-8不一致(待确认)。
#-*- coding:utf-8 -*-
import re
import urllib2
import os
class dbDownloader:
    """Download every song of one Douban Music programme (song list).

    The class targets Python 2 (it relies on ``urllib2``).  Constructing an
    instance does the whole job: it fetches the programme page, resolves the
    download URL of each song and, unless the target folder already looks
    complete, saves the songs as ``<performer> - <title>.mp3`` files.
    """

    programme_url = ''  # URL of the programme (song list) page
    songs_info = []     # [title, performer, download URL]; rebound per instance in __init__
    doc = ''            # folder the songs are saved into
    count = 0           # running number of songs downloaded or already on disk
    topass = 0          # 1 when the folder is considered complete and is skipped

    # Characters removed from generated file names.
    _ILLEGAL_FILE_NAME_CHARS = '\\/:*?"<>|'

    # Captures (title, performer, sid, ssid) for each song on a programme page.
    _SONG_ITEM_PATTERN = re.compile(
        '<li class="song-item-wrapper sortable" id=".*?">.*?data-title="(.*?)" data-performer="(.*?)".*?data-songid="(.*?)" data-plength=".*?" data-ssid="(.*?)"',
        re.S)

    # Captures the single value of the response returned by the get_song_url endpoint.
    _SONG_URL_PATTERN = re.compile('{".*?":"(.*?)"}')

    def __init__(self, programme_url, doc, count):
        """Resolve the song URLs and download the programme.

        Args:
            programme_url: URL of the programme (song list) page.
            doc: Folder the songs are saved into; created when missing.
            count: Number of songs already counted by the caller; the
                instance keeps counting from this value in ``self.count``.
        """
        self.programme_url = programme_url
        self.songs_info = []
        self.doc = doc
        self.count += count  # starts from the class-level default of 0
        self.get_song_url()
        if self.topass == 0:
            self.donwload_songs()

    @staticmethod
    def _fetch(url):
        """Return the body of *url*, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _clean_file_name(file_name):
        """Return *file_name* without the characters in _ILLEGAL_FILE_NAME_CHARS."""
        illegal = dbDownloader._ILLEGAL_FILE_NAME_CHARS
        return ''.join(char for char in file_name if char not in illegal)

    def get_song_url(self):
        """Fill ``self.songs_info`` with [title, performer, download URL] entries.

        When ``self.doc`` already exists and holds exactly as many files as
        the programme has songs, the programme is treated as downloaded:
        ``self.count`` grows by that number, ``self.topass`` is set to 1 and
        no download URL is requested.
        """
        programme_page = self._fetch(self.programme_url)
        # The last match is not a real song, so it is dropped.  Slicing keeps
        # an empty result from raising.
        songs = self._SONG_ITEM_PATTERN.findall(programme_page)[:-1]

        # Compare the number of files on disk with the number of songs to
        # decide whether this programme is already complete.
        if os.path.isdir(self.doc):
            file_count = len(os.listdir(self.doc))
            if file_count == len(songs):
                print("检测到 %s 已下载完毕" % self.doc)
                self.count += file_count
                self.topass = 1

        if self.topass != 0:
            return

        for title, performer, sid, ssid in songs:
            print("正在获取 %s 地址" % title)
            request_url = ('http://music.douban.com/j/songlist/get_song_url?'
                           + 'sid=' + sid + '&ssid=' + ssid)
            text = self._fetch(request_url)
            matches = self._SONG_URL_PATTERN.findall(text)
            if not matches:
                # Unexpected response: skip this song instead of crashing.
                print("获取 %s 地址失败,已跳过" % title)
                continue
            # The URL arrives with escaped slashes; strip the backslashes.
            song_url = matches[0].replace('\\', '')
            self.songs_info.append([title, performer, song_url])

    def donwload_songs(self):
        """Save every entry of ``self.songs_info`` into ``self.doc``.

        Files that already exist are counted but not downloaded again, so an
        interrupted run can simply be restarted.
        """
        if not os.path.isdir(self.doc):
            os.mkdir(self.doc)
        print("### 开始下载歌单 :  %s  ###" % self.doc)

        for title, performer, song_url in self.songs_info:
            file_name = self._clean_file_name(performer + ' - ' + title + ".mp3")
            save_name = os.path.join(self.doc, file_name)

            # Already on disk: count it and move on.
            if os.path.isfile(save_name):
                self.count = self.count + 1
                continue

            print("正在下载第 %s 首歌曲 %s" % (self.count + 1, file_name))
            self.count = self.count + 1
            data = self._fetch(song_url)
            with open(save_name, 'wb') as song_file:
                song_file.write(data)

    # Correctly spelt alias; the original misspelt name stays for existing callers.
    download_songs = donwload_songs
def main():
    """Download every programme listed on the Douban Music programmes page.

    Each link found on the page is handed to ``dbDownloader`` together with
    the running song count; the grand total is printed at the end.
    """
    url = 'http://music.douban.com/programmes/'
    pattern = '<a class="lnk-songlist" href="(.*?)" target="_blank" title="(.*?)">'
    # NOTE(review): the Cookie below looks like a value captured from one
    # browser session and is probably stale -- confirm whether it is needed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36',
        'Host': 'music.douban.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Cookie': 'bid="Jv9Gkp2ygUs"; ll="118281"; __utma=30149280.1543039784.1396552085.1396627488.1396698064.9; __utmb=30149280.4.10.1396698064; __utmc=30149280; __utmz=30149280.1396622560.7.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
    }
    # No request body is passed: an empty string as data would turn this
    # page fetch into a POST instead of a GET.
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        page = response.read()
    finally:
        response.close()

    count = 0
    for programme_url, title in re.findall(pattern, page):
        # The programme title doubles as the name of the target folder.
        downloader = dbDownloader(programme_url, title, count)
        count = downloader.count
    print("总共下载了 %s 首歌曲" % count)


if __name__ == '__main__':
    main()