python写的批量下载baidu mp3的程序至少到09-9-18仍然可用-CSDN博客

mydown.py
#!/usr/bin/env python
# coding=utf-8
import httplib,urllib,urllib2
import re,os
from downmp3 import GetSize,DownMp3
def BaiduUrlDecode(enurl):
    """Decode a URL that was obfuscated with a rotation over ``0-9A-Za-z``.

    Every digit/letter of ``enurl`` is shifted by one fixed offset inside the
    62-character alphabet; all other characters are left untouched.  The
    offset is chosen so that the SECOND character decodes to ``t``
    (presumably because the plain URL starts with ``http``).  The rotated
    text is then GBK-encoded and percent-decoded.

    enurl   -- the obfuscated URL (unicode text).
    returns -- the decoded URL, or an empty string when ``enurl`` is too
               short or its second character is not in the alphabet (a
               diagnostic line is printed in that case).
    """
    try:
        from urllib import unquote          # Python 2
    except ImportError:
        from urllib.parse import unquote    # Python 3

    alphabet = u'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    size = len(alphabet)
    pieces = []
    try:
        key = alphabet.index(u't') - alphabet.index(enurl[1])
    except IndexError:
        # Fewer than two characters: there is nothing to derive the key from.
        print('enurl IndexError: %s $' % (enurl,))
    except ValueError:
        # Second character is outside the alphabet, so no key can be derived.
        print('enurl ValueError: %s $' % (enurl,))
    else:
        for char in enurl:
            pos = alphabet.find(char)
            if pos < 0:
                pieces.append(char)
            else:
                pieces.append(alphabet[(pos + key) % size])
    return unquote(u''.join(pieces).encode('gbk'))

def BadiuUrlProcess(baidu_url):
    """Percent-encode the non-ASCII parts of a Baidu URL using GBK.

    Every run of characters that is neither an ASCII word character nor one
    of the URL punctuation marks listed in the pattern (in practice: Chinese
    text, but also spaces and parentheses) is GBK-encoded and percent-quoted.
    Each run is quoted individually, so a URL containing several different
    Chinese segments keeps all of them.

    baidu_url -- the URL as unicode text.
    returns   -- the URL with those runs replaced by ``%XX`` escapes; a URL
                 without such runs is returned unchanged.
    """
    import re
    try:
        from urllib import quote          # Python 2
    except ImportError:
        from urllib.parse import quote    # Python 3

    match_CHchar = r'([^:._,~`!@#\|{}\^\*<>;%/\"\'\[\]\+\=\?\-\$\&\\\w]+)' # matches Chinese text
    # \w must stay ASCII-only so that Chinese characters count as "non-word";
    # that is the default on Python 2 and needs re.ASCII on Python 3.
    ch_pattern = re.compile(match_CHchar, getattr(re, 'ASCII', 0))
    return ch_pattern.sub(lambda m: quote(m.group(0).encode('gbk')), baidu_url)


if __name__ == "__main__":

    print """   支持百度MP3的大部分列表音乐的下载，默认采用10线程下载

        [1] 新歌top100
        [2] 歌曲top500
        [3] 歌手top200 (暂不支持下载)
        [4] 中文金曲榜
        [5] 经典老歌
        [6] 热舞dj
        [7] 流金岁月
        [8] 电视金曲
        [9] 歌曲列表
        [0] 退出
                                --by auxten auxtenwpc[at]gmail[dot]com
    """.decode('utf-8').encode('gbk')
    id = int(raw_input('输入你想下载的list的编号: '.decode('utf-8').encode('gbk')))
    if id == 1: topid = '/list/newhits.html?id=1?top1'
    elif id == 2: topid = '/topso/mp3topsong.html?id=1?top2'
    elif id == 3: topid = '/list/tvs.html?id=1?top5';exit(1)
    elif id == 4: topid = '/list/bangping.html?id=1'#;exit(1)
    elif id == 5: topid = '/list/oldsong.html?top6'
    elif id == 6: topid = '/list/dj.html'
    elif id == 7: topid = '/list/liujinsuiyue.html'
    elif id == 8: topid = '/list/tvs.html?id=1?top5'
    elif id == 9: topid = '/list/tvs.html?id=1?top5'#;exit(1)
    elif id == 0: exit(1)
# topid = '/list/oldsong.html?top6'
    # topid = '/list/tvs.html?id=1?top5'
    print "Processing please wait

.:)"
    errorlist = []
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET",topid)
    response = conn.getresponse()
    html = response.read().decode('gb18030')
    # print html.encode('gbk')
    conn.close()
    match_type1 = r'">(\d{,3})\.</td>' #编号
    match_type2 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*' #no url songname for 流金岁月
    match_type3 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*">(.*)</a>\)' #name+author
    match_type4 = r'<tr>[\s]*?<td>(\d{,3})</td>[\s\S]*?<a href="(http://.*?)" target="_blank">(.*?)</a></td>[\s\S]*?target="_blank">(.*?)</a>' #no url songname singer 中文金曲榜
    list_number = re.findall(match_type1, html)
    list_all = re.findall(match_type3, html)
    if list_all == []:
        # print 'list_all empty1!'
        list_all = re.findall(match_type2, html)
    if list_all == []:
        # print 'list_all empty2!'
        list_all = re.findall(match_type4, html)
    # print 'list_all',list_all
    # print 'list_all__len__',list_all.__len__()
    conn = httplib.HTTPConnection('mp3.baidu.com')
    songnumlst = range(0,list_all.__len__())

    for num in songnumlst:
        try:
            try: authorname = '-'+list_all[num][3]
            except IndexError:
                authorname = ''
            print list_all[num][0].encode('gbk'),list_all[num][2].encode('gbk'),authorname.encode('gbk')
    #        print num
            conn.request("GET",BadiuUrlProcess(list_all[num][1]))
            # print "URL!",BadiuUrlProcess(list_all[num][1]).encode('gbk')
            response = conn.getresponse()
            html = response.read().decode('gb18030')
            conn.close()
            # print html.encode('gbk')
            html = re.search(r'<a href="(.*?)" title', html).groups()[0]
            down_page_url = BadiuUrlProcess(html)
            html = urllib.urlopen(down_page_url).read().decode('gb18030')
    #        print html.encode('gbk')
            mp3_url_list = re.findall(r'{var B="(.*?)".*?{var C=\["(.*?)","(.*?)","(.*?)","(.*?)"\];', html)[0]
    #        print mp3_url_list
            mp3_url_list = [BaiduUrlDecode(i) for i in mp3_url_list ]
    #        print mp3_url_list
    #        for i in mp3_url_list:
    #            print i.decode('utf-8').encode('gbk')
            print 'Downloading

.'
        except UnicodeDecodeError:
            print "Error in main loop"#"UnicodeDecodeError"
            songnumlst.append(num)
            continue
        try: assert DownMp3(urlist = mp3_url_list, name = '%s%s' % (list_all[num][2],authorname) ) == 0
        except AssertionError:
            print 'DownMp3 error!'
            songnumlst.append(num)

HttpGetThread.py
#!/usr/bin/env python
#coding=utf-8
import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import urllib
import urllib2
import threading

#############################################################################
#
# self-defined exception classes
#
#############################################################################
class ConnectionError(Exception):
    """Connection-level failure (not raised by the code visible in this file)."""


class URLUnreachable(Exception):
    """Raised by MyHttpGet when the size of the remote file comes back as 0."""


class CanotDownload(Exception):
    """Raised by MyHttpGet when the download makes no progress for too long."""

#############################################################################
#
# multiple threads download module starts here
#
#############################################################################
class HttpGetThread(threading.Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    The part file is opened in append mode and its current size is taken as
    the number of bytes already fetched, so an interrupted range is resumed
    rather than restarted.

    Attributes read by the caller while the thread runs:
        downloaded -- bytes of this range present in the part file
        percent    -- ``downloaded`` as a percentage of the range length
        time       -- whole seconds spent in the current request
    """

    def __init__(self, name, url, filename, range=0):
        """
        name     -- thread name
        url      -- URL to download from
        filename -- part file the range is appended to
        range    -- (first_byte, last_byte), both inclusive.  The default of
                    0 is not usable; a tuple must always be passed.
        """
        threading.Thread.__init__(self, name=name)
        self.url = url
        self.filename = filename
        self.range = range
        self.totalLength = range[1] - range[0] +1
        self.bufferSize = 8192
        self.time = 0
        self._syncWithPartFile()
        self.headerrange = (self.range[0]+self.downloaded, self.range[1])

    def _updatePercent(self):
        # An empty range has nothing left to fetch; avoid dividing by zero.
        if self.totalLength > 0:
            self.percent = self.downloaded/float(self.totalLength)*100
        else:
            self.percent = 100.0

    def _syncWithPartFile(self):
        # Whatever is already in the part file counts as downloaded.
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self._updatePercent()

    def _fetchRemaining(self):
        # Request the bytes still missing from the range and append them to
        # the part file until the server stops sending data.
        self.headerrange = (self.range[0]+self.downloaded, self.range[1])
        request = urllib2.Request(self.url)
        request.add_header('Range', 'bytes=%d-%d' %self.headerrange)
        conn = urllib2.urlopen(request)
        try:
            startTime = time.time()
            out = open(self.filename, 'ab')
            try:
                data = conn.read(self.bufferSize)
                while data:
                    out.write(data)
                    self.time = int(time.time() - startTime)
                    self.downloaded += len(data)
                    self._updatePercent()
                    data = conn.read(self.bufferSize)
            finally:
                out.close()
        finally:
            conn.close()

    def run(self):
        """Fetch the range, making at most 10 attempts one second apart."""
        self._syncWithPartFile()
        retries = 1
        # Loop on the byte count, not on "the request returned": a response
        # that ends early must be resumed, and a part file that is already
        # complete must not trigger a request with an inverted range.
        while self.downloaded < self.totalLength and retries <= 10:
            try:
                self._fetchRemaining()
            except Exception:
                # Best effort: any failure is simply retried below.
                pass
            if self.downloaded < self.totalLength:
                retries += 1
                time.sleep(1)

def Split(size,blocks):
    """Split ``size`` bytes into ``blocks`` consecutive inclusive byte ranges.

    size    -- total number of bytes (integer)
    blocks  -- number of ranges to produce (integer >= 1)
    returns -- list of ``blocks`` tuples ``(first_byte, last_byte)``.  The
               first ``blocks - 1`` ranges are ``size // blocks`` bytes long;
               the last one takes the remainder and ends at ``size - 1``.

    When ``size < blocks`` the leading ranges are empty (``last_byte`` is
    ``first_byte - 1``).
    """
    # Explicit floor division keeps the offsets integral regardless of how
    # "/" is defined for integers in the running interpreter.
    blocksize = size // blocks
    ranges = [(i*blocksize, (i+1)*blocksize - 1) for i in range(blocks-1)]
    ranges.append((blocksize*(blocks-1), size-1))

    return ranges

def GetHttpFileSize(url):
    """Return the Content-Length announced for ``url``, or 0 if unknown.

    url     -- URL to open
    returns -- the size in bytes as an integer; 0 when the URL cannot be
               opened, the header is missing, or its value is not a number.
    """
    length = 0
    try:
        conn = urllib.urlopen(url)
        try:
            # getheader() matches the header name case-insensitively and
            # exactly, unlike a substring search for 'Length'.
            value = conn.info().getheader('Content-Length')
        finally:
            conn.close()
        if value is not None:
            length = int(value.strip())
    except Exception:
        # Deliberate best effort: callers treat 0 as "unreachable".
        length = 0

    return length

def hasLive(ts):
    """Return True if at least one thread in ``ts`` reports isAlive()."""
    return any(worker.isAlive() for worker in ts)

def MyHttpGet(url, output=None, connections=4):
    """
    arguments:
        url, in GBK encoding
        output, default encoding, do no convertion
        connections, integer
    """
    length = GetHttpFileSize(url)
    print length
    mb = length/1024/1024.0
    if length == 0:
        raise URLUnreachable
    blocks = connections
    if output:
        filename = output
    else:
        output = url.split('/')[-1]
    ranges = Split(length, blocks)
    names = ["%s_%d" %(output,i) for i in xrange(blocks)]

    ts = []
    for i in xrange(blocks):
        t = HttpGetThread(i, url, names[i], ranges[i])
        t.setDaemon(True)
        t.start()
        ts.append(t)

    live = hasLive(ts)
    startSize = sum([t.downloaded for t in ts])
    startTime = time.time()
    etime = 0
    lastd = 0
    nobytecounter = 0
    while live:
        try:
            etime = time.time() - startTime
            d = sum([t.downloaded for t in ts])/float(length)*100
            if lastd == d:
                nobytecounter += 1
            else:
                nobytecounter = 0
                lastd = d
            if nobytecounter > 100:
                raise CanotDownload

            downloadedThistime = sum([t.downloaded for t in ts])-startSize
            try:
                rate = downloadedThistime / float(etime)/1024
            except:
                rate = 100.0
            progressStr = u'\rFilesize: %d(%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' %(length, mb, d, rate)
            sys.stdout.write(progressStr)
            sys.stdout.flush()
            #sys.stdout.write('\b'*(len(progressStr)+1))
            live = hasLive(ts)
            time.sleep(0.2)
        except URLUnreachable:
            print
            print "Url Unreachable"
            for n in names:
                try:
                    os.remove(n)
                except:
                    pass
            return -1
        except CanotDownload:
            print
            print "can't download!"
            for n in names:
                try:
                    os.remove(n)
                except:
                    pass
            return -1
        except KeyboardInterrupt:
            print
            print "Exit

"
            for n in names:
                try:
                    os.remove(n)
                except:
                    pass
            sys.exit(1)

    print

    try:
        f = open(filename, 'wb')
        for n in names:
            f.write(open(n,'rb').read())
            try:
                os.remove(n)
            except:
                pass
        f.close()
    except :
        print
        print 'File write Error'
        for n in names:
            try:
                os.remove(n)
            except:
                pass
        return -1
    return 0

if __name__ == '__main__':
    # Manual smoke test: download one sample file over four connections.
    sample_url = 'http://jsz.com.cn./18/Hongdou.mp3'
    MyHttpGet(sample_url, output='Hongdou.mp3', connections=4)

downmp3.py
#!/usr/bin/env python
#coding=utf-8
# Sample download URLs; this list is what the __main__ self-test at the
# bottom of this file passes to DownMp3().
url = ['http://www.efu.com.cn/topic/611/1.mp3',
    'http://bbs.baby8.cn//upload/vip/10/200831715101730.mp3',
    'http://www1.neacn.com/file_db/music/1153/f/26.wma',
    'http://www.tklk8.cn/bbs/uploadfile/2009-8/200981013174970090.mp3',
    'http://media.winglish.com/sound/winhao/free_content/music/music_20090507.mp3'
]
from HttpGetThread import *

def GetSize(resource_url):
    import httplib
    from urlparse import urlparse

    try:
        parsedurl = urlparse(resource_url)
        host = parsedurl[1]
        path = parsedurl[2]
        httpConn = httplib.HTTPConnection(host);
        httpConn.request("GET", path)
        r = httpConn.getresponse()
        httpConn.close()
        if r.status == 200:
            size = r.getheader('Content-Length')
            size = int(size) / 1024
        else:
            print r.status, r.reason
            size = -1

    except :
        size = -1
        print parsedurl
    return size




def DownMp3(urlist,name):
    """Download one song from the first plausible URL in ``urlist``.

    A URL is plausible when GetSize() reports between 1536 and 10240 KB;
    if none qualifies, the first URL of the list is used.  The file is saved
    as ``name`` plus the text after the last dot of the chosen URL, using 10
    download threads.

    urlist  -- candidate URLs (byte strings)
    name    -- file name without extension
    returns -- 0 on success, -1 on any failure (empty list, URL without an
               extension, unreachable URL, encoding problem, failed download).
    """
    if not urlist:
        return -1
    mp3url = urlist[0]
    for candidate in urlist:
        Size = GetSize(candidate)
        if 1536 <= Size <= 10240:
            mp3url = candidate
            break
    if '.' not in mp3url:
        # No extension can be derived (e.g. an empty, undecodable URL).
        return -1
    fulname = name+'.'+mp3url.rsplit('.',1)[1]
    # Call the downloader outside of `assert`: assert statements are removed
    # under -O, which would skip the download altogether.
    try:
        status = MyHttpGet(mp3url.decode('gb18030').encode('gbk'),fulname,10)
    except (URLUnreachable, UnicodeError):
        return -1
    if status != 0:
        return -1
    return 0


if __name__ == '__main__':
    # Manual smoke test: fetch one song from the sample URL list above.
    demo_name = '红豆'.decode('utf-8').encode('gbk')
    DownMp3(urlist=url, name=demo_name)

setup.py
# coding=utf-8
from distutils.core import setup
import py2exe

# Build mydown.py into a single console executable with py2exe.
setup(
    console=['mydown.py'],
    options={
        "py2exe":{
            "includes": ["downmp3"],
            "compressed": True,
            # Levels 1 and 2 behave like `python -O` / `-OO` and strip
            # `assert` statements from the bundled modules, which silently
            # changes behaviour wherever an assert wraps real work.
            # Level 0 keeps the frozen program identical to the script.
            "optimize": 0,
            "bundle_files": 1,
        }
    },
    zipfile = None,
)