python写的批量下载baidu mp3的程序 至少到09-9-18仍然可用

windows平台下的Py2.6 移植到linux应该也很容易
默认10线程下载

其中多线程下载部分 是参考 http://hi.baidu.com/zjw0358/blog

mydown.py
#!/usr/bin/env python
# coding=utf-8
import httplib,urllib,urllib2
import re,os
from downmp3 import GetSize,DownMp3
def BaiduUrlDecode(enurl):
    """Decode an obfuscated Baidu MP3 URL.

    The characters 0-9, A-Z and a-z of ``enurl`` are rotated by a constant
    offset inside a 62-character alphabet; every other character is kept
    as is.  The offset is recovered by assuming that the second character
    of the decoded text is 't' (as in "http").  The rotated-back text is
    GBK-encoded and percent-unquoted before it is returned.

    Returns the unquoted URL.  When ``enurl`` has fewer than two characters,
    or its second character is not alphanumeric, a diagnostic is printed
    and the unquoted empty string is returned.
    """
    try:
        from urllib import unquote  # Python 2
    except ImportError:
        from urllib.parse import unquote  # Python 3

    k = u'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
    decoded = u''
    try:
        key = k.index(u't') - k.index(enurl[1])
    except (IndexError, ValueError):
        # IndexError: enurl is too short.  ValueError: enurl[1] is not in the
        # alphabet, so no offset can be derived (previously uncaught).
        print('enurl IndexError: %s $' % enurl)
    else:
        decoded = u''.join(
            k[(k.index(char) + key) % 62] if char in k else char
            for char in enurl)
    return unquote(decoded.encode('gbk'))

def BadiuUrlProcess(baidu_url):
    """Percent-encode, as GBK, the non-ASCII parts of a Baidu URL.

    Every run of characters that is neither an ASCII word character nor one
    of the URL punctuation characters listed in the pattern (in practice the
    Chinese text, and also spaces) is replaced by the percent-quoted form of
    its own GBK encoding.  A URL without such runs is returned unchanged.
    """
    import re
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    # Same character class as before, with `\w` spelled out as A-Za-z0-9_ so
    # it stays ASCII-only even where `\w` is Unicode-aware; Chinese
    # characters must fall *outside* the excluded set to be matched.
    match_CHchar = r'([^:._,~`!@#\|{}\^\*\(\)<>;%/\"\'\[\]\+\=\?\-\$\&\\A-Za-z0-9_]+)'
    ch_pattern = re.compile(match_CHchar)

    # Quote each run with its own encoding.  The previous code substituted
    # the encoding of the FIRST run for every run in the URL.
    return ch_pattern.sub(
        lambda found: quote(found.group(1).encode('gbk')), baidu_url)
    



if __name__ == "__main__":
    import sys

    # Menu number -> list page path on list.mp3.baidu.com.  0 (quit) and
    # 3 (singer top 200, not supported) are deliberately absent.
    LIST_PATHS = {
        1: '/list/newhits.html?id=1?top1',
        2: '/topso/mp3topsong.html?id=1?top2',
        4: '/list/bangping.html?id=1',
        5: '/list/oldsong.html?top6',
        6: '/list/dj.html',
        7: '/list/liujinsuiyue.html',
        8: '/list/tvs.html?id=1?top5',
        9: '/list/tvs.html?id=1?top5',
    }
    # A song that keeps failing is given up after this many tries; the
    # previous code re-queued it forever.
    MAX_ATTEMPTS = 3

    print("""   支持百度MP3的大部分列表音乐的下载,默认采用10线程下载
    
        [1] 新歌top100 
        [2] 歌曲top500 
        [3] 歌手top200 (暂不支持下载)
        [4] 中文金曲榜
        [5] 经典老歌
        [6] 热舞dj
        [7] 流金岁月 
        [8] 电视金曲
        [9] 歌曲列表
        [0] 退出
                                --by auxten auxtenwpc[at]gmail[dot]com
    
""".decode('utf-8').encode('gbk'))
    try:
        choice = int(raw_input('输入你想下载的list的编号: '.decode('utf-8').encode('gbk')))
    except ValueError:
        choice = None
    if choice not in LIST_PATHS:
        # 0 and 3 exit silently as before; anything else used to leave
        # `topid` undefined and crash with a NameError further down.
        if choice not in (0, 3):
            print("Unknown list number.")
        sys.exit(1)
    topid = LIST_PATHS[choice]

    print("Processing please wait.:)")
    conn = httplib.HTTPConnection('list.mp3.baidu.com')
    conn.request("GET", topid)
    html = conn.getresponse().read().decode('gb18030')
    conn.close()

    # Row layouts of the list pages, tried in this order.
    # number + url + song name (used for the 流金岁月 list)
    match_type2 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*'
    # number + url + song name + author
    match_type3 = r'">(\d{,3})\.</td>[\s\S]*?<a href="(.*?)" target="_blank">(.*?)</a>[^)].*">(.*)</a>\)'
    # number + url + song name + singer (used for the 中文金曲榜 list)
    match_type4 = r'<tr>[\s]*?<td>(\d{,3})</td>[\s\S]*?<a href="(http://.*?)" target="_blank">(.*?)</a></td>[\s\S]*?target="_blank">(.*?)</a>'
    list_all = []
    for row_pattern in (match_type3, match_type2, match_type4):
        list_all = re.findall(row_pattern, html)
        if list_all:
            break

    conn = httplib.HTTPConnection('mp3.baidu.com')
    # Work queue of (index into list_all, attempt number); failed songs are
    # appended again until MAX_ATTEMPTS is reached.
    pending = [(num, 1) for num in range(len(list_all))]
    position = 0
    while position < len(pending):
        num, attempt = pending[position]
        position += 1
        entry = list_all[num]
        # Only the 4-group patterns capture an author/singer.
        authorname = ''
        if len(entry) > 3:
            authorname = '-' + entry[3]
        try:
            print(' '.join([entry[0].encode('gbk'),
                            entry[2].encode('gbk'),
                            authorname.encode('gbk')]))
            conn.request("GET", BadiuUrlProcess(entry[1]))
            html = conn.getresponse().read().decode('gb18030')
            conn.close()
            link = re.search(r'<a href="(.*?)" title', html)
            if link is None:
                raise ValueError('no download page link in search result')
            down_page_url = BadiuUrlProcess(link.group(1))
            html = urllib.urlopen(down_page_url).read().decode('gb18030')
            encoded_urls = re.findall(r'{var B="(.*?)".*?{var C=\["(.*?)","(.*?)","(.*?)","(.*?)"\];', html)
            if not encoded_urls:
                raise ValueError('no encoded mp3 urls in download page')
            mp3_url_list = [BaiduUrlDecode(i) for i in encoded_urls[0]]
            print('Downloading .')
        except (ValueError, IOError, httplib.HTTPException):
            # ValueError also covers UnicodeError (encode/decode failures).
            print("Error in main loop")
            conn.close()  # reset the connection so the next request starts clean
            failed = True
        else:
            # Call DownMp3 directly: the download used to sit inside an
            # `assert`, which is removed when Python runs with -O/-OO.
            failed = DownMp3(urlist = mp3_url_list,
                             name = '%s%s' % (entry[2], authorname)) != 0
            if failed:
                print('DownMp3 error!')
        if failed and attempt < MAX_ATTEMPTS:
            pending.append((num, attempt + 1))
        
HttpGetThread.py
#!/usr/bin/env python
# coding=utf-8
import re
import os
import sys
import time
import glob
import string
import socket
import getopt
import urllib
import urllib2
import threading


#############################################################################
#
# self-defined exception classes
#
#############################################################################
class ConnectionError(Exception):
    """Connection-level failure."""


class URLUnreachable(Exception):
    """The size of the remote file could not be determined."""


class CanotDownload(Exception):
    """A download stopped making progress."""

#############################################################################
#
# multiple threads download module starts here
#
#############################################################################
class HttpGetThread(threading.Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    Arguments:
        name: thread name, passed to threading.Thread.
        url: address of the remote file.
        filename: path of the part file; data is appended to it, so bytes
            already on disk are treated as downloaded and skipped.
        range: (first_byte, last_byte) tuple, both inclusive.  The default
            of 0 is not usable (it cannot be indexed); callers must pass a
            2-tuple.

    Attributes read by the caller while the thread runs:
        downloaded: number of bytes of this part written so far.
        totalLength: number of bytes in this part.
        percent: downloaded / totalLength as a percentage.
        headerrange: byte range requested by the most recent attempt.
        time: whole seconds spent in the current connection (set once data
            arrives).
    """

    # Connection attempts made before the thread gives up on its part.
    maxAttempts = 10

    def __init__(self, name, url, filename, range=0):
        threading.Thread.__init__(self, name=name)
        self.url = url
        self.filename = filename
        self.range = range
        self.totalLength = range[1] - range[0] + 1
        self.bufferSize = 8192
        self._resumeFromDisk()
        self.headerrange = (self.range[0] + self.downloaded, self.range[1])

    def _resumeFromDisk(self):
        """Set `downloaded` to the size of the existing part file (0 if none)."""
        try:
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0
        self._updatePercent()

    def _updatePercent(self):
        """Recompute `percent`; an empty range counts as complete."""
        if self.totalLength > 0:
            self.percent = self.downloaded / float(self.totalLength) * 100
        else:
            self.percent = 100.0

    def _fetchOnce(self):
        """Open one connection for the remaining bytes and append them to the part file."""
        request = urllib2.Request(self.url)
        request.add_header('Range', 'bytes=%d-%d' % self.headerrange)
        conn = urllib2.urlopen(request)
        try:
            startTime = time.time()
            # Keep the part file open for the whole connection instead of
            # reopening it for every 8 KB chunk.
            with open(self.filename, 'ab') as part:
                data = conn.read(self.bufferSize)
                while data:
                    part.write(data)
                    self.time = int(time.time() - startTime)
                    self.downloaded += len(data)
                    self._updatePercent()
                    data = conn.read(self.bufferSize)
        finally:
            conn.close()

    def run(self):
        """Download the part, resuming after failures, for at most maxAttempts connections."""
        self._resumeFromDisk()
        for _attempt in range(self.maxAttempts):
            # Stop once the part is complete.  This also avoids requesting an
            # empty byte range (start > end) when the part file was already
            # complete, which only produced ten failing retries.
            if self.downloaded >= self.totalLength:
                break
            self.headerrange = (self.range[0] + self.downloaded, self.range[1])
            try:
                self._fetchOnce()
            except Exception:
                # Best effort at the thread boundary: any failure is retried
                # from the current offset after a short pause.
                time.sleep(1)

def Split(size, blocks):
    """Divide ``size`` bytes into ``blocks`` consecutive inclusive byte ranges.

    Returns a list of ``blocks`` (first_byte, last_byte) tuples.  All ranges
    but the last hold ``size // blocks`` bytes; the last one runs to
    ``size - 1`` and so absorbs the remainder.  ``blocks`` must be at least
    1 (0 raises ZeroDivisionError).
    """
    # Explicit floor division keeps the offsets integral on every Python version.
    blocksize = size // blocks
    ranges = [(i * blocksize, (i + 1) * blocksize - 1)
              for i in range(blocks - 1)]
    ranges.append((blocksize * (blocks - 1), size - 1))
    return ranges

def GetHttpFileSize(url):
    """Return the Content-Length reported for ``url``, or 0 if it is unknown.

    0 is returned for every failure: the URL cannot be opened, the header
    is missing, or its value is not an integer.  The result is always an
    int (the previous code could hand back the raw header string when the
    value did not parse).
    """
    try:
        conn = urllib.urlopen(url)
        try:
            # Case-insensitive lookup of the exact header, instead of taking
            # any header line that merely contains the text "Length".
            declared = conn.info().get('Content-Length')
        finally:
            conn.close()
        return int(declared)
    except Exception:
        # Deliberate best effort: callers treat 0 as "unreachable".
        return 0

def hasLive(ts):
    """Return True if at least one thread in ``ts`` is still running."""
    # is_alive() exists since Python 2.6; the isAlive() spelling was removed
    # in Python 3.9.
    return any(t.is_alive() for t in ts)

def _removePartFiles(names):
    """Delete the temporary part files, skipping any that cannot be removed."""
    for partName in names:
        try:
            os.remove(partName)
        except OSError:
            pass


def MyHttpGet(url, output=None, connections=4):
    """Download ``url`` with several parallel range requests and merge the parts.

    arguments:
        url, in GBK encoding
        output, name of the resulting file, used as given (no conversion);
            defaults to the last path component of url
        connections, integer, number of download threads / part files

    Returns 0 on success and -1 when the download stalls, a part ends up
    incomplete, or the merged file cannot be written.

    Raises URLUnreachable when the size of the remote file cannot be
    determined.  Ctrl-C during the download removes the part files and
    exits the process with status 1.
    """
    length = GetHttpFileSize(url)
    print(length)
    if length == 0:
        raise URLUnreachable
    mb = length / 1024 / 1024.0

    # `filename` used to be bound only when `output` was passed, so the
    # default output name always ended in "File write Error".
    if not output:
        output = url.split('/')[-1]
    filename = output

    blocks = connections
    ranges = Split(length, blocks)
    names = ["%s_%d" % (output, i) for i in range(blocks)]

    ts = []
    for i in range(blocks):
        t = HttpGetThread(i, url, names[i], ranges[i])
        t.setDaemon(True)
        t.start()
        ts.append(t)

    startSize = sum(t.downloaded for t in ts)
    startTime = time.time()
    lastd = 0
    nobytecounter = 0
    try:
        while hasLive(ts):
            etime = time.time() - startTime
            done = sum(t.downloaded for t in ts)
            d = done / float(length) * 100
            # Count consecutive polls without progress; more than 100 polls
            # of 0.2 s each means the download is stuck.
            if lastd == d:
                nobytecounter += 1
            else:
                nobytecounter = 0
                lastd = d
            if nobytecounter > 100:
                print('')
                print("can't download!")
                _removePartFiles(names)
                return -1

            if etime > 0:
                rate = (done - startSize) / float(etime) / 1024
            else:
                rate = 100.0  # placeholder shown before any time has elapsed
            progressStr = u'\rFilesize: %d(%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' %(length, mb, d, rate)
            sys.stdout.write(progressStr)
            sys.stdout.flush()
            time.sleep(0.2)
    except KeyboardInterrupt:
        print('')
        print("Exit")
        _removePartFiles(names)
        sys.exit(1)

    print('')

    # A worker gives up silently once its retries are used up; do not merge
    # (and report as success) a file with missing bytes.
    if any(t.downloaded < t.totalLength for t in ts):
        print("can't download!")
        _removePartFiles(names)
        return -1

    try:
        with open(filename, 'wb') as merged:
            for partName in names:
                with open(partName, 'rb') as part:
                    merged.write(part.read())
    except (IOError, OSError):
        print('')
        print('File write Error')
        _removePartFiles(names)
        return -1
    _removePartFiles(names)
    return 0


if __name__ == '__main__':
    # Manual check: fetch one sample file over four connections.
    MyHttpGet('http://jsz.com.cn./18/Hongdou.mp3',
              output='Hongdou.mp3',
              connections=4)
downmp3.py
#!/usr/bin/env python
# coding=utf-8
# Sample candidate URLs, used only by the manual test at the bottom of this file.
url = [
    'http://www.efu.com.cn/topic/611/1.mp3',
    'http://bbs.baby8.cn//upload/vip/10/200831715101730.mp3',
    'http://www1.neacn.com/file_db/music/1153/f/26.wma',
    'http://www.tklk8.cn/bbs/uploadfile/2009-8/200981013174970090.mp3',
    'http://media.winglish.com/sound/winhao/free_content/music/music_20090507.mp3',
]
from HttpGetThread import *

def GetSize(resource_url):
    """Return the size of ``resource_url`` in whole kilobytes, or -1 on failure.

    The size is taken from the Content-Length header of an HTTP GET
    response.  -1 is returned (after printing a diagnostic) when the request
    fails, the status is not 200, or the header is missing or not a number.
    """
    import httplib
    from urlparse import urlparse

    try:
        parsedurl = urlparse(resource_url)
        host = parsedurl[1]
        # Send "/" for an empty path and keep the query string, which the
        # previous code dropped.
        path = parsedurl[2] or '/'
        if parsedurl[4]:
            path = path + '?' + parsedurl[4]
        httpConn = httplib.HTTPConnection(host)
        try:
            httpConn.request("GET", path)
            r = httpConn.getresponse()
            status = r.status
            reason = r.reason
            declared = r.getheader('Content-Length')
        finally:
            # Only the headers are needed; close without reading the body.
            httpConn.close()
    except (IOError, ValueError, httplib.HTTPException):
        # Print the input itself: the parsed form may not exist yet when the
        # failure happened while parsing.
        print(resource_url)
        return -1

    if status != 200:
        print('%s %s' % (status, reason))
        return -1
    try:
        return int(declared) // 1024
    except (TypeError, ValueError):
        # Header absent (None) or not numeric.
        print(resource_url)
        return -1
    

    

def DownMp3(urlist, name):
    """Download one song from a list of candidate URLs.

    The first candidate whose size, as reported by GetSize, lies between
    1536 KB and 10240 KB inclusive is chosen; when none qualifies the first
    candidate is used.  The file is saved as ``name`` plus the extension
    taken from the chosen URL ("mp3" when the URL has no dot) and is fetched
    with 10 connections.

    Returns 0 on success, -1 on any failure (including an empty ``urlist``).
    """
    if not urlist:
        return -1

    mp3url = urlist[0]
    for candidate in urlist:
        if 1536 <= GetSize(candidate) <= 10240:
            mp3url = candidate
            break

    pieces = mp3url.rsplit('.', 1)
    extension = pieces[1] if len(pieces) == 2 else 'mp3'
    fulname = name + '.' + extension

    try:
        # Call MyHttpGet directly: it used to sit inside an `assert`, which is
        # removed (download included) when Python runs with -O/-OO.
        status = MyHttpGet(mp3url.decode('gb18030').encode('gbk'), fulname, 10)
    except (UnicodeError, URLUnreachable):
        # UnicodeError covers both the decode and the encode step;
        # URLUnreachable is raised by MyHttpGet when the size is unknown.
        return -1
    if status == 0:
        return 0
    return -1
    
    
if __name__ == '__main__':
    # Manual check: download the sample candidates under a GBK-encoded name.
    sample_name = '红豆'.decode('utf-8').encode('gbk')
    DownMp3(url, sample_name)
    
setup.py
# coding=utf-8
from distutils.core import setup
import py2exe

# Build mydown.py into a console executable with py2exe.
setup(
    console=['mydown.py'],
    options={
        "py2exe": {
            "includes": ["downmp3"],
            "compressed": True,
            # NOTE(review): optimization level 2 corresponds to `python -OO`,
            # which removes `assert` statements -- any call placed inside an
            # assert is skipped in the built executable.  Confirm the
            # application modules do no work inside asserts.
            "optimize": 2,
            "bundle_files": 1,
        }
    },
    zipfile=None,
)


嗯 就是这样 我比较懒 看代码吧

转载于:https://www.cnblogs.com/auxten/archive/2009/09/18/1569410.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值