Django Model设计:
先设计两张表,一张用来存储专辑(专辑名,封面图,专辑id,添加时间),另外一张存储专辑下每一集的信息(所属的专辑,序号,trackid,音频地址,添加时间)

class XimalayaMediaAlbum(models.Model):
    """A Ximalaya album: its name, cover image URL and Ximalaya album id."""

    name = models.CharField(verbose_name="专辑名称", max_length=50)
    imgurl = models.CharField(verbose_name=u'封面', max_length=200, default='')
    # The album id used by Ximalaya itself (not this table's primary key).
    index = models.IntegerField(verbose_name=u'喜马拉雅专辑id')
    add_time = models.DateTimeField(verbose_name="添加时间", default=datetime.now)

    class Meta:
        verbose_name = "喜马拉雅音频专辑"
        verbose_name_plural = verbose_name

    def __str__(self):
        # Rendered as "<name>-<ximalaya album id>".
        return "{0}-{1}".format(self.name, self.index)
class XimalayaMedia(models.Model):
    """One episode (track) that belongs to a Ximalaya album.

    Holds the owning album, the track's sequence number inside the album,
    the Ximalaya track id and the audio URL.
    """

    # on_delete is a required argument from Django 2.0 onwards; CASCADE is
    # what earlier versions applied implicitly, so behaviour is unchanged.
    # NOTE(review): default='' is not a valid album key; it is kept only so
    # that no new migration is generated - confirm whether it can be dropped.
    album = models.ForeignKey(
        XimalayaMediaAlbum,
        on_delete=models.CASCADE,
        verbose_name=u'专辑',
        default='',
    )
    # Sequence number of the track inside its album.
    index = models.IntegerField(verbose_name=u'序号')
    # Track id as used by Ximalaya.
    xmlyid = models.IntegerField(verbose_name=u'喜马拉雅id')
    url = models.CharField(max_length=500, verbose_name="地址")
    add_time = models.DateTimeField(default=datetime.now, verbose_name="添加时间")

    class Meta:
        verbose_name = "喜马拉雅音频"
        verbose_name_plural = verbose_name

    def __str__(self):
        return "{}-{}".format(self.album, self.index)

    def getRecord(self):
        """Return the ``time`` of the first play record for this media.

        Returns an empty string when no ``XimalayaMediaPlayRecord`` exists
        for this media.
        """
        record = XimalayaMediaPlayRecord.objects.filter(media=self).first()
        return record.time if record else ""
为了防止某个专辑正在爬取的时候有人再次提交同一个专辑,再加一张status表来记录爬取状态。
class XimalayaAlbumScrapStatus(models.Model):
    """Scrape status of an album, keyed by the album id stored as text."""

    album = models.CharField(verbose_name="专辑id", max_length=50)
    status = models.CharField(verbose_name=u'状态', max_length=50, default='running')
    add_time = models.DateTimeField(verbose_name="添加时间", default=datetime.now)

    class Meta:
        verbose_name = "喜马拉雅爬虫状态"
        verbose_name_plural = verbose_name

    def __str__(self):
        # Rendered as "<album id>-<status>".
        return "{0}-{1}".format(self.album, self.status)
前端布局:
首页的布局什么的就简单写了,只需要一个输入框,能输入新的专辑id,再来个列表,展示已经爬过的专辑。
输入框的样式是不是很熟悉呀?没错,直接抄的喜马拉雅主页的,连css名字都不想改。
<style>
    /* Full-height page with a cover background image. */
    body {
        min-height: 100vh;
        background-image: url("/static/images/intro-2.jpg");
        background-size: cover;
    }

    /* Centered area that holds the album-id search box. */
    .mid-content {
        text-align: center;
        padding: 50px 0;
        width: 1200px;
        height: 300px;
        margin: 0 auto;
    }

    .header {
        font-family: 'PingFangSC-Medium', 'Microsoft YaHei', sans-serif;
        text-align: center;
        color: #ffffff;
        font-size: 20px;
        font-weight: 700;
    }

    /* Hide the spin buttons of <input type="number"> in WebKit browsers. */
    input::-webkit-outer-spin-button,
    input::-webkit-inner-spin-button {
        -webkit-appearance: none !important;
        margin: 0;
    }

    .nextPre {
        color: #262728;
        font-size: 12px;
        cursor: pointer;
    }

    label {
        color: #fca429;
    }

    /* Search box wrapper. */
    .xui-header-searchWrapper {
        display: inline-block;
        width: 350px;
        height: 200px;
        margin-left: 30px;
        vertical-align: top;
    }

    /* Shared typography and box model of the search container and its input.
       (This rule used to appear twice in the sheet; one copy is enough.) */
    .xui-header-search, .xui-header-search-input {
        font-family: -apple-system, BlinkMacSystemFont, PingFangSC-Regular, PingFang SC, Microsoft YaHei, Helvetica Neue, Helvetica, Arial, sans-serif;
        font-size: 14px;
        -webkit-font-smoothing: antialiased;
        -webkit-box-sizing: border-box;
        box-sizing: border-box;
        margin: 0;
        list-style: none;
        height: 40px;
    }

    /* Vertically center the search container inside its wrapper. */
    .xui-header-search {
        padding: 0;
        position: relative;
        top: 50%;
        -webkit-transform: translateY(-50%);
        -ms-transform: translateY(-50%);
        transform: translateY(-50%);
    }

    /* Pill-shaped input; the right padding leaves room for the button. */
    .xui-header-search-input {
        padding: 10px 70px 10px 20px !important;
        width: 100%;
        line-height: 1.5;
        color: #333;
        background-color: #fff;
        border: 1px solid #f86442;
        border-radius: 54px;
        outline: none;
        -webkit-transition: all .2s;
        -o-transition: all .2s;
        transition: all .2s;
    }

    /* Pin the button to the right edge of the search container. */
    .xui-header-search-button_position {
        position: absolute;
        right: 0;
        top: 0;
    }

    /* Gradient submit button; the solid colour is the fallback. */
    .xui-header-search-button {
        display: inline-block;
        background-color: #f86442;
        background-image: -webkit-gradient(linear, left top, right top, color-stop(1%, #ff9973), color-stop(99%, #ff7251));
        background-image: -webkit-linear-gradient(left, #ff9973 1%, #ff7251 99%);
        background-image: -o-linear-gradient(left, #ff9973 1%, #ff7251 99%);
        background-image: linear-gradient(90deg, #ff9973 1%, #ff7251 99%);
        width: 60px;
        height: 100%;
        color: #fff;
        font-size: 20px;
        text-align: center;
        line-height: 40px;
        border-radius: 0px 54px 54px 0px;
        cursor: pointer;
    }

    .xuicon {
        display: inline-block;
        font-style: normal;
        vertical-align: baseline;
        text-align: center;
        text-transform: none;
        line-height: 1;
        text-rendering: optimizeLegibility;
        -webkit-font-smoothing: antialiased;
        -moz-osx-font-smoothing: grayscale;
    }

    /* Area below the search box that lists the stored albums. */
    .list-panel {
        text-align: center;
        width: 1200px;
        min-height: 500px;
        margin: 0 auto;
    }
</style>
<body>
<div class="mainContent">
    {# Search box: the user types an album id and clicks the confirm button. #}
    <div class="mid-content">
        <div class="e-1350701591 xui-header-searchWrapper">
            <div class="e-1050963283 xui-header-search focused">
                <input type="number" class="e-1050963283 xui-header-search-input" placeholder="专辑ID" value="">
                <span class="e-1050963283 xui-header-search-button xui-header-search-button_position">
                    {# The page script binds its click handler to .startSearch. #}
                    <i class="e-1050963283 xuicon xuicon-web_ic_search startSearch"
                       style=" padding: 10px 0 0 0; font-size: 17px;">确定</i>
                </span>
            </div>
        </div>
    </div>
    {# One tile (cover + name) per album in allAlbum, linking to the 'main' route. #}
    <div class="list-panel">
        {% for album in allAlbum %}
            <div style="height: 50px;width: 200px;display: inline-block">
                <a href="{% url 'main' album.index %}">
                    <div style="width:50px;float: left">
                        {# alt text so the tile stays meaningful when the cover fails to load. #}
                        <img style="width: 100%;"
                             src="{{ album.imgurl }}"
                             alt="{{ album.name }}">
                    </div>
                    <div style="color: white;float: left;font-size: 12px;margin: 15px 0 0 10px;width: 100px;overflow: hidden">{{ album.name }}</div>
                    <div style="clear: both;"></div>
                </a>
            </div>
        {% endfor %}
    </div>
</div>
</body>
输入albumId之后,POST到服务器,服务器开始抓取该专辑每一集的index与trackid的对应关系:
<script>
$().ready(function () {
function CheckStatus(album) {
window.location.reload();
}
$('.startSearch').click(function () {
var album = $('input').val();
if (album == "") {
alert('请输入');
return false;
}
var fd = new FormData();
fd.append('album', album);
var xhr = new XMLHttpRequest();
xhr.onreadystatechange = function () {
if (xhr.readyState == 4 && xhr.status == 200) {
var data = eval('(' + xhr.responseText + ')')
console.log(data);
alert(data.msg);
if (data.status == "success") {
setInterval(function () {
CheckStatus(album);
}, 5000);
}
}
};
xhr.open('POST', "/listen/");
xhr.setRequestHeader("X-CSRFToken", "{{ csrf_token }}");
xhr.send(fd);
});
});
</script>
首页路由url配置:
# Landing page route, served by ListenMainPageView (GET renders the page, POST starts a scrape).
url(r'^listen/$', ListenMainPageView.as_view(), name='listen_main'),
View中处理逻辑:
class ListenMainPageView(View):
    """Landing page: lists stored albums (GET) and starts a new scrape (POST)."""

    def get(self, request):
        """Render the index page with every stored album."""
        allAlbum = XimalayaMediaAlbum.objects.all()
        # Explicit context instead of locals(), so only what the template
        # needs is exposed to it.
        return render(request, 'mainPage/index.html', {'allAlbum': allAlbum})

    def post(self, request):
        """Start scraping the album whose id is posted as ``album``.

        Responds with JSON ``{"status": "success" | "fail", "msg": <text>}``.
        The request fails when the id is missing or not a positive integer,
        when a scrape of that album is already running, or when the album
        has already been stored.
        """
        ret = {}
        try:
            album_id = int(request.POST.get('album', None))
        except (TypeError, ValueError):
            album_id = None

        if album_id is None or album_id <= 0:
            ret['status'] = 'fail'
            ret['msg'] = '请输入专辑ID'
        else:
            # The status table stores the album id as text.
            album = str(album_id)
            if XimalayaAlbumScrapStatus.objects.filter(album=album).exists():
                # A status row exists: a scrape of this album is running.
                ret['status'] = 'fail'
                ret['msg'] = '正在爬取中'
            elif XimalayaMediaAlbum.objects.filter(index=album_id).exists():
                ret['status'] = 'fail'
                ret['msg'] = "已存在"
            else:
                ret['status'] = 'success'
                ret['msg'] = "开始爬取"
                self._scrap_by_album(album)
        return HttpResponse(json.dumps(ret), content_type="application/json")

    @staticmethod
    def _scrap_by_album(album):
        """Mark ``album`` as running and scrape it in a background thread."""

        def start_scrap(album):
            print('Scraping')
            try:
                manager = ScrapManager(album)
                manager.start()
            finally:
                # Always clear the marker, otherwise a failed scrape would
                # report "正在爬取中" for this album forever.
                XimalayaAlbumScrapStatus.objects.filter(album=album).delete()

        # Write the marker BEFORE the thread starts: the worker deletes it
        # when it finishes, so creating it afterwards could leave a stale row.
        XimalayaAlbumScrapStatus.objects.get_or_create(album=album)
        thread = threading.Thread(target=start_scrap, args=(album,))
        thread.start()
把之前的两个接口封装到class ScrapManager中,没啥特别的地方,只是看起来要简洁些:
class ScrapManager:
    """Scrapes one Ximalaya album into the database.

    ``start()`` fetches the album title and cover, stores the album, then
    walks the paged track list and stores one ``XimalayaMedia`` row per
    track. ``getDownloadUrl()`` resolves and stores the audio URL of a
    single track.
    """

    # Seconds to wait for any single HTTP request; without a timeout a
    # stalled connection would block the scrape indefinitely.
    REQUEST_TIMEOUT = 10
    # Highest track-list page number that will be requested.
    MAX_PAGES = 999

    def __init__(self, album=""):
        """Prepare a scraper for the album with Ximalaya id ``album``."""
        self.album_id = album
        self.executor = ThreadPoolExecutor(max_workers=10)
        self.trackTotalCount = None
        self.successCount = 0
        self.allTasks = []
        # Browser-like request headers. The hard-coded If-None-Match header
        # was dropped: a conditional request may be answered with an empty
        # "304 Not Modified" body, which cannot be parsed.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.ximalaya.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
        }

    def _fetch_text(self, url):
        """GET ``url`` and return the body decoded as UTF-8.

        Raises ``requests.HTTPError`` on a 4xx/5xx response.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content.decode('utf-8')

    def start(self):
        """Run the whole scrape for this album."""
        self.createAlbum()

    def getAlbumInfor(self):
        """Return ``(title, cover_src)`` read from the album's web page.

        Raises ``ValueError`` when the page contains no ``<img>`` tag.
        """
        page = self._fetch_text('http://www.ximalaya.com/youshengshu/{}/'.format(self.album_id))
        soup = BeautifulSoup(page, "html.parser")
        # Uses the first <img> of the page - presumably the album cover;
        # verify against the current page layout.
        cover = soup.img
        if cover is None:
            raise ValueError('no <img> found on album page {}'.format(self.album_id))
        return cover['alt'], cover['src']

    def createAlbum(self):
        """Store the album and its tracks, then clear the running marker."""
        try:
            title, src = self.getAlbumInfor()
            # Look the album up by its Ximalaya id only; name and cover are
            # just initial values, so a changed title does not create a
            # second row for the same album.
            self.album_instance, create = XimalayaMediaAlbum.objects.get_or_create(
                index=self.album_id,
                defaults={'name': title, 'imgurl': src},
            )
            self.getIndexAndID()
        finally:
            # Clear the marker on failure as well, so the album is not
            # reported as "running" forever.
            self.checkScrapStatus()

    def checkScrapStatus(self):
        """Delete the scrape-status rows of this album."""
        XimalayaAlbumScrapStatus.objects.filter(album=self.album_id).delete()

    def getDownloadUrl(self, id):
        """Fetch the audio URL of track ``id`` and save it on its media row.

        Returns the URL, or ``""`` when the response has no track entry or
        the URL does not contain ``m4a``.
        """
        print('geting src from xmly web for {}'.format(id))
        body = self._fetch_text('http://www.ximalaya.com/revision/play/tracks?trackIds={}'.format(id))
        tracks = json.loads(body)['data']['tracksForAudioPlay']
        src = tracks[0]['src'] if tracks else None
        if not src or 'm4a' not in src:
            return ""
        media = XimalayaMedia.objects.filter(xmlyid=id).first()
        if media:
            media.url = src
            media.save()
        return src

    def getIndexAndID(self):
        """Store one ``XimalayaMedia`` row (index, track id) per album track.

        Pages are requested in order until one comes back without tracks.
        """
        for page_num in range(1, self.MAX_PAGES + 1):
            pageurl = 'http://www.ximalaya.com/revision/album/getTracksList?albumId={}&pageNum={}'.format(self.album_id, page_num)
            data = json.loads(self._fetch_text(pageurl))['data']
            if not self.trackTotalCount:
                self.trackTotalCount = int(data['trackTotalCount'])
            tracks = data['tracks']
            if not tracks:
                break
            for track in tracks:
                XimalayaMedia.objects.get_or_create(album=self.album_instance, index=track['index'], xmlyid=track['trackId'])