Crawler: Dynamically Scraping QQ Music

The script below pulls the hot-song chart, resolves a playable stream URL (`purl`) for each song, and downloads the MP3 files:

```python
import requests
import random
import os

def get_hot_song_list():
    # Toplist endpoint; the long `data` parameter is URL-encoded JSON
    # selecting topId=26 (the hot-song chart), offset 0, 100 songs.
    url = "https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI16992261760781546&g_tk=5381&loginUin=3004439232&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22detail%22%3A%7B%22module%22%3A%22musicToplist.ToplistInfoServer%22%2C%22method%22%3A%22GetDetail%22%2C%22param%22%3A%7B%22topId%22%3A26%2C%22offset%22%3A0%2C%22num%22%3A100%2C%22period%22%3A%222019_34%22%7D%7D%2C%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%7D"
    headers = {
        "user-agent": "***",
        "Referer": "https://y.qq.com/n/yqq/toplist/26.html"
    }
    r = requests.get(url, headers=headers)
    resp_dict = r.json()
    song_info = []
    for once in resp_dict['detail']['data']['songInfoList']:
        name = once['name']
        mid = once['mid']
        # A song can have several singers; join their names with '-'
        singer = '-'.join(s['name'] for s in once['singer'])
        song_info.append({'name': name, 'mid': mid, 'singer': singer})
    return song_info

def get_song_resource(songlist):
    for once in songlist:
        mid = once['mid']
        # Random suffix imitates the cache-busting token the web player appends
        ran = str(random.random()).replace("0.", '')
        url = "https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey" + ran + "&g_tk=5381&loginUin=3004439232&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22req_0%22%3A%7B%22module%22%3A%22vkey.GetVkeyServer%22%2C%22method%22%3A%22CgiGetVkey%22%2C%22param%22%3A%7B%22guid%22%3A%22397795637%22%2C%22songmid%22%3A%5B%22" + mid + "%22%5D%2C%22songtype%22%3A%5B0%5D%2C%22uin%22%3A%223004439232%22%2C%22loginflag%22%3A1%2C%22platform%22%3A%2220%22%7D%7D%2C%22comm%22%3A%7B%22uin%22%3A%223004439232%22%2C%22format%22%3A%22json%22%2C%22ct%22%3A24%2C%22cv%22%3A0%7D%7D"
        headers = {
            "user-agent": "***",
            "Referer": "https://y.qq.com/portal/player.html"
        }
        r = requests.get(url, headers=headers)
        resp_dict = r.json()
        purl = resp_dict['req_0']['data']['midurlinfo'][0]['purl']
        # An empty purl means the song is not freely playable (e.g. VIP-only); skip it
        if purl == "":
            continue
        once['resource'] = "http://isure.stream.qqmusic.qq.com/" + purl
    return songlist

def download(songinfo):
    headers = {
        "user-agent": "**",
    }
    os.makedirs("./MP3", exist_ok=True)  # make sure the output directory exists
    for once in songinfo:
        if 'resource' in once:
            audio_r = requests.get(once['resource'], headers=headers, stream=True)
            savePath = "./MP3/%s-%s.mp3" % (once['name'], once['singer'])
            # Stream the audio to disk in 10 KB chunks
            with open(savePath, 'wb') as file:
                for chunk in audio_r.iter_content(10240):
                    file.write(chunk)
            print("%s downloaded successfully" % once['name'])


songlist = get_hot_song_list()
songinfo = get_song_resource(songlist)
download(songinfo)
```
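
The long `data` query strings above are just URL-encoded JSON. For readability they can be built programmatically instead of hard-coded; a minimal sketch that reproduces the toplist payload embedded in `get_hot_song_list()`:

```python
import json
from urllib.parse import quote

# The same payload that appears URL-encoded in get_hot_song_list()
payload = {
    "detail": {
        "module": "musicToplist.ToplistInfoServer",
        "method": "GetDetail",
        "param": {"topId": 26, "offset": 0, "num": 100, "period": "2019_34"},
    },
    "comm": {"ct": 24, "cv": 0},
}

# Compact JSON, then percent-encode it for use as the `data` query parameter
data_param = quote(json.dumps(payload, separators=(',', ':')))
url = ("https://u.y.qq.com/cgi-bin/musicu.fcg?format=json&platform=yqq.json"
       "&needNewCode=0&data=" + data_param)
```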
Scraping QQ Music with a Python crawler can be broken into the following steps:

### 1. Define the target

Decide exactly what to scrape from QQ Music, e.g. song metadata, comments, or charts.

### 2. Analyze the target page

Use browser developer tools (such as Chrome DevTools) to inspect the structure of the QQ Music page and locate the HTML elements or API endpoints that carry the data.

### 3. Send HTTP requests

Use the `requests` library to send HTTP requests to the target page or API and fetch the response.

```python
import requests

# Example: fetch a QQ Music song page
url = 'https://y.qq.com/n/yqq/song/00123456789.html'  # replace with an actual song URL
response = requests.get(url)
if response.status_code == 200:
    html_content = response.text
```

### 4. Parse the response

Depending on where the data lives, use `BeautifulSoup` or `lxml` to parse the HTML and extract what you need.

```python
from bs4 import BeautifulSoup

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Example: extract the song title
song_name = soup.find('span', class_='data__name_txt').text
```

### 5. Work with API endpoints

If the data comes from an API endpoint, analyze the request parameters and the format of the returned data.

```python
# Example: QQ Music song-comment API
comment_api_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
params = {
    'g_tk': 5381,
    'loginUin': 0,
    'hostUin': 0,
    'format': 'json',
    'inCharset': 'utf8',
    'outCharset': 'GB2312',
    'notice': 0,
    'platform': 'yqq.json',
    'needNewCode': 0,
    'cid': 205360772,
    'reqtype': 2,
    'biztype': 1,
    'topid': 23456789,  # replace with an actual song ID
    'cmd': 8,
    'needmusiccrit': 0,
    'pagenum': 0,
    'pagesize': 20
}
response = requests.get(comment_api_url, params=params)
if response.status_code == 200:
    comment_data = response.json()
    # Process the comment data
    comments = comment_data['comment']['commentlist']
    for comment in comments:
        print(comment['rootcommentcontent'])
```

### 6. Loop over paginated data

If the data spans multiple pages, send requests in a loop and process each page.

```python
# Example: loop over several pages of comments
for pagenum in range(5):  # scrape the first 5 pages of comments
    params['pagenum'] = pagenum
    response = requests.get(comment_api_url, params=params)
    if response.status_code == 200:
        comment_data = response.json()
        comments = comment_data['comment']['commentlist']
        for comment in comments:
            print(comment['rootcommentcontent'])
```

### 7. Handle anti-crawling measures

QQ Music may deploy anti-crawling measures such as IP bans and CAPTCHAs. Proxy IPs, realistic request headers, and rate limiting can help work around them (a proxy and rate-limit sketch follows at the end of this section).

```python
# Set request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
response = requests.get(url, headers=headers)
```

### 8. Store the data

Store the scraped data in a file (e.g. CSV or JSON, both shown below) or a database (e.g. MySQL, MongoDB).

```python
import csv

# Example: store the comment data in a CSV file
with open('qq_music_comments.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
    fieldnames = ['comment_id', 'content', 'user_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for comment in comments:
        writer.writerow({
            'comment_id': comment['commentid'],
            'content': comment['rootcommentcontent'],
            'user_name': comment['nick']
        })
```
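
To flesh out step 7: a minimal sketch of proxy use plus rate limiting, reusing `comment_api_url` and `params` from step 5. The proxy address is a placeholder, not a real endpoint:

```python
import time
import random
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# Placeholder proxy address; substitute a proxy you actually control
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}

for pagenum in range(5):
    params['pagenum'] = pagenum
    response = requests.get(comment_api_url, params=params,
                            headers=headers, proxies=proxies)
    if response.status_code == 200:
        comments = response.json()['comment']['commentlist']
        for comment in comments:
            print(comment['rootcommentcontent'])
    # Pause 1-3 seconds between pages to keep the request rate low
    time.sleep(random.uniform(1, 3))
```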
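
Step 8 also mentions JSON as a storage option; the same records can be written with the standard-library `json` module:

```python
import json

# Reshape the comment list from step 5 into plain records
records = [{
    'comment_id': comment['commentid'],
    'content': comment['rootcommentcontent'],
    'user_name': comment['nick'],
} for comment in comments]

with open('qq_music_comments.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)
```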