Scraping NetEase Cloud Music comments (周杰伦 - 晴天): crawl the comments of a single song.
From there it crawls every comment of every song on every album of a given artist. There is plenty left to improve; this will do for now...
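Everything below ultimately rests on one unofficial comments endpoint, /api/v1/resource/comments/R_SO_4_<song id>?limit=20&offset=<n>, which returns a JSON page of 20 comments. As a quick standalone test of that call (a sketch, not part of the crawler: the song id is taken from the commented-out test URL inside the script, the Cookie is a placeholder you must fill in yourself, and it assumes the endpoint still behaves the way it did when this was written, otherwise the server answers {'code': -460, 'msg': 'Cheating'}):

import requests

song_id = '1325711115'  # test id borrowed from the commented-out URL in the script below
url = 'https://music.163.com/api/v1/resource/comments/R_SO_4_{}?limit=20&offset=0'.format(song_id)
headers = {
    'Host': 'music.163.com',
    'Origin': 'https://music.163.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Cookie': '......',  # placeholder: paste your own NetEase cookies here
}
response = requests.get(url, headers=headers)
for comment in response.json().get('comments', []):
    print(comment['user']['nickname'], ':', comment['content'])

The full script:
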
import random
import time
from math import ceil
import sys
from threading import Thread
import requests
import pymongo
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
class GetSinger:
    def __init__(self, url):
        replace = random.randint(1, 9)
        self.headers = {
            'Host': 'music.163.com',
            # 'Referer': 'https://music.163.com/song?id=482999668',
            'Origin': 'https://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/71.0.3578.98 Safari/537.36',
            # cookies are required, otherwise the API answers {'code': -460, 'msg': 'Cheating'}
            'Cookie': '_ntes_nnid=3{}533f97b25070a32c249f59513ad20c,1{}92582485123; _ntes_nuid=3{}533f97b25070a32c249f59513ad20c;......'.format(replace, replace, replace)
        }
        self.proxies = {
            'https': '123.163.117.246:33915',
            'https': '180.125.17.139:49562',
        }  # these proxy IPs are probably dead; the duplicate key also means only the last entry survives
        requests.adapters.DEFAULT_RETRIES = 5  # increase the retry count
        self.session = requests.session()
        self.session.keep_alive = False  # close extra connections
        self.base_url = 'https://music.163.com'
        self.url = url.replace('/#', '')
        self.album_url = self.url.replace('artist', 'artist/album') + '&limit=10000'  # list all albums on a single page
        self.client = pymongo.MongoClient(host='localhost', port=27017)

    def get_albums(self):  # collect the album URLs
        html = self.session.get(self.album_url, headers=self.headers)
        text = etree.HTML(html.text)
        albums = text.xpath("//li/div/a[@class='msk']/@href")
        for album in albums:
            yield album  # yield to keep memory low; probably not strictly necessary

    def get_songs(self):  # collect the URLs of every song on each album
        for album in self.get_albums():
            html = self.session.get(self.base_url + album, headers=self.headers)
            # print(html.text)
            text = etree.HTML(html.text)
            # There are many nodes between the div and the a tags, and "find all a nodes under them"
            # failed to locate anything. Some people use div[@class="f-hide"]; locating via the sibling
            # of its parent, div[@class='n-songtb'], works.
            songs = text.xpath("//div[@class='n-songtb']//a/@href")
            yield album, songs  # the album plus the list of song URLs on it

    def get_song(self):  # yield each song URL together with its album URL
        for album, songs in self.get_songs():
            for song in songs:
                yield album, song

    def get_comment_num(self):  # fetch the total number of comments for each song
        for album, song in self.get_song():
            while True:
                try:
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    driver = webdriver.Chrome(chrome_options=chrome_options)
                    driver.get(self.base_url + song)
                    frame = WebDriverWait(driver, 10).until(lambda x: x.find_elements_by_id('g_iframe')[0])
                    driver.switch_to.frame(frame)
                    cmmt_num = WebDriverWait(driver, 10).until(
                        lambda x: x.find_elements_by_xpath("//span[@class='sub s-fc3']/span")[0].text)
                    driver.close()
                except Exception as e:
                    print(e)  # if the comment-count element is missing this raises 'list index out of range'; just try again
                    # Unsolved: a non-zero count sometimes loads as 0, and the value shown on first load
                    # can differ slightly from the one shown after clicking the last page button.
                    continue
                else:
                    yield album, song, int(cmmt_num)
                    break
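
    # Possible simplification (untested assumption, not part of the original script): the comments API
    # used in get_content below seems to return a 'total' field next to 'comments', e.g. something like
    #     total = self.session.get(self.base_url + '/api/v1/resource/comments/R_SO_4_'
    #                              + song.replace('/song?id=', '') + '?limit=1&offset=0',
    #                              headers=self.headers).json().get('total')
    # If that holds, the headless-Chrome step above could be dropped, which would also sidestep the
    # "count sometimes loads as 0" problem noted in the except branch.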

    def get_comment(self):  # crawl every comment of each song and store it in the database
        for album, song, cmmt_num in self.get_comment_num():
            pages_num = ceil(cmmt_num / 20)  # the pages are handled in two parts to avoid duplicates
            pages_num_a = int(pages_num / 10) * 10
            pages_num_b = pages_num - pages_num_a
            print(pages_num, pages_num_a, pages_num_b)
            collections = self.url.replace('https://music.163.com/artist?id=', 'artist') + album.replace('/album?id=', 'a') + song.replace('/song?id=', 's')
            # first the pages that fit into full batches of ten
            for i in range(0, pages_num_a, 10):
                ps = []
                # time.sleep(0.2)  # optionally sleep a little between creating batches of threads
                for j in range(10):  # set to 1 while testing; 10 in real runs
                    p = HandleChunkedEncodingError(target=self.save, name='thd' + str(i + j), args=(collections, song, i + j))
                    ps.append(p)
                for p in ps:
                    p.start()
                for p in ps:
                    p.join()
                info = 'page ' + str(i + 10) + ' done'  # crawl ten pages per batch so problems surface early
                print(info)
                # sys.stdout.write("\r{0}".format(info))
                # sys.stdout.flush()
            # then the remaining pages (fewer than ten)
            ps = []
            for i in range(pages_num_a, int(pages_num)):
                p = HandleChunkedEncodingError(target=self.save, name='thd' + str(i), args=(collections, song, i))
                ps.append(p)
            for p in ps:
                p.start()
            for p in ps:
                p.join()
            info = 'page left ' + str(pages_num_b) + ' done'
            print(info)
            # sys.stdout.write("\r{0}".format(info))
            # sys.stdout.flush()

    def save(self, collection, song, page):
        for comment in self.get_content(song, page):
            user = comment.get('user')
            img_url = user.get('avatarUrl')
            name = user.get('nickname')
            uid = user.get('userId')
            cid = comment.get("commentId")
            ctime = comment.get("time")
            content = comment.get('content')
            item = {
                'name': name,
                'uid': uid,
                'img_url': img_url,
                'cid': cid,
                'ctime': ctime,
                'content': content
            }
            while True:
                try:
                    self.insert2db(collection, item)
                except Exception as e:
                    # print('insert error:', e)
                    # time.sleep(0.5)
                    pass
                else:
                    break

    def get_content(self, song, page):
        offset = str(page * 20)
        req_url = self.base_url + '/api/v1/resource/comments/R_SO_4_' + song.replace('/song?id=', '') + '?limit=20&offset=' + offset
        print(req_url)
        # req_url = 'https://music.163.com/api/v1/resource/comments/R_SO_4_1325711115?limit=20&offset=0'
        while True:  # a plain GET is enough; this API needs no form data, and the dead proxies stay commented out
            response = self.session.get(url=req_url, headers=self.headers)  # , proxies=self.proxies
            if response.status_code != 200 and response.status_code != 500:  # skip 500s here to avoid flooding the console with server errors
                # print(response.status_code)  # if the request failed, sleep a couple of seconds and try again
                time.sleep(2)
                continue
            try:
                result = response.json()
                comments = result.get("comments")
                if comments is None:  # questionable; needs a closer look
                    continue
                else:
                    return comments
            except Exception as e:
                # print('get error:', e)  # show the exception
                continue

    def insert2db(self, collection, item):
        db = self.client.singer  # use (and lazily create) the 'singer' database
        collection = db[collection]  # one collection per artist + album + song, created on first insert
        collection.insert_one(item)  # insert the comment (insert_one instead of the deprecated insert)
        # time.sleep(1000)  # blocks every insert; only useful while testing, so keep it commented out

class HandleChunkedEncodingError(Thread):
    def __init__(self, target, name, args):
        Thread.__init__(self)
        self.name = name
        self.args = args
        self.target = target

    def run(self):
        while True:
            try:
                self.target(*self.args)  # run the wrapped target
            except Exception as e:
                # print('thread', self.name, 'running error: ', e)
                time.sleep(5)  # back off, then retry in a fresh thread
                rethd = HandleChunkedEncodingError(target=self.target, name=self.name, args=self.args)
                rethd.start()
                rethd.join()
            else:
                break

if __name__ == '__main__':
    url = 'https://music.163.com/#/artist?id=7217'
    fan = GetSinger(url)
    fan.get_comment()
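
To check what actually landed in MongoDB, here is a small read-back sketch to run as a separate script; it assumes the 'singer' database and the artist<id>a<album id>s<song id> collection naming used by insert2db above:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
db = client.singer  # the database the crawler writes into
for name in db.list_collection_names():  # one collection per artist + album + song
    print(name, db[name].count_documents({}), 'comments')
    for doc in db[name].find().limit(3):  # peek at a few stored comments
        print('   ', doc.get('name'), '->', doc.get('content'))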