参考:爬取网易云音乐评论。评论是典型的 ajax 加载,大多数人的做法是去破解 js 加密,有点繁琐。
这里只爬取周杰伦《晴天》这一首歌的评论,因为数据量稍大;以后再爬取整个歌手的评论。
这是加密api的情况:
import requests
def get_comment():
    """Fetch one page of comments through the encrypted ``weapi`` endpoint and print it.

    The pre-computed ``params`` / ``encSecKey`` pair is POSTed as form data to
    the comment endpoint of resource ``R_SO_4_2069470``. The status code and
    the raw JSON are printed first, followed by one line per comment
    (nickname, user id, avatar URL, comment id, timestamp, content).

    Returns:
        None. Everything is written to stdout.

    Raises:
        requests.exceptions.RequestException: network, proxy and timeout
            failures are not handled here.
        ValueError: if the response body is not valid JSON.
    """
    url = r'https://music.163.com/weapi/v1/resource/comments/R_SO_4_2069470?csrf_token='
    headers = {
        'Host': 'music.163.com',
        'Referer': 'https://music.163.com/song?id=482999668',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/71.0.3578.98 Safari/537.36',
        # Cookies should be added here, otherwise the server replies
        # {'code': -460, 'msg': 'Cheating'}.
        # Proxies are required as well; the author made do with a personal VPN.
    }
    # A dict literal keeps only the last value given for a key. The original
    # also listed 'https://110.52.234.72' under the same 'https' key, so that
    # entry was always discarded; only the effective entry is kept.
    proxies = {
        'https': 'https://119.101.112.66',
    }
    formdata = {
        "params": 'Wk3drXP2/Nj8YbOQoL3ORmBM784lqxwm0VELQyBipJWx/rd8fUmklRZ6vL+G1f2dbZ/8WE7f25gWe+2BdXp3+d2AwkiTy5DxeVd4SiHX5qat+jU642hSysQVtHDfJHmCi6rjndr/YEBSccqnzIbueeA9H08OlzAZoYa5T6xlbQpxgtTdX5E1MF6R71ykxkS8',
        "encSecKey": '1d5e93ee97662d6f9dfaf07dbe4b9d4f9ffe6b90b484d8acc14696214a556000198d51ce3d87d9123db07f96307c919c02d84fa4a204e9d0a387404141fd43400fb2ec9aaa07ae99d99df133cc6d4c31ee8ab7859d83351b154c1ab2bed81a84159a25956ed1485551639e37fc3502ab049a03051ca40f85ef4dd648aabe9286',
    }
    # The form payload has to be sent with the request; previously it was
    # built and then left out of the POST. The timeout keeps a dead proxy
    # from blocking the call forever.
    response = requests.post(
        url=url,
        headers=headers,
        data=formdata,
        proxies=proxies,
        timeout=10,
    )
    print(response.status_code)
    result = response.json()
    print(result)
    # A rejected request carries no "comments" key; treat that as an empty page.
    for comment in result.get("comments") or []:
        user = comment.get('user') or {}
        img_url = user.get('avatarUrl')
        name = user.get('nickname')
        uid = user.get('userId')
        commentid = comment.get("commentId")
        commenttime = comment.get("time")
        content = comment.get('content')
        print(name, uid, img_url, commentid, commenttime, content)
# Run the single-page demo request as soon as this snippet is executed.
get_comment()
偶然看到没有加密的url尝试了下,出现requests.exceptions.ProxyError
Process Process-4:
OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
During handling of the above exception, another exception occurred:
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x0000019F0FB6EF28>: Failed to establish a new connection: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
During handling of the above exception, another exception occurred:
During handling of the above exception, another exception occurred:
requests.exceptions.ProxyError: HTTPConnectionPool(host='127.0.0.1', port=1080): Max retries exceeded with url: http://music.163.com/api/v1/resource/comments/R_SO_4_186016?limit=20&offset=6840 (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000019F0FB6EF28>: Failed to establish a new connection: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted',)))
网上两种做法,配置requests参数和加proxies
看了一些ip代理,测试一下无效
配置了requests参数,感觉会好一点
# Fragment taken from inside a retry loop (the `continue` below needs an enclosing loop).
# NOTE(review): assigning DEFAULT_RETRIES after import presumably does not change
# the retry count of adapters created afterwards - confirm against requests' HTTPAdapter.
requests.adapters.DEFAULT_RETRIES = 5 # raise the number of reconnect attempts
s = requests.session()
# NOTE(review): `keep_alive` does not look like an attribute Session reads - verify.
s.keep_alive = False # close surplus connections
response = s.post(url=url, headers=headers, proxies=proxies)
if response.status_code != 200:
print(response.status_code) # on a failed request sleep, then post again (the call below sleeps a random 0-5 s)
time.sleep(random.random()*5)
continue
到4800页的时候,出现 requests.exceptions.ChunkedEncodingError(网上有一些解决办法):
Process Process-4805:
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
http.client.IncompleteRead: IncompleteRead(0 bytes read)
During handling of the above exception, another exception occurred:
raise ProtocolError('Connection broken: %r' % e, e)
urllib3.exceptions.ProtocolError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read))
During handling of the above exception, another exception occurred:
requests.exceptions.ChunkedEncodingError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read))
重写Thread类的run方法,如果线程报错而退出,则创建新线程,继续跑:
class HandleChunkedEncodingError(Thread):
    """Worker thread that restarts its job in a fresh thread whenever it fails.

    ``target(*args)`` is called once. If it raises, the thread waits
    ``retry_delay`` seconds, starts a replacement thread with the same
    target, name and arguments, and waits for that replacement to finish.
    Joining the first thread therefore blocks until the job has completed
    without raising.

    Args:
        target: Callable to run.
        name: Thread name; replacements reuse it.
        args: Positional arguments passed to ``target``.
        retry_delay: Seconds to wait before starting a replacement.
    """

    def __init__(self, target, name, args, retry_delay=5):
        Thread.__init__(self)
        self.name = name
        self.args = args
        self.target = target
        self.retry_delay = retry_delay

    def run(self):
        """Run the target once; on failure delegate to a replacement thread."""
        try:
            self.target(*self.args)
        except Exception:
            # Back off, then hand the job to a new thread.
            time.sleep(self.retry_delay)
            rethd = HandleChunkedEncodingError(
                target=self.target,
                name=self.name,
                args=self.args,
                retry_delay=self.retry_delay,
            )
            rethd.start()
            rethd.join()
            # Stop here: join() only returns once the replacement (or one of
            # its own replacements) has run the target successfully. Looping
            # again, as the previous version did, executed the target one
            # more time per failure and duplicated its side effects.
到9000页时,报错mongodb端口占用,可能插入操作的进程没有及时join?不清楚
inset error localhost:27017: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
用Process发现占用cpu,爬到1W页改成Thread,cpu占用率下降,又发现:
Exception in thread Thread-341:
for comment in get_comment(i):
TypeError: 'NoneType' object is not iterable
然而我明明判断过了啊:
# First version of the guard: only a non-None result is returned here.
# NOTE(review): when comments is None control simply falls past this `if`;
# presumably the surrounding function then returned None - confirm in the full function.
if comments is not None:
return comments # if no comments were received, post again; otherwise return them
改成
# Revised guard: a None result explicitly restarts the enclosing loop.
# NOTE(review): the leading backtick on the next line looks like leftover markup, not Python.
` if comments is None:
continue
else:
return comments # if no comments were received, post again; otherwise return them
爬到3.5W页的时候,打开网易云的网页,页面显示:“网易云音乐本站点根据 GDPR 条款升级更新中,暂停服务,敬请期待归来!”
应该是ip切换到了欧洲
切换后,网页正常了,但不幸又{'code': -460, 'msg': 'Cheating'}
查到cookie里需要 _ntes_nuid 字段,它是一个32位字母数字混排的值,只要改其中一个字符居然就可以继续post。
换行打印太长了看着眼花,也不写入 .log 文件了,改成进度提示信息原地刷新:
# The leading "\r" moves the cursor back to the start of the line, so each
# progress message overwrites the previous one instead of adding a new line.
sys.stdout.write("\r{0}".format(info))
# Flush right away so the message shows up without waiting for a newline.
sys.stdout.flush()
嘛,总之还可以,1h 12W条。有用selenium爬取的,1h 0.3W条。selenium还是不适合爬东西。
这是不加密api的情况:
import time
import sys
import random
from threading import Thread
import requests
import pymongo
def get_comment(i):
    """Return the comments of page ``i`` from the unencrypted comment API.

    A page holds 20 comments, so page ``i`` is requested with offset
    ``i * 20``. The request is repeated, with a short pause between
    attempts, until a response carrying a ``comments`` value arrives; the
    call blocks for as long as the server keeps refusing.

    Args:
        i: Page index (page 0 starts at offset 0).

    Returns:
        The non-None value stored under ``comments`` in the JSON response.

    Raises:
        requests.exceptions.RequestException: connection, proxy and timeout
            errors are not caught here and propagate to the caller.
    """
    url = ('http://music.163.com/api/v1/resource/comments/R_SO_4_186016?'
           'limit=20&offset=' + str(i * 20))
    # The original wrote all of these under one repeated 'https' dict key, so
    # only the last one survived. Keep them as a pool and pick one per attempt.
    # The author noted these addresses are probably dead.
    proxy_pool = (
        '123.163.117.246:33915',
        '180.125.17.139:49562',
        '117.91.246.244:36838',
        '121.227.24.195:40862',
        '180.116.55.146:37993',
        '117.90.7.57:28613',
        '193.112.111.90:51933',
        '61.130.236.216:58127',
        '60.189.203.144:16820',
        '111.177.185.148:9999',
    )
    retry_pause = 2  # seconds between attempts
    while True:
        # One digit of the cookie ids is randomised on every attempt.
        replace = random.randint(1, 9)
        headers = {
            'Host': 'music.163.com',
            'Referer': 'https://music.163.com/song?id=482999668',
            'Origin': 'https://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/71.0.3578.98 Safari/537.36',
            # Without cookies the server answers {'code': -460, 'msg': 'Cheating'}.
            # The original literal was broken across lines (a syntax error) and
            # ended in a '.............' placeholder for cookie fields the
            # author left out; append further fields here if they are needed.
            'Cookie': (
                '_ntes_nnid=3{0}533f97b25070a32c249f59513ad20c,1{0}92582485123; '
                '_ntes_nuid=3{0}533f97b25070a32c249f59513ad20c;'
            ).format(replace),
        }
        proxies = {'https': random.choice(proxy_pool)}
        # Closing the session after every attempt releases its sockets, which
        # the previous per-attempt sessions never did.
        with requests.Session() as s:
            # Mounting an adapter is what actually configures connection
            # retries; setting requests.adapters.DEFAULT_RETRIES does not.
            s.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
            s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
            response = s.get(url=url, headers=headers, proxies=proxies, timeout=10)
        if response.status_code != 200:
            # Any non-200 answer (including 500) is retried after a pause.
            time.sleep(retry_pause)
            continue
        try:
            comments = response.json().get("comments")
        except (ValueError, AttributeError):
            # Body was not JSON, or not a JSON object.
            comments = None
        if comments is not None:
            return comments
        # No comments in the answer: wait before asking again instead of
        # re-sending immediately.
        time.sleep(retry_pause)
# Shared MongoDB client, created on first use by _get_comment_collection().
_mongo_client = None


def _get_comment_collection():
    """Return the ``music.comment2`` collection of the shared client."""
    global _mongo_client
    if _mongo_client is None:
        # Two threads racing here could each build a client once; that is
        # harmless, the last assignment wins and is reused from then on.
        _mongo_client = pymongo.MongoClient(host='localhost', port=27017)
    return _mongo_client.music.comment2


def insert2db(item):
    """Insert one comment document into the ``music.comment2`` collection.

    A single client is reused for all inserts. Opening a new client per
    document, as before, leaves connections behind and can exhaust local
    ports under load.

    Args:
        item: Dict describing one comment.

    Raises:
        pymongo.errors.PyMongoError: if the insert fails.
    """
    # insert_one replaces Collection.insert, which newer PyMongo releases removed.
    _get_comment_collection().insert_one(item)
def _insert_with_retry(item, attempts=5, pause=0.5):
    """Insert ``item`` via ``insert2db``, retrying a few times before giving up."""
    for attempt in range(1, attempts + 1):
        try:
            insert2db(item)
        except Exception as e:
            if attempt == attempts:
                # Report and move on, so one bad document cannot stall the page.
                sys.stderr.write(
                    'insert failed after {0} attempts: {1}\n'.format(attempts, e)
                )
                return False
            # Give the database a moment instead of retrying in a busy loop.
            time.sleep(pause)
        else:
            return True
    return False


def save(page):
    """Fetch one page of comments and store each comment in MongoDB.

    Every comment is reduced to the fields ``name``, ``uid``, ``img_url``,
    ``cid``, ``ctime`` and ``content``. A failing insert is retried a limited
    number of times with a pause in between; the previous unbounded retry
    loop could spin forever on a permanent error.

    Args:
        page: Page index passed on to ``get_comment``.
    """
    for comment in get_comment(page):
        # A comment without a "user" object yields None for the user fields.
        user = comment.get('user') or {}
        item = {
            'name': user.get('nickname'),
            'uid': user.get('userId'),
            'img_url': user.get('avatarUrl'),
            'cid': comment.get("commentId"),
            'ctime': comment.get("time"),
            'content': comment.get('content'),
        }
        _insert_with_retry(item)
class HandleChunkedEncodingError(Thread):
    """Worker thread that restarts its job in a fresh thread whenever it fails.

    ``target(*args)`` is called once. If it raises, the thread waits
    ``retry_delay`` seconds, starts a replacement thread with the same
    target, name and arguments, and waits for that replacement to finish.
    Joining the first thread therefore blocks until the job has completed
    without raising.

    Args:
        target: Callable to run.
        name: Thread name; replacements reuse it.
        args: Positional arguments passed to ``target``.
        retry_delay: Seconds to wait before starting a replacement.
    """

    def __init__(self, target, name, args, retry_delay=5):
        Thread.__init__(self)
        self.name = name
        self.args = args
        self.target = target
        self.retry_delay = retry_delay

    def run(self):
        """Run the target once; on failure delegate to a replacement thread."""
        try:
            self.target(*self.args)
        except Exception:
            # Back off, then hand the job to a new thread.
            time.sleep(self.retry_delay)
            rethd = HandleChunkedEncodingError(
                target=self.target,
                name=self.name,
                args=self.args,
                retry_delay=self.retry_delay,
            )
            rethd.start()
            rethd.join()
            # Stop here: join() only returns once the replacement (or one of
            # its own replacements) has run the target successfully. Looping
            # again, as the previous version did, executed the target one
            # more time per failure and duplicated its side effects.
def main(start=90840, stop=104485, batch=10, max_pause=0):
    """Crawl pages ``start`` to ``stop - 1``, ``batch`` pages at a time.

    Each batch starts one ``HandleChunkedEncodingError`` thread per page
    (running ``save``), waits for all of them, then rewrites a one-line
    progress message on stdout. Working in small batches keeps problems
    visible soon after they occur.

    Args:
        start: First page index to fetch.
        stop: Page index to stop before; the last batch is cut short so no
            page at or beyond ``stop`` is requested.
        batch: Number of pages fetched concurrently.
        max_pause: Upper bound, in seconds, of a random pause before each
            batch; ``0`` disables the pause.
    """
    for first in range(start, stop, batch):
        last = min(first + batch, stop)
        if max_pause:
            time.sleep(random.random() * max_pause)
        workers = [
            HandleChunkedEncodingError(
                target=save, name='thd' + str(page), args=(page,)
            )
            for page in range(first, last)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # "\r" rewrites the same console line rather than printing a new one.
        info = 'page ' + str(last) + ' done'
        sys.stdout.write("\r{0}".format(info))
        sys.stdout.flush()
# Start the crawl only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
main()