# 1. 弹幕的爬取
import requests
import json
import chardet
import re
from pprint import pprint
def get_cid(bvid="BV1PK4y1b7dt"):
    """Return the cid of the first page of a Bilibili video.

    Queries the player ``pagelist`` endpoint and reads ``data[0]["cid"]``
    from the JSON body of the response.

    Args:
        bvid: BV id of the video to look up. The default is the video this
            script was originally written against.

    Returns:
        The ``cid`` value of the first entry of the response's ``data`` list.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON does not contain a
            ``data[0]["cid"]`` entry.
    """
    url = "https://api.bilibili.com/x/player/pagelist"
    params = {"bvid": bvid, "jsonp": "jsonp"}
    # requests never times out on its own; bound the wait so a stalled
    # connection cannot hang the script forever.
    res = requests.get(url, params=params, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    res.raise_for_status()
    json_dict = json.loads(res.text)
    return json_dict["data"][0]["cid"]
"""
注意:哔哩哔哩的网页现在已经改版,在网页上已经找不到 list.so 这个接口了,但下面的代码仍然直接使用它,所以我们只要记住这个接口即可。
"""
def get_data(cid):
    """Download the danmaku (bullet comments) for one video part.

    Fetches the ``list.so`` XML document for ``cid`` and extracts the text
    content of every ``<d>`` element.

    Args:
        cid: Identifier appended to the ``oid`` query parameter; it is
            converted with ``str()`` before being placed in the URL.

    Returns:
        A list of comment strings in document order, with XML character
        entities (``&amp;``, ``&lt;``, ...) decoded.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import html

    final_url = "https://api.bilibili.com/x/v1/dm/list.so?oid=" + str(cid)
    # Bound the wait and reject HTTP errors before trying to parse the body.
    final_res = requests.get(final_url, timeout=10)
    final_res.raise_for_status()
    # Pick the charset from the raw bytes rather than from the HTTP headers.
    final_res.encoding = chardet.detect(final_res.content)['encoding']
    xml_text = final_res.text
    # ``<d\b[^>]*>`` matches only the <d ...> tag itself (not other tags that
    # merely start with "d"); DOTALL keeps comments that contain a newline.
    pattern = re.compile(r'<d\b[^>]*>(.*?)</d>', re.DOTALL)
    # The payload is XML, so "&" and "<" inside a comment arrive escaped;
    # decode them so the saved text matches what the viewer typed.
    return [html.unescape(text) for text in pattern.findall(xml_text)]
def save_to_file(data, path="dan_mu.txt"):
    """Write each string in ``data`` to a UTF-8 text file, one per line.

    The file is opened in ``"w"`` mode, so any existing content is replaced.

    Args:
        data: Iterable of strings; each one is written followed by ``"\\n"``.
        path: Destination file. Defaults to ``"dan_mu.txt"`` in the current
            working directory, the name the rest of the script reads back.

    Raises:
        TypeError: If an item of ``data`` is not a string.
        OSError: If the file cannot be opened or written.
    """
    with open(path, mode="w", encoding="utf-8") as f:
        # One buffered writelines call instead of two write calls per item.
        f.writelines(item + "\n" for item in data)
# Run the scrape end to end: look up the video's cid, download its danmaku,
# and save them to disk. The word-cloud section below reads the saved file,
# so these three steps must run first and in this order.
cid = get_cid()
data = get_data(cid)
save_to_file(data)
# 2. 词云图的制作
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from imageio import imread
import warnings
# Suppress every warning for the rest of the run (this also hides
# deprecation notices from the plotting and imaging libraries).
warnings.filterwarnings("ignore")

# Load the saved comments. str.split() with no argument splits on any
# whitespace, so a comment containing spaces becomes several fragments.
with open("dan_mu.txt", encoding="utf-8") as f:
    txt = f.read()
txt = txt.split()

# Segment every fragment into a list of words with jieba.
data_cut = [jieba.lcut(x) for x in txt]

# Stop words: the whitespace-separated entries of the stop-list file plus a
# few extra tokens prepended here.
with open(r"G:\6Tipdm\wordcloud\data\stoplist.txt", encoding="utf-8") as f:
    stop = f.read()
stop = stop.split()
stop = [" ", "道", "说道", "说"] + stop
# Membership is tested once per segmented word; a set makes each test O(1)
# instead of scanning the whole stop list.
stop_set = set(stop)

# Drop stop words from each segmented comment, then flatten to one word list.
s_data_cut = pd.Series(data_cut)
all_words_after = s_data_cut.apply(
    lambda x: [i for i in x if i not in stop_set]
)
all_words = [word for words in all_words_after for word in words]

# Frequency table: index is the word, value is its number of occurrences.
word_count = pd.Series(all_words).value_counts()

# Image passed to WordCloud as the mask for the cloud.
back_picture = imread(r"G:\6Tipdm\wordcloud\jay1.jpg")

wc = WordCloud(
    font_path="G:\\6Tipdm\\wordcloud\\simhei.ttf",
    background_color="white",
    max_words=2000,
    mask=back_picture,
    max_font_size=200,
    random_state=42,  # fixed seed
)
wc2 = wc.fit_words(word_count)

# Show the cloud on screen without axes, then save it as a PNG.
plt.figure(figsize=(16, 8))
plt.imshow(wc2)
plt.axis("off")
plt.show()
wc.to_file("ciyun.png")