Reference: https://blog.youkuaiyun.com/weixin_42555080/article/details/88363040
1. Crawling Weibo Comments
1.1 Analyzing the Page
From experience, the desktop Weibo site is harder to crawl than the mobile one. Following the approach proposed by Blessy_Zhu, we crawl the Weibo mobile site https://m.weibo.cn here.
The difference in difficulty is obvious just from the interface. Next, pick a post you are interested in; the link I chose is: https://weibo.cn/comment/JcgPYxrNf?uid=1713926427
Right-click the page and choose Inspect to open the developer tools, switch to the Network panel, and you can find the information we need (the comment request URL and the Cookie under Request Headers).
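Before writing the full crawler, it helps to check that the Cookie copied from the Request Headers is actually accepted. This is only a minimal sketch: the Cookie value is a placeholder, and the URL is the comment page chosen above with the page parameter that shows up in the Network panel.

import requests

# quick pre-check (sketch): paste the Cookie shown under Request Headers in the Network panel
test_url = "https://weibo.cn/comment/JcgPYxrNf?uid=1713926427&rl=0&page=1"
headers = {
    'User-agent': 'Mozilla/5.0',
    'Cookie': 'your cookie',
}
resp = requests.get(test_url, headers=headers)
print(resp.status_code)              # 200 means the request went through
print('class="ctt"' in resp.text)    # True if the comment spans are present in the returned HTML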
1.2 Crawling the Comments
import requests
import re
import time


def get_one_page(url):  # request function: fetch everything on one page
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Host': 'weibo.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cookie': 'your cookie',
        'DNT': '1',
        'Connection': 'keep-alive'
    }  # request headers, including User-agent, Cookie, etc.
    response = requests.get(url, headers=headers, verify=False)  # fetch the page HTML with requests.get
    if response.status_code == 200:  # status code 200 means the page was fetched successfully
        return response.text  # return the HTML document, which is passed on to the parsing function
    return None


# parse the HTML and append the matched comment spans to test.txt
def parse_one_page(html):
    pattern = re.compile('<span class="ctt">.*?</span>', re.S)
    items = re.findall(pattern, html)
    result = str(items)
    with open('test.txt', 'a', encoding='utf-8') as fp:
        fp.write(result)


# no new comments show up beyond 50 pages, so the loop is capped at 50
for i in range(50):
    url = "https://weibo.cn/comment/JcgPYxrNf?uid=1713926427&rl=0&page=" + str(i + 1)
    html = get_one_page(url)
    print(html)
    print('Crawling page %d of comments' % (i + 1))
    if html:  # skip pages that failed to load
        parse_one_page(html)
    time.sleep(3)
Contents of test.txt:
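A small side note: because the request above is sent with verify=False, urllib3 (bundled with requests) prints an InsecureRequestWarning for every page. If that clutters the output, the warning can be silenced before the loop, for example:

import urllib3

# silence the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)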
2. Processing the Data and Storing It in the Database
Next we parse the actual comments out of the crawled content and store them in a database.
import re
import pandas as pd
import pymysql
import emoji

# connect to the database
conn = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='your database password',
    db='weibo',  # database name
    charset='utf8',
)
cursor = conn.cursor()

with open("/Users/guo/Desktop/爬虫/微博爬取评论数据_新参数/test.txt", "r") as f:  # open the file
    content = f.read()  # read the whole file

rawResults = re.findall(">.*?<", content, re.S)
firstStepResults = []
for result in rawResults:
    # skip fragments that are only list separators, colons, reply markers, @mentions or whitespace
    if ">\'][\'<" in result:
        continue
    if ">:<" in result:
        continue
    if "><" in result:
        continue
    if ">回复<" in result:  # "回复" marks reply links, not comment text
        continue
    if ">\', \'<" in result:
        continue
    if "@" in result:
        continue
    if "> <" in result:
        continue
    result = emoji.demojize(result)  # replace emoji with text aliases
    a = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', result, re.S)  # keep only Chinese characters, letters and digits
    a = "".join(a)
    firstStepResults.append(a)

subTextHead = re.compile(">")
subTextFoot = re.compile("<")
i = 0
for lastResult in firstStepResults:
    resultExcel1 = re.sub(subTextHead, '', lastResult)
    resultExcel = re.sub(subTextFoot, '', resultExcel1)
    sql = "insert into pinglun1(pinglun) values(%s)"  # pinglun1: table name, pinglun: column name
    cursor.execute(sql, (resultExcel,))
    with open('result.txt', 'a') as f:  # 'a' means append: keep writing after the existing data instead of clearing it
        f.write(resultExcel)
    print(i, resultExcel)
    i += 1

cursor.close()
conn.commit()
conn.close()
Note that comments may contain emoji, which cause problems when inserting into the database. They are filtered with:
result = emoji.demojize(result)  # replace emoji with text aliases
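Strictly speaking, demojize does not delete the emoji; it replaces each one with a plain-text alias, which a 3-byte utf8 MySQL column can store without errors. A tiny illustration (the exact alias spelling depends on the version of the emoji package):

import emoji

print(emoji.demojize('加油👍'))  # prints something like 加油:thumbs_up: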
On the database side, create a table named pinglun1 with a single column named pinglun.
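For reference, the table can also be created from Python with pymysql before running the insert loop. The schema below is only an assumption based on the INSERT statement used above (a single pinglun column); adjust the column type and length to your data.

import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='your database password', db='weibo', charset='utf8')
with conn.cursor() as cursor:
    # assumed schema: one text column named pinglun, matching insert into pinglun1(pinglun)
    cursor.execute("CREATE TABLE IF NOT EXISTS pinglun1 (pinglun VARCHAR(255)) DEFAULT CHARSET=utf8")
conn.commit()
conn.close()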
Run result:
result.txt:
3. Extracting the Data
def readmysql():  # read the comments back from the database
    textlist = []
    conn = pymysql.connect(host='127.0.0.1',
                           user='root',
                           password='your password',
                           db='weibo',
                           charset="utf8")  # connect to the server
    with conn:
        cur = conn.cursor()
        cur.execute("SELECT * FROM pinglun1")
        rows = cur.fetchall()
        for row in rows:
            a = list(row)
            textlist.append(a)
    return textlist
The rows returned by fetchall() are tuples, so each row is converted to a list with list(row).
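As a quick sanity check, the function can be called on its own to see what comes back; since the table only has the pinglun column, every row ends up as a one-element list.

textlist = readmysql()
print(len(textlist))   # number of stored comments
print(textlist[:3])    # the first few rows, each converted to a list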
4. Sentiment Analysis
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import numpy as np


def snowanalysis(textlist):
    sentimentslist = []
    for li in textlist:
        li = str(li)
        s = SnowNLP(li)
        sentimentslist.append(s.sentiments)
    fig1 = plt.figure("sentiment")
    plt.hist(sentimentslist, bins=np.arange(0, 1, 0.02))
    plt.show()  # a score above 0.5 means the sentiment leans positive; below 0.5 it leans negative
Result:
Most of the comments lean neutral, and positive comments outnumber negative ones. This suggests that in the first half of 2020, despite the epidemic, the floods and other trials, people still had confidence in the country and hope for the future.
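To put rough numbers behind that reading, the same SnowNLP scores can be bucketed into negative / neutral / positive counts. The cut-offs below (0.4 and 0.6 around the 0.5 midpoint) are my own choice for illustration, not something defined by SnowNLP.

from snownlp import SnowNLP

def sentiment_counts(textlist, low=0.4, high=0.6):
    # bucket the sentiment scores; the thresholds are illustrative assumptions
    scores = [SnowNLP(str(li)).sentiments for li in textlist]
    neg = sum(s < low for s in scores)
    neu = sum(low <= s <= high for s in scores)
    pos = sum(s > high for s in scores)
    print('negative: %d  neutral: %d  positive: %d' % (neg, neu, pos))
    return neg, neu, pos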
5. Drawing a Word Cloud
import jieba
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread  # the source of imread is not shown in the original; matplotlib's version works here
from wordcloud import WordCloud, ImageColorGenerator


def word2cloud(textlist):
    fulltext = ''
    back_coloring = imread("bjt.jpg")  # background image used as the word-cloud mask
    cloud = WordCloud(font_path='/System/Library/Fonts/Supplemental/Arial Unicode.ttf',
                      background_color="white",
                      max_words=2000,
                      mask=back_coloring,
                      max_font_size=100,
                      random_state=42,
                      width=1000, height=860, margin=2)
    for li in textlist:
        fulltext += ' '.join(jieba.cut(str(li), cut_all=False))  # str(li): the input must be a string
    wc = cloud.generate(fulltext)
    image_colors = ImageColorGenerator(back_coloring)
    plt.figure("wordc")
    plt.imshow(wc.recolor(color_func=image_colors))
    wc.to_file('评论词云.png')
    image_produce = cloud.to_image()
    image_produce.show()
Run result:
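Putting the pieces together, a minimal driver that runs the three steps in order could look like the sketch below. It assumes the functions above live in the same file and that the database has already been filled by the script in section 2.

if __name__ == '__main__':
    textlist = readmysql()     # section 3: pull the comments out of MySQL
    snowanalysis(textlist)     # section 4: sentiment histogram
    word2cloud(textlist)       # section 5: word cloud saved to 评论词云.png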
The end.
References
[1]: https://blog.youkuaiyun.com/weixin_42555080/article/details/88363040