清洗数据并且分析评论词频
import nltk
import mysql.connector
import time
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
#text = "This is a sample text for keyword analysis. It demonstrates how NLTK can extract keywords from a given text."
# 建立连接
cnx = mysql.connector.connect(
host='*************',
user='root',
password='*****',
database='******'
)
start_time = time.time()
# 创建游标
cursor = cnx.cursor()
# 编写 SQL 查询语句
query1 = """ SELECT last_content,last_star FROM `lx_cs_reviews`where review_date >= '2022-08-20'; """
# 执行查询
cursor.execute(query1)
# 获取查询结果
results = cursor.fetchall()
# Function to get review sentiment based on field3_value
def get_review_sentiment(field3_value):
if field3_value in [1, 2, 3]:
return "差评"
elif field3_value in [4, 5]:
return "好评"
else:
return ""
for row in results:
# 获取元组中的第一个字段,并将其转换为小写后进行分词
field1_value = row[0]
field3_value = row[1] #评分
tokens = word_tokenize(field1_value.lower())
# print(tokens)
# Get the review sentiment
sentiment = get_review_sentiment(field3_value)
# 去除停用词
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
# 计算关键词的频率分布
freq_dist = FreqDist(filtered_tokens)
# 提取前 N 个关键词
top_n = 60
top_keywords = freq_dist.most_common(top_n)
# 输出关键词及其频率
for keyword, frequency in top_keywords:
print(field1_value,keyword, frequency)
# 编写 SQL 查询语句
query2 = "insert into comments_result_top (comment,frequency,is_no) values (%s, %s, %s)"
# 定义要插入的数据
data_to_insert = (keyword,frequency,sentiment)
# 执行插入操作
cursor.execute(query2, data_to_insert)
cnx.commit()
# 关闭mysql游标和连接
cursor.close()
cnx.close()
# 在这里放置你的代码,表示要计算执行时长的程序
end_time = time.time()
execution_time = end_time - start_time
print(f"程序执行时长: {execution_time} 秒")