1. Word frequency statistics with jieba (TextRank keyword extraction)
# -*- coding: UTF-8 -*-
import pandas as pd
import jieba
import jieba.analyse

# Widen pandas' column display so long text is not truncated
pd.set_option('display.max_colwidth', 500)

# Load the data; read every column as a string
rows = pd.read_csv('datas1.csv', header=0, encoding='utf-8', dtype=str)

segments = []
for index, row in rows.iterrows():
    content = row.iloc[2]  # the text sits in the third column
    # TextRank keyword extraction, restricted to fixed parts of speech:
    # ns = place name, n = noun, vn = verbal noun, v = verb
    words = jieba.analyse.textrank(content, topK=50, withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
    splitedStr = ''
    for word in words:
        # Record each extracted word globally, one hit at a time
        segments.append({'word': word, 'count': 1})
        splitedStr += word + ' '

dfSg = pd.DataFrame(segments)
# Word frequency statistics: sum the hits per word
dfWord = dfSg.groupby('word')['count'].sum()
# Export to CSV
dfWord.to_csv('keywords.csv', encoding='utf-8')
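To sanity-check the result around the export, the aggregated series can be sorted by count; a minimal sketch, assuming dfWord from above (the cutoff of 20 is an arbitrary choice):

# Show the 20 most frequent words, highest count first
print(dfWord.sort_values(ascending=False).head(20))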
2. Word frequency statistics with jieba (counting keyword hits in titles)
import pandas as pd
import jieba
import jieba.analyse

# GOODS_STANDARD_EXCEL_PATH and STOP_WORDS_FILE_PATH are path constants defined elsewhere
data = pd.read_excel(GOODS_STANDARD_EXCEL_PATH)
jieba.analyse.set_stop_words(STOP_WORDS_FILE_PATH)
# keywords_count_list was undefined in the original snippet; assume it holds
# (word, weight) pairs from an earlier extract_tags call over the titles
keywords_count_list = jieba.analyse.extract_tags(' '.join(data.title), topK=20, withWeight=True)
keywords_count_dict = {word: 0 for word, weight in reversed(keywords_count_list)}
# Cut all titles and count how often each keyword occurs
cut_words = jieba.cut(' '.join(data.title))
for word in cut_words:
    if word in keywords_count_dict:
        keywords_count_dict[word] += 1
print(keywords_count_dict)
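The same counting loop can also be written with collections.Counter, which avoids initializing the dict by hand; a minimal sketch under the same assumptions (data and keywords_count_dict as built above):

from collections import Counter

# Count only the cut words that belong to the keyword set
keyword_counter = Counter(w for w in jieba.cut(' '.join(data.title))
                          if w in keywords_count_dict)
print(keyword_counter.most_common())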