# -*- coding: utf-8 -*-
# @Time : 2021/3/9 15:35
# @Author : chao
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
def seg_sentence(sentence, stopwords):
    """Segment a Chinese sentence with jieba and drop stopwords.

    Requires the third-party ``jieba`` package (``pip install jieba``).

    Args:
        sentence (str): Raw sentence to segment.
        stopwords (Iterable[str]): Words to filter out of the result.

    Returns:
        str: Space-joined surviving tokens with a trailing space when at
        least one token survives (kept for backward compatibility with
        the original ``outstr += word + ' '`` loop), ``''`` otherwise.
    """
    # Hoist the stopword collection into a set once: membership tests
    # inside the loop drop from O(len(stopwords)) to O(1).
    stopset = set(stopwords)
    # Collect tokens in a list and join once — avoids the quadratic
    # behaviour of repeated string concatenation.
    tokens = [
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopset and word != '\t'
    ]
    return ' '.join(tokens) + ' ' if tokens else ''
# Load the stopword list once at module level.
# NOTE(review): hard-coded Windows path — consider making it configurable.
swPath = r'C:\Users\\词典\stopword停用词.txt'
with open(swPath, 'r', encoding='utf-8') as f:
    # strip() removes trailing newlines; the set comprehension both
    # de-duplicates and drops blank lines (the original filtered for ''
    # BEFORE stripping, so blank lines leaked into the list as '').
    stopwords = {line.strip() for line in f if line.strip()}
# A single space must also be treated as a stopword by downstream code.
stopwords.add(' ')
stopwords = list(stopwords)
# Read the preprocessed corpus, one review per line.
# NOTE(review): the original used encoding='ANSI', which is not a
# registered Python codec and raises LookupError; on a Chinese Windows
# system "ANSI" means cp936/GBK, so 'gbk' is used explicitly (this also
# matches the gbk encoding used when the results are saved below).
with open(r'C:\Users\数据\预处理后数据\zong_data.txt', encoding='gbk') as f:
    text = f.readlines()
# Segment every review and strip stopwords using the list loaded above.
seg_text = [seg_sentence(review, stopwords) for review in text]
# Build uni-/bi-/tri-gram counts over the segmented corpus, keeping only
# the 50 most frequent features.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=50)
count = ngram_vectorizer.fit_transform(seg_text).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement API but fall back for older installations.
try:
    featureName = ngram_vectorizer.get_feature_names_out()
except AttributeError:
    featureName = ngram_vectorizer.get_feature_names()
# Join multi-token n-gram names with '_' so each feature name is a
# single token in the output header.
featureName = [name.replace(' ', '_') for name in featureName]
seg_text = []  # release the segmented corpus; it is no longer needed
# Convert the raw term counts to TF-IDF weights.
tfidf_transformer = TfidfTransformer()
word_vec = tfidf_transformer.fit_transform(count).toarray()
# Write feature names as the first CSV row followed by the TF-IDF rows —
# same file layout as the original np.matrix/vstack construction, without
# the discouraged np.matrix type.
df = pd.DataFrame(word_vec, columns=featureName)
df.to_csv(r'C:\Users\代码\特征提取\特征提取数据\zong_tfidf.csv',
          index=False, header=True, encoding='gbk')
# (二)文本挖掘——TF-IDF
# 最新推荐文章于 2024-08-30 23:09:26 发布