# -*- coding:utf-8 -*-
'''
Compute per-document TF-IDF values and write them to txt files.
'''
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
if __name__ == '__main__':
    # ===================== 1. Build the list of sample file paths =====================
    # Root directory holding the preprocessed samples
    root_file = r'.\Reduced_internet_preprocess'
    # List of sample file paths
    text_dir_list = []
    # Walk the root directory and collect every file path
    for root, dirs, files in os.walk(root_file):
        for f in files:
            text_dir_list.append(os.path.join(root, f))
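    # Assumed dataset layout (any nesting works, since os.walk recurses):
    # .\Reduced_internet_preprocess\<category>\<doc>.txt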
    # ===================== 2. Load the stop-word list from a file =====================
    stpwrdpath = r"E:\ly\stop_words.txt"  # path to the stop-word file
    with open(stpwrdpath, 'r', encoding='utf-8') as stpwrd_dic:
        # splitlines() turns the file content into a list of stop words
        stpwrdlst = stpwrd_dic.read().splitlines()
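    # Assumed stop_words.txt format: one stop word per line, so splitlines()
    # above yields the list that CountVectorizer's stop_words parameter expects,
    # e.g. stpwrdlst[:3] -> ['的', '了', '和'] (illustrative values only).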
    # ===================== 3. Read every sample file to build the corpus =====================
    # The corpus: one string per document
    corpus = []
    for f in text_dir_list:
        with open(f, 'r', encoding='utf-8') as file_source:
            corpus.append(file_source.read())
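    # Note: CountVectorizer tokenizes on word boundaries, so Chinese text must
    # already be segmented into space-separated words -- presumably what the
    # "_preprocess" step produced. A minimal sketch of that step, assuming
    # jieba is installed and raw_content is an unsegmented string:
    # import jieba
    # content = ' '.join(jieba.cut(raw_content))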
    # ===================== 4. Vectorize the documents and compute each term's TF-IDF =====================
    vectorizer = CountVectorizer(stop_words=stpwrdlst)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # Equivalent one-step alternative (input is the corpus list):
    # vector = TfidfVectorizer(stop_words=stpwrdlst)
    # tfidf = vector.fit_transform(corpus)
    word = vectorizer.get_feature_names_out()  # vocabulary shared by all documents
    weight = tfidf.toarray()                   # the TF-IDF matrix (samples x terms)
    print('Vocabulary size:', len(word))
    print('Terms per document row:', len(weight[0]))
    print('Vocabulary:')
    print(word)
    print('Number of samples:', len(weight))
    print('Weight matrix:')
    print(weight)
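    # Worked example of TfidfTransformer's default (smoothed) formula, with
    # illustrative numbers rather than values from this corpus: with n = 2
    # documents and a term t that appears in df(t) = 1 of them,
    #   idf(t) = ln((1 + n) / (1 + df(t))) + 1 = ln(3 / 2) + 1 ≈ 1.405
    # and if t occurs twice in document 0,
    #   tfidf(t, 0) = tf * idf = 2 * 1.405 ≈ 2.811
    # before each document row is L2-normalized, which is why every row of
    # `weight` has unit Euclidean norm.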
    # ===================== 5. Write each document's TF-IDF values to its own file =====================
    # Directory for the per-sample TF-IDF results
    sFilePath = r'.\Tfidf_file'
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)
    # Write each document's TF-IDF row into the Tfidf_file folder
    for i in range(len(weight)):
        out_path = os.path.join(sFilePath, str(i).zfill(5) + '.dat')
        print("--------Writing all the tf-idf of document", i, "into", out_path, "--------")
        with open(out_path, 'w', encoding='utf-8') as f:
            # First line: the vocabulary
            for j in range(len(word)):
                f.write(word[j] + " ")
            f.write('\n')
            # Second line: the TF-IDF value of each term in this document
            for j in range(len(word)):
                f.write(str(weight[i][j]) + ' ')
        print('\n')
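    # The .dat rows above are dense and mostly zeros. A space-saving
    # alternative (a sketch, assuming scipy is available):
    # from scipy import sparse
    # sparse.save_npz(os.path.join(sFilePath, 'tfidf_sparse.npz'), tfidf)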
    # ===================== 6. Merge everything into a single file =====================
    # Directory for the combined result
    resultFilePath = r'.\Result'
    if not os.path.exists(resultFilePath):
        os.mkdir(resultFilePath)
    # Write every sample's TF-IDF row into one file, one document per line
    all_path = os.path.join(resultFilePath, 'alldata.dat')
    with open(all_path, 'w', encoding='utf-8') as f:
        # Optional header line with the vocabulary:
        # for j in range(len(word)):
        #     f.write(word[j] + " ")
        # f.write('\n')
        for i in range(len(weight)):
            print("--------Writing all the tf-idf of document", i, "into", all_path, "--------")
            for j in range(len(word)):
                f.write(str(weight[i][j]) + ' ')
            f.write('\n')
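    # Reading the merged matrix back for later use (a sketch, assuming numpy
    # is available; alldata.dat holds one whitespace-separated row per document):
    # import numpy as np
    # weight_loaded = np.loadtxt(all_path)
    # assert weight_loaded.shape == (len(weight), len(word))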