自然语言处理3---TFIDF

最新推荐文章于 2024-08-30 23:09:26 发布

原创 · 最新推荐文章于 2024-08-30 23:09:26 发布 · 469 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#自然语言处理

自然语言处理专栏收录该内容

3 篇文章

订阅专栏

本文深入探讨了TF-IDF算法在自然语言处理中的作用，解释了其计算原理，阐述了如何利用TF-IDF进行文本特征提取，并通过实例展示了其在文本分类和信息检索中的有效性。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

# -*- coding:utf-8 -*-

'''
计算语料库中每篇文档的TF-IDF权重，并将结果写入.dat文件
'''
import string
import sys
import codecs

reload(sys)
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

def collect_file_paths(root_dir):
    """Return the path of every file found under *root_dir*, recursively.

    Paths are returned in ``os.walk`` order.  A directory that does not
    exist yields an empty list, because ``os.walk`` ignores listing errors
    by default.
    """
    paths = []
    for current_dir, _subdirs, file_names in os.walk(root_dir):
        for file_name in file_names:
            # os.path.join instead of a hard-coded '\\' separator.
            paths.append(os.path.join(current_dir, file_name))
    return paths


def load_stop_words(path, encoding='utf-8-sig'):
    """Read a stop-word file (one word per line) into a list of text strings.

    The words are decoded to text so that they can match the decoded tokens
    produced by the vectorizer; raw byte strings never compare equal to
    non-ASCII text tokens.  Surrounding whitespace is stripped and blank
    lines are dropped.  The default codec also discards a leading UTF-8 BOM.

    NOTE(review): assumes the stop-word file is UTF-8 encoded -- confirm.
    """
    with codecs.open(path, 'r', encoding) as handle:
        lines = handle.read().splitlines()
    stripped = (line.strip() for line in lines)
    return [word for word in stripped if word]


def read_corpus(paths, encoding='utf-8'):
    """Return the decoded content of each file in *paths*, in order."""
    corpus = []
    for path in paths:
        # Read-only access: the samples are never modified.
        with codecs.open(path, 'r', encoding) as handle:
            corpus.append(handle.read())
    return corpus


def _ensure_directory(path):
    """Create *path* as a directory when it does not exist yet."""
    if not os.path.exists(path):
        os.mkdir(path)


def _format_row(values):
    """Render one weight row as text, each value followed by a space."""
    return u''.join(str(value) + ' ' for value in values)


def write_document_files(words, weights, out_dir):
    """Write one ``NNNNN.dat`` file per document into *out_dir*.

    Each file holds two lines: the vocabulary (every word followed by a
    space), then that document's weights in the same order (no trailing
    newline).  *weights* is iterated row by row, one row per document.
    """
    _ensure_directory(out_dir)
    # The vocabulary line is identical for every document; build it once.
    header = u''.join(term + u' ' for term in words) + u'\n'
    for index, row in enumerate(weights):
        target = os.path.join(out_dir, str(index).zfill(5) + '.dat')
        print(u"--------Writing all the tf-idf in the %d  file into  %s --------"
              % (index, target))
        with codecs.open(target, 'w', 'utf-8') as handle:
            handle.write(header)
            handle.write(_format_row(row))


def write_combined_file(weights, out_dir, file_name='alldata.dat'):
    """Write all weight rows into a single file and return its path.

    The file has one line per document and no vocabulary header line.
    """
    _ensure_directory(out_dir)
    target = os.path.join(out_dir, file_name)
    with codecs.open(target, 'w', 'utf-8') as handle:
        for index, row in enumerate(weights):
            # Report the path that is really being written.
            print(u"--------Writing all the tf-idf in the %d  file into  %s --------"
                  % (index, target))
            handle.write(_format_row(row) + u'\n')
    return target


def main():
    """Compute TF-IDF weights for the sample corpus and save them to disk.

    Steps: collect the sample files, load the stop words, read the corpus,
    vectorize it and compute TF-IDF, then write one result file per document
    plus one combined file.

    Every print call takes a single pre-formatted argument, so the output is
    the same whether ``print`` is a statement or a function.
    """
    # 1. Collect the sample files.
    root_file = os.path.join('.', 'Reduced_internet_preprocess')
    text_dir_list = collect_file_paths(root_file)
    if not text_dir_list:
        # Fail early with a clear message instead of an obscure vectorizer error.
        sys.exit('No sample files found under %s' % root_file)

    # 2. Load the stop-word list.
    stpwrdpath = r"E:\ly\stop_words.txt"
    stpwrdlst = load_stop_words(stpwrdpath)

    # 3. Build the corpus, one entry per sample file.
    corpus = read_corpus(text_dir_list)

    # 4. Term counts -> TF-IDF weights.
    vectorizer = CountVectorizer(stop_words=stpwrdlst)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names().
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = list(vectorizer.get_feature_names_out())
    else:
        word = list(vectorizer.get_feature_names())
    weight = tfidf.toarray()  # dense matrix: one row per document

    print(u'词向量数: %d' % len(word))
    print(u'词向量数: %d' % len(weight[0]))
    print(u'词向量')
    print(word)
    print(u'样本数： %d' % len(weight))
    print(u'权重矩阵')
    print(weight)

    # 5. One result file per document.
    sFilePath = os.path.join('.', 'Tfidf_file')
    write_document_files(word, weight, sFilePath)
    print('\n')

    # 6. All documents combined into a single file.
    resultFilePath = os.path.join('.', 'Result')
    write_combined_file(weight, resultFilePath)


if __name__ == '__main__':
    main()