Weighting every word of every sentence in a corpus

This post shows how to preprocess every word of every sentence in a corpus and assign each word a weight with TF-IDF.


The steps: preprocessing (tokenization and filtering), then weighting with TF-IDF.
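Before the full script, a minimal sketch (a toy English corpus, made up for illustration) of what a per-word, per-sentence TF-IDF weight is in scikit-learn. With norm='l1' each sentence's weight vector sums to 1, so a weight can be read as the word's share of its own sentence:

from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = ['the cat sat on the mat', 'the dog chased the cat']
vectorizer = TfidfVectorizer(norm='l1')
tfidf = vectorizer.fit_transform(toy_corpus)  # shape: (number of sentences, vocabulary size)
col = vectorizer.vocabulary_['cat']           # column index of the word 'cat'
print tfidf[0, col]                           # TF-IDF weight of 'cat' in the first sentence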

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/05/15 at 10:48 AM
import itertools
import re

import jieba
from six.moves import xrange
from sklearn.feature_extraction.text import TfidfVectorizer


def load_stopwords():
    # Stop-word list: one word per line, UTF-8 encoded.
    path = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/stopwords'
    with open(path, 'rb') as f:
        content = f.read().decode('utf-8')
    return frozenset(content.splitlines())


stopwords = load_stopwords()
# Keep tokens made of digits, ASCII letters, underscores or CJK characters.
chinese = re.compile(ur'^[0-9a-zA-Z_\u4e00-\u9fa5]+$')


def filter_func(word):
    # Keep a token only if it matches the character whitelist and is not a stop word.
    return re.match(chinese, word) is not None and word not in stopwords


def my_tokenizer(sentence):
    # Cut the sentence with jieba, then drop stop words and non-word tokens.
    words = jieba.lcut(sentence)
    return filter(filter_func, words)


def word_and_weight(corpus):
    # norm='l1' normalizes each sentence's TF-IDF vector so its values sum to 1:
    # a weight is the word's relative importance within its own sentence.
    vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, norm='l1')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    for row_idx in xrange(len(corpus)):
        pairs = []
        for word in my_tokenizer(corpus[row_idx]):
            # A token may be absent from the vocabulary (the vectorizer lowercases
            # before tokenizing, this loop does not), so fall back to 0.0.
            col_idx = vectorizer.vocabulary_.get(word)
            weight = tfidf_matrix[row_idx, col_idx] if col_idx is not None else 0.0
            pairs.append((word, weight))
        yield pairs


def load_corpus():
    # Each corpus line is expected to hold tab-separated fields; keep the second
    # field (the sentence) and skip lines that do not have one.
    path = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/why'
    with open(path, 'rb') as f:
        content = f.read().decode('utf-8')
    lines = []
    for line in content.splitlines():
        try:
            lines.append(line.split('\t')[1])
        except IndexError:
            pass
    return lines


# Print word/weight pairs for the first 1250 sentences:
# one pair per line, with a blank line between sentences.
for pairs in itertools.islice(word_and_weight(load_corpus()), 1250):
    for word, weight in pairs:
        print word, weight
    print
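The driver loop reads the corpus from a fixed path, but the generator itself works on any in-memory list of sentences. A minimal usage sketch (the two example sentences are made up) that keeps only the heaviest word of each sentence:

sentences = [u'我想买一部新手机', u'这部手机的电池不耐用']
for pairs in word_and_weight(sentences):
    if pairs:
        # the word with the largest weight in this sentence
        best_word, best_weight = max(pairs, key=lambda p: p[1])
        print best_word, best_weight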

