# 包括预处理,使用tfidf加权重 -- includes preprocessing; words are weighted with TF-IDF
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/05/15 上午10:48
import itertools
import re
import jieba
from six.moves import xrange
from sklearn.feature_extraction.text import TfidfVectorizer
def load_stopwords():
    """Load the stop-word list (one word per line, UTF-8) as a frozenset.

    Returns:
        frozenset of unicode stop words for O(1) membership tests.
    """
    path = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/stopwords'
    # `with` guarantees the file handle is closed; the original leaked it.
    with open(path, 'rb') as fin:
        content = fin.read().decode('utf-8')
    return frozenset(content.splitlines())
# Module-level stop-word set, loaded once at import time.
stopwords = load_stopwords()
# Token whitelist: ASCII alphanumerics/underscore or CJK ideographs
# (U+4E00-U+9FA5) only.  Python 2 `ur` raw-unicode literal; \uXXXX escapes
# are still interpreted in raw unicode strings.
chinese = re.compile(ur'^[0-9a-zA-Z_\u4e00-\u9fa5]+$')
def filter_func(word):
    """Return True iff word is purely alphanumeric/underscore/CJK and not a stop word."""
    # Call the precompiled pattern's own .match and coerce with bool();
    # the original's `True if re.match(...) else False` was redundant.
    return bool(chinese.match(word)) and word not in stopwords
def my_tokenizer(sentence):
    """Segment sentence with jieba and keep only tokens accepted by filter_func.

    Returns a list of unicode tokens (Python 2 `filter` on a list also
    returned a list, so the comprehension is behaviorally identical).
    """
    return [token for token in jieba.lcut(sentence) if filter_func(token)]
def word_and_weight(corpus):
    """For each document in corpus, yield a list of (word, tfidf_weight) pairs.

    The TF-IDF model is fitted on the whole corpus with L1-normalized rows.
    A token that survived tokenization but is absent from the fitted
    vocabulary gets weight 0.0.

    Args:
        corpus: indexable sequence of unicode sentences.

    Yields:
        list of (word, float weight) tuples, one list per input document.
    """
    vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, norm='l1')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    for row_idx, sentence in enumerate(corpus):
        pairs = []
        for word in my_tokenizer(sentence):
            # vocabulary_.get returns None for out-of-vocabulary words, and
            # indexing a sparse matrix with None raises TypeError -- which the
            # original `except IndexError` never caught.  Check explicitly so
            # OOV words really do fall back to 0.0 instead of crashing.
            col_idx = vectorizer.vocabulary_.get(word)
            weight = tfidf_matrix[row_idx, col_idx] if col_idx is not None else 0.0
            pairs.append((word, weight))
        yield pairs
def load_corpus():
    """Load the corpus file: take the second tab-separated field of each line.

    Lines without a second field are silently skipped (deliberate
    best-effort parsing, preserved from the original).

    Returns:
        list of unicode sentences.
    """
    path = '/Users/fhqplzj/PycharmProjects/data_service/service/dic/why/why'
    # `with` guarantees the file handle is closed; the original leaked it.
    with open(path, 'rb') as fin:
        content = fin.read().decode('utf-8')
    lines = []
    for line in content.splitlines():
        try:
            lines.append(line.split('\t')[1])
        except IndexError:
            pass
    return lines
# Driver: print (word, weight) pairs for the first 1250 documents, one
# "word weight" pair per line, with a blank line separating documents.
# Python 2 print statements; islice avoids materializing the whole generator.
for pair in itertools.islice(word_and_weight(load_corpus()), 1250):
    for two in pair:
        print two[0], two[1]
    print