词袋法
使用sklearn提供的CountVectorizer类,样例如下:
def corpus4sklearn():
    """Return a tiny two-document sample corpus for the sklearn examples."""
    sample_docs = ["i am chinese", "i am in Hubei"]
    return sample_docs
def bow_extracter(corpus):
    """Extract bag-of-words features from *corpus* and delegate to do_extracter.

    ngram_range controls how many consecutive words form one feature; plain
    BoW uses single words (1, 1) — larger n-grams blow up the feature count.
    """
    bow_vectorizer = CountVectorizer(
        stop_words=None,
        ngram_range=(1, 1),
        analyzer="word",
    )
    return do_extracter(bow_vectorizer, corpus)
def do_extracter(vectorizer, corpus):
    """Fit *vectorizer* on *corpus*, report its stats, and return the matrix.

    Args:
        vectorizer: any object with sklearn's vectorizer interface
            (``fit_transform`` plus a ``vocabulary_`` attribute).
        corpus: iterable of document strings.

    Returns:
        The document-term matrix produced by ``vectorizer.fit_transform``.
        (The original version returned None, which made the ``return
        do_extracter(...)`` in bow_extracter useless — fixed here.)
    """
    mat = vectorizer.fit_transform(corpus)
    # shape is (n_documents, n_features); column count = vocabulary size
    print("feature num:%d" % mat.shape[1])
    print("vocab:%s" % vectorizer.vocabulary_)
    return mat