import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words demo, part 1: let scikit-learn build the count matrix.
vectorizer = CountVectorizer()

# Three short example sentences used as the whole corpus.
corpus = [
    'He is going from Beijing to Shanghai.',
    'He denied my request, but he actually lied.',
    'Mike lost the phone, and phone was in the car.',
]

# Learn the vocabulary from the corpus and vectorize it in one step.
X = vectorizer.fit_transform(corpus)

# The original had these statements fused on a single line, which does not
# parse; each is now its own statement. The printed strings are unchanged.
print("文本的向量表示:Bag of Words")
print("sklearn函数输出:")
# toarray() converts the result to a dense array so the counts are visible.
print(X.toarray())
# 1.2 手动计算 (manual computation)
# Bag-of-words demo, part 2: build the count matrix by hand.
#
# word_voc is the hard-coded, lower-cased vocabulary in alphabetical order;
# each entry corresponds to one column of the resulting matrix.
# NOTE(review): presumably chosen to match the column order produced by
# CountVectorizer above -- confirm by comparing Y with X.toarray().
word_voc = [
    'actually', 'and', 'beijing', 'but', 'car', 'denied', 'from', 'going',
    'he', 'in', 'is', 'lied', 'lost', 'mike', 'my', 'phone', 'request',
    'shanghai', 'the', 'to', 'was',
]

Y = []
for sentence_ in corpus:
    # Drop the last character (the trailing period), remove commas, split on
    # whitespace, and lower-case every token.
    sentence = [x.lower() for x in sentence_[:-1].replace(',', '').split()]

    # Count tokens once per sentence rather than once per vocabulary word.
    counts = Counter(sentence)

    # A Counter yields 0 for words it has not seen, so every vocabulary slot
    # gets a value even when the word is absent from this sentence.
    vector = [counts[word] for word in word_voc]
    Y.append(vector)

# One row per sentence, one column per vocabulary word.
Y = np.array(Y)