# -*- coding: utf-8 -*-
"""
Created on Wed Oct 26 21:35:31 2016
@author: sirius
test word2word
"""
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Bag-of-words demo: CountVectorizer collects every distinct token found in
# the corpus into a vocabulary (one column per token) and produces one row
# per sentence. Each cell holds how many times that token occurs in the
# sentence; in this corpus no word repeats within a sentence, so every cell
# ends up as 1 (present) or 0 (absent).
corpus = [
    "this is the first document",
    "this is the second document",
    "And the third one",
    "what the hell is that",
]

# min_df=1 keeps any token that appears in at least one document.
vectorizer = CountVectorizer(min_df=1)

# Learn the vocabulary and build the document-term matrix in a single call,
# then expand the sparse result into a dense array.
x = vectorizer.fit_transform(corpus).toarray()
"""
结果如下:
[[0 1 1 0 1 0 0 0 1 0 1 0]
[0 1 0 0 1
机器学习(九)使用sklearn库进行数据分析_——文本特征处理
最新推荐文章于 2024-09-20 19:36:12 发布