GitHub repository: https://github.com/fxsjy/jieba
Example 1: segment a txt file with jieba, count the resulting tokens, and write the counts to result.txt.
Source: http://www.cnblogs.com/chenbjin/p/3843800.html
import jieba
import sys

reload(sys)
sys.setdefaultencoding('utf8')

def fenci(argv):
    # Read the whole input file (path passed on the command line).
    filename = argv[1]
    f = open(filename, 'r')
    file_list = f.read()
    f.close()

    # Segment with jieba in full mode (cut_all=True).
    seg_list = jieba.cut(file_list, cut_all=True)

    # Count every token; stripping whitespace first also removes
    # empty and newline-only segments.
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg != '':
            if seg in tf:
                tf[seg] += 1
            else:
                tf[seg] = 1

    # Write "token count" pairs to result.txt.
    f = open("result.txt", "w")
    for item in tf:
        f.write(item + " " + str(tf[item]) + "\n")
    f.close()

if __name__ == '__main__':
    fenci(sys.argv)
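As a side note (not from the original post): on Python 3 the same counting logic can be written with collections.Counter, and the reload(sys)/setdefaultencoding hack becomes unnecessary because file I/O is decoded explicitly. A minimal sketch, assuming a UTF-8 input file:

import sys
from collections import Counter

import jieba

def count_tokens(filename):
    # Python 3 decodes the file for us; no default-encoding hack needed.
    with open(filename, encoding='utf-8') as f:
        text = f.read()

    # Full-mode segmentation, as in the original example.
    tokens = (seg.strip() for seg in jieba.cut(text, cut_all=True))

    # Counter replaces the manual if/else counting dictionary.
    tf = Counter(seg for seg in tokens if seg)

    # most_common() writes the tokens in descending count order.
    with open('result.txt', 'w', encoding='utf-8') as out:
        for token, count in tf.most_common():
            out.write('%s %d\n' % (token, count))

if __name__ == '__main__':
    count_tokens(sys.argv[1])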
Example 2: http://www.cnblogs.com/chenbjin/p/3851165.html
Segment 100 documents, then compute TF-IDF over them with scikit-learn; the resulting keyword weights are quite good.
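For reference, with its default settings (smooth_idf=True, L2 normalization) the TfidfTransformer used below computes, for term t in document d,

\mathrm{tfidf}(t,d) = \mathrm{tf}(t,d)\cdot\left(\ln\frac{1+n}{1+\mathrm{df}(t)}+1\right)

where tf(t,d) is the raw count of t in d, n is the number of documents, and df(t) is the number of documents containing t; each document's vector is then scaled to unit L2 norm.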
import os
import sys
import string
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

reload(sys)
sys.setdefaultencoding('utf8')

# Get the list of files (the directory holds the 100 documents).
def getFilelist(argv):
    path = argv[1]
    filelist = []
    for f in os.listdir(path):
        # Skip hidden files such as .DS_Store.
        if not f.startswith('.'):
            filelist.append(f)
    return filelist, path

# Segment one document and save the result.
def fenci(filename, path):
    # Directory that holds the segmentation results.
    sFilePath = './segfile'
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    # Read the document.
    f = open(os.path.join(path, filename), 'r')
    file_list = f.read()
    f.close()

    # Segment in full mode (cut_all=True).
    seg_list = jieba.cut(file_list, cut_all=True)

    # Drop empty and whitespace-only segments.
    result = []
    for seg in seg_list:
        seg = ''.join(seg.split())
        if seg != '':
            result.append(seg)

    # Join the tokens with spaces and save locally, e.g.
    # "我来到北京清华大学" is written as "我 来到 北京 清华大学".
    f = open(sFilePath + "/" + filename + "-seg.txt", "w")
    f.write(' '.join(result))
    f.close()

# Read the 100 segmented documents and compute TF-IDF.
def Tfidf(filelist):
    path = './segfile/'
    corpus = []  # Segmented text of all 100 documents.
    for ff in filelist:
        # fenci() saved each document as <name>-seg.txt.
        f = open(path + ff + "-seg.txt", 'r')
        corpus.append(f.read())
        f.close()

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # every term in the corpus
    weight = tfidf.toarray()               # TF-IDF matrix, documents x terms

    sFilePath = './tfidffile'
    if not os.path.exists(sFilePath):
        os.mkdir(sFilePath)

    # Write each document's TF-IDF weights into the tfidffile directory.
    for i in range(len(weight)):
        outname = sFilePath + '/' + string.zfill(i, 5) + '.txt'
        print "--------Writing all the tf-idf of document", i, "into", outname, "--------"
        f = open(outname, 'w')
        for j in range(len(word)):
            f.write(word[j] + " " + str(weight[i][j]) + "\n")
        f.close()

if __name__ == "__main__":
    (allfile, path) = getFilelist(sys.argv)
    for ff in allfile:
        print "Using jieba on " + ff
        fenci(ff, path)
    Tfidf(allfile)
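A possible simplification (not from the original post): TfidfVectorizer combines CountVectorizer and TfidfTransformer in one step, and the top keywords of each document can be read directly off the matrix. A minimal Python 3 sketch; the three-document corpus is a stand-in for the 100 segmented files, and get_feature_names_out requires scikit-learn >= 1.0:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Stand-in corpus: each entry is one pre-segmented document,
# i.e. jieba tokens joined with spaces as in the example above.
corpus = [
    '我 来到 北京 清华大学',
    '他 来到 了 网易 杭研 大厦',
    '小明 硕士 毕业 与 中国 科学院',
]

# TfidfVectorizer = CountVectorizer + TfidfTransformer in one step.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()  # scikit-learn >= 1.0

# Print the top 3 keywords of each document by TF-IDF weight.
weight = tfidf.toarray()
for i, row in enumerate(weight):
    top = np.argsort(row)[::-1][:3]
    keywords = ', '.join('%s (%.3f)' % (words[j], row[j]) for j in top)
    print('doc %d: %s' % (i, keywords))

Note that the default token_pattern keeps only tokens of two or more characters, so single-character words such as 我 or 与 are dropped; this matches the behavior of the CountVectorizer in the original code.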