# Chinese word segmentation techniques
# jieba's three segmentation modes
import jieba
sent = "中文分词是文本处理不可缺少的一步!"
seg_list = jieba.cut(sent, cut_all=True)
print("Full mode:", '/'.join(seg_list))
seg_list = jieba.cut(sent, cut_all=False)
print("Precise mode:", '/'.join(seg_list))
seg_list = jieba.cut_for_search(sent)
print("Search engine mode:", '/'.join(seg_list))
# High-frequency word extraction
def get_content(path):
    with open(path, 'r', encoding='gbk', errors='ignore') as f:
        content = ''
        for l in f:
            l = l.strip()  # remove leading/trailing whitespace from each line
            content += l
        return content
def get_TF(words, topk=10):
    tf_dic = {}
    for w in words:  # count every word; get() returns 0 for words not yet in the dict
        tf_dic[w] = tf_dic.get(w, 0) + 1
    # sorted() arguments: the iterable, the key to compare on, and reverse=True for descending order
    return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:topk]
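
# A minimal usage sketch tying the two helpers above together. The corpus path and the
# single-character filter below are illustrative assumptions, not part of the original.
corpus = get_content('./data/news_corpus.txt')  # hypothetical corpus file
split_words = [w for w in jieba.cut(corpus) if len(w) > 1]  # crude filter: drop single-character tokens
print('Sample of segmented words:', '/'.join(split_words[:10]))
print('Top 10 words by frequency:', str(get_TF(split_words)))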
# Keyword extraction
from jieba import analyse

def keyword_extract(data):
    tfidf = analyse.extract_tags  # TF-IDF based keyword extraction provided by jieba
    keywords = tfidf(data)
    return keywords
def getkeywords(docpath, savepath):
    with open(docpath, 'r') as docf, open(savepath, 'w') as outf:
        for data in docf:
            data = data.strip()  # drop the trailing newline
            keywords = keyword_extract(data)
            for word in keywords:
                outf.write(word + ' ')
            outf.write('\n')
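
# extract_tags can also be called directly with documented optional parameters; reusing
# the short sentence from above, this sketch prints the top keywords with their TF-IDF
# weights and, for comparison, jieba's TextRank-based extractor.
for word, weight in analyse.extract_tags(sent, topK=5, withWeight=True):
    print(word, weight)
print(analyse.textrank(sent, topK=5))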
# Part-of-speech tagging
import jieba.posseg as psg
sent = "中文分词是文本处理不可或缺的一步!"
seg_list = psg.cut(sent)
print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))
# Expected output: 中文/nz 分词/n 是/v 文本处理/n 不可或缺/l 的/uj 一步/m !/x
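
# A small follow-on sketch: keep only the noun-like tokens (tags starting with 'n') from
# the POS-tagged result. The filtering rule is an illustrative assumption, not from the original.
nouns = [w for w, t in psg.cut(sent) if t.startswith('n')]
print('Nouns:', '/'.join(nouns))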