一.原理部分
1.TF/IDF原理
https://blog.youkuaiyun.com/asialee_bird/article/details/81486700
2.TextRank原理
https://blog.youkuaiyun.com/qq_41664845/article/details/82869596
3.LSI原理
https://blog.youkuaiyun.com/qq_16633405/article/details/80577851
4.LDA原理
https://blog.youkuaiyun.com/u011808673/article/details/82497195
https://blog.youkuaiyun.com/btujack/article/details/98477061
二.对2019年12月14日贸易战新闻进行提取关键词,训练corpus为人民日报新闻
# -*- coding: utf-8 -*-
import math
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from jieba import analyse
import functools
# 停用词表加载方法
def get_stopword_list(stop_word_path='stopword.txt'):
    """Load the stop-word list from a UTF-8 text file, one word per line.

    Args:
        stop_word_path: path to the stop-word file. Defaults to
            'stopword.txt' (the original hard-coded path), so existing
            callers are unaffected.

    Returns:
        list[str]: the stop words, with newline characters removed.
    """
    # `with` guarantees the file handle is closed; the original opened
    # the file inline and leaked the handle.
    with open(stop_word_path, encoding='utf-8') as f:
        # readlines() in text mode yields lines ending in '\n'; strip it.
        return [line.replace('\n', '') for line in f.readlines()]
# 分词方法,调用结巴接口
def seg_to_list(sentence, pos=False):
    """Segment a sentence with the jieba tokenizer.

    Args:
        sentence: raw text to segment.
        pos: when True, use the POS-tagging tokenizer (jieba.posseg),
            whose items carry .word / .flag attributes; when False, use
            plain jieba.cut, which yields bare strings.

    Returns:
        A lazy generator of tokens from the chosen tokenizer.
    """
    # Select the tagging or the plain tokenizer based on the caller's flag.
    return psg.cut(sentence) if pos else jieba.cut(sentence)
# 去除干扰词
def word_filter(seg_list, pos=False):
stopword_list = get_stopword_list()
filter_list = []
# 根据POS参数选择是否词性过滤
## 不进行词性过滤,则将词性都标记为n,表示全部保留
for seg in seg_list:
if not pos:
word = seg
flag = 'n'
else:
word = seg.word
flag = seg.flag
if not flag.startswith('n'):

[Note: the article is truncated here — the remainder of `word_filter` and the rest of the code are behind the CSDN paywall and are not included in this copy.]





