BM25同样需要计算TF、IDF以及文档权重,只不过其表达式与经典的TF-IDF不同,参数也更多一些。
附上之前比赛时使用的版本代码,比赛结果还不错。不过如果时间充裕的话,可以把BM25和TextRank结合起来,效果会更好(这是在一篇论文里看到的,有兴趣的可以去知网搜一下)。
#!/usr/bin/python
#-*- coding:UTF-8-*-
import jieba
import jieba.posseg as pseg #引入结巴分词词性标注
import jieba.analyse
import numpy as np
import pandas
import pandas as pd
import csv
from gensim import corpora,models,similarities #引入文本相似度库
from gensim.corpora import Dictionary
from gensim.models import TfidfModel,LdaModel
from pandas import DataFrame
from collections import defaultdict
import time
#=============================================训练集分词============================
#读取文件,主要用以生成词库
import math
class BM25(object):
def __init__(self, docs):
self.D = len(docs)
self.avgdl = sum([len(doc)+0.0 for doc in do