【人工智能项目】深度学习实现知乎文本摘要
任务说明
文本摘要是NLP领域经典任务之一,在当前互联网时代,用户可以通过自动文本摘要对海量文本信息进行简化处理。本次比赛的的任务就是自动文本摘要。
使用gensim来生成摘要
import numpy as np
import pandas as pd
from gensim.summarization.summarizer import summarize
testdf=pd.read_csv("data/test.csv")
testdf.head(10)
from pyltp import Segmentor
from pyltp import SentenceSplitter
_segmentor = None
_sent_splitter = None
def split(content):
'''分句和分词'''
global _segmentor, _sent_splitter
if _segmentor is None:
model_path = r'cws.model'
segmentor = Segmentor() # 初始化实例
segmentor.load(model_path) # 加载分词模型
_segmentor = segmentor # 设置全局变量, 避免每次都重新加载模型, 耗时
_sent_splitter = SentenceSplitter() # 句子分割模型
sents = _sent_splitter.split(content) # 先进行分句
s = []
for sent in sents:
words = _segmentor.segment(sent) # 分词
sent = ' '.join(words) # 用空格把词隔开
s.append(sent)
content = '. '.join(s) # 用.把句子隔开
return content
def clean(content):
content = str(content).replace('.', '') # 删除句子分隔符
content = str(content).replace(' ', '') # 删除空格
return content
from gensim.summarization.summarizer import summarize
result=[]
#for i in range(10):
for i in testdf.index:
t=testdf['article'][i]
tokens = split(t)
s=summarize(tokens)
s=clean(s)
print(i,s+'\n')
result.append([i,s])
predresult=pd.DataFrame(result)
predresult.to_csv("mysubmission.csv",header=None, index=None)
使用textrank来生成摘要
import pandas as pd
from tqdm import tqdm
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
train_df.head()
from textrank4zh import TextRank4Keyword,TextRank4Sentence
texts = test_df['article'].values
res = []
for text in tqdm(texts):
text = text.replace('<Paragraph>',' ')
tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source = 'no_stop_words')
key_sentences = tr4s.get_key_sentences(num=1, sentence_min_len=20)
res.append(key_sentences[0]['sentence'])
sub = pd.DataFrame()
sub['sum'] = res
sub.head()
sub.to_csv('sub.csv',header=None)
小结
下次见!!!