# Note: this code was run with Anaconda3.
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 10 11:02:50 2017
@author: Administrator
"""
#用余弦相似性进行比较
import numpy as np
#Numpy是Python的一个科学计算的库,提供了矩阵运算的功能
import jieba
#python分词器
import copy
import codecs,sys
#自然语言编码转换
# Two sample titles used as input by the script's __main__ section.
title1 = "爱剪辑-危机四伏,落水就会被高压电弄死"
title2 = "张翰怒斥耍大牌被换角谣言"
#title1 = "王凯《跨界歌王》姗姗来迟“低音炮”开嗓献唱 - 搜狐视频"
#title2 = "《跨界歌王》王凯清唱“好久不见” 低音炮名不虚传_视频在线观看 - 56.com"
#title1 = "王凯《跨界歌王》姗姗来迟“低音炮”开嗓献唱 - 搜狐视频"
#title2 = "范爷维权获赔15万全捐赠"
#sampfn = "C:\\Users\\Administrator\\Desktop\\sample.txt"
#定义了一个余弦相似度函数
def get_cossimi(x, y):
    """Return the cosine similarity of two numeric vectors.

    Args:
        x: Sequence of numbers (anything ``np.array`` can convert).
        y: Sequence of numbers with the same shape as ``x``.

    Returns:
        ``sum(x * y) / (norm(x) * norm(y))`` as a NumPy float. When either
        vector has zero norm (all zeros, or empty input) the similarity is
        mathematically undefined; 0.0 is returned instead of NaN.

    Raises:
        ValueError: If the inputs cannot be converted to float arrays or
            their shapes cannot be multiplied element-wise.
    """
    vec_x = np.array(x, dtype=float)
    vec_y = np.array(y, dtype=float)

    dot = np.sum(vec_x * vec_y)
    # Use np.sum (not the builtin sum) so the norms are reduced over all
    # elements exactly like the dot product above.
    norm_x = np.sqrt(np.sum(vec_x * vec_x))
    norm_y = np.sqrt(np.sum(vec_y * vec_y))

    denominator = norm_x * norm_y
    if denominator == 0:
        # Avoid 0/0, which would yield NaN and a RuntimeWarning.
        return np.float64(0.0)
    return dot / denominator
if __name__ == '__main__':
print("loading...")
print("working...")
#title1进行分词
f1_seg_list = jieba.cut(title1)#需要添加一个词典,来弥补结巴分词中没有的词语,从而保证更高的正确率
#title1进行分词