# -*- coding: utf-8 -*-
# * @author haoming
# * @date 2016/11/08
import os
os.chdir(u"G:\project\LRModel\wordseg")
import MySQLdb
import pandas as pd
import re
import codecs
import jieba
import jieba.analyse
import logging
from gensim import corpora, models, similarities
## Extract keywords from trade_filter1 and write them out as a user dict for later word segmentation
def write_userdict():
    sql = 'select distinct trade_filter1 from clue limit 5'
    data = pd.read_sql(sql, conn)  # conn is the module-level connection opened in __main__
    # With charset='utf8' on the connection the column already comes back
    # as unicode, so the values can be joined directly.
    dataset = " ".join(data['trade_filter1'])
    tags = jieba.analyse.extract_tags(dataset)  # topK defaults to 20
    f = codecs.open('dict.txt', 'wb', 'utf-8')
    # jieba's user dict format expects one word per line.
    f.write("\n".join(tags))
    f.close()
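## A minimal usage sketch (assumes __main__ has already opened conn and
## write_userdict has produced dict.txt): load the custom dictionary into
## jieba before segmenting so the trade keywords are kept intact.
#   write_userdict()
#   jieba.load_userdict('dict.txt')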
## Select reg_no and bus_scope, and segment the bus_scope text
def word_cut(newdata):
    # Strip letters, digits and punctuation (ASCII and full-width), then all whitespace.
    r1 = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    r2 = u'\s+'
    word_seg = []
    for row in newdata:
        seg = "\n".join(row)  # each row is a one-element array holding bus_scope
        seg_list = jieba.cut(re.sub(r2, "", re.sub(r1, "", seg)), cut_all=True)
        word_seg.append(' '.join(seg_list))
    texts = [document.split() for document in word_seg]
    return texts
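## A toy sketch of what word_cut returns (input rows are made up): each
## bus_scope string becomes a token list; cut_all=True yields overlapping cuts.
#   word_cut([[u'软件开发;技术服务'], [u'食品销售']])
#   # -> e.g. [[u'软件', u'软件开发', u'开发', ...], [u'食品', u'销售']]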
## Given a query phrase, find the most similar documents and write the scores to simscore.txt
def calculate_sim(word_list):  # TF-IDF + LSI
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary(word_list)
    corpus = [dictionary.doc2bow(text) for text in word_list]  # per-document word counts (bag-of-words)
    tfidf = models.TfidfModel(corpus)  # fit a TF-IDF model on these "training" documents
    corpus_tfidf = tfidf[corpus]  # re-express the count vectors as TF-IDF vectors
    # Project the TF-IDF vectors into a 30-topic latent semantic space.
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=30)
    corpus_lsi = lsi[corpus_tfidf]
print "*****"*20
#for doc in corpus_lsi:
# print doc
    q_file = codecs.open('query.txt', 'r', 'utf-8')
    query = q_file.readline()
    q_file.close()
    query = list(jieba.cut(query, cut_all=False))
    query_bow = dictionary.doc2bow(query)
    # The query must go through the same tfidf -> lsi pipeline as the corpus.
    query_lsi = lsi[tfidf[query_bow]]
    index = similarities.MatrixSimilarity(corpus_lsi)  # build the index once for later similarity queries
    sims = index[query_lsi]
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print sort_sims
    sim_file = open('simscore.txt', 'w')
    for i in sort_sims:
        sim_file.write(str(i) + '\n')  # mind the encoding when writing text files
    sim_file.close()
    return sort_sims
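## A minimal persistence sketch (file names are assumed): in a real
## pipeline the fitted models would be saved so queries do not retrain
## on every run.
#   dictionary.save('clue.dict')
#   lsi.save('clue.lsi')
#   index.save('clue.index')
#   # later: corpora.Dictionary.load('clue.dict'), models.LsiModel.load('clue.lsi'), ...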
if __name__ == "__main__":
    conn = MySQLdb.connect(host='10.0.0.2', user='root', passwd='root',
                           db='clue', port=3306, charset='utf8')
    sql = 'select reg_no, trade_filter1, bus_scope from clue where length(bus_scope)>=6 limit 5'
    data = pd.read_sql(sql, conn)
    newdata = data.as_matrix(columns=['bus_scope'])
    result = word_cut(newdata)
    simscore = calculate_sim(result)
# Given a document, query the database for the documents most similar to it.
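## A hedged sketch of that end-to-end flow (the reg_no value is hypothetical):
## pull one document's bus_scope from the database, use it as the query, then
## rank every document against it via calculate_sim.
#   q = pd.read_sql("select bus_scope from clue where reg_no = '123456'", conn)
#   qf = codecs.open('query.txt', 'w', 'utf-8')
#   qf.write(q['bus_scope'][0])
#   qf.close()
#   calculate_sim(result)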