import numpy as np
import xlrd
import re
import json
# 数据加载
# jieba词库设置
#读取文本,读取其中1个
#统计包含a的单词和不包含a的单词
#统计主题
def chisquare(datasetTrue, datasetFlase):
    """Score every word by a chi-square style statistic over two document sets.

    For each word a 2x2 contingency table is built from document counts:

        a = positive documents containing the word
        b = negative documents containing the word
        c = positive documents not containing the word
        d = negative documents not containing the word

    and the score is ``(a*d - b*c)**2 / ((a+c) * (a+b) * (b+d) * (c+d))``.
    The constant factor N (total number of documents) of the textbook
    chi-square statistic is left out, exactly as in the original formula;
    it is identical for every word, so the ranking is not affected.

    Args:
        datasetTrue: Sequence of tokenised positive documents; each document
            is an iterable of hashable words.
        datasetFlase: Sequence of tokenised negative documents.

    Returns:
        dict mapping every word seen in either dataset to its score. A word
        whose contingency table has a zero margin (one class is empty, or
        the word occurs in every document) gets 0.0, because the statistic
        is undefined there and the word cannot separate the classes.

    Side effects:
        Prints ``word score`` for every scored word.
    """
    def document_frequency(dataset):
        # Number of documents containing each word. ``set(doc)`` makes a
        # word count once per document even if the caller did not
        # de-duplicate the tokens.
        freq = {}
        for doc in dataset:
            for word in set(doc):
                freq[word] = freq.get(word, 0) + 1
        return freq

    dicta = document_frequency(datasetTrue)
    dictb = document_frequency(datasetFlase)
    total_true = len(datasetTrue)
    total_false = len(datasetFlase)

    dictres = {}
    for w in set(dicta) | set(dictb):
        a = dicta.get(w, 0)
        b = dictb.get(w, 0)
        c = total_true - a
        d = total_false - b
        denominator = (a + c) * (a + b) * (b + d) * (c + d)
        # Guard the degenerate tables instead of raising ZeroDivisionError.
        chi = (a * d - b * c) ** 2 / denominator if denominator else 0.0
        print(w, chi)
        dictres[w] = chi
    return dictres
def data_gen(xlsfile=r'E:\work\chisquare\file\test0111.xlsx'):
    """Load the labelled texts from the first sheet of an Excel workbook.

    The first row is skipped. In every following row, column index 1 is read
    as the label and column index 2 as the text.

    Args:
        xlsfile: Path of the workbook to read. Defaults to the path this
            script was originally hard-wired to, so existing ``data_gen()``
            calls keep working.

    Returns:
        Tuple ``(texts_labelled_1, texts_labelled_0)`` of two lists. Rows
        whose label is neither 1 nor 0 are skipped.
    """
    # NOTE(review): xlrd 2.x reportedly reads only legacy .xls files; confirm
    # the installed xlrd version can open this .xlsx workbook.
    book = xlrd.open_workbook(xlsfile)  # workbook object for the Excel file
    sheet0 = book.sheet_by_index(0)  # first sheet, looked up by index
    jiahuoTrue = []
    jiahuoFlase = []
    for rownum in range(1, sheet0.nrows):
        rowvalue = sheet0.cell_value(rownum, 2)
        rowflag = sheet0.cell_value(rownum, 1)
        if rowflag == 1:
            jiahuoTrue.append(rowvalue)
        elif rowflag == 0:
            jiahuoFlase.append(rowvalue)
    return jiahuoTrue, jiahuoFlase
def getwordlist(datasetTrue, datasetFlase):
    """Collect the vocabulary of both datasets.

    Args:
        datasetTrue: Iterable of documents, each an iterable of words.
        datasetFlase: Iterable of documents, each an iterable of words.

    Returns:
        set containing every distinct word found in either dataset.
    """
    vocabulary = {token for document in datasetTrue for token in document}
    vocabulary.update(token for document in datasetFlase for token in document)
    return vocabulary
# 运行评分过程
def run():
    """Tokenise the labelled texts, score every word and return the scores.

    Loads the two text lists with ``data_gen()``, cuts each text into words
    with jieba, and passes the tokenised documents to ``chisquare``.

    Returns:
        pandas.DataFrame with the columns ``word`` and ``score``, one row
        per scored word.
    """
    # Imported locally: no ``import jieba`` appears above this function in
    # the file, so relying on a module-level name raises NameError here.
    import jieba
    import pandas as pd

    jt, jf = data_gen()
    # De-duplicate the tokens of each text so that a document contributes a
    # word at most once to the document counts.
    datasetTrue = [list(set(jieba.lcut(d))) for d in jt]
    datasetFlase = [list(set(jieba.lcut(d))) for d in jf]
    scores = chisquare(datasetTrue, datasetFlase)
    return pd.DataFrame(list(scores.items()), columns=['word', 'score'])
if __name__ == "__main__":
    # Script entry point: compute the per-word scores, then write them to
    # the file "res2" ordered by ascending score.
    df1 = run()
    ranked = df1.sort_values("score")
    ranked.to_csv("res2")
    print("end")
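# Previous (old) version below: it computes the score one word at a time; the algorithm is wrong and it takes several minutes to run.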
import jieba
import numpy as np
import xlrd
import re
import json
# 数据加载
# jieba词库设置
#读取文本,读取其中1个
#统计包含a的单词和不包含a的单词
#统计主题
def chisquare(datasetTrue, datasetFlase, word):
    """Return the chi-square style score of a single word.

    The 2x2 contingency table is counted document by document:

        a = positive documents containing ``word``
        c = positive documents not containing ``word``
        b = negative documents containing ``word``
        d = negative documents not containing ``word``

    and the score is ``(a*d - b*c)**2 / ((a+c) * (a+b) * (b+d) * (c+d))``.

    Args:
        datasetTrue: Iterable of positive documents; each must support ``in``.
        datasetFlase: Iterable of negative documents; each must support ``in``.
        word: The word to score.

    Returns:
        The score as a float. 0.0 is returned when any margin of the table
        is zero (an empty class, a word present in every document, or a word
        present in none), where the formula would otherwise divide by zero.
    """
    a = b = c = d = 0
    for doc in datasetTrue:
        if word in doc:
            a += 1
        else:
            c += 1
    for doc in datasetFlase:
        if word in doc:
            b += 1
        else:
            d += 1
    denominator = (a + c) * (a + b) * (b + d) * (c + d)
    if denominator == 0:
        # Degenerate table: the statistic is undefined, and the word carries
        # no information for telling the two classes apart.
        return 0.0
    return (a * d - b * c) ** 2 / denominator
def data_gen(xlsfile=r'E:\work\chisquare\file\test0111.xlsx'):
    """Read the labelled texts from sheet 0 of an Excel workbook.

    Row 0 is skipped. For each later row the label is taken from column
    index 1 and the text from column index 2.

    Args:
        xlsfile: Workbook path. The default is the path that used to be
            hard-coded in the body, so calling ``data_gen()`` without
            arguments behaves as before.

    Returns:
        ``(jiahuoTrue, jiahuoFlase)``: the texts labelled 1 and the texts
        labelled 0, each as a list. Rows with any other label are ignored.
    """
    # NOTE(review): recent xlrd releases reportedly open only .xls files;
    # verify that the installed version supports this .xlsx path.
    book = xlrd.open_workbook(xlsfile)  # workbook object for the Excel file
    sheet0 = book.sheet_by_index(0)  # sheet object obtained by its index
    jiahuoTrue = []
    jiahuoFlase = []
    for rownum in range(1, sheet0.nrows):
        rowvalue = sheet0.cell_value(rownum, 2)
        rowflag = sheet0.cell_value(rownum, 1)
        if rowflag == 1:
            jiahuoTrue.append(rowvalue)
        elif rowflag == 0:
            jiahuoFlase.append(rowvalue)
    return jiahuoTrue, jiahuoFlase
def getwordlist(datasetTrue, datasetFlase):
    """Build the set of all distinct words occurring in the two datasets.

    Args:
        datasetTrue: Iterable of documents, each an iterable of words.
        datasetFlase: Iterable of documents, each an iterable of words.

    Returns:
        set of every word that appears in at least one document.
    """
    words = set()
    for dataset in (datasetTrue, datasetFlase):
        for document in dataset:
            words.update(document)
    return words
# 运行评分过程
def run():
    """Score every word of the corpus one at a time and print the notable ones.

    Loads the two text lists with ``data_gen()``, cuts each text into words
    with jieba, scores each vocabulary word with ``chisquare`` and prints
    ``word score`` for every word whose score exceeds 0.01.

    Returns:
        None.
    """
    jt, jf = data_gen()
    # Sets keep the membership results of ``word in doc`` identical to the
    # token lists while making each test constant-time instead of linear.
    datasetTrue = [set(jieba.lcut(d)) for d in jt]
    datasetFlase = [set(jieba.lcut(d)) for d in jf]
    scores = {}
    for word in getwordlist(datasetTrue, datasetFlase):
        scores[word] = chisquare(datasetTrue, datasetFlase, word)
        if scores[word] > 0.01:
            print(word, scores[word])
if __name__ == "__main__":
    # Script entry point: run the per-word scoring, then report completion.
    run()
    print("end")