import pandas as pd
import jieba
def gx_matrix(vol_li):
    # names maps 'dic_<word>' to a dict of {co-occurring word: count}
    names = {}
    all_col0 = []
    for row in vol_li:
        all_col0 += row
        for each in row:
            counts = names.setdefault('dic_' + each, {})
            for each1 in row:
                counts[each1] = counts.get(each1, 0) + 1
    # the sorted vocabulary forms both axes of the matrix
    all_col = sorted(set(all_col0))
    df_final0 = pd.DataFrame(columns=all_col)
    for each in all_col:
        row_counts = names.get('dic_' + each, dict.fromkeys(all_col, 0))
        temp = pd.DataFrame(row_counts, index=[each])
        df_final0 = pd.concat([df_final0, temp])
    # word pairs that never co-occur are filled with 0
    df_final = df_final0.fillna(0)
    return df_final
if __name__ == '__main__':
    temp1 = []
    # punctuation tokens to drop after segmentation (both comma forms included)
    stop_words = ['《', '》', '“', '?', '”', '[', ']', '"', ',', '，', '。', ':', '.', '(', ')']
    print(stop_words)
    with open('./标题.txt', 'r', encoding='gbk') as f:
        con = f.readlines()
    for i in con:
        for word in jieba.lcut(i):
            if word not in stop_words and word.strip() != '':
                temp1.append(word.strip())
    # all titles are pooled into a single document for co-occurrence counting
    temp_all = [temp1]
    vol_li = pd.Series(temp_all)
    df_matrix = gx_matrix(vol_li)
    print(df_matrix)
    df_matrix.to_csv(r'.\词频共现.csv')
```
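As a quick sanity check of `gx_matrix`, a minimal sketch is shown below; the toy token list is made up and not part of the original data, and it assumes the function from the block above is already defined in the same session:

```python
# hypothetical toy input: one pooled "document" of four tokens
toy = pd.Series([['数据', '分析', '数据', '挖掘']])
print(gx_matrix(toy))
# Each cell (i, j) counts how often word i and word j occur in the same document;
# the diagonal counts a word's co-occurrence with itself.
```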
```python
from pprint import pprint as pt
from tqdm import tqdm
import numpy as np
import jieba
from jieba.posseg import dt

# POS tags kept as keywords: place names, nouns, verbal nouns, verbs, adjectives
allow_pos = frozenset(('ns', 'n', 'vn', 'v', 'a'))
def buildmatrix(x, y):
    # x-by-y matrix of zeros; row 0 and column 0 are reserved for headers
    return [[0 for j in range(y)] for i in range(x)]
def dic(keygroup):
    # alternative keyword-index builder (kept from the original, not used by main)
    keytxt = '/'.join(keygroup)
    keylist = list(set([key for key in keytxt.split('/') if key != '']))
    keydic = {}
    pos = 0
    for i in keylist:
        pos = pos + 1
        keydic[pos] = str(i)
    return keydic
def showmatrix(matrix):
    # serialise the matrix as tab-separated rows
    matrixtxt = ''
    for i in tqdm(range(len(matrix))):
        for j in range(len(matrix)):
            matrixtxt = matrixtxt + str(matrix[i][j]) + '\t'
        matrixtxt = matrixtxt[:-1] + '\n'
    return matrixtxt
def inimatrix(matrix, dic, length):
    # write the keyword headers into row 0 and column 0
    matrix[0][0] = '+'
    for i in range(1, length):
        matrix[0][i] = dic[i]
        matrix[i][0] = dic[i]
    return matrix
def countmatrix(matrix, dic, mlength, keylis):
    # cell (i, j) counts the lines whose token lists contain both header words
    for i in range(1, mlength):
        for j in range(1, mlength):
            count = 0
            for ech in keylis:
                if str(matrix[0][i]) in ech and str(matrix[j][0]) in ech and str(matrix[0][i]) != str(matrix[j][0]):
                    count = count + 1
            matrix[i][j] = str(count)
    return matrix
def flag_filter(wp):
    # keep tokens whose POS tag is allowed and that are at least two characters long
    return (wp.flag in allow_pos) and (len(wp.word.strip()) >= 2)
def key_words(sentence):
    # POS-tag the text and number the distinct keywords from 1..n
    words = tuple(dt.cut(sentence))
    key_word = set()
    for wp in words:
        if flag_filter(wp):
            key_word.add(wp.word)
    keydic = {}
    for pos, word in enumerate(key_word, start=1):
        keydic[pos] = word
    return keydic
def data_loader(filepath):
    # keylis holds one token list per line; keydic indexes the keywords of the whole text
    keylis = []
    with open(filepath, 'r', encoding="utf-8") as f:
        content = f.readlines()
    sen = ''
    for i in content:
        keylis.append([j for j in jieba.cut(i.strip())])
        sen += i.strip()
    keydic = key_words(sen)
    return keydic, keylis
def main():
    filepath = r'aa.txt'
    keydic, keylis = data_loader(filepath)
    print("keylis:", keylis)
    print("keydic:", keydic)
    length = len(keydic) + 1
    matrix = buildmatrix(length, length)
    matrix = inimatrix(matrix, keydic, length)
    matrix = countmatrix(matrix, keydic, length, keylis)
    matrixtxt = showmatrix(matrix)  # tab-separated text version (not written to disk here)
    pt(matrix)
    np.savetxt('词频共现矩阵.csv', matrix, delimiter=',', fmt='%s')
    print("词频共现矩阵已生成!")  # "co-occurrence matrix generated!"

if __name__ == '__main__':
    main()
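
# Rough illustration of the saved CSV layout, assuming a hypothetical aa.txt whose
# keywords come out as {1: '数据', 2: '分析'}:
#
#   +,数据,分析
#   数据,0,<n>
#   分析,<n>,0
#
# where <n> is the number of lines whose token lists contain both words, '+' is the
# corner placeholder written by inimatrix, and the diagonal stays 0 because
# countmatrix skips identical word pairs.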