001 常见词向量表达
https://blog.youkuaiyun.com/sinat_26917383/article/details/52162589
https://blog.youkuaiyun.com/hubin232/article/details/81272126 【比较新】
这里我直接用 BoW(词袋)的词频作为词的权重。
测试文本用 BoW(词袋模型)表示
import json
import os
from collections import Counter
import time
#每篇文档用bow表示
def gen_vector(path):
testVector={}
cate=os.listdir(path)
for i,category in enumerate(cate):
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), '>', '=' * 30 + '[' + category + ']' + '=' * 30)
file_path=path+category+'/'
file_list=os.listdir(file_path)
testVector.setdefault(category,{})
for j,file_name in enumerate(file_list):
full_path=file_path+file_name
with open(full_path,"r",encoding='utf-8') as f:
conten