""" 构建霍夫曼树 """ class HuffmanNode: def __init__(self, word_id, frequency): self.word_id = word_id # 叶子结点存词对应的id, 中间节点存中间节点id self.frequency = frequency # 存单词频次 self.left_child = None self.right_child = None self.father = None self.Huffman_code = [] # 霍夫曼码(左1右0) self.path = [] # 根到叶子节点的中间节点id class HuffmanTree: def __init__(self, wordid_frequency_dict): self.word_count = len(wordid_frequency_dict) # 单词数量 self.wordid_code = dict() self.wordid_path = dict() self.root = None unmerge_node_list = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()] # 未合并节点list self.huffman = [HuffmanNode(wordid, frequency) for wordid, frequency in wordid_frequency_dict.items()] # 存储所有的叶子节点和中间节点 # 构建huffman tree self.build_tree(unmerge_node_list) # 生成huffman code self.generate_huffman_code_and_path() def merge_node(self, node1, node2): sum_frequency = node1.frequency + node2.frequency mid_node_id = len(self.huffman) # 中间节点的value存中间节点id father_node = HuffmanNode(mid_node_id, sum_frequency) if node1.frequency >= node2.frequency: father_node.left_child = node1 father_node.right_child = node2 else: father_node.left_child = node2 father_node.right_child = node1 self.huffman.append(father_node) return father_node def build_tree(self, node_list): while len(node_list) > 1: i1 = 0 # 概率最小的节点 i2 = 1 # 概率第二小的节点 if node_list[i2].frequency < node_list[i1].frequency: [i1, i2] = [i2, i1] for i in range(2, len(node_list)): if node_list[i].frequency < node_list[i2].frequency: i2 = i if node_list[i2].frequency < node_list[i1].frequency: [i1, i2] = [i2, i1] father_node = self.merge_node(node_list[i1], node_list[i2]) # 合并最小的两个节点 if i1 < i2: node_list.pop(i2) node_list.pop(i1) elif i1 > i2: node_list.pop(i1) node_list.pop(i2) else: raise RuntimeError('i1 should not be equal to i2') node_list.insert(0, father_node) # 插入新节点 self.root = node_list[0] def generate_huffman_code_and_path(self): stack = [self.root] while len(stack) > 0: node = stack.pop() # 顺着左子树走 while node.left_child or node.right_child: code = node.Huffman_code path = node.path node.left_child.Huffman_code = code + [1] node.right_child.Huffman_code = code + [0] node.left_child.path = path + [node.word_id] node.right_child.path = path + [node.word_id] # 把没走过的右子树加入栈 stack.append(node.right_child) node = node.left_child word_id = node.word_id word_code = node.Huffman_code word_path = node.path self.huffman[word_id].Huffman_code = word_code self.huffman[word_id].path = word_path # 把节点计算得到的霍夫曼码、路径 写入词典的数值中 self.wordid_code[word_id] = word_code self.wordid_path[word_id] = word_path # 获取所有词的正向节点id和负向节点id数组 def get_all_pos_and_neg_path(self): positive = [] # 所有词的正向路径数组 negative = [] # 所有词的负向路径数组 for word_id in range(self.word_count): pos_id = [] # 存放一个词 路径中的正向节点id neg_id = [] # 存放一个词 路径中的负向节点id for i, code in enumerate(self.huffman[word_id].Huffman_code): if code == 1: pos_id.append(self.huffman[word_id].path[i]) else: neg_id.append(self.huffman[word_id].path[i]) positive.append(pos_id) negative.append(neg_id) return positive, negative def main(): words = "你 我 他 你们 我们 他们 它们" freqs = "50 10 8 7 6 3 2" word_to_id = dict((word, i) for i, word in enumerate(words.split())) print(word_to_id) word_frequency = dict((word_to_id[x], int(y)) for x, y in zip(words.split(), freqs.split())) tree = HuffmanTree(word_frequency) word_code = dict((word, tree.wordid_code[word_to_id[word]]) for word in words.split()) print(word_code) if __name__ == '__main__': main()
#!/usr/bin/env python3
# coding: utf-8

# Cluster texts using a trained word-vector model.
# Clustering uses the KMeans algorithm.
import math
import re
import json
import jieba
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from collections import defaultdict


# Input: path to the model file.
# Load the trained model.
def load_word2vec_model(path):
    model = Word2Vec.load(path)
    return model


def load_sentence(path):
    sentences = set()
    with open(path, encoding="utf8") as f:
        for line in f:
            sentence = line.strip()
            sentences.add(" ".join(jieba.cut(sentence)))
    print("Number of sentences:", len(sentences))
    return sentences


# Turn the texts into vectors.
def sentences_to_vectors(sentences, model):
    vectors = []
    for sentence in sentences:
        words = sentence.split()  # sentences are already tokenized, joined by spaces
        vector = np.zeros(model.vector_size)
        # sum all word vectors and take the mean as the sentence vector
        for word in words:
            try:
                vector += model.wv[word]
            except KeyError:
                # words unseen during training are replaced by an all-zero vector
                vector += np.zeros(model.vector_size)
        vectors.append(vector / len(words))
    return np.array(vectors)


def main():
    model = load_word2vec_model(r"F:\Desktop\work_space\badou\八斗课程\week5 词向量及文本向量\model.w2v")  # load the word-vector model
    sentences = load_sentence("titles.txt")           # load all titles
    vectors = sentences_to_vectors(sentences, model)  # vectorize all titles

    n_clusters = int(math.sqrt(len(sentences)))  # choose the number of clusters
    print("Number of clusters:", n_clusters)
    kmeans = KMeans(n_clusters)  # create a KMeans estimator
    kmeans.fit(vectors)          # run the clustering

    sentence_label_dict = defaultdict(list)
    for sentence, label in zip(sentences, kmeans.labels_):  # pair each sentence with its label
        sentence_label_dict[label].append(sentence)         # group sentences with the same label
    for label, sentences in sentence_label_dict.items():
        print("cluster %s :" % label)
        for i in range(min(10, len(sentences))):  # print only a few; the full list is too long to read
            print(sentences[i].replace(" ", ""))
        print("---------")


if __name__ == "__main__":
    main()
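# A minimal self-contained check (the corpus, vector_size, and helper name demo_clustering are
# made up for illustration; it assumes gensim 4.x and scikit-learn are installed). It trains a
# tiny Word2Vec model in memory so sentences_to_vectors() and KMeans can be exercised without
# the model.w2v file or titles.txt.
def demo_clustering():
    corpus = [["我", "喜欢", "苹果"], ["我", "喜欢", "香蕉"],
              ["今天", "天气", "很好"], ["明天", "天气", "不错"]]
    model = Word2Vec(sentences=corpus, vector_size=16, min_count=1, epochs=20)
    sentences = [" ".join(words) for words in corpus]  # already space-joined, like load_sentence() output
    vectors = sentences_to_vectors(sentences, model)
    kmeans = KMeans(n_clusters=2, n_init=10).fit(vectors)
    for sentence, label in zip(sentences, kmeans.labels_):
        print(label, sentence.replace(" ", ""))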