"""
@Project :Nlp
@Author :fengl
@Date :2020/11/6
"""
import time
import pandas as pd
import numpy as np
import jieba.posseg
import jieba.analyse
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
import math
from gensim import corpora, models
class GraphNode:
def __init__(self, idx):
self.idx = idx
self.parents = []
self.children = []
self.pruned = False
self.independent_tidset = None
def get_data(self, itemsets):
return itemsets[self.idx]
def get_frequency(self, frequencies):
return frequencies[self.idx]
    def get_independency(self, tidsets):
        """
        Return the fraction of this itemset's transactions that are not covered
        by any child itemset, together with that uncovered (complement) tidset.
        """
tidset = tidsets[self.idx]
if len(self.children) == 0:
return 1.0, tidset
children_tidsets = [tidsets[cid] for cid in self.children]
children_tidset = set()
for ids in children_tidsets:
children_tidset.update(ids)
complement_ids = tidset - children_tidset
independency = float(len(complement_ids)) / len(tidset)
return independency, complement_ids
def get_entropy(self, frequencies):
"""
假设children是相互独立的,但假设不成立
:param frequencies:
:return:
"""
if len(self.children) == 0:
return 10000.0
children_freq = []
for child_idx in self.children:
children_freq.append(float(frequencies[child_idx]))
children_freq_sum = sum(children_freq)
children_prob = [f / children_freq_sum for f in children_freq]
entropy = sum([-p * math.log(p) for p in children_prob])
return entropy
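# Illustrative sketch (not part of the original pipeline): how a GraphNode's
# independency is computed. The toy itemsets and tidsets below are made up;
# the demo function is defined but never called.
def _demo_graph_node_independency():
    itemsets = [{'a'}, {'b'}, {'a', 'b'}]
    tidsets = [{0, 1, 2, 3}, {1, 2, 4}, {1, 2}]
    node = GraphNode(0)      # node for {'a'}
    node.children = [2]      # {'a', 'b'} is a superset of {'a'}, hence a child
    independency, complement = node.get_independency(tidsets)
    # {'a'} occurs in tids {0, 1, 2, 3}; its child covers {1, 2}, so the
    # uncovered part is {0, 3} and independency = 2 / 4 = 0.5.
    print(node.get_data(itemsets), independency, complement)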
class FPTreeNode:
def __init__(self, item=None, support=0, tid: int = -1):
self.item = item
self.support = support
self.tids = set()
if tid >= 0:
self.tids.add(tid)
self.children = {}
class FPGrowth:
"""
Parameters:
-----------
min_sup: float
"""
def __init__(self, min_sup=0.3):
self.min_sup = min_sup
self.tree_root = None
self.prefixes = {}
self.frequent_itemsets = []
self.set_frequencies = []
self.item_tidsets = []
def _calculate_support(self, item, transactions):
count = 0
tids = set()
for transaction in transactions:
if item in transaction[1]:
count += 1
tids.add(transaction[0])
support = count
return support, tids
def _get_frequent_items(self, transactions):
unique_items = set(
item for transaction in transactions for item in transaction[1])
items = []
for item in unique_items:
sup, tids = self._calculate_support(item, transactions)
if sup >= self.min_sup:
items.append([item, sup, tids])
items.sort(key=lambda item: item[1], reverse=True)
frequent_items = [[el[0]] for el in items]
frequencies = [el[1] for el in items]
tidsets = [el[2] for el in items]
return frequent_items, frequencies, tidsets
def _insert_tree(self, node, children):
if not children:
return
child_item = children[0][1]
child = FPTreeNode(item=child_item, support=1, tid=children[0][0])
if child_item in node.children:
node.children[child.item].support += 1
node.children[child.item].tids.add(children[0][0])
else:
node.children[child.item] = child
self._insert_tree(node.children[child.item], children[1:])
def _construct_tree(self, transactions, frequent_items=None):
if frequent_items is None:
frequent_items, frequencies, tidsets = self._get_frequent_items(transactions)
unique_frequent_items = list(
set(item for itemset in frequent_items for item in itemset))
root = FPTreeNode()
for transaction in transactions:
transaction = [(transaction[0], item) for item in transaction[1] if item in unique_frequent_items]
transaction.sort(key=lambda item: frequent_items.index([item[1]]))
self._insert_tree(root, transaction)
return root
def print_tree(self, node=None, indent_times=0):
if not node:
node = self.tree_root
indent = " " * indent_times
print("%s%s:%s" % (indent, node.item, node.support))
for child_key in node.children:
child = node.children[child_key]
self.print_tree(child, indent_times + 1)
def _is_prefix(self, itemset, node):
""" Makes sure that the first item in itemset is a child of node
and that every following item in itemset is reachable via that path """
for item in itemset:
            if item not in node.children:
return False
node = node.children[item]
return True
def _determine_prefixes(self, itemset, node, prefixes=None):
""" Recursive method that adds prefixes to the itemset by traversing the
FP Growth Tree"""
if not prefixes:
prefixes = []
if self._is_prefix(itemset, node):
itemset_key = self._get_itemset_key(itemset)
            if itemset_key not in self.prefixes:
self.prefixes[itemset_key] = []
self.prefixes[itemset_key] += [{
"prefix": prefixes,
"support": node.children[itemset[0]].support,
"tids": node.children[itemset[0]].tids}]
for child_key in node.children:
child = node.children[child_key]
self._determine_prefixes(itemset, child, prefixes + [child.item])
def _get_itemset_key(self, itemset):
""" Determines the look of the hashmap key for self.prefixes
List of more strings than one gets joined by '-' """
if len(itemset) > 1:
itemset_key = "-".join(itemset)
else:
itemset_key = str(itemset[0])
return itemset_key
def _determine_frequent_itemsets(self, conditional_database, suffix):
frequent_items, frequencies, tidsets = self._get_frequent_items(conditional_database)
cond_tree = None
if suffix:
cond_tree = self._construct_tree(conditional_database, frequent_items)
self.frequent_itemsets += [el + suffix for el in frequent_items]
self.set_frequencies += [frequency for frequency in frequencies]
self.item_tidsets += [tidset for tidset in tidsets]
self.prefixes = {}
for itemset in frequent_items:
if not cond_tree:
cond_tree = self.tree_root
self._determine_prefixes(itemset, cond_tree)
conditional_database = []
itemset_key = self._get_itemset_key(itemset)
if itemset_key in self.prefixes:
for el in self.prefixes[itemset_key]:
for tid in el["tids"]:
conditional_database.append((tid, el["prefix"]))
new_suffix = itemset + suffix if suffix else itemset
self._determine_frequent_itemsets(conditional_database, suffix=new_suffix)
def find_frequent_itemsets(self, transactions, suffix=None, show_tree=False):
transactions = [(i, transaction) for (i, transaction) in enumerate(transactions)]
self.tree_root = self._construct_tree(transactions)
if show_tree:
print("FP-Growth Tree:")
self.print_tree(self.tree_root)
self._determine_frequent_itemsets(transactions, suffix=None)
return self.frequent_itemsets
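# Illustrative sketch (not part of the original pipeline): mining frequent
# itemsets from a few toy transactions. min_sup=2 is an absolute count here;
# the demo function is defined but never called.
def _demo_fp_growth():
    transactions = [
        {'milk', 'bread', 'butter'},
        {'milk', 'bread'},
        {'bread', 'butter'},
        {'milk', 'butter'},
    ]
    fp = FPGrowth(min_sup=2)
    itemsets = fp.find_frequent_itemsets(transactions, show_tree=True)
    # set_frequencies and item_tidsets are kept aligned with the itemsets.
    for itemset, freq, tids in zip(itemsets, fp.set_frequencies, fp.item_tidsets):
        print(itemset, freq, tids)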
def tfidf_compress(documents, max_words=7):
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(text) for text in documents]
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = [tfidf_model[doc] for doc in corpus]
new_documents = []
for item in corpus_tfidf:
        if len(item) > max_words:
            # keep only the max_words highest-weighted terms
            item.sort(key=lambda id_weight: id_weight[1], reverse=True)
            item = item[:max_words]
new_doc = set([dictionary.get(ii[0]) for ii in item])
new_documents.append(new_doc)
return new_documents
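# Illustrative sketch (not part of the original pipeline): tfidf_compress keeps
# at most max_words of the highest-weighted tokens per document. The token
# lists are made up; the demo function is defined but never called.
def _demo_tfidf_compress():
    documents = [
        ['疫苗', '结果', '显示', '不', '出来'],
        ['疫苗', '结果', '在哪', '显示'],
        ['疫苗', '接种', '状态', '显示'],
    ]
    for doc in tfidf_compress(documents, max_words=3):
        print(doc)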
def get_ancestors(nodes, idx, ancestors: set = None) -> set:
    """Collect the index of the node itself plus all of its transitive parents."""
node = nodes[idx]
if not ancestors:
ancestors = set()
ancestors.add(node.idx)
for parent in node.parents:
get_ancestors(nodes, parent, ancestors)
return ancestors
def build_graph(frequent_itemsets):
    """
    Build a DAG over the frequent itemsets, level by level on itemset size:
    a node's parents are its subsets among the smaller frequent itemsets,
    skipping edges that would only duplicate a transitive ancestor.
    """
frequent_itemsets = [set(itemset) for itemset in frequent_itemsets]
level_idx_dict = dict()
nodes = []
for (i, itemset) in enumerate(frequent_itemsets):
lv = len(itemset)
if lv not in level_idx_dict:
level_idx_dict[lv] = []
level_idx_dict[lv].append(i)
nodes.append(GraphNode(i))
max_lv = max(level_idx_dict.keys())
print('MAX LEVEL:', max_lv)
levels = list(level_idx_dict.keys())
levels.sort()
for i in range(1, len(levels)):
lv_i = levels[i]
set_indices_i = level_idx_dict[lv_i]
for set_idx_i in set_indices_i:
ancestors = set()
for j in range(i - 1, -1, -1):
lv_j = levels[j]
set_indices_j = level_idx_dict[lv_j]
for set_idx_j in set_indices_j:
if set_idx_j not in ancestors and \
frequent_itemsets[set_idx_j].issubset(frequent_itemsets[set_idx_i]):
nodes[set_idx_i].parents.append(set_idx_j)
nodes[set_idx_j].children.append(set_idx_i)
ancestors_j = get_ancestors(nodes, set_idx_j)
ancestors.update(ancestors_j)
return nodes, level_idx_dict
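# Illustrative sketch (not part of the original pipeline): the subset/superset
# DAG built over a few toy frequent itemsets. The demo function is defined but
# never called.
def _demo_build_graph():
    frequent_itemsets = [['a'], ['b'], ['a', 'b'], ['a', 'b', 'c']]
    nodes, level_idx_dict = build_graph(frequent_itemsets)
    # ['a', 'b'] ends up with parents ['a'] and ['b'] and child ['a', 'b', 'c'].
    for node in nodes:
        print(node.get_data(frequent_itemsets), 'parents:', node.parents, 'children:', node.children)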
def prune_nodes(nodes, level_idx_dict, item_tidsets, threshold, independent_count, do_update=False):
    """
    Mark a node as pruned when its independency is below `threshold` and its
    uncovered tidset is smaller than `independent_count`. With do_update, a
    pruned node's children are re-linked to its parents.
    """
levels = list(level_idx_dict.keys())
levels.sort()
for i in range(0, len(levels)):
lv_i = levels[i]
set_indices_i = level_idx_dict[lv_i]
for set_idx_i in set_indices_i:
node = nodes[set_idx_i]
independency, complement_ids = node.get_independency(item_tidsets)
node.independent_tidset = complement_ids
if not (independency >= threshold or len(complement_ids) >= independent_count):
node.pruned = True
if do_update:
for pid in node.parents:
nodes[pid].children.remove(node.idx)
nodes[pid].children.extend(node.children)
def filter_by_independency(frequent_itemsets, set_frequencies, item_tidsets, threshold=0.1, independent_count=1):
    """Keep only the itemsets whose graph node survives independency pruning."""
    t1 = time.time()
nodes, level_idx_dict = build_graph(frequent_itemsets)
t2 = time.time()
prune_nodes(nodes, level_idx_dict, item_tidsets, threshold, independent_count)
t3 = time.time()
filter_itemsets = []
filter_frequencies = []
filter_tidsets = []
for (i, item_set) in enumerate(frequent_itemsets):
if not nodes[i].pruned:
filter_itemsets.append(item_set)
filter_frequencies.append(set_frequencies[i])
filter_tidsets.append(nodes[i].independent_tidset)
t4 = time.time()
return filter_itemsets, filter_frequencies, filter_tidsets
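# Illustrative sketch (not part of the original pipeline): independency
# filtering on toy itemsets. {'a'} is fully covered by its child {'a', 'b'}
# (independency 0.0), so it is pruned; the demo function is defined but never
# called.
def _demo_filter_by_independency():
    frequent_itemsets = [['a'], ['b'], ['a', 'b']]
    set_frequencies = [3, 4, 3]
    item_tidsets = [{0, 1, 2}, {0, 1, 2, 3}, {0, 1, 2}]
    kept_sets, kept_freqs, kept_tids = filter_by_independency(
        frequent_itemsets, set_frequencies, item_tidsets,
        threshold=0.5, independent_count=1)
    print(kept_sets, kept_freqs, kept_tids)   # [['b'], ['a', 'b']] survive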
def dataPrepos(text, stopkey):
    """
    Segment `text` with jieba.posseg and keep only words whose part-of-speech
    tag is in `pos` and which are not stop words.
    :param text: raw text
    :param stopkey: collection of stop words
    :return: list of filtered words
    """
l = []
pos = ['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd']
seg = jieba.posseg.cut(text)
for i in seg:
if i.word not in stopkey and i.flag in pos:
l.append(i.word)
return l
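# Illustrative sketch (not part of the original pipeline): dataPrepos keeps only
# POS-filtered, non-stop-word tokens. The sample sentence and stop-word set are
# made up; the demo function is defined but never called.
def _demo_dataPrepos():
    text = '疫苗结果为什么无法显示'
    stopkey = {'为什么'}
    print(dataPrepos(text, stopkey))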
def getKeywords_tfidf(text, stopkey, topK):
    """
    TF-IDF keyword extraction:
    1. CountVectorizer builds the term-frequency matrix.
    2. TfidfTransformer computes the tf-idf weights.
    3. Words are ranked by weight and the top `topK` keywords are returned.
    """
    source_question = text
corpus = []
text = dataPrepos(text, stopkey)
if text:
text = " ".join(text)
else:
text = source_question
corpus.append(text)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
    word = vectorizer.get_feature_names_out()  # use get_feature_names() on older scikit-learn
weight = tfidf.toarray()
keys = []
for i in range(len(weight)):
df_word, df_weight = [], []
for j in range(len(word)):
df_word.append(word[j])
df_weight.append(weight[i][j])
df_word = pd.DataFrame(df_word, columns=['word'])
df_weight = pd.DataFrame(df_weight, columns=['weight'])
word_weight = pd.concat([df_word, df_weight], axis=1)
word_weight = word_weight.sort_values(by="weight", ascending=False)
keyword = np.array(word_weight['word'])
        topK = min(topK, len(keyword))
word_split = [keyword[x] for x in range(0, topK)]
word_split = " ".join(word_split)
keys.append(word_split)
result = keys
return result
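# Illustrative sketch (not part of the original pipeline): single-document
# TF-IDF keyword extraction. With only one document every term has the same
# idf, so the ranking mostly reflects term frequency. The demo function is
# defined but never called.
def _demo_getKeywords_tfidf():
    text = '疫苗结果在哪显示,疫苗结果无法显示'
    print(getKeywords_tfidf(text, stopkey={'在哪'}, topK=3))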
def key_words(text):
    # Tokenise and strip stop words / punctuation. Note that the filtered
    # string `final` is built but never passed to extract_tags below.
    fenci_text = jieba.cut(text)
    with open('./stopwords.txt', encoding='utf-8') as f:
        stopwords = {}.fromkeys(line.rstrip() for line in f)
    final = ""
    for word in fenci_text:
        if word not in stopwords:
            if word != "。" and word != ",":
                final = final + " " + word
key_words_list = []
words_list = jieba.analyse.extract_tags(text, topK=4, withWeight=True, allowPOS=())
for word_tuple in words_list:
if len(word_tuple) == 2:
key_words_list.append(word_tuple[0])
return key_words_list
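# Illustrative sketch (not part of the original pipeline): key_words depends on
# a local './stopwords.txt' file, so this demo only wraps a single call and is
# defined but never called.
def _demo_key_words():
    print(key_words('疫苗结果为什么无法显示'))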
def get_frequent_words(words_list_data, min_sup=1, max_words=5, do_filter=True):
documents = []
for line in words_list_data:
line = line.strip()
if line:
documents.append(key_words(line))
    # Compress each document to its highest-weighted terms before mining.
    documents = tfidf_compress(documents, max_words=7)
    if min_sup == -1:
        # Heuristic: scale the support threshold with the corpus size.
        min_sup = min(math.log10(1 + len(documents)), 7.0)
    fp_growth = FPGrowth(min_sup=min_sup)
    frequent_itemsets = fp_growth.find_frequent_itemsets(documents, show_tree=False)
set_frequencies = fp_growth.set_frequencies
item_tidsets = fp_growth.item_tidsets
if do_filter:
frequent_itemsets, set_frequencies, item_tidsets = \
filter_by_independency(frequent_itemsets, set_frequencies, item_tidsets, threshold=0.8, independent_count=2)
for i in range(len(frequent_itemsets) - 1, -1, -1):
if len(frequent_itemsets[i]) > max_words:
frequent_itemsets.pop(i)
set_frequencies.pop(i)
item_tidsets.pop(i)
return frequent_itemsets, item_tidsets
def get_knowledge_point(words_list_data):
min_sup = 5
item_sets, item_tidsets = get_frequent_words(words_list_data, min_sup=min_sup)
documents = []
for line in words_list_data:
line = line.strip()
if line:
documents.append(line)
word_str = ''
data_result_list = []
intent_words = []
    for itemset, tidset in zip(item_sets, item_tidsets):
        if itemset:
word_str = '/'.join(itemset)
for doc_id in tidset:
if documents[doc_id] not in intent_words:
intent_words.append(documents[doc_id])
data_result_list.append([word_str, documents[doc_id]])
return data_result_list
if __name__ == '__main__':
    print(get_knowledge_point(
        ['疫苗结果不显示', '疫苗结果为啥显示不出来', '疫苗结果在哪显示帮帮我', '你好哇啊,疫苗结果在哪显示帮帮我哇', '为什么我的疫苗在北京健康报无法显示', '疫苗在哪显示', '疫苗在哪显示了', '怎么显示疫苗',
         '我的疫苗在北京健康报无法显示', '我的疫苗在支付宝无法显示', '北京健康报在哪显示疫苗我的无法显示', '我的疫苗无法显示', '为啥呢,没法显示我的接种状态']))