# knowledge_manager.py (重构后)
import html
import json
import os
import uuid
from collections import defaultdict
from datetime import datetime

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class KnowledgeManager:
    """JSON-backed knowledge base with TF-IDF search and typed relationships.

    The whole store lives in ``self.knowledge_base`` (a dict with the keys
    ``knowledge``, ``relationships``, ``categories`` and ``metadata``) and is
    persisted to ``storage_file`` as JSON.
    """

    def __init__(self, storage_file="knowledge_base.json"):
        """Create the manager and load ``storage_file`` if it exists.

        Args:
            storage_file: Path of the JSON file backing the knowledge base.
        """
        self.storage_file = storage_file
        self.knowledge_base = self._initialize_knowledge_base()
        self.vectorizer = TfidfVectorizer(stop_words='english')
        # True only after a successful fit; transform() on an unfitted
        # vectorizer raises, so every similarity query checks this first.
        self._vectorizer_fitted = False
        self.load_knowledge()

    def _initialize_knowledge_base(self):
        """Return a fresh, empty knowledge-base structure."""
        return {
            "knowledge": {},
            "relationships": [],
            "categories": defaultdict(list),
            "metadata": {
                "last_updated": datetime.now().isoformat(),
                "knowledge_count": 0,
                "learning_metrics": {
                    "total_learning_time": 0.0,
                    "last_learned": None,
                    "mastery_levels": {},
                },
                "integration_history": [],
            },
        }

    def load_knowledge(self):
        """Load the knowledge base from ``storage_file`` if the file exists.

        On any read or format problem the in-memory store is reset to an
        empty knowledge base and the error is printed.
        """
        if not os.path.exists(self.storage_file):
            return
        try:
            with open(self.storage_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            count = data["metadata"]["knowledge_count"]
            data.setdefault("knowledge", {})
            data.setdefault("relationships", [])
            # json.load yields a plain dict; restore the defaultdict so that
            # appending to a not-yet-seen category keeps working.
            data["categories"] = defaultdict(list, data.get("categories") or {})
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"加载知识库失败: {e}")
            # Start over with a new, empty knowledge base.
            self.knowledge_base = self._initialize_knowledge_base()
            self._vectorizer_fitted = False
            return
        self.knowledge_base = data
        print(f"知识库已加载,包含 {count} 条知识")
        # Fit on the loaded content so search works right after loading.
        self._update_vectorizer()

    def save_knowledge(self):
        """Write the knowledge base to ``storage_file``.

        The data is written to a temporary sibling file first and then moved
        into place, so a failed dump cannot truncate the existing file.

        Returns:
            True on success, False if writing failed.
        """
        tmp_path = f"{self.storage_file}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(self.knowledge_base, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, self.storage_file)
        except (OSError, TypeError, ValueError) as e:
            print(f"保存知识库失败: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; the temp file may never have been created.
                pass
            return False
        print("知识库已保存")
        return True

    def add_knowledge(self, title, content, tags=None, category=None):
        """Add a new knowledge entry and link it to related entries.

        Args:
            title: Entry title.
            content: Entry body text.
            tags: Optional list of tags.
            category: Optional category name.

        Returns:
            The generated entry id (a UUID4 string).
        """
        knowledge_id = str(uuid.uuid4())
        now = datetime.now().isoformat()
        self.knowledge_base["knowledge"][knowledge_id] = {
            "id": knowledge_id,
            "title": title,
            "content": content,
            # Copy so later mutation of the caller's list does not leak in.
            "tags": list(tags) if tags else [],
            "category": category,
            "created_at": now,
            "updated_at": now,
        }
        metadata = self.knowledge_base["metadata"]
        metadata["knowledge_count"] += 1
        metadata["last_updated"] = now
        if category:
            self.knowledge_base["categories"].setdefault(category, []).append(knowledge_id)
        # The vectorizer must be fitted (including the new entry) before the
        # similarity-based relationships are computed.
        self._update_vectorizer()
        self._auto_create_relationships(knowledge_id)
        print(f"知识添加成功! ID: {knowledge_id}")
        return knowledge_id

    def _auto_create_relationships(self, new_id):
        """Create "similar" and "tag_match" relationships for entry ``new_id``."""
        new_entry = self.knowledge_base["knowledge"][new_id]
        # 1. Content similarity.
        for other_id, similarity in self.find_similar_knowledge(new_id, threshold=0.3):
            self.add_relationship(new_id, other_id, "similar", similarity)
        # 2. Tag overlap, scored as Jaccard similarity on the tag *sets* so
        #    duplicate tags in a list cannot distort the strength.
        new_tags = set(new_entry.get("tags") or [])
        if not new_tags:
            return
        for other_id, entry in self.knowledge_base["knowledge"].items():
            if other_id == new_id:
                continue
            other_tags = set(entry.get("tags") or [])
            common_tags = new_tags & other_tags
            if common_tags:
                strength = len(common_tags) / len(new_tags | other_tags)
                self.add_relationship(new_id, other_id, "tag_match", strength)

    def _corpus(self):
        """Return parallel lists ``(ids, contents)`` for all entries."""
        entries = self.knowledge_base["knowledge"]
        ids = list(entries)
        contents = [entries[kid].get("content") or "" for kid in ids]
        return ids, contents

    def _update_vectorizer(self):
        """Refit the TF-IDF vectorizer on the content of all entries."""
        _, all_content = self._corpus()
        if not all_content:
            self._vectorizer_fitted = False
            return
        try:
            self.vectorizer.fit(all_content)
        except ValueError:
            # NOTE(review): scikit-learn is expected to raise ValueError when
            # the corpus yields an empty vocabulary (e.g. only stop words);
            # similarity queries then return no results instead of crashing.
            self._vectorizer_fitted = False
        else:
            self._vectorizer_fitted = True

    def add_relationship(self, source_id, target_id, rel_type, strength=0.5):
        """Add a relationship, or strengthen an existing identical one.

        Args:
            source_id: Id of the source entry.
            target_id: Id of the target entry.
            rel_type: Relationship type (e.g. "similar", "dependency", "contrast").
            strength: Relationship strength (0.0-1.0).
        """
        now = datetime.now().isoformat()
        for rel in self.knowledge_base["relationships"]:
            if (rel["source"], rel["target"], rel["type"]) == (source_id, target_id, rel_type):
                # Same edge already recorded: keep the stronger value.
                rel["strength"] = max(rel["strength"], strength)
                rel["updated_at"] = now
                return
        self.knowledge_base["relationships"].append({
            "source": source_id,
            "target": target_id,
            "type": rel_type,
            "strength": strength,
            "created_at": now,
            "updated_at": now,
        })
        print(f"关系添加: {source_id[:8]} → {target_id[:8]} ({rel_type}, {strength:.2f})")

    def get_knowledge(self, knowledge_id):
        """Return the entry with ``knowledge_id``, or None if it does not exist."""
        return self.knowledge_base["knowledge"].get(knowledge_id)

    def update_knowledge(self, knowledge_id, title=None, content=None, tags=None, category=None):
        """Update the given fields of an entry; None leaves a field unchanged.

        Returns:
            True if the entry was updated, False if it does not exist.
        """
        entry = self.knowledge_base["knowledge"].get(knowledge_id)
        if entry is None:
            print("知识条目不存在")
            return False
        if title is not None:
            entry["title"] = title
        if content is not None:
            entry["content"] = content
        if tags is not None:
            entry["tags"] = tags
        if category is not None:
            categories = self.knowledge_base["categories"]
            # Detach from the old category before attaching to the new one.
            old_category = entry.get("category")
            if old_category and knowledge_id in categories.get(old_category, []):
                categories[old_category].remove(knowledge_id)
            entry["category"] = category
            categories.setdefault(category, []).append(knowledge_id)
        now = datetime.now().isoformat()
        entry["updated_at"] = now
        self.knowledge_base["metadata"]["last_updated"] = now
        # Refit first so relationships are computed against the new content.
        self._update_vectorizer()
        self._auto_create_relationships(knowledge_id)
        print("知识更新成功")
        return True

    def delete_knowledge(self, knowledge_id):
        """Delete an entry together with its category links and relationships.

        Returns:
            True if the entry was deleted, False if it does not exist.
        """
        if knowledge_id not in self.knowledge_base["knowledge"]:
            print("知识条目不存在")
            return False
        del self.knowledge_base["knowledge"][knowledge_id]
        metadata = self.knowledge_base["metadata"]
        metadata["knowledge_count"] -= 1
        metadata["last_updated"] = datetime.now().isoformat()
        for items in self.knowledge_base["categories"].values():
            if knowledge_id in items:
                items.remove(knowledge_id)
        self.knowledge_base["relationships"] = [
            rel for rel in self.knowledge_base["relationships"]
            if knowledge_id not in (rel["source"], rel["target"])
        ]
        self._update_vectorizer()
        print("知识删除成功")
        return True

    def search_knowledge(self, query, top_n=5):
        """Return the ``top_n`` entries most similar to ``query``.

        Args:
            query: Free-text search query.
            top_n: Maximum number of results; values <= 0 yield no results.

        Returns:
            A list of dicts with ``id``, ``title``, ``content`` (truncated to
            150 characters plus "...") and ``similarity``, best match first.
        """
        # top_n == 0 must be handled explicitly: slicing with [-0:] would
        # return every entry instead of none.
        if not self.knowledge_base["knowledge"] or top_n <= 0:
            return []
        if not self._vectorizer_fitted:
            return []
        all_ids, all_content = self._corpus()
        knowledge_vectors = self.vectorizer.transform(all_content)
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, knowledge_vectors).flatten()
        top_indices = similarities.argsort()[-top_n:][::-1]
        results = []
        for idx in top_indices:
            knowledge_id = all_ids[idx]
            content = all_content[idx]
            preview = content[:150] + "..." if len(content) > 150 else content
            results.append({
                "id": knowledge_id,
                "title": self.knowledge_base["knowledge"][knowledge_id]["title"],
                "content": preview,
                # Plain float so the result is safe to serialize.
                "similarity": float(similarities[idx]),
            })
        return results

    def find_similar_knowledge(self, knowledge_id, threshold=0.2):
        """Return entries whose content similarity to ``knowledge_id`` exceeds ``threshold``.

        Returns:
            A list of ``(other_id, similarity)`` tuples, most similar first;
            empty if the id is unknown or the vectorizer is not fitted.
        """
        if knowledge_id not in self.knowledge_base["knowledge"]:
            return []
        if not self._vectorizer_fitted:
            return []
        all_ids, all_content = self._corpus()
        tfidf_matrix = self.vectorizer.transform(all_content)
        ref_idx = all_ids.index(knowledge_id)
        similarities = cosine_similarity(
            tfidf_matrix[ref_idx:ref_idx + 1], tfidf_matrix
        ).flatten()
        results = [
            (all_ids[i], float(sim))
            for i, sim in enumerate(similarities)
            if i != ref_idx and sim > threshold
        ]
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def get_knowledge_graph_data(self, min_strength=0.1):
        """Return ``{"nodes": [...], "links": [...]}`` for the visualization module.

        Only relationships with strength >= ``min_strength`` become links.
        """
        nodes = [
            {
                "id": kid,
                "title": entry["title"],
                # The "category" key exists with value None for uncategorized
                # entries, so a .get() default alone would never apply.
                "category": entry.get("category") or "未分类",
            }
            for kid, entry in self.knowledge_base["knowledge"].items()
        ]
        edges = [
            {
                "source": rel["source"],
                "target": rel["target"],
                "type": rel["type"],
                "strength": rel["strength"],
            }
            for rel in self.knowledge_base["relationships"]
            if rel["strength"] >= min_strength
        ]
        return {"nodes": nodes, "links": edges}

    def get_timeline_data(self):
        """Return per-month creation counts as ``[{"month": "YYYY-MM", "count": n}, ...]``."""
        timeline = defaultdict(int)
        for entry in self.knowledge_base["knowledge"].values():
            created = datetime.fromisoformat(entry["created_at"])
            timeline[created.strftime("%Y-%m")] += 1
        return [{"month": month, "count": timeline[month]} for month in sorted(timeline)]

    def get_integration_history(self):
        """Return the integration history list stored in the metadata."""
        return self.knowledge_base["metadata"]["integration_history"]

    @staticmethod
    def _escape(value):
        """Return ``value`` as text that is safe to embed in HTML."""
        return html.escape(str(value))

    def _related_entries(self, knowledge_id):
        """Yield ``(arrow, other_id, other_title, rel)`` for each relationship of an entry."""
        for rel in self.knowledge_base["relationships"]:
            if rel["source"] == knowledge_id:
                arrow, other_id = "→", rel["target"]
            elif rel["target"] == knowledge_id:
                arrow, other_id = "←", rel["source"]
            else:
                continue
            other = self.get_knowledge(other_id)
            # add_relationship does not validate ids, so the other endpoint
            # may be missing; fall back to the raw id instead of crashing.
            other_title = other["title"] if other else other_id
            yield arrow, other_id, other_title, rel

    def _export_json(self, output_file):
        """Write the raw knowledge base to ``output_file`` as JSON."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(self.knowledge_base, f, ensure_ascii=False, indent=2)

    def _export_markdown(self, output_file):
        """Write all entries and their relationships to ``output_file`` as Markdown."""
        # Two trailing spaces force a Markdown hard line break; with a single
        # space the metadata lines would be rendered as one paragraph.
        br = "  \n"
        lines = [
            "# 知识库导出\n\n",
            f"> 导出时间: {datetime.now().isoformat()}\n",
            f"> 知识条目数: {self.knowledge_base['metadata']['knowledge_count']}\n\n",
        ]
        for kid, entry in self.knowledge_base["knowledge"].items():
            lines.append(f"## {entry['title']}\n")
            lines.append(f"**ID**: `{kid}`{br}")
            lines.append(f"**创建时间**: {entry['created_at']}{br}")
            lines.append(f"**最后更新**: {entry['updated_at']}{br}")
            if entry.get('tags'):
                lines.append(f"**标签**: {', '.join(entry['tags'])}{br}")
            if entry.get('category'):
                lines.append(f"**分类**: {entry['category']}{br}")
            lines.append("\n### 内容\n")
            lines.append(entry["content"] + "\n\n")
            related = list(self._related_entries(kid))
            if related:
                lines.append("### 相关关系\n")
                lines.extend(
                    f"- {arrow} [{title}]({other_id}) ({rel['type']}, {rel['strength']:.2f})\n"
                    for arrow, other_id, title, rel in related
                )
                lines.append("\n")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    def _export_html(self, output_file):
        """Write all entries and their relationships to ``output_file`` as HTML."""
        esc = self._escape
        parts = [
            "<!DOCTYPE html>\n<html>\n<head>\n",
            # The file is written as UTF-8; declare it so browsers decode it correctly.
            "<meta charset='utf-8'>\n",
            "<title>知识库导出</title>\n",
            "<style>body {font-family: Arial, sans-serif; margin: 20px;}</style>\n",
            "</head>\n<body>\n",
            "<h1>知识库导出</h1>\n",
            f"<p>导出时间: {datetime.now().isoformat()}</p>\n",
            f"<p>知识条目数: {self.knowledge_base['metadata']['knowledge_count']}</p>\n",
        ]
        for kid, entry in self.knowledge_base["knowledge"].items():
            # The id attribute is the target of the '#<id>' relationship links.
            parts.append(
                f"<div id='{esc(kid)}' style='margin-bottom: 30px; border: 1px solid #ddd; "
                f"padding: 15px; border-radius: 5px;'>\n"
            )
            parts.append(f"<h2>{esc(entry['title'])}</h2>\n")
            parts.append(f"<p><strong>ID</strong>: <code>{esc(kid)}</code></p>\n")
            parts.append(f"<p><strong>创建时间</strong>: {esc(entry['created_at'])}</p>\n")
            parts.append(f"<p><strong>最后更新</strong>: {esc(entry['updated_at'])}</p>\n")
            if entry.get('tags'):
                tag_text = ', '.join(esc(tag) for tag in entry['tags'])
                parts.append(f"<p><strong>标签</strong>: {tag_text}</p>\n")
            if entry.get('category'):
                parts.append(f"<p><strong>分类</strong>: {esc(entry['category'])}</p>\n")
            parts.append("<h3>内容</h3>\n")
            parts.append(f"<div style='white-space: pre-wrap;'>{esc(entry['content'])}</div>\n")
            related = list(self._related_entries(kid))
            if related:
                parts.append("<h3>相关关系</h3>\n<ul>\n")
                parts.extend(
                    f"<li>{arrow} <a href='#{esc(other_id)}'>{esc(title)}</a> "
                    f"({esc(rel['type'])}, {rel['strength']:.2f})</li>\n"
                    for arrow, other_id, title, rel in related
                )
                parts.append("</ul>\n")
            parts.append("</div>\n")
        parts.append("</body>\n</html>")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(parts)

    def export_knowledge(self, format="json", output_file=None):
        """Export the knowledge base to a file.

        Args:
            format: Export format: "json", "md" or "html".
            output_file: Output path; a timestamped name is generated when omitted.

        Returns:
            True on success, False for an unsupported format.
        """
        if not output_file:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"knowledge_export_{timestamp}.{format}"
        exporters = {
            "json": (self._export_json, "JSON"),
            "md": (self._export_markdown, "Markdown"),
            "html": (self._export_html, "HTML"),
        }
        if format not in exporters:
            print(f"不支持的导出格式: {format}")
            return False
        write_export, label = exporters[format]
        write_export(output_file)
        print(f"知识已导出到 {label} 文件: {output_file}")
        return True

    def get_report_data(self):
        """Return summary statistics of the knowledge base for the visualization module."""
        report = {
            "knowledge_count": self.knowledge_base["metadata"]["knowledge_count"],
            "last_updated": self.knowledge_base["metadata"]["last_updated"],
            "categories": {
                category: len(items)
                for category, items in self.knowledge_base["categories"].items()
            },
            "tag_cloud": defaultdict(int),
            "relationship_stats": defaultdict(int),
            # Same per-month aggregation as get_timeline_data().
            "timeline_data": self.get_timeline_data(),
        }
        for entry in self.knowledge_base["knowledge"].values():
            for tag in entry.get("tags", []):
                report["tag_cloud"][tag] += 1
        for rel in self.knowledge_base["relationships"]:
            report["relationship_stats"][rel["type"]] += 1
        return report
# Usage example
def main():
    """Run a small end-to-end demo of KnowledgeManager."""
    # Create the knowledge manager (loads knowledge_base.json if present).
    km = KnowledgeManager()
    # Add entries, keeping the returned ids: the store may already contain
    # older entries, so positions in keys() do not identify the new ones.
    py_id = km.add_knowledge(
        title="Python列表推导式",
        content="列表推导式提供了一种简洁的方法来创建列表。语法: [expression for item in iterable if condition]",
        tags=["Python", "编程技巧"],
        category="编程",
    )
    ai_id = km.add_knowledge(
        title="神经网络基础",
        content="神经网络由输入层、隐藏层和输出层组成。使用反向传播算法训练权重。",
        tags=["机器学习", "神经网络"],
        category="AI",
    )
    km.add_knowledge(
        title="Git基本命令",
        content="常用Git命令: git init, git add, git commit, git push, git pull",
        tags=["Git", "版本控制"],
        category="开发工具",
    )
    # Link the two entries that were just created.
    km.add_relationship(py_id, ai_id, "related", 0.6)
    # Search the knowledge base.
    print("\n搜索 'Python':")
    for res in km.search_knowledge("Python"):
        print(f"{res['title']} (相似度: {res['similarity']:.2f})")
    # Graph data.
    graph_data = km.get_knowledge_graph_data()
    print(f"\n知识图谱数据: {len(graph_data['nodes'])} 节点, {len(graph_data['links'])} 边")
    # Timeline data.
    timeline_data = km.get_timeline_data()
    print(f"\n时间线数据: {len(timeline_data)} 个月份记录")
    # Report data.
    report_data = km.get_report_data()
    print(f"\n报告数据: {report_data['knowledge_count']} 条知识")
    # Export, then persist the knowledge base.
    km.export_knowledge(format="md")
    km.save_knowledge()


if __name__ == "__main__":
    main()
这个需要改吗 怎么改
最新发布