Knowledge Tag

本文介绍了一种名为KnowledgeTag的工具,该工具旨在帮助用户积累并管理各类知识资源,通过标记文档和数据片段,用户可以轻松地收集和组织所需信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

很多用户在进行某类研究或撰写某类文章时,往往需要从不同的资源中寻找自己所需要的东西,而其中有些内容是以前看过却没有记录下来的。knowledge注重的就是积累,积累得多了并融会贯通,才有可能达到新的知识高度。而目前并没有这样的工具来帮助用户实现这个功能。Knowledge Tag的目的就是让知识积累变为可能:让用户所见到的任何数据、事实等都能随时记录下来,使knowledge的积累变得更加切实可行。
Knowledge Tag的功能包括:
1) Tag某个文档或数据片段为knowledge,然后将这个Knowledge存储到特定的Location中,前提是这个Location对于用户来说是可访问的
2) Tag之后添加元数据到Knowledge中,元数据包括Dublin Core提供的元数据等
3) 实现智能Tag,程序根据某种规则自动搜索资源中的知识点,供用户选择

# knowledge_manager.py (refactored)
import html
import json
import os
import uuid
from datetime import datetime
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


class KnowledgeManager:
    """JSON-file-backed store of knowledge entries and their relationships.

    Entries live in ``self.knowledge_base["knowledge"]`` keyed by a UUID
    string. Relationships are a flat list of dicts with ``source``,
    ``target``, ``type`` and ``strength`` keys. A TF-IDF vectorizer fitted
    on every entry's ``content`` powers search and similarity lookups.
    """

    def __init__(self, storage_file="knowledge_base.json"):
        """Create the manager and load ``storage_file`` if it exists.

        Args:
            storage_file: Path of the JSON file backing the knowledge base.
        """
        self.storage_file = storage_file
        self.knowledge_base = self._initialize_knowledge_base()
        self.vectorizer = TfidfVectorizer(stop_words='english')
        # Tracks whether ``self.vectorizer`` has a usable vocabulary, so the
        # search helpers never call ``transform`` on an unfitted vectorizer.
        self._vectorizer_fitted = False
        self.load_knowledge()

    def _initialize_knowledge_base(self):
        """Return a fresh, empty knowledge-base structure."""
        return {
            "knowledge": {},
            "relationships": [],
            "categories": defaultdict(list),
            "metadata": {
                "last_updated": datetime.now().isoformat(),
                "knowledge_count": 0,
                "learning_metrics": {
                    "total_learning_time": 0.0,
                    "last_learned": None,
                    "mastery_levels": {}
                },
                "integration_history": []
            }
        }

    def _normalize_knowledge_base(self, loaded):
        """Merge a decoded JSON object into the default structure.

        ``json.load`` returns plain dicts, so ``categories`` is re-wrapped in
        a ``defaultdict(list)`` and any missing top-level or metadata keys
        are filled in from the defaults.

        Raises:
            ValueError: If ``loaded`` is not a JSON object (dict).
        """
        if not isinstance(loaded, dict):
            raise ValueError("knowledge base file must contain a JSON object")

        base = self._initialize_knowledge_base()
        metadata = base["metadata"]
        loaded_meta = loaded.get("metadata")

        base.update(loaded)
        base["knowledge"] = loaded.get("knowledge") or {}
        base["relationships"] = loaded.get("relationships") or []
        base["categories"] = defaultdict(list, loaded.get("categories") or {})

        if isinstance(loaded_meta, dict):
            metadata.update(loaded_meta)
        if not isinstance(loaded_meta, dict) or "knowledge_count" not in loaded_meta:
            metadata["knowledge_count"] = len(base["knowledge"])
        base["metadata"] = metadata
        return base

    def load_knowledge(self):
        """Load the knowledge base from ``self.storage_file`` if it exists.

        On a read or decode failure the error is printed and an empty
        knowledge base is used instead. The vectorizer is refitted either
        way so that search works right after loading.
        """
        if not os.path.exists(self.storage_file):
            return

        try:
            with open(self.storage_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
            self.knowledge_base = self._normalize_knowledge_base(loaded)
        except (OSError, ValueError, TypeError) as e:
            # json.JSONDecodeError is a ValueError subclass.
            print(f"加载知识库失败: {e}")
            # Fall back to a new, empty knowledge base.
            self.knowledge_base = self._initialize_knowledge_base()
        else:
            print(f"知识库已加载,包含 {self.knowledge_base['metadata']['knowledge_count']} 条知识")

        self._update_vectorizer()

    def save_knowledge(self):
        """Write the knowledge base to ``self.storage_file``.

        Returns:
            True on success, False if serialization or writing failed.
        """
        try:
            # Serialize first so a serialization error cannot leave a
            # truncated file behind.
            payload = json.dumps(self.knowledge_base, ensure_ascii=False, indent=2)
            with open(self.storage_file, 'w', encoding='utf-8') as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            print(f"保存知识库失败: {e}")
            return False

        print("知识库已保存")
        return True

    def add_knowledge(self, title, content, tags=None, category=None):
        """Add a new knowledge entry and return its generated ID.

        Args:
            title: Entry title.
            content: Entry body text.
            tags: Optional list of tags.
            category: Optional category name.

        Returns:
            The UUID string identifying the new entry.
        """
        knowledge_id = str(uuid.uuid4())
        now = datetime.now().isoformat()

        self.knowledge_base["knowledge"][knowledge_id] = {
            "id": knowledge_id,
            "title": title,
            "content": content,
            "tags": list(tags) if tags else [],
            "category": category,
            "created_at": now,
            "updated_at": now
        }

        metadata = self.knowledge_base["metadata"]
        metadata["knowledge_count"] += 1
        metadata["last_updated"] = now

        if category:
            # setdefault works whether categories is a defaultdict or a dict.
            self.knowledge_base["categories"].setdefault(category, []).append(knowledge_id)

        # Refit BEFORE computing similarities: the vectorizer must be fitted
        # (and must know the new entry's vocabulary) before transform().
        self._update_vectorizer()
        self._auto_create_relationships(knowledge_id)

        print(f"知识添加成功! ID: {knowledge_id}")
        return knowledge_id

    def _auto_create_relationships(self, new_id):
        """Create "similar" and "tag_match" relationships for entry ``new_id``."""
        new_entry = self.knowledge_base["knowledge"][new_id]

        # 1. Content similarity.
        for knowledge_id, similarity in self.find_similar_knowledge(new_id, threshold=0.3):
            self.add_relationship(new_id, knowledge_id, "similar", similarity)

        # 2. Tag overlap, scored as Jaccard similarity on tag *sets* so that
        #    duplicate tags cannot distort the denominator.
        new_tags = set(new_entry.get("tags") or [])
        if not new_tags:
            return
        for knowledge_id, entry in self.knowledge_base["knowledge"].items():
            if knowledge_id == new_id:
                continue
            other_tags = set(entry.get("tags") or [])
            common_tags = new_tags & other_tags
            if common_tags:
                strength = len(common_tags) / len(new_tags | other_tags)
                self.add_relationship(new_id, knowledge_id, "tag_match", strength)

    def _update_vectorizer(self):
        """Refit the TF-IDF vectorizer on the content of every entry.

        Records in ``self._vectorizer_fitted`` whether a usable vocabulary
        exists; fitting fails with ValueError when no document yields a
        token (for example empty or stop-word-only content).
        """
        all_content = [
            entry.get("content", "")
            for entry in self.knowledge_base["knowledge"].values()
        ]
        if not all_content:
            self._vectorizer_fitted = False
            return

        try:
            self.vectorizer.fit(all_content)
        except ValueError:
            self._vectorizer_fitted = False
        else:
            self._vectorizer_fitted = True

    def add_relationship(self, source_id, target_id, rel_type, strength=0.5):
        """Add a relationship, or strengthen an identical existing one.

        Args:
            source_id: ID of the source entry.
            target_id: ID of the target entry.
            rel_type: Relationship type (e.g. "similar", "dependency",
                "contrast").
            strength: Relationship strength (0.0-1.0). If the same
                (source, target, type) relationship already exists, it keeps
                the larger of the old and new strengths.
        """
        # Plain float keeps numpy scalars out of the JSON-serialized data.
        strength = float(strength)
        now = datetime.now().isoformat()

        for rel in self.knowledge_base["relationships"]:
            if (rel["source"] == source_id
                    and rel["target"] == target_id
                    and rel["type"] == rel_type):
                rel["strength"] = max(rel["strength"], strength)
                rel["updated_at"] = now
                return

        self.knowledge_base["relationships"].append({
            "source": source_id,
            "target": target_id,
            "type": rel_type,
            "strength": strength,
            "created_at": now,
            "updated_at": now
        })
        print(f"关系添加: {source_id[:8]} → {target_id[:8]} ({rel_type}, {strength:.2f})")

    def get_knowledge(self, knowledge_id):
        """Return the entry with ``knowledge_id``, or None if it is absent."""
        return self.knowledge_base["knowledge"].get(knowledge_id)

    def update_knowledge(self, knowledge_id, title=None, content=None, tags=None, category=None):
        """Update the given fields of an entry; ``None`` leaves a field as is.

        Returns:
            True on success, False if ``knowledge_id`` does not exist.
        """
        if knowledge_id not in self.knowledge_base["knowledge"]:
            print("知识条目不存在")
            return False

        entry = self.knowledge_base["knowledge"][knowledge_id]
        categories = self.knowledge_base["categories"]

        if title is not None:
            entry["title"] = title
        if content is not None:
            entry["content"] = content
        if tags is not None:
            entry["tags"] = list(tags)
        if category is not None:
            # Move the entry from its old category list to the new one.
            old_category = entry.get("category")
            if old_category and knowledge_id in categories.get(old_category, []):
                categories[old_category].remove(knowledge_id)
            entry["category"] = category
            categories.setdefault(category, []).append(knowledge_id)

        now = datetime.now().isoformat()
        entry["updated_at"] = now
        self.knowledge_base["metadata"]["last_updated"] = now

        # Refit first so similarities are computed against the new content.
        # NOTE(review): relationships derived from the previous content/tags
        # are not removed here; add_relationship only ever raises strengths.
        self._update_vectorizer()
        self._auto_create_relationships(knowledge_id)

        print("知识更新成功")
        return True

    def delete_knowledge(self, knowledge_id):
        """Delete an entry, its category memberships and its relationships.

        Returns:
            True on success, False if ``knowledge_id`` does not exist.
        """
        if knowledge_id not in self.knowledge_base["knowledge"]:
            print("知识条目不存在")
            return False

        del self.knowledge_base["knowledge"][knowledge_id]

        metadata = self.knowledge_base["metadata"]
        metadata["knowledge_count"] -= 1
        metadata["last_updated"] = datetime.now().isoformat()

        for items in self.knowledge_base["categories"].values():
            if knowledge_id in items:
                items.remove(knowledge_id)

        self.knowledge_base["relationships"] = [
            rel for rel in self.knowledge_base["relationships"]
            if rel["source"] != knowledge_id and rel["target"] != knowledge_id
        ]

        self._update_vectorizer()

        print("知识删除成功")
        return True

    def search_knowledge(self, query, top_n=5):
        """Return the entries most similar to ``query`` by TF-IDF cosine.

        Args:
            query: Search text.
            top_n: Maximum number of results; values <= 0 yield no results.

        Returns:
            A list of dicts with ``id``, ``title``, ``content`` (truncated
            to 150 characters plus "...") and ``similarity``, best first.
        """
        if not self.knowledge_base["knowledge"] or top_n <= 0:
            return []
        if not self._vectorizer_fitted:
            return []

        all_ids = list(self.knowledge_base["knowledge"].keys())
        all_content = [
            self.knowledge_base["knowledge"][kid].get("content", "")
            for kid in all_ids
        ]

        knowledge_vectors = self.vectorizer.transform(all_content)
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, knowledge_vectors).flatten()

        # argsort is ascending: take the last top_n and reverse them.
        top_indices = similarities.argsort()[-top_n:][::-1]

        results = []
        for idx in top_indices:
            knowledge_id = all_ids[idx]
            entry = self.knowledge_base["knowledge"][knowledge_id]
            content = entry.get("content", "")
            results.append({
                "id": knowledge_id,
                "title": entry["title"],
                "content": content[:150] + "..." if len(content) > 150 else content,
                "similarity": float(similarities[idx])
            })
        return results

    def find_similar_knowledge(self, knowledge_id, threshold=0.2):
        """Return ``(id, similarity)`` pairs for entries similar to one entry.

        Args:
            knowledge_id: ID of the reference entry.
            threshold: Only similarities strictly above this are returned.

        Returns:
            A list of ``(id, similarity)`` tuples, most similar first. Empty
            if the ID is unknown or the vectorizer has no vocabulary.
        """
        if knowledge_id not in self.knowledge_base["knowledge"]:
            return []
        if not self._vectorizer_fitted:
            return []

        all_ids = list(self.knowledge_base["knowledge"].keys())
        all_content = [
            self.knowledge_base["knowledge"][kid].get("content", "")
            for kid in all_ids
        ]

        tfidf_matrix = self.vectorizer.transform(all_content)
        ref_idx = all_ids.index(knowledge_id)
        similarities = cosine_similarity(
            tfidf_matrix[ref_idx:ref_idx + 1], tfidf_matrix
        ).flatten()

        results = [
            (all_ids[i], float(sim))
            for i, sim in enumerate(similarities)
            if i != ref_idx and sim > threshold
        ]
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def get_knowledge_graph_data(self, min_strength=0.1):
        """Return ``{"nodes": [...], "links": [...]}`` for graph rendering.

        Only relationships with strength >= ``min_strength`` become links.
        """
        nodes = [
            {
                "id": kid,
                "title": entry["title"],
                # The key exists with value None for uncategorized entries,
                # so a .get() default alone would never apply.
                "category": entry.get("category") or "未分类"
            }
            for kid, entry in self.knowledge_base["knowledge"].items()
        ]
        edges = [
            {
                "source": rel["source"],
                "target": rel["target"],
                "type": rel["type"],
                "strength": rel["strength"]
            }
            for rel in self.knowledge_base["relationships"]
            if rel["strength"] >= min_strength
        ]
        return {"nodes": nodes, "links": edges}

    def get_timeline_data(self):
        """Return per-month entry counts as ``[{"month", "count"}, ...]``.

        Months are "YYYY-MM" strings taken from ``created_at``, sorted
        ascending.
        """
        timeline = defaultdict(int)
        for entry in self.knowledge_base["knowledge"].values():
            created = datetime.fromisoformat(entry["created_at"])
            timeline[created.strftime("%Y-%m")] += 1
        return [{"month": k, "count": v} for k, v in sorted(timeline.items())]

    def get_integration_history(self):
        """Return the ``integration_history`` list stored in the metadata."""
        return self.knowledge_base["metadata"]["integration_history"]

    def _iter_relations(self, knowledge_id):
        """Yield ``(arrow, other_id, other_title, rel)`` for one entry.

        ``arrow`` is "→" when the entry is the relationship's source and
        "←" when it is the target. If the other endpoint no longer exists,
        its ID is used in place of the title.
        """
        for rel in self.knowledge_base["relationships"]:
            if rel["source"] == knowledge_id:
                arrow, other_id = "→", rel["target"]
            elif rel["target"] == knowledge_id:
                arrow, other_id = "←", rel["source"]
            else:
                continue
            other = self.get_knowledge(other_id)
            other_title = other["title"] if other else other_id
            yield arrow, other_id, other_title, rel

    def _write_json_export(self, f):
        """Write the whole knowledge base to ``f`` as JSON."""
        json.dump(self.knowledge_base, f, ensure_ascii=False, indent=2)

    def _write_markdown_export(self, f):
        """Write the knowledge base to ``f`` as a Markdown document."""
        f.write("# 知识库导出\n\n")
        f.write(f"> 导出时间: {datetime.now().isoformat()}\n")
        f.write(f"> 知识条目数: {self.knowledge_base['metadata']['knowledge_count']}\n\n")

        for kid, entry in self.knowledge_base["knowledge"].items():
            # Two trailing spaces make a Markdown hard line break.
            f.write(f"## {entry['title']}\n")
            f.write(f"**ID**: `{kid}`  \n")
            f.write(f"**创建时间**: {entry['created_at']}  \n")
            f.write(f"**最后更新**: {entry['updated_at']}  \n")
            if entry.get('tags'):
                f.write(f"**标签**: {', '.join(entry['tags'])}  \n")
            if entry.get('category'):
                f.write(f"**分类**: {entry['category']}  \n")
            f.write("\n### 内容\n")
            f.write(entry["content"] + "\n\n")

            relations = list(self._iter_relations(kid))
            if relations:
                f.write("### 相关关系\n")
                for arrow, other_id, other_title, rel in relations:
                    f.write(
                        f"- {arrow} [{other_title}]({other_id}) "
                        f"({rel['type']}, {rel['strength']:.2f})\n")
                f.write("\n")

    def _write_html_export(self, f):
        """Write the knowledge base to ``f`` as a simple HTML page.

        All entry-supplied text is HTML-escaped, and each entry's container
        carries its ID so the relationship links have a target.
        """
        esc = html.escape

        f.write("<!DOCTYPE html>\n<html>\n<head>\n")
        f.write("<title>知识库导出</title>\n")
        f.write("<style>body {font-family: Arial, sans-serif; margin: 20px;}</style>\n")
        f.write("</head>\n<body>\n")
        f.write("<h1>知识库导出</h1>\n")
        f.write(f"<p>导出时间: {datetime.now().isoformat()}</p>\n")
        f.write(f"<p>知识条目数: {self.knowledge_base['metadata']['knowledge_count']}</p>\n")

        for kid, entry in self.knowledge_base["knowledge"].items():
            f.write(
                f"<div id='{esc(str(kid))}' style='margin-bottom: 30px; border: 1px solid #ddd; "
                f"padding: 15px; border-radius: 5px;'>\n")
            f.write(f"<h2>{esc(str(entry['title']))}</h2>\n")
            f.write(f"<p><strong>ID</strong>: <code>{esc(str(kid))}</code></p>\n")
            f.write(f"<p><strong>创建时间</strong>: {esc(str(entry['created_at']))}</p>\n")
            f.write(f"<p><strong>最后更新</strong>: {esc(str(entry['updated_at']))}</p>\n")
            if entry.get('tags'):
                tag_text = ', '.join(str(tag) for tag in entry['tags'])
                f.write(f"<p><strong>标签</strong>: {esc(tag_text)}</p>\n")
            if entry.get('category'):
                f.write(f"<p><strong>分类</strong>: {esc(str(entry['category']))}</p>\n")
            f.write("<h3>内容</h3>\n")
            f.write(f"<div style='white-space: pre-wrap;'>{esc(str(entry['content']))}</div>\n")

            relations = list(self._iter_relations(kid))
            if relations:
                f.write("<h3>相关关系</h3>\n<ul>\n")
                for arrow, other_id, other_title, rel in relations:
                    f.write(
                        f"<li>{arrow} <a href='#{esc(str(other_id))}'>{esc(str(other_title))}</a> "
                        f"({esc(str(rel['type']))}, {rel['strength']:.2f})</li>\n")
                f.write("</ul>\n")
            f.write("</div>\n")

        f.write("</body>\n</html>")

    def export_knowledge(self, format="json", output_file=None):
        """Export the knowledge base to a file.

        Args:
            format: Export format: "json", "md" or "html". (The name shadows
                the builtin but is kept for caller compatibility.)
            output_file: Destination path. Defaults to
                ``knowledge_export_<timestamp>.<format>``.

        Returns:
            True on success, False for an unsupported format. Errors raised
            while opening or writing the file propagate to the caller.
        """
        exporters = {
            "json": ("JSON", self._write_json_export),
            "md": ("Markdown", self._write_markdown_export),
            "html": ("HTML", self._write_html_export),
        }
        if format not in exporters:
            print(f"不支持的导出格式: {format}")
            return False

        if not output_file:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"knowledge_export_{timestamp}.{format}"

        label, write_export = exporters[format]
        with open(output_file, 'w', encoding='utf-8') as f:
            write_export(f)
        print(f"知识已导出到 {label} 文件: {output_file}")
        return True

    def get_report_data(self):
        """Return summary statistics for the visualization module.

        The result has the keys ``knowledge_count``, ``last_updated``,
        ``categories`` (entries per category), ``tag_cloud`` (entries per
        tag), ``relationship_stats`` (relationships per type) and
        ``timeline_data`` (same shape as ``get_timeline_data``).
        """
        report = {
            "knowledge_count": self.knowledge_base["metadata"]["knowledge_count"],
            "last_updated": self.knowledge_base["metadata"]["last_updated"],
            "categories": {},
            "tag_cloud": defaultdict(int),
            "relationship_stats": defaultdict(int),
            "timeline_data": []
        }

        for category, items in self.knowledge_base["categories"].items():
            report["categories"][category] = len(items)

        for entry in self.knowledge_base["knowledge"].values():
            for tag in entry.get("tags") or []:
                report["tag_cloud"][tag] += 1

        for rel in self.knowledge_base["relationships"]:
            report["relationship_stats"][rel["type"]] += 1

        # Same monthly aggregation as get_timeline_data; an empty knowledge
        # base yields an empty list.
        report["timeline_data"] = self.get_timeline_data()

        return report


def main():
    """Demo: add a few entries, relate, search, report, export and save."""
    km = KnowledgeManager()

    # Keep the returned IDs: when an existing file was loaded, the first
    # keys of the knowledge base are not the entries added here.
    py_id = km.add_knowledge(
        title="Python列表推导式",
        content="列表推导式提供了一种简洁的方法来创建列表。语法: [expression for item in iterable if condition]",
        tags=["Python", "编程技巧"],
        category="编程"
    )
    ai_id = km.add_knowledge(
        title="神经网络基础",
        content="神经网络由输入层、隐藏层和输出层组成。使用反向传播算法训练权重。",
        tags=["机器学习", "神经网络"],
        category="AI"
    )
    km.add_knowledge(
        title="Git基本命令",
        content="常用Git命令: git init, git add, git commit, git push, git pull",
        tags=["Git", "版本控制"],
        category="开发工具"
    )

    km.add_relationship(py_id, ai_id, "related", 0.6)

    print("\n搜索 'Python':")
    results = km.search_knowledge("Python")
    for res in results:
        print(f"{res['title']} (相似度: {res['similarity']:.2f})")

    graph_data = km.get_knowledge_graph_data()
    print(f"\n知识图谱数据: {len(graph_data['nodes'])} 节点, {len(graph_data['links'])} 边")

    timeline_data = km.get_timeline_data()
    print(f"\n时间线数据: {len(timeline_data)} 个月份记录")

    report_data = km.get_report_data()
    print(f"\n报告数据: {report_data['knowledge_count']} 条知识")

    km.export_knowledge(format="md")

    km.save_knowledge()


if __name__ == "__main__":
    main()

# Original poster's question: "这个需要改吗 怎么改" (Does this need to be changed, and how?)
最新发布
08-11
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值