Abstract
PropertyGraphIndex is an index type in LlamaIndex designed specifically for property graph data. It integrates structured data with unstructured text to build rich property graphs. Compared with a traditional knowledge graph, a property graph stores not only entities and relationships but also arbitrary attributes on both, which makes the data representation richer and more precise. This post takes a close look at how PropertyGraphIndex works, how it is implemented, and how to use it in practice.
Main Text
1. Introduction
In earlier posts we covered several of LlamaIndex's index types, including VectorStoreIndex, TreeIndex, and KnowledgeGraphIndex. This time we turn to a more advanced one: PropertyGraphIndex. It combines the semantic expressiveness of a knowledge graph with the rich attribute model of a property graph, giving you a powerful tool for working with complex data relationships.
2. PropertyGraphIndex Fundamentals
2.1 What is PropertyGraphIndex
PropertyGraphIndex is an index type built on the property graph model. It extends the classic knowledge graph by allowing rich attribute information on both nodes and relationships. Because the property graph model attaches key-value properties to nodes and edges, the resulting data representation is more flexible and more precise.
2.2 Core characteristics of PropertyGraphIndex
- Rich properties: both nodes and relationships carry arbitrary attribute information
- Flexible structure: complex networks with multiple node and relationship types can be represented
- Explicit semantics: properties sharpen the semantic meaning of entities and relations
- Easy to query: complex property-based queries are supported
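To make "properties on both nodes and edges" concrete, here is a minimal sketch using the EntityNode and Relation types from llama_index.core.graph_stores.types (available in recent llama-index-core releases); the names, labels, and property keys are made up for illustration.

from llama_index.core.graph_stores.types import EntityNode, Relation

# A node carries a label plus arbitrary key-value properties
person = EntityNode(
    name="Zhang San",
    label="EMPLOYEE",
    properties={"department": "Engineering", "position": "Senior Engineer"},
)
company = EntityNode(name="Acme Corp", label="COMPANY", properties={"industry": "Software"})

# A relation (edge) can carry properties as well
works_at = Relation(
    label="WORKS_AT",
    source_id=person.id,
    target_id=company.id,
    properties={"since": "2020-03-15"},
)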
3. How PropertyGraphIndex Works
3.1 Building the property graph
Building a PropertyGraphIndex involves the following key steps (a configuration sketch follows the list):
- Data parsing: parse structured data and unstructured text
- Entity recognition: identify the node entities in the graph
- Property extraction: extract attribute information for nodes and relationships
- Relationship building: establish relationships between entities
- Index storage: persist the property graph as an index
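As a sketch of how the extraction step can be steered during the build, the snippet below passes explicit extractors through the kg_extractors argument (SimpleLLMPathExtractor and ImplicitPathExtractor ship with recent llama-index-core versions); the parameter values are illustrative.

from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    ImplicitPathExtractor,
)

documents = SimpleDirectoryReader("./data").load_data()

# Control how entities, relations, and properties are extracted during the build
pg_index = PropertyGraphIndex.from_documents(
    documents,
    kg_extractors=[
        SimpleLLMPathExtractor(max_paths_per_chunk=10),  # LLM-driven triple extraction
        ImplicitPathExtractor(),  # relations implied by existing node structure
    ],
    show_progress=True,
)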
3.2 Query processing
PropertyGraphIndex supports several query modes, including natural-language questions through a query engine, retriever-based lookups that return the relevant graph context, and direct programmatic access to the underlying property graph store.
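As a small illustration of the first two modes (assuming pg_index is a PropertyGraphIndex built as in the next section), the snippet below uses the query engine and the retriever exposed by the index; the include_text flag controls whether the source text backing each graph node is returned alongside it.

# Natural-language querying through a query engine
query_engine = pg_index.as_query_engine(include_text=True)
response = query_engine.query("Which entities are related to artificial intelligence?")

# Retrieval without answer synthesis returns the matching graph context directly
retriever = pg_index.as_retriever(include_text=False)
nodes = retriever.retrieve("entities related to artificial intelligence")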
4. Creating and Using PropertyGraphIndex
4.1 Basic usage
from llama_index.core import PropertyGraphIndex
from llama_index.core import SimpleDirectoryReader

# Load documents
documents = SimpleDirectoryReader("./data").load_data()

# Build the PropertyGraphIndex (uses the LLM/embedding model from Settings by default)
pg_index = PropertyGraphIndex.from_documents(documents)

# Create a query engine
query_engine = pg_index.as_query_engine()

# Run a query
response = query_engine.query("Which entities with specific properties are related to artificial intelligence?")
print(response)
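Rebuilding the graph on every run is wasteful, so the index can be persisted with the standard storage-context API; a minimal sketch (the ./storage directory is arbitrary):

# Persist the index (including the property graph store) to disk
pg_index.storage_context.persist(persist_dir="./storage")

# ...and load it back later
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./storage")
pg_index = load_index_from_storage(storage_context)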
4.2 Creating the index from structured data
from llama_index.core import PropertyGraphIndex
from llama_index.core.schema import Document
import pandas as pd

# Load structured data from a CSV file
df = pd.read_csv("./employee_data.csv")

# Convert each row into a Document
documents = []
for _, row in df.iterrows():
    doc_text = (
        f"Employee name: {row['name']}, department: {row['department']}, "
        f"position: {row['position']}, hire date: {row['hire_date']}"
    )
    doc = Document(
        text=doc_text,
        metadata={
            "name": row['name'],
            "department": row['department'],
            "position": row['position'],
            "hire_date": row['hire_date']
        }
    )
    documents.append(doc)

# Build the property graph index
pg_index = PropertyGraphIndex.from_documents(documents)
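For rows whose structure is fully known, LLM extraction can be complemented (or bypassed) by writing nodes and relations into the graph store directly. A minimal sketch, assuming the EntityNode/Relation types and the store's upsert_nodes()/upsert_relations() methods; the EMPLOYEE/DEPARTMENT labels are made up for illustration.

from llama_index.core.graph_stores.types import EntityNode, Relation

graph_store = pg_index.property_graph_store

for _, row in df.iterrows():
    # One node per employee and per department, with exact properties from the row
    employee = EntityNode(
        name=row["name"],
        label="EMPLOYEE",
        properties={"position": row["position"], "hire_date": row["hire_date"]},
    )
    department = EntityNode(name=row["department"], label="DEPARTMENT")
    graph_store.upsert_nodes([employee, department])
    # Connect the employee to the department
    graph_store.upsert_relations([
        Relation(label="BELONGS_TO", source_id=employee.id, target_id=department.id)
    ])

This keeps the LLM extraction for free text while the rows you fully trust go in as exact nodes and relations.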
5. Configuration Options for PropertyGraphIndex
5.1 Key parameters
from llama_index.core import PropertyGraphIndex

# Main parameters of PropertyGraphIndex (the exact signature may vary slightly between versions)
pg_index = PropertyGraphIndex(
    nodes=nodes,                 # list of nodes to index
    llm=None,                    # LLM used for extraction (defaults to Settings.llm)
    embed_model=None,            # embedding model (defaults to Settings.embed_model)
    kg_extractors=None,          # extractors that turn text into graph nodes/relations
    property_graph_store=None,   # property graph store backend
    vector_store=None,           # optional vector store for embedded graph nodes
    embed_kg_nodes=True,         # whether to embed graph nodes for vector retrieval
    show_progress=False,         # show a progress bar during the build
)
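In production you would typically back the index with a dedicated graph database rather than the default in-memory store. A sketch using the Neo4j integration (requires the llama-index-graph-stores-neo4j package; the connection details are placeholders):

from llama_index.core import PropertyGraphIndex
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Placeholder credentials and URL for a local Neo4j instance
graph_store = Neo4jPropertyGraphStore(
    username="neo4j",
    password="password",
    url="bolt://localhost:7687",
)

pg_index = PropertyGraphIndex.from_documents(
    documents,
    property_graph_store=graph_store,
    show_progress=True,
)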
5.2 Customizing the property graph store
from llama_index.core.graph_stores import SimplePropertyGraphStore

# A custom property graph store.
# NOTE: the base PropertyGraphStore interface upserts nodes in batches via
# upsert_nodes(); the hook below overrides that method and assumes each node
# object exposes .id and .properties.
class CustomPropertyGraphStore(SimplePropertyGraphStore):
    def __init__(self):
        super().__init__()
        self.custom_properties = {}

    def upsert_nodes(self, nodes):
        """Insert or update nodes, with extra handling for custom properties."""
        super().upsert_nodes(nodes)
        for node in nodes:
            self.custom_properties[node.id] = self._process_custom_properties(node.properties)

    def _process_custom_properties(self, properties):
        """Process custom properties."""
        processed_props = {}
        for key, value in properties.items():
            if key.startswith("custom_"):
                processed_props[key] = self._transform_property(value)
        return processed_props

    def _transform_property(self, value):
        """Transform a property value."""
        return value.upper() if isinstance(value, str) else value

# Use the custom property graph store
custom_store = CustomPropertyGraphStore()
pg_index = PropertyGraphIndex.from_documents(
    documents,
    property_graph_store=custom_store
)
6. Querying and Manipulating the Property Graph
6.1 Property-based queries
# 1. Simple property query.
# NOTE: get_nodes_with_properties() is an illustrative helper used throughout this
# post; the built-in store API exposes a similar lookup via get() (see below).
def query_by_properties(pg_index, property_conditions):
    """Query nodes that satisfy the given property conditions."""
    # Access the graph store
    graph_store = pg_index.property_graph_store
    # Fetch the nodes matching the conditions
    matching_nodes = graph_store.get_nodes_with_properties(property_conditions)
    return matching_nodes

# Usage example
conditions = {
    "department": "Engineering",
    "position": "Senior Engineer"
}
tech_seniors = query_by_properties(pg_index, conditions)
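For comparison, the built-in PropertyGraphStore interface already offers a property-filtered lookup via get(); a minimal sketch of the same query against that API (exact matching semantics may vary by store backend):

graph_store = pg_index.property_graph_store

# get() filters nodes by exact property matches and returns node objects
matching_nodes = graph_store.get(properties={"department": "Engineering"})
for node in matching_nodes:
    print(node.id, node.properties)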
6.2 Complex graph queries
# 2. Graph traversal query.
# NOTE: get_neighbors() is an illustrative helper; real stores expose neighbourhood
# expansion through get_rel_map() (see the sketch after this example).
def traverse_graph_by_relationships(pg_index, start_node, relationship_types, max_depth=3):
    """Traverse the graph following the given relationship types."""
    graph_store = pg_index.property_graph_store
    visited_nodes = set()
    result_nodes = []

    def dfs(current_node, depth):
        if depth > max_depth or current_node in visited_nodes:
            return
        visited_nodes.add(current_node)
        result_nodes.append(current_node)
        # Visit the neighbours of the current node
        neighbors = graph_store.get_neighbors(current_node, relationship_types)
        for neighbor in neighbors:
            dfs(neighbor, depth + 1)

    dfs(start_node, 0)
    return result_nodes

# Usage example
related_entities = traverse_graph_by_relationships(
    pg_index,
    start_node="Zhang San",
    relationship_types=["colleague_of", "reports_to"],
    max_depth=2
)
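Instead of a hand-rolled DFS, the store's relation map can expand the neighbourhood of a node up to a given depth. A sketch, assuming the get() and get_rel_map() methods of the PropertyGraphStore interface and a node named "Zhang San"; the returned triplets are (source, relation, target) tuples.

graph_store = pg_index.property_graph_store

# Resolve the starting node(s), then expand their relations up to depth 2
start_nodes = graph_store.get(ids=["Zhang San"])
triplets = graph_store.get_rel_map(start_nodes, depth=2)

for source, relation, target in triplets:
    print(source.id, f"-[{relation.label}]->", target.id)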
6.3 Pattern matching queries
# 3. Graph pattern matching query.
# NOTE: match_pattern() is an illustrative helper; graph-database backends usually
# express such patterns in a query language like Cypher (see the sketch below).
def pattern_matching_query(pg_index, pattern):
    """Query the graph by matching a structural pattern."""
    graph_store = pg_index.property_graph_store
    matches = graph_store.match_pattern(pattern)
    return matches

# Example pattern: an employee-department-manager triangle
tech_pattern = {
    "nodes": [
        {"type": "employee", "properties": {"department": "Engineering"}},
        {"type": "department", "properties": {"name": "Engineering"}},
        {"type": "employee", "properties": {"position": "Engineering Director"}}
    ],
    "relationships": [
        {"from": 0, "to": 1, "type": "belongs_to"},
        {"from": 1, "to": 2, "type": "managed_by"}
    ]
}

# Usage example
tech_structure = pattern_matching_query(pg_index, tech_pattern)
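When the index is backed by a store that understands a graph query language (for example the Neo4j integration), such patterns can also be pushed down as Cypher through structured_query(); a sketch, assuming that backend and that the labels and relationship types below exist in your graph.

# Only works when the underlying store supports structured queries (e.g. Neo4j)
cypher = """
MATCH (e:EMPLOYEE)-[:BELONGS_TO]->(d:DEPARTMENT {name: $dept})<-[:MANAGES]-(m:EMPLOYEE)
RETURN e.name AS employee, d.name AS department, m.name AS manager
"""
rows = pg_index.property_graph_store.structured_query(cypher, param_map={"dept": "Engineering"})
for row in rows:
    print(row)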
7. Practical Application Cases
7.1 An organizational-structure management system
from llama_index.core import PropertyGraphIndex
from llama_index.core.schema import Document

# Build the organizational-structure data
org_documents = [
    Document(
        text="Zhang San, Engineering, Senior Engineer, reports to Li Si",
        metadata={
            "name": "Zhang San",
            "department": "Engineering",
            "position": "Senior Engineer",
            "manager": "Li Si",
            "employee_id": "E001",
            "hire_date": "2020-03-15"
        }
    ),
    Document(
        text="Li Si, Engineering, Engineering Director, reports to Wang Wu",
        metadata={
            "name": "Li Si",
            "department": "Engineering",
            "position": "Engineering Director",
            "manager": "Wang Wu",
            "employee_id": "E002",
            "hire_date": "2018-07-22"
        }
    ),
    Document(
        text="Wang Wu, Company, CTO",
        metadata={
            "name": "Wang Wu",
            "department": "Company",
            "position": "CTO",
            "employee_id": "E003",
            "hire_date": "2015-01-10"
        }
    )
]

# Build the organizational-structure property graph index
org_pg_index = PropertyGraphIndex.from_documents(org_documents)
# Organizational-structure query system.
# NOTE: get_nodes_with_properties() and get_node_property() are illustrative
# helpers standing in for property lookups on the underlying graph store.
class OrganizationManagementSystem:
    def __init__(self, pg_index):
        self.pg_index = pg_index
        self.query_engine = pg_index.as_query_engine()
        self.graph_store = pg_index.property_graph_store

    def find_subordinates(self, manager_name):
        """Find the employees reporting to a manager."""
        query = f"Who reports to {manager_name}?"
        return self.query_engine.query(query)

    def get_department_structure(self, department_name):
        """Get the structure of a department."""
        # Query the property graph directly
        department_nodes = self.graph_store.get_nodes_with_properties(
            {"department": department_name}
        )
        return department_nodes

    def analyze_reporting_chain(self, employee_name):
        """Walk the reporting chain of an employee."""
        def get_manager_chain(employee):
            chain = [employee]
            current = employee
            while True:
                # Look up the current employee's manager
                manager = self.graph_store.get_node_property(current, "manager")
                if not manager or manager == current:
                    break
                chain.append(manager)
                current = manager
            return chain
        return get_manager_chain(employee_name)

    def team_collaboration_analysis(self, employee_name):
        """Find the colleagues in the same department."""
        employee_dept = self.graph_store.get_node_property(employee_name, "department")
        colleagues = self.graph_store.get_nodes_with_properties(
            {"department": employee_dept}
        )
        return [node for node in colleagues if node != employee_name]

# Usage example
org_system = OrganizationManagementSystem(org_pg_index)
subordinates = org_system.find_subordinates("Li Si")
dept_structure = org_system.get_department_structure("Engineering")
reporting_chain = org_system.analyze_reporting_chain("Zhang San")
colleagues = org_system.team_collaboration_analysis("Zhang San")
7.2 A product knowledge graph system
from llama_index.core import PropertyGraphIndex
from llama_index.core.schema import Document

# Build the product data
product_documents = [
    Document(
        text="iPhone 15 Pro, made by Apple, released in September 2023, powered by the A17 Pro chip, supports 5G",
        metadata={
            "product_name": "iPhone 15 Pro",
            "brand": "Apple",
            "manufacturer": "Apple Inc.",
            "release_date": "2023-09-12",
            "processor": "A17 Pro",
            "network_support": "5G",
            "price": 7999,
            "category": "smartphone"
        }
    ),
    Document(
        text="Samsung Galaxy S24, made by Samsung, released in January 2024, powered by the Snapdragon 8 Gen 3 chip, supports 5G",
        metadata={
            "product_name": "Samsung Galaxy S24",
            "brand": "Samsung",
            "manufacturer": "Samsung Electronics",
            "release_date": "2024-01-17",
            "processor": "Snapdragon 8 Gen 3",
            "network_support": "5G",
            "price": 6999,
            "category": "smartphone"
        }
    )
]

# Build the product property graph index
product_pg_index = PropertyGraphIndex.from_documents(product_documents)
# Product knowledge graph system.
# NOTE: get_node(), get_node_properties() and get_nodes_with_properties() are
# illustrative helpers standing in for lookups on the underlying graph store.
class ProductKnowledgeGraph:
    def __init__(self, pg_index):
        self.pg_index = pg_index
        self.query_engine = pg_index.as_query_engine()
        self.graph_store = pg_index.property_graph_store

    def compare_products(self, product1, product2):
        """Compare two products."""
        query = f"Compare the main differences between {product1} and {product2}"
        return self.query_engine.query(query)

    def find_similar_products(self, product_name, similarity_criteria=None):
        """Find products similar to the given one."""
        if similarity_criteria is None:
            similarity_criteria = ["category", "price_range"]
        # Fetch the target product's properties
        target_product = self.graph_store.get_node(product_name)
        target_props = target_product.properties
        # Collect candidates in the same category and score them
        similar_products = []
        all_products = self.graph_store.get_nodes_with_properties({"category": target_props["category"]})
        for product in all_products:
            if product != product_name:
                product_props = self.graph_store.get_node_properties(product)
                similarity_score = self._calculate_similarity(target_props, product_props, similarity_criteria)
                if similarity_score > 0.7:  # similarity threshold
                    similar_products.append((product, similarity_score))
        # Sort by similarity, highest first
        similar_products.sort(key=lambda x: x[1], reverse=True)
        return similar_products
    def _calculate_similarity(self, props1, props2, criteria):
        """Compute a simple property-based similarity score."""
        total_score = 0
        matched_criteria = 0
        for criterion in criteria:
            if criterion == "price_range":
                # Price-range similarity is derived from the "price" property
                price1 = props1.get("price", 0)
                price2 = props2.get("price", 0)
                if price1 and price2:
                    matched_criteria += 1
                    if abs(price1 - price2) / max(price1, price2) < 0.2:  # prices within 20%
                        total_score += 1
            elif criterion in props1 and criterion in props2:
                matched_criteria += 1
                if props1[criterion] == props2[criterion]:
                    total_score += 1
        return total_score / matched_criteria if matched_criteria > 0 else 0
    def recommend_products(self, user_preferences):
        """Recommend products based on user preferences."""
        # Build a natural-language recommendation query
        preference_query = "Recommend products with"
        for key, value in user_preferences.items():
            preference_query += f" {key} = {value},"
        preference_query = preference_query.rstrip(",")
        return self.query_engine.query(preference_query)

    def product_lifecycle_analysis(self, product_name):
        """Analyze the lifecycle of a product."""
        # Look at the release date, the brand's release history, etc.
        product_props = self.graph_store.get_node_properties(product_name)
        release_date = product_props.get("release_date")
        # Find other products released by the same brand
        brand = product_props.get("brand")
        brand_products = self.graph_store.get_nodes_with_properties({"brand": brand})
        return {
            "product": product_name,
            "release_date": release_date,
            "brand_products": brand_products,
            "lifecycle_stage": self._determine_lifecycle_stage(release_date)
        }

    def _determine_lifecycle_stage(self, release_date):
        """Determine the product's lifecycle stage from its age."""
        from datetime import datetime
        release = datetime.strptime(release_date, "%Y-%m-%d")
        now = datetime.now()
        age_days = (now - release).days
        if age_days < 90:
            return "introduction"
        elif age_days < 365:
            return "growth"
        elif age_days < 730:
            return "maturity"
        else:
            return "decline"
# Usage example
product_kg = ProductKnowledgeGraph(product_pg_index)
comparison = product_kg.compare_products("iPhone 15 Pro", "Samsung Galaxy S24")
similar_products = product_kg.find_similar_products("iPhone 15 Pro")
recommendations = product_kg.recommend_products({"category": "smartphone", "price_range": "5000-8000"})
lifecycle = product_kg.product_lifecycle_analysis("iPhone 15 Pro")
7.3 A social network analysis system
from llama_index.core import PropertyGraphIndex
from llama_index.core.schema import Document

# Build the social network data
social_documents = [
    Document(
        text="User Zhang San follows user Li Si; their shared interests include photography and travel",
        metadata={
            "user_id": "user_001",
            "username": "Zhang San",
            "follows": "user_002",
            "common_interests": ["photography", "travel"],
            "account_type": "personal",
            "followers_count": 1250,
            "following_count": 350
        }
    ),
    Document(
        text="User Li Si follows user Wang Wu; their shared interests include tech sharing and open-source projects",
        metadata={
            "user_id": "user_002",
            "username": "Li Si",
            "follows": "user_003",
            "common_interests": ["tech sharing", "open source"],
            "account_type": "personal",
            "followers_count": 890,
            "following_count": 220
        }
    )
]

# Build the social network property graph index
social_pg_index = PropertyGraphIndex.from_documents(social_documents)
# Social network analysis system.
# NOTE: get_nodes(), get_node_property() and get_nodes_with_properties() are
# illustrative helpers; the comparison filter passed for followers_count below
# also assumes a store that understands ">=" expressions.
class SocialNetworkAnalyzer:
    def __init__(self, pg_index):
        self.pg_index = pg_index
        self.query_engine = pg_index.as_query_engine()
        self.graph_store = pg_index.property_graph_store

    def find_influencers(self, topic=None, min_followers=1000):
        """Find influential users."""
        if topic:
            # Influencers for a specific topic
            influencers = self.graph_store.get_nodes_with_properties({
                "common_interests": topic,
                "followers_count": f">={min_followers}"
            })
        else:
            # All influencers
            influencers = self.graph_store.get_nodes_with_properties({
                "followers_count": f">={min_followers}"
            })
        return influencers

    def analyze_community(self, user_name):
        """Analyze the community around a user."""
        # Who the user follows
        user_follows = self.graph_store.get_node_property(user_name, "follows")
        # The user's interests
        user_interests = self.graph_store.get_node_property(user_name, "common_interests")
        # Users with overlapping interests
        similar_users = []
        for interest in user_interests:
            users_with_interest = self.graph_store.get_nodes_with_properties({
                "common_interests": interest
            })
            similar_users.extend(users_with_interest)
        # Deduplicate
        similar_users = list(set(similar_users))
        return {
            "user": user_name,
            "follows": user_follows,
            "interests": user_interests,
            "similar_users": similar_users
        }
    def recommend_connections(self, user_name, max_recommendations=5):
        """Recommend new connections."""
        # Who the user already follows
        user_follows = self.graph_store.get_node_property(user_name, "follows")
        if not isinstance(user_follows, list):
            user_follows = [user_follows]
        # The user's interests
        user_interests = self.graph_store.get_node_property(user_name, "common_interests")
        # Friends of friends
        friends_of_friends = set()
        for friend in user_follows:
            friend_follows = self.graph_store.get_node_property(friend, "follows")
            if friend_follows:
                if not isinstance(friend_follows, list):
                    friend_follows = [friend_follows]
                for fof in friend_follows:
                    if fof != user_name and fof not in user_follows:
                        friends_of_friends.add(fof)
        # Interest-based candidates
        interest_based = set()
        for interest in user_interests:
            users_with_interest = self.graph_store.get_nodes_with_properties({
                "common_interests": interest
            })
            for user in users_with_interest:
                if user != user_name and user not in user_follows:
                    interest_based.add(user)
        # Merge the candidates and score them
        recommendations = {}
        for user in friends_of_friends:
            recommendations[user] = recommendations.get(user, 0) + 2  # friends of friends weigh more
        for user in interest_based:
            recommendations[user] = recommendations.get(user, 0) + 1  # interest overlap weighs less
        # Sort by score
        sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
        return sorted_recommendations[:max_recommendations]
    def detect_communities(self):
        """Detect communities of users."""
        # Group users by shared interests
        communities = {}
        all_users = self.graph_store.get_nodes()
        for user in all_users:
            interests = self.graph_store.get_node_property(user, "common_interests")
            if interests:
                if not isinstance(interests, list):
                    interests = [interests]
                for interest in interests:
                    if interest not in communities:
                        communities[interest] = []
                    communities[interest].append(user)
        # Drop very small communities
        significant_communities = {k: v for k, v in communities.items() if len(v) > 2}
        return significant_communities

    def analyze_network_centralization(self):
        """Analyze how centralized the network is."""
        # Use follower counts as a simple centrality measure
        user_centralities = {}
        all_users = self.graph_store.get_nodes()
        for user in all_users:
            followers_count = self.graph_store.get_node_property(user, "followers_count") or 0
            user_centralities[user] = followers_count
        # Aggregate centralization metrics
        max_centralization = max(user_centralities.values()) if user_centralities else 0
        avg_centralization = sum(user_centralities.values()) / len(user_centralities) if user_centralities else 0
        return {
            "user_centralities": user_centralities,
            "max_centralization": max_centralization,
            "avg_centralization": avg_centralization,
            "centralization_ratio": max_centralization / avg_centralization if avg_centralization > 0 else 0
        }
# Usage example
social_analyzer = SocialNetworkAnalyzer(social_pg_index)
influencers = social_analyzer.find_influencers(min_followers=1000)
community = social_analyzer.analyze_community("Zhang San")
recommendations = social_analyzer.recommend_connections("Zhang San")
communities = social_analyzer.detect_communities()
centralization = social_analyzer.analyze_network_centralization()
8. Comparison with Other Index Types
8.1 Versus KnowledgeGraphIndex
| Aspect | PropertyGraphIndex | KnowledgeGraphIndex |
|---|---|---|
| Data model | Property graph | Knowledge graph (triples) |
| Property support | Strong (properties on both nodes and relationships) | Weak (focused on entities and relations) |
| Query capability | Strong (complex property-based queries) | Moderate (mostly entity-relation queries) |
| Expressiveness | Strong (rich attribute information) | Moderate (mainly semantic relations) |
| Typical use | Integrating structured data | Knowledge reasoning and Q&A |
8.2 Versus VectorStoreIndex
| Aspect | PropertyGraphIndex | VectorStoreIndex |
|---|---|---|
| Data structure | Graph | Vector space |
| Query style | Properties and relationships | Vector similarity |
| Semantic understanding | Moderate (explicit relations) | Strong (embedding semantics) |
| Interpretability | Strong (explicit properties and relations) | Weak (distances in vector space) |
| Best suited for | Structured relationship queries | Semantic similarity retrieval |
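The two approaches are also complementary rather than exclusive: PropertyGraphIndex can embed its graph nodes and combine vector search with graph expansion. A sketch, assuming the VectorContextRetriever and LLMSynonymRetriever sub-retrievers shipped with llama-index-core and default models configured via Settings:

from llama_index.core.indices.property_graph import (
    VectorContextRetriever,
    LLMSynonymRetriever,
)

# Vector similarity finds entry-point nodes; the graph supplies their context
vector_retriever = VectorContextRetriever(
    pg_index.property_graph_store,
    similarity_top_k=2,
    path_depth=1,
)
synonym_retriever = LLMSynonymRetriever(pg_index.property_graph_store)

retriever = pg_index.as_retriever(sub_retrievers=[vector_retriever, synonym_retriever])
results = retriever.retrieve("smartphones with 5G support")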
9. Troubleshooting and Best Practices
9.1 Common problems and solutions
- Inconsistent property data:

# Property-data consistency checks.
# NOTE: get_nodes() and get_node_properties() are illustrative helpers for
# iterating over nodes and reading their properties.
class PropertyConsistencyChecker:
    def __init__(self, pg_index):
        self.pg_index = pg_index
        self.graph_store = pg_index.property_graph_store

    def check_property_consistency(self):
        """Check property consistency across the graph."""
        inconsistencies = []
        # Walk all nodes
        all_nodes = self.graph_store.get_nodes()
        for node in all_nodes:
            properties = self.graph_store.get_node_properties(node)
            # Required properties must exist
            required_props = ["name", "type"]
            for prop in required_props:
                if prop not in properties:
                    inconsistencies.append(f"Node {node} is missing required property {prop}")
            # Property values must have the expected types
            type_checks = {
                "age": int,
                "price": (int, float),
                "is_active": bool
            }
            for prop, expected_type in type_checks.items():
                if prop in properties:
                    value = properties[prop]
                    if not isinstance(value, expected_type):
                        inconsistencies.append(f"Node {node} has property {prop} with the wrong type")
        return inconsistencies

    def fix_inconsistencies(self, inconsistencies):
        """Fix detected inconsistencies."""
        for issue in inconsistencies:
            print(f"Fixing issue: {issue}")
            # Apply the actual fix here

- Slow graph queries:

# Optimizing graph query performance with caching and BFS traversal.
class OptimizedPropertyGraphQuery:
    def __init__(self, pg_index):
        self.pg_index = pg_index
        self.graph_store = pg_index.property_graph_store
        self.query_cache = {}

    def cached_query(self, query_key, query_func, *args, **kwargs):
        """Run a query with result caching."""
        if query_key in self.query_cache:
            return self.query_cache[query_key]
        result = query_func(*args, **kwargs)
        self.query_cache[query_key] = result
        return result

    def optimized_traversal(self, start_node, relationship_types, max_depth=3):
        """Traverse the graph breadth-first instead of depth-first."""
        from collections import deque
        queue = deque([(start_node, 0)])  # (node, depth)
        visited = set()
        result = []
        while queue:
            current_node, depth = queue.popleft()
            if depth > max_depth or current_node in visited:
                continue
            visited.add(current_node)
            result.append(current_node)
            # Expand the neighbours of the current node
            neighbors = self.graph_store.get_neighbors(current_node, relationship_types)
            for neighbor in neighbors:
                if neighbor not in visited:
                    queue.append((neighbor, depth + 1))
        return result
9.2 Best-practice recommendations
- Design the property schema deliberately:

# Property graph design best practices.
class PropertyGraphDesignBestPractices:
    def __init__(self):
        self.design_principles = {
            "normalization": "avoid redundant data; split entities sensibly",
            "indexing": "index the properties that are queried most often",
            "typing": "define explicit data types for properties",
            "constraints": "set reasonable constraints on property values"
        }

    def design_property_schema(self, entity_types):
        """Design a property schema for the given entity types."""
        schema = {}
        for entity_type in entity_types:
            if entity_type == "person":
                schema[entity_type] = {
                    "required": ["name", "age"],
                    "optional": ["email", "phone", "address"],
                    "types": {"name": str, "age": int, "email": str}
                }
            elif entity_type == "product":
                schema[entity_type] = {
                    "required": ["name", "price"],
                    "optional": ["description", "category", "brand"],
                    "types": {"name": str, "price": float, "category": str}
                }
        return schema

- Adopt an incremental update strategy:

# A property graph that supports incremental updates.
# NOTE: upsert_node()/upsert_relationship() and the node/relationship attributes
# below are illustrative; real stores batch writes through upsert_nodes()/upsert_relations().
from datetime import datetime

class IncrementalPropertyGraph(PropertyGraphIndex):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.last_update_time = {}

    def add_documents_incrementally(self, new_documents):
        """Add new documents incrementally."""
        for doc in new_documents:
            # Extract graph information from the document
            nodes, relationships = self._extract_graph_info(doc)
            # Apply it to the graph
            self._update_graph(nodes, relationships)
            # Record the update time
            doc_id = doc.metadata.get("id", doc.doc_id)
            self.last_update_time[doc_id] = datetime.now()

    def _extract_graph_info(self, document):
        """Extract nodes and relationships from a document."""
        nodes = []
        relationships = []
        # Derive nodes and relationships from the document content
        # ...
        return nodes, relationships

    def _update_graph(self, nodes, relationships):
        """Apply node and relationship updates to the graph."""
        for node in nodes:
            self.property_graph_store.upsert_node(node.id, node.properties)
        for rel in relationships:
            self.property_graph_store.upsert_relationship(
                rel.source, rel.target, rel.type, rel.properties
            )
10. Exploring Advanced Features
10.1 Integrating graph algorithms
# A property graph index enhanced with classic graph algorithms.
# NOTE: get_nodes(), get_incoming_nodes(), get_outgoing_nodes(), get_neighbors()
# and has_relationship() are illustrative helpers for adjacency access.
class GraphAlgorithmEnhancedPropertyGraph(PropertyGraphIndex):
    """Property graph enhanced with graph algorithms."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.algorithms = self._initialize_algorithms()

    def _initialize_algorithms(self):
        """Register the available graph algorithms."""
        return {
            "pagerank": self._pagerank_algorithm,
            "clustering_coefficient": self._clustering_coefficient,
            "shortest_path": self._shortest_path_algorithm
        }

    def _pagerank_algorithm(self, iterations=100, damping=0.85):
        """A simple PageRank implementation."""
        # All nodes in the graph
        nodes = self.property_graph_store.get_nodes()
        node_count = len(nodes)
        # Start with a uniform PageRank distribution
        pagerank = {node: 1.0 / node_count for node in nodes}
        # Iterate the PageRank update
        for _ in range(iterations):
            new_pagerank = {}
            for node in nodes:
                rank_sum = 0
                # Nodes that link to the current node
                incoming_nodes = self.property_graph_store.get_incoming_nodes(node)
                for incoming_node in incoming_nodes:
                    outgoing_count = len(self.property_graph_store.get_outgoing_nodes(incoming_node))
                    if outgoing_count > 0:
                        rank_sum += pagerank[incoming_node] / outgoing_count
                new_pagerank[node] = (1 - damping) / node_count + damping * rank_sum
            pagerank = new_pagerank
        return pagerank
    def _clustering_coefficient(self):
        """Local clustering coefficient for each node."""
        nodes = self.property_graph_store.get_nodes()
        clustering_coeffs = {}
        for node in nodes:
            neighbors = self.property_graph_store.get_neighbors(node)
            if len(neighbors) < 2:
                clustering_coeffs[node] = 0
                continue
            # Count the edges between the node's neighbours
            edges_between_neighbors = 0
            for i in range(len(neighbors)):
                for j in range(i + 1, len(neighbors)):
                    if self.property_graph_store.has_relationship(neighbors[i], neighbors[j]):
                        edges_between_neighbors += 1
            # Clustering coefficient = actual edges / possible edges
            possible_edges = len(neighbors) * (len(neighbors) - 1) / 2
            clustering_coeffs[node] = edges_between_neighbors / possible_edges if possible_edges > 0 else 0
        return clustering_coeffs
    def _shortest_path_algorithm(self, source, target):
        """Shortest path between two nodes (Dijkstra with unit edge weights)."""
        import heapq
        # Initialize distances and predecessors
        distances = {node: float('inf') for node in self.property_graph_store.get_nodes()}
        previous = {node: None for node in self.property_graph_store.get_nodes()}
        distances[source] = 0
        # Priority queue of (distance, node)
        pq = [(0, source)]
        visited = set()
        while pq:
            current_distance, current_node = heapq.heappop(pq)
            if current_node in visited:
                continue
            visited.add(current_node)
            if current_node == target:
                break
            # Relax the edges to the neighbours
            neighbors = self.property_graph_store.get_neighbors(current_node)
            for neighbor in neighbors:
                if neighbor in visited:
                    continue
                # Assume every edge has weight 1
                distance = current_distance + 1
                if distance < distances[neighbor]:
                    distances[neighbor] = distance
                    previous[neighbor] = current_node
                    heapq.heappush(pq, (distance, neighbor))
        # Reconstruct the path from target back to source
        path = []
        current = target
        while current is not None:
            path.append(current)
            current = previous[current]
        path.reverse()
        return path if path[0] == source else []

# Usage example (the private methods are called directly here for brevity)
enhanced_pg = GraphAlgorithmEnhancedPropertyGraph.from_documents(documents)
pagerank_scores = enhanced_pg._pagerank_algorithm()
clustering_coeffs = enhanced_pg._clustering_coefficient()
shortest_path = enhanced_pg._shortest_path_algorithm("Node A", "Node Z")
10.2 Real-time graph updates
import asyncio
from datetime import datetime

# NOTE: this sketch assumes it is constructed inside an already-running asyncio
# event loop (asyncio.create_task() fails otherwise), and it reuses the
# illustrative _extract_graph_info()/upsert_node()/upsert_relationship() helpers
# introduced above.
class RealTimePropertyGraph(PropertyGraphIndex):
    """A property graph that supports real-time updates."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_queue = asyncio.Queue()
        self.start_update_processor()

    def start_update_processor(self):
        """Start the background update processor."""
        asyncio.create_task(self._process_updates())

    async def _process_updates(self):
        """Consume and apply queued updates."""
        while True:
            try:
                update = await self.update_queue.get()
                await self._apply_update(update)
                self.update_queue.task_done()
            except Exception as e:
                print(f"Error while processing update: {e}")

    async def add_real_time_data(self, data):
        """Queue new data for insertion."""
        update = {
            "type": "add",
            "data": data,
            "timestamp": datetime.now()
        }
        await self.update_queue.put(update)

    async def remove_real_time_data(self, node_id):
        """Queue a node for deletion."""
        update = {
            "type": "remove",
            "node_id": node_id,
            "timestamp": datetime.now()
        }
        await self.update_queue.put(update)

    async def update_real_time_data(self, node_id, properties):
        """Queue a property update for a node."""
        update = {
            "type": "update",
            "node_id": node_id,
            "properties": properties,
            "timestamp": datetime.now()
        }
        await self.update_queue.put(update)

    async def _apply_update(self, update):
        """Apply a single queued update to the graph."""
        if update["type"] == "add":
            # Insert new nodes and relationships
            nodes, relationships = self._extract_graph_info(update["data"])
            for node in nodes:
                self.property_graph_store.upsert_node(node.id, node.properties)
            for rel in relationships:
                self.property_graph_store.upsert_relationship(
                    rel.source, rel.target, rel.type, rel.properties
                )
        elif update["type"] == "remove":
            # Delete a node
            self.property_graph_store.delete_node(update["node_id"])
        elif update["type"] == "update":
            # Update a node's properties
            self.property_graph_store.upsert_node(update["node_id"], update["properties"])
Summary
PropertyGraphIndex is one of LlamaIndex's more advanced index types. By adopting the property graph model it offers powerful capabilities for handling complex data relationships: it keeps the semantic expressiveness of a knowledge graph, while rich attribute information makes the data more expressive and the queries more flexible.
Its main strengths include:
- Rich property support: both nodes and relationships carry attributes, making the data representation more precise
- Flexible querying: complex property-based queries and graph traversal are supported
- Strong expressiveness: complex networks with many node and relationship types can be modelled
- Good extensibility: incremental updates and real-time data processing are possible
In practice, PropertyGraphIndex is particularly well suited to:
- Enterprise knowledge management: integrating organizational structures, employee information, business processes, and other complex relationships
- Product knowledge graphs: managing product attributes, specifications, and relationships
- Social network analysis: user relationships, interest matching, community detection
- Financial risk control: analyzing corporate relationship networks and risk propagation paths
- Healthcare: integrating patient information, disease relationships, and treatment plans
A few points deserve attention when using it:
- Design the property schema deliberately: define node and relationship property models explicitly
- Keep the data consistent: put validation and consistency checks in place
- Optimize query performance: index frequently queried properties and tune traversal algorithms
- Plan for incremental updates: implement efficient incremental update mechanisms for dynamic data
Used well, PropertyGraphIndex lets you build smarter and more capable data applications, providing solid support for processing and analyzing complex relational data. As graph database technology and large language models continue to evolve, PropertyGraphIndex will play an important role in even more domains.