# 1. E:\AI_System\agent\diagnostic_system.py
import logging
import psutil
import time
import random
import traceback
from typing import Optional, Any, Dict, Union
from agent.base_module import UnifiedCognitiveModule
class DiagnosticSystem(UnifiedCognitiveModule):
    """System diagnostics module with guarded orchestrator wiring.

    Reports host resource usage (CPU, memory, GPU, disk) through ``psutil``
    and, once an orchestrator has been attached with ``set_orchestrator``,
    the status of the modules that orchestrator exposes.
    """

    def __init__(
        self,
        name: str = "DiagnosticSystem",
        coordinator: Optional[Any] = None,
        config: Optional[Dict] = None
    ):
        """
        Initialise the diagnostic system.

        :param name: module name, forwarded to the base class
        :param coordinator: coordinator instance, forwarded to the base class
        :param config: optional settings; ``log_level`` (default ``"INFO"``)
            and ``diagnostic_interval`` in seconds (default 30) are read here
        """
        super().__init__(name=name, coordinator=coordinator, config=config)

        # Configuration handling
        config = config or {}
        self.log_level = config.get("log_level", "INFO")
        self.diagnostic_interval = config.get("diagnostic_interval", 30)  # seconds

        # Logger is looked up by a fixed string name, not by ``name``.
        self.logger = logging.getLogger("DiagnosticSystem")
        # str() keeps a non-string level (e.g. an int from a config file) from
        # raising AttributeError; unknown levels fall back to INFO.
        log_level = getattr(logging, str(self.log_level).upper(), logging.INFO)
        self.logger.setLevel(log_level)

        # Run counters
        self.diagnostic_count = 0
        self.last_diagnostic_time = time.time()

        # Star-architecture orchestrator reference, set by set_orchestrator()
        self.orchestrator = None

        self.logger.info(f"🔧 初始化诊断系统: {name}")

    def set_orchestrator(self, orchestrator_info: Union[dict, Any]):
        """
        Store the orchestrator reference without letting a failure escape.

        :param orchestrator_info: either a dict carrying the orchestrator
            under ``"instance"`` (and optionally ``"name"``), or the
            orchestrator object itself
        """
        try:
            # Normalise both accepted argument shapes
            if isinstance(orchestrator_info, dict):
                self.orchestrator = orchestrator_info.get("instance")
                orchestrator_name = orchestrator_info.get("name", "UnknownOrchestrator")
            else:
                self.orchestrator = orchestrator_info
                orchestrator_name = getattr(orchestrator_info, "name", "UnknownOrchestrator")
            self.logger.info(f"🔗 连接到星型协调器: {orchestrator_name}")
        except Exception as e:
            self.logger.error(f"❌ 设置协调器失败: {str(e)}", exc_info=True)

    def shutdown(self) -> bool:
        """
        Shut the diagnostic system down.

        :return: True when the shutdown sequence completed, False on error
        """
        try:
            self.logger.info("🛑 正在关闭诊断系统...")
            # Release whatever disconnect() cleans up
            self.disconnect()
            self.logger.info("✅ 诊断系统已安全关闭")
            return True
        except Exception as e:
            self.logger.error(f"❌ 关闭失败: {str(e)}")
            return False

    # ============ Abstract method implementations ============

    def process(self, input_data: Any) -> Dict:
        """
        Run a full diagnostic in response to arbitrary input.

        :param input_data: input payload; it is logged and echoed back
        :return: result dict with ``status`` ``"success"`` or ``"error"``
        """
        try:
            self.logger.info(f"🔍 处理诊断输入: {input_data}")
            # check_modules() records the run (counter + timestamp) itself;
            # recording it here as well counted every request twice.
            diagnostic_results = self.check_modules()
            return {
                "status": "success",
                "message": f"诊断系统 '{self.name}' 完成处理",
                "input": input_data,
                "results": diagnostic_results,
                "timestamp": time.time()
            }
        except Exception as e:
            self.logger.error(f"❌ 处理输入失败: {str(e)}")
            return {
                "status": "error",
                "error": str(e),
                "traceback": traceback.format_exc()
            }

    def process_command(self, command: str) -> Dict:
        """
        Execute one diagnostic command.

        Recognised commands: ``full_diagnostic``, ``resource_check``,
        ``health_check`` and ``system_status``.

        :param command: command string
        :return: result dict; unknown commands yield ``status`` ``"error"``
            together with the list of valid commands
        """
        try:
            self.logger.info(f"⚙️ 处理诊断命令: {command}")

            if command == "full_diagnostic":
                # check_modules() records the run itself, so it is not
                # recorded here (that would count the command twice).
                results = self.check_modules()
                return {
                    "status": "success",
                    "command": command,
                    "results": results
                }

            # Every other command, including unknown ones, counts once.
            self._record_diagnostic()

            if command == "resource_check":
                return {
                    "status": "success",
                    "command": command,
                    "resources": self._check_resources()
                }
            if command == "health_check":
                return {
                    "status": "success",
                    "command": command,
                    "health_status": self.is_healthy()
                }
            if command == "system_status":
                # Ask the orchestrator for its status when one is attached
                if self.orchestrator and hasattr(self.orchestrator, "get_system_status"):
                    status = self.orchestrator.get_system_status()
                else:
                    status = {"error": "协调器不可用"}
                return {
                    "status": "success",
                    "command": command,
                    "system_status": status
                }
            return {
                "status": "error",
                "command": command,
                "message": "未知命令",
                "valid_commands": ["full_diagnostic", "resource_check", "health_check", "system_status"]
            }
        except Exception as e:
            self.logger.error(f"❌ 处理命令失败: {str(e)}")
            return {
                "status": "error",
                "command": command,
                "error": str(e)
            }

    def connect(self) -> bool:
        """
        Prepare the monitoring back ends.

        :return: True on success, False if initialisation raised
        """
        self.logger.info("🔌 正在连接诊断系统环境...")
        try:
            # First non-blocking call primes psutil's CPU sampling baseline
            psutil.cpu_percent(interval=None)

            # GPU monitoring is optional: a missing or failing gpustat is
            # logged but does not fail the connection.
            try:
                import gpustat
                gpustat.new_query()
                self.logger.info("✅ GPU监控已初始化")
            except ImportError:
                self.logger.warning("⚠️ gpustat未安装,GPU监控不可用")
            except Exception:
                self.logger.exception("❌ GPU监控初始化失败")

            self.logger.info("✅ 诊断系统连接成功")
            return True
        except Exception as e:
            self.logger.error(f"❌ 连接失败: {str(e)}")
            return False

    def disconnect(self) -> None:
        """
        Release monitoring resources; errors are logged, never raised.
        """
        try:
            self.logger.info("🔌 正在断开诊断系统连接...")
            # Drop the GPU monitor handle if one was ever attached
            if hasattr(self, '_gpu_monitor'):
                del self._gpu_monitor
            self.logger.info("✅ 诊断系统已安全断开")
        except Exception as e:
            self.logger.error(f"断开连接时出错: {str(e)}")

    # ============ Diagnostic-specific functionality ============

    def _record_diagnostic(self) -> None:
        """Count one diagnostic run and stamp the time it happened."""
        self.diagnostic_count += 1
        self.last_diagnostic_time = time.time()

    def is_healthy(self) -> bool:
        """
        Check whether host resource usage is within the accepted range.

        CPU usage is sampled over a one-second interval, so this call blocks
        for about that long.

        :return: True when CPU usage is below 90% and memory usage below 85%;
            False otherwise or when the check itself fails
        """
        try:
            cpu_usage = psutil.cpu_percent(interval=1)
            mem_usage = psutil.virtual_memory().percent

            self.logger.debug(f"CPU使用率: {cpu_usage}%, 内存使用率: {mem_usage}%")

            # Threshold decision
            healthy = cpu_usage < 90 and mem_usage < 85
            status = "健康 ✅" if healthy else "警告 ⚠️"
            self.logger.info(f"系统健康检查: {status} (CPU: {cpu_usage}%, 内存: {mem_usage}%)")
            return healthy
        except Exception as e:
            self.logger.error(f"健康检查失败: {e}", exc_info=True)
            return False

    def get_status(self) -> dict:
        """
        Describe the diagnostic system itself.

        :return: dict with name, run counters, current load and the attached
            orchestrator's name, or ``{"error": ...}`` on failure
        """
        try:
            return {
                "name": self.name,
                "status": "运行中",
                "last_check": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.last_diagnostic_time)),
                "diagnostic_count": self.diagnostic_count,
                "system_load": f"{psutil.cpu_percent()}% CPU, {psutil.virtual_memory().percent}% RAM",
                "orchestrator": getattr(self.orchestrator, "name", "未连接") if self.orchestrator else "未连接"
            }
        except Exception as e:
            self.logger.error(f"获取状态失败: {str(e)}")
            return {"error": str(e)}

    def check_modules(self) -> Dict[str, dict]:
        """
        Collect the status of this module, host resources and, when an
        orchestrator is attached, the modules it manages.

        A completed check is recorded in ``diagnostic_count`` and
        ``last_diagnostic_time``.

        :return: mapping of component name to status dict, or
            ``{"error": ...}`` if the check failed outright
        """
        try:
            results = {
                "diagnostic_system": self.get_status(),
                "system_resources": self._check_resources()
            }

            # Orchestrator-backed checks only run once one is attached
            if self.orchestrator:
                try:
                    results["cognitive_system"] = self._check_cognitive()
                    results["environment_interface"] = self._check_environment()
                    results["affective_system"] = self._check_affective()

                    # Status of every module the orchestrator manages
                    if hasattr(self.orchestrator, "get_modules"):
                        modules = self.orchestrator.get_modules()
                        for module_name, module in modules.items():
                            if module and hasattr(module, "get_status"):
                                results[f"{module_name}_module"] = module.get_status()
                except Exception as e:
                    # Keep the partial results and note what went wrong
                    self.logger.error(f"协调器模块检查失败: {str(e)}")
                    results["coordinator_error"] = str(e)

            self._record_diagnostic()

            self.logger.info("✅ 完成诊断检查")
            return results
        except Exception as e:
            self.logger.exception("❌ 模块诊断失败")
            return {"error": str(e)}

    def run_periodic_diagnostics(self):
        """
        Run ``check_modules`` in a loop, sleeping ``diagnostic_interval``
        seconds between rounds, for as long as ``self.active`` is truthy.

        NOTE(review): ``active`` is not set in this class; presumably it is
        provided by the base class — confirm.
        """
        while self.active:
            try:
                self.logger.info("🔄 运行周期性诊断检查...")
                results = self.check_modules()
                self.logger.debug(f"诊断结果: {results}")

                # Forward the results when the orchestrator accepts them
                if self.orchestrator and hasattr(self.orchestrator, "report_diagnostic"):
                    self.orchestrator.report_diagnostic(results)
            except Exception as e:
                self.logger.error(f"❌ 周期性诊断失败: {str(e)}")
            time.sleep(self.diagnostic_interval)

    def _check_cognitive(self) -> dict:
        """
        Check the cognitive module exposed by the orchestrator.

        :return: status dict; when no module is reachable the dict is a
            simulated placeholder (fixed version string, random heartbeat)
        """
        try:
            if self.orchestrator and hasattr(self.orchestrator, "get_module"):
                cognitive = self.orchestrator.get_module("cognitive")
                if cognitive:
                    return {
                        "status": "✅ 已连接",
                        "version": getattr(cognitive, "version", "未知"),
                        "last_activity": getattr(cognitive, "last_activity", "未知")
                    }

            # Fallback: simulated data, not a real measurement
            return {
                "status": "⚠️ 未连接",
                "version": "1.2.5",
                "last_heartbeat": time.time() - random.randint(1, 10)
            }
        except Exception as e:
            return {"status": "❌ 异常", "error": str(e)}

    def _check_environment(self) -> dict:
        """
        Check the environment-interface module exposed by the orchestrator.

        :return: status dict; when no module is reachable the dict is a
            simulated placeholder with a random ``last_ping``
        """
        try:
            if self.orchestrator and hasattr(self.orchestrator, "get_module"):
                env = self.orchestrator.get_module("environment")
                if env:
                    return {
                        "status": "✅ 已连接",
                        "sensors": getattr(env, "sensors", []),
                        "last_update": getattr(env, "last_update", "未知")
                    }

            # Fallback: simulated data, not a real measurement
            return {
                "status": "⚠️ 未连接",
                "connection": "inactive",
                "last_ping": time.time() - random.randint(1, 5)
            }
        except Exception as e:
            return {"status": "❌ 异常", "error": str(e)}

    def _check_affective(self) -> dict:
        """
        Check the affective module exposed by the orchestrator.

        :return: status dict; when no module is reachable the dict is a
            simulated placeholder with a random emotion and intensity
        """
        try:
            if self.orchestrator and hasattr(self.orchestrator, "get_module"):
                affective = self.orchestrator.get_module("affective")
                if affective:
                    return {
                        "status": "✅ 已连接",
                        "state": affective.get_state() if hasattr(affective, "get_state") else "未知"
                    }

            # Fallback: simulated data, not a real measurement
            emotions = ["neutral", "happy", "sad", "angry", "excited"]
            return {
                "status": "⚠️ 未连接",
                "emotion_state": random.choice(emotions),
                "intensity": random.randint(1, 100)
            }
        except Exception as e:
            return {"status": "❌ 异常", "error": str(e)}

    def _check_resources(self) -> dict:
        """
        Gather CPU, memory, GPU and disk usage.

        :return: dict keyed ``cpu`` / ``memory`` / ``gpu`` / ``disk``, or
            ``{"error": ...}`` on failure
        """
        try:
            return {
                "cpu": self._get_cpu_status(),
                "memory": self._get_memory_status(),
                "gpu": self._get_gpu_status(),
                "disk": self._get_disk_usage()
            }
        except Exception as e:
            return {"error": f"资源检查失败: {str(e)}"}

    def _get_cpu_status(self) -> dict:
        """
        Report CPU usage, core/thread counts and current frequency.

        Usage is sampled over a one-second interval, so the call blocks for
        about that long.

        :return: CPU status dict, or ``{"error": ...}`` on failure
        """
        try:
            # cpu_freq() can return None when the platform exposes no
            # frequency data; report that field as unknown instead of
            # discarding the whole CPU report.
            freq = psutil.cpu_freq()
            return {
                "usage": f"{psutil.cpu_percent(interval=1)}%",
                "cores": psutil.cpu_count(logical=False),
                "threads": psutil.cpu_count(logical=True),
                "freq": f"{freq.current:.2f} MHz" if freq is not None else "未知"
            }
        except Exception as e:
            return {"error": f"CPU检测失败: {str(e)}"}

    def _get_memory_status(self) -> dict:
        """
        Report virtual-memory totals in whole GB plus the usage percentage.

        :return: memory status dict, or ``{"error": ...}`` on failure
        """
        try:
            mem = psutil.virtual_memory()
            return {
                "total": f"{mem.total // (1024 ** 3)} GB",
                "used": f"{mem.used // (1024 ** 3)} GB",
                "free": f"{mem.free // (1024 ** 3)} GB",
                "percent": f"{mem.percent}%"
            }
        except Exception as e:
            return {"error": f"内存检测失败: {str(e)}"}

    def _get_gpu_status(self) -> Any:
        """
        Report per-GPU statistics through ``gpustat``.

        :return: list with one dict per GPU; ``{"warning": ...}`` when
            gpustat is not installed; ``{"error": ...}`` on any other failure
        """
        try:
            import gpustat
            stats = gpustat.new_query()
            return [
                {
                    "id": gpu.index,
                    "name": gpu.name,
                    "utilization": f"{gpu.utilization}%",
                    "memory": f"{gpu.memory_used}/{gpu.memory_total} MB",
                    "temperature": f"{gpu.temperature}°C",
                    "power": f"{gpu.power_draw}W"
                }
                for gpu in stats.gpus
            ]
        except ImportError:
            return {"warning": "gpustat 未安装"}
        except Exception as e:
            return {"error": f"GPU检测失败: {str(e)}"}

    def _get_disk_usage(self) -> Union[list, Dict[str, Any]]:
        """
        Report usage for every partition psutil lists.

        :return: list with one dict per partition (a partition whose usage
            cannot be read gets ``device`` and ``error`` only), or
            ``{"error": ...}`` if the partitions cannot be enumerated
        """
        try:
            partitions = []
            for partition in psutil.disk_partitions():
                try:
                    usage = psutil.disk_usage(partition.mountpoint)
                    partitions.append({
                        "device": partition.device,
                        "mountpoint": partition.mountpoint,
                        "fstype": partition.fstype,
                        "total": f"{usage.total // (1024 ** 3)} GB",
                        "used": f"{usage.used // (1024 ** 3)} GB",
                        "free": f"{usage.free // (1024 ** 3)} GB",
                        "percent": f"{usage.percent}%"
                    })
                except Exception as e:
                    # One unreadable partition must not hide the others
                    partitions.append({
                        "device": partition.device,
                        "error": str(e)
                    })
            return partitions
        except Exception as e:
            return {"error": f"磁盘检测失败: {str(e)}"}
# 2. diagnostic_system.py
class DiagnosticSystem:
    # ... existing code ...
    # NOTE(review): this snippet looks like a method meant to be added to the
    # main DiagnosticSystem class; as a separate class statement it rebinds
    # the name — confirm and merge.

    def model_health_check(self, model_manager):
        """
        Probe every loaded model with a minimal ``generate`` call.

        :param model_manager: object exposing a ``loaded_models`` mapping of
            name to model; a missing manager or attribute yields ``{}``
        :return: dict mapping model name to ``"healthy"``,
            ``"no_generate_method"`` or ``"error: <message>"``
        """
        # torch was referenced without ever being imported, which turned
        # every probe into "error: name 'torch' is not defined".
        import torch

        loaded_models = getattr(model_manager, "loaded_models", None) or {}
        results = {}
        for name, model in loaded_models.items():
            try:
                # Check that the model responds at all
                if hasattr(model, 'generate'):
                    test_input = torch.tensor([[0]])  # minimal probe input
                    model.generate(test_input)
                    results[name] = "healthy"
                else:
                    results[name] = "no_generate_method"
            except Exception as e:
                results[name] = f"error: {str(e)}"
        return results