Abstract
Monitoring and logging are essential to keeping modern software applications running reliably. LangBot, an enterprise-grade chatbot platform, ships with a comprehensive monitoring and logging system that tracks application state, performance metrics, and errors in real time. This article examines the architecture, core components, data-collection mechanisms, and best practices of LangBot's monitoring and logging system, helping developers and operators understand and make better use of this important capability.
Main Text
1. Overview of the Monitoring and Logging System
LangBot's monitoring and logging system has the following characteristics:
- Multi-level monitoring: covers the application, system, and business layers
- Real-time logging: supports real-time log collection, analysis, and querying
- Performance metrics: collects key performance indicators (KPIs) for performance analysis
- Error tracking: provides complete error tracking and alerting
- Extensibility: integrates with third-party monitoring and logging systems
- Visualization: offers intuitive dashboards and reports
2. System Architecture
LangBot's monitoring and logging system is organized around three core components - a log manager, a metrics collector, and an error tracker - whose data is exposed through a monitoring API and dashboard. The following sections walk through each component in turn.
3. Core Components
3.1 Log Manager
import asyncio
import json
import logging
import os
from datetime import datetime
from typing import Any, Dict, Optional

import aiofiles

class LogManager:
    """Log manager."""

    def __init__(self, ap: "app.Application"):
        self.ap = ap
        self.logger = logging.getLogger("langbot")
        self.log_cache = []          # in-memory buffer of recent log entries
        self.cache_size = 1000       # maximum number of cached entries
        self.log_file = "logs/langbot.log"
        self.log_level = "INFO"
    async def initialize(self):
        """Initialize the log manager."""
        # Read the logging configuration
        log_config = self.ap.instance_config.data.get("logging", {})
        self.log_level = log_config.get("level", "INFO")

        # Choose the log formatter
        log_format = log_config.get("format", "standard")
        if log_format == "json":
            formatter = JsonFormatter()
        else:
            formatter = StandardFormatter()

        # Make sure the log directory exists
        os.makedirs(os.path.dirname(self.log_file), exist_ok=True)

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        console_handler.setLevel(getattr(logging, self.log_level))

        # File handler
        file_handler = logging.FileHandler(self.log_file)
        file_handler.setFormatter(formatter)
        file_handler.setLevel(getattr(logging, self.log_level))

        # Attach the handlers to the logger
        self.logger.addHandler(console_handler)
        self.logger.addHandler(file_handler)
        self.logger.setLevel(getattr(logging, self.log_level))

        # Start the background log-writer task
        self.ap.task_mgr.create_task(
            self._log_writer_worker(),
            name="log-writer"
        )
    def debug(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Log a debug message."""
        self._log(logging.DEBUG, message, extra)

    def info(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Log an info message."""
        self._log(logging.INFO, message, extra)

    def warning(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Log a warning message."""
        self._log(logging.WARNING, message, extra)

    def error(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Log an error message."""
        self._log(logging.ERROR, message, extra)

    def critical(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Log a critical message."""
        self._log(logging.CRITICAL, message, extra)

    def _log(self, level: int, message: str, extra: Optional[Dict[str, Any]] = None):
        """Record a log entry."""
        # Build the entry with a timestamp and any extra fields
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "level": logging.getLevelName(level),
            "message": message,
            "extra": extra or {}
        }

        # Append to the in-memory cache, dropping the oldest entry when full
        self.log_cache.append(log_entry)
        if len(self.log_cache) > self.cache_size:
            self.log_cache.pop(0)

        # Forward to the standard logging system; the extra fields are attached
        # under a single "extra" attribute so JsonFormatter can pick them up
        self.logger.log(level, message, extra={"extra": extra} if extra else None)
    async def _log_writer_worker(self):
        """Background worker that flushes cached entries to disk as JSON lines
        (in addition to whatever the standard handlers write)."""
        while True:
            try:
                if self.log_cache:
                    # Write the cached entries in one batch
                    logs_to_write = self.log_cache.copy()
                    self.log_cache.clear()
                    async with aiofiles.open(self.log_file, mode='a') as f:
                        for log_entry in logs_to_write:
                            await f.write(json.dumps(log_entry, ensure_ascii=False) + '\n')
                # Check the cache once per second
                await asyncio.sleep(1)
            except Exception as e:
                # A write failure must never take down the main program;
                # back off briefly to avoid a tight error loop
                print(f"Log write error: {e}")
                await asyncio.sleep(1)
# Log formatters
class StandardFormatter(logging.Formatter):
    """Plain-text log formatter."""

    def format(self, record):
        timestamp = datetime.fromtimestamp(record.created).strftime('%Y-%m-%d %H:%M:%S')
        return f"[{timestamp}] [{record.levelname}] {record.name}: {record.getMessage()}"

class JsonFormatter(logging.Formatter):
    """JSON log formatter."""

    def format(self, record):
        log_entry = {
            "timestamp": datetime.fromtimestamp(record.created).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno
        }
        # Include extra fields when they were attached by LogManager._log
        if hasattr(record, 'extra') and record.extra:
            log_entry["extra"] = record.extra
        return json.dumps(log_entry, ensure_ascii=False)
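With the formatters and the writer task in place, the sketch below shows how business code might call the manager. The attribute name ap.logger and the helper generate_reply are assumptions made for illustration only; structured context goes into extra rather than being interpolated into the message string.

# Hypothetical call site; `ap.logger` is assumed to be the LogManager
# instance created during application startup.
async def handle_message(ap, session_id: str, text: str):
    ap.logger.info(
        "message received",
        extra={"session_id": session_id, "length": len(text)},
    )
    try:
        reply = await generate_reply(text)   # hypothetical business call
    except Exception:
        ap.logger.error("reply generation failed", extra={"session_id": session_id})
        raise
    ap.logger.debug("reply generated", extra={"session_id": session_id})
    return reply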
3.2 Metrics Collector
import asyncio
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List

@dataclass
class Metric:
    """A single metric sample."""
    name: str
    value: float
    timestamp: float
    tags: Dict[str, str]
    type: str  # counter, gauge, histogram, summary

class MetricsCollector:
    """Metrics collector."""

    def __init__(self, ap: "app.Application"):
        self.ap = ap
        self.metrics: Dict[str, List[Metric]] = defaultdict(list)
        self.max_metrics_per_name = 1000   # samples retained per metric name
        self.collect_interval = 60         # collect once every 60 seconds

    async def initialize(self):
        """Initialize the metrics collector."""
        # Start the periodic collection task
        self.ap.task_mgr.create_task(
            self._metrics_collector_worker(),
            name="metrics-collector"
        )
    def increment_counter(self, name: str, value: float = 1, tags: Dict[str, str] = None):
        """
        Increment a counter.

        Args:
            name: metric name
            value: amount to add
            tags: metric tags
        """
        metric = Metric(
            name=name,
            value=value,
            timestamp=time.time(),
            tags=tags or {},
            type="counter"
        )
        self._store_metric(metric)

    def set_gauge(self, name: str, value: float, tags: Dict[str, str] = None):
        """
        Set a gauge value.

        Args:
            name: metric name
            value: gauge value
            tags: metric tags
        """
        metric = Metric(
            name=name,
            value=value,
            timestamp=time.time(),
            tags=tags or {},
            type="gauge"
        )
        self._store_metric(metric)

    def observe_histogram(self, name: str, value: float, tags: Dict[str, str] = None):
        """
        Record a histogram observation.

        Args:
            name: metric name
            value: observed value
            tags: metric tags
        """
        metric = Metric(
            name=name,
            value=value,
            timestamp=time.time(),
            tags=tags or {},
            type="histogram"
        )
        self._store_metric(metric)

    def _store_metric(self, metric: Metric):
        """Store a metric sample."""
        self.metrics[metric.name].append(metric)
        # Cap the number of samples kept per metric name
        if len(self.metrics[metric.name]) > self.max_metrics_per_name:
            # Keep only the most recent samples
            self.metrics[metric.name] = self.metrics[metric.name][-self.max_metrics_per_name:]
    async def _metrics_collector_worker(self):
        """Background worker that collects metrics periodically."""
        while True:
            try:
                # Collect system-level metrics
                await self._collect_system_metrics()
                # Collect application-level metrics
                await self._collect_application_metrics()
                # Wait for the next collection cycle
                await asyncio.sleep(self.collect_interval)
            except Exception as e:
                self.ap.logger.error(f"Metrics collection error: {e}")
                await asyncio.sleep(self.collect_interval)

    async def _collect_system_metrics(self):
        """Collect system metrics."""
        try:
            import psutil

            # CPU usage (note: sampling with interval=1 blocks for about one second)
            cpu_percent = psutil.cpu_percent(interval=1)
            self.set_gauge("system.cpu.percent", cpu_percent)

            # Memory usage
            memory = psutil.virtual_memory()
            self.set_gauge("system.memory.percent", memory.percent)
            self.set_gauge("system.memory.available", memory.available)

            # Disk usage
            disk = psutil.disk_usage("/")
            self.set_gauge("system.disk.percent", disk.percent)
            self.set_gauge("system.disk.free", disk.free)
        except ImportError:
            self.ap.logger.warning("psutil is not installed; system metrics cannot be collected")
        except Exception as e:
            self.ap.logger.error(f"Failed to collect system metrics: {e}")

    async def _collect_application_metrics(self):
        """Collect application metrics."""
        # Number of active tasks
        active_tasks = len(self.ap.task_mgr.tasks)
        self.set_gauge("app.tasks.active", active_tasks)

        # Number of database connections
        # (depends on the actual connection-pool implementation)
        # db_connections = self.ap.database_mgr.get_active_connections()
        # self.set_gauge("app.db.connections", db_connections)

        # Number of online bots
        # active_bots = len(self.ap.platform_mgr.adapters)
        # self.set_gauge("app.bots.active", active_bots)
# Decorator that records a function's execution time as a histogram.
# It assumes a module-level `metrics_collector` bound to the application's
# MetricsCollector instance.
import functools

def timer(metric_name: str, tags: Dict[str, str] = None):
    """
    Timing decorator.

    Args:
        metric_name: metric name
        tags: metric tags
    """
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            start_time = time.time()
            try:
                return await func(*args, **kwargs)
            finally:
                duration = time.time() - start_time
                metrics_collector.observe_histogram(
                    metric_name,
                    duration,
                    tags
                )
        return wrapper
    return decorator
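A brief sketch of how the collector and the timer decorator might be used from a handler follows; metrics_collector is assumed to be the instance created at startup, and process_query is a hypothetical function used only for illustration.

# Hypothetical handler instrumented with the decorator and a counter.
@timer("app.query.duration", tags={"handler": "process_query"})
async def process_query(text: str) -> str:
    metrics_collector.increment_counter("app.query.total")
    # ... actual business logic would go here ...
    return text.upper()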
3.3 Error Tracker
import traceback
import uuid
from datetime import datetime
from typing import Any, Dict, Optional

class ErrorTracker:
    """Error tracker."""

    def __init__(self, ap: "app.Application"):
        self.ap = ap
        self.errors = []          # most recent error records
        self.max_errors = 1000    # maximum number of records kept in memory

    async def initialize(self):
        """Initialize the error tracker."""
        pass
    def track_exception(self, exception: Exception, context: Optional[Dict[str, Any]] = None):
        """
        Track an exception.

        Args:
            exception: the exception object
            context: contextual information
        """
        error_id = str(uuid.uuid4())
        error_info = {
            "id": error_id,
            "timestamp": datetime.now().isoformat(),
            "type": type(exception).__name__,
            "message": str(exception),
            "traceback": traceback.format_exc(),
            "context": context or {}
        }

        # Append to the error list, dropping the oldest record when full
        self.errors.append(error_info)
        if len(self.errors) > self.max_errors:
            self.errors.pop(0)

        # Write an error log entry
        self.ap.logger.error(
            f"Error ID: {error_id}, type: {type(exception).__name__}, message: {str(exception)}",
            extra={"error_id": error_id, "context": context}
        )

        # Increment the error counter
        self.ap.metrics_collector.increment_counter("app.errors.total")

        # For fatal errors, send an alert
        if isinstance(exception, (SystemExit, KeyboardInterrupt)):
            self._send_alert(error_info)
    def get_errors(self, limit: int = 50) -> list:
        """
        Get the most recent errors.

        Args:
            limit: maximum number of records

        Returns:
            A list of error records.
        """
        return self.errors[-limit:]

    def get_error_by_id(self, error_id: str) -> Optional[Dict[str, Any]]:
        """
        Get error details by ID.

        Args:
            error_id: the error ID

        Returns:
            The error record, or None if not found.
        """
        for error in self.errors:
            if error["id"] == error_id:
                return error
        return None
    def _send_alert(self, error_info: Dict[str, Any]):
        """
        Send an alert.

        Args:
            error_info: the error record
        """
        # Email, SMS, WeChat and other channels can be integrated here
        self.ap.logger.critical(f"Critical error alert: {error_info['message']}")

        # If an alerting webhook is configured, post a notification
        alert_config = self.ap.instance_config.data.get("alerting", {})
        webhook_url = alert_config.get("webhook_url")
        if webhook_url:
            # _send_webhook_alert is a coroutine, so schedule it as a task
            # instead of calling it directly from this synchronous method
            self.ap.task_mgr.create_task(
                self._send_webhook_alert(webhook_url, error_info),
                name="error-webhook-alert"
            )

    async def _send_webhook_alert(self, webhook_url: str, error_info: Dict[str, Any]):
        """
        Send a webhook alert.

        Args:
            webhook_url: the webhook URL
            error_info: the error record
        """
        try:
            import aiohttp

            async with aiohttp.ClientSession() as session:
                alert_data = {
                    "error_id": error_info["id"],
                    "timestamp": error_info["timestamp"],
                    "type": error_info["type"],
                    "message": error_info["message"],
                    "context": error_info["context"]
                }
                await session.post(webhook_url, json=alert_data)
        except Exception as e:
            self.ap.logger.error(f"Failed to send webhook alert: {e}")
4. Monitoring Dashboard
4.1 Real-Time Monitoring API
import asyncio
import json
from datetime import datetime

from fastapi import APIRouter, WebSocket

# `app` below refers to the global Application instance that owns the
# log manager, metrics collector, and error tracker.
router = APIRouter(prefix="/api/v1/monitoring", tags=["monitoring"])

@router.get("/metrics")
async def get_metrics():
    """Return the latest value of every collected metric."""
    metrics_data = {}
    for name, metrics in app.metrics_collector.metrics.items():
        # Take the most recent sample for each metric name
        if metrics:
            latest_metric = metrics[-1]
            metrics_data[name] = {
                "value": latest_metric.value,
                "timestamp": latest_metric.timestamp,
                "tags": latest_metric.tags
            }
    return {
        "success": True,
        "data": metrics_data
    }
@router.get("/logs")
async def get_logs(limit: int = 100):
"""获取日志数据"""
logs = app.log_manager.log_cache[-limit:] if app.log_manager.log_cache else []
return {
"success": True,
"data": logs
}
@router.get("/errors")
async def get_errors(limit: int = 50):
"""获取错误数据"""
errors = app.error_tracker.get_errors(limit)
return {
"success": True,
"data": errors
}
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""WebSocket实时监控"""
await websocket.accept()
try:
while True:
# 收集实时数据
data = {
"timestamp": datetime.now().isoformat(),
"metrics": {},
"logs": app.log_manager.log_cache[-10:] if app.log_manager.log_cache else [],
"errors": app.error_tracker.get_errors(5)
}
# 添加指标数据
for name, metrics in app.metrics_collector.metrics.items():
if metrics:
latest_metric = metrics[-1]
data["metrics"][name] = {
"value": latest_metric.value,
"type": latest_metric.type
}
# 发送数据
await websocket.send_text(json.dumps(data, ensure_ascii=False))
# 每2秒发送一次更新
await asyncio.sleep(2)
except Exception as e:
app.logger.error(f"WebSocket连接错误: {e}")
finally:
await websocket.close()
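A small polling client is a convenient way to sanity-check these endpoints. The sketch below uses aiohttp and assumes the API is reachable at http://localhost:5300; the port is an assumption, so adjust the base URL to your deployment.

# Minimal polling client for the monitoring API; the base URL is an assumption.
import asyncio
import aiohttp

BASE_URL = "http://localhost:5300/api/v1/monitoring"

async def poll_once():
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{BASE_URL}/metrics") as resp:
            metrics = await resp.json()
        async with session.get(f"{BASE_URL}/errors", params={"limit": "5"}) as resp:
            errors = await resp.json()
    print("metrics:", metrics["data"])
    print("recent errors:", len(errors["data"]))

if __name__ == "__main__":
    asyncio.run(poll_once())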
4.2 Front-End Monitoring Dashboard
// React monitoring dashboard component
import React, { useState, useEffect } from 'react';
import { useWebSocket } from '../hooks/useWebSocket';
interface MetricData {
  value: number;
  type: string;
}

interface LogEntry {
  timestamp: string;
  level: string;
  message: string;
  extra?: Record<string, any>;
}

interface ErrorEntry {
  id: string;
  timestamp: string;
  type: string;
  message: string;
  context: Record<string, any>;
}
export function MonitoringDashboard() {
  const [metrics, setMetrics] = useState<Record<string, MetricData>>({});
  const [logs, setLogs] = useState<LogEntry[]>([]);
  const [errors, setErrors] = useState<ErrorEntry[]>([]);
  const [connected, setConnected] = useState(false);

  const wsUrl = `${window.location.protocol === 'https:' ? 'wss:' : 'ws:'}//${window.location.host}/api/v1/monitoring/ws`;
  const { data, error, connect } = useWebSocket(wsUrl);

  useEffect(() => {
    connect();
  }, [connect]);

  useEffect(() => {
    if (data) {
      setConnected(true);
      const parsedData = JSON.parse(data);
      setMetrics(parsedData.metrics || {});
      setLogs(prev => {
        const newLogs = [...prev, ...(parsedData.logs || [])];
        return newLogs.slice(-100); // keep only the latest 100 log entries
      });
      setErrors(parsedData.errors || []);
    }
  }, [data]);

  useEffect(() => {
    if (error) {
      setConnected(false);
      console.error('WebSocket error:', error);
    }
  }, [error]);

  return (
    <div className="monitoring-dashboard">
      <header>
        <h1>System Monitoring Dashboard</h1>
        <div className={`connection-status ${connected ? 'connected' : 'disconnected'}`}>
          {connected ? 'Connected' : 'Disconnected'}
        </div>
      </header>
      <div className="dashboard-grid">
        {/* Metrics panel */}
        <div className="panel metrics-panel">
          <h2>System Metrics</h2>
          <div className="metrics-grid">
            {Object.entries(metrics).map(([name, metric]) => (
              <div key={name} className="metric-card">
                <div className="metric-name">{name}</div>
                <div className="metric-value">{metric.value.toFixed(2)}</div>
                <div className="metric-type">{metric.type}</div>
              </div>
            ))}
          </div>
        </div>
        {/* Logs panel */}
        <div className="panel logs-panel">
          <h2>Live Logs</h2>
          <div className="logs-list">
            {logs.map((log, index) => (
              <div key={index} className={`log-entry level-${log.level.toLowerCase()}`}>
                <span className="timestamp">[{log.timestamp}]</span>
                <span className="level">[{log.level}]</span>
                <span className="message">{log.message}</span>
                {log.extra && (
                  <span className="extra">Extra: {JSON.stringify(log.extra)}</span>
                )}
              </div>
            ))}
          </div>
        </div>
        {/* Errors panel */}
        <div className="panel errors-panel">
          <h2>Recent Errors</h2>
          <div className="errors-list">
            {errors.map(error => (
              <div key={error.id} className="error-entry">
                <div className="error-header">
                  <span className="error-id">ID: {error.id}</span>
                  <span className="error-time">{error.timestamp}</span>
                </div>
                <div className="error-type">Type: {error.type}</div>
                <div className="error-message">Message: {error.message}</div>
                {Object.keys(error.context).length > 0 && (
                  <div className="error-context">
                    Context: {JSON.stringify(error.context)}
                  </div>
                )}
              </div>
            ))}
          </div>
        </div>
      </div>
    </div>
  );
}
// WebSocket hook (e.g. in ../hooks/useWebSocket.ts)
import { useState, useEffect, useCallback } from 'react';

export function useWebSocket(url: string) {
  const [data, setData] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [ws, setWs] = useState<WebSocket | null>(null);

  const connect = useCallback(() => {
    try {
      const websocket = new WebSocket(url);
      websocket.onopen = () => {
        console.log('WebSocket connection established');
      };
      websocket.onmessage = (event) => {
        setData(event.data);
      };
      websocket.onerror = (event) => {
        setError('WebSocket connection error');
      };
      websocket.onclose = () => {
        setError('WebSocket connection closed');
      };
      setWs(websocket);
    } catch (err) {
      setError('Failed to open WebSocket connection');
    }
  }, [url]);

  const disconnect = useCallback(() => {
    if (ws) {
      ws.close();
      setWs(null);
    }
  }, [ws]);

  useEffect(() => {
    return () => {
      disconnect();
    };
  }, [disconnect]);

  return { data, error, connect, disconnect };
}
5. Third-Party Integrations
5.1 Prometheus Integration
# Export metrics in the Prometheus format
from fastapi import Response
from prometheus_client import Counter, Gauge, Histogram, generate_latest

# Prometheus metric definitions
REQUEST_COUNT = Counter('http_requests_total', 'Total HTTP Requests', ['method', 'endpoint', 'status'])
REQUEST_DURATION = Histogram('http_request_duration_seconds', 'HTTP request duration in seconds', ['method', 'endpoint'])
ACTIVE_CONNECTIONS = Gauge('active_connections', 'Number of active connections')
CPU_USAGE = Gauge('cpu_usage_percent', 'CPU usage percentage')
MEMORY_USAGE = Gauge('memory_usage_percent', 'Memory usage percentage')

# Reuses the monitoring router defined in section 4.1
@router.get("/metrics/prometheus")
async def prometheus_metrics():
    """Prometheus metrics endpoint."""
    # Refresh the system gauges before exporting
    try:
        import psutil
        CPU_USAGE.set(psutil.cpu_percent())
        MEMORY_USAGE.set(psutil.virtual_memory().percent)
    except Exception:
        pass
    # Return the metrics in the Prometheus text exposition format
    return Response(generate_latest(), media_type="text/plain")
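The request counter and histogram defined above only carry data if something updates them. One option, sketched here under the assumption that the HTTP layer is a FastAPI application exposed as app, is a middleware that labels each request by method, path, and status code.

# Hypothetical middleware wiring HTTP traffic into the Prometheus metrics;
# `app` is assumed to be the FastAPI application object.
import time
from fastapi import Request

@app.middleware("http")
async def prometheus_middleware(request: Request, call_next):
    start = time.time()
    response = await call_next(request)
    duration = time.time() - start
    REQUEST_COUNT.labels(
        method=request.method,
        endpoint=request.url.path,
        status=str(response.status_code),
    ).inc()
    REQUEST_DURATION.labels(
        method=request.method,
        endpoint=request.url.path,
    ).observe(duration)
    return response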
5.2 Log Collection Integration
# ELK integration example
import logging
from datetime import datetime
from logging.handlers import HTTPHandler

class ELKHandler(HTTPHandler):
    """ELK log handler.

    Note: the standard HTTPHandler posts the mapped record as form-encoded
    data, so in practice this is best pointed at a Logstash HTTP input or a
    small ingestion endpoint rather than directly at Elasticsearch.
    """

    def __init__(self, host, url, method="POST"):
        super().__init__(host, url, method)

    def mapLogRecord(self, record):
        """Map a log record to an ELK-style document."""
        return {
            "@timestamp": datetime.fromtimestamp(record.created).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
            "extra": getattr(record, 'extra', {})
        }

# Wire up the ELK integration; `app` refers to the global Application instance
def setup_elk_logging():
    """Enable ELK log shipping when it is configured."""
    elk_config = app.instance_config.data.get("logging", {}).get("elk", {})
    if elk_config.get("enabled", False):
        elk_handler = ELKHandler(
            elk_config.get("host", "localhost:9200"),
            elk_config.get("url", "/logs/_doc")
        )
        elk_handler.setLevel(getattr(logging, elk_config.get("level", "INFO")))
        logger = logging.getLogger("langbot")
        logger.addHandler(elk_handler)
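For reference, the configuration keys read by setup_elk_logging (together with the alerting section used by the error tracker) would look roughly like the structure below. The host and URL values are placeholders, and the surrounding file format depends on how the instance configuration is actually stored, so treat this as an illustrative sketch only.

# Illustrative shape of the instance configuration consumed by
# setup_elk_logging(); key names match the lookups in the code above.
instance_config_data = {
    "logging": {
        "level": "INFO",
        "format": "json",
        "elk": {
            "enabled": True,
            "host": "logstash.internal:8080",   # host:port of the HTTP log receiver (placeholder)
            "url": "/logs/_doc",                # request path used by the handler
            "level": "WARNING",                 # only ship warnings and above
        },
    },
    "alerting": {
        "webhook_url": "https://example.com/alert-hook",   # used by ErrorTracker (placeholder)
    },
}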
6. Performance Optimization
6.1 Log Sampling
import random

class SamplingLogManager(LogManager):
    """Log manager that samples low-severity messages."""

    def __init__(self, ap: "app.Application"):
        super().__init__(ap)
        self.sampling_rate = 0.1          # keep 10% of info logs
        self.debug_sampling_rate = 0.01   # keep 1% of debug logs

    def debug(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Record a sampled debug log."""
        if random.random() < self.debug_sampling_rate:
            super().debug(message, extra)

    def info(self, message: str, extra: Optional[Dict[str, Any]] = None):
        """Record a sampled info log."""
        if random.random() < self.sampling_rate:
            super().info(message, extra)
6.2 Asynchronous Log Writing
class AsyncLogManager(LogManager):
    """Log manager that writes log entries asynchronously through a queue."""

    def __init__(self, ap: "app.Application"):
        super().__init__(ap)
        self.log_queue = asyncio.Queue(maxsize=10000)
        self.batch_size = 100
        self.flush_interval = 5  # flush every 5 seconds

    async def initialize(self):
        """Initialize the asynchronous log manager."""
        await super().initialize()
        # Start the asynchronous writer task
        self.ap.task_mgr.create_task(
            self._async_log_writer(),
            name="async-log-writer"
        )

    def _log(self, level: int, message: str, extra: Optional[Dict[str, Any]] = None):
        """Queue a log entry for asynchronous writing."""
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "level": logging.getLevelName(level),
            "message": message,
            "extra": extra or {}
        }
        # Enqueue the entry; drop it if the queue is full
        try:
            self.log_queue.put_nowait(log_entry)
        except asyncio.QueueFull:
            # Dropping a log entry is preferable to blocking the main program
            pass

    async def _async_log_writer(self):
        """Background worker that flushes queued log entries in batches."""
        while True:
            try:
                logs_batch = []
                # Drain up to batch_size entries from the queue
                try:
                    for _ in range(self.batch_size):
                        log_entry = self.log_queue.get_nowait()
                        logs_batch.append(log_entry)
                except asyncio.QueueEmpty:
                    pass

                # Write the batch to disk
                if logs_batch:
                    async with aiofiles.open(self.log_file, mode='a') as f:
                        for log_entry in logs_batch:
                            await f.write(json.dumps(log_entry, ensure_ascii=False) + '\n')

                # Wait until the next flush
                await asyncio.sleep(self.flush_interval)
            except Exception as e:
                print(f"Asynchronous log write error: {e}")
                await asyncio.sleep(self.flush_interval)
Summary
LangBot's monitoring and logging system provides a comprehensive safety net for stable operation. With multi-level monitoring, real-time log collection, error tracking, and visual dashboards, operators can detect and resolve problems promptly and keep the system highly available.
The key points are:
- Complete architecture: covers log collection, metrics monitoring, and error tracking
- Real-time behavior: supports live data collection and display
- Extensibility: integrates with third-party monitoring and logging systems
- Performance: optimized through sampling and asynchronous writes
- Visualization: provides an intuitive dashboard with live data
In practice, the following best practices are recommended:
- Sensible configuration: set log levels and sampling rates according to actual needs
- Performance monitoring: track system performance metrics continuously
- Error tracking: establish a complete error-tracking and alerting workflow
- Log rotation: rotate log files to avoid running out of disk space (see the sketch after this list)
- Third-party integration: plug in dedicated tools such as Prometheus and the ELK stack
- Security: never write sensitive information into logs
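As a concrete illustration of the log-rotation point above, a size-based policy can be attached with the standard library alone. This is a minimal sketch that would replace the plain FileHandler used in LogManager.initialize; the size limit and backup count are illustrative values, not LangBot defaults.

# Minimal sketch of size-based log rotation using the standard library;
# the limits below are illustrative defaults, not LangBot settings.
import logging
from logging.handlers import RotatingFileHandler

rotating_handler = RotatingFileHandler(
    "logs/langbot.log",
    maxBytes=50 * 1024 * 1024,  # rotate after roughly 50 MB
    backupCount=5,              # keep the five most recent rotated files
    encoding="utf-8",
)
rotating_handler.setLevel(logging.INFO)
logging.getLogger("langbot").addHandler(rotating_handler)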
Used well, LangBot's monitoring and logging system markedly improves maintainability and stability, and ultimately delivers a better experience to users.