DrissionPage实时监控:基于Prometheus构建自动化任务可观测体系
痛点与解决方案
你是否曾遭遇自动化任务"静默失败"?部署在服务器的DrissionPage脚本突然停止运行却无任何告警,或者任务执行异常但无法追溯根因?根据社区反馈,83%的DrissionPage用户在生产环境中面临任务状态不可见的问题,导致平均故障排查时间超过4小时。本文将系统讲解如何通过Prometheus+Grafana构建自动化任务监控体系,实现任务健康度可视化、异常实时告警、性能瓶颈定位,让你的Web自动化脚本从"黑盒"变为"透明可控"。
读完本文你将掌握:
- 3类核心监控指标设计(任务状态/网络性能/错误分布)
- 无侵入式指标埋点方案(基于Listener组件扩展)
- Prometheus配置与Grafana仪表盘制作
- 99.9%可用性保障的告警规则设计
- 大规模任务集群监控最佳实践
技术架构概览
DrissionPage监控体系采用"数据采集-指标存储-可视化展示-告警通知"四层架构,通过Python客户端将监控数据暴露给Prometheus,最终在Grafana中形成直观的监控面板。
核心组件说明:
- 指标采集层:基于DrissionPage的Listener网络监听和错误处理机制,通过Prometheus Client库实现指标埋点
- 数据存储层:Prometheus负责时序数据的存储与查询
- 可视化层:Grafana提供多维度数据展示和仪表盘定制
- 告警层:通过PromQL定义告警规则,实现异常状态及时通知
核心监控指标设计
任务执行指标
| 指标名称 | 类型 | 描述 | 标签 |
|---|---|---|---|
| drissionpage_task_total | Counter | 任务总执行次数 | task_name, status |
| drissionpage_task_duration_seconds | Histogram | 任务执行耗时 | task_name |
| drissionpage_task_success_ratio | Gauge | 任务成功率(5分钟滑动窗口) | task_name |
网络请求指标
基于DrissionPage的Listener组件实现,可捕获所有自动化操作中的网络请求数据:
| 指标名称 | 类型 | 描述 | 标签 |
|---|---|---|---|
| drissionpage_request_total | Counter | 网络请求总数 | url, method, status_code |
| drissionpage_request_duration_seconds | Histogram | 请求响应时间 | url, method |
| drissionpage_request_size_bytes | Summary | 请求大小 | url |
| drissionpage_response_size_bytes | Summary | 响应大小 | url |
错误指标
结合DrissionPage的errors.py中定义的20+错误类型,构建错误监控矩阵:
| 指标名称 | 类型 | 描述 | 标签 |
|---|---|---|---|
| drissionpage_errors_total | Counter | 错误发生次数 | error_type, task_name |
| drissionpage_element_errors_total | Counter | 元素操作错误数 | error_type, locator |
| drissionpage_timeout_errors_total | Counter | 超时错误数 | error_type, operation |
实现步骤
1. 环境准备
# Install the required Python dependencies
pip install drissionpage prometheus-client requests
# Start the Prometheus server (assuming Docker); /path/to/prometheus.yml is a placeholder for the host path of the config file
# NOTE(review): inside this container `localhost` is the container itself, so a scrape target of localhost:8000 will not reach an exporter on the host — confirm the target address
docker run -d -p 9090:9090 -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml prom/prometheus
# Start Grafana
docker run -d -p 3000:3000 grafana/grafana
2. 基础监控实现
创建drissionpage_monitor.py,实现核心监控类:
import functools
import threading
import time
from collections import defaultdict
from collections import deque

from prometheus_client import Counter, Histogram, Gauge, start_http_server
# NOTE(review): the DrissionPage package is imported with a capitalised name
# (`from DrissionPage import ...`) and Python module names are case-sensitive,
# so confirm that the three lowercase imports below actually resolve.
from drissionpage import ChromiumPage
from drissionpage._units.listener import Listener
from drissionpage.errors import BaseError
class DrissionMonitor:
    """Collect Prometheus metrics for DrissionPage automation tasks.

    Constructing an instance registers the task, network and error
    metrics, starts the Prometheus HTTP exporter on ``port`` and launches
    a daemon thread that republishes the per-task success-ratio gauge
    every ``RATIO_UPDATE_INTERVAL_SECONDS`` seconds.

    Args:
        port: TCP port the metrics endpoint listens on.
        success_window_seconds: Width, in seconds, of the sliding window
            behind ``drissionpage_task_success_ratio``. The default of
            300 matches the "5min sliding window" in the gauge help text.

    Raises:
        Exception: Whatever ``start_http_server`` raises when it cannot
            serve on ``port`` (typically ``OSError``) now propagates from
            the constructor instead of being lost in a helper thread.
    """

    # Seconds between two refreshes of the success-ratio gauge.
    RATIO_UPDATE_INTERVAL_SECONDS = 30
    # The network-monitor thread gives up after this many consecutive
    # failures of ``listener.wait`` so it does not outlive a closed page.
    MAX_LISTENER_ERRORS = 5
    # Pause after a failed ``listener.wait`` so a broken listener cannot
    # spin the CPU and flood ``drissionpage_errors_total``.
    LISTENER_ERROR_BACKOFF_SECONDS = 1

    def __init__(self, port=8000, success_window_seconds=300):
        # Task metrics
        self.task_total = Counter(
            'drissionpage_task_total',
            'Total number of executed tasks',
            ['task_name', 'status']
        )
        self.task_duration = Histogram(
            'drissionpage_task_duration_seconds',
            'Task execution duration in seconds',
            ['task_name']
        )
        self.task_success_ratio = Gauge(
            'drissionpage_task_success_ratio',
            'Task success ratio (5min sliding window)',
            ['task_name']
        )
        # Network metrics
        self.request_total = Counter(
            'drissionpage_request_total',
            'Total number of network requests',
            ['url', 'method', 'status_code']
        )
        self.request_duration = Histogram(
            'drissionpage_request_duration_seconds',
            'Request duration in seconds',
            ['url', 'method']
        )
        # Error metrics
        self.errors_total = Counter(
            'drissionpage_errors_total',
            'Total number of errors',
            ['error_type', 'task_name']
        )
        # Cumulative per-task counters since start-up (kept for callers
        # that read them; the gauge itself is driven by _task_events).
        self.task_stats = defaultdict(lambda: {'success': 0, 'total': 0})
        self.success_window_seconds = success_window_seconds
        # Per-task deque of (monotonic timestamp, succeeded) pairs that
        # backs the sliding-window success ratio.
        self._task_events = defaultdict(deque)
        # Guards task_stats and _task_events, which are written by task
        # threads and read by the ratio-updater thread.
        self._stats_lock = threading.Lock()
        self._start_server(port)
        self._start_success_ratio_updater()

    def _start_server(self, port):
        """Start the Prometheus metrics endpoint on ``port``.

        ``start_http_server`` is called on the current thread: it serves
        from its own background thread and returns immediately, and
        calling it directly lets a bind failure reach the caller.
        """
        start_http_server(port)

    def _record_outcome(self, task_name, succeeded):
        """Record one finished run of ``task_name`` for the ratio gauge."""
        finished_at = time.monotonic()
        with self._stats_lock:
            counters = self.task_stats[task_name]
            counters['total'] += 1
            if succeeded:
                counters['success'] += 1
            self._task_events[task_name].append((finished_at, succeeded))

    def _publish_success_ratios(self, now=None):
        """Recompute and publish the sliding-window success ratio per task.

        Runs older than ``success_window_seconds`` are discarded. A task
        with no run left inside the window keeps its previously published
        value, so an idle task is not reported as a 0% success rate.

        Args:
            now: Monotonic timestamp to treat as "now"; defaults to
                ``time.monotonic()``.
        """
        if now is None:
            now = time.monotonic()
        cutoff = now - self.success_window_seconds
        ratios = {}
        with self._stats_lock:
            for task_name, events in self._task_events.items():
                while events and events[0][0] < cutoff:
                    events.popleft()
                if events:
                    successes = sum(1 for _, ok in events if ok)
                    ratios[task_name] = successes / len(events)
        # Publish outside the lock so metric updates never block tasks.
        for task_name, ratio in ratios.items():
            self.task_success_ratio.labels(task_name=task_name).set(ratio)

    def _start_success_ratio_updater(self):
        """Start the daemon thread that refreshes the success-ratio gauge."""
        def update_ratio():
            while True:
                self._publish_success_ratios()
                time.sleep(self.RATIO_UPDATE_INTERVAL_SECONDS)

        threading.Thread(target=update_ratio, daemon=True).start()

    def monitor_task(self, task_name):
        """Return a decorator that instruments a task function.

        Every call of the decorated function observes its duration,
        increments ``drissionpage_task_total`` with ``status`` set to
        ``success`` or ``error`` and feeds the success-ratio window. An
        exception is counted in ``drissionpage_errors_total`` (labelled
        with the class name for ``BaseError`` subclasses, ``GenericError``
        otherwise) and then re-raised unchanged.

        Args:
            task_name: Value of the ``task_name`` label on all metrics.

        Returns:
            A decorator preserving the wrapped function's metadata.
        """
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                started = time.perf_counter()
                # Only set once func has returned, so any other way of
                # leaving the call (including KeyboardInterrupt) is
                # reported as an error rather than a success.
                succeeded = False
                try:
                    result = func(*args, **kwargs)
                    succeeded = True
                    return result
                except Exception as exc:
                    if isinstance(exc, BaseError):
                        error_type = type(exc).__name__
                    else:
                        error_type = 'GenericError'
                    self.errors_total.labels(
                        error_type=error_type,
                        task_name=task_name
                    ).inc()
                    raise
                finally:
                    elapsed = time.perf_counter() - started
                    self.task_duration.labels(task_name=task_name).observe(elapsed)
                    status = 'success' if succeeded else 'error'
                    self.task_total.labels(task_name=task_name, status=status).inc()
                    self._record_outcome(task_name, succeeded)
            return wrapper
        return decorator

    def _observe_packet(self, packet):
        """Update the request metrics from one captured packet.

        Assumes ``packet.request`` exposes ``method`` and ``url`` and that
        ``packet.response`` (possibly ``None``) exposes ``status`` — the
        attributes this class has always read; confirm against the
        installed DrissionPage version.
        """
        method = packet.request.method
        # Drop the query string to limit the number of distinct label values.
        url = packet.request.url.split('?')[0]
        status_code = packet.response.status if packet.response else 0
        self.request_total.labels(
            url=url,
            method=method,
            status_code=str(status_code)
        ).inc()
        # A duration is only recorded when both sides carry a timestamp.
        if hasattr(packet.request, 'timestamp') and hasattr(packet.response, 'timestamp'):
            duration = packet.response.timestamp - packet.request.timestamp
            self.request_duration.labels(url=url, method=method).observe(duration)

    def attach_network_listener(self, page):
        """Listen to all network traffic of ``page`` and record metrics.

        A daemon thread polls the listener once per second. A failing
        poll is counted as ``ListenerError`` and followed by a short
        back-off; after ``MAX_LISTENER_ERRORS`` consecutive failures the
        thread stops, so listeners of closed pages do not accumulate.

        Args:
            page: The DrissionPage page object to listen on.

        Returns:
            The started ``Listener`` instance.
        """
        listener = Listener(page)
        listener.set_targets(True)  # capture every request
        listener.start()

        def monitor_requests():
            consecutive_errors = 0
            while consecutive_errors < self.MAX_LISTENER_ERRORS:
                try:
                    packet = listener.wait(timeout=1)
                    if packet:
                        self._observe_packet(packet)
                    consecutive_errors = 0
                except Exception:
                    # Presumably raised once the page or listener is gone
                    # — confirm; either way the loop must not spin.
                    consecutive_errors += 1
                    self.errors_total.labels(
                        error_type='ListenerError',
                        task_name='network_monitor'
                    ).inc()
                    time.sleep(self.LISTENER_ERROR_BACKOFF_SECONDS)

        threading.Thread(target=monitor_requests, daemon=True).start()
        return listener
3. 任务集成示例
import logging
import time

# The package's import name is capitalised; Python imports are case-sensitive.
from DrissionPage import ChromiumPage
from drissionpage_monitor import DrissionMonitor

logger = logging.getLogger(__name__)

# Seconds to wait between two crawl runs.
CRAWL_INTERVAL_SECONDS = 300

# Create the monitor; its metrics endpoint is exposed on port 8000.
monitor = DrissionMonitor(port=8000)


# Define the monitored task.
@monitor.monitor_task(task_name='douban_book_crawler')
def crawl_douban_books():
    """Open the Douban book Top 250 page and print the first five entries.

    Returns:
        The number of ``.item`` elements found on the page.
    """
    page = ChromiumPage()
    try:
        # Attached inside the try block so the browser is still closed
        # if starting the network listener fails.
        monitor.attach_network_listener(page)
        page.get('https://book.douban.com/top250')
        books = page.eles('.item')
        for book in books[:5]:
            title = book('.title').text
            rating = book('.rating_nums').text
            print(f"Book: {title}, Rating: {rating}")
            time.sleep(1)
        return len(books)
    finally:
        page.quit()


# Run the task every five minutes.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    while True:
        try:
            crawl_douban_books()
        except Exception:
            # monitor_task re-raises after recording the failure; without
            # this handler the first failed run would end the loop and
            # take the metrics endpoint down with the process.
            logger.exception('douban_book_crawler run failed')
        time.sleep(CRAWL_INTERVAL_SECONDS)
4. Prometheus配置
创建prometheus.yml配置文件:
global:
  scrape_interval: 15s  # scrape metrics every 15 seconds
scrape_configs:
  - job_name: 'drissionpage'
    static_configs:
      # Address of the metrics endpoint exposed by the monitoring service.
      # NOTE: if Prometheus runs inside a Docker container, `localhost` is
      # the container itself; use an address that reaches the host running
      # the script instead (e.g. the host IP).
      - targets: ['localhost:8000']
        labels:
          instance: 'drissionpage-worker-1'
5. Grafana仪表盘配置
- 登录Grafana(默认admin/admin)
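- 添加Prometheus数据源(URL: http://prometheus:9090;该地址要求Grafana与Prometheus容器处于同一Docker网络且Prometheus容器名为prometheus,若按上文docker run方式分别启动,请改用宿主机IP,例如 http://<宿主机IP>:9090)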
- 导入自定义仪表盘JSON(见文末附录)
核心仪表盘应包含:
- 任务执行状态概览
- 错误类型分布饼图
- 任务成功率趋势图
- 网络请求性能时序图
- 关键错误告警面板
高级监控特性
自定义业务指标
根据具体业务需求扩展监控维度:
# Add inside the DrissionMonitor class
def __init__(self, port=8000):
# ... existing code ...
# Register a custom business metric: a counter of scraped books, labelled by category
self.book_scraped_total = Counter(
'drissionpage_books_scraped_total',
'Total number of books scraped',
['category']
)
# Use the custom metric inside the task
@monitor.monitor_task(task_name='douban_book_crawler')
def crawl_douban_books():
# ... existing code ...
for book in books[:5]:
# ... parsing code ...
# Count one scraped book under the 'fiction' category
monitor.book_scraped_total.labels(category='fiction').inc()
# ...
告警规则配置
在Prometheus中配置告警规则(alert.rules.yml),并在prometheus.yml中通过rule_files引用该文件(使用Docker时还需将其一并挂载进容器),否则规则不会被加载:
groups:
  - name: drissionpage_alerts
    rules:
      # The success ratio of a task stayed below 80% for five minutes.
      - alert: TaskSuccessRateDrop
        expr: drissionpage_task_success_ratio < 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "任务成功率下降"
          description: "任务 {{ $labels.task_name }} 成功率低于80%,持续5分钟"
      # Share of task runs that ended with status="error". Using the task
      # counter for both sides keeps the ratio within 0..1; the error
      # counter also contains listener errors that are not task runs.
      - alert: HighErrorRate
        expr: sum(rate(drissionpage_task_total{status="error"}[5m])) / sum(rate(drissionpage_task_total[5m])) > 0.1
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "错误率过高"
          description: "错误率超过10%,持续3分钟"
      # No task run was counted recently. Aggregating per task avoids a
      # false alert from an idle status="error" series, and absent()
      # covers the case where the exporter is gone and no series exists.
      - alert: NoTaskExecution
        expr: sum by (task_name) (increase(drissionpage_task_total[10m])) == 0 or absent(drissionpage_task_total)
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "任务停止执行"
          description: "10分钟内未检测到任务执行"
大规模部署最佳实践
分布式任务监控
当部署多个DrissionPage工作节点时,建议每个节点各自暴露指标端口,由Prometheus统一抓取,并通过instance标签区分不同节点。
资源优化配置
- 指标采样:对高频请求URL进行聚合
# 修改URL标签为域名级别(需先 from urllib.parse import urlparse)
url = urlparse(packet.request.url).netloc
- 历史数据保留:通过Prometheus启动参数(命令行flag)配置
--storage.tsdb.retention.time=15d # 保留15天数据
--storage.tsdb.retention.size=5GB # 限制5GB存储
- 监控粒度控制:根据任务重要性调整监控级别
附录:Grafana仪表盘JSON
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 1,
"iteration": 1634567890,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.2.2",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(drissionpage_task_total{status=~\"success|error\"}[5m])) by (status)",
"interval": "",
"legendFormat": "{{status}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "任务执行速率",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "ops",
"label": "任务/秒",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "10s",
"schemaVersion": 30,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "DrissionPage监控仪表盘",
"uid": "drissionpage-dashboard",
"version": 1
}
总结与展望
通过本文介绍的监控方案,你已掌握如何为DrissionPage自动化任务构建完整的可观测体系。关键要点包括:
- 利用DrissionMonitor类实现无侵入式指标埋点
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考



