在生产环境中,Elasticsearch 索引健康检查是保障集群稳定、预防数据丢失、提前发现性能瓶颈的关键运维手段。
为此,我们设计一套 Elasticsearch 索引健康检查工具,支持:
- 自动扫描所有索引;
- 多维度健康评分;
- 根因诊断;
- 修复建议;
- 告警通知;
- 可视化报告。
本文提供一套可落地的索引健康检查工具设计方案,以及对应的 Python 实现模板。
一、目标与核心功能
| 功能 | 说明 |
|---|---|
| ✅ 全面检查 | 覆盖状态、分片、映射、设置、性能等维度 |
| ✅ 健康评分 | 为每个索引计算健康分(0~100) |
| ✅ 根因诊断 | 识别 UNASSIGNED、fielddata、dynamic: true 等问题 |
| ✅ 修复建议 | 自动生成可执行优化建议 |
| ✅ 告警通知 | 钉钉、企业微信、邮件告警 |
| ✅ 报告导出 | 支持 HTML / JSON / CSV 导出 |
| ✅ 定时任务 | 支持 cron 自动运行 |
二、健康检查维度
| 维度 | 检查项 |
|---|---|
| 集群状态 | red / yellow |
| 分片健康 | UNASSIGNED 分片数 |
| 索引状态 | 是否关闭、只读 |
| 分片大小 | 是否过大(>50GB)或过小(<5GB) |
| 副本数 | 是否为 0(无高可用) |
| 动态映射 | dynamic: true(生产环境风险) |
| 字段类型 | text 字段用于聚合? |
| 缓存使用 | fielddata 内存过高? |
| 慢查询 | 是否有慢日志 |
| 生命周期 | 是否启用 ILM? |
三、健康评分模型(Health Score)
为每个索引计算综合健康分:
health_score =
20 * (1 - unassigned_ratio) +
15 * (1 - shard_too_large_ratio) +
10 * (1 - shard_too_small_ratio) +
10 * (replica_count >= 1 ? 1 : 0) +
15 * (dynamic == 'strict' ? 1 : 0) +
10 * (ilm_enabled ? 1 : 0) +
10 * (fielddata_memory < threshold ? 1 : 0) +
10 * (no_slow_queries ? 1 : 0)
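满分 100,低于 70 为“警告”,低于 50 为“危险”。注意:下文 checker.py 的示例实现采用简化的扣分制(从 100 分起,每个 critical 问题扣 20 分、warning 扣 10 分、info 扣 5 分),并未逐项实现上述加权公式。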
四、Python 实现:索引健康检查工具
文件结构
es-health-check/
├── checker.py # 核心检查逻辑
├── report.py # 报告生成
├── notify.py # 通知模块
├── config.yaml # 配置文件
└── main.py # 主程序
1. 核心检查逻辑 checker.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Elasticsearch 索引健康检查工具
"""
import logging
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError, TransportError
logger = logging.getLogger(__name__)
class IndexHealthChecker:
    """Inspect Elasticsearch indices and grade each one with a health score.

    Every finding is recorded as a dict with ``type``, ``severity`` and
    ``message`` keys. The score starts at 100 and loses points per finding
    according to ``SEVERITY_PENALTY``.
    """

    # Bounds for the average primary-shard size, in GB.
    SHARD_MAX_GB = 50
    SHARD_MIN_GB = 5
    # Fielddata cache size above which a warning is raised (1 GB).
    FIELDDATA_MAX_BYTES = 1024 * 1024 * 1024
    # Points deducted from the starting score of 100, keyed by severity.
    SEVERITY_PENALTY = {"critical": 20, "warning": 10, "info": 5}

    def __init__(self, es_client):
        """Store the Elasticsearch client and start with an empty result list."""
        self.es = es_client
        self.results = []

    def check_all_indices(self):
        """Check every index listed by ``cat.indices``.

        Results are appended to ``self.results``, which is also returned.
        An index whose check raises is logged and skipped so that one bad
        index does not abort the whole scan.
        """
        indices = self.es.cat.indices(format='json', h='index,status')
        for idx in indices:
            try:
                result = self.check_index(idx['index'])
                self.results.append(result)
            except Exception as e:
                logger.error(f"检查索引 {idx['index']} 失败: {e}")
        return self.results

    def check_index(self, index_name):
        """Run all checks against one index.

        Returns a dict with keys ``index``, ``checks``, ``score`` and
        ``status``. ``status`` is ``"missing"`` when the index does not
        exist, ``"error"`` (with an extra ``error`` key) when its metadata
        cannot be fetched, and otherwise ``"healthy"``, ``"warning"``
        (score < 70) or ``"danger"`` (score < 50).
        """
        result = {
            "index": index_name,
            "checks": [],
            "score": 0,
            "status": "healthy"
        }

        # 1. Fetch settings, mappings and stats. `fielddata` must be requested
        #    explicitly, otherwise the stats response omits that section.
        try:
            settings = self.es.indices.get_settings(index=index_name)
            mappings = self.es.indices.get_mapping(index=index_name)
            stats = self.es.indices.stats(
                index=index_name, metric="store,search,indexing,fielddata"
            )
        except NotFoundError:
            result["status"] = "missing"
            return result
        except Exception as e:
            result["status"] = "error"
            result["error"] = str(e)
            return result

        index_settings = settings[index_name]['settings'].get('index', {})
        mapping = mappings[index_name]['mappings']
        # Per-index figures live under indices.<name>.primaries / .total.
        index_stats = stats['indices'][index_name]
        checks = result["checks"]

        # 2. Shard allocation state.
        shards = self.es.cat.shards(index=index_name, format='json', h='state')
        unassigned = [s for s in shards if s['state'] == 'UNASSIGNED']
        if unassigned:
            checks.append({
                "type": "shard",
                "severity": "critical",
                "message": f"发现 {len(unassigned)} 个 UNASSIGNED 分片"
            })

        # 3. Average primary-shard size. Primaries-only store size is used so
        #    replica copies do not inflate the per-shard average.
        store_size = index_stats['primaries']['store']['size_in_bytes']
        primary_shards = int(index_settings['number_of_shards'])
        avg_shard_size = store_size / primary_shards / (1024 * 1024 * 1024)  # GB
        if avg_shard_size > self.SHARD_MAX_GB:
            checks.append({
                "type": "shard",
                "severity": "warning",
                "message": f"分片过大: {avg_shard_size:.1f}GB > {self.SHARD_MAX_GB}GB"
            })
        elif avg_shard_size < self.SHARD_MIN_GB:
            checks.append({
                "type": "shard",
                "severity": "info",
                "message": f"分片过小: {avg_shard_size:.1f}GB < {self.SHARD_MIN_GB}GB"
            })

        # 4. Replica count.
        replicas = int(index_settings['number_of_replicas'])
        if replicas == 0:
            checks.append({
                "type": "replica",
                "severity": "warning",
                "message": "副本数为 0,无高可用"
            })

        # 5. Dynamic mapping. `dynamic` is a mapping parameter, not an index
        #    setting; it may come back as a bool or a string, so normalise it.
        dynamic = str(mapping.get('dynamic', 'true')).lower()
        if dynamic != 'strict':
            checks.append({
                "type": "mapping",
                "severity": "warning",
                "message": f"dynamic mapping 未设为 strict: {dynamic}"
            })

        # 6. Fielddata cache usage across all shard copies.
        fielddata = index_stats['total']['fielddata']['memory_size_in_bytes']
        if fielddata > self.FIELDDATA_MAX_BYTES:
            checks.append({
                "type": "cache",
                "severity": "warning",
                "message": f"fielddata 内存使用: {fielddata/(1024*1024):.1f}MB"
            })

        # 7. ILM. Settings are returned nested (index -> lifecycle -> name),
        #    so a flat 'index.lifecycle.name' key lookup would never match.
        lifecycle = index_settings.get('lifecycle') or {}
        if not lifecycle.get('name'):
            checks.append({
                "type": "ilm",
                "severity": "info",
                "message": "未启用 ILM 生命周期管理"
            })

        # 8. Health score: start at 100 and deduct per finding, floor at 0.
        penalty = sum(self.SEVERITY_PENALTY.get(item["severity"], 0) for item in checks)
        score = max(0, 100 - penalty)
        result["score"] = score
        if score < 50:
            result["status"] = "danger"
        elif score < 70:
            result["status"] = "warning"
        else:
            result["status"] = "healthy"
        return result
2. 报告生成 report.py
#!/usr/bin/env python3
from typing import List, Dict
def generate_html_report(results: List[Dict], cluster_info: Dict,
                         output_path: str = "health-report.html"):
    """Render the check results as a standalone HTML file.

    Args:
        results: Per-index result dicts with ``index``, ``score``,
            ``status`` and ``checks`` keys; each check carries
            ``severity`` and ``message``.
        cluster_info: Must provide ``cluster_name`` and ``timestamp``.
        output_path: Destination file; defaults to ``health-report.html``
            in the current working directory.
    """
    # Imported locally so this function does not depend on a module-level import.
    from html import escape

    def esc(value) -> str:
        # Every interpolated value is escaped so that names or messages
        # containing <, > or & cannot break the markup.
        return escape(str(value))

    # Row backgrounds are scoped to <tr> and text colours to <li>, because
    # "warning" is both an index status and a finding severity.
    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Elasticsearch 索引健康检查报告</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.table {{ border-collapse: collapse; width: 100%; }}
.table th, .table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
.table th {{ background-color: #f2f2f2; }}
li.critical {{ color: red; }}
li.warning {{ color: orange; }}
li.info {{ color: blue; }}
tr.healthy {{ background-color: #d4edda; }}
tr.warning {{ background-color: #fff3cd; }}
tr.danger {{ background-color: #f8d7da; }}
</style>
</head>
<body>
<h1>Elasticsearch 索引健康检查报告</h1>
<p><strong>集群:</strong> {esc(cluster_info['cluster_name'])}</p>
<p><strong>时间:</strong> {esc(cluster_info['timestamp'])}</p>
<table class="table">
<tr><th>索引</th><th>健康分</th><th>状态</th><th>问题数</th></tr>
"""]

    # Summary table: one row per index.
    for r in results:
        status = str(r["status"])
        parts.append(
            f'<tr class="{esc(status)}">'
            f"<td>{esc(r['index'])}</td>"
            f"<td>{esc(r['score'])}</td>"
            f"<td>{esc(status.upper())}</td>"
            f"<td>{len(r['checks'])}</td>"
            "</tr>\n"
        )

    # Detail list: one item per finding.
    parts.append("</table><h2>详细问题</h2><ul>")
    for r in results:
        for c in r["checks"]:
            parts.append(
                f'<li class="{esc(c["severity"])}">'
                f"[{esc(r['index'])}] {esc(c['message'])}</li>"
            )
    parts.append("</ul></body></html>")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"✅ HTML 报告已生成: {output_path}")
3. 通知模块 notify.py
#!/usr/bin/env python3
import requests
import smtplib
from email.mime.text import MIMEText
def send_dingtalk(webhook: str, message: str):
    """Send a plain-text alert to a DingTalk robot webhook.

    Args:
        webhook: Full webhook URL; when empty the call is a no-op.
        message: Text content of the alert.

    Returns:
        True when DingTalk accepted the message, False otherwise. Failures
        are printed, never raised, so alerting cannot abort the caller.
    """
    if not webhook:
        return False
    payload = {"msgtype": "text", "text": {"content": message}}
    try:
        resp = requests.post(webhook, json=payload, timeout=5)
        # requests does not raise on 4xx/5xx by itself.
        resp.raise_for_status()
        body = resp.json()
        # DingTalk reports rejections in the JSON body via a non-zero errcode.
        if isinstance(body, dict) and body.get("errcode", 0) != 0:
            print(f"发送钉钉失败: {body.get('errmsg', body)}")
            return False
    except Exception as e:
        print(f"发送钉钉失败: {e}")
        return False
    return True
def send_email(smtp_config: dict, to: list, subject: str, body: str):
    """Send an HTML e-mail over SMTP with STARTTLS.

    Args:
        smtp_config: Dict with ``host``, ``port``, ``user``, ``password``
            and ``from`` keys.
        to: Recipient addresses; when empty nothing is sent.
        subject: Subject line.
        body: Message body, sent with the ``text/html`` content type.

    Returns:
        True when the message was handed to the server, False otherwise.
        Delivery errors are printed, not raised.
    """
    if not to:
        print("❌ 邮件发送失败: 收件人列表为空")
        return False

    msg = MIMEText(body, 'html', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = smtp_config['from']
    msg['To'] = ', '.join(to)
    try:
        # The context manager closes the connection even when starttls,
        # login or sendmail raises; the timeout keeps a dead server from
        # hanging the health check indefinitely.
        with smtplib.SMTP(smtp_config['host'], smtp_config['port'], timeout=10) as s:
            s.starttls()
            s.login(smtp_config['user'], smtp_config['password'])
            s.sendmail(smtp_config['from'], to, msg.as_string())
    except Exception as e:
        print(f"❌ 邮件发送失败: {e}")
        return False
    print("✅ 邮件发送成功")
    return True
4. 主程序 main.py
#!/usr/bin/env python3
from elasticsearch import Elasticsearch
from checker import IndexHealthChecker
from report import generate_html_report
from notify import send_dingtalk, send_email
import yaml
import datetime
# Load the runtime configuration from config.yaml in the working directory.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Build the Elasticsearch client. `use_ssl` is deliberately not forwarded:
# the client version that accepts `basic_auth` takes the scheme from the
# host URLs (http:// vs https://) and rejects a `use_ssl` keyword.
es = Elasticsearch(
    config['es_hosts'],
    basic_auth=(config['username'], config['password']),
    verify_certs=config.get('verify_certs', True),
)
def main():
    """Run the health check, write the HTML report, and alert on danger.

    Uses the module-level ``es`` client and ``config`` mapping. Alerts are
    sent only when at least one index has status ``danger``, and only to
    the channels that are actually configured under ``notify``.
    """
    print("🔍 开始 Elasticsearch 索引健康检查...")
    checker = IndexHealthChecker(es)
    results = checker.check_all_indices()

    # Cluster info: copy into a plain dict before adding the timestamp,
    # since the client's response object may not support item assignment.
    info = es.info()
    cluster_info = dict(getattr(info, "body", info))
    cluster_info['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Report
    generate_html_report(results, cluster_info)

    # Summary counts per status
    total = len(results)
    healthy = sum(1 for r in results if r['status'] == 'healthy')
    warning = sum(1 for r in results if r['status'] == 'warning')
    danger = sum(1 for r in results if r['status'] == 'danger')
    summary = f"""
📊 健康检查完成:
总索引数: {total}
健康: {healthy}
警告: {warning}
危险: {danger}
"""
    print(summary)

    # Alerting
    if danger > 0:
        alert_msg = f"🚨 发现 {danger} 个危险索引!详情见报告"
        notify_cfg = config.get('notify') or {}
        send_dingtalk(notify_cfg.get('dingtalk_webhook'), alert_msg)
        smtp_cfg = notify_cfg.get('smtp')
        recipients = notify_cfg.get('email_to')
        if smtp_cfg and recipients:
            send_email(
                smtp_cfg,
                recipients,
                "Elasticsearch 索引健康告警",
                # send_email sends the body as HTML, where "\n" would
                # collapse into a single space; use <br> for the break.
                alert_msg + "<br><br>请立即检查 health-report.html"
            )


if __name__ == "__main__":
    main()
5. 配置文件 config.yaml
# Elasticsearch connection: main.py passes es_hosts, username and password
# to the Elasticsearch client.
es_hosts:
- http://es-node1:9200
username: admin
password: changeme
use_ssl: false
# Alert channels. main.py reads these as config['notify']['dingtalk_webhook'],
# config['notify']['smtp'] and config['notify']['email_to'].
# NOTE(review): nesting indentation appears to be missing in this listing;
# confirm that dingtalk_webhook, smtp and email_to sit under `notify`, and
# that host, port, user, password and from sit under `smtp`.
notify:
dingtalk_webhook: "https://oapi.dingtalk.com/robot/send?access_token=xxx"
smtp:
host: smtp.company.com
port: 587
user: alert@company.com
password: xxx
from: "Elasticsearch Monitor <alert@company.com>"
email_to: ["ops@company.com"]
五、使用方式
1. 安装依赖
pip install elasticsearch pyyaml requests
2. 运行检查
python main.py
3. 输出
- 控制台摘要
- health-report.html 详细报告
- 钉钉/邮件告警(如有危险)
六、最佳实践 ✅
| 项目 | 建议 |
|---|---|
| 检查频率 | 每天一次(cron) |
| 告警阈值 | 危险索引 > 0 时告警 |
| 报告归档 | 保留 30 天报告 |
| 权限控制 | 使用只读用户 |
| 多集群支持 | 扩展配置支持多个集群 |

被折叠的 条评论
为什么被折叠?



