在使用 Elasticsearch 7.x 的过程中,发现 Elasticsearch 默认的副本数和分片数都为 1。随着数据量不断增多,单个分片导致索引写入效率越来越低,因此决定对业务层和数仓的索引进行重建。
# -*- coding: utf-8 -*-
# @Time : 2019/9/21 13:48
# @Author : Cocktail_py
import logging
import traceback
import requests
from elasticsearch import Elasticsearch
from requests.auth import HTTPBasicAuth
# Emit timestamp, level and message for every record at INFO or above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Shared client for the listed hosts: basic auth, a 180 s request timeout,
# and up to 10 retries, including retries on timeout.
es = Elasticsearch(
    "host1:19200,host2:19200,host3:19200".split(","),
    http_auth=("username", "password"),
    timeout=180,
    max_retries=10,
    retry_on_timeout=True,
)
# Rebuild every data-warehouse index matching "dw_gofish_*": create
# "<index>_nw" from the old mapping plus bulk-load-friendly settings, alias
# both indices, then start an asynchronous reindex from old to new.
for index, mapping in es.indices.get_mapping("dw_gofish_*").items():
    # The mapping body fetched for the old index is reused as the
    # create-index body; only the settings section is added to it.
    mapping["settings"] = {
        # Refresh disabled (-1) for the copy; afterwards set it according to
        # the business scenario (30s or longer).
        "refresh_interval": "-1",
        "translog": {
            "flush_threshold_size": "2gb",
            "sync_interval": "120s",
            "durability": "async",
        },
        "index": {
            # Shard count: recommended to be >= the number of nodes, with no
            # more than about 30 GB of data per shard on average.
            "number_of_shards": "10",
            # Replicas start at 0; change to 1 once the reindex is done.
            "number_of_replicas": "0",
        },
    }
    # Name of the new (destination) index.
    index_nw = "{}_nw".format(index)
    # Create the new index.
    try:
        result = es.indices.create(index_nw, mapping)
        logging.info(result)
    except Exception:
        logging.exception("creating index %s failed", index_nw)
        # A failure is tolerable only when the target is already there
        # (e.g. a re-run). Otherwise aliasing would raise, and reindexing
        # would copy into an index that lacks the settings above.
        if not es.indices.exists(index_nw):
            logging.error("skipping %s: target %s does not exist",
                          index, index_nw)
            continue
    # Point one alias at both the old and the new index.
    es.indices.put_alias([index, index_nw], name="{}_alias".format(index))
    # Start the reindex asynchronously (wait_for_completion=false).
    result = requests.post(
        "http://ip1:19200/_reindex?refresh&wait_for_completion=false",
        json={
            "source": {
                "index": index,
                "size": 10000,
            },
            "dest": {
                "index": index_nw,
            },
        },
        headers={'Content-Type': 'application/json'},
        # Basic-auth user name and password.
        auth=HTTPBasicAuth("username", "password"),
        # Same 180 s limit as the client; without it the call can hang forever.
        timeout=180)
    if result.ok:
        logging.info(result.text)
    else:
        # Surface rejected reindex requests instead of logging them as INFO.
        logging.error("reindex %s -> %s returned HTTP %s: %s",
                      index, index_nw, result.status_code, result.text)
查看已存在的索引及其文档数量、占用存储空间大小等信息
# pri(主分片数)、rep(副分片数)、docs.count(索引现有文档数)、docs.deleted(索引标记为删除的文档数)、store.size(索引总大小)、pri.store.size(索引主分片大小)
GET _cat/indices/index_name?v
查看索引settings设置
GET /index_name/_settings
查看reindex进度
GET _tasks?detailed=true&actions=*reindex
# 删除所有滚动
import requests
from requests.auth import HTTPBasicAuth
# Clear every open scroll context on the cluster via the "_all" endpoint.
result = requests.delete(
    "http://xxx.xxx.xxxx:19200/_search/scroll/_all",
    headers={'Content-Type': 'application/json'},
    # Basic-auth user name and password.
    auth=HTTPBasicAuth("username", "password"),
    # requests has no default timeout; bound the call so an unresponsive
    # node cannot block the script forever.
    timeout=180)
print(result)
本文介绍如何通过调整Elasticsearch索引配置来提高写入效率及查询性能,包括更改刷新间隔、设置分片数量及副本数,并演示了具体的Python脚本实现。
7544

被折叠的 条评论
为什么被折叠?



