查询elasticsearch中routingId为某个值的文档,并使用python定时脚本删除数据
一.查看elasticsearch官方文档查询routing为某个值的restful接口如下
curl --location --request GET 'http://iP:port/indexName/_search?pretty' \
--header 'Content-Type: application/json' \
--data-raw '{
"query": {
"term": {
"_routing": "5b6526ec00b0da42660851d1"
}
}
}'
二.使用python脚本定时删除数据
1.使用pip install安装python模块
pip install Elasticsearch==5.5.3 -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
pip install APScheduler==3.7.0 -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
2.引入相关包
import os
from apscheduler.schedulers.blocking import BlockingScheduler
from elasticsearch import Elasticsearch
3.定义相应的变量
# Shared Elasticsearch client used by the search and delete helpers.
# NOTE(review): host and credentials are hard-coded placeholders here;
# consider loading them from configuration before real use.
es = Elasticsearch(
    [{'host': 'ip', 'port': 9200}],
    http_auth="elastic:ljm01173231",
)

# Default search body: match documents whose _routing equals the given value
# and return only the "id" field of each document's _source.
body = {
    "_source": ["id"],
    "query": {"term": {"_routing": "5b6526ec00b0da42660851d1"}},
}

# Default request parameters for deletes: the same routing value as the query.
routing_body = {"routing": "5b6526ec00b0da42660851d1"}
4.定义游标查询全索引方法,并获取相应返回的元素
def get_search_result(elasticsearch_host, index, doc_type, scroll='5m', timeout='1m', size=1000, query_body=None):
    """Collect every hit of a query by walking a scroll cursor to its end.

    Args:
        elasticsearch_host: Client object exposing ``search`` and ``scroll``.
        index: Name of the index to search.
        doc_type: Document type to search.
        scroll: Keep-alive value sent with the initial search and with every
            follow-up scroll request.
        timeout: Timeout sent with the initial search.
        size: Page size sent with the initial search.
        query_body: Search body; when ``None`` the module-level ``body``
            is used.

    Returns:
        A list containing the hits of all pages, or ``None`` (after printing
        ``'empty'``) when the first page contains no hits.
    """
    if query_body is None:
        query_body = body
    first_page = elasticsearch_host.search(
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
        size=size,
        body=query_body
    )
    first_hits = first_page.get("hits").get("hits")
    if not first_hits:
        print('empty')
        return None

    # Copy so the response object handed back by the client is not mutated.
    all_hits = list(first_hits)
    scroll_id = first_page["_scroll_id"]
    # Keep scrolling until a page comes back empty instead of deriving the
    # page count from the reported total and a hard-coded page size.
    while True:
        # Use the client that was passed in (not the module-level one) and
        # the caller's keep-alive value.
        page = elasticsearch_host.scroll(scroll_id=scroll_id, scroll=scroll)
        page_hits = page["hits"]["hits"]
        if not page_hits:
            break
        all_hits.extend(page_hits)
        # The scroll id may change between requests; always use the latest.
        scroll_id = page.get("_scroll_id", scroll_id)
    return all_hits
5.定义删除数据方法
def delete_by_routing_with_id(elasticsearch_host, index, doc_type, doc_id, delete_body=None):
    """Delete one document by id, forwarding routing parameters to the client.

    Args:
        elasticsearch_host: Client object exposing ``delete``.
        index: Index that holds the document.
        doc_type: Document type of the document.
        doc_id: Id of the document to delete.
        delete_body: Request parameters passed to ``delete`` as ``params``;
            when ``None`` the module-level ``routing_body`` is used.
    """
    request_params = routing_body if delete_body is None else delete_body
    elasticsearch_host.delete(index=index, id=doc_id, doc_type=doc_type, params=request_params)
6.定义定时任务方法
def job():
    """Scheduled task: find the documents matching the routing query and delete them.

    Runs the default query through ``get_search_result`` and deletes each
    returned document (identified by the ``id`` field of its ``_source``)
    with ``delete_by_routing_with_id``. A failed delete is reported and
    skipped so that one bad document cannot stall the whole run.
    """
    # Search and delete must target the same index and document type.
    index_name = 'newusernetwork'
    doc_type = 'doc'

    result = get_search_result(es, index_name, doc_type)
    if not result:
        print("no data to search")
        return

    for item in result:
        doc_id = item['_source']['id']
        try:
            delete_by_routing_with_id(es, index_name, doc_type, doc_id, routing_body)
        except Exception as failure_exception:
            # Delete failed (e.g. the document was not found): report it and
            # move on to the next id rather than retrying the same one.
            print("failed to delete %s: %s" % (doc_id, failure_exception))
7.在main方法中调用定时任务
if __name__ == '__main__':
    # Register job() on a cron trigger (hour 21, minute 30) and hand control
    # to the scheduler.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, trigger='cron', hour='21', minute='30')
    scheduler.start()
8.完整代码如下,注意在linux环境下文件头部应添加 # -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
from apscheduler.schedulers.blocking import BlockingScheduler
from elasticsearch import Elasticsearch
# Shared Elasticsearch client used by the search and delete helpers.
# NOTE(review): host and credentials are hard-coded placeholders here;
# consider loading them from configuration before real use.
es = Elasticsearch(
    [{'host': 'ip', 'port': 9200}],
    http_auth="elastic:ljm01173231",
)

# Default search body: match documents whose _routing equals the given value
# and return only the "id" field of each document's _source.
body = {
    "_source": ["id"],
    "query": {"term": {"_routing": "5b6526ec00b0da42660851d1"}},
}

# Default request parameters for deletes: the same routing value as the query.
routing_body = {"routing": "5b6526ec00b0da42660851d1"}
def get_search_result(elasticsearch_host, index, doc_type, scroll='5m', timeout='1m', size=1000, query_body=None):
    """Collect every hit of a query by walking a scroll cursor to its end.

    Args:
        elasticsearch_host: Client object exposing ``search`` and ``scroll``.
        index: Name of the index to search.
        doc_type: Document type to search.
        scroll: Keep-alive value sent with the initial search and with every
            follow-up scroll request.
        timeout: Timeout sent with the initial search.
        size: Page size sent with the initial search.
        query_body: Search body; when ``None`` the module-level ``body``
            is used.

    Returns:
        A list containing the hits of all pages, or ``None`` (after printing
        ``'empty'``) when the first page contains no hits.
    """
    if query_body is None:
        query_body = body
    first_page = elasticsearch_host.search(
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
        size=size,
        body=query_body
    )
    first_hits = first_page.get("hits").get("hits")
    if not first_hits:
        print('empty')
        return None

    # Copy so the response object handed back by the client is not mutated.
    all_hits = list(first_hits)
    scroll_id = first_page["_scroll_id"]
    # Keep scrolling until a page comes back empty instead of deriving the
    # page count from the reported total and a hard-coded page size.
    while True:
        # Use the client that was passed in (not the module-level one) and
        # the caller's keep-alive value.
        page = elasticsearch_host.scroll(scroll_id=scroll_id, scroll=scroll)
        page_hits = page["hits"]["hits"]
        if not page_hits:
            break
        all_hits.extend(page_hits)
        # The scroll id may change between requests; always use the latest.
        scroll_id = page.get("_scroll_id", scroll_id)
    return all_hits
def delete_by_routing_with_id(elasticsearch_host, index, doc_type, doc_id, delete_body=None):
    """Delete one document by id, forwarding routing parameters to the client.

    Args:
        elasticsearch_host: Client object exposing ``delete``.
        index: Index that holds the document.
        doc_type: Document type of the document.
        doc_id: Id of the document to delete.
        delete_body: Request parameters passed to ``delete`` as ``params``;
            when ``None`` the module-level ``routing_body`` is used.
    """
    request_params = routing_body if delete_body is None else delete_body
    elasticsearch_host.delete(index=index, id=doc_id, doc_type=doc_type, params=request_params)
def job():
    """Scheduled task: find the documents matching the routing query and delete them.

    Runs the default query through ``get_search_result`` and deletes each
    returned document (identified by the ``id`` field of its ``_source``)
    with ``delete_by_routing_with_id``. A failed delete is reported and
    skipped so that one bad document cannot stall the whole run.
    """
    # Search and delete must target the same index and document type.
    index_name = 'newusernetwork'
    doc_type = 'doc'

    result = get_search_result(es, index_name, doc_type)
    if not result:
        print("no data to search")
        return

    for item in result:
        doc_id = item['_source']['id']
        try:
            delete_by_routing_with_id(es, index_name, doc_type, doc_id, routing_body)
        except Exception as failure_exception:
            # Delete failed (e.g. the document was not found): report it and
            # move on to the next id rather than retrying the same one.
            print("failed to delete %s: %s" % (doc_id, failure_exception))
if __name__ == '__main__':
    # Register job() on a cron trigger (hour 21, minute 30) and hand control
    # to the scheduler.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, trigger='cron', hour='21', minute='30')
    scheduler.start()
三.总结
1.由于时间仓促,脚本中很多本可以动态配置的变量都被写死了,如需使用请相应修改
2.每天学习一点,接受新的挑战
3.今天分析一家客户的elasticsearch时,发现一个特别有趣的问题:客户的x-pack过期了,导致校验的时候请求虽然能够到达,但返回的是提示错误的信息,真的是有点神奇