查询 Elasticsearch 中 _routing 为某个值的文档,并使用 Python 定时脚本删除这些数据

查询 Elasticsearch 中 _routing 为某个值的文档,并使用 Python 定时脚本删除这些数据

一.参考 Elasticsearch 官方文档,查询 _routing 为某个值的文档的 RESTful 接口如下

curl --location --request GET 'http://iP:port/indexName/_search?pretty' \
--header 'Content-Type: application/json' \
--data-raw '{
  "query": {
    "term": {
      "_routing": "5b6526ec00b0da42660851d1" 
    }
  }
}'

二.使用python脚本定时删除数据

1.使用 pip install 安装所需的 Python 模块

pip install Elasticsearch==5.5.3 -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
pip install APScheduler==3.7.0 -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

2.引入相关包

import os
from apscheduler.schedulers.blocking import BlockingScheduler
from elasticsearch import Elasticsearch

3.定义相应的变量

# Routing value shared by the search query and the delete parameters; keeping
# it in one place guarantees the two always target the same routing key.
ROUTING_ID = "5b6526ec00b0da42660851d1"

# Elasticsearch client used by the whole script. The "user:password" pair can
# be supplied through the ES_HTTP_AUTH environment variable; the literal is
# only kept as a backward-compatible fallback and should not be committed with
# real credentials.
es = Elasticsearch(
    [{'host': 'ip', 'port': 9200}],
    http_auth=os.environ.get("ES_HTTP_AUTH", "elastic:ljm01173231"),
)

# Search body: fetch only the "id" field of documents whose _routing matches.
body = {
    "_source": ["id"],
    "query": {
        "term": {
            "_routing": ROUTING_ID,
        },
    },
}

# Request parameters sent with each delete so it is routed with the same key.
routing_body = {
    "routing": ROUTING_ID,
}

4.定义游标查询全索引方法,并获取相应返回的元素

def get_search_result(elasticsearch_host, index, doc_type, scroll='5m', timeout='1m', size=1000, query_body=None):
    """Run a scrolled search and return every hit from all pages.

    Args:
        elasticsearch_host: Elasticsearch client used for the initial search
            and for every follow-up scroll request.
        index: Name of the index to search.
        doc_type: Document type forwarded to the search call.
        scroll: Keep-alive of the scroll context, used for the initial search
            and for each scroll request.
        timeout: Timeout forwarded to the search call.
        size: Number of hits requested per page.
        query_body: Search body; defaults to the module-level ``body``.

    Returns:
        A list with the raw hit dicts of all pages, or ``None`` (after
        printing ``empty``) when the first page contains no hits.
    """
    if query_body is None:
        query_body = body
    response = elasticsearch_host.search(
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
        size=size,
        body=query_body,
    )

    scroll_id = response.get("_scroll_id")
    try:
        page = response["hits"]["hits"]
        if not page:
            print('empty')
            return None

        collected = list(page)
        # Keep scrolling until a page comes back empty instead of deriving the
        # page count from the reported total: this stays correct for any
        # ``size`` and for any shape of ``hits.total``.
        while page and scroll_id:
            response = elasticsearch_host.scroll(scroll_id=scroll_id, scroll=scroll)
            # The server may hand out a new scroll id with every page.
            scroll_id = response.get("_scroll_id", scroll_id)
            page = response["hits"]["hits"]
            collected.extend(page)
        return collected
    finally:
        # Release the server-side scroll context; a 404 only means it has
        # already expired, so it is ignored.
        if scroll_id:
            elasticsearch_host.clear_scroll(body={'scroll_id': [scroll_id]}, ignore=(404,))

5.定义删除数据方法

def delete_by_routing_with_id(elasticsearch_host, index, doc_type, doc_id, delete_body=None):
    """Delete a single document by id through the given client.

    Args:
        elasticsearch_host: Client object whose ``delete`` method is called.
        index: Index that holds the document.
        doc_type: Document type of the document.
        doc_id: Id of the document to delete.
        delete_body: Mapping forwarded as the ``params`` argument of the
            delete call; the module-level ``routing_body`` is used when omitted.
    """
    request_params = routing_body if delete_body is None else delete_body
    elasticsearch_host.delete(
        index=index,
        doc_type=doc_type,
        id=doc_id,
        params=request_params,
    )

6.定义定时任务方法

def job():
    """Scheduled task: delete every document returned by the routing query.

    Searches the index with ``get_search_result`` and deletes each hit by the
    ``id`` field of its ``_source``, using the same index for the search and
    the delete. The ids are processed in memory, so no temporary file is
    written. A failed delete is reported and skipped, and the job always moves
    on to the next id, so one bad document cannot stall the run.
    """
    index_name = 'newusernetwork'
    doc_type = 'doc'

    result = get_search_result(es, index_name, doc_type)
    if not result:
        print("no data to search")
        return

    for item in result:
        doc_id = item['_source']['id']
        try:
            delete_by_routing_with_id(es, index_name, doc_type, doc_id, routing_body)
        except Exception as failure_exception:
            # Best-effort cleanup: a delete fails when the document can no
            # longer be found; report it and continue with the next id.
            print("delete failed for id %s: %s" % (doc_id, failure_exception))

7.在main方法中调用定时任务

if __name__ == '__main__':
    # Register the cleanup job on a cron trigger (hour 21, minute 30) and hand
    # control to the blocking scheduler.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, trigger='cron', hour='21', minute='30')
    scheduler.start()

8.完整代码如下,注意在 Linux 环境下文件头部应添加 # -*- coding: utf-8 -*-(Python 2 下源码包含中文等非 ASCII 字符时必须声明编码)

# -*- coding: utf-8 -*-
import os
from apscheduler.schedulers.blocking import BlockingScheduler
from elasticsearch import Elasticsearch

# Routing value shared by the search query and the delete parameters; keeping
# it in one place guarantees the two always target the same routing key.
ROUTING_ID = "5b6526ec00b0da42660851d1"

# Elasticsearch client used by the whole script. The "user:password" pair can
# be supplied through the ES_HTTP_AUTH environment variable; the literal is
# only kept as a backward-compatible fallback and should not be committed with
# real credentials.
es = Elasticsearch(
    [{'host': 'ip', 'port': 9200}],
    http_auth=os.environ.get("ES_HTTP_AUTH", "elastic:ljm01173231"),
)

# Search body: fetch only the "id" field of documents whose _routing matches.
body = {
    "_source": ["id"],
    "query": {
        "term": {
            "_routing": ROUTING_ID,
        },
    },
}

# Request parameters sent with each delete so it is routed with the same key.
routing_body = {
    "routing": ROUTING_ID,
}


def get_search_result(elasticsearch_host, index, doc_type, scroll='5m', timeout='1m', size=1000, query_body=None):
    """Run a scrolled search and return every hit from all pages.

    Args:
        elasticsearch_host: Elasticsearch client used for the initial search
            and for every follow-up scroll request.
        index: Name of the index to search.
        doc_type: Document type forwarded to the search call.
        scroll: Keep-alive of the scroll context, used for the initial search
            and for each scroll request.
        timeout: Timeout forwarded to the search call.
        size: Number of hits requested per page.
        query_body: Search body; defaults to the module-level ``body``.

    Returns:
        A list with the raw hit dicts of all pages, or ``None`` (after
        printing ``empty``) when the first page contains no hits.
    """
    if query_body is None:
        query_body = body
    response = elasticsearch_host.search(
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
        size=size,
        body=query_body,
    )

    scroll_id = response.get("_scroll_id")
    try:
        page = response["hits"]["hits"]
        if not page:
            print('empty')
            return None

        collected = list(page)
        # Keep scrolling until a page comes back empty instead of deriving the
        # page count from the reported total: this stays correct for any
        # ``size`` and for any shape of ``hits.total``.
        while page and scroll_id:
            response = elasticsearch_host.scroll(scroll_id=scroll_id, scroll=scroll)
            # The server may hand out a new scroll id with every page.
            scroll_id = response.get("_scroll_id", scroll_id)
            page = response["hits"]["hits"]
            collected.extend(page)
        return collected
    finally:
        # Release the server-side scroll context; a 404 only means it has
        # already expired, so it is ignored.
        if scroll_id:
            elasticsearch_host.clear_scroll(body={'scroll_id': [scroll_id]}, ignore=(404,))


def delete_by_routing_with_id(elasticsearch_host, index, doc_type, doc_id, delete_body=None):
    """Delete a single document by id through the given client.

    Args:
        elasticsearch_host: Client object whose ``delete`` method is called.
        index: Index that holds the document.
        doc_type: Document type of the document.
        doc_id: Id of the document to delete.
        delete_body: Mapping forwarded as the ``params`` argument of the
            delete call; the module-level ``routing_body`` is used when omitted.
    """
    request_params = routing_body if delete_body is None else delete_body
    elasticsearch_host.delete(
        index=index,
        doc_type=doc_type,
        id=doc_id,
        params=request_params,
    )


def job():
    """Scheduled task: delete every document returned by the routing query.

    Searches the index with ``get_search_result`` and deletes each hit by the
    ``id`` field of its ``_source``, using the same index for the search and
    the delete. The ids are processed in memory, so no temporary file is
    written. A failed delete is reported and skipped, and the job always moves
    on to the next id, so one bad document cannot stall the run.
    """
    index_name = 'newusernetwork'
    doc_type = 'doc'

    result = get_search_result(es, index_name, doc_type)
    if not result:
        print("no data to search")
        return

    for item in result:
        doc_id = item['_source']['id']
        try:
            delete_by_routing_with_id(es, index_name, doc_type, doc_id, routing_body)
        except Exception as failure_exception:
            # Best-effort cleanup: a delete fails when the document can no
            # longer be found; report it and continue with the next id.
            print("delete failed for id %s: %s" % (doc_id, failure_exception))


if __name__ == '__main__':
    # Register the cleanup job on a cron trigger (hour 21, minute 30) and hand
    # control to the blocking scheduler.
    scheduler = BlockingScheduler()
    scheduler.add_job(job, trigger='cron', hour='21', minute='30')
    scheduler.start()

三.总结

1.由于时间仓促,脚本中很多本可以动态配置的变量(如 Elasticsearch 地址、账号密码、索引名、routing 值、执行时间)都是写死的,如需使用请按实际情况修改

2.每天学习一点,接受新的挑战

3.今天分析一家客户的 Elasticsearch 时,发现一个特别有趣的问题:客户的 X-Pack 许可证过期了,校验时请求仍然能够发出并得到响应,但返回的内容是错误提示信息,真的是有点神奇

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值