es查询 scroll_scan用法

最新推荐文章于 2025-03-15 23:18:15 发布

patkritLee

最新推荐文章于 2025-03-15 23:18:15 发布

阅读量1.6k

点赞数

分类专栏： python

本文链接：https://blog.youkuaiyun.com/patkritLee/article/details/78906319

版权

python 专栏收录该内容

15 篇文章

订阅专栏

本文介绍了一种利用Elasticsearch的Scroll Scan方法实时查询大量日志数据的技术方案。该方法通过查询一分钟前的数据来减少延迟影响，并采用特定策略获取跨天的索引名称。此外，还提供了Python实现的具体代码示例。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

需求大概是：需要实时地推送日志系统的日志消息，提供给其他同事查询使用。由于 es 数据存在一定延迟，每次以当前时间为基准，查询一分钟之前的那一分钟数据；又因为数据量大，所以考虑使用 es 的 scroll_scan 方法。

代码：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Demo: query data from Elasticsearch (ES).

Because ES data arrives with some delay, the statistics cover data from one
minute ago.
"""
import sys
import os
import requests

# Put the parent of this file's directory at the front of the import path.
setting_path = (os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.insert(0, setting_path)

import json
import datetime
import logging
# Module-wide logger used by the HTTP helper functions below.
logger = logging.getLogger('demo')

# URL prefix that every ES request in this file is built on
# ("XXX" is presumably a placeholder for the real host).
ES_URL_PRE = "http://XXX/es/"


def request_get(url, params, timeout=2):
    """Send a GET request and return the decoded JSON response.

    Unified wrapper around ``requests.get`` with error handling, intended for
    endpoints whose response body is a JSON string.

    Args:
        url: Target URL.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body. An empty dict is returned (and the problem is
        logged) when the request fails, the response status is not OK, or
        the body is not valid JSON.
    """
    try:
        r = requests.get(url=url, params=params, timeout=timeout)
        if r.ok:
            return r.json()
        # Only the first 200 characters of the body are logged to keep the
        # log line bounded.
        logger.error('{0} faild, code: {1}, cause: {2}'
                     .format(url, r.status_code, r.text[:200]))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s', url)
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON: r.json() raises a
        # ValueError subclass that is not necessarily a RequestException.
        logger.exception('request {0} error'.format(url))
    return {}


def requests_post(url, data, timeout=2):
    """Send a POST request and return the decoded JSON response.

    Unified wrapper around ``requests.post`` with error handling, intended
    for endpoints whose response body is a JSON string.

    Args:
        url: Target URL.
        data: Request body forwarded to ``requests.post``.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body. An empty dict is returned (and the problem is
        logged) when the request fails, the response status is not OK, or
        the body cannot be decoded.
    """
    try:
        r = requests.post(url=url, data=data, timeout=timeout)
        if r.ok:
            return r.json()
        # Truncate the body so a large error page cannot flood the log.
        logger.error('{0} faild, code: {1}, cause: {2}'
                     .format(url, r.status_code, r.text[:200]))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s', url)
    except Exception:
        # Deliberately broad: this helper is best-effort and must always
        # hand a dict back to the caller (this also covers invalid JSON).
        logger.exception('request {0} error'.format(url))
    return {}


def gen_index(date, name="logstash"):
    """Build a daily index name of the form ``<name>-YYYY.MM.DD``.

    Args:
        date: Object providing ``strftime`` (a date or datetime).
        name: Index name prefix.

    Returns:
        The prefix and the formatted date joined by a hyphen.
    """
    day_stamp = date.strftime('%Y.%m.%d')
    return "%s-%s" % (name, day_stamp)


def get_exact_index_name(from_time, to_time, name="logstash"):
    """Return the exact daily index name(s) covering a time window.

    Both bounds are shifted back by 8 hours before the index names are
    generated (presumably converting UTC+8 local time to the UTC dates used
    in the index names -- confirm against the ES setup).

    Args:
        from_time: Start of the window.
        to_time: End of the window.
        name: Index name prefix passed on to ``gen_index``.

    Returns:
        A single index name when both shifted bounds fall on the same
        calendar day (or ``to_time`` precedes ``from_time``); otherwise a
        comma-separated list with one index per day from the start day
        through the end day.
    """
    shift = datetime.timedelta(hours=8)
    from_time -= shift
    to_time -= shift

    # Count real calendar days between the bounds. Subtracting the
    # day-of-month numbers breaks across month boundaries (Jan 31 -> Feb 1
    # yields -30) and would drop the earlier index from the query.
    span_days = to_time.toordinal() - from_time.toordinal()
    if span_days >= 1:
        return ",".join(
            gen_index(from_time + datetime.timedelta(days=offset), name)
            for offset in range(span_days + 1)
        )
    return gen_index(to_time, name)


def get_query_data(from_time, to_time, should_terms):
    """Serialize the search request body as a JSON string.

    The body is a ``filtered`` query: a ``bool``/``should`` filter holding
    one ``term`` clause per entry of ``should_terms``, combined with a range
    query on ``@timestamp`` (``gte`` from_time, ``lt`` to_time, time zone
    ``+08:00``).

    Args:
        from_time: Inclusive lower bound of the time range.
        to_time: Exclusive upper bound of the time range.
        should_terms: Iterable of term bodies, each wrapped in ``{"term": ...}``.

    Returns:
        The JSON-encoded request body.
    """
    term_clauses = [{"term": term} for term in should_terms]
    time_window = {
        "gte": from_time,
        "lt": to_time,
        "time_zone": "+08:00",
    }
    filtered = {
        "filter": {"bool": {"should": term_clauses}},
        "query": {"range": {"@timestamp": time_window}},
    }
    return json.dumps({"query": {"filtered": filtered}})


def get_type_data(from_time, to_time, type_name, size=500):
    """Fetch all matching documents of one type within a time window.

    Opens a scroll search (``search_type=scan``) on the indices covering the
    window, then keeps requesting scroll pages until a page comes back
    without hits.

    Args:
        from_time: Start of the window (datetime, inclusive in the query).
        to_time: End of the window (datetime, exclusive in the query).
        type_name: Document type placed in the search URL.
        size: ``size`` value sent with the initial and the scroll requests.

    Returns:
        A ``(messages, counts)`` tuple: the ``_source`` of every hit that
        was retrieved, and the total hit count reported by the initial
        request. ``([], 0)`` when the initial request fails or matches
        nothing. If a scroll request fails midway, the messages collected
        so far are returned.
    """
    index_name = get_exact_index_name(from_time, to_time)
    initial_url = ES_URL_PRE + "{0}/{1}/_search/?scroll=2m&size={2}&search_type=scan".format(index_name, type_name, size)

    messages, counts = [], 0
    should_terms = [{"opt": "1_1"}, {"opt": "4_12"}]
    time_format = "%Y-%m-%dT%H:%M:%S"
    data = get_query_data(from_time.strftime(time_format),
                          to_time.strftime(time_format),
                          should_terms)

    # The initial response is only used for the scroll id and the total hit
    # count; the documents themselves come from the scroll requests below.
    rets = requests_post(initial_url, data, timeout=2)
    if not rets:
        return messages, counts
    scroll_id = rets.get("_scroll_id", "")
    # The fallback must be a dict: a list has no .get(), so a response
    # without "hits" would raise AttributeError instead of yielding 0.
    counts = rets.get("hits", {}).get("total", 0)
    if not counts:
        return messages, counts

    scroll_url = ES_URL_PRE + "_search/scroll?"
    while True:
        params = {'scroll_id': scroll_id, "scroll": "2m", "size": size}
        res = request_get(scroll_url, params=params, timeout=1)
        hits = res.get("hits", {}).get("hits", [])
        if not hits:
            # Either the scroll is exhausted or the request failed
            # (request_get returns {} on failure).
            break
        messages.extend(hit.get("_source", {}) for hit in hits)
        # Continue with the scroll id returned by the latest page.
        scroll_id = res.get("_scroll_id", "")
    return messages, counts


def main(from_time, to_time):
    """Fetch the "bilog" documents between from_time and to_time.

    Delegates to ``get_type_data`` with a size of 1000.

    Returns:
        The ``(messages, counts)`` tuple produced by ``get_type_data``.
    """
    return get_type_data(from_time, to_time, "bilog", size=1000)


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    # Query a one-minute window that ends one minute before the start of
    # the current minute, to allow for the delay of data arriving in ES.
    to_time = start_time.replace(second=0, microsecond=0) \
        - datetime.timedelta(minutes=1)
    from_time = to_time - datetime.timedelta(minutes=1)

    messages, counts = main(from_time, to_time)
    end_time = datetime.datetime.now()
    # Report the elapsed time; the call form of print works on both
    # Python 2 and Python 3 for a single argument.
    print(end_time - start_time)