The requirement, roughly: push the logging system's log messages in near real time so that other colleagues can query them. At any given moment we query the previous minute's data; because the data volume is large, we use Elasticsearch's scroll_scan approach.
Code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Demo: querying data from ES.
Because ES data arrives with some delay, we work on the data from one minute ago.
"""
import sys
import os
import requests
setting_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, setting_path)
import json
import datetime
import logging
logger = logging.getLogger('demo')
ES_URL_PRE = "http://XXX/es/"
def request_get(url, params, timeout=2):
    """Unified wrapper around requests.get, with exception handling.
    Note: intended for CGI endpoints that return a JSON string.
    """
    try:
        r = requests.get(url=url, params=params, timeout=timeout)
        if r.ok:
            ret = r.json()
            return ret
        else:
            logger.error('{0} failed, code: {1}, cause: {2}'
                         .format(url, r.status_code, r.text[:200]))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s' % (url, ))
    except requests.exceptions.RequestException:
        logger.exception('request {0} error'.format(url))
    return {}
def requests_post(url, data, timeout=2):
    """Unified wrapper around requests.post, with exception handling.
    Note: intended for CGI endpoints that return a JSON string.
    """
    try:
        r = requests.post(url=url, data=data, timeout=timeout)
        if r.ok:
            ret = r.json()
            return ret
        else:
            logger.error('{0} failed, code: {1}, cause: {2}'
                         .format(url, r.status_code, r.text))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s' % (url, ))
    except Exception:
        logger.exception('request {0} error'.format(url))
    return {}
def gen_index(date, name="logstash"):
    return '{0}-{1}'.format(name, date.strftime('%Y.%m.%d'))
def get_exact_index_name(from_time, to_time, name="logstash"):
    """Build the comma-separated list of index names covering [from_time, to_time).

    Logstash index names are based on UTC dates while the input times are
    local (UTC+8), so shift by 8 hours before deriving the dates.
    """
    from_time -= datetime.timedelta(hours=8)
    to_time -= datetime.timedelta(hours=8)
    # Use the calendar-date difference; subtracting .day values breaks across month boundaries.
    days = (to_time.date() - from_time.date()).days
    if days >= 1:
        indexes = []
        for idx in range(days + 1):
            indexes.append(gen_index(from_time + datetime.timedelta(days=idx), name))
        index_name = ",".join(indexes)
    else:
        index_name = gen_index(to_time, name)
    return index_name
def get_query_data(from_time, to_time, should_terms):
    """Build the query body: a time-range query plus OR'ed term filters (ES 1.x "filtered" syntax)."""
    should = [{"term": item} for item in should_terms]
    query_template = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "should": should
                    }
                },
                "query": {
                    "range": {
                        "@timestamp": {
                            "gte": from_time,
                            "lt": to_time,
                            "time_zone": "+08:00"
                        }
                    }
                }
            }
        }
    }
    return json.dumps(query_template)
def get_type_data(from_time, to_time, type_name, size=500):
    """Scan + scroll over the time window for the given type.

    The initial search_type=scan request returns no hits, only a _scroll_id
    and the total count; documents are then fetched in batches through the
    scroll endpoint until an empty batch comes back.
    """
    index_name = get_exact_index_name(from_time, to_time)
    initial_url = ES_URL_PRE + "{0}/{1}/_search?scroll=2m&size={2}&search_type=scan".format(
        index_name, type_name, size)
    messages, counts = [], 0
    should_terms = [{"opt": "1_1"}, {"opt": "4_12"}]
    data = get_query_data(from_time.strftime("%Y-%m-%dT%H:%M:%S"),
                          to_time.strftime("%Y-%m-%dT%H:%M:%S"),
                          should_terms)
    rets = requests_post(initial_url, data, timeout=2)
    if not rets:
        return messages, counts
    scroll_id = rets.get("_scroll_id", "")
    counts = rets.get("hits", {}).get("total", 0)  # "hits" is a dict, so the default must be {}
    if not counts:
        return messages, counts
    scroll_url = ES_URL_PRE + "_search/scroll"
    while True:
        params = {'scroll_id': scroll_id, "scroll": "2m", "size": size}
        res = request_get(scroll_url, params=params, timeout=1)
        hits = res.get("hits", {}).get("hits", [])
        if not hits:
            break
        for hit in hits:
            messages.append(hit.get("_source", {}))
        scroll_id = res.get("_scroll_id", "")
    return messages, counts
def main(from_time, to_time):
    type_name = "bilog"
    size = 1000
    messages, counts = get_type_data(from_time, to_time, type_name, size=size)
    return messages, counts
if __name__ == "__main__":
    start_time = datetime.datetime.now()
    # Round down to the whole minute, then step back one minute to allow for ES indexing delay.
    to_time = start_time.replace(second=0, microsecond=0) \
        - datetime.timedelta(minutes=1)
    from_time = to_time - datetime.timedelta(minutes=1)
    messages, counts = main(from_time, to_time)
    end_time = datetime.datetime.now()
    print end_time - start_time
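To meet the near-real-time push requirement described above, the script can simply be driven once per minute. Below is a minimal sketch under that assumption; push_to_consumers() is a hypothetical stand-in for whatever delivery channel (message queue, HTTP callback, etc.) the downstream consumers actually use, and is not part of the demo:
import time

def run_forever(interval=60):
    """Every `interval` seconds, query the previous whole minute and hand the results off."""
    while True:
        now = datetime.datetime.now()
        to_time = now.replace(second=0, microsecond=0) - datetime.timedelta(minutes=1)
        from_time = to_time - datetime.timedelta(minutes=1)
        messages, counts = main(from_time, to_time)
        push_to_consumers(messages)  # hypothetical delivery function, replace with your own
        time.sleep(interval)
Running the script from cron once a minute would achieve the same effect without keeping a long-lived process around.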