elasticsearch 遍历索引数据

原创已于 2023-08-22 16:11:34 修改 · 420 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#elasticsearch #大数据

于 2023-08-22 15:58:50 首次发布

本文给出一个基于 elasticsearch-py 的 Python 示例：先用带 scroll 参数的 search 请求取得第一批数据和 scroll_id，再循环调用 scroll 接口分批遍历索引中的全部文档；同时封装了 delete_by_query、update_by_query 和 bulk 批量写入等常用操作。

#-*- coding:utf8 -*-
 
from elasticsearch import Elasticsearch, helpers
import json
import pdb
 
class ElasticsearchService:
    """Small convenience wrapper around an ``Elasticsearch`` client.

    Every method is best-effort: client errors are printed and a neutral
    fallback value is returned instead of propagating the exception.
    """

    def __init__(self, hosts):
        """Create the underlying client for the given ``hosts`` list."""
        self.__elasticsearch = Elasticsearch(
            hosts,
            sniff_on_start=True,
            sniff_on_connection_fail=True,
            sniffer_timeout=60,
            timeout=30,
            retry_on_timeout=True,
            max_retries=5,
        )

    def search_scroll(self, index, doc_type, query):
        """Issue the first request of a scroll traversal.

        Args:
            index: Name of the index to search.
            doc_type: Accepted for call-site compatibility; it is not
                forwarded to the client.
            query: The complete search request body (for example a dict
                holding ``query``, ``_source`` and ``size``).

        Returns:
            The client's response, or ``{}`` if the request failed.
        """
        try:
            # The whole request body goes in ``body`` (same as the
            # *_by_query methods below), not in the ``query`` keyword.
            return self.__elasticsearch.search(
                index=index,
                body=query,
                search_type="query_then_fetch",
                scroll="1m",
            )
        except Exception as e:
            print(str(e))

        return {}

    def scroll_scan(self, scroll, scroll_id):
        """Fetch the next page of an open scroll.

        Args:
            scroll: Value passed to the client as the ``scroll`` argument
                (the caller in this file uses ``'1m'``).
            scroll_id: The ``_scroll_id`` taken from the previous response.

        Returns:
            The client's response, or ``{}`` if the request failed. An empty
            dict (rather than a list) keeps ``.get()`` usable on the result.
        """
        try:
            # Keyword arguments avoid depending on the client's positional
            # parameter order.
            return self.__elasticsearch.scroll(scroll_id=scroll_id, scroll=scroll)
        except Exception as e:
            print(str(e))

        return {}

    def delete_by_query(self, index, query):
        """Delete the documents matching ``query`` in ``index``.

        Returns:
            The ``deleted`` field of the response, or ``0`` on failure.
        """
        try:
            response = self.__elasticsearch.delete_by_query(index=index, body=query)
            return response.get('deleted')
        except Exception as e:
            print('delete fail')
            print(str(e))

        return 0

    def update_by_query(self, index, query):
        """Update the documents matching ``query`` in ``index``.

        Returns:
            The ``updated`` field of the response, or ``0`` on failure.
        """
        try:
            response = self.__elasticsearch.update_by_query(index=index, body=query)
            return response.get('updated')
        except Exception as e:
            # Report the failure instead of swallowing it silently.
            print(str(e))

        return 0

    def insert_bulk(self, data_lst):
        """Send ``data_lst`` through ``helpers.bulk`` with a 60s request timeout.

        Returns:
            Whatever ``helpers.bulk`` returns, or ``[0, []]`` on failure.
        """
        try:
            return helpers.bulk(self.__elasticsearch, data_lst, request_timeout=60)
        except Exception as e:
            print(str(e))
            return [0, []]
 
 
if __name__ == '__main__':
    # Example: walk every document of an index with the scroll API and print
    # the ``url`` field of each hit.
    es_hosts = ["IP:PORT"]
    baike_all_index = 'baike_index'
    baike_all_type = 'baike_all'
    elastic_service = ElasticsearchService(es_hosts)
    # First request: ``size`` in the body sets the batch size. The response
    # carries the first batch plus a ``_scroll_id`` that works as the cursor
    # for the next request.
    res = elastic_service.search_scroll(baike_all_index, baike_all_type, {"query": {"match_all": {}},"_source": ["url"], "size": 10000})
    # ``res`` is empty when a request failed, which also ends the loop.
    while res:
        page = (res.get('hits') or {}).get('hits') or []
        # Stop on the first empty page. ``hits.total`` is the overall match
        # count and does not shrink as pages are consumed, so it cannot be
        # used to detect the end of the traversal.
        if not page:
            break
        for hit in page:
            print(hit['_source']['url'])  # pull out the field this data set needs
        scroll_id = res.get('_scroll_id')
        if not scroll_id:
            break
        # Every follow-up request must carry the scroll id returned by the
        # previous response.
        res = elastic_service.scroll_scan(scroll='1m', scroll_id=scroll_id)