Elasticsearch 遍历索引数据

博客涉及Elasticsearch和大数据相关内容,但具体信息缺失。Elasticsearch是大数据处理中常用工具,可用于数据存储、搜索等,大数据则涵盖数据采集、分析等多方面。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

#-*- coding:utf8 -*-
 
from elasticsearch import Elasticsearch, helpers
import json
import pdb
 
class ElasticsearchService:
    """Small convenience wrapper around an ``Elasticsearch`` client.

    Every method is best-effort: client errors are caught, reported on
    stdout, and an empty/zero fallback value is returned instead of raising.
    """

    def __init__(self, hosts):
        """Create the underlying client for the given ``hosts``.

        ``hosts`` is forwarded unchanged to ``Elasticsearch(...)`` together
        with the sniffing / timeout / retry options below.
        """
        self.__elasticsearch = Elasticsearch(
            hosts,
            sniff_on_start=True,
            sniff_on_connection_fail=True,
            sniffer_timeout=60,
            timeout=30,
            retry_on_timeout=True,
            max_retries=5,
        )

    def search_scroll(self, index, doc_type, query):
        """Run the initial search of a scroll traversal.

        Args:
            index: name of the index to search.
            doc_type: accepted for interface compatibility; it is not
                forwarded to the client.
            query: the full request body (e.g. ``query``, ``_source`` and
                ``size`` keys).

        Returns:
            The client's response, or ``{}`` if the request failed.
        """
        try:
            # ``query`` is a complete request body, so it must be sent as
            # ``body`` (as delete_by_query / update_by_query already do).
            return self.__elasticsearch.search(
                index=index,
                body=query,
                search_type="query_then_fetch",
                scroll="1m",
            )
        except Exception as e:
            print(str(e))

        return {}

    def scroll_scan(self, scroll, scroll_id):
        """Fetch the next page of a scroll traversal.

        Args:
            scroll: keep-alive for the scroll context (e.g. ``"1m"``).
            scroll_id: the ``_scroll_id`` taken from the previous response.

        Returns:
            The client's response, or ``{}`` if the request failed.
        """
        try:
            # Keyword arguments: the client's first positional parameter is
            # the scroll id, so passing (scroll, scroll_id) positionally
            # would send the keep-alive string as the id.
            return self.__elasticsearch.scroll(scroll_id=scroll_id, scroll=scroll)
        except Exception as e:
            print(str(e))

        # Same empty-mapping fallback as search_scroll so callers can keep
        # using ``.get`` on the result.
        return {}

    def delete_by_query(self, index, query):
        """Delete the documents in ``index`` matched by the body ``query``.

        Returns:
            The ``deleted`` entry of the response, or ``0`` if the request
            failed.
        """
        try:
            response = self.__elasticsearch.delete_by_query(index=index, body=query)
            return response.get('deleted')
        except Exception as e:
            print('delete fail')
            print(str(e))

        return 0

    def update_by_query(self, index, query):
        """Update the documents in ``index`` matched by the body ``query``.

        Returns:
            The ``updated`` entry of the response, or ``0`` if the request
            failed.
        """
        try:
            response = self.__elasticsearch.update_by_query(index=index, body=query)
            return response.get('updated')
        except Exception as e:
            # Report the failure instead of dropping it silently, like the
            # other methods of this class.
            print(str(e))

        return 0

    def insert_bulk(self, data_lst):
        """Bulk-index the actions in ``data_lst`` via ``helpers.bulk``.

        Returns:
            Whatever ``helpers.bulk`` returns, or ``[0, []]`` if it raised.
        """
        try:
            return helpers.bulk(self.__elasticsearch, data_lst, request_timeout=60)
        except Exception as e:
            print(str(e))
            return [0, []]
 
 
if __name__ == '__main__':
    # Example: walk every document of an index with the scroll API and print
    # the ``url`` field of each hit.
    es_hosts = ["IP:PORT"]
    baike_all_index = 'baike_index'
    baike_all_type = 'baike_all'
    elastic_service = ElasticsearchService(es_hosts)
    # First request: ``size`` in the body sets the batch size. The response
    # carries the first batch plus a ``_scroll_id``, which acts as the cursor
    # for the next request.
    res = elastic_service.search_scroll(baike_all_index, baike_all_type, {"query": {"match_all": {}},"_source": ["url"], "size": 10000})
    # ``while res`` also ends the loop when a request failed and the service
    # returned its empty fallback value.
    while res:
        page = (res.get('hits') or {}).get('hits') or []
        # Stop on the first empty page. ``hits.total`` is the overall match
        # count and does not shrink while scrolling, so it cannot be used as
        # the termination condition.
        if not page:
            break
        for hit in page:
            print(hit['_source']['url'])  # extract the field of interest
        scroll_id = res.get('_scroll_id')
        if not scroll_id:
            break
        # Every follow-up request must carry the scroll id returned by the
        # previous response.
        res = elastic_service.scroll_scan(scroll='1m', scroll_id=scroll_id)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

会发paper的学渣

您的鼓励将是我前进的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值