Elasticsearch:突破默认只取回10条的限制,使用 scroll(helpers.scan)连续取回全部结果

本文介绍了一种使用Python批量处理Elasticsearch中大量数据的方法,通过scan API遍历所有文档,结合MongoDB进行数据验证,并对无效数据进行删除操作。此过程包括了配置Elasticsearch客户端、定义查询体、执行滚动搜索以及处理搜索结果。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import random
import json
import datetime
import time
from pymongo import MongoClient
from pymongo import MongoClient, ASCENDING, UpdateOne, InsertOne, DeleteOne, ReplaceOne
from pymongo.errors import BulkWriteError
from io import BytesIO
import pymysql.cursors
from bson.objectid import ObjectId
from bson import json_util as jsonb
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers


import logging

# The logging default level is WARNING (severity order:
# CRITICAL > ERROR > WARNING > INFO > DEBUG > NOTSET); here it is lowered to
# DEBUG and records are appended to the cron log file.
logging.basicConfig(
    filename="/root/backend/db/cront.log",
    filemode="a",
    level=logging.DEBUG,
    format="%(asctime)s %(name)s [line:%(lineno)d] %(levelname)s %(message)s",
    datefmt='%a, %d %b %Y %H:%M:%S',
)

# NOTE(review): hosts and credentials are hard-coded in source; consider
# loading them from configuration or the environment instead.
# `dddd` is not referenced by the visible code in this file.
dddd = MongoClient(
    '120.133.26.118:20002',
    username='xwk',
    password='495vvFul015dV0cV',
)
mongo_post = MongoClient(
    'dds-2ze197183eba5c941.mongodb.rds.aliyuncs.com:3717',
    username='root',
    password='lyp82nLF',
)
es = Elasticsearch(
    ['es-cn-xxxx.elasticsearch.aliyuncs.com'],
    http_auth=('elastic', 'xxxx'),
    port=9200,
    timeout=50000,
)


# TODO: run this as a scheduled (cron) job.
def delete_es_posts():
    """Delete Elasticsearch posts that no longer exist in MongoDB.

    Iterates over every document of the ``goodlook`` index (doc type
    ``post``) with ``helpers.scan`` (scroll based), because a plain
    ``es.search`` returns only 10 hits by default. For each hit, the ``_id``
    is looked up in the ``admin.post`` MongoDB collection; when no matching
    document is found, the Elasticsearch document is deleted.

    The start and end timestamps are logged and printed, followed by the
    elapsed wall-clock time in seconds.

    Uses the module-level ``es`` and ``mongo_post`` clients. Returns None.
    """
    # Keep the numeric timestamps: the formatted strings cannot be subtracted
    # to obtain the run duration.
    started_at = time.time()
    current_start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(started_at))
    logging.info("delete posts ok start-%s", current_start)
    print("start:", current_start)

    # Index setting noted by the original author:
    # {"index": {"max_result_window": 1000000}}

    query_body = {
        "query": {
            "match_all": {}
        }
    }

    # A single es.search(...) call is limited in the number of hits it
    # returns (10 by default), so the scroll-based scan helper is used to
    # walk the whole index instead.
    # clear_scroll defaults to True; it is passed explicitly here.
    _searched = helpers.scan(
        client=es,
        query=query_body,
        scroll='10m',
        index='goodlook',
        doc_type='post',
        timeout='10m',
        size=2000,
        clear_scroll=True
    )

    # Loop-invariant: resolve the MongoDB collection once.
    post_collection = mongo_post['admin']['post']

    for search in _searched:
        # Example hit:
        # {'_index': 'goodlook', '_type': 'post',
        #  '_id': '5c83779c8443a458eba30749', '_score': None,
        #  '_source': {...}, 'sort': [11206]}
        post_id = search.get("_id")
        one_post = post_collection.find_one({"_id": ObjectId(str(post_id))})
        print("es_post_id, one_post:", post_id, one_post)
        if one_post is None:
            # The post is gone from MongoDB, so drop it from the index too.
            del_ret = es.delete(index="goodlook", doc_type="post", id=post_id)
            print("delete es post:", post_id)
            print("delete es status:", del_ret.get("result"))
            print("---------------------------------------------------------------")

    finished_at = time.time()
    current_end = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(finished_at))
    logging.info("delete posts ok over-%s", current_end)
    print("end:", current_end)
    # Elapsed seconds; subtracting the two formatted strings raised TypeError.
    print("sustained time:", "%.2fs" % (finished_at - started_at))

# Module-level call: the cleanup runs as soon as this file is executed
# (and also whenever it is imported).
delete_es_posts()

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers
# Minimal standalone example: walk every document of an index with the
# scroll-based helpers.scan.
es = Elasticsearch(
    ['es-cn-xxxx.elasticsearch.aliyuncs.com'],
    http_auth=('elastic', 'xxxx'),
    port=9200,
    timeout=50000,
)

# Match every document in the index.
query_body = {"query": {"match_all": {}}}

_searched = helpers.scan(
    client=es,
    index='goodlook',
    doc_type='post',
    query=query_body,
    size=2000,
    scroll='10m',
    timeout='10m',
    clear_scroll=True,
)

for search in _searched:
    # Placeholder: process each hit here.
    pass
  

  

 

转载于:https://www.cnblogs.com/adamans/articles/10701680.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值