ES pagination: from + size, scroll, and search_after

When paginating query results in Elasticsearch, the from + size approach cannot reach anything beyond the first 10,000 documents. This article walks through examples of using the scroll API and the search_after parameter to page through data efficiently and to retrieve all of the stored _source information.

With from + size pagination, ES stops returning results once from + size exceeds 10,000 (the default index.max_result_window).
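To see the limit in action, here is a minimal sketch (assuming the same cluster address and the "test-scroll" index used in the demo below) that asks for a page past 10,000 with from + size; by default Elasticsearch rejects the request and the error message points at index.max_result_window.

package main

import (
	"log"

	"github.com/elastic/go-elasticsearch/v8"
)

func main() {
	log.SetFlags(0)

	es, err := elasticsearch.NewClient(elasticsearch.Config{
		Addresses: []string{"http://ip:9200"},
	})
	if err != nil {
		log.Fatalf("Error creating the client: %s", err)
	}

	// from=10000, size=10 asks for documents 10001..10010, which exceeds the
	// default index.max_result_window of 10000, so the request is rejected
	// regardless of how many documents the index actually holds.
	res, err := es.Search(
		es.Search.WithIndex("test-scroll"),
		es.Search.WithFrom(10000),
		es.Search.WithSize(10),
	)
	if err != nil {
		log.Fatalf("Error: %s", err)
	}
	defer res.Body.Close()

	// Expect a 400 response complaining that "Result window is too large,
	// from + size must be less than or equal to: [10000] ...".
	log.Println(res.String())
}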

To query beyond the 10,000 mark, here is a simple demo using scroll and go-elasticsearch:

package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/elastic/go-elasticsearch/v8"
	"github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

func main() {
	log.SetFlags(0) // 0 removes log's default prefixes, so the output looks the same as fmt's println()

	var (
		batchNum int
		scrollID string
	)

	es := setElastic([]string{"http://ip:9200"})

	// Index 100 test documents into the "test-scroll" index
	log.Println("Indexing the documents...")
	for i := 1; i <= 100; i++ {
		res, err := es.Index(
			"test-scroll",
			strings.NewReader(`{"title" : "test"}`),
			es.Index.WithDocumentID(strconv.Itoa(i)),
		)
		if err != nil || res.IsError() {
			log.Fatalf("Error: %s: %s", err, res)
		}
	}
	es.Indices.Refresh(es.Indices.Refresh.WithIndex("test-scroll"))

	// Perform the initial search request to get
	// the first batch of data and the scroll ID
	//
	log.Println("Scrolling the index...")
	log.Println(strings.Repeat("-", 80))
	res, err := es.Search(
		es.Search.WithIndex("test-scroll"),
		es.Search.WithSort("_doc"),
		es.Search.WithSize(10),
		es.Search.WithScroll(time.Minute),
	)
	if err != nil {
		log.Fatalf("Error: %s", err)
	}

	// Handle the first batch of data and extract the scrollID
	//
	json := read(res.Body)
	res.Body.Close()

	scrollID = gjson.Get(json, "_scroll_id").String()

	log.Println("Batch   ", batchNum)
	log.Println("ScrollID", scrollID)
	log.Println("IDs     ", gjson.Get(json, "hits.hits.#._id"))
	log.Println(strings.Repeat("-", 80))

	// Perform the scroll requests in sequence
	//
	for {
		batchNum++

		// Perform the scroll request and pass the scrollID and scroll duration
		//
		res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(time.Minute))
		if err != nil {
			log.Fatalf("Error: %s", err)
		}
		if res.IsError() {
			log.Fatalf("Error response: %s", res)
		}

		json := read(res.Body)
		res.Body.Close()

		// Extract the scrollID from response
		//
		scrollID = gjson.Get(json, "_scroll_id").String()

		// Extract the search results
		//
		hits := gjson.Get(json, "hits.hits")

		// Break out of the loop when there are no results
		//
		if len(hits.Array()) < 1 {
			log.Println("Finished scrolling")
			break
		} else {
			log.Println("Batch   ", batchNum)
			log.Println("ScrollID", scrollID)
			log.Println("IDs     ", gjson.Get(hits.Raw, "#._id"))
			log.Println(strings.Repeat("-", 80))
		}
	}
}

func read(r io.Reader) string {
	var b bytes.Buffer
	b.ReadFrom(r)
	return b.String()
}

// setElastic creates the client once: sync.Once has a Do method, and Go guarantees the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
	once.Do(func() {
		cfg := elasticsearch.Config{
			Addresses: hosts,
			Transport: &http.Transport{
				MaxIdleConnsPerHost:   10,
				ResponseHeaderTimeout: 60 * time.Second,
				DisableKeepAlives:     true,
			},
		}
		var err error
		if c, err = elasticsearch.NewClient(cfg); err != nil {
			fmt.Println("ElasticSearch: " + err.Error())
			os.Exit(1)
		}
	})
	return c
}
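One small thing worth adding when the loop finishes: the scroll context can be released explicitly instead of being left to expire with the one-minute keep-alive. A minimal snippet, meant to sit at the end of main in the demo above (it reuses the es client and the last scrollID):

	// Release the server-side scroll context explicitly so the cluster does
	// not have to keep the search context open until the keep-alive expires.
	clearRes, err := es.ClearScroll(es.ClearScroll.WithScrollID(scrollID))
	if err != nil {
		log.Fatalf("Error clearing scroll: %s", err)
	}
	clearRes.Body.Close()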

 

Storing all the _source information from the hits and returning it:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/elastic/go-elasticsearch/v7"
	"github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

// MAXNUM is the maximum number of documents to export
const MAXNUM = 100

func main() {
	log.SetFlags(0) // 0 removes log's default prefixes, so the output looks the same as fmt's println()

	var (
		scrollID string
		resHits  []map[string]interface{}
	)

	es := setElastic([]string{"http://ip:9200"})
	index := []string{"a-*", "b-*", "c-*", "d-*"}
	log.Println("Scrolling the index...")
	log.Println(strings.Repeat("-", 80))
	res, err := es.Search(
		es.Search.WithIndex(index...),
		es.Search.WithSort("_id"),
		es.Search.WithSize(50),
		es.Search.WithScroll(5*time.Minute),
	)
	if err != nil {
		log.Fatalf("Error: %s", err)
	}

	rsJson := read(res.Body)
	res.Body.Close()
	scrollID = gjson.Get(rsJson, "_scroll_id").String()

	hitSource := gjson.Get(rsJson, "hits.hits.#._source")
	var tmp []map[string]interface{}
	json.Unmarshal([]byte(hitSource.Raw), &tmp)
	resHits = append(resHits, tmp...)

	// Perform the scroll requests in sequence. The initial search above already
	// returned the first batch of 50, so MAXNUM/50-1 more scroll requests are
	// enough to reach MAXNUM documents.
	//
	for i := 1; i < MAXNUM/50; i++ {

		// Perform the scroll request and pass the scrollID and scroll duration
		//
		res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(5*time.Minute))
		if err != nil {
			log.Fatalf("Error: %s", err)
		}
		if res.IsError() {
			log.Fatalf("Error response: %s", res)
		}

		rjson := read(res.Body)
		res.Body.Close()

		// Extract the scrollID from response
		//
		scrollID = gjson.Get(rjson, "_scroll_id").String()

		// Extract the search results
		//
		hits := gjson.Get(rjson, "hits.hits")
		hitSce := gjson.Get(rjson, "hits.hits.#._source")
		// Break out of the loop when there are no results
		//
		if len(hits.Array()) < 1 {
			log.Println("Finished scrolling")
			break
		} else {
			var tmp []map[string]interface{}
			json.Unmarshal([]byte(hitSce.Raw), &tmp)
			resHits = append(resHits, tmp...)
		}
	}

	fmt.Println("+++++++++++++++++++++++++++++")
	fmt.Println(len(resHits), resHits[0]["attacker_ip"])
}

func read(r io.Reader) string {
	var b bytes.Buffer
	b.ReadFrom(r)
	return b.String()
}

// setElastic creates the client once: sync.Once has a Do method, and Go guarantees the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
	once.Do(func() {
		cfg := elasticsearch.Config{
			Addresses: hosts,
			Transport: &http.Transport{
				MaxIdleConnsPerHost:   10,
				ResponseHeaderTimeout: 60 * time.Second,
				DisableKeepAlives:     true,
			},
		}
		var err error
		if c, err = elasticsearch.NewClient(cfg); err != nil {
			fmt.Println("ElasticSearch: " + err.Error())
			os.Exit(1)
		}
	})
	return c
}
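map[string]interface{} works, but when the documents have a known shape it is usually nicer to unmarshal _source into a typed struct. Below is a minimal sketch of that idea; the Event type and its attacker_ip field are only an assumption based on the field printed above, so adjust them to the real mapping.

package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Event mirrors the parts of _source we care about. attacker_ip is the only
// field taken from the demo above; the rest of the mapping is unknown.
type Event struct {
	AttackerIP string `json:"attacker_ip"`
}

func main() {
	// hitSource.Raw from gjson is a JSON array of _source objects; a literal
	// stand-in is used here so the sketch runs on its own.
	raw := `[{"attacker_ip":"10.0.0.1"},{"attacker_ip":"10.0.0.2"}]`

	var events []Event
	if err := json.Unmarshal([]byte(raw), &events); err != nil {
		log.Fatalf("Error unmarshalling _source: %s", err)
	}
	fmt.Println(len(events), events[0].AttackerIP)
}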

 

Comparison of the three pagination approaches:

from + size
  Pros: flexible and simple to implement
  Cons: the deep pagination problem (capped at 10,000 by default)
  Use cases: relatively small data sets that can tolerate the deep pagination limit

scroll
  Pros: solves the deep pagination problem
  Cons: cannot reflect real-time changes to the data (it works on a snapshot); higher maintenance cost, since a scroll_id has to be kept and passed along
  Use cases: exporting massive data sets (for example, the author recently had to export 200k documents from ES into Excel); queries that need to walk through a huge result set

search_after
  Pros: best performance; no deep pagination problem; reflects real-time changes to the data
  Cons: more complex to implement, requires a globally unique sort field; continuous paging is awkward because each query depends on the results of the previous one
  Use cases: paginating through massive data sets
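The demos above only cover scroll, so for comparison here is a minimal search_after sketch. It assumes the index contains a globally unique, sortable field; the doc_id field used below is hypothetical and stands in for whatever unique field (timestamp plus tiebreaker, sequence number, etc.) the real documents have. go-elasticsearch has no dedicated option for search_after, so it goes into the request body together with the sort.

package main

import (
	"bytes"
	"encoding/json"
	"log"
	"strings"

	"github.com/elastic/go-elasticsearch/v8"
	"github.com/tidwall/gjson"
)

func main() {
	log.SetFlags(0)

	es, err := elasticsearch.NewClient(elasticsearch.Config{
		Addresses: []string{"http://ip:9200"},
	})
	if err != nil {
		log.Fatalf("Error creating the client: %s", err)
	}

	// searchAfter holds the sort values of the last hit of the previous page;
	// it is empty on the first request.
	var searchAfter []interface{}

	for {
		// Sort on a unique field ("doc_id" is hypothetical) and, from the
		// second page on, resume after the last seen sort values.
		body := map[string]interface{}{
			"size": 10,
			"sort": []map[string]string{{"doc_id": "asc"}},
		}
		if searchAfter != nil {
			body["search_after"] = searchAfter
		}
		payload, _ := json.Marshal(body)

		res, err := es.Search(
			es.Search.WithIndex("test-scroll"),
			es.Search.WithBody(bytes.NewReader(payload)),
		)
		if err != nil {
			log.Fatalf("Error: %s", err)
		}
		var buf bytes.Buffer
		buf.ReadFrom(res.Body)
		res.Body.Close()
		rjson := buf.String()

		hits := gjson.Get(rjson, "hits.hits").Array()
		if len(hits) == 0 {
			log.Println("Finished paging")
			break
		}
		log.Println("IDs", gjson.Get(rjson, "hits.hits.#._id"))
		log.Println(strings.Repeat("-", 80))

		// The sort values of the last hit become search_after for the next page.
		searchAfter = nil
		for _, v := range hits[len(hits)-1].Get("sort").Array() {
			searchAfter = append(searchAfter, v.Value())
		}
	}
}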