Elasticsearch's from + size pagination stops returning results once you try to page past 10,000 hits. To query beyond that limit, here is a simple demo using the scroll API and go-elasticsearch.
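The 10,000 cap comes from the index.max_result_window setting (default 10000). You can raise it, but that only moves the ceiling and puts more pressure on the heap, so scroll is the better tool for bulk reads. For reference, a minimal sketch of raising the window anyway; the helper name is hypothetical and it assumes an *elasticsearch.Client built the same way as setElastic in the demo below:

// raiseResultWindow bumps index.max_result_window for one index (hypothetical helper, not part of the demo).
func raiseResultWindow(es *elasticsearch.Client) error {
    res, err := es.Indices.PutSettings(
        strings.NewReader(`{"index": {"max_result_window": 20000}}`),
        es.Indices.PutSettings.WithIndex("test-scroll"),
    )
    if err != nil {
        return err
    }
    defer res.Body.Close()
    if res.IsError() {
        return fmt.Errorf("settings update failed: %s", res)
    }
    return nil
}

The full scroll demo: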
package main

import (
    "bytes"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/elastic/go-elasticsearch/v8"
    "github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

func main() {
    log.SetFlags(0) // 0 disables the default log prefixes, so the output looks like fmt.Println
    var (
        batchNum int
        scrollID string
    )
    es := setElastic([]string{"http://ip:9200"})
    // Index 100 test documents into the "test-scroll" index
    log.Println("Indexing the documents...")
    for i := 1; i <= 100; i++ {
        res, err := es.Index(
            "test-scroll",
            strings.NewReader(`{"title" : "test"}`),
            es.Index.WithDocumentID(strconv.Itoa(i)),
        )
        if err != nil || res.IsError() {
            log.Fatalf("Error: %s: %s", err, res)
        }
    }
    es.Indices.Refresh(es.Indices.Refresh.WithIndex("test-scroll"))
    // Perform the initial search request to get
    // the first batch of data and the scroll ID
    //
    log.Println("Scrolling the index...")
    log.Println(strings.Repeat("-", 80))
    res, err := es.Search(
        es.Search.WithIndex("test-scroll"),
        es.Search.WithSort("_doc"),
        es.Search.WithSize(10),
        es.Search.WithScroll(time.Minute),
    )
    if err != nil {
        log.Fatalf("Error: %s", err)
    }
    if res.IsError() {
        log.Fatalf("Error response: %s", res)
    }
    // Handle the first batch of data and extract the scrollID
    //
    json := read(res.Body)
    res.Body.Close()
    scrollID = gjson.Get(json, "_scroll_id").String()
    log.Println("Batch ", batchNum)
    log.Println("ScrollID", scrollID)
    log.Println("IDs ", gjson.Get(json, "hits.hits.#._id"))
    log.Println(strings.Repeat("-", 80))
    // Perform the scroll requests in sequence
    //
    for {
        batchNum++
        // Perform the scroll request and pass the scrollID and scroll duration
        //
        res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(time.Minute))
        if err != nil {
            log.Fatalf("Error: %s", err)
        }
        if res.IsError() {
            log.Fatalf("Error response: %s", res)
        }
        json := read(res.Body)
        res.Body.Close()
        // Extract the scrollID from response
        //
        scrollID = gjson.Get(json, "_scroll_id").String()
        // Extract the search results
        //
        hits := gjson.Get(json, "hits.hits")
        // Break out of the loop when there are no results
        //
        if len(hits.Array()) < 1 {
            log.Println("Finished scrolling")
            break
        } else {
            log.Println("Batch ", batchNum)
            log.Println("ScrollID", scrollID)
            log.Println("IDs ", gjson.Get(hits.Raw, "#._id"))
            log.Println(strings.Repeat("-", 80))
        }
    }
}

func read(r io.Reader) string {
    var b bytes.Buffer
    b.ReadFrom(r)
    return b.String()
}

// sync.Once guarantees that the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
    once.Do(func() {
        cfg := elasticsearch.Config{
            Addresses: hosts,
            Transport: &http.Transport{
                MaxIdleConnsPerHost:   10,
                ResponseHeaderTimeout: 60 * time.Second,
                DisableKeepAlives:     true,
            },
        }
        var err error
        if c, err = elasticsearch.NewClient(cfg); err != nil {
            fmt.Println("ElasticSearch: " + err.Error())
            os.Exit(1)
        }
    })
    return c
}
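The demo above simply lets the scroll context expire after a minute. If you finish paging early, it's worth releasing the context explicitly so the cluster can free its resources sooner; a minimal sketch, assuming the same *elasticsearch.Client and the last scrollID from the loop:

// releaseScroll closes a scroll context once paging is done (sketch; error handling kept minimal).
func releaseScroll(es *elasticsearch.Client, scrollID string) {
    res, err := es.ClearScroll(es.ClearScroll.WithScrollID(scrollID))
    if err != nil {
        log.Printf("ClearScroll error: %s", err)
        return
    }
    defer res.Body.Close()
    if res.IsError() {
        log.Printf("ClearScroll error response: %s", res)
    }
}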
The next variant collects every hit's _source field and returns them all (this one uses go-elasticsearch/v7):
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"
    "time"

    "github.com/elastic/go-elasticsearch/v7"
    "github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

// MAXNUM is the maximum number of documents to export.
const MAXNUM = 100

func main() {
    log.SetFlags(0) // 0 disables the default log prefixes, so the output looks like fmt.Println
    var (
        scrollID string
        resHits  []map[string]interface{}
    )
    es := setElastic([]string{"http://ip:9200"})
    index := []string{"a-*", "b-*", "c-*", "d-*"}
    log.Println("Scrolling the index...")
    log.Println(strings.Repeat("-", 80))
    res, err := es.Search(
        es.Search.WithIndex(index...),
        es.Search.WithSort("_id"),
        es.Search.WithSize(50),
        es.Search.WithScroll(5*time.Minute),
    )
    if err != nil {
        log.Fatalf("Error: %s", err)
    }
    if res.IsError() {
        log.Fatalf("Error response: %s", res)
    }
    rsJson := read(res.Body)
    res.Body.Close()
    scrollID = gjson.Get(rsJson, "_scroll_id").String()
    hitSource := gjson.Get(rsJson, "hits.hits.#._source")
    var tmp []map[string]interface{}
    json.Unmarshal([]byte(hitSource.Raw), &tmp)
    resHits = append(resHits, tmp...)
    // Perform the scroll requests in sequence
    //
    for i := 1; i < MAXNUM/50; i++ {
        // Perform the scroll request and pass the scrollID and scroll duration
        //
        res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(5*time.Minute))
        if err != nil {
            log.Fatalf("Error: %s", err)
        }
        if res.IsError() {
            log.Fatalf("Error response: %s", res)
        }
        rjson := read(res.Body)
        res.Body.Close()
        // Extract the scrollID from response
        //
        scrollID = gjson.Get(rjson, "_scroll_id").String()
        // Extract the search results
        //
        hits := gjson.Get(rjson, "hits.hits")
        hitSce := gjson.Get(rjson, "hits.hits.#._source")
        // Break out of the loop when there are no results
        //
        if len(hits.Array()) < 1 {
            log.Println("Finished scrolling")
            break
        } else {
            var tmp []map[string]interface{}
            json.Unmarshal([]byte(hitSce.Raw), &tmp)
            resHits = append(resHits, tmp...)
        }
    }
    fmt.Println("+++++++++++++++++++++++++++++")
    fmt.Println(len(resHits), resHits[0]["attacker_ip"])
}

func read(r io.Reader) string {
    var b bytes.Buffer
    b.ReadFrom(r)
    return b.String()
}

// sync.Once guarantees that the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
    once.Do(func() {
        cfg := elasticsearch.Config{
            Addresses: hosts,
            Transport: &http.Transport{
                MaxIdleConnsPerHost:   10,
                ResponseHeaderTimeout: 60 * time.Second,
                DisableKeepAlives:     true,
            },
        }
        var err error
        if c, err = elasticsearch.NewClient(cfg); err != nil {
            fmt.Println("ElasticSearch: " + err.Error())
            os.Exit(1)
        }
    })
    return c
}
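The second demo only prints the first record; in practice resHits would be written out somewhere, for example as a JSON file that an Excel export can be built from. A small sketch that reuses the encoding/json and os imports already present in the demo (the helper name and file path are illustrative, not part of the original code):

// dumpHits serializes the collected _source documents to a JSON file for offline processing.
func dumpHits(hits []map[string]interface{}, path string) error {
    data, err := json.MarshalIndent(hits, "", "  ")
    if err != nil {
        return err
    }
    return os.WriteFile(path, data, 0o644)
}

A quick comparison of the three pagination approaches: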
| Pagination method | Performance | Pros | Cons | Use case |
| --- | --- | --- | --- | --- |
| from + size | Low | Flexible and simple to implement | Deep pagination problem | Relatively small data sets where deep pagination is acceptable |
| scroll | Medium | Solves the deep pagination problem | Cannot reflect real-time changes (works on a snapshot); higher maintenance cost, since a scroll_id has to be kept alive | Exporting huge result sets (e.g. the author's case of dumping 200k ES documents to Excel); queries that must walk a massive result set |
| search_after | High | Best performance; no deep pagination problem; reflects real-time changes to the data | More complex to implement; requires a globally unique sort field; consecutive paging is harder because each query needs the previous query's results (see the sketch below) | Paginating massive data sets |
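Since the table mentions search_after without showing it, here is a hedged sketch of fetching one page with go-elasticsearch. It assumes a hypothetical globally unique, sortable numeric field named id; lastID is the sort value of the final hit from the previous page:

// searchAfterPage fetches one page of 50 hits starting right after lastID (sketch under the assumptions above).
func searchAfterPage(es *elasticsearch.Client, lastID int) (string, error) {
    body := fmt.Sprintf(`{"size": 50, "sort": [{"id": "asc"}], "search_after": [%d]}`, lastID)
    res, err := es.Search(
        es.Search.WithIndex("test-scroll"),
        es.Search.WithBody(strings.NewReader(body)),
    )
    if err != nil {
        return "", err
    }
    defer res.Body.Close()
    if res.IsError() {
        return "", fmt.Errorf("search_after failed: %s", res)
    }
    return read(res.Body), nil
}

The caller pulls the last hit's sort value out of each page (hits.hits.#.sort with gjson) and feeds it into the next call, which is exactly why consecutive pages take more wiring than scroll.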
To sum up: when paginating Elasticsearch queries, from + size cannot fetch data past 10,000 hits. This post showed, with examples, how to use the scroll API (with search_after as an alternative) to page through large result sets efficiently and retrieve all of the stored _source documents.