Elasticsearch's from + size pagination stops returning results once you try to page past 10,000 hits. To query beyond that limit, here is a simple demo using the scroll API and go-elasticsearch.
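The 10,000 cap comes from the index.max_result_window setting (default 10000). You can raise it, but that only moves the ceiling and puts more pressure on the heap, so scroll is the better tool for bulk reads. For reference, a minimal sketch of raising the window anyway; the helper name is hypothetical and it assumes an *elasticsearch.Client built the same way as setElastic in the demo below:

// raiseResultWindow bumps index.max_result_window for one index (hypothetical helper, not part of the demo).
func raiseResultWindow(es *elasticsearch.Client) error {
    res, err := es.Indices.PutSettings(
        strings.NewReader(`{"index": {"max_result_window": 20000}}`),
        es.Indices.PutSettings.WithIndex("test-scroll"),
    )
    if err != nil {
        return err
    }
    defer res.Body.Close()
    if res.IsError() {
        return fmt.Errorf("settings update failed: %s", res)
    }
    return nil
}

The full scroll demo: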
package main

import (
    "bytes"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "strconv"
    "strings"
    "sync"
    "time"

    "github.com/elastic/go-elasticsearch/v8"
    "github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

func main() {
    log.SetFlags(0) // 0 disables the default log prefixes, so the output looks like fmt.Println
    var (
        batchNum int
        scrollID string
    )
    es := setElastic([]string{"http://ip:9200"})
    // Index 100 test documents into the "test-scroll" index
    log.Println("Indexing the documents...")
    for i := 1; i <= 100; i++ {
        res, err := es.Index(
            "test-scroll",
            strings.NewReader(`{"title" : "test"}`),
            es.Index.WithDocumentID(strconv.Itoa(i)),
        )
        if err != nil || res.IsError() {
            log.Fatalf("Error: %s: %s", err, res)
        }
    }
    es.Indices.Refresh(es.Indices.Refresh.WithIndex("test-scroll"))
    // Perform the initial search request to get
    // the first batch of data and the scroll ID
    //
    log.Println("Scrolling the index...")
    log.Println(strings.Repeat("-", 80))
    res, err := es.Search(
        es.Search.WithIndex("test-scroll"),
        es.Search.WithSort("_doc"),
        es.Search.WithSize(10),
        es.Search.WithScroll(time.Minute),
    )
    if err != nil {
        log.Fatalf("Error: %s", err)
    }
    if res.IsError() {
        log.Fatalf("Error response: %s", res)
    }
    // Handle the first batch of data and extract the scrollID
    //
    json := read(res.Body)
    res.Body.Close()
    scrollID = gjson.Get(json, "_scroll_id").String()
    log.Println("Batch ", batchNum)
    log.Println("ScrollID", scrollID)
    log.Println("IDs ", gjson.Get(json, "hits.hits.#._id"))
    log.Println(strings.Repeat("-", 80))
    // Perform the scroll requests in sequence
    //
    for {
        batchNum++
        // Perform the scroll request and pass the scrollID and scroll duration
        //
        res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(time.Minute))
        if err != nil {
            log.Fatalf("Error: %s", err)
        }
        if res.IsError() {
            log.Fatalf("Error response: %s", res)
        }
        json := read(res.Body)
        res.Body.Close()
        // Extract the scrollID from response
        //
        scrollID = gjson.Get(json, "_scroll_id").String()
        // Extract the search results
        //
        hits := gjson.Get(json, "hits.hits")
        // Break out of the loop when there are no results
        //
        if len(hits.Array()) < 1 {
            log.Println("Finished scrolling")
            break
        } else {
            log.Println("Batch ", batchNum)
            log.Println("ScrollID", scrollID)
            log.Println("IDs ", gjson.Get(hits.Raw, "#._id"))
            log.Println(strings.Repeat("-", 80))
        }
    }
}

func read(r io.Reader) string {
    var b bytes.Buffer
    b.ReadFrom(r)
    return b.String()
}

// sync.Once guarantees that the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
    once.Do(func() {
        cfg := elasticsearch.Config{
            Addresses: hosts,
            Transport: &http.Transport{
                MaxIdleConnsPerHost:   10,
                ResponseHeaderTimeout: 60 * time.Second,
                DisableKeepAlives:     true,
            },
        }
        var err error
        if c, err = elasticsearch.NewClient(cfg); err != nil {
            fmt.Println("ElasticSearch: " + err.Error())
            os.Exit(1)
        }
    })
    return c
}
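The demo above simply lets the scroll context expire after a minute. If you finish paging early, it's worth releasing the context explicitly so the cluster can free its resources sooner; a minimal sketch, assuming the same *elasticsearch.Client and the last scrollID from the loop:

// releaseScroll closes a scroll context once paging is done (sketch; error handling kept minimal).
func releaseScroll(es *elasticsearch.Client, scrollID string) {
    res, err := es.ClearScroll(es.ClearScroll.WithScrollID(scrollID))
    if err != nil {
        log.Printf("ClearScroll error: %s", err)
        return
    }
    defer res.Body.Close()
    if res.IsError() {
        log.Printf("ClearScroll error response: %s", res)
    }
}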
The next variant collects every hit's _source field and returns them all (this one uses go-elasticsearch/v7):
package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"
    "time"

    "github.com/elastic/go-elasticsearch/v7"
    "github.com/tidwall/gjson"
)

var c *elasticsearch.Client
var once sync.Once

// MAXNUM is the maximum number of documents to export.
const MAXNUM = 100

func main() {
    log.SetFlags(0) // 0 disables the default log prefixes, so the output looks like fmt.Println
    var (
        scrollID string
        resHits  []map[string]interface{}
    )
    es := setElastic([]string{"http://ip:9200"})
    index := []string{"a-*", "b-*", "c-*", "d-*"}
    log.Println("Scrolling the index...")
    log.Println(strings.Repeat("-", 80))
    res, err := es.Search(
        es.Search.WithIndex(index...),
        es.Search.WithSort("_id"),
        es.Search.WithSize(50),
        es.Search.WithScroll(5*time.Minute),
    )
    if err != nil {
        log.Fatalf("Error: %s", err)
    }
    if res.IsError() {
        log.Fatalf("Error response: %s", res)
    }
    rsJson := read(res.Body)
    res.Body.Close()
    scrollID = gjson.Get(rsJson, "_scroll_id").String()
    hitSource := gjson.Get(rsJson, "hits.hits.#._source")
    var tmp []map[string]interface{}
    json.Unmarshal([]byte(hitSource.Raw), &tmp)
    resHits = append(resHits, tmp...)
    // Perform the scroll requests in sequence
    //
    for i := 1; i < MAXNUM/50; i++ {
        // Perform the scroll request and pass the scrollID and scroll duration
        //
        res, err := es.Scroll(es.Scroll.WithScrollID(scrollID), es.Scroll.WithScroll(5*time.Minute))
        if err != nil {
            log.Fatalf("Error: %s", err)
        }
        if res.IsError() {
            log.Fatalf("Error response: %s", res)
        }
        rjson := read(res.Body)
        res.Body.Close()
        // Extract the scrollID from response
        //
        scrollID = gjson.Get(rjson, "_scroll_id").String()
        // Extract the search results
        //
        hits := gjson.Get(rjson, "hits.hits")
        hitSce := gjson.Get(rjson, "hits.hits.#._source")
        // Break out of the loop when there are no results
        //
        if len(hits.Array()) < 1 {
            log.Println("Finished scrolling")
            break
        } else {
            var tmp []map[string]interface{}
            json.Unmarshal([]byte(hitSce.Raw), &tmp)
            resHits = append(resHits, tmp...)
        }
    }
    fmt.Println("+++++++++++++++++++++++++++++")
    fmt.Println(len(resHits), resHits[0]["attacker_ip"])
}

func read(r io.Reader) string {
    var b bytes.Buffer
    b.ReadFrom(r)
    return b.String()
}

// sync.Once guarantees that the function passed to Do runs only once.
func setElastic(hosts []string) *elasticsearch.Client {
    once.Do(func() {
        cfg := elasticsearch.Config{
            Addresses: hosts,
            Transport: &http.Transport{
                MaxIdleConnsPerHost:   10,
                ResponseHeaderTimeout: 60 * time.Second,
                DisableKeepAlives:     true,
            },
        }
        var err error
        if c, err = elasticsearch.NewClient(cfg); err != nil {
            fmt.Println("ElasticSearch: " + err.Error())
            os.Exit(1)
        }
    })
    return c
}
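The second demo only prints the first record; in practice resHits would be written out somewhere, for example as a JSON file that an Excel export can be built from. A small sketch that reuses the encoding/json and os imports already present in the demo (the helper name and file path are illustrative, not part of the original code):

// dumpHits serializes the collected _source documents to a JSON file for offline processing.
func dumpHits(hits []map[string]interface{}, path string) error {
    data, err := json.MarshalIndent(hits, "", "  ")
    if err != nil {
        return err
    }
    return os.WriteFile(path, data, 0o644)
}

A quick comparison of the three pagination approaches: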
| Pagination method | Performance | Pros | Cons | Use case |
| --- | --- | --- | --- | --- |
| from + size | Low | Flexible and simple to implement | Deep pagination problem | Relatively small data sets where deep pagination is acceptable |
| scroll | Medium | Solves the deep pagination problem | Cannot reflect real-time changes (works on a snapshot); higher maintenance cost, since a scroll_id has to be kept alive | Exporting huge result sets (e.g. the author's case of dumping 200k ES documents to Excel); queries that must walk a massive result set |
| search_after | High | Best performance; no deep pagination problem; reflects real-time changes to the data | More complex to implement; requires a globally unique sort field; consecutive paging is harder because each query needs the previous query's results (see the sketch below) | Paginating massive data sets |
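Since the table mentions search_after without showing it, here is a hedged sketch of fetching one page with go-elasticsearch. It assumes a hypothetical globally unique, sortable numeric field named id; lastID is the sort value of the final hit from the previous page:

// searchAfterPage fetches one page of 50 hits starting right after lastID (sketch under the assumptions above).
func searchAfterPage(es *elasticsearch.Client, lastID int) (string, error) {
    body := fmt.Sprintf(`{"size": 50, "sort": [{"id": "asc"}], "search_after": [%d]}`, lastID)
    res, err := es.Search(
        es.Search.WithIndex("test-scroll"),
        es.Search.WithBody(strings.NewReader(body)),
    )
    if err != nil {
        return "", err
    }
    defer res.Body.Close()
    if res.IsError() {
        return "", fmt.Errorf("search_after failed: %s", res)
    }
    return read(res.Body), nil
}

The caller pulls the last hit's sort value out of each page (hits.hits.#.sort with gjson) and feeds it into the next call, which is exactly why consecutive pages take more wiring than scroll.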
To sum up: when paginating Elasticsearch queries, from + size cannot fetch data past 10,000 hits. This post showed, with examples, how to use the scroll API (with search_after as an alternative) to page through large result sets efficiently and retrieve all of the stored _source documents.