Elasticsearch is a real-time distributed search and analytics engine, used for fast full-text search, structured search, analytics, and combinations of the three. How to use Elasticsearch itself is beyond the scope of this article; see Elasticsearch: The Definitive Guide for a thorough introduction.
This article shows how to read from and write to Elasticsearch with Spark, developed and tested against the following versions:
scala-version: 2.11.8
spark-version: 2.2.0
hadoop-version: 2.7.0
Let's get started.
1. Add the Maven Dependencies
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.0</version>
  </dependency>
  <dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>6.2.4</version>
  </dependency>
</dependencies>
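elasticsearch-hadoop bundles the Spark integration (the org.elasticsearch.spark package used below). If you build with sbt instead of Maven, the equivalent declarations look roughly like this (same coordinates as above; only the sbt syntax is assumed):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.2.0",          // resolves to spark-core_2.11
  "org.elasticsearch" % "elasticsearch-hadoop" % "6.2.4"
)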
2. Write Data to Elasticsearch
2.1 Writing an RDD of Maps
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

object ES2Spark {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ES2Spark")
      .set("es.index.auto.create", "true") // create the target index if it does not exist
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    val numbers = Map("one" -> 1, "two" -> 2, "three" -> 3)
    val airports = Map("arrival" -> "Otopeni", "SFO" -> "San Fran")
    // each Map becomes one document under the spark index, docs type
    sc.makeRDD(Seq(numbers, airports)).saveToEs("spark/docs")
    sc.stop()
  }
}
Start Elasticsearch in the background first (bin/elasticsearch -d), then run the program. The result can be viewed at http://127.0.0.1:9200/spark/docs/_search.
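The connector can also serialize case classes directly, with field names becoming JSON keys. A minimal sketch, assuming a made-up Trip case class and a spark/trips target:

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

// hypothetical document type, used only for illustration
case class Trip(departure: String, arrival: String)

object CaseClass2Es {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("CaseClass2Es")
      .set("es.index.auto.create", "true")
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    // each Trip instance becomes one document in spark/trips
    sc.makeRDD(Seq(Trip("OTP", "SFO"), Trip("MUC", "OTP"))).saveToEs("spark/trips")
    sc.stop()
  }
}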
2.2 Writing an RDD of JSON Strings
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

object ES2Spark {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ES2Spark")
      .set("es.index.auto.create", "true")
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    val json1 = """{"name":"jack", "age":24, "sex":"man"}"""
    val json2 = """{"name":"rose", "age":22, "sex":"woman"}"""
    val rddData = sc.makeRDD(Seq(json1, json2))
    // let Elasticsearch generate the document ids
    EsSpark.saveJsonToEs(rddData, "spark/json")
    // custom id: use the "name" field of each document as its id
    EsSpark.saveJsonToEs(rddData, "spark/json", Map("es.mapping.id" -> "name"))
    sc.stop()
  }
}
The result can be viewed at http://127.0.0.1:9200/spark/json/_search.
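When the id is computed on the fly rather than stored in the document, EsSpark.saveToEsWithMeta takes an RDD of (metadata, document) pairs and uses the key of each pair as the document id. A minimal sketch, assuming a made-up spark/meta target:

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

object Meta2Es {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Meta2Es")
      .set("es.index.auto.create", "true")
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    // the key of each pair becomes the document id, the value the document body
    val pairs = sc.makeRDD(Seq(
      ("jack", Map("age" -> 24)),
      ("rose", Map("age" -> 22))
    ))
    EsSpark.saveToEsWithMeta(pairs, "spark/meta")
    sc.stop()
  }
}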
3. Read Data from Elasticsearch
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

object ES2Spark {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ES2Spark")
      .set("es.index.auto.create", "true")
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    // URI query: match documents containing terms starting with "jack";
    // esJsonRDD returns (document id, source as a JSON string) pairs
    val result = EsSpark.esJsonRDD(sc, "spark/json", "?q=jack*")
    result.foreach(println)
    sc.stop()
  }
}
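Besides the ?q= URI shorthand, the query argument also accepts a full query DSL body, and sc.esRDD (from org.elasticsearch.spark._) returns each document as a Scala Map instead of a raw JSON string. A minimal sketch combining both:

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

object EsRead {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("EsRead")
      .set("es.nodes", "127.0.0.1")
      .set("es.port", "9200")
    val sc = new SparkContext(conf)
    // full query DSL instead of the ?q= shorthand
    val query = """{"query": {"match": {"name": "jack"}}}"""
    // esRDD yields (document id, fields as a Map) pairs
    val result = sc.esRDD("spark/json", query)
    result.foreach { case (id, fields) => println(s"$id -> $fields") }
    sc.stop()
  }
}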