Syncing Hive data to Elasticsearch with Spark (upsert)

Configuration for the ES connection:
```java
import java.util.HashMap;
import java.util.Map;

private static Map<String, String> getEsOption() {
    Map<String, String> map = new HashMap<>(6);
    // Create the target index automatically if it does not exist yet.
    map.put("es.index.auto.create", "true");
    // Talk only to the declared nodes (no discovery); needed when ES sits behind a WAN/NAT.
    map.put("es.nodes.wan.only", "true");
    map.put("es.nodes", "192.168.99.41,192.168.99.42,192.168.99.43");
    // Use the obj_id column as the document _id, so re-writes target the same document.
    map.put("es.mapping.id", "obj_id");
    map.put("es.port", "9200");
    // upsert: update documents that already exist, insert the ones that do not.
    map.put("es.write.operation", "upsert");
    return map;
}
```
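To make the upsert semantics concrete, here is a minimal sketch that reuses getEsOption() above. The demo_idx/doc resource and the two literal rows are hypothetical, invented for illustration: writing two rows with the same obj_id leaves a single document whose fields were updated, not two documents.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public static void upsertDemo() {
    SparkSession spark = SparkSession.builder().appName("upsert_demo").getOrCreate();

    // First write: no document with _id "1" exists yet, so the upsert inserts it.
    Dataset<Row> v1 = spark.sql("SELECT '1' AS obj_id, 'old name' AS name");
    v1.write().format("org.elasticsearch.spark.sql")
            .options(getEsOption()).mode(SaveMode.Append).save("demo_idx/doc");

    // Second write with the same obj_id: the existing document is updated in place,
    // not duplicated, because es.write.operation=upsert and es.mapping.id=obj_id.
    Dataset<Row> v2 = spark.sql("SELECT '1' AS obj_id, 'new name' AS name");
    v2.write().format("org.elasticsearch.spark.sql")
            .options(getEsOption()).mode(SaveMode.Append).save("demo_idx/doc");

    spark.stop();
}
```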
Writing to ES. Mind the version compatibility between the cluster and the connector; the versions used here are (matching Maven coordinates are sketched below):

- ES: 6.8.1
- elasticsearch-spark-20_2.11: 6.8.1
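Under that version pairing, the connector would typically be pulled in with a dependency like the following (a sketch for Maven; the coordinates are the ones named above, everything else depends on your build setup):

```xml
<!-- elasticsearch-hadoop connector for Spark 2.x / Scala 2.11 -->
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-spark-20_2.11</artifactId>
    <version>6.8.1</version>
</dependency>
```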
```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

public static void main(String[] args) {
    // Date to sync, in the same string format as create_time (e.g. yyyy-MM-dd).
    String preDate = args[0];
    SparkConf conf = new SparkConf().setAppName("es_import");
    SparkSession spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate();

    // UDF that appends a '_' separator, used below to build the composite document id.
    spark.udf().register("appendChar", (UDF1<String, String>) s -> s.concat("_"), DataTypes.StringType);

    // Keep only the rows created on the requested date; preDate.equals(...) avoids
    // an NPE when create_time is null.
    Dataset<Row> dataset = spark.table("hugeleaflabs.basic_info_me")
            .filter((FilterFunction<Row>) row -> preDate.equals(row.<String>getAs("create_time")));

    // Deduplicate on (type, obj_id); sorting by id desc first is meant to keep the
    // newest row, though Spark does not strictly guarantee which duplicate survives.
    Dataset<Row> soleDs = dataset.sort(dataset.col("id").desc()).dropDuplicates("type", "obj_id");

    // Rewrite obj_id as "<type>_<obj_id>" so ids from different types cannot collide;
    // es.mapping.id maps this column to the document _id.
    Column type = soleDs.col("type");
    Column obj_id = soleDs.col("obj_id");
    Dataset<Row> objDs = soleDs.withColumn("obj_id",
            functions.concat(functions.callUDF("appendChar", type), obj_id));

    // With es.write.operation=upsert, existing documents are updated and new ones inserted.
    objDs.write()
            .format("org.elasticsearch.spark.sql")
            .options(getEsOption())
            .mode(SaveMode.Append)
            .save("entertainment_idx_sole/culture_data");
}
```
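A quick way to sanity-check the sync is to read the index back through the same connector. A minimal sketch, assuming the cluster addresses used above; the verify helper name is made up, and it only spot-checks the document count:

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public static void verify(SparkSession spark) {
    Map<String, String> readOpts = new HashMap<>();
    readOpts.put("es.nodes", "192.168.99.41,192.168.99.42,192.168.99.43");
    readOpts.put("es.port", "9200");
    readOpts.put("es.nodes.wan.only", "true");

    // Load the index back as a DataFrame and spot-check the synced documents.
    Dataset<Row> fromEs = spark.read()
            .format("org.elasticsearch.spark.sql")
            .options(readOpts)
            .load("entertainment_idx_sole/culture_data");

    System.out.println("documents in ES: " + fromEs.count());
    fromEs.show(10, false);
}
```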