In the previous revision (https://blog.youkuaiyun.com/xxkalychen/article/details/117152948?spm=1001.2014.3001.5501) we settled on the Kafka message middleware as the data source, so on the source side the pipeline already matches the standard model fairly well. The final result of the processing cannot remain console output either; in the end it has to be persisted. We could write it to HBase or to HDFS, but here I choose to write it to ElasticSearch.
First, start the ElasticSearch server, ZooKeeper, and Kafka.
1. Add the ElasticSearch dependencies to the pom.
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.12.0</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.6</version>
</dependency>
2. Create the data model class, used to wrap the data that will be written to ES.
package com.chris.flink.model;

import java.io.Serializable;

/**
 * @author Chris Chan
 * Create on 2021/5/21 12:56
 * Use for:
 * Explain:
 */
public class WordCount implements Serializable {
    private String word;
    private long count;

    public WordCount() {
    }

    public WordCount(String word, long count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }
}
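For reference, here is a quick sketch (my own addition, not part of the original project; the demo class name is arbitrary) of the JSON document Gson produces from this class, which is exactly what ends up stored in ElasticSearch:

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;

public class WordCountJsonDemo {
    public static void main(String[] args) {
        //Gson serializes the field names directly, so each ES document looks like {"word":"flink","count":3}
        System.out.println(new Gson().toJson(new WordCount("flink", 3L)));
    }
}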
3. Write the ElasticSearch utility class.
package com.chris.flink.utils;

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;
import org.apache.http.HttpHost;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.GetIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/19 7:37
 * Use for:
 * Explain:
 */
public class ElasticSearchUtil {
    private static RestHighLevelClient client = null;
    private static Gson gson = new Gson().newBuilder().create();
    //encodes a word into a stable Base64 document id
    private static Base64.Encoder base64Encoder = Base64.getEncoder();

    static {
        ElasticSearchUtil.client = new RestHighLevelClient(RestClient.builder(new HttpHost("192.168.0.52", 9200, "http")));
    }
    public static RestHighLevelClient getClient() {
        return client;
    }

    public static void close() {
        try {
            ElasticSearchUtil.client.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static boolean isIndexExists(String indexName) {
        try {
            return ElasticSearchUtil.client.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }

    public static void createIndex(String indexName) {
        if (isIndexExists(indexName)) {
            return;
        }
        try {
            ElasticSearchUtil.client.indices().create(new CreateIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static <T> IndexResponse add(T obj, String indexName) {
        IndexRequest indexRequest = new IndexRequest(indexName).id(UUID.randomUUID().toString());
        indexRequest.source(gson.toJson(obj), XContentType.JSON);
        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    /**
     * Initialize the local word-count map.
     * When the program starts, the old data has to be read back from ES so that the word totals keep accumulating.
     *
     * @param indexName
     * @param wordCountMap
     */
    public static void initWordCountMap(String indexName, ConcurrentHashMap<String, Long> wordCountMap) {
        SearchRequest request = new SearchRequest(indexName);
        SearchSourceBuilder builder = new SearchSourceBuilder();
        builder.query(QueryBuilders.matchAllQuery());
        //the default search size is only 10; raise it so all existing word documents are loaded
        builder.size(10000);
        request.source(builder);
        try {
            SearchResponse response = ElasticSearchUtil.client.search(request, RequestOptions.DEFAULT);
            SearchHit[] hits = response.getHits().getHits();
            for (SearchHit hit : hits) {
                WordCount wordCount = gson.fromJson(hit.getSourceAsString(), WordCount.class);
                wordCountMap.put(wordCount.getWord(), wordCount.getCount());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Write one word-count result to ES.
     *
     * @param wordCount
     * @param indexName
     */
    public static IndexResponse addWordCount(WordCount wordCount, String indexName) {
        //use the Base64-encoded word as a fixed document id, so each word maps to exactly one document
        IndexRequest indexRequest = new IndexRequest(indexName).id(base64Encoder.encodeToString(wordCount.getWord().getBytes(StandardCharsets.UTF_8)));
        indexRequest.source(gson.toJson(wordCount), XContentType.JSON);
        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Write a batch of word-count results to ES.
     *
     * @param wordCountList
     * @param indexName
     * @return
     */
    public static BulkResponse addWordCountList(List<WordCount> wordCountList, String indexName) {
        BulkRequest bulkRequest = new BulkRequest();
        wordCountList.forEach(wordCount -> bulkRequest.add(new IndexRequest(indexName)
                .id(base64Encoder.encodeToString(wordCount.getWord().getBytes(StandardCharsets.UTF_8)))
                .source(gson.toJson(wordCount), XContentType.JSON)));
        try {
            return ElasticSearchUtil.client.bulk(bulkRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
The utility class provides the methods for writing data into ES. To keep each word unique in the index, we Base64-encode the word itself and use the result as a fixed document ID.
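As a quick illustration (my own sketch, not from the original post; the demo class name is arbitrary), writing the same word twice therefore updates a single document instead of creating duplicates:

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;

public class AddWordCountDemo {
    public static void main(String[] args) {
        //both calls resolve to the same Base64-derived id, so the second write overwrites the first document
        ElasticSearchUtil.addWordCount(new WordCount("flink", 3L), "topic_flink");
        ElasticSearchUtil.addWordCount(new WordCount("flink", 5L), "topic_flink");
        ElasticSearchUtil.close();
    }
}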
4. Write the test class.
package com.chris.flink;

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/22 7:23
 * Use for:
 * Explain: Flink streaming job that processes data consumed from Kafka and writes the result to ElasticSearch
 */
public class KafkaToESTest {
    //index name in ElasticSearch
    public static final String INDEX_NAME = "topic_flink";
    //local cache of the old counts
    private static ConcurrentHashMap<String, Long> wordCountMap = new ConcurrentHashMap<>(16);

    static {
        ElasticSearchUtil.createIndex(INDEX_NAME);
        ElasticSearchUtil.initWordCountMap(INDEX_NAME, wordCountMap);
    }
    public static void main(String[] args) throws Exception {
        new KafkaToESTest().execute(args);
    }

    /**
     * Build an initial stream from the old data cached from ElasticSearch.
     *
     * @param env
     * @return
     */
    private DataStreamSource<Tuple2<String, Long>> init(StreamExecutionEnvironment env) {
        List<Tuple2<String, Long>> wordCountList = new ArrayList<>(wordCountMap.size());
        wordCountMap.forEach((key, value) -> wordCountList.add(new Tuple2<>(key, value)));
        //fromCollection does not accept an empty collection, so add a harmless placeholder
        if (wordCountList.size() == 0) {
            wordCountList.add(new Tuple2<>("flink", 0L));
        }
        return env.fromCollection(wordCountList);
    }
    private void execute(String[] args) throws Exception {
        //get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //configure Kafka
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "flink.chris.com:9092");
        properties.put("group.id", "flink_group_1");
        //build the initial stream of old counts
        DataStreamSource<Tuple2<String, Long>> initStream = init(env);
        //consume data from Kafka
        DataStreamSource<String> streamSource = env.addSource(new FlinkKafkaConsumer<>("topic_flink", new SimpleStringSchema(), properties));
        //word count computation
        SingleOutputStreamOperator<Tuple2<String, Long>> operator = streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
            /**
             * map step
             * @param value input data: a sentence with words separated by spaces
             * @param out collector for the mapped results
             * @throws Exception
             */
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
                //split the line into words on spaces
                String[] words = value.split(" ");
                //emit each word with a count of 1
                Arrays.stream(words)
                        //trim surrounding whitespace
                        .map(String::trim)
                        //drop empty strings
                        .filter(word -> !"".equals(word))
                        //add to the collector
                        .forEach(word -> out.collect(new Tuple2<>(word, 1L)));
            }
        });
        //union with the init stream, key by the first tuple field (the word) and sum the second field
        SingleOutputStreamOperator<Tuple2<String, Long>> resultOperator = operator.union(initStream).keyBy(new KeySelector<Tuple2<String, Long>, String>() {
            @Override
            public String getKey(Tuple2<String, Long> value) throws Exception {
                return value.f0;
            }
        }).sum(1);
        resultOperator.print();
        //map to WordCount objects and persist them
        resultOperator.map(new MapFunction<Tuple2<String, Long>, WordCount>() {
            @Override
            public WordCount map(Tuple2<String, Long> value) throws Exception {
                WordCount wordCount = new WordCount(value.f0, value.f1);
                //write to ElasticSearch (skip the zero-count placeholder)
                if (wordCount.getCount() > 0) {
                    ElasticSearchUtil.addWordCount(wordCount, INDEX_NAME);
                }
                return wordCount;
            }
        });
        env.execute();
    }
}
To make this test class actually usable, it also handles reading the old data back from ES. Every time a Flink job starts there is no cached data, yet we want to keep accumulating on top of the previous totals, so we need the old data. That is why the init method reads the old data from ES, turns it into a stream, and unions it with the new stream before the keyBy, so old and new data are summed together. The downside is that this keeps the old data in memory, which is not acceptable in production. One option is to move it into Redis: that adds some I/O against Redis but saves memory. Another option is to bypass Flink's own state entirely, accumulate directly in Redis, and then use an asynchronous task to merge the counts into ES and clear Redis. Here I am only exploring how to implement the processing model, not the finer details.
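For the Redis variant mentioned above, a rough sketch could look like the following. This is only an illustration under my own assumptions (a Redis instance reachable at redis.chris.com:6379, the Jedis client on the classpath), not part of the original project; it would be applied to the (word, 1) stream emitted by flatMap, replacing the in-memory union/sum.

import com.chris.flink.model.WordCount;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import redis.clients.jedis.Jedis;

public class RedisWordCountAccumulator extends RichMapFunction<Tuple2<String, Long>, WordCount> {
    private transient Jedis jedis;

    @Override
    public void open(Configuration parameters) {
        //hypothetical Redis host; one connection per parallel subtask
        jedis = new Jedis("redis.chris.com", 6379);
    }

    @Override
    public WordCount map(Tuple2<String, Long> value) {
        //accumulate in Redis instead of keeping the totals in Flink memory
        long total = jedis.hincrBy("word_count", value.f0, value.f1);
        return new WordCount(value.f0, total);
    }

    @Override
    public void close() {
        if (jedis != null) {
            jedis.close();
        }
    }
}

An asynchronous task could then periodically merge the "word_count" hash into ES and clear it, as described above.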
Once the test class is ready, update the main class in the jar manifest so it is convenient to use.
Now package the program, upload it, create a job, and submit it. Then feed some test data into Kafka.
The logical pipeline is now working end to end.
Cancel the job and start a new one. You will notice that the standard output prints the statistics for the old data.
The output from line 17 onward is the old statistics stored in ES; at this point we have not entered any new data yet.
Now let's enter one new line of data.
Then look at how the data in ElasticSearch changes.
We can see the counts accumulate as expected.
With that, a standard streaming-processing model is basically complete.
In tribute to Mr. Yuan. Rest in peace, Mr. Yuan!