Integrating Flink with Java and Writing the Results to ElasticSearch

This article shows how to use Flink to read data from Kafka, run a wordcount computation, and persist the results to ElasticSearch. By wrapping the data in a model class and an ElasticSearch utility class, it implements both the write path and the logic for initializing from old data. Testing shows that the program correctly accumulates new data on top of the old and writes the totals to ElasticSearch.


In the previous post, https://blog.youkuaiyun.com/xxkalychen/article/details/117152948?spm=1001.2014.3001.5501, the data source was switched to the Kafka message broker, so on the input side the setup is already fairly close to a standard model. The final results of the processing, however, should not just be printed to the console; they ultimately need to be persisted. We could write them to HBase or to HDFS, but here I choose to write them to ElasticSearch.

First, start the ElasticSearch server, ZooKeeper, and Kafka.

1. Add the ElasticSearch dependencies to the pom.

<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.12.0</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.6</version>
</dependency>

2. Create the data model class. It wraps the data so it can be written to ES conveniently.

package com.chris.flink.model;

import java.io.Serializable;

/**
 * @author Chris Chan
 * Create on 2021/5/21 12:56
 * Use for:
 * Explain:
 */
public class WordCount implements Serializable {
    private String word;
    private long count;

    public WordCount() {
    }

    public WordCount(String word, long count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }
}
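
For reference, here is a minimal sketch (not part of the original article; the class and package name are mine) showing what a WordCount instance looks like once Gson serializes it. This JSON is exactly the document that the utility class below stores in ElasticSearch.

package com.chris.flink.demo;

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;

public class WordCountJsonDemo {
    public static void main(String[] args) {
        // Gson serializes the POJO's fields directly into a JSON document.
        String json = new Gson().toJson(new WordCount("flink", 3L));
        System.out.println(json);   // prints: {"word":"flink","count":3}
    }
}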

3. Write the ElasticSearch utility class.

package com.chris.flink.utils;

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;
import org.apache.http.HttpHost;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.GetIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.io.IOException;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/19 7:37
 * Use for:
 * Explain:
 */
public class ElasticSearchUtil {
    private static RestHighLevelClient client = null;
    private static Gson gson = new Gson().newBuilder().create();
    private static Base64.Encoder base64Encoder = Base64.getEncoder();

    static {
        ElasticSearchUtil.client = new RestHighLevelClient(RestClient.builder(new HttpHost("192.168.0.52", 9200, "http")));
    }

    public static RestHighLevelClient getClient() {
        return client;
    }

    public static void close() {
        try {
            ElasticSearchUtil.client.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static boolean isIndexExists(String indexName) {
        try {
            return ElasticSearchUtil.client.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }

    public static void createIndex(String indexName) {
        if (isIndexExists(indexName)) {
            return;
        }
        try {
            ElasticSearchUtil.client.indices().create(new CreateIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static <T> IndexResponse add(T obj, String indexName) {
        IndexRequest indexRequest = new IndexRequest(indexName).id(UUID.randomUUID().toString());
        indexRequest.source(new Gson().toJson(obj), XContentType.JSON);

        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Initialize the local word-count collector.
     * When the program starts, the old data must be read from ES so that word totals can be accumulated.
     *
     * @param indexName
     * @param wordCountMap
     */
    public static void initWordCountMap(String indexName, ConcurrentHashMap<String, Long> wordCountMap) {
        SearchRequest request = new SearchRequest(indexName);
        SearchSourceBuilder builder = new SearchSourceBuilder();
        builder.query(QueryBuilders.matchAllQuery());
        request.source(builder);
        try {
            SearchResponse response = ElasticSearchUtil.client.search(request, RequestOptions.DEFAULT);
            SearchHit[] hits = response.getHits().getHits();
            for (SearchHit hit : hits) {
                WordCount wordCount = gson.fromJson(hit.getSourceAsString(), WordCount.class);
                wordCountMap.put(wordCount.getWord(), wordCount.getCount());
            }

        } catch (IOException e) {
            e.printStackTrace();
        }


    }

    /**
     * Write a single word-count result to ES.
     *
     * @param wordCount
     * @param indexName
     */
    public static IndexResponse addWordCount(WordCount wordCount, String indexName) {
        IndexRequest indexRequest = new IndexRequest(indexName).id(base64Encoder.encodeToString(wordCount.getWord().getBytes()));
        indexRequest.source(new Gson().toJson(wordCount), XContentType.JSON);

        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Write a batch of word-count results to ES.
     *
     * @param wordCountList
     * @param indexName
     * @return
     */
    public static BulkResponse addWordCountList(List<WordCount> wordCountList, String indexName) {
        BulkRequest bulkRequest = new BulkRequest();

        wordCountList.forEach(wordCount -> {
            bulkRequest.add(new IndexRequest(indexName).id(base64Encoder.encodeToString(wordCount.getWord().getBytes())).source(gson.toJson(wordCount), XContentType.JSON));
        });
        try {
            return ElasticSearchUtil.client.bulk(bulkRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}

The utility class provides methods for writing data to ES. To keep each word unique, the word itself is Base64-encoded and used as a fixed document ID.
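
To illustrate the effect of the fixed ID, here is a small sketch (not from the original article; the class and package name are mine): writing the same word twice produces the same document ID, so the second write replaces the first document instead of creating a duplicate.

package com.chris.flink.demo;

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;

public class FixedIdDemo {
    public static void main(String[] args) {
        ElasticSearchUtil.createIndex("topic_flink");
        // first write creates the document whose ID is Base64("flink")
        ElasticSearchUtil.addWordCount(new WordCount("flink", 1L), "topic_flink");
        // same ID, so this write overwrites that document; its count is now 5
        ElasticSearchUtil.addWordCount(new WordCount("flink", 5L), "topic_flink");
        ElasticSearchUtil.close();
    }
}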

4. Write the test class.

package com.chris.flink;

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/22 7:23
 * Use for:
 * Explain: Flink stream-processes data read from Kafka and writes the results to ElasticSearch
 */
public class KafkaToESTest {
    // name of the index in ElasticSearch
    public static final String INDEX_NAME = "topic_flink";
    // local cache of the old data read from ES
    private static ConcurrentHashMap<String, Long> wordCountMap = new ConcurrentHashMap<>(16);

    static {
        ElasticSearchUtil.createIndex(INDEX_NAME);
        ElasticSearchUtil.initWordCountMap(INDEX_NAME, wordCountMap);
    }

    public static void main(String[] args) throws Exception {
        new KafkaToESTest().execute(args);
    }

    /**
     * Initialization: turn the cached old ElasticSearch data into a stream
     *
     * @param env
     * @return
     */
    private DataStreamSource<Tuple2<String, Long>> init(StreamExecutionEnvironment env) {
        List<Tuple2<String, Long>> wordCountList = new ArrayList<>(wordCountMap.size());
        wordCountMap.forEach((key, value) -> wordCountList.add(new Tuple2<>(key, value)));
        // make sure the collection is not empty
        if (wordCountList.size() == 0) {
            wordCountList.add(new Tuple2<>("flink", 0L));
        }
        return env.fromCollection(wordCountList);
    }

    private void execute(String[] args) throws Exception {
        // get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // configure Kafka
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "flink.chris.com:9092");
        properties.put("group.id", "flink_group_1");
        // initialization: build the stream of old data
        DataStreamSource<Tuple2<String, Long>> initStream = init(env);
        // read data from Kafka
        DataStreamSource<String> streamSource = env.addSource(new FlinkKafkaConsumer<>("topic_flink", new SimpleStringSchema(), properties));

        // wordcount computation
        SingleOutputStreamOperator<Tuple2<String, Long>> operator = streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
            /**
             * flatMap computation
             * @param value input data: a sentence with words separated by spaces
             * @param out collector for the results of the flatMap
             * @throws Exception
             */
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
                // split the line into words on spaces
                String[] words = value.split(" ");
                // count each word occurrence and put it into the collector
                Arrays.stream(words)
                        // trim leading/trailing whitespace
                        .map(String::trim)
                        // filter out empty strings
                        .filter(word -> !"".equals(word))
                        // emit to the collector
                        .forEach(word -> out.collect(new Tuple2<>(word, 1L)));
            }
        });
        // union with the init stream, key by the first tuple field (the word), and sum the second field
        SingleOutputStreamOperator<Tuple2<String, Long>> resultOperator = operator.union(initStream).keyBy(new KeySelector<Tuple2<String, Long>, Object>() {
            @Override
            public Object getKey(Tuple2<String, Long> value) throws Exception {
                return value.f0;
            }
        }).sum(1);

        resultOperator.print();

        // collect the WordCount results and write them to ES
        resultOperator.map(new MapFunction<Tuple2<String, Long>, WordCount>() {
            @Override
            public WordCount map(Tuple2<String, Long> value) throws Exception {
                WordCount wordCount = new WordCount(value.f0, value.f1);
                // write to ElasticSearch
                if (wordCount.getCount() > 0) {
                    ElasticSearchUtil.addWordCount(wordCount, INDEX_NAME);
                }
                return wordCount;
            }
        });

        env.execute();
    }


}

To make this test class actually usable, it also handles fetching the old data from ES. Every time the Flink job starts it has no cached data, and if we want to keep accumulating on top of the previous totals we need that old data. So in the init method we read the old data from ES, build a stream from it, and union it with the new stream before keying, so that old and new counts are added together. The downside is that this keeps the old data in memory, which is not acceptable in production. One option is to put it in Redis: this adds I/O between the job and Redis, but saves memory. Another is to ignore Flink's own cache entirely, accumulate directly into Redis, and then use an asynchronous task to merge the Redis data into ES and clear Redis. Here I am only trying out an approach for this data-processing model, not worrying about the details.
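
As a rough sketch of the Redis idea mentioned above (not part of the original article; the Redis address, key name, and the use of the Jedis client are all assumptions), the running totals could be kept in a Redis hash and merged into ES by an asynchronous task:

package com.chris.flink.demo;

import redis.clients.jedis.Jedis;

/**
 * Hypothetical sketch: keep running word totals in a Redis hash instead of in memory,
 * so a restarted job does not need to reload old data from ES into the JVM heap.
 */
public class RedisWordCountStore {
    // hypothetical hash key and Redis address
    private static final String KEY = "flink:wordcount";
    private final Jedis jedis = new Jedis("192.168.0.52", 6379);

    /** Atomically add a delta to a word's running total and return the new total. */
    public long increment(String word, long delta) {
        return jedis.hincrBy(KEY, word, delta);
    }

    /** Read back a word's running total (0 if absent), e.g. before flushing it to ES. */
    public long get(String word) {
        String value = jedis.hget(KEY, word);
        return value == null ? 0L : Long.parseLong(value);
    }
}

A periodic task could then read the hash, write the totals to ES with the utility class above, and clear the hash, as described in the paragraph before this sketch.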

Once the test class is in place, you can update the main class in the jar's manifest to make it easier to run.

Now package the program, upload it, create a job, and submit it. Then send some test data through Kafka.

The logical pipeline is now working end to end.

Cancel the job and run a new one. You will see that the standard output prints the statistics computed from the old data.

The output from line 17 onward is the old statistics already stored in ES; at this point no new data has been entered yet.

Now let's enter a new line of data.

Then look at how the data in ElasticSearch has changed.

We can see the accumulation working as expected.

With that, a basic standard model for stream processing is complete.

 

A tribute to Mr. Yuan. Rest in peace, Mr. Yuan!

 
