In the previous revision (https://blog.youkuaiyun.com/xxkalychen/article/details/117152948?spm=1001.2014.3001.5501) we settled on the Kafka message middleware as the data source, so on the source side the pipeline already matches the standard model fairly well. The final result of the processing cannot remain console output either; in the end it has to be persisted. We could write it to HBase or to HDFS, but here I choose to write it to ElasticSearch.
First, start the ElasticSearch server, ZooKeeper, and Kafka.
1. Add the ElasticSearch dependencies to the pom.
<dependency>
    <groupId>org.elasticsearch.client</groupId>
    <artifactId>elasticsearch-rest-high-level-client</artifactId>
    <version>7.12.0</version>
</dependency>
<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.6</version>
</dependency>
2. Create the data model class, used to wrap the data that will be written to ES.
package com.chris.flink.model;

import java.io.Serializable;

/**
 * @author Chris Chan
 * Create on 2021/5/21 12:56
 * Use for:
 * Explain:
 */
public class WordCount implements Serializable {
    private String word;
    private long count;

    public WordCount() {
    }

    public WordCount(String word, long count) {
        this.word = word;
        this.count = count;
    }

    public String getWord() {
        return word;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }
}
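For reference, here is a quick sketch (my own addition, not part of the original project; the demo class name is arbitrary) of the JSON document Gson produces from this class, which is exactly what ends up stored in ElasticSearch:

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;

public class WordCountJsonDemo {
    public static void main(String[] args) {
        //Gson serializes the field names directly, so each ES document looks like {"word":"flink","count":3}
        System.out.println(new Gson().toJson(new WordCount("flink", 3L)));
    }
}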
3. Write the ElasticSearch utility class.
package com.chris.flink.utils;

import com.chris.flink.model.WordCount;
import com.google.gson.Gson;
import org.apache.http.HttpHost;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.GetIndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/19 7:37
 * Use for:
 * Explain:
 */
public class ElasticSearchUtil {
    private static RestHighLevelClient client = null;
    private static Gson gson = new Gson().newBuilder().create();
    //encodes a word into a stable Base64 document id
    private static Base64.Encoder base64Encoder = Base64.getEncoder();

    static {
        ElasticSearchUtil.client = new RestHighLevelClient(RestClient.builder(new HttpHost("192.168.0.52", 9200, "http")));
    }
    public static RestHighLevelClient getClient() {
        return client;
    }

    public static void close() {
        try {
            ElasticSearchUtil.client.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static boolean isIndexExists(String indexName) {
        try {
            return ElasticSearchUtil.client.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return false;
    }

    public static void createIndex(String indexName) {
        if (isIndexExists(indexName)) {
            return;
        }
        try {
            ElasticSearchUtil.client.indices().create(new CreateIndexRequest(indexName), RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static <T> IndexResponse add(T obj, String indexName) {
        IndexRequest indexRequest = new IndexRequest(indexName).id(UUID.randomUUID().toString());
        indexRequest.source(gson.toJson(obj), XContentType.JSON);
        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
    /**
     * Initialize the local word-count map.
     * When the program starts, the old data has to be read back from ES so that the word totals keep accumulating.
     *
     * @param indexName
     * @param wordCountMap
     */
    public static void initWordCountMap(String indexName, ConcurrentHashMap<String, Long> wordCountMap) {
        SearchRequest request = new SearchRequest(indexName);
        SearchSourceBuilder builder = new SearchSourceBuilder();
        builder.query(QueryBuilders.matchAllQuery());
        //the default search size is only 10; raise it so all existing word documents are loaded
        builder.size(10000);
        request.source(builder);
        try {
            SearchResponse response = ElasticSearchUtil.client.search(request, RequestOptions.DEFAULT);
            SearchHit[] hits = response.getHits().getHits();
            for (SearchHit hit : hits) {
                WordCount wordCount = gson.fromJson(hit.getSourceAsString(), WordCount.class);
                wordCountMap.put(wordCount.getWord(), wordCount.getCount());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Write one word-count result to ES.
     *
     * @param wordCount
     * @param indexName
     */
    public static IndexResponse addWordCount(WordCount wordCount, String indexName) {
        //use the Base64-encoded word as a fixed document id, so each word maps to exactly one document
        IndexRequest indexRequest = new IndexRequest(indexName).id(base64Encoder.encodeToString(wordCount.getWord().getBytes(StandardCharsets.UTF_8)));
        indexRequest.source(gson.toJson(wordCount), XContentType.JSON);
        try {
            return ElasticSearchUtil.client.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Write a batch of word-count results to ES.
     *
     * @param wordCountList
     * @param indexName
     * @return
     */
    public static BulkResponse addWordCountList(List<WordCount> wordCountList, String indexName) {
        BulkRequest bulkRequest = new BulkRequest();
        wordCountList.forEach(wordCount -> bulkRequest.add(new IndexRequest(indexName)
                .id(base64Encoder.encodeToString(wordCount.getWord().getBytes(StandardCharsets.UTF_8)))
                .source(gson.toJson(wordCount), XContentType.JSON)));
        try {
            return ElasticSearchUtil.client.bulk(bulkRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
The utility class provides the methods for writing data into ES. To keep each word unique in the index, we Base64-encode the word itself and use the result as a fixed document ID.
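As a quick illustration (my own sketch, not from the original post; the demo class name is arbitrary), writing the same word twice therefore updates a single document instead of creating duplicates:

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;

public class AddWordCountDemo {
    public static void main(String[] args) {
        //both calls resolve to the same Base64-derived id, so the second write overwrites the first document
        ElasticSearchUtil.addWordCount(new WordCount("flink", 3L), "topic_flink");
        ElasticSearchUtil.addWordCount(new WordCount("flink", 5L), "topic_flink");
        ElasticSearchUtil.close();
    }
}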
4. Write the test class.
package com.chris.flink;

import com.chris.flink.model.WordCount;
import com.chris.flink.utils.ElasticSearchUtil;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;

/**
 * @author Chris Chan
 * Create on 2021/5/22 7:23
 * Use for:
 * Explain: Flink streaming job that processes data consumed from Kafka and writes the result to ElasticSearch
 */
public class KafkaToESTest {
    //index name in ElasticSearch
    public static final String INDEX_NAME = "topic_flink";
    //local cache of the old counts
    private static ConcurrentHashMap<String, Long> wordCountMap = new ConcurrentHashMap<>(16);

    static {
        ElasticSearchUtil.createIndex(INDEX_NAME);
        ElasticSearchUtil.initWordCountMap(INDEX_NAME, wordCountMap);
    }
    public static void main(String[] args) throws Exception {
        new KafkaToESTest().execute(args);
    }

    /**
     * Build an initial stream from the old data cached from ElasticSearch.
     *
     * @param env
     * @return
     */
    private DataStreamSource<Tuple2<String, Long>> init(StreamExecutionEnvironment env) {
        List<Tuple2<String, Long>> wordCountList = new ArrayList<>(wordCountMap.size());
        wordCountMap.forEach((key, value) -> wordCountList.add(new Tuple2<>(key, value)));
        //fromCollection does not accept an empty collection, so add a harmless placeholder
        if (wordCountList.size() == 0) {
            wordCountList.add(new Tuple2<>("flink", 0L));
        }
        return env.fromCollection(wordCountList);
    }
    private void execute(String[] args) throws Exception {
        //get the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        //configure Kafka
        Properties properties = new Properties();
        properties.put("bootstrap.servers", "flink.chris.com:9092");
        properties.put("group.id", "flink_group_1");
        //build the initial stream of old counts
        DataStreamSource<Tuple2<String, Long>> initStream = init(env);
        //consume data from Kafka
        DataStreamSource<String> streamSource = env.addSource(new FlinkKafkaConsumer<>("topic_flink", new SimpleStringSchema(), properties));
        //word count computation
        SingleOutputStreamOperator<Tuple2<String, Long>> operator = streamSource.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
            /**
             * map step
             * @param value input data: a sentence with words separated by spaces
             * @param out collector for the mapped results
             * @throws Exception
             */
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
                //split the line into words on spaces
                String[] words = value.split(" ");
                //emit each word with a count of 1
                Arrays.stream(words)
                        //trim surrounding whitespace
                        .map(String::trim)
                        //drop empty strings
                        .filter(word -> !"".equals(word))
                        //add to the collector
                        .forEach(word -> out.collect(new Tuple2<>(word, 1L)));
            }
        });
        //union with the init stream, key by the first tuple field (the word) and sum the second field
        SingleOutputStreamOperator<Tuple2<String, Long>> resultOperator = operator.union(initStream).keyBy(new KeySelector<Tuple2<String, Long>, String>() {
            @Override
            public String getKey(Tuple2<String, Long> value) throws Exception {
                return value.f0;
            }
        }).sum(1);
        resultOperator.print();
        //map to WordCount objects and persist them
        resultOperator.map(new MapFunction<Tuple2<String, Long>, WordCount>() {
            @Override
            public WordCount map(Tuple2<String, Long> value) throws Exception {
                WordCount wordCount = new WordCount(value.f0, value.f1);
                //write to ElasticSearch (skip the zero-count placeholder)
                if (wordCount.getCount() > 0) {
                    ElasticSearchUtil.addWordCount(wordCount, INDEX_NAME);
                }
                return wordCount;
            }
        });
        env.execute();
    }
}
To make this test class actually usable, it also handles reading the old data back from ES. Every time a Flink job starts there is no cached data, yet we want to keep accumulating on top of the previous totals, so we need the old data. That is why the init method reads the old data from ES, turns it into a stream, and unions it with the new stream before the keyBy, so old and new data are summed together. The downside is that this keeps the old data in memory, which is not acceptable in production. One option is to move it into Redis: that adds some I/O against Redis but saves memory. Another option is to bypass Flink's own state entirely, accumulate directly in Redis, and then use an asynchronous task to merge the counts into ES and clear Redis. Here I am only exploring how to implement the processing model, not the finer details.
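For the Redis variant mentioned above, a rough sketch could look like the following. This is only an illustration under my own assumptions (a Redis instance reachable at redis.chris.com:6379, the Jedis client on the classpath), not part of the original project; it would be applied to the (word, 1) stream emitted by flatMap, replacing the in-memory union/sum.

import com.chris.flink.model.WordCount;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import redis.clients.jedis.Jedis;

public class RedisWordCountAccumulator extends RichMapFunction<Tuple2<String, Long>, WordCount> {
    private transient Jedis jedis;

    @Override
    public void open(Configuration parameters) {
        //hypothetical Redis host; one connection per parallel subtask
        jedis = new Jedis("redis.chris.com", 6379);
    }

    @Override
    public WordCount map(Tuple2<String, Long> value) {
        //accumulate in Redis instead of keeping the totals in Flink memory
        long total = jedis.hincrBy("word_count", value.f0, value.f1);
        return new WordCount(value.f0, total);
    }

    @Override
    public void close() {
        if (jedis != null) {
            jedis.close();
        }
    }
}

An asynchronous task could then periodically merge the "word_count" hash into ES and clear it, as described above.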
Once the test class is ready, update the main class in the jar manifest so it is convenient to use.
Now package the program, upload it, create a job, and submit it. Then feed some test data into Kafka.
The logical pipeline is now working end to end.
Cancel the job and start a new one. You will notice that the standard output prints the statistics for the old data.
The output from line 17 onward is the old statistics stored in ES; at this point we have not entered any new data yet.
Now let's enter one new line of data.
Then look at how the data in ElasticSearch changes.
We can see the counts accumulate as expected.
With that, a standard streaming-processing model is basically complete.
In tribute to Mr. Yuan. Rest in peace, Mr. Yuan!