1. Start ZooKeeper:
[root@localhost kafka_2.12-2.1.0]# bin/zookeeper-server-start.sh -daemon config/zookeeper.properties
2. Start Kafka:
[root@localhost kafka_2.12-2.1.0]# bin/kafka-server-start.sh config/server.properties &
3. Create a Maven project and add these dependencies to the pom file:
<!-- kafka -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- kafka streams -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- JSONObject -->
<dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20160810</version>
</dependency>
4. Create the stream:
package com.wyh;

import java.util.Properties;

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.StoreBuilder;
import org.apache.kafka.streams.state.Stores;

public class TestStateStoreStream {

    public static void main(String[] args) {
        // Kafka Streams application configuration
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "wyh-stream-application");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.184.128:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.STATE_DIR_CONFIG, "C:\\IT\\tool\\kafka-state-store"); // directory where state stores are kept

        // Manually build a state store
        StoreBuilder<KeyValueStore<String, String>> testStateStore = Stores.keyValueStoreBuilder(
                // the state store's name; at runtime a folder with this name is created under STATE_DIR_CONFIG
                Stores.persistentKeyValueStore("wyh-state-store"),
                Serdes.String(),
                Serdes.String())
            .withCachingEnabled();

        StreamsBuilder builder = new StreamsBuilder();
        // Register the state store with the builder
        builder.addStateStore(testStateStore);

        // Create a KStream that reads from wyh-topic-in
        KStream<String, String> inputStream = builder.stream("wyh-topic-in");
        // Use transform() with the custom state store: the first argument is a TransformerSupplier
        // implementation, the second is the name of the state store registered above
        KStream<String, String> transformStream = inputStream.transform(
                new TestTransformerSupplier(testStateStore.name()), testStateStore.name());
        // Write the results to the output topic
        transformStream.to("wyh-topic-out");

        KafkaStreams streams = new KafkaStreams(builder.build(), prop);
        streams.start();
    }
}
5. Create the TransformerSupplier implementation:
package com.wyh;

import java.time.Duration;

import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.kstream.TransformerSupplier;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.processor.PunctuationType;
import org.apache.kafka.streams.state.KeyValueIterator;
import org.apache.kafka.streams.state.KeyValueStore;

public class TestTransformerSupplier implements TransformerSupplier<String, String, KeyValue<String, String>> {

    private final String stateStoreName;

    // Pass the name of the state store in via the constructor
    public TestTransformerSupplier(String stateStoreName) {
        this.stateStoreName = stateStoreName;
    }

    public Transformer<String, String, KeyValue<String, String>> get() {
        return new Transformer<String, String, KeyValue<String, String>>() {

            private KeyValueStore<String, String> testStateStore;
            private ProcessorContext context;

            @SuppressWarnings("unchecked")
            public void init(ProcessorContext context) {
                testStateStore = (KeyValueStore<String, String>) context.getStateStore(stateStoreName);
                this.context = context;
                this.context.schedule(Duration.ofMinutes(10), PunctuationType.STREAM_TIME, (timestamp) -> {
                    KeyValueIterator<String, String> iterator = this.testStateStore.all();
                    // Iterate over every key-value entry in the state store
                    while (iterator.hasNext()) {
                        // Even if you do nothing with the entry, next() must be called, otherwise the loop never advances
                        KeyValue<String, String> object = iterator.next();
                        // Here you could expire entries or apply other conditions and delete by key,
                        // depending on your own business rules, e.g.:
                        // testStateStore.delete(object.key);
                    }
                    iterator.close();
                    context.commit();
                });
            }

            // The business logic lives in this method.
            // In this example, whenever a URL has been seen 3 times, a warning record is emitted to the output topic.
            public KeyValue<String, String> transform(String key, String value) {
                String url = value;
                Integer count = null;
                // Check whether this URL already exists in the state store; if so, read the current count and add 1
                if (testStateStore.get(url) != null) {
                    count = Integer.parseInt(testStateStore.get(url));
                    count += 1;
                    System.out.println(url + " count: " + count);
                    // Write the incremented count back so the state store keeps the latest value;
                    // the first argument is the key we choose to store it under
                    testStateStore.put(url, count.toString()); // keys and values must be Strings because the store uses String serdes
                    if (count >= 3) { // once a URL reaches 3 hits, emit a warning and reset its count to 0
                        testStateStore.put(url, "0");
                        // The returned key/value is forwarded downstream, i.e. back into the topology
                        // built in TestStateStoreStream and on to the output topic
                        return KeyValue.pair(url + "-warning", count.toString());
                    }
                } else {
                    // First time this URL is seen: initialize its count to 1
                    testStateStore.put(url, "1");
                    // One occurrence does not meet the 3-hit threshold, so emit nothing
                    return null;
                }
                // All other cases emit nothing
                return null;
            }

            public void close() {
                // Nothing to do here; Kafka Streams closes the state store when necessary
            }
        };
    }
}
6. Create the input and output topics:
[root@localhost kafka_2.12-2.1.0]# bin/kafka-topics.sh --create --zookeeper 192.168.184.128:2181 --replication-factor 1 --partitions 1 --topic wyh-topic-in
Created topic "wyh-topic-in".
[root@localhost kafka_2.12-2.1.0]# bin/kafka-topics.sh --create --zookeeper 192.168.184.128:2181 --replication-factor 1 --partitions 1 --topic wyh-topic-out
Created topic "wyh-topic-out".
7. Run TestStateStoreStream.
8. Run a consumer on wyh-topic-out (a console consumer is enough; see the command below).
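The original steps do not show the consumer itself; one minimal option, reusing the broker address from the earlier configuration, is Kafka's bundled console consumer (printing the key makes the "-warning" suffix visible):
[root@localhost kafka_2.12-2.1.0]# bin/kafka-console-consumer.sh --bootstrap-server 192.168.184.128:9092 --topic wyh-topic-out --property print.key=true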
9. Send messages:
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
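The send() calls above assume an already constructed producer. A minimal sketch of that setup (the class name TestProducer and the flush/close handling are assumptions; only the send() calls come from the original steps) could look like this:

package com.wyh;

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

public class TestProducer {

    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put("bootstrap.servers", "192.168.184.128:9092");
        prop.put("key.serializer", StringSerializer.class.getName());
        prop.put("value.serializer", StringSerializer.class.getName());

        Producer<String, String> producer = new KafkaProducer<>(prop);
        // The two-argument ProducerRecord(topic, value) sends records without a key
        producer.send(new ProducerRecord<>("wyh-topic-in", "/index"));
        producer.send(new ProducerRecord<>("wyh-topic-in", "/article"));
        // ... the remaining send() calls from step 9 ...
        producer.flush();
        producer.close();
    }
}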
10. Check the console output:
Of the messages sent, /article appears 6 times and /index appears 4 times, so the console prints two warnings for /article and one for /index. This shows how to implement state-store logic with a Transformer. The benefit of this approach is that we can operate on the state store's keys and values directly with put() and get(). The word-count style from the official docs also maintains a count in a state store, but it can only aggregate via groupBy and forward the result downstream; it does not let you manipulate the values inside the state store (see the sketch below).
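For contrast, the groupBy-style aggregation referred to above looks roughly like this (a sketch only, not part of the original post; it assumes the same Serdes configuration as TestStateStoreStream, plus imports for KTable and Produced from org.apache.kafka.streams.kstream). Kafka Streams creates and updates the count store internally, so your code never gets a handle to put() or get() individual entries:

StreamsBuilder builder = new StreamsBuilder();
KStream<String, String> input = builder.stream("wyh-topic-in");
KTable<String, Long> counts = input
        .groupBy((k, v) -> v)   // re-key each record by its URL value
        .count();               // the backing state store is managed by Kafka Streams
counts.toStream().to("wyh-topic-out", Produced.with(Serdes.String(), Serdes.Long()));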
One point deserves special attention: if the input topic has multiple partitions, a separate state store is created for each partition, and keyless messages are distributed across the partitions round-robin. If your logic needs to aggregate by key (for example, counting occurrences of the same key), the input messages must carry a key so that records with the same key land in the same partition; only then is the count meaningful. Without keys, records that should be counted together end up spread across partitions, and each partition's state store is independent and only reflects that partition's state, so the results will be wrong.
In short, if the input topic has multiple partitions, the messages must be keyed, as in the example below.
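For reference, a keyed send uses the three-argument ProducerRecord constructor; using the URL itself as the key is an assumption for illustration, not something from the original post:

// ProducerRecord(topic, key, value): records with the same key are hashed to the same partition,
// so that partition's state store sees every occurrence of the key
producer.send(new ProducerRecord<>("wyh-topic-in", "/article", "/article"));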