1. Start ZooKeeper:
[root@localhost kafka_2.12-2.1.0]# bin/zookeeper-server-start.sh -daemon config/zookeeper.properties
2. Start Kafka:
[root@localhost kafka_2.12-2.1.0]# bin/kafka-server-start.sh config/server.properties &
3. Create a Maven project and add these dependencies to the pom file:
<!-- kafka -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- kafka streams -->
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams</artifactId>
    <version>2.1.0</version>
</dependency>
<!-- JSONObject -->
<dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20160810</version>
</dependency>
4. Create the stream:
package com.wyh;

import java.util.Properties;

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.StoreBuilder;
import org.apache.kafka.streams.state.Stores;

public class TestStateStoreStream {

    public static void main(String[] args) {
        // Kafka Streams application configuration
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "wyh-stream-application");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.184.128:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.STATE_DIR_CONFIG, "C:\\IT\\tool\\kafka-state-store"); // directory where state stores are kept

        // Manually build a state store
        StoreBuilder<KeyValueStore<String, String>> testStateStore = Stores.keyValueStoreBuilder(
                // the state store's name; at runtime a folder with this name is created under STATE_DIR_CONFIG
                Stores.persistentKeyValueStore("wyh-state-store"),
                Serdes.String(),
                Serdes.String())
            .withCachingEnabled();

        StreamsBuilder builder = new StreamsBuilder();
        // Register the state store with the builder
        builder.addStateStore(testStateStore);

        // Create a KStream that reads from wyh-topic-in
        KStream<String, String> inputStream = builder.stream("wyh-topic-in");
        // Use transform() with the custom state store: the first argument is a TransformerSupplier
        // implementation, the second is the name of the state store registered above
        KStream<String, String> transformStream = inputStream.transform(
                new TestTransformerSupplier(testStateStore.name()), testStateStore.name());
        // Write the results to the output topic
        transformStream.to("wyh-topic-out");

        KafkaStreams streams = new KafkaStreams(builder.build(), prop);
        streams.start();
    }
}
5. Create the TransformerSupplier implementation:
package com.wyh;

import java.time.Duration;

import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.kstream.TransformerSupplier;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.processor.PunctuationType;
import org.apache.kafka.streams.state.KeyValueIterator;
import org.apache.kafka.streams.state.KeyValueStore;

public class TestTransformerSupplier implements TransformerSupplier<String, String, KeyValue<String, String>> {

    private final String stateStoreName;

    // Pass the name of the state store in via the constructor
    public TestTransformerSupplier(String stateStoreName) {
        this.stateStoreName = stateStoreName;
    }

    public Transformer<String, String, KeyValue<String, String>> get() {
        return new Transformer<String, String, KeyValue<String, String>>() {

            private KeyValueStore<String, String> testStateStore;
            private ProcessorContext context;

            @SuppressWarnings("unchecked")
            public void init(ProcessorContext context) {
                testStateStore = (KeyValueStore<String, String>) context.getStateStore(stateStoreName);
                this.context = context;
                this.context.schedule(Duration.ofMinutes(10), PunctuationType.STREAM_TIME, (timestamp) -> {
                    KeyValueIterator<String, String> iterator = this.testStateStore.all();
                    // Iterate over every key-value entry in the state store
                    while (iterator.hasNext()) {
                        // Even if you do nothing with the entry, next() must be called, otherwise the loop never advances
                        KeyValue<String, String> object = iterator.next();
                        // Here you could expire entries or apply other conditions and delete by key,
                        // depending on your own business rules, e.g.:
                        // testStateStore.delete(object.key);
                    }
                    iterator.close();
                    context.commit();
                });
            }

            // The business logic lives in this method.
            // In this example, whenever a URL has been seen 3 times, a warning record is emitted to the output topic.
            public KeyValue<String, String> transform(String key, String value) {
                String url = value;
                Integer count = null;
                // Check whether this URL already exists in the state store; if so, read the current count and add 1
                if (testStateStore.get(url) != null) {
                    count = Integer.parseInt(testStateStore.get(url));
                    count += 1;
                    System.out.println(url + " count: " + count);
                    // Write the incremented count back so the state store keeps the latest value;
                    // the first argument is the key we choose to store it under
                    testStateStore.put(url, count.toString()); // keys and values must be Strings because the store uses String serdes
                    if (count >= 3) { // once a URL reaches 3 hits, emit a warning and reset its count to 0
                        testStateStore.put(url, "0");
                        // The returned key/value is forwarded downstream, i.e. back into the topology
                        // built in TestStateStoreStream and on to the output topic
                        return KeyValue.pair(url + "-warning", count.toString());
                    }
                } else {
                    // First time this URL is seen: initialize its count to 1
                    testStateStore.put(url, "1");
                    // One occurrence does not meet the 3-hit threshold, so emit nothing
                    return null;
                }
                // All other cases emit nothing
                return null;
            }

            public void close() {
                // Nothing to do here; Kafka Streams closes the state store when necessary
            }
        };
    }
}
6. Create the input and output topics:
[root@localhost kafka_2.12-2.1.0]# bin/kafka-topics.sh --create --zookeeper 192.168.184.128:2181 --replication-factor 1 --partitions 1 --topic wyh-topic-in
Created topic "wyh-topic-in".
[root@localhost kafka_2.12-2.1.0]# bin/kafka-topics.sh --create --zookeeper 192.168.184.128:2181 --replication-factor 1 --partitions 1 --topic wyh-topic-out
Created topic "wyh-topic-out".
7. Run TestStateStoreStream.
8. Run a consumer on wyh-topic-out (a console consumer is enough; see the command below).
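The original steps do not show the consumer itself; one minimal option, reusing the broker address from the earlier configuration, is Kafka's bundled console consumer (printing the key makes the "-warning" suffix visible):
[root@localhost kafka_2.12-2.1.0]# bin/kafka-console-consumer.sh --bootstrap-server 192.168.184.128:9092 --topic wyh-topic-out --property print.key=true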
9. Send messages:
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
producer.send(new ProducerRecord<>("wyh-topic-in","/index"));
producer.send(new ProducerRecord<>("wyh-topic-in","/article"));
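The send() calls above assume an already constructed producer. A minimal sketch of that setup (the class name TestProducer and the flush/close handling are assumptions; only the send() calls come from the original steps) could look like this:

package com.wyh;

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;

public class TestProducer {

    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put("bootstrap.servers", "192.168.184.128:9092");
        prop.put("key.serializer", StringSerializer.class.getName());
        prop.put("value.serializer", StringSerializer.class.getName());

        Producer<String, String> producer = new KafkaProducer<>(prop);
        // The two-argument ProducerRecord(topic, value) sends records without a key
        producer.send(new ProducerRecord<>("wyh-topic-in", "/index"));
        producer.send(new ProducerRecord<>("wyh-topic-in", "/article"));
        // ... the remaining send() calls from step 9 ...
        producer.flush();
        producer.close();
    }
}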
10. Check the console output:
Of the messages sent, /article appears 6 times and /index appears 4 times, so the console prints two warnings for /article and one for /index. This shows how to implement state-store logic with a Transformer. The benefit of this approach is that we can operate on the state store's keys and values directly with put() and get(). The word-count style from the official docs also maintains a count in a state store, but it can only aggregate via groupBy and forward the result downstream; it does not let you manipulate the values inside the state store (see the sketch below).
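For contrast, the groupBy-style aggregation referred to above looks roughly like this (a sketch only, not part of the original post; it assumes the same Serdes configuration as TestStateStoreStream, plus imports for KTable and Produced from org.apache.kafka.streams.kstream). Kafka Streams creates and updates the count store internally, so your code never gets a handle to put() or get() individual entries:

StreamsBuilder builder = new StreamsBuilder();
KStream<String, String> input = builder.stream("wyh-topic-in");
KTable<String, Long> counts = input
        .groupBy((k, v) -> v)   // re-key each record by its URL value
        .count();               // the backing state store is managed by Kafka Streams
counts.toStream().to("wyh-topic-out", Produced.with(Serdes.String(), Serdes.Long()));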
One point deserves special attention: if the input topic has multiple partitions, a separate state store is created for each partition, and keyless messages are distributed across the partitions round-robin. If your logic needs to aggregate by key (for example, counting occurrences of the same key), the input messages must carry a key so that records with the same key land in the same partition; only then is the count meaningful. Without keys, records that should be counted together end up spread across partitions, and each partition's state store is independent and only reflects that partition's state, so the results will be wrong.
In short, if the input topic has multiple partitions, the messages must be keyed, as in the example below.
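For reference, a keyed send uses the three-argument ProducerRecord constructor; using the URL itself as the key is an assumption for illustration, not something from the original post:

// ProducerRecord(topic, key, value): records with the same key are hashed to the same partition,
// so that partition's state store sees every occurrence of the key
producer.send(new ProducerRecord<>("wyh-topic-in", "/article", "/article"));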