import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.StringDecoder;
import kafka.utils.ZkUtils;
import org.I0Itec.zkclient.ZkClient;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Seconds;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.immutable.HashSet;
import scala.collection.mutable.ArrayBuffer;
import scala.util.Either.RightProjection;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
public class CanKafka {

    public static final Long SPARK_STREAMING_INTERVAL_SECONDS = 3L; // Spark Streaming batch interval
    public static final String OFFSET_ZK_PATH_ROOT = "/KafkaOffset"; // root path in ZooKeeper for saved offsets
    public static final String KAFKA_GROUP_ID = "JbcCanParserGroup"; // consumer group id
    public static final String KAFKA_BROKER_LIST = "10.70.19.222:9092"; // Kafka broker address
    public static final String CONSUMER_TOPIC = "cantest5"; // topic to consume
    public static final String ZK_HOST = "10.70.19.222:2181"; // ZooKeeper used by Kafka

    public static void main(String[] args) throws InterruptedException {
        Logger.getLogger("org.apache.kafka").setLevel(Level.ERROR);
        Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR);
        Logger.getLogger("org.apache.spark").setLevel(Level.ERROR);
        Logger.getLogger("kafka.utils").setLevel(Level.ERROR);

        // Build the StreamingContext
        SparkConf conf = new SparkConf();
        conf.setAppName("Jbc_Can_Parser");
        conf.setMaster("local[2]");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sparkContext, Seconds.apply(SPARK_STREAMING_INTERVAL_SECONDS));
        // Build the Kafka configuration
        Map<String, String> kafkaParams = Maps.newHashMap();
        kafkaParams.put("metadata.broker.list", KAFKA_BROKER_LIST);
        kafkaParams.put("zookeeper.connect", ZK_HOST);
        kafkaParams.put("group.id", KAFKA_GROUP_ID);

        // Configure the topic to consume
        Set<String> topic = Sets.newHashSet();
        topic.add(CONSUMER_TOPIC);

        // Look up the offsets stored in ZooKeeper (null if none have been saved yet)
        Map<TopicAndPartition, Long> offset = queryOffset();

        // Create the Kafka stream
        JavaDStream<String> dStream = initStreaming(ssc, offset, kafkaParams, topic);
        // Process the data, then update the offsets in ZooKeeper
        dStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
            public void call(JavaRDD<String> rdd) throws Exception {
                // Business logic
                if (rdd.isEmpty()) {
                    return;
                }
                rdd.foreach(new VoidFunction<String>() {
                    public void call(String t) throws Exception {
                        System.out.println(t);
                    }
                });
                // Read this batch's offset ranges and persist them to ZooKeeper
                HasOffsetRanges hor = (HasOffsetRanges) rdd.rdd();
                OffsetRange[] offsetRanges = hor.offsetRanges();
                ZkClient zkClient = new ZkClient(ZK_HOST);
                try {
                    for (OffsetRange of : offsetRanges) {
                        String offsetPath = OFFSET_ZK_PATH_ROOT + "/" + KAFKA_GROUP_ID + "/" + of.topic() + "/" + of.partition();
                        ZkUtils.updatePersistentPath(zkClient, offsetPath, String.valueOf(of.untilOffset()));
                    }
                } finally {
                    zkClient.close();
                }
            }
        });
        ssc.start();
        ssc.awaitTermination();
    }
    // Build the Kafka stream
    public static JavaDStream<String> initStreaming(JavaStreamingContext ssc, Map<TopicAndPartition, Long> offset,
            Map<String, String> kafkaParams, Set<String> topic) {
        if (offset == null) {
            // No offsets in ZooKeeper yet: fall back to the latest offsets
            // Convert the Java map and set into their Scala counterparts
            scala.collection.immutable.HashMap<String, String> scalaMapKafkaParams = new scala.collection.immutable.HashMap<String, String>();
            Set<Entry<String, String>> entrySet = kafkaParams.entrySet();
            for (Entry<String, String> entry : entrySet) {
                scalaMapKafkaParams = scalaMapKafkaParams.$plus(new Tuple2<String, String>(entry.getKey(), entry.getValue()));
            }
            HashSet<String> scalaSetTopic = new scala.collection.immutable.HashSet<String>();
            for (String t : topic) {
                scalaSetTopic = scalaSetTopic.$plus(t);
            }
            // Initialize the Kafka client and query the latest leader offsets
            KafkaCluster kc = new KafkaCluster(scalaMapKafkaParams);
            RightProjection<ArrayBuffer<Throwable>, scala.collection.immutable.Set<TopicAndPartition>> partitions = kc.getPartitions(scalaSetTopic).right();
            scala.collection.immutable.Map<TopicAndPartition, LeaderOffset> leaderOffsets = kc.getLatestLeaderOffsets(partitions.get()).right().get();
            Map<TopicAndPartition, LeaderOffset> javaLeaderOffsets = JavaConverters.asJavaMapConverter(leaderOffsets).asJava();
            Map<TopicAndPartition, Long> ofs = Maps.newHashMap();
            for (Entry<TopicAndPartition, LeaderOffset> en : javaLeaderOffsets.entrySet()) {
                ofs.put(en.getKey(), en.getValue().offset());
            }
            offset = ofs;
        }
        // Build a direct stream starting from the resolved offsets (from ZooKeeper, or the latest leader offsets)
        JavaDStream<String> dStream = KafkaUtils.createDirectStream(ssc, String.class, String.class,
                StringDecoder.class, StringDecoder.class, String.class, kafkaParams, offset,
                new Function<MessageAndMetadata<String, String>, String>() {
                    private static final long serialVersionUID = -5659185124670300349L;
                    public String call(MessageAndMetadata<String, String> mmd) throws Exception {
                        return mmd.message();
                    }
                });
        return dStream;
    }
    // Look up the offsets stored in ZooKeeper
    public static Map<TopicAndPartition, Long> queryOffset() {
        ZkClient zkClient = new ZkClient(ZK_HOST);
        try {
            // Count how many partition nodes exist under the consumed topic
            int countChildren = zkClient.countChildren(OFFSET_ZK_PATH_ROOT + "/" + KAFKA_GROUP_ID + "/" + CONSUMER_TOPIC);
            if (countChildren == 0) {
                return null;
            }
            // Read the offset stored on each partition node
            Map<TopicAndPartition, Long> map = new HashMap<TopicAndPartition, Long>();
            for (int i = 0; i < countChildren; i++) {
                String readData = zkClient.readData(OFFSET_ZK_PATH_ROOT + "/" + KAFKA_GROUP_ID + "/" + CONSUMER_TOPIC + "/" + i);
                map.put(new TopicAndPartition(CONSUMER_TOPIC, i), Long.valueOf(readData));
            }
            return map;
        } finally {
            zkClient.close();
        }
    }
}

Spark + Kafka zero data loss (Java version)
This article walks through an example application that consumes Kafka messages with Apache Spark Streaming. On each batch interval the application pulls messages from Kafka and records its consumption progress (the offsets) in ZooKeeper, so the job can resume from where it left off after a failure. It shows how to configure the Spark Streaming and Kafka parameters and where to plug in custom processing logic.
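To exercise the job end to end, it helps to push a few test messages into the topic it consumes. Below is a minimal producer sketch that is not part of the original application: it assumes the old Kafka 0.8.x producer API (consistent with the kafka.* classes imported above), and the class name TestMessageProducer and the message payloads are made up for illustration.

import java.util.Properties;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

// Illustrative test producer (not part of the original job); assumes the Kafka 0.8.x producer API
public class TestMessageProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Same broker list as the streaming job above
        props.put("metadata.broker.list", "10.70.19.222:9092");
        // Encode values as plain strings, matching the StringDecoder used on the consumer side
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        Producer<String, String> producer = new Producer<String, String>(new ProducerConfig(props));
        try {
            for (int i = 0; i < 10; i++) {
                // Send a few test messages to the topic the streaming job consumes
                producer.send(new KeyedMessage<String, String>("cantest5", "can-test-message-" + i));
            }
        } finally {
            producer.close();
        }
    }
}

After running it, the next Spark Streaming batch should print the messages and write the updated offsets under /KafkaOffset/JbcCanParserGroup/cantest5/<partition> in ZooKeeper, which is a quick way to confirm that the offset bookkeeping works.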