Spark 2017 BigData Update (5) Spark Streaming in Java

This article walks through a Spark Streaming application written in Java that consumes data from Kafka and counts the messages containing a given keyword. The complete code and the commands to run it are included.

The small streaming example class, WordCountStreamingApp.java:

package com.sillycat.sparkjava.app;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import com.sillycat.sparkjava.base.SparkBaseApp;

public class WordCountStreamingApp extends SparkBaseApp {

    private static final long serialVersionUID = 7401544141510431796L;

    protected String getAppName() {
        return "WordCountStreamingApp";
    }

    public void executeTask(List<String> params) {
        SparkConf conf = this.getSparkConf();
        logger.info("Starting the streaming context");
        // The batch interval: streaming data is divided into 30-second batches
        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(30000));
        ssc.checkpoint(this.getAppName());
        logger.info("Preparing the resources for streaming");
        processStream(ssc, "carl");
        logger.info("Streaming is running");
        try {
            ssc.start();
            ssc.awaitTermination();
        } catch (InterruptedException e) {
            logger.error("InterruptedException:", e);
        }
    }

    private void processStream(JavaStreamingContext ssc, String keyword) {
        // Standard Kafka consumer configuration for the direct stream
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "fr-stage-api:9092,fr-stage-consumer:9092,fr-perf1:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "WordCountStreamingApp");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", true);
        Collection<String> topics = Arrays.asList("sillycat-topic");

        logger.info("Init the Kafka clients to fetch lines");
        JavaInputDStream<ConsumerRecord<String, String>> dStream = KafkaUtils.createDirectStream(ssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
        // Process every micro-batch RDD as it arrives
        dStream.foreachRDD(rdd -> {
            processRows(rdd, keyword);
        });
    }

    private void processRows(JavaRDD<ConsumerRecord<String, String>> rdds, String keyword) {
        // Extract the message value from each Kafka record
        JavaRDD<String> rows = rdds.map(record -> record.value());
        // Keep only non-empty lines that contain the keyword
        JavaRDD<String> lines = rows.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            public Boolean call(String s) throws Exception {
                if (s == null || s.trim().length() < 1) {
                    return false;
                }
                if (!s.contains(keyword)) {
                    return false;
                }
                return true;
            }
        });
        long count = lines.count();
        logger.info("Kafka received " + count + " lines containing " + keyword);
    }

}
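WordCountStreamingApp extends SparkBaseApp, which is not listed in this post. For reference, here is a minimal sketch of what that base class might look like, assuming it supplies the logger, a shared SparkConf, and the two methods overridden above; the actual class in the sillycat-spark-java project may differ:

package com.sillycat.sparkjava.base;

import java.io.Serializable;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;

// Hypothetical sketch of the base class used by the examples; the real
// implementation lives in the sillycat-spark-java project.
public abstract class SparkBaseApp implements Serializable {

    private static final long serialVersionUID = 1L;

    // transient so the logger is not dragged into task serialization
    protected transient Logger logger = Logger.getLogger(this.getClass());

    // The application name, also used as the checkpoint directory above
    protected abstract String getAppName();

    // Entry point dispatched by the SparkJavaApp main class
    public abstract void executeTask(List<String> params);

    // Shared SparkConf; fall back to a local master when spark-submit
    // did not provide one (the plain java -jar case)
    protected SparkConf getSparkConf() {
        SparkConf conf = new SparkConf().setAppName(getAppName());
        if (!conf.contains("spark.master")) {
            conf.setMaster("local[2]");
        }
        return conf;
    }
}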

Here is how we run all these tests:

#Run locally#

>java -jar target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp

>java -jar target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp
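To give the streaming job something to count, we can push a few lines containing the keyword carl into the topic with the console producer that ships with Kafka (assuming we are in the Kafka install directory; broker and topic as configured in the code above):

>bin/kafka-console-producer.sh --broker-list fr-stage-api:9092 --topic sillycat-topic

Each 30-second batch should then log how many of the received lines contain the keyword.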

#Run the binary locally#

>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp /Users/carl/work/sillycat/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp

>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp

#Run the binary on a remote YARN cluster#

>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp --master yarn --deploy-mode client /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp

>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp --master yarn --deploy-mode client /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp


References:
http://sillycat.iteye.com/blog/2407639