Spark 2017 BigData Update (5) Spark Streaming in Java
The small streaming example class, WordCountStreamingApp.java:
package com.sillycat.sparkjava.app;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import com.sillycat.sparkjava.base.SparkBaseApp;

public class WordCountStreamingApp extends SparkBaseApp {

    private static final long serialVersionUID = 7401544141510431796L;

    protected String getAppName() {
        return "WordCountStreamingApp";
    }

    public void executeTask(List<String> params) {
        SparkConf conf = this.getSparkConf();
        // The time interval at which streaming data will be divided into batches
        logger.info("Start to have the streaming");
        JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(30000));
        ssc.checkpoint(this.getAppName());

        logger.info("Prepare the resource for streaming");
        processStream(ssc, "carl");

        logger.info("Streaming is working");
        try {
            ssc.start();
            ssc.awaitTermination();
        } catch (InterruptedException e) {
            logger.error("InterruptedException:", e);
        }
    }

    private void processStream(JavaStreamingContext ssc, String keyword) {
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "fr-stage-api:9092,fr-stage-consumer:9092,fr-perf1:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "WordCountStreamingApp");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", true);
        Collection<String> topics = Arrays.asList("sillycat-topic");

        logger.info("Init the Kafka Clients to fetch lines");
        JavaInputDStream<ConsumerRecord<String, String>> dStream = KafkaUtils.createDirectStream(ssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
        dStream.foreachRDD(rdd -> {
            processRows(rdd, keyword);
        });
    }

    private void processRows(JavaRDD<ConsumerRecord<String, String>> rdds, String keyword) {
        JavaRDD<String> rows = rdds.map(record -> record.value());
        JavaRDD<String> lines = rows.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;

            public Boolean call(String s) throws Exception {
                if (s == null || s.trim().length() < 1) {
                    return false;
                }
                if (!s.contains(keyword)) {
                    return false;
                }
                return true;
            }
        });
        long count = lines.count();
        logger.info("Kafka received " + count + " " + keyword);
    }

}
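The kafka010 imports come from the spark-streaming-kafka-0-10 module, so that artifact has to end up in the fat jar along with spark-streaming itself. The class also extends SparkBaseApp, which is not listed in this post; a minimal sketch of what that base class could look like, inferred only from the members used above (logger, getAppName, getSparkConf, executeTask) and therefore an assumption rather than the real code, is:

package com.sillycat.sparkjava.base;

import java.io.Serializable;
import java.util.List;

import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;

// Assumed base class: only the members WordCountStreamingApp actually uses are sketched here.
public abstract class SparkBaseApp implements Serializable {

    private static final long serialVersionUID = 1L;

    // Shared logger for subclasses; transient so it is not dragged into task serialization.
    protected transient Logger logger = Logger.getLogger(this.getClass());

    // Each app reports its own name, used for the Spark app name and the checkpoint directory.
    protected abstract String getAppName();

    // Each app implements its own Spark job here.
    public abstract void executeTask(List<String> params);

    // Build the SparkConf; falling back to local[2] is an assumption for running via
    // plain java -jar, spark-submit supplies the real master on a cluster.
    protected SparkConf getSparkConf() {
        SparkConf conf = new SparkConf().setAppName(getAppName());
        if (!conf.contains("spark.master")) {
            conf.setMaster("local[2]");
        }
        return conf;
    }
}

Keeping the logger transient (or static) matters here because the anonymous filter Function captures the enclosing app instance, which is also why WordCountStreamingApp declares its own serialVersionUID.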
Here is how we run all these tests:
#Run locally#
>java -jar target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp
>java -jar target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp
#Run the binary locally with spark-submit#
>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp /Users/carl/work/sillycat/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp
>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp
#Run the binary on a remote YARN cluster#
>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp --master yarn-client /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.CountLinesOfKeywordApp
>bin/spark-submit --class com.sillycat.sparkjava.SparkJavaApp --master yarn-client /home/ec2-user/users/carl/sillycat-spark-java/target/sillycat-spark-java-1.0-jar-with-dependencies.jar com.sillycat.sparkjava.app.WordCountStreamingApp
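The logged count stays at zero until something writes lines into the sillycat-topic topic. One quick way to generate test traffic, assuming a plain Kafka installation with its standard console scripts and the same brokers configured in kafkaParams, is:
>bin/kafka-console-producer.sh --broker-list fr-stage-api:9092 --topic sillycat-topic
Type a few lines containing the keyword carl, and the streaming app should log the matching count for each 30-second batch.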
References:
http://sillycat.iteye.com/blog/2407639