依赖
<!-- Dependencies of the Structured Streaming + Kafka word-count project. -->
<dependencies>
<!-- Spark core: "provided" scope, so it is available at compile time but is expected
     to be supplied by the Spark installation at run time. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.3</version>
<scope>provided</scope>
</dependency>
<!-- Spark SQL (SparkSession / DataFrame / Structured Streaming API): also "provided". -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.3</version>
<scope>provided</scope>
</dependency>
<!-- Hadoop client libraries. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
<version>2.9.2</version>
</dependency>
<!-- HBase client and server artifacts.
     NOTE(review): the word-count job in these notes does not reference HBase; confirm
     whether other code in the project needs these. -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.4</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.4</version>
</dependency>
<!-- MySQL JDBC driver.
     NOTE(review): not referenced by the word-count job in these notes; confirm it is needed. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<!-- Kafka source/sink for Structured Streaming: backs format("kafka") in the job. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>2.4.3</version>
</dependency>
<!-- Kafka integration and core module of the DStream-based Spark Streaming API.
     NOTE(review): unlike spark-core / spark-sql these use the default (compile) scope;
     confirm that the scope difference is intentional. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.4.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.4.3</version>
</dependency>
</dependencies>
<!-- Build configuration: compile the Scala sources and produce a shaded jar. -->
<build>
<plugins>
<!-- Scala compiler plugin. The execution is bound to the process-resources phase,
     which precedes Maven's compile phase, so Scala sources are compiled first
     (hence the execution id "scala-compile-first"). -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Shade plugin: runs the "shade" goal during the package phase. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<!-- For every artifact (*:*), leave the META-INF signature files out of the shaded jar.
     Presumably this avoids signature-verification failures caused by signatures copied
     from signed dependencies; confirm against the shade plugin documentation. -->
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
代码
package com.baizhi.demo11
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Structured Streaming word count over Kafka.
 *
 * Reads records from topic `topic01` on broker `spark:9092`, treats each record
 * value as a line of whitespace-separated words, keeps a running count per word,
 * and writes the updated counts to topic `topic02` on the same broker
 * (record key = word, record value = count rendered as a string).
 */
object StructureStreamingkafka {

  /**
   * Builds the streaming query, starts it, and blocks until it terminates.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the master URL is hard-coded here and is also passed to
    // spark-submit via --master; values set in code take precedence over the
    // command line, so keep the two in sync.
    val spark = SparkSession.builder()
      .master("spark://spark:7077")
      .appName("wordcount")
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._

    // The Kafka source exposes key and value as binary columns; cast both to strings.
    val df: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "spark:9092")
      .option("subscribe", "topic01")
      .load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    val frame = df.select("value")
      // A Kafka record may have a null value; drop it here, otherwise split()
      // below throws a NullPointerException and the whole query fails.
      .where(col("value").isNotNull)
      .as[String]
      // split() yields an empty token for blank lines and for lines with leading
      // whitespace; such a token is not a word, so it is discarded.
      .flatMap(line => line.split("\\s+").filter(_.nonEmpty))
      .map((_, 1))
      .toDF("word", "count")
      .groupBy("word")
      .agg(sum("count") as "count")
      // The Kafka sink takes the record key/value from columns named "key" and "value".
      .selectExpr("word", "CAST(count AS STRING)")
      .withColumnRenamed("word", "key")
      .withColumnRenamed("count", "value")

    val query = frame.writeStream
      // Update mode: each trigger emits only the words whose count changed.
      .outputMode(OutputMode.Update())
      .format("kafka")
      .option("kafka.bootstrap.servers", "spark:9092")
      .option("topic", "topic02")
      .option("checkpointLocation", "hdfs:///checkpoints01")
      .start()

    query.awaitTermination()
  }
}

打成jar包
在虚拟机上启动 Spark、Kafka、ZooKeeper 和 Hadoop
然后依次执行以下步骤
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://spark:7077 \
  --class com.baizhi.demo11.StructureStreamingkafka --deploy-mode cluster --name wordcount \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 --total-executor-cores 4 \
  /root/original-structurestreaming-1.0-SNAPSHOT.jar
在 spark-defaults.conf 中配置以下参数
spark.executor.extraClassPath=/root/.ivy2/jars/*
spark.driver.extraClassPath=/root/.ivy2/jars/*
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://spark:7077 \
  --class com.baizhi.demo11.StructureStreamingkafka --deploy-mode cluster --name wordcount \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 --total-executor-cores 4 \
  /root/original-structurestreaming-1.0-SNAPSHOT.jar
然后开始测试