用 Structured Streaming 读取 Kafka 数据,处理后再写回 Kafka(以 jar 包形式运行)

依赖
  <!--
    Dependencies for the Structured Streaming job shown later in this article.
    All Spark artifacts target Scala 2.11 / Spark 2.4.3.
  -->
  <dependencies>
        <!-- Spark core; "provided" scope, so it is not bundled and must be supplied by the runtime. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.3</version>
               <scope>provided</scope>
        </dependency>

        <!-- Spark SQL (SparkSession, DataFrame, OutputMode); also "provided" scope. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.4.3</version>
               <scope>provided</scope>
        </dependency>
        <!-- Hadoop client libraries; the job's checkpoint location is an hdfs:// path. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>2.9.2</version>
        </dependency>
        <!-- HBase integration (not referenced by the code shown in this article). -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.4</version>
        </dependency>

        <!-- MySQL integration (not referenced by the code shown in this article). -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>

        <!--
          Kafka connector for Structured Streaming; backs format("kafka") in the job.
          The spark-submit command in this article also passes the same artifact via packages.
        -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

        <!-- Kafka integration for Spark Streaming (not referenced by the code shown in this article). -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>
        <!-- Spark Streaming (not referenced by the code shown in this article). -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>
    </dependencies>
    <!-- Build configuration: compile the Scala sources, then produce a shaded jar at package time. -->
    <build>
        <plugins>
            <!-- Compiles Scala sources; bound to process-resources so it runs before the default compile phase. -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.0.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Runs the shade goal during the package phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <!-- For every artifact, drop jar signature files from META-INF. -->
                                <!-- NOTE(review): presumably because they no longer match the merged jar - confirm. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

代码


package com.baizhi.demo11

import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Structured Streaming word count that reads text from one Kafka topic and
 * writes the running per-word totals to another Kafka topic.
 *
 * Input:  topic `topic01` on broker `spark:9092`; each record value is treated
 *         as a line of whitespace-separated words.
 * Output: topic `topic02` on the same broker; record key = word,
 *         record value = current count rendered as a string.
 */
object StructureStreamingkafka {

  /**
   * Builds the streaming query, starts it, and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
//    System.setProperty("HADOOP_USER_NAME","root")
    val spark = SparkSession.builder().master("spark://spark:7077").appName("wordcount").getOrCreate()
//    val spark = SparkSession.builder().master("local[6]").appName("wordcount").getOrCreate()
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Kafka delivers key/value as binary; cast both to strings up front.
    val df: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "spark:9092")
      .option("subscribe", "topic01").load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    val frame = df.select("value")
      .as[String]
      // A Kafka record may carry a null value (e.g. a tombstone); Option(...) skips it
      // instead of throwing a NullPointerException. Empty tokens, produced by leading
      // whitespace or blank lines, are dropped so "" is never counted as a word.
      .flatMap(line => Option(line).toList.flatMap(_.split("\\s+").toList).filter(_.nonEmpty))
      .map((_, 1))
      .toDF("word", "count").groupBy("word")
      .agg(sum("count") as "count")
      // The Kafka sink reads columns named "key" and "value"; alias them explicitly
      // rather than relying on the implicit name of the CAST expression.
      .selectExpr("word AS key", "CAST(count AS STRING) AS value")

    // Update mode: each trigger emits only the words whose count changed.
    val query = frame.writeStream
      .outputMode(OutputMode.Update())
      .format("kafka")
      .option("kafka.bootstrap.servers", "spark:9092")
      .option("topic","topic02")
//      .option("checkpointLocation","hdfs://spark:9000/checkpoints")
      .option("checkpointLocation","hdfs:///checkpoints01")
      .start()
    query.awaitTermination()
  }

}

在这里插入图片描述

打成jar包
在虚拟机启动 spark kafka zk hadoop
依次运行 以下步骤
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://spark:7077 \
  --class com.baizhi.demo11.StructureStreamingkafka --deploy-mode cluster --name wordcount \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 --total-executor-cores 4 \
  /root/original-structurestreaming-1.0-SNAPSHOT.jar
在 spark-defaults.conf 中配置以下参数
spark.executor.extraClassPath=/root/.ivy2/jars/*
spark.driver.extraClassPath=/root/.ivy2/jars/*
[root@CentOS spark-2.4.3]# ./bin/spark-submit --master spark://spark:7077 \
  --class com.baizhi.demo11.StructureStreamingkafka --deploy-mode cluster --name wordcount \
  --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 --total-executor-cores 4 \
  /root/original-structurestreaming-1.0-SNAPSHOT.jar
然后开始测试
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值