spark streaming 入门案例演示
1. 环境
- scala 2.12.12
- jdk 1.8
- idea 2020.1
- maven 3.6.3
- spark 3.0.1
- kafka 0.10
- pom
<!-- Shared version constants, referenced below as ${...} -->
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.12.12</scala.version>
<spark.version>3.0.1</spark.version>
<encoding>UTF-8</encoding>
</properties>
<dependencies>
<!-- Scala standard library (must match the _2.12 artifact suffix below) -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark Streaming core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- Kafka 0.10+ connector for Spark Streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.49</version>
</dependency>
</dependencies>
<build>
<pluginManagement>
<plugins>
<!-- Plugin that compiles Scala sources (version pinned here, used below) -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
</plugin>
<!-- Plugin that compiles Java sources -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<!-- Run the Scala compiler before Java so mixed sources resolve -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Shade plugin: bundles dependencies into one uber jar for spark-submit -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<!-- Strip jar signature files; stale signatures would make the
     merged jar fail verification with a SecurityException -->
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
2. 案例1
离线
1.创建SparkConf
2.创建SparkContext
3.使用SparkContext创建RDD
4.调用RDD的Transformation(s),最后调用Action
5.释放资源(sc.stop)
def main(args: Array[String]): Unit = {
  // Real-time word count over a TCP text stream (e.g. `nc -lk 8888`).
  // SparkConf: app name taken from the class, run locally on all cores.
  val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  // StreamingContext is an enhanced wrapper around SparkContext;
  // the second argument is the micro-batch interval.
  val streamingContext = new StreamingContext(sparkConf, Milliseconds(5000))
  // A DStream is a high-level abstraction over RDDs: one logically
  // identical RDD is generated per batch interval.
  val lineStream: DStream[String] = streamingContext.socketTextStream("localhost", 8888)
  // Transformations: split each line on spaces, pair every word with 1,
  // then sum the ones per word within the batch.
  val wordCounts: DStream[(String, Int)] =
    lineStream.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  // Output operation — triggers execution for each batch.
  wordCounts.print()
  // Start the job and block the driver until it is terminated.
  streamingContext.start()
  streamingContext.awaitTermination()
}
3. 案例2
def main(args: Array[String]): Unit = {
  // Stateful real-time word count: per-word totals accumulate across batches.
  val sparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
  // StreamingContext is an enhanced wrapper around SparkContext;
  // the micro-batch interval is 5000 ms.
  val streamingContext = new StreamingContext(sparkConf, Milliseconds(5000))
  // updateStateByKey needs a checkpoint directory to persist key state.
  streamingContext.checkpoint("./ck")
  streamingContext.sparkContext.setLogLevel("WARN")
  // A DStream is a high-level abstraction over RDDs: one logically
  // identical RDD is generated per batch interval.
  val lineStream: DStream[String] = streamingContext.socketTextStream("localhost", 8888)
  // Split lines into words and pair each word with a count of 1.
  val pairStream: DStream[(String, Int)] = lineStream.flatMap(_.split(" ")).map((_, 1))
  // reduceByKey would only aggregate within the current batch;
  // updateStateByKey folds each batch's counts into the checkpointed history.
  //val reduced: DStream[(String, Int)] = wordAndOne.reduceByKey(_ + _)
  val runningTotals = pairStream.updateStateByKey(updateFunc)
  // Output operation — triggers execution for each batch.
  runningTotals.print()
  // Start the job and block the driver until it is terminated.
  streamingContext.start()
  streamingContext.awaitTermination()
}
// State-update function for updateStateByKey, shape (Seq[V], Option[S]) => Option[S]:
// merges the current batch's counts for a key with the previously stored total.
val updateFunc = (batchCounts: Seq[Int], prevTotal: Option[Int]) => {
  val newTotal = prevTotal.getOrElse(0) + batchCounts.sum
  Some(newTotal)
}