I: Transform
1. Simple transformation operators: map, flatMap, filter (a minimal sketch follows this list).
2. Keyed-stream operator: keyBy. DataStream → KeyedStream: logically splits a stream into disjoint partitions, each containing the elements with the same key; internally this is implemented with hash partitioning.
3. Rolling aggregation operators:
sum(), min(), max(), minBy(), maxBy()
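A minimal sketch of the simple operators and a rolling aggregation, assuming the same comma-separated sensor lines as the in/sensor.txt file used in the example below (the variable names here are illustrative only):
val lines: DataStream[String] = env.readTextFile("in/sensor.txt")
// map: one element in, one element out
val lineLengths: DataStream[Int] = lines.map(_.length)
// flatMap: one element in, zero or more elements out
val fields: DataStream[String] = lines.flatMap(_.split(","))
// filter: keep only the elements matching the predicate
val nonEmpty: DataStream[String] = lines.filter(_.nonEmpty)
// keyBy + rolling aggregation: running count per sensor id
val countsPerSensor: DataStream[(String, Int)] = lines
.map(line => (line.split(",")(0), 1))
.keyBy(_._1)
.sum(1)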
4. Extract the current minimum temperature for each sensor.
package com.atguigu.source
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._
object TransformTest {
def main(args: Array[String]): Unit = {
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val dataStream: DataStream[String] = env.readTextFile("in/sensor.txt")
// first convert the raw text lines into SensorReading
val dsSenReading: DataStream[SensorReading] = dataStream.map(data => {
val arr: Array[String] = data.split(",")
SensorReading(arr(0), arr(1).toLong, arr(2).toDouble)
})
// group the data by id and keep the record with the lowest temperature
val aggStream: DataStream[SensorReading] = dsSenReading
.keyBy("id")
.minBy("temperature")
aggStream.print()
env.execute()
}
}
5. Output the current minimum temperature value together with the most recent timestamp.
val reduceDs: DataStream[SensorReading] = dsSenReading
.keyBy("id")
.reduce((curState, newData) => {
SensorReading(curState.id, newData.timestamp, curState.temperature.min(newData.temperature))
})
II: 1. Multi-stream operations: split and select used together.
// split the sensor stream into high-temperature and low-temperature streams
val splitStream: SplitStream[SensorReading] = dsSenReading.split(data => {
if (data.temperature > 30) Seq("high")
else Seq("low")
})
val highStream: DataStream[SensorReading] = splitStream.select("high")
val lowStream: DataStream[SensorReading] = splitStream.select("low")
2. Merging streams
1) ConnectedStreams → DataStream: operates on a ConnectedStreams; the functionality is the same as map and flatMap, except that each of the streams inside the ConnectedStreams is processed by its own map / flatMap function (CoMap / CoFlatMap).
// merge: connect lets each of the two streams be transformed in its own way
val warningStream: DataStream[(String, Double)] = highStream.map(data=>(data.id,data.temperature))
val connectStream: ConnectedStreams[(String, Double), SensorReading] = warningStream.connect(lowStream)
val connectStream2: DataStream[Product with Serializable] = connectStream.map(
data => (data._1, data._2, "warning"),
data2 => (data2.id, "healthy")
)
connectStream2.print()
2) union: merges multiple streams; all streams must have exactly the same data type.
val unionStream: DataStream[SensorReading] = highStream.union(lowStream)
IV: Data types supported by Flink.
All basic Java and Scala data types are supported; in practice case classes are used most often (a sketch of the SensorReading case class used in these notes follows).
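The SensorReading case class is used throughout these notes but never defined in this section; a minimal definition matching the parsing code above (id, timestamp, temperature) would be:
// case class matching the fields parsed from in/sensor.txt
case class SensorReading(id: String, timestamp: Long, temperature: Double)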
V: Implementing UDF function classes for finer-grained control over the stream
val filterStream: DataStream[SensorReading] = unionStream.filter(sensor => sensor.id.contains("2"))
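The same predicate can also be written as a standalone UDF function class instead of a lambda; a minimal sketch (MyIdFilter is an illustrative name, not from the original code):
import org.apache.flink.api.common.functions.FilterFunction
// FilterFunction implemented as a separate class
class MyIdFilter extends FilterFunction[SensorReading] {
override def filter(value: SensorReading): Boolean = value.id.contains("2")
}
// usage: unionStream.filter(new MyIdFilter)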
VI: "Rich functions" are function-class interfaces provided by the DataStream API; every Flink function class has a Rich version. They differ from regular functions in that they can access the runtime context and have lifecycle methods, which makes more complex functionality possible.
RichMapFunction
RichFlatMapFunction
RichFilterFunction
…
// rich function: can access the runtime context and has lifecycle methods
import org.apache.flink.api.common.functions.{IterationRuntimeContext, RichMapFunction, RuntimeContext}
import org.apache.flink.configuration.Configuration
class MyRichMapper extends RichMapFunction[SensorReading,SensorReading]{
// called once at initialization, e.g. to open a database connection
override def open(parameters: Configuration): Unit = super.open(parameters)
override def setRuntimeContext(t: RuntimeContext): Unit = super.setRuntimeContext(t)
override def getRuntimeContext: RuntimeContext = super.getRuntimeContext
override def getIterationRuntimeContext: IterationRuntimeContext = super.getIterationRuntimeContext
// cleanup work: close connections or clear state
override def close(): Unit = super.close()
override def map(value: SensorReading): SensorReading = value // pass-through placeholder; real mapping logic goes here
}
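A rich function is attached like any other function class; for example, reusing the dsSenReading stream from the example above (illustrative only):
val richMapped: DataStream[SensorReading] = dsSenReading.map(new MyRichMapper)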
VII: Sink
val sinkDs: DataStreamSink[SensorReading] = ds.addSink(
StreamingFileSink.forRowFormat(
new Path("out2/out.txt"),
new SimpleStringEncoder[SensorReading]()
).build()
)
VIII: Writing to Kafka
kafka-console-consumer.sh --bootstrap-server hadoop203:9092 --topic sinktest
val ds: DataStream[String] = dataStream.map(data => {
val arr: Array[String] = data.split(",")
SensorReading(arr(0), arr(1).toLong, arr(2).toDouble).toString
})
ds.addSink(new FlinkKafkaProducer011[String]("hadoop203:9092", "sinktest", new SimpleStringSchema()))
env.execute("kafkasink")
IX: Building a data pipeline: read from the Kafka topic sensor and write to the Kafka topic sinktest
bin/kafka-console-producer.sh --broker-list localhost:9092 --topic sensor
kafka-console-consumer.sh --bootstrap-server hadoop203:9092 --topic sinktest
package com.atguigu.sourceandsink
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, _}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer011, FlinkKafkaProducer011}
// data pipeline: Kafka topic sensor in, Kafka topic sinktest out
object Kafka_Source_Sink {
def main(args: Array[String]): Unit = {
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val properties = new Properties()
properties.setProperty("bootstrap.servers", "hadoop203:9092")
properties.setProperty("group.id", "consumer-group")
// read a stream from Kafka: the general addSource approach
val kafkastream: DataStream[String] = env.addSource(new FlinkKafkaConsumer011[String]("sensor", new SimpleStringSchema(), properties))
kafkastream.addSink(new FlinkKafkaProducer011[String]("hadoop203:9092", "sinktest", new SimpleStringSchema()))
env.execute("kafkasink")
}
}
X: Writing to MySQL
package com.atguigu.sourceandsink
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._
object Sink_MySql {
def main(args: Array[String]): Unit = {
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
val stream: DataStream[String] = env.readTextFile("in/sensor.txt")
val ds: DataStream[SensorReading] = stream.map(data => {
val arr: Array[String] = data.split(",")
SensorReading(arr(0), arr(1).toLong, arr(2).toDouble)
})
ds.print()
ds.addSink(new MyJdbcSink())
env.execute("mysql")
}
}
class MyJdbcSink() extends RichSinkFunction[SensorReading]{
var conn: Connection = _
var insertStmt: PreparedStatement = _
var updateStmt: PreparedStatement = _
// open: mainly creates the connection and prepared statements
override def open(parameters: Configuration): Unit = {
super.open(parameters)
conn = DriverManager.getConnection("jdbc:mysql://hadoop203:3306/test", "root", "000000")
insertStmt = conn.prepareStatement("INSERT INTO sensor_temp (id, temp) VALUES (?, ?)")
updateStmt = conn.prepareStatement("UPDATE sensor_temp SET temp = ? WHERE id = ?")
}
// invoke: called for every record; try an update first, insert if nothing was updated
override def invoke(value: SensorReading, context: SinkFunction.Context[_]): Unit = {
updateStmt.setDouble(1, value.temperature)
updateStmt.setString(2, value.id)
updateStmt.execute()
if (updateStmt.getUpdateCount == 0) {
insertStmt.setString(1, value.id)
insertStmt.setDouble(2, value.temperature)
insertStmt.execute()
}
}
override def close(): Unit = {
insertStmt.close()
updateStmt.close()
conn.close()
}
}
class MyJdbcSinkFunc() extends RichSinkFunction[SensorReading]{
// connection and prepared statements
var conn: Connection = _
var insert: PreparedStatement = _
var update: PreparedStatement = _
// called once at initialization: open the connection and prepare the statements
override def open(parameters: Configuration): Unit = {
super.open(parameters)
// open the connection
conn = DriverManager.getConnection("jdbc:mysql://hadoop203:3306/test", "root", "000000")
// prepare the statements (assign to the member variables; do not shadow them with local vals)
insert = conn.prepareStatement("INSERT INTO sensor_temp (id, temp) VALUES (?, ?)")
update = conn.prepareStatement("UPDATE sensor_temp SET temp = ? WHERE id = ?")
}
// called once for every record written
override def invoke(value: SensorReading, context: SinkFunction.Context[_]): Unit = {
// try the update first; if it did not change any row, insert instead
update.setDouble(1,value.temperature)
update.setString(2,value.id)
update.execute()
if (update.getUpdateCount == 0) { // if 0 rows were updated
insert.setString(1,value.id)
insert.setDouble(2,value.temperature)
insert.execute()
}
}
override def close(): Unit = {
insert.close()
update.close()
conn.close()
}
}