distinct join join function leftOuterJoin rightOuterJoin fullOuterJoin union first coGroup cross
transformation
map
对集合元素,进行一一遍历处理
示例功能:把集合中的每一行转为大写,并在末尾拼接字符串
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.map
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.api.scala._
/**
 * Map example: transforms every element of a DataSet one by one.
 */
object Run {

  /**
   * Builds a two-line DataSet, upper-cases each line, appends a fixed suffix
   * and prints the resulting lines.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val lines = environment.fromElements("c a b d a c", "d c a b c d")
    // One output element per input element: upper-case, then append the suffix.
    val decorated = lines.map(line => line.toUpperCase + "字符串连接")
    decorated.print()
  }
}
输出结果
C A B D A C字符串连接
D C A B C D字符串连接
flatMap
对集合元素,进行一一遍历处理,并把子集合中的数据拉到一个集合中
示例功能:把行进行拆分后,再把不同的行拆分之后的元素,汇总到一个集合中
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.flatmap
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
/**
 * FlatMap example: each input element may produce several output elements,
 * which are all collected into a single flat DataSet.
 */
object Run {

  /**
   * Upper-cases each line, splits it on single spaces and prints every
   * resulting token as its own element.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val lines = environment.fromElements("c a b d a c", "d c a b c d")
    // The per-line token arrays are flattened into one DataSet of tokens.
    val tokens = lines.flatMap(line => line.toUpperCase().split(" "))
    tokens.print()
  }
}
输出结果
C
A
B
D
A
C
D
C
A
B
C
D
filter
对集合元素,进行一一遍历处理,只保留满足条件的元素
示例功能:过滤掉拆分后得到的空字符串
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.filter
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
/**
 * Filter example: keeps only the elements of a DataSet that satisfy a predicate.
 */
object Run {

  /**
   * Splits the upper-cased lines into tokens, drops the empty tokens and
   * prints the remaining ones.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val lines = environment.fromElements("c a b d a c", "d c a b c d")
    val tokens = lines.flatMap(line => line.toUpperCase().split(" "))
    // Only tokens for which the predicate is true are kept.
    val nonEmptyTokens = tokens.filter(token => token.nonEmpty)
    nonEmptyTokens.print()
  }
}
输出结果
C
A
B
D
A
C
D
C
A
B
C
D
reduce
对集合中所有元素,两两之间进行reduce函数表达式的计算
示例功能:统计所有数据的和
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.reduce
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
/**
 * Reduce example: combines all elements of a DataSet pairwise with a reduce
 * function; here the function is addition, so the result is the sum.
 */
object Run {

  /**
   * Sums the elements 3, 5, 8 and 9, printing every intermediate addition
   * and finally the reduced DataSet.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val dataSet = env.fromElements(3, 5, 8, 9)
    // 3 + 5 + 8 + 9
    val dataSet2 = dataSet.reduce((a, b) => {
      // Trace each pairwise step so the order of combination is visible.
      println(s"${a} + ${b} = ${a +b}")
      a + b
    })
    dataSet2.print()
  }
}
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.cogroup
import java.lang
import org.apache.flink.api.common.functions.CoGroupFunction
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
/**
 * CoGroup example: groups two DataSets by key and hands both groups of each
 * key to a user function.
 */
object Run {

  /**
   * Co-groups two tuple DataSets on their first field and prints, for every
   * key, the elements found on each side.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val dataSet = env.fromElements(("a", 1), ("g", 1), ("a", 1))
    val dataSet2 = env.fromElements(("a", 1), ("f", 1))

    // Key both inputs on tuple field 0. The function is also invoked for keys
    // that exist on only one side, with the other side empty, which makes the
    // result comparable to a full outer join.
    val keyed = dataSet.coGroup(dataSet2).where(0).equalTo(0)

    // The third type argument is the output ELEMENT type. It used to be
    // Collector[(String, Int)], which declared a DataSet of collectors.
    val groupPrinter = new CoGroupFunction[(String, Int), (String, Int), (String, Int)] {

      /** Prints a label, the iterable itself and then each of its elements. */
      private def dump(label: String, group: lang.Iterable[(String, Int)]): Unit = {
        println(label)
        println(group)
        val elements = group.iterator()
        while (elements.hasNext()) {
          println(elements.next())
        }
      }

      /**
       * Called once per key with the matching elements of both inputs.
       * Nothing is written to `out`, so the resulting DataSet stays empty.
       */
      override def coGroup(first: lang.Iterable[(String, Int)],
                           second: lang.Iterable[(String, Int)],
                           out: Collector[(String, Int)]): Unit = {
        println("==============开始")
        dump("first", first)
        dump("second", second)
        println("==============结束")
      }
    }

    val dataSet3 = keyed.apply(groupPrinter)
    dataSet3.print()
  }
}
输出结果
==============开始
first
org.apache.flink.runtime.util.NonReusingKeyGroupedIterator$ValuesIterator@3500e7b0
(a,1)
(a,1)
second
org.apache.flink.runtime.util.NonReusingKeyGroupedIterator$ValuesIterator@41230ea2
(a,1)
==============结束
==============开始
first
org.apache.flink.runtime.util.NonReusingKeyGroupedIterator$ValuesIterator@14602d0a
(g,1)
second
[]
==============结束
==============开始
first
[]
second
org.apache.flink.runtime.util.NonReusingKeyGroupedIterator$ValuesIterator@2b0a15b5
(f,1)
==============结束
Process finished with exit code 0
cross
交叉连接
package com.opensourceteams.module.bigdata.flink.example.dataset.transformation.cross
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
/**
 * Cross example: combines two DataSets with the `cross` transformation.
 */
object Run {

  /**
   * Crosses a three-element DataSet with a four-element DataSet and prints
   * the resulting pairs.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val left = environment.fromElements(("a", 1), ("g", 1), ("f", 1))
    val right = environment.fromElements(("d", 1), ("f", 1), ("g", 1), ("f", 1))
    // Cross join: no key is involved; elements of `left` are paired with elements of `right`.
    val pairs = left.cross(right)
    pairs.print()
  }
}