- Data
a.txt:
u1 12 zs
u2 15 xx
u3 18 aaa
u4 20 xa1
u5 22 xa2
b.txt:
u1 2016 9 m1
u2 2017 12 m2
u3 2017 1 m3
u3 2014 2 m4
u3 2012 3 m5
- Requirements
Two datasets:
Dataset A: id, age, name
Dataset B: id, year, month, movie
Both datasets are delimited by spaces.
Required output: id, age, name, year, month, movie (for each user, sort the records from B by year in ascending order; for ids that have no data in B, pad with null).
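The join in the logic below is built on RDD.cogroup, which groups the values of two pair RDDs by key into a pair of iterables. A minimal sketch of its behaviour on two of the sample ids (assuming a Spark shell where sc is already defined; demoA and demoB are illustrative names, not part of the solution below):
val demoA = sc.parallelize(Seq(("u1", "12 zs"), ("u4", "20 xa1")))  // records from a.txt
val demoB = sc.parallelize(Seq(("u1", "2016 9 m1")))                // records from b.txt
demoA.cogroup(demoB).collect().foreach(println)
// (u1,(CompactBuffer(12 zs),CompactBuffer(2016 9 m1)))
// (u4,(CompactBuffer(20 xa1),CompactBuffer()))   <- no record in B, padded with null later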
- Logic
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object DealDataExample1_3 {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      println("Usage: cn.edu360.Example.DealDataExample1_3 <input1> <input2> <output>")
      sys.exit(1)
    }
    // input parameters
    val Array(input1, input2, output) = args
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName(this.getClass.getSimpleName)
    val sc = new SparkContext(conf)
    // load the data
    val data1: RDD[String] = sc.textFile(input1)
    val data2: RDD[String] = sc.textFile(input2)

    // split each line into (id, rest-of-line)
    val cutRes1: RDD[(String, String)] = data1.map { t =>
      val res = t.split(" ", 2)      // split on the first space only
      val id: String = res(0)
      val values: String = res(1)    // "age name"
      (id, values)
    }
    val cutRes2: RDD[(String, String)] = data2.map { t =>
      val res = t.split(" ", 2)
      val id: String = res(0)
      val values: String = res(1)    // "year month movie"
      (id, values)
    }
    //println(cutRes2.collect().toBuffer)
    // cogroup the two datasets by id
    val combyRes: RDD[(String, (Iterable[String], Iterable[String]))] = cutRes1.cogroup(cutRes2)
    //println(combyRes.collect().toBuffer)
    // e.g. (u4,(CompactBuffer(20 xa1),CompactBuffer()))
    // assemble one output line per user: the A record ("age name") first, then the
    // B records ("year month movie") sorted by year ascending, or null padding if absent
    val selectRes: RDD[String] = combyRes.map { case (id, (userIter, movieIter)) =>
      val userMsg = userIter.head                  // exactly one "age name" record per id in A
      val movieData: String =
        if (movieIter.isEmpty) "null,null,null"    // id has no records in B
        else movieIter.toList.sortBy(_.split(" ")(0).toInt).mkString(" ")  // ascending by year
      s"$id $userMsg,$movieData"
    }
    println(selectRes.collect().toBuffer)
    // write the result to the output path
    selectRes.saveAsTextFile(output)
    sc.stop()
  }
}
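To run the example locally, pass three arguments: the path of a.txt, the path of b.txt, and an output directory. With the master hard-coded to local the job runs in a single JVM, which is fine for this kind of test; for a cluster run the setMaster("local") call would normally be removed and the master supplied through spark-submit instead.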
- Result
u1 12 zs,2016 9 m1
u2 15 xx,2017 12 m2
u3 18 aaa,2012 3 m5 2014 2 m4 2017 1 m3
u4 20 xa1,null,null,null
u5 22 xa2,null,null,null