spark单表关联
题目:求孙子和祖父母的关系列表
数据:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
spark代码:
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Single-table self-join: derives (grandchild, grandparent) pairs from text
 * files whose lines hold a whitespace-separated "child parent" relation.
 */
object danbiaoRelation {

  /**
   * Parses one input line into a (child, parent) pair.
   *
   * Surrounding whitespace is ignored and fields may be separated by any run
   * of whitespace. Only the first two fields are used.
   *
   * @param line a raw line from the input file
   * @return Some((child, parent)) for a data line; None for the
   *         "child parent" header row, blank lines, and lines with fewer
   *         than two fields
   */
  def parseRelation(line: String): Option[(String, String)] =
    line.trim.split("\\s+") match {
      // Match the header by its exact fields so that data rows whose names
      // merely contain "child" are not discarded.
      case Array("child", "parent") => None
      case Array(child, parent, _*) => Some((child, parent))
      // Blank or single-field lines cannot form a relation; skip them
      // instead of failing on a missing second field.
      case _                        => None
    }

  /**
   * Reads the relation files, joins the relation with itself on the middle
   * generation, and writes the (grandchild, grandparent) pairs as text into
   * a single output partition.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("paixu").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Keyed by child: (child, parent). Cached because it feeds both sides
      // of the join below.
      val childToParent = sc.textFile("D:\\wc\\danbiaoInput\\*.txt")
        .flatMap(line => parseRelation(line).toList)
        .cache()

      // Keyed by parent: (parent, child).
      val parentToChild = childToParent.map(_.swap)

      // The join key is a person who appears both as a child (left side,
      // value = that person's parent, i.e. the grandparent) and as a parent
      // (right side, value = that person's child, i.e. the grandchild):
      // (parent, (grandparent, grandchild))
      val joined = childToParent.join(parentToChild)

      val grandchildToGrandparent = joined
        .map { case (_, (grandparent, grandchild)) => (grandchild, grandparent) }
        .repartition(1)

      grandchildToGrandparent.saveAsTextFile("D://wc/grandchild_grandparent_rdd")
    } finally {
      // Always release the Spark context, even when the job fails.
      sc.stop()
    }
  }
}
运行结果:
(Tom,Mary)
(Jone,Mary)
(Tom,Ben)
(Jone,Ben)
(Philip,Alice)
(Mark,Alice)
(Philip,Jesse)
(Mark,Jesse)
(Tom,Alice)
(Jone,Alice)
(Tom,Jesse)
(Jone,Jesse)