飞机航班信息字段见下方列表。任务:随机选取一个机场,求它到其它所有机场的最短航线。
(要点:构建出所需的 graph 即可;其余的初始化和 pregel 调用均为固定格式的模板代码。以下为各字段,带 ------ 标记的是本例用到的字段:)
月中第几天,
周中第几天,
航空公司,
飞机注册号,
航班号,
起飞机场编号,------
起飞机场,-------
到达机场编号,-------
到达机场,--------
预计起飞时间,
起飞时间,
起飞延迟,
预计到达时间,
到达时间,
到达延迟,
预计飞行时间,
飞行距离---------
package graph.etl
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * GraphX demo over a flight CSV (`in/flight.csv`).
 *
 * Airports become vertices (id -> name) and each distinct
 * (origin id, destination id, distance) triple becomes a directed edge.
 * The program prints basic graph statistics and then runs a Pregel-based
 * single-source shortest-path computation, weighted by ticket price, from one
 * sampled airport.
 *
 * Columns read from each comma-separated line (0-based):
 * 5 = origin airport id, 6 = origin airport name,
 * 7 = destination airport id, 8 = destination airport name,
 * 16 = flight distance.
 */
object FlightDemo {

  /**
   * Entry point: builds a local Spark session, runs the analysis and always
   * stops the session afterwards, even when the analysis throws.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[8]").setAppName("ETL")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    try {
      run(spark.sparkContext)
    } finally {
      spark.stop()
    }
  }

  /** Loads the flight data, builds the airport graph and prints every report. */
  private def run(sc: SparkContext): Unit = {
    // Load the data and split each line once; both vertices and edges reuse it.
    val fields: RDD[Array[String]] = sc.textFile("in/flight.csv").map(_.split(","))

    // Vertices: every airport that appears as an origin or as a destination.
    val airPort: RDD[(VertexId, String)] = fields
      .flatMap(x => Array((x(5).toLong, x(6)), (x(7).toLong, x(8))))
      .distinct()

    // Edges: origin airport id, destination airport id, flight distance.
    val lines: RDD[Edge[Int]] = fields
      .map(x => (x(5).toLong, x(7).toLong, x(16).toInt))
      .distinct()
      .map { case (src, dst, distance) => Edge(src, dst, distance) }

    // Build the graph object.
    val graph: Graph[String, Int] = Graph(airPort, lines)

    // Number of airports and number of routes.
    println("机场数量" + graph.numVertices)
    println("航线数量" + graph.numEdges)

    // Longest route. `top` keeps only the best candidate per partition instead
    // of sorting every triplet, and `headOption` tolerates an empty graph.
    graph.triplets
      .map(t => (t.attr, t.srcAttr + " " + t.dstAttr + "距离:" + t.attr))
      .top(1)(Ordering.by[(Int, String), Int](_._1))
      .headOption
      .foreach { case (_, label) => println(label) }

    // Busiest airports: highest in-degree, then highest out-degree.
    val byDegree: Ordering[(VertexId, Int)] = Ordering.by[(VertexId, Int), Int](_._2)
    graph.inDegrees.top(1)(byDegree).headOption.foreach(println)
    graph.outDegrees.top(1)(byDegree).headOption.foreach(println)

    // Most important airport according to PageRank (convergence tolerance 0.05).
    val byScore: Ordering[(VertexId, Double)] = Ordering.by[(VertexId, Double), Double](_._2)
    graph.pageRank(0.05).vertices.top(1)(byScore).headOption.foreach(println)

    // Cheapest routes from one sampled airport.
    // `takeSample` returns exactly one airport whenever the RDD is non-empty;
    // `sample` with fraction 1/count could legitimately return nothing.
    val source_id: VertexId = 13930L
    airPort.takeSample(withReplacement = false, num = 1, seed = 1L).headOption match {
      case None =>
        Console.err.println("No airports found in in/flight.csv; skipping the shortest-path step")

      case Some((srcAirportId, srcAirportName)) =>
        println((source_id, srcAirportId, srcAirportName))

        // The source starts at cost 0, every other airport at +infinity.
        // NOTE(review): the original comment gave price = 18.0 + distance*0.15
        // while the code used a base of 180; the code's value is kept — confirm.
        val init_graph: Graph[Double, Double] = graph
          .mapVertices((id, _) => if (id == srcAirportId) 0.0 else Double.PositiveInfinity)
          .mapEdges(e => 180 + 0.15 * e.attr.toDouble)

        val pregel_graph: Graph[Double, Double] = init_graph.pregel(
          Double.PositiveInfinity,
          Int.MaxValue,
          EdgeDirection.Out
        )(
          // Vertex program: keep the cheaper of the current and the offered cost.
          (_, dist, new_dist) => math.min(dist, new_dist),
          // Send a message only when going through this edge is an improvement.
          triple => {
            val offered = triple.attr + triple.srcAttr
            if (offered < triple.dstAttr) Iterator((triple.dstId, offered))
            else Iterator.empty
          },
          // Merge competing offers by keeping the cheapest one.
          (dist, new_dist) => math.min(dist, new_dist)
        )

        // NOTE(review): this lists the three HIGHEST costs (unreachable airports
        // stay at Infinity), as the original descending sort did — confirm that
        // the cheapest destinations were not the intended output.
        val tuples: Array[(VertexId, Double)] = pregel_graph.vertices.top(3)(byScore)
        println(tuples.toList)
    }
  }
}
`1``