from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('GraphFrames').getOrCreate()
# 设置文件路径
tripdelaysFilePath = "./data/flights/departuredelays.csv"
airportsnaFilePath = "./data/flights/airport-codes-na.txt"
# 获取机场数据集
airportsna = spark.read.csv(airportsnaFilePath, header='true', inferSchema='true', sep='\t')
airportsna.createOrReplaceTempView("airports_na")
# 获取延误航班数据集
departureDelays = spark.read.csv(tripdelaysFilePath, header='true')
departureDelays.createOrReplaceTempView("departureDelays")
departureDelays.cache()
# 从延误航班数据集中获取可用的IATA代码
tripIATA = spark.sql("select distinct iata from (select distinct origin as iata from departureDelays union all select distinct destination as iata from departureDelays) a")
tripIATA.createOrReplaceTempView("tripIATA")
# 仅包括离港延迟数据集至少一次行程的机场
airports = spark.sql("select f.IATA, f.City, f.State, f.Country from airports_na f join tripIATA t on t.IATA = f.IATA")
airports.createOrReplaceTempView("airports")
airports.cache()
DataFrame[IATA: string, City: string, State: string, Country: string]
departureDelays.count()
1391578
构建无重复起飞的机场代码列表,可以构建airports DataFrame,它只包含departureDelays数据集中存在的机场代码。
生成一个新的DataFrame(departureDelays_geo),它由一些关键属性,包括航班日期、延迟、距离和机场信息(出发地、目的地)组成
departureDelays_geo = spark.sql("select cast(f.date as int) as tripid, cast(concat(concat(concat(concat(concat(concat('2014-', concat(concat(substr(cast(f.date as string), 1, 2), '-')), substr(cast(f.date as string), 3, 2)), ' '), substr(cast(f.date as string), 5, 2)), ':'), substr(cast(f.date as string), 7, 2)), ':00') as timestamp) as `localdate`, cast(f.delay as int), cast(f.distance as int), f.origin as src, f.destination as dst, o.city as city_src, d.city as city_dst, o.state as state_src, d.state as state_dst from departuredelays f join airports o on o.iata = f.origin join airports d on d.iata = f.destination")
# 建立临时视图和缓存
departureDelays_geo.createOrReplaceTempView('departureDelays_geo')
departureDelays_geo.cache()
DataFrame[tripid: int, localdate: timestamp, delay: int, distance: int, src: string, dst: string, city_src: string, city_dst: string, state_src: string, state_dst: string]
# 查看数据
departureDelays_geo.show(10)
+-------+-------------------+-----+--------+---+---+-----------+-------------------+---------+---------+
| tripid| localdate|delay|distance|src|dst| city_src| city_dst|state_src|state_dst|
+-------+-------------------+-----+--------+---+---+-----------+-------------------+---------+---------+
|1011111|2014-01-01 11:11:00| -5| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1021111|2014-01-02 11:11:00| 7| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1031111|2014-01-03 11:11:00| 0| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1041925|2014-01-04 19:25:00| 0| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1061115|2014-01-06 11:15:00| 33| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1071115|2014-01-07 11:15:00| 23| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1081115|2014-01-08 11:15:00| -9| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1091115|2014-01-09 11:15:00| 11| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1101115|2014-01-10 11:15:00| -3| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
|1112015|2014-01-11 20:15:00| -7| 221|MSP|INL|Minneapolis|International Falls| MN| MN|
+-------+-------------------+-----+--------+---+---+-----------+-------------------+---------+---------+
only showing top 10 rows
构建图形
from pyspark.sql.functions import *
from graphframes import *
# Create Vertices(ariports) and Edges(flights)
tripVertices = airports.withColumnRenamed("IATA", "id").distinct()
tripEdges = departureDelays_geo.select("tripid", "delay", "src", "dst", "city_dst", "state_dst")
# Cache Vertices and Edges
tripEdges.cache()
tripVertices.cache()
DataFrame[id: string, City: string, State: string, Country: string]
display(tripEdges)
DataFrame[tripid: int, delay: int, src: string, dst: string, city_dst: string, state_dst: string]
tripEdges.show(10)
tripVertices.show(10)
+-------+-----+---+---+-------------------+---------+
| tripid|delay|src|dst| city_dst|state_dst|
+-------+-----+---+---+-------------------+---------+
|1011111| -5|MSP|INL|International Falls| MN|
|1021111| 7|MSP|INL|International Falls| MN|
|1031111| 0|MSP|INL|International Falls| MN|
|1041925| 0|MSP|INL|International Falls| MN|
|1061115| 33|MSP|INL|International Falls| MN|
|1071115| 23|MSP|INL|International Falls| MN|
|1081115| -9|MSP|INL|International Falls| MN|
|1091115| 11|MSP|INL|International Falls| MN|
|1101115| -3|MSP|INL|International Falls| MN|
|1112015| -7|MSP|INL|International Falls| MN|
+-------+-----+---+---+-------------------+---------+
only showing top 10 rows
+---+----------------+-----+-------+
| id| City|State|Country|
+---+----------------+-----+-------+
|FAT| Fresno| CA| USA|
|CMH| Columbus| OH| USA|
|PHX| Phoenix| AZ| USA|
|PAH| Paducah| KY| USA|
|COS|Colorado Springs| CO| USA|
|MYR| Myrtle Beach| SC| USA|
|RNO| Reno| NV| USA|
|SRQ| Sarasota| FL| USA|
|VLD| Valdosta| GA| USA|
|PSC| Pasco| WA| USA|
+---+----------------+-----+-------+
only showing top 10 rows
现在有两个DataFrame,可以使用GraphFrame创建一个GraphFrame:
tripGraph = GraphFrame(tripVertices, tripEdges)
执行简单查询
从一组简单的图形开始查询了解航班的表现和出发延误的情况
确定机场和航班的数量
print('Airports: %d' % tripGraph.vertices.count())
print('Trips: %d' % tripGraph.edges.count())
Airports: 279
Trips: 1361141
从以上结果看到,共279个机场和136万次航班
确定这个数据集中的最长延误时间
要确定数据集中航班最长的延误时间,可以执行一下查询:
tripGraph.edges.groupBy().max('delay').show()
+----------+
|max(delay)|
+----------+
| 1642|
+----------+
确定延误和准点/早到航班的数量对比
print('On-time / Early Flights: %d' % tripGraph.edges.filter('delay <= 0').count())
print('Delayed Filghts: %d' % tripGraph.edges.filter('delay > 0').count())
On-time / Early Flights: 780469
Delayed Filghts: 580672
哪一班从西雅图出发的航班最有可能出现重大延误
进一步挖掘这些数据,找出可能造成重大延误的从西雅图出发的航班。可以通过以下查询实现:
tripGraph.edges.filter('src="SEA" and delay > 0').groupBy('src', 'dst').avg('delay').sort(desc('avg(delay)')).show(5)
+---+---+------------------+
|src|dst| avg(delay)|
+---+---+------------------+
|SEA|PHL|55.666666666666664|
|SEA|COS| 43.53846153846154|
|SEA|FAT| 43.03846153846154|
|SEA|LGB| 39.39705882352941|
|SEA|IAD|37.733333333333334|
+---+---+------------------+
only showing top 5 rows
从上结果看出,从西雅图出发的航班出现延误的前五名城市是Philadelphia(PHL)、Colorado Springs(COS)、Fresno(FAT)、Long Beach(LGB)和Washington D.C(IDA):
西雅图出发到哪个州的航班最有可能出现重大延误
查看从西雅图出发到哪个州的航班有最长的累积延误(单次延误 > 100分钟)。
tripGraph.edges.filter('src = "SEA" and delay > 100').show()
+-------+-----+---+---+-------------+---------+
| tripid|delay|src|dst| city_dst|state_dst|
+-------+-----+---+---+-------------+---------+
|3201938| 108|SEA|BUR| Burbank| CA|
|3201655| 107|SEA|SNA|Orange County| CA|
|1011950| 123|SEA|OAK| Oakland| CA|
|1021950| 194|SEA|OAK| Oakland| CA|
|1021615| 317|SEA|OAK| Oakland| CA|
|1021755| 385|SEA|OAK| Oakland| CA|
|1031950| 283|SEA|OAK| Oakland| CA|
|1031615| 364|SEA|OAK| Oakland| CA|
|1031325| 130|SEA|OAK| Oakland| CA|
|1061755| 107|SEA|OAK| Oakland| CA|
|1081330| 118|SEA|OAK| Oakland| CA|
|2282055| 150|SEA|OAK| Oakland| CA|
|3061600| 130|SEA|OAK| Oakland| CA|
|3170815| 199|SEA|DCA|Washington DC| null|
|2151845| 128|SEA|KTN| Ketchikan| AK|
|2281845| 104|SEA|KTN| Ketchikan| AK|
|3130720| 117|SEA|KTN| Ketchikan| AK|
|1011411| 177|SEA|IAH| Houston| TX|
|1022347| 158|SEA|IAH| Houston| TX|
|1021411| 170|SEA|IAH| Houston| TX|
+-------+-----+---+---+-------------+---------+
only showing top 20 rows
理解节点的度
节点的度数是该节点周围的边数。度数是到节点(即机场)的边的总数(即航班)。
tripGraph.degrees.sort(desc('degree')).show(10)
+---+------+
| id|degree|
+---+------+
|ATL|179774|
|DFW|133966|
|ORD|125405|
|LAX|106853|
|DEN|103699|
|IAH| 85685|
|PHX| 79672|
|SFO| 77635|
|LAS| 66101|
|CLT| 56103|
+---+------+
only showing top 10 rows
tripGraph.outDegrees.sort(desc('outDegree')).show(10)
+---+---------+
| id|outDegree|
+---+---------+
|ATL| 90141|
|DFW| 68199|
|ORD| 63751|
|LAX| 53669|
|DEN| 52961|
|IAH| 43173|
|PHX| 40053|
|SFO| 38994|
|LAS| 33107|
|CLT| 28059|
+---+---------+
only showing top 10 rows
确定最大的中转机场
1.理解机场节点度的一个扩展是确定最大的中转机场。许多机场被用作中转站而不是目的地。一个简单的计算方法是通过计算inDegrees(到达该机场的航班数量)/outDegress(离开机场的航班数量)的比例。接近1的值可能表示大量的中转航班,而值<1表示出战航班较多,值>1表示入站航班较多。
inDeg = tripGraph.inDegrees
outDeg = tripGraph.outDegrees
degreeRatio = inDeg.join(outDeg, inDeg.id == outDeg.id) \
.drop(outDeg.id) \
.selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio") \
.cache()
TransferAirports = degreeRatio.join(airports, degreeRatio.id == airports.IATA) \
.selectExpr("id", "city", "degreeRatio") \
.filter("degreeRatio < .9 or degreeRatio > 1.1")
TransferAirports.orderBy('degreeRatio').show(10)
+---+-----------+-------------------+
| id| city| degreeRatio|
+---+-----------+-------------------+
|BRW| Barrow|0.28651685393258425|
|OME| Nome| 0.5084745762711864|
|FAI| Fairbanks| 1.1232686980609419|
|GFK|Grand Forks| 1.3333333333333333|
+---+-----------+-------------------+
理解Motif
为了更容易理解城市和航班之间的复杂关系,使用modifs命令进一步挖掘机场(节点)和航班(边)之间的关系。DataFrame结果中的列名通过Motif key给出。
Motif查找是作为GraphFrames的一部分被支持的新图形算法之一。
确定最受欢迎的直飞航班
import pyspark.sql.functions as func
topTrips = tripGraph.edges.groupBy('src', 'dst').agg(func.count('delay').alias('trips'))
topTrips.orderBy(topTrips.trips.desc()).show(10)
+---+---+-----+
|src|dst|trips|
+---+---+-----+
|SFO|LAX| 3232|
|LAX|SFO| 3198|
|LAS|LAX| 3016|
|LAX|LAS| 2964|
|JFK|LAX| 2720|
|LAX|JFK| 2719|
|ATL|LGA| 2501|
|LGA|ATL| 2500|
|LAX|PHX| 2394|
|PHX|LAX| 2387|
+---+---+-----+
only showing top 10 rows
使用广度优先搜索
广度优先算法(BFS)是一种新的算法,它作为GraphFrames的一部分,可以找出从一组节点到另一组节点的最短路径。通过BFS遍历tripGraph命令来快速找到所需的节点(机场)和边(航班)。
filteredPaths = tripGraph.bfs(fromExpr='id = "SEA"',
toExpr='id = "SFO"',
maxPathLength=1)
filteredPaths.show()
+--------------------+--------------------+--------------------+
| from| e0| to|
+--------------------+--------------------+--------------------+
|[SEA, Seattle, WA...|[1010710, 31, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1012125, -4, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1011840, -5, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1010610, -4, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1011230, -2, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1010955, -6, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1011100, 2, SEA,...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1011405, 0, SEA,...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1020710, -1, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1022125, -4, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1021840, -5, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1020610, 3, SEA,...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1021230, -2, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1020955, -7, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1021100, -1, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1021405, -4, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1030710, -13, SE...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1032125, -3, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1031840, -1, SEA...|[SFO, San Francis...|
|[SEA, Seattle, WA...|[1030610, -5, SEA...|[SFO, San Francis...|
+--------------------+--------------------+--------------------+
only showing top 20 rows