# 航班数据分析实战
# 1, 查看航班信息Schema
# 2, 提取关注字段, 做航班信息简明情况报表
# 3, 指定日期, 查询航班信息简报
# 4, 分组查询, 按航班号分组, 查询延迟次数
# 5, 分组查询, 按目的地机场分组, 查询延迟次数
# 6, 聚合查询, 查询某天某机场到达航班总数量
# 7, 聚合查询, 查询某天某机场到达航班平均延迟时间
# 8, 分组聚合, 查询某机场每天延迟到达航班架次
# 9, 分组聚合, 查询某机场每天延迟30分钟以上到达航班
from pyspark.sql import SparkSession,Row
import pandas as pd

# Local single-threaded Spark session for the demo.
spark = SparkSession.builder\
    .master('local[1]')\
    .appName('dataFrameDemo')\
    .getOrCreate()

# Load the 2008 on-time performance CSV; header=True takes column names
# from the first row, inferSchema=True types the numeric columns.
df = spark.read.csv('file:///work/大数据课件/data/2008.csv',
                    inferSchema=True,
                    header=True)

# Step 1: inspect the inferred schema.
df.printSchema()

# Step 2: keep only the columns needed for the briefing report.
#   Month / DayofMonth  - flight date
#   FlightNum           - flight number
#   Origin / Dest       - departure / arrival airport
#   DepTime / ArrTime   - actual departure / arrival time
#   DepDelay / ArrDelay - departure / arrival delay (minutes)
#   AirTime / Distance  - flight time and distance
df1 = df.select(df.Month, df.DayofMonth, df.FlightNum,
                df.Origin, df.DepTime, df.Dest, df.ArrTime,
                df.DepDelay, df.ArrDelay, df.AirTime, df.Distance)

# registerTempTable() is deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported replacement.
df1.createOrReplaceTempView('AirInfo')

# Step 3: daily briefing - every flight on April 5th.
strSQL = 'select * from AirInfo where Month=4 and DayofMonth=5'
dfQ = spark.sql(strSQL)
dfQ.show()

# Step 4: number of delayed arrivals per flight number.
# ArrDelay was inferred as a STRING (the column contains 'NA'), so the
# original filter "not ArrDelay like '-%'" also counted on-time rows
# (ArrDelay = 0) and 'NA' rows as delays.  Cast to int and require a
# strictly positive delay instead: cast('NA' as int) yields NULL, and
# "NULL > 0" is not true, so the 'NA' rows drop out as well.
spark.sql(
    "select FlightNum, count(*) as DelayCount "
    "from AirInfo "
    "where cast(ArrDelay as int) > 0 "
    "group by FlightNum"
).show()

spark.stop()
root
|-- Year: integer (nullable = true)
|-- Month: integer (nullable = true)
|-- DayofMonth: integer (nullable = true)
|-- DayOfWeek: integer (nullable = true)
|-- DepTime: string (nullable = true)
|-- CRSDepTime: integer (nullable = true)
|-- ArrTime: string (nullable = true)
|-- CRSArrTime: integer (nullable = true)
|-- UniqueCarrier: string (nullable = true)
|-- FlightNum: integer (nullable = true)
|-- TailNum: string (nullable = true)
|-- ActualElapsedTime: string (nullable = true)
|-- CRSElapsedTime: string (nullable = true)
|-- AirTime: string (nullable = true)
|-- ArrDelay: string (nullable = true)
|-- DepDelay: string (nullable = true)
|-- Origin: string (nullable = true)
|-- Dest: string (nullable = true)
|-- Distance: integer (nullable = true)
|-- TaxiIn: string (nullable = true)
|-- TaxiOut: string (nullable = true)
|-- Cancelled: integer (nullable = true)
|-- CancellationCode: string (nullable = true)
|-- Diverted: integer (nullable = true)
|-- CarrierDelay: string (nullable = true)
|-- WeatherDelay: string (nullable = true)
|-- NASDelay: string (nullable = true)
|-- SecurityDelay: string (nullable = true)
|-- LateAircraftDelay: string (nullable = true)
+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
|Month|DayofMonth|FlightNum|Origin|DepTime|Dest|ArrTime|DepDelay|ArrDelay|AirTime|Distance|
+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
| 4| 5| 661| ABQ| 1804| AMA| 1959| -1| -1| 45| 277|
| 4| 5| 1858| ABQ| 1043| BWI| 1609| -2| -11| 196| 1670|
| 4| 5| 1223| ABQ| 1424| DAL| 1654| -6| -16| 79| 580|
| 4| 5| 1376| ABQ| 1946| DAL| 2210| -4| -10| 74| 580|
| 4| 5| 2407| ABQ| 1819| DAL| 2051| -1| -4| 78| 580|
| 4| 5| 2507| ABQ| 955| DAL| 1225| 0| -5| 79| 580|
| 4| 5| 2836| ABQ| 1310| DAL| 1541| -5| -14| 78| 580|
| 4| 5| 2956| ABQ| 1648| DAL| 1918| -2| -12| 79| 580|
| 4| 5| 3282| ABQ| 658| DAL| 927| -2| -13| 77| 580|
| 4| 5| 901| ABQ| 843| DEN| 956| -2| -4| 58| 349|
| 4| 5| 1022| ABQ| 1420| DEN| 1526| 0| -9| 53| 349|
| 4| 5| 2582| ABQ| 1718| ELP| 1806| -2| -9| 37| 223|
| 4| 5| 3103| ABQ| 1035| ELP| 1123| 5| 3| 38| 223|
| 4| 5| 2679| ABQ| 707| HOU| 959| -3| -11| 98| 759|
| 4| 5| 3269| ABQ| 1536| HOU| 1822| 1| -13| 97| 759|
| 4| 5| 681| ABQ| 632| LAS| 702| -3| -3| 78| 487|
| 4| 5| 788| ABQ| 900| LAS| 931| 0| -4| 78| 487|
| 4| 5| 864| ABQ| 1824| LAS| 1855| 4| 5| 78| 487|
| 4| 5| 1108| ABQ| 1208| LAS| 1239| 23| 19| 79| 487|
| 4| 5| 2370| ABQ| 1449| LAS| 1521| -1| -4| 81| 487|
+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
only showing top 20 rows
+---------+--------+
|FlightNum|count(1)|
+---------+--------+
| 1580| 1155|
| 463| 1142|
| 2366| 511|
| 496| 1004|
| 833| 570|
| 1829| 637|
| 148| 1354|
| 2866| 526|
| 1342| 342|
| 3918| 284|
| 1088| 891|
| 1591| 820|
| 2122| 477|
| 3749| 569|
| 1238| 732|
| 7754| 152|
| 2659| 453|
| 2142| 339|
| 7240| 175|
| 7253| 247|
+---------+--------+
only showing top 20 rows
# 4
# 先过滤, 所有延迟的航班 where('ArrDelay > 0')
# groupBy(df1.FlightNum).count()
# 注意数据类型的问题, 可先转换成数值类型
# Equivalent SQL (note: the WHERE clause must come BEFORE group by):
# select FlightNum, count(*) from AirInfo
# where ArrDelay > 0
# group by FlightNum
#spark.sql("
# select FlightNum,count(*) as DelayCount
# from AirInfo
# where not ArrDelay like '-%'
# group by FlightNum
#").show()
# The HAVING clause filters the groups AFTER aggregation.
# To drop flights that were delayed fewer than 1000 times, keep only the
# groups with at least 1000 delays:
# having DelayCount >= 1000
#DataFrame 连接运算
# DataFrame join demo: cross join vs. inner equi-join, plus file writers.
from pyspark.sql import SparkSession,Row

spark = SparkSession.builder\
    .master('local[1]')\
    .appName('dataFrameDemo')\
    .getOrCreate()

# Name/Age pairs - the left side of both joins.
listA = ['Tom', 'Jerry', 'Kate', 'Jim']
listB = [5, 4, 3, 6]
tupleA = zip(listA, listB)

# Name/Gender pairs - the right side; only Tom and Kate overlap with listA.
listC = ['Tom', 'Kate', 'David', 'Henry']
listD = ['M', 'F', 'M', 'M']
tupleB = zip(listC, listD)

# Extra Name/TAge data; built into df4 below but not used further here.
listE = ['Dom', 'Jerry', 'Kate', 'Gim']
listF = [5, 4, 3, 6]
tupleC = zip(listE, listF)

df1 = spark.createDataFrame(
    list(tupleA), schema=['Name', 'Age']
)
df2 = spark.createDataFrame(
    list(tupleB), schema=['Name', 'Gender']
)
df4 = spark.createDataFrame(
    list(tupleC), schema=['Name', 'TAge']
)

# Cartesian product: every df1 row paired with every df2 row (4 x 4 = 16).
df3 = df1.crossJoin(df2)
# Inner equi-join on Name: only names present in BOTH frames survive.
df5 = df1.join(df2, df1.Name == df2.Name, 'inner')

df1.show()
df2.show()
df3.show()
df5.show()

# Persist df1 in three formats.  mode='overwrite' is added to the parquet
# write as well (csv/json already had it) so the script can be re-run
# without failing with a "path already exists" AnalysisException.
df1.write.csv('file:///Users/chuzhengkai/Desktop/df1.csv', mode='overwrite')
df1.write.json('file:///Users/chuzhengkai/Desktop/df1.json', mode='overwrite')
df1.write.parquet('file:///Users/chuzhengkai/Desktop/df1', mode='overwrite')
spark.stop()
+-----+---+
| Name|Age|
+-----+---+
| Tom| 5|
|Jerry| 4|
| Kate| 3|
| Jim| 6|
+-----+---+
+-----+------+
| Name|Gender|
+-----+------+
| Tom| M|
| Kate| F|
|David| M|
|Henry| M|
+-----+------+
+-----+---+-----+------+
| Name|Age| Name|Gender|
+-----+---+-----+------+
| Tom| 5| Tom| M|
| Tom| 5| Kate| F|
| Tom| 5|David| M|
| Tom| 5|Henry| M|
|Jerry| 4| Tom| M|
|Jerry| 4| Kate| F|
|Jerry| 4|David| M|
|Jerry| 4|Henry| M|
| Kate| 3| Tom| M|
| Kate| 3| Kate| F|
| Kate| 3|David| M|
| Kate| 3|Henry| M|
| Jim| 6| Tom| M|
| Jim| 6| Kate| F|
| Jim| 6|David| M|
| Jim| 6|Henry| M|
+-----+---+-----+------+
+----+---+----+------+
|Name|Age|Name|Gender|
+----+---+----+------+
|Kate| 3|Kate| F|
| Tom| 5| Tom| M|
+----+---+----+------+
# '1123' 0x00000031 0x00000031 0x00000032 0x00000033
# 1123 = 1024 + 64 + 32 + 2 +1
# 0100 0110 0011
# 0x00000463
# Read back the parquet output written by the join demo above.
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .master('local[1]')\
    .appName('dataFrameDemo')\
    .getOrCreate()

# Point the reader at the output DIRECTORY, not an individual part file:
# part-file names embed a random UUID that changes on every write, so a
# hard-coded part path breaks as soon as the data is rewritten.  Spark
# automatically reads all part files found in the directory.
df1 = spark.read.parquet('file:///Users/chuzhengkai/Desktop/df1')
df1.show()

# Derived column: Age + 1, exposed under the alias 'NewAge'.
df1.select(df1.Name, (df1.Age + 1).alias('NewAge')).show()
spark.stop()
+-----+---+
| Name|Age|
+-----+---+
| Tom| 5|
|Jerry| 4|
| Kate| 3|
| Jim| 6|
+-----+---+
+-----+------+
| Name|NewAge|
+-----+------+
| Tom| 6|
|Jerry| 5|
| Kate| 4|
| Jim| 7|
+-----+------+