spark sql

本文通过实战案例介绍了使用PySpark进行航班数据分析的方法。主要内容包括:检查航班数据的结构;筛选关键字段并制作简明报表;针对特定日期查询航班详情;按航班号及目的地机场分组统计延误次数;聚合查询某机场到达航班的数量与平均延误时间等。
# 航班数据分析实战

# 1, 查看航班信息Schema
# 2, 提取关注字段, 做航班信息简明情况报表
# 3, 指定日期, 查询航班信息简报
# 4, 分组查询, 按航班号分组, 查询延迟次数
# 5, 分组查询, 按目的地机场分组, 查询延迟次数
# 6, 聚合查询, 查询某天某机场到达航班总数量
# 7, 聚合查询, 查询某天某机场到达航班平均延迟时间
# 8, 分组聚合, 查询某机场每天延迟到达航班架次
# 9, 分组聚合, 查询某机场每天延迟30分钟以上到达航班

# Flight data analysis with Spark SQL (2008 on-time performance dataset).

from pyspark.sql import SparkSession,Row
import pandas as pd

spark = SparkSession.builder\
        .master('local[1]')\
        .appName('dataFrameDemo')\
        .getOrCreate()

# Read the raw CSV; inferSchema samples the data to pick column types.
# Note: the delay/time columns come out as strings because they contain 'NA'.
df = spark.read.csv('file:///work/大数据课件/data/2008.csv',
                    inferSchema = True, 
                    header=True)
df.printSchema()

# 2. Column meanings (subset used below):
# Year/Month/DayofMonth/DayOfWeek - flight date
# DepTime/ArrTime                 - actual departure / arrival time
# FlightNum/TailNum               - flight number / aircraft tail number
# ActualElapsedTime/AirTime       - total elapsed / airborne minutes
# ArrDelay/DepDelay               - arrival / departure delay in minutes
# Origin/Dest/Distance            - origin airport, destination airport, distance

# Compact report: keep only the columns of interest.
df1 = df.select(df.Month,df.DayofMonth,df.FlightNum,\
        df.Origin,df.DepTime,df.Dest,df.ArrTime,\
        df.DepDelay,df.ArrDelay,df.AirTime,df.Distance)

# registerTempTable() is deprecated since Spark 2.0;
# createOrReplaceTempView() is the supported equivalent.
df1.createOrReplaceTempView('AirInfo')

# 3. Flight briefing for a specific date (April 5th).
strSQL = 'select * from AirInfo where Month=4 and DayofMonth=5'

dfQ = spark.sql(strSQL)
dfQ.show()

# 4. Delayed-arrival count per flight number.
# ArrDelay is a string ('NA' for missing); cast(... as int) turns 'NA' into
# NULL so the comparison drops it, and '> 0' also drops on-time (0) flights.
# The original filter "not ArrDelay like '-%'" wrongly counted both the
# ArrDelay = 0 and the 'NA' rows as delays.
spark.sql("select FlightNum,count(*) from AirInfo "
          "where cast(ArrDelay as int) > 0 group by FlightNum").show()

spark.stop()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- CarrierDelay: string (nullable = true)
 |-- WeatherDelay: string (nullable = true)
 |-- NASDelay: string (nullable = true)
 |-- SecurityDelay: string (nullable = true)
 |-- LateAircraftDelay: string (nullable = true)

+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
|Month|DayofMonth|FlightNum|Origin|DepTime|Dest|ArrTime|DepDelay|ArrDelay|AirTime|Distance|
+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
|    4|         5|      661|   ABQ|   1804| AMA|   1959|      -1|      -1|     45|     277|
|    4|         5|     1858|   ABQ|   1043| BWI|   1609|      -2|     -11|    196|    1670|
|    4|         5|     1223|   ABQ|   1424| DAL|   1654|      -6|     -16|     79|     580|
|    4|         5|     1376|   ABQ|   1946| DAL|   2210|      -4|     -10|     74|     580|
|    4|         5|     2407|   ABQ|   1819| DAL|   2051|      -1|      -4|     78|     580|
|    4|         5|     2507|   ABQ|    955| DAL|   1225|       0|      -5|     79|     580|
|    4|         5|     2836|   ABQ|   1310| DAL|   1541|      -5|     -14|     78|     580|
|    4|         5|     2956|   ABQ|   1648| DAL|   1918|      -2|     -12|     79|     580|
|    4|         5|     3282|   ABQ|    658| DAL|    927|      -2|     -13|     77|     580|
|    4|         5|      901|   ABQ|    843| DEN|    956|      -2|      -4|     58|     349|
|    4|         5|     1022|   ABQ|   1420| DEN|   1526|       0|      -9|     53|     349|
|    4|         5|     2582|   ABQ|   1718| ELP|   1806|      -2|      -9|     37|     223|
|    4|         5|     3103|   ABQ|   1035| ELP|   1123|       5|       3|     38|     223|
|    4|         5|     2679|   ABQ|    707| HOU|    959|      -3|     -11|     98|     759|
|    4|         5|     3269|   ABQ|   1536| HOU|   1822|       1|     -13|     97|     759|
|    4|         5|      681|   ABQ|    632| LAS|    702|      -3|      -3|     78|     487|
|    4|         5|      788|   ABQ|    900| LAS|    931|       0|      -4|     78|     487|
|    4|         5|      864|   ABQ|   1824| LAS|   1855|       4|       5|     78|     487|
|    4|         5|     1108|   ABQ|   1208| LAS|   1239|      23|      19|     79|     487|
|    4|         5|     2370|   ABQ|   1449| LAS|   1521|      -1|      -4|     81|     487|
+-----+----------+---------+------+-------+----+-------+--------+--------+-------+--------+
only showing top 20 rows

+---------+--------+
|FlightNum|count(1)|
+---------+--------+
|     1580|    1155|
|      463|    1142|
|     2366|     511|
|      496|    1004|
|      833|     570|
|     1829|     637|
|      148|    1354|
|     2866|     526|
|     1342|     342|
|     3918|     284|
|     1088|     891|
|     1591|     820|
|     2122|     477|
|     3749|     569|
|     1238|     732|
|     7754|     152|
|     2659|     453|
|     2142|     339|
|     7240|     175|
|     7253|     247|
+---------+--------+
only showing top 20 rows

# 4
# 先过滤, 所有延迟的航班  where('ArrDelay > 0')
# groupBy(df1.FlightNum).count()
# 注意数据类型的问题, 可先转换成数值类型


# sql 语句 (注意: where 子句必须写在 group by 之前)
# select FlightNum,count(*) from AirInfo
# where ArrDelay > 0
# group by FlightNum

#spark.sql("
#    select FlightNum,count(*) as DelayCount
#    from AirInfo
#    where not ArrDelay like '-%'
#    group by FlightNum
#").show()

#   having 子句, 用于过滤分组之后的结果
#   聚合之后的结果, 想过滤掉迟到不足1000次的航班 (即只保留迟到1000次及以上的)
#   having DelayCount >= 1000

#DataFrame 连接运算

# DataFrame join operations demo: cross join, inner join, and persisting
# a DataFrame in csv / json / parquet formats.

from pyspark.sql import SparkSession,Row

spark = SparkSession.builder\
        .master('local[1]')\
        .appName('dataFrameDemo')\
        .getOrCreate()

# Name/age pairs.
listA = ['Tom','Jerry','Kate','Jim']
listB = [5,4,3,6]
tupleA = zip(listA,listB)

# Name/gender pairs; only Tom and Kate overlap with listA.
listC =['Tom','Kate','David','Henry']
listD =['M','F','M','M']
tupleB = zip(listC,listD)

# Extra name/age pairs (built into df4 below but not otherwise used).
listE = ['Dom','Jerry','Kate','Gim']
listF = [5,4,3,6]
tupleC = zip(listE,listF)


df1 = spark.createDataFrame(
    list(tupleA),schema=['Name','Age']
)
df2 = spark.createDataFrame(
    list(tupleB),schema=['Name','Gender']
)
df4 = spark.createDataFrame(
    list(tupleC),schema=['Name','TAge']
)

# Cartesian product: every row of df1 paired with every row of df2.
df3 = df1.crossJoin(df2)

# Inner join on Name: keeps only names present in both frames (Tom, Kate).
df5 = df1.join(df2,df1.Name == df2.Name,'inner')
df1.show()
df2.show()
df3.show()
df5.show()

# Persist df1 in three formats.  mode='overwrite' added to the parquet
# write as well: without it, a second run of this script fails with
# "path ... already exists" (the csv/json writes already used it).
df1.write.csv('file:///Users/chuzhengkai/Desktop/df1.csv',mode='overwrite')
df1.write.json('file:///Users/chuzhengkai/Desktop/df1.json',mode='overwrite')
df1.write.parquet('file:///Users/chuzhengkai/Desktop/df1',mode='overwrite')
spark.stop()

+-----+---+
| Name|Age|
+-----+---+
|  Tom|  5|
|Jerry|  4|
| Kate|  3|
|  Jim|  6|
+-----+---+

+-----+------+
| Name|Gender|
+-----+------+
|  Tom|     M|
| Kate|     F|
|David|     M|
|Henry|     M|
+-----+------+

+-----+---+-----+------+
| Name|Age| Name|Gender|
+-----+---+-----+------+
|  Tom|  5|  Tom|     M|
|  Tom|  5| Kate|     F|
|  Tom|  5|David|     M|
|  Tom|  5|Henry|     M|
|Jerry|  4|  Tom|     M|
|Jerry|  4| Kate|     F|
|Jerry|  4|David|     M|
|Jerry|  4|Henry|     M|
| Kate|  3|  Tom|     M|
| Kate|  3| Kate|     F|
| Kate|  3|David|     M|
| Kate|  3|Henry|     M|
|  Jim|  6|  Tom|     M|
|  Jim|  6| Kate|     F|
|  Jim|  6|David|     M|
|  Jim|  6|Henry|     M|
+-----+---+-----+------+

+----+---+----+------+
|Name|Age|Name|Gender|
+----+---+----+------+
|Kate|  3|Kate|     F|
| Tom|  5| Tom|     M|
+----+---+----+------+

# '1123'  0x00000031 0x00000031 0x00000032 0x00000033
# 1123 = 1024 + 64 + 32 + 2 +1
# 0100 0110 0011
# 0x00000463

# 读parquet
# Read the parquet dataset written by the join demo back into a DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master('local[1]')\
        .appName('dataFrameDemo')\
        .getOrCreate()

# Read the whole output directory rather than one hard-coded part file:
# part-file names embed a random UUID and change on every write, so a
# hard-coded path breaks as soon as the data is regenerated.
df1 = spark.read.parquet('file:///Users/chuzhengkai/Desktop/df1')
df1.show()
# Derived column: Age + 1, exposed under the alias NewAge.
df1.select(df1.Name,(df1.Age +1).alias('NewAge')).show()
spark.stop()

+-----+---+
| Name|Age|
+-----+---+
|  Tom|  5|
|Jerry|  4|
| Kate|  3|
|  Jim|  6|
+-----+---+

+-----+------+
| Name|NewAge|
+-----+------+
|  Tom|     6|
|Jerry|     5|
| Kate|     4|
|  Jim|     7|
+-----+------+

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值