// 第一步:先将df设置临时表:
//只要是关于df中的sql执行,都要创建临时表 df.createTempView("time01")
val rdd12: DataFrame = rdd03.map(_.split(" ")) //按照分隔符把一行分为两个子段
.map(x => (x(0).toString, x(1)))
.toDF("time", "Logistics_status")
rdd12.show(false)
+----------------+---------------------------+
|time |Logistics_status |
+----------------+---------------------------+
|2020-09-12 12:05|.不重要的字段 |
|2020-09-12 07:10|.不重要的字段 |
|2020-09-12 02:58|,不重要的字段 |
|2020-09-11 22:28|,不重要的字段 |
|2020-09-11 07:31|,不重要的字段 |
|2020-09-11 04:12|,不重要的字段 |
|2020-09-10 21:26|,不重要的字段 |
|2020-09-10 20:11|,不重要的字段 |
|2020-09-01 06:59|,不重要的字段 |
+----------------+--------------------------+
rdd12.createTempView("time04")
//在说下关于时间格式中求max和min值:
val rdd99 = spark.sql(
"""
select
|max(to_timestamp(time,'yyyy-MM-dd HH:mm')) as max,
|min(to_timestamp(time,'yyyy-MM-dd HH:mm')) as min,
datediff(max(to_date(time)),min(to_date(time))) as day //最大值和最小值中间隔的天数
from
test05
""".stripMargin)
rdd99.show(false)
rdd99.createTempView("time01")
+-------------------+-------------------+---+
|max |min |day|
+-------------------+-------------------+---+
|2020-09-12 12:05:00|2020-09-01 06:59:00|11 |
+-------------------+-------------------+---+
val rdd18: DataFrame = spark.sql(
"""
select
substring(TrackingRequest.trackNo,0,22) as trackNo, //这是切割字符串算子
TrackingRequest.shipClassType as shipClassType,
|TrackingRequest.labelId as labelId,
current_timestamp as create_time
from
test02
""".stripMargin)
rdd18.show(false)
rdd18.createTempView("test03")
+----------------------+-------------+-------+-----------------------+
|trackNo |shipClassType|labelId|create_time |
+----------------------+-------------+-------+-----------------------+
|94001aaaaaaaaa63367192|USPS |0 |2020-09-29 20:14:13.635|
+----------------------+-------------+-------+-----------------------+
//现在将这三个df合并为一条df 这里运用的sql中的full join
//这里有个bug 在spark中用full join时要先开启这个设置
spark.conf.set("spark.sql.crossJoin.enabled", "true")
//然后在执行join操作:
val rdd24: DataFrame = spark.sql(
"""
| select
|c.trackNo as trackNo,
|c.shipClassType as shipClassType,
|c.labelId as labelId,
|d.max as Latest_status_time,
|d.min as Billing_start_time,
|d.day as days,
|c.time as time,
|c.Logistics_status as Logistics_status
|from
(
|select
|b.trackNo as trackNo,
|b.shipClassType as shipClassType,
|b.labelId as labelId,
|b.create_time as create_time,
|a.time as time,
|a.Logistics_status as Logistics_status
|from
|test04 a
|full join
|test03 b
) c
|full join
|time01 d limit 5
"""
.stripMargin)
rdd24.show(false)
rdd24.printSchema()
+----------------------+-------------+-------+-------------------+-------------------+----+----------------+----------------------
|trackNo |shipClassType|labelId|Latest_status_time |Billing_start_time |days|time |Logistics_status
+----------------------+-------------+-------+-------------------+-------------------+----+----------------+----------------------
|9400aaaaaaaaaaa3367192|wwww |0 |2020-09-12 12:05:00|2020-09-01 06:59:00|11 |2020-09-12 12:05|.不重要的字段
|9400aaaaaaaaaaa3367192|wwww |0 |2020-09-12 12:05:00|2020-09-01 06:59:00|11 |2020-09-12 07:10|.不重要的字段
|9400aaaaaaaaaaa3367192|wwww |0 |2020-09-12 12:05:00|2020-09-01 06:59:00|11 |2020-09-12 02:58|.不重要的字段
|9400aaaaaaaaaaa3367192|wwww |0 |2020-09-12 12:05:00|2020-09-01 06:59:00|11 |2020-09-11 22:28|.不重要的字段
|9400aaaaaaaaaaa3367192|wwww |0 |2020-09-12 12:05:00|2020-09-01 06:59:00|11 |2020-09-11 07:31|.不重要的字段
+----------------------+-------------+-------+-------------------+-------------------+----+----------------+----------------------
root
|-- trackNo: string (nullable = true)
|-- shipClassType: string (nullable = true)
|-- labelId: long (nullable = true)
|-- Latest_status_time: timestamp (nullable = true)
|-- Billing_start_time: timestamp (nullable = true)
|-- days: integer (nullable = true)
|-- time: string (nullable = true)
|-- Logistics_status: string (nullable = true)
如果需要修改数据结构添加array数组数据结构请点击:
https://blog.youkuaiyun.com/data_curd/article/details/108874932