spark 高级语法总结

xiedelong

已于 2023-11-09 16:23:55 修改

阅读量842

点赞数 1

分类专栏：大数据　文章标签：spark、big data、python

于 2021-03-29 11:04:53 首次发布

本文链接：https://blog.youkuaiyun.com/xiedelong/article/details/115296032

版权

大数据专栏收录该内容

16 篇文章

订阅专栏

本文档详细介绍了如何在Apache Spark中进行DataFrame的操作和转换，包括Pandas与Spark DataFrame的互换、DataFrame的保存、分组聚合、条件判断、列操作、数据类型转换、UDF使用、窗口函数及统计计算等核心功能。此外，还展示了处理JSON数据和填充缺失值的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >


# Converting between pandas and PySpark DataFrames.
# Every element within a single pandas column must be of the same type,
# otherwise the conversion fails.
spark_df = spark.createDataFrame(pandas_df)
pandas_df = spark_df.toPandas()

# Save as a partitioned table (overwrite & append).
# Both calls write to "db.table" partitioned by `dt`; only `mode` differs.
df.write.saveAsTable("db.table", mode="overwrite", partitionBy=['dt'])
df.write.saveAsTable("db.table", mode="append", partitionBy=['dt'])

# groupBy / agg syntax.
# Keep rows with diff_day > -10, then for each (abgroup, dt) pair count the
# usernames, sum the tickets and derive `s2p` = sum_ticket / sum_count.
group_df = (
    df.where(F.col("diff_day") > -10)
    .groupBy("abgroup", "dt")
    .agg(
        F.count(F.col("username")).alias("sum_count"),
        F.sum(F.col("ticket")).alias("sum_ticket"),
    )
    .withColumn("s2p", F.col("sum_ticket") / F.col("sum_count"))
    # Order by columns that exist in the aggregate (the grouping columns).
    .orderBy("dt", "abgroup")
)
# show() returns None, so it must not terminate the assignment above;
# otherwise group_df would be None and the join below would fail.
group_df.show(1000)

# When joining group_df back onto its source, alias the aggregated columns
# first; otherwise Spark may raise an AnalysisException about resolved
# attributes: https://stackoverflow.com/questions/45713290/how-to-resolve-the-analysisexception-resolved-attributes-in-spark/53848160
# The join keys are the grouping columns, which are the keys group_df shares
# with df.
df = df.join(
    group_df.select(
        F.col("abgroup").alias("abgroup"),
        F.col("dt").alias("dt"),
        F.col("sum_count").alias("sum_count"),
    ),
    on=['abgroup', 'dt'],
    how='left',
)

# F.when with several conditions combined through `&`:
# `match` is 1 when 3_month_pay_count == 0 and 6_month_pay_count > 0, else 0.
is_zero_3m = F.col("3_month_pay_count") == 0
is_positive_6m = F.col("6_month_pay_count") > 0
df = df.withColumn("match", F.when(is_zero_3m & is_positive_6m, 1).otherwise(0))

# Add a new column holding a constant value (F.lit wraps the literal 1).
df = df.withColumn("col_1", F.lit(1))

# Convert string to int: cast the "Plays" column to IntegerType in place.
data_df = data_df.withColumn("Plays", F.col("Plays").cast(IntegerType()))

# Using a schema with a UDF, and returning two columns from a single UDF.
def get_sessions(list_loc, list_hotel):
    """Build per-step session and submit lists from two parallel sequences.

    Args:
        list_loc: Sequence of location values, iterated in lockstep with
            ``list_hotel``.
        list_hotel: Sequence of hotel values.

    Returns:
        A ``(session_list, submit_list)`` tuple with one entry per zipped
        pair: ``session_list`` is a list of lists and ``submit_list`` is a
        list of strings.
    """
    session_list = []  # all sessions of this user
    submit_list = []  # purchased hotels
    for loc, hotel in zip(list_loc, list_hotel):
        # NOTE(review): the real per-step "action" logic is elided in this
        # snippet. The original appended an undefined `each_list`, which
        # raised NameError on any non-empty input. The current (loc, hotel)
        # pair is used as a placeholder session; replace with the real logic.
        each_list = [loc, hotel]
        submit_list.append("")
        session_list.append(each_list)
    return session_list, submit_list


# Split user behaviour into small sessions.
# The UDF returns a struct with two non-nullable fields; the struct is stored
# in a temporary "schema" column, unpacked into two columns, then dropped.
session_list_field = StructField("session_list", ArrayType(ArrayType(StringType())), False)
submit_list_field = StructField("submit_list", ArrayType(StringType()), False)
schema = StructType([session_list_field, submit_list_field])

udf_get_sessions = F.udf(get_sessions, schema)
sessions_struct = udf_get_sessions(F.col("list_loc"), F.col("list_hotel"))
result_df = (
    result_df.withColumn("schema", sessions_struct)
    .withColumn("session_list", F.col("schema.session_list"))
    .withColumn("submit_list", F.col("schema.submit_list"))
    .drop("schema")
)

# Median computation.
# NOTE(review): `wind` is defined here but not used by the statements below.
wind = Window.partitionBy('name')
# SQL expression: approximate 50th percentile of the `score` column.
med = F.expr('percentile_approx(score, 0.5)')
# Aggregate per `name` and display the result under the alias `med_val`.
df.groupBy('name').agg(med.alias('med_val')).show()

# Mode computation (the user's hours): gather each username's `hour` values
# into a single list column named `hour_list`.
df = (server_df.groupby("username").agg(F.collect_list("hour").alias('hour_list')))
# Mark the DataFrame for caching and run count() as the action that triggers it.
df.cache().count()

# Define the function backing the UDF.
def get_mode(x):
    """Return the most frequent value of ``x`` as an int, or -1 on failure.

    Ties are resolved by taking the smallest of the most frequent values.
    The computation relies only on the standard library, so it does not
    depend on the return shape of ``scipy.stats.mode``, which differs between
    SciPy versions.

    Args:
        x: Iterable of hashable values convertible to ``int``.

    Returns:
        The mode as an ``int``; ``-1`` when ``x`` is empty, ``None``, not
        iterable, or holds values that cannot be compared or converted.
    """
    from collections import Counter

    try:
        counts = Counter(x)
        if not counts:
            return -1
        highest = max(counts.values())
        return int(min(value for value, seen in counts.items() if seen == highest))
    except (TypeError, ValueError, OverflowError):
        # Only the expected data errors fall back to -1; anything else
        # propagates instead of being swallowed by a bare except.
        return -1


# Add the column: wrap get_mode as an integer-returning UDF, apply it to
# `hour_list`, then drop the list column and de-duplicate on `username`.
udf_get_mode = F.udf(get_mode, IntegerType())
hour_mode_col = udf_get_mode(F.col("hour_list"))
df_result = (
    df.withColumn("hour_mode", hour_mode_col)
    .drop("hour_list")
    .drop_duplicates(subset=["username"])
)

# String functions and a multi-branch `when`.
# fly_time looks like "3h20m" or "3小时20分".
normalized_fly_time = F.regexp_replace(F.regexp_replace(F.col("fly_time"), "小时", "h"), "分", 'm')
# Text before the first 'h', cast to int (null when the cast fails).
hour_part = F.substring_index(F.col("fly_time"), 'h', 1).cast(IntegerType())
# Text after the last 'h' with "m" removed, cast to int (null when the cast fails).
minute_part = F.regexp_replace(F.substring_index(F.col("fly_time"), 'h', -1), "m", "").cast(IntegerType())

hour_col = F.col("fly_hour")
minute_col = F.col("fly_minute")
# Both parts present -> hours * 60 + minutes; otherwise whichever part exists.
total_minutes = (
    F.when((hour_col.isNotNull()) & (minute_col.isNotNull()), hour_col * 60 + minute_col)
    .when(minute_col.isNull(), hour_col * 60)
    .when(hour_col.isNull(), minute_col)
)

search_df = (
    search_df.withColumn("fly_time", normalized_fly_time)
    .withColumn("fly_hour", hour_part)
    .withColumn("fly_minute", minute_part)
    .withColumn("fly_time_minutes", total_minutes)
    .drop("fly_hour", "fly_minute")
)


# Working with JSON in Spark.
show_df = spark.sql("""select *** from ***""")

# The chain is wrapped in parentheses instead of trailing backslashes: a
# comment line after a backslash ends the statement, which left the following
# `.withColumn` lines as an indentation error.
show_df = (
    show_df.withColumn(
        "action_json",
        # Raw string so `\*` reaches the regex engine unchanged; the pattern
        # splits on the literal text "set*". Full-width colons are then
        # replaced with ASCII colons.
        F.regexp_replace(F.split(F.col("action"), r"set\*")[1], "：", ":"),
    )
    .drop("action")
    # Do NOT replace `true` with `True` at this point, otherwise F.from_json
    # can no longer parse the JSON:
    #    .withColumn("request", F.regexp_replace(F.col("request"), "true", "True"))
    .withColumn("full_time", F.concat_ws(" ", F.col("logdate"), F.col("logtime")))
    .withColumn("time", F.unix_timestamp(F.col("full_time")))
)

# Schema handed to F.from_json for the action JSON:
#   action (string)
#   attributes
#     flightItem
#       title (string)
#       flightInfoList (array of structs)
#     index (string)
attributes_show_schema = StructType([
    StructField("action", StringType(), True),
    StructField("attributes", StructType([
        StructField("flightItem", StructType([
            StructField("title", StringType(), True),
            StructField("flightInfoList", ArrayType(StructType([
                StructField("depCity", StringType(), True),
                StructField("arrCity", StringType(), True),
                StructField("price", StringType(), True),
                # Later code reads flightInfoList.originDepDate, so the field
                # has to be declared in the schema for that lookup to resolve.
                StructField("originDepDate", StringType(), True),
            ])), True),
        ]), True),
        StructField("index", StringType(), True),
    ])),
])

# Parse the JSON string into a struct column, then lift nested fields out of it.
parsed_struct = F.from_json(F.col("action_json"), attributes_show_schema)
flight_info_array = F.col("attributes_schema.attributes.flightItem.flightInfoList")
show_df = (
    show_df.withColumn("attributes_schema", parsed_struct)
    .withColumn("action", F.col("attributes_schema.action"))
    .withColumn("title", F.col("attributes_schema.attributes.flightItem.title"))
    # explode produces one output row per element of the array
    .withColumn("flightInfoList", F.explode(flight_info_array))
)

# Array turned from one row into many rows: copy the fields of each exploded
# flightInfoList struct into top-level columns of the same name.
for field_name in ("originDepDate", "arrCity", "price"):
    show_df = show_df.withColumn(field_name, F.col("flightInfoList." + field_name))

# recommendIndex is a null string column; the intermediate columns are removed.
null_string = F.lit(None).cast(StringType())
show_df = show_df.withColumn("recommendIndex", null_string)
show_df = show_df.drop("action_json", "attributes_schema", "flightInfoList")

show_df.show()

# How to fill null values of a list (array) column in a DataFrame:
# rows whose `vector` is null receive an array built from `mean_vector`.
default_vector = F.array([F.lit(i) for i in mean_vector])
vector_is_null = F.col("vector").isNull()
df = df.withColumn("vector", F.when(vector_is_null, default_vector).otherwise(F.col("vector")))