1. 查看pyspark的版本
>>> import pyspark
>>> pyspark.__version__
'3.5.2'
也可以在 Python 解释器中直接查询 PySpark 的版本,如下所示:
# Build a SparkContext from an (empty) SparkConf and report the Spark version.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf())
print(sc.version)
SparkConf 参数本身是可选的(默认为 None),因此 SparkContext 的初始化也可以简化为:
# The SparkConf argument is optional; a SparkContext can be created directly.
# sc.version holds the Spark version string of the running context.
from pyspark import SparkContext
sc = SparkContext()
print(sc.version)
2. udf的使用
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Create the SparkSession for this example.
spark = SparkSession.builder.appName("udf_example").getOrCreate()

# A tiny two-column DataFrame to run the UDF on.
rows = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
df = spark.createDataFrame(rows, schema=["id", "name"])
# UDF body: double the incoming id value.
def multiply_by_two(n):
    """Return *n* doubled.

    Returns None when *n* is None, so the UDF is null-safe: Spark passes
    SQL NULLs to Python UDFs as None, and the original `n * 2` would
    raise TypeError on them.
    """
    if n is None:
        return None
    return n * 2
# Wrap the Python function as a Spark UDF with an explicit return type.
multiply_by_two_udf = udf(multiply_by_two, IntegerType())

# Apply the UDF column-wise and display the result.
result = df.withColumn("multiplied_id", multiply_by_two_udf(df["id"]))
result.show()
3. groupBy + agg + udf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, sum as spark_sum, col
from pyspark.sql.types import IntegerType
# Create the SparkSession for the grouped-aggregation example.
spark = SparkSession.builder.appName("groupBy_udf_example").getOrCreate()

# Sample data: two rows each for groups 1 and 2, one row for group 3.
records = [
    (1, "Alice", 100),
    (1, "Alice", 200),
    (2, "Bob", 300),
    (2, "Bob", 400),
    (3, "Charlie", 500),
]
df = spark.createDataFrame(records, schema=["group_id", "name", "value"])
df.show()
# UDF body: square a single value.
def square(n):
    """Return the square of *n*."""
    return n * n
# Register the Python function as a Spark UDF.
square_udf = udf(square, IntegerType())

# Group by (group_id, name) and sum the squared values within each group.
squared_sums = df.groupBy("group_id", "name").agg(
    spark_sum(square_udf(col("value"))).alias("squared_sum")
)
squared_sums.show()
4. groupBy + array_agg + explode
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array_agg, struct
# Create the SparkSession for the aggregate-then-explode example.
spark = SparkSession.builder.appName("MultiRowAggregation").getOrCreate()

# Two groups with two rows each.
records = [
    (1, "Alice", 100),
    (1, "Bob", 200),
    (2, "Charlie", 300),
    (2, "David", 400),
]
df = spark.createDataFrame(records, schema=["group_id", "name", "value"])

# Collapse each group into a single row holding an array of (name, value)
# structs.  NOTE: array_agg was added in Spark 3.5; collect_list is the
# pre-3.5 equivalent.
packed = df.groupBy("group_id").agg(
    array_agg(struct("name", "value")).alias("name_value_pairs")
)
packed.show(300)

# Explode the array back into one row per struct, then flatten the struct
# fields into top-level columns.
unpacked = (
    packed
    .select("group_id", explode(col("name_value_pairs")).alias("exploded"))
    .select("group_id", "exploded.name", "exploded.value")
)
unpacked.show()

# Example per-row transformation after the round-trip: double the value.
doubled = unpacked.withColumn("transformed_value", col("value") * 2)
doubled.show()
5. groupBy + array_agg + udf + explode
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array_agg, struct, udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType
# Create the SparkSession for the grouped-UDF example.
spark = SparkSession.builder.appName("ApplyOnGroupedDF").getOrCreate()

# Two groups with two rows each.
data = [
    (1, "Alice", 100),
    (1, "Bob", 200),
    (2, "Charlie", 300),
    (2, "David", 400),
]
df = spark.createDataFrame(data, schema=["group_id", "name", "value"])

# Collapse each group into one row carrying an array of (name, value) structs.
grouped_df = df.groupBy("group_id").agg(
    array_agg(struct("name", "value")).alias("name_value_pairs")
)
grouped_df.show()
# Expected output:
# +--------+--------------------+
# |group_id| name_value_pairs|
# +--------+--------------------+
# | 1|[{Alice, 100}, {B...|
# | 2|[{Charlie, 300}, ...|
# +--------+--------------------+
# Element type produced by the UDF: one struct per input pair, value doubled.
schema = StructType(
    [
        StructField("name", StringType(), True),
        StructField("value_doubled", LongType(), True),
    ]
)

@udf(ArrayType(schema))
def process_name_value_pairs(pairs):
    """Double the `value` field of every (name, value) struct in *pairs*."""
    doubled = []
    for pair in pairs:
        doubled.append([pair.name, pair.value * 2])
    return doubled
# Run the UDF over the packed array column of the grouped DataFrame.
result_df = grouped_df.withColumn(
    "processed_pairs", process_name_value_pairs(col("name_value_pairs"))
)
result_df.show()
# Expected output:
# +--------+--------------------+--------------------+
# |group_id| name_value_pairs| processed_pairs|
# +--------+--------------------+--------------------+
# | 1|[{Alice, 100}, {B...|[{Alice, 200}, {B...|
# | 2|[{Charlie, 300}, ...|[{Charlie, 600}, ...|
# +--------+--------------------+--------------------+

# Optionally explode the processed array back into one row per struct and
# flatten the struct fields into top-level columns.
flattened = result_df.select(
    "group_id", explode(col("processed_pairs")).alias("processed")
).select("group_id", "processed.name", "processed.value_doubled")
flattened.show(truncate=False)
# Expected output:
# +--------+-------+-------------+
# |group_id|name |value_doubled|
# +--------+-------+-------------+
# |1 |Alice |200 |
# |1 |Bob |400 |
# |2 |Charlie|600 |
# |2 |David |800 |
# +--------+-------+-------------+
6. udf + MapType + create_map + lit + col
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, create_map, lit
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, MapType, StringType
# Create the SparkSession for the MapType-UDF example.
spark = SparkSession.builder.appName("CreateDictInSpark").getOrCreate()

# Two groups with two rows each.
rows = [
    (1, "Alice", 100),
    (1, "Bob", 200),
    (2, "Charlie", 300),
    (2, "David", 400),
]
df = spark.createDataFrame(rows, schema=["group_id", "name", "value"])
def value_x_2(_dict):
    """Transform a ``{"name": ..., "value": ...}`` map: double the value, tag the name.

    ``create_map`` coerces all map values to one common type, so this UDF
    receives ``value`` as a *string* (e.g. ``"100"``).  The original
    ``2 * _dict["value"]`` therefore performed string repetition
    ("100" -> "100100") instead of doubling; casting to int first makes
    the arithmetic numeric.  The result is stringified to match the
    ``MapType(StringType(), StringType())`` return schema.  A new dict is
    returned instead of mutating the input.
    """
    return {
        "name": _dict["name"] + "_new",
        "value": str(2 * int(_dict["value"])),
    }
# The UDF returns a string -> string map.
map_type = MapType(StringType(), StringType())
value_x_2_udf = udf(value_x_2, map_type)

# Build a {"name": ..., "value": ...} map column and feed it to the UDF.
# NOTE(review): create_map requires a single value type, so the long
# `value` column is implicitly cast to string here — the UDF sees "100",
# not 100.
df_with_map = df.withColumn(
    "name_value_map",
    value_x_2_udf(
        create_map(lit("name"), col("name"), lit("value"), col("value"))
    ),
)
# Display the result, including the map column.
df_with_map.show(truncate=False)