# Reference: https://stackoverflow.com/questions/37486910/pivot-string-column-on-pyspark-dataframe
# Uses: from pyspark.sql.functions import first
import pyspark
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
# Create (or reuse) a local Spark session for this demo.
#
# Master and app name are set in one place, on the SparkConf. Arguments passed
# directly to SparkContext take precedence over SparkConf values, so mixing
# setAppName('test_parquet') with a positional 'test' app name leaves only
# 'test' in effect; that effective name is the one kept here.
# NOTE(review): confirm 'test' (rather than 'test_parquet') is the intended name.
conf = SparkConf().setMaster('local').setAppName('test')

# getOrCreate() reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this script is executed
# again in the same interpreter (notebook / REPL).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Low-level context, kept available under its original name.
sc = spark.sparkContext
# Sample data in "long" format: one row per (cuid, key_type) pair with its value.
column = ['cuid', 'key_type', 'value']
data = [
    ("aaa", 1, 'x'),
    ("aaa", 2, 'xx'),
    ("bbb", 1, 'xxx'),
    ("ccc", 1, 'xxxx'),
    ("ddd", 3, 'xxxx'),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(data, schema=column)

# Inferred schema as printed by printSchema():
df.printSchema()
# root
#  |-- cuid: string (nullable = true)
#  |-- key_type: long (nullable = true)
#  |-- value: string (nullable = true)

# Full (untruncated) contents of the DataFrame:
df.show(truncate=False)
# +----+--------+-----+
# |cuid|key_type|value|
# +----+--------+-----+
# |aaa |1       |x    |
# |aaa |2       |xx   |
# |bbb |1       |xxx  |
# |ccc |1       |xxxx |
# |ddd |3       |xxxx |
# +----+--------+-----+
from pyspark.sql.functions import first

# Pivot the long-format rows into one row per cuid, with one column per
# distinct key_type. pivot() needs an aggregate, and the values are strings,
# so first() is used to pick the value of each (cuid, key_type) cell.
# No pivot values are listed, so Spark determines the distinct key_type
# values itself; combinations absent from the data come out as null.
pivoted = (
    df.groupby('cuid')
    .pivot("key_type")
    .agg(first("value"))
)
pivoted.show()
# Output of the sample run (row order after a groupby is not guaranteed):
# +----+----+----+----+
# |cuid|   1|   2|   3|
# +----+----+----+----+
# | aaa|   x|  xx|null|
# | bbb| xxx|null|null|
# | ccc|xxxx|null|null|
# | ddd|null|null|xxxx|
# +----+----+----+----+