Initialization
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, mean
# Keep the SparkSession small: a local test does not need more resources than this
spark = SparkSession.builder.appName("test") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "1") \
    .config("spark.sql.shuffle.partitions", "2") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
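A quick way to confirm the session came up with the intended settings is to read them back from the runtime config; a minimal sanity-check sketch using the standard spark.version and spark.conf.get APIs:

print(spark.version)                                    # Spark version actually running
print(spark.conf.get("spark.sql.shuffle.partitions"))   # should print 2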
Computing the mean (two methods)
data = [1,5,7,10,23,20,6,5,10,7,10]
df = spark.createDataFrame(data, "int")
# Method 1: aggregate over the DataFrame
mean_value = df.agg(mean(col("value"))).first()[0]
print(mean_value)
9.454545454545455
# Method 2: reduce over the RDD
rdd = sc.parallelize(data)
print("avg=", rdd.reduce(lambda x, y: x + y) / rdd.count())
Computing the mode
mode_value = df.groupBy("value").count().orderBy(col("count").desc()).first()[0]
print("mode=", mode_value)
mode= 10
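Note that first()[0] picks an arbitrary value when several values tie for the highest count. On Spark 3.4 or later there is also a built-in mode aggregate; a minimal sketch, assuming that version is available (on ties it likewise returns just one of the most frequent values):

from pyspark.sql.functions import mode  # available since Spark 3.4

df.agg(mode("value")).show()  # single-row result holding the most frequent value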
Sorting and returning row numbers
Sort the values and attach a row number; rows with equal values may receive different (but distinct) row numbers. The example below sorts in descending order.
df.withColumn("rank", expr("row_number() over (order by value desc)")).show()
+-----+----+
|value|rank|
+-----+----+
| 23| 1|
| 20| 2|
| 10| 3|
| 10| 4|
| 10| 5|
| 7| 6|
| 7| 7|
| 6| 8|
| 5| 9|
| 5| 10|
| 1| 11|
+-----+----+
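The same ranking can be written with the DataFrame Window API instead of a SQL expression string; an equivalent sketch reusing df from above:

from pyspark.sql import Window
from pyspark.sql.functions import row_number

# A window with no partitionBy pulls all rows into one partition (Spark logs a
# warning about this); harmless for demo-sized data.
w = Window.orderBy(col("value").desc())
df.withColumn("rank", row_number().over(w)).show()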
students = [("LiLei", 18, 87), ("HanMeiMei", 16, 77), ("DaChui", 16, 66), ("Jim", 18, 77), ("RuHua", 18, 50)]
n = 3
stu_df = spark.createDataFrame(students, schema="name string, age int, score int")
print(stu_df.orderBy("score", ascending=False).limit(n).select("name").collect())
[Row(name='LiLei'), Row(name='Jim'), Row(name='HanMeiMei')]
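If students tied at the cut-off should all be kept instead of arbitrarily dropped, rank() over a window does it, since tied scores share a rank; a sketch on the same stu_df and n:

from pyspark.sql import Window
from pyspark.sql.functions import rank

# rank() assigns equal scores the same rank, so the output may contain more than n rows.
w = Window.orderBy(col("score").desc())
stu_df.withColumn("rnk", rank().over(w)).filter(col("rnk") <= n).select("name", "score").show()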
Secondary sort
First sort by score in descending order; when scores are equal, sort by age in descending order.
stu_df.orderBy(col("score").desc(), col("age").desc()).show()
+---------+---+-----+
| name|age|score|
+---------+---+-----+
| LiLei| 18| 87|
| Jim| 18| 77|
|HanMeiMei| 16| 77|
| DaChui| 16| 66|
| RuHua| 18| 50|
+---------+---+-----+
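The same two-key sort works on the raw tuples via the RDD API as well; a sketch that negates both keys to get descending order:

stu_rdd = sc.parallelize(students)
# Composite sort key: score first, then age, both descending.
print(stu_rdd.sortBy(lambda t: (-t[2], -t[1])).collect())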
Join operations
Given a class roster and a score table, find the classes whose average score is above 75.
The roster has columns class and name; the score table has columns name and score.
classes = [("class1", "LiLei"), ("class1", "HanMeiMei"), ("class2", "DaChui"), ("class2", "RuHua")]
scores = [("LiLei", 76), ("HanMeiMei", 80), ("DaChui", 70), ("RuHua", 60)]
classes_df = spark.createDataFrame(classes, schema="class_name string, student_name string")
scores_df = spark.createDataFrame(scores, schema="student_name string, score int")
# Join the two tables, average per class, and keep classes averaging above 75
classes_df.join(
    scores_df, on="student_name", how="inner"
).groupBy("class_name").agg(mean("score").alias("avg_score")).filter("avg_score > 75").select("class_name", "avg_score").show()
+----------+---------+
|class_name|avg_score|
+----------+---------+
|    class1|     78.0|
+----------+---------+
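Since the score table here is tiny, a broadcast hint is a natural optimization: it replicates the small side to every executor and avoids shuffling both tables. A sketch, assuming scores_df comfortably fits in memory:

from pyspark.sql.functions import broadcast

classes_df.join(broadcast(scores_df), on="student_name", how="inner") \
    .groupBy("class_name").agg(mean("score").alias("avg_score")) \
    .filter("avg_score > 75").show()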
Mode per group
Given a table of student records with class and age, find the mode of the student ages in each class.
from pyspark.sql.functions import rank
from pyspark.sql import Window
class_data = [("class1", 15), ("class1", 15), ("class2", 16), ("class2", 16), ("class1", 17), ("class2", 19)]
class_df = spark.createDataFrame(class_data, schema="class_name string, age int")
age_cnt_df = class_df.groupBy("class_name", "age").count()
window_spec = Window.partitionBy("class_name").orderBy(col("count").desc())
age_cnt_df.withColumn("rank", rank().over(window_spec)).filter("rank=1").select("class_name", "age").show()
+----------+---+
|class_name|age|
+----------+---+
| class1| 15|
| class2| 16|
+----------+---+
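On Spark 3.4 or later the window machinery can be skipped with the built-in mode aggregate; a minimal sketch, with the caveat that on a tie mode() returns only one of the most frequent ages, whereas the rank() approach above keeps all of them:

from pyspark.sql.functions import mode  # requires Spark >= 3.4

class_df.groupBy("class_name").agg(mode("age").alias("age_mode")).show()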
Releasing resources
sc.stop()  # stops the SparkContext, which also shuts down the session's executors