pyspark之RDD，Data Frame，SQL Context 转换与操作_pyspark的给rdd变成row-优快云博客

本文介绍了如何在PySpark中进行RDD、DataFrame和SQL Context之间的转换及数据操作，包括创建表后的SQL查询，展示了使用RDD、DataFrame API及SQL语句进行数据查询的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

# 加载数据
Path='file:/home/swt/pythonwork/PythonProject/'
RawUserRDD = sc.textFile(Path+'data/u.user')
RawUserRDD.count()

# 查看RDD数据
RawUserRDD.take(5)

['1|24|M|technician|85711',
 '2|53|F|other|94043',
 '3|23|M|writer|32067',
 '4|24|M|technician|43537',
 '5|33|F|other|15213']

# 按照|分割
userRDD = RawUserRDD.map(lambda x:x.split('|'))
userRDD.take(5)

[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]

# 将RDD格式转换为Row格式
sqlContext = SparkSession.builder.getOrCreate()
from pyspark.sql import Row
user_roes = userRDD.map(lambda x:Row(userid=int(x[0]),age=int(x[1]),gender=x[2],occupation=x[3],zipcode=x[4]))
user_roes.take(5)

[Row(age=24, gender='M', occupation='technician', userid=1, zipcode='85711'),
 Row(age=53, gender='F', occupation='other', userid=2, zipcode='94043'),
 Row(age=23, gender='M', occupation='writer', userid=3, zipcode='32067'),
 Row(age=24, gender='M', occupation='technician', userid=4, zipcode='43537'),
 Row(age=33, gender='F', occupation='other', userid=5, zipcode='15213')]

# 转换成DataFrame
user_df = sqlContext.createDataFrame(user_roes)
user_df.printSchema()
user_df.show(5)

root
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- userid: long (nullable = true)
 |-- zipcode: string (nullable = true)

+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24|     M|technician|     1|  85711|
| 53|     F|     other|     2|  94043|
| 23|     M|    writer|     3|  32067|
| 24|     M|technician|     4|  43537|
| 33|     F|     other|     5|  15213|
+---+------+----------+------+-------+
only showing top 5 rows

# 给DATa Frame去别名，结果同上
df = user_df.alias("df")
df.show(5)

# 创建表
df.createOrReplaceTempView("user_table")

创建表之后就可以进行类似sql的数据操作了。。。
RDD查询数据

# RDD select 
userRDDnew = userRDD.map(lambda x:(x[3],x[2],x[1]))
userRDDnew.take(5)

# 计算字段
userRDDnew1 = userRDD.map(lambda x:(x[0],x[3],x[2],x[1],2016-int(x[1])))
userRDDnew1.take(5)

# 筛选查询
userRDD.filter(lambda r:r[3]=='technician' and r[2]=='M' and r[1]=="24").take(5)

# 排序 升序
userRDD.takeOrdered(5,key=lambda x:int(x[1]))
# 降序
userRDD.takeOrdered(5,key=lambda x:-1*int(x[1]))

# 多字段排序
userRDD.takeOrdered(5,key=lambda x:(-int(x[1]),x[2]))

# 显示不重复数据
userRDD.map(lambda x:x[2]).distinct().collect()

# 分组统计
userRDD.map(lambda x:(x[2],1)).reduceByKey(lambda x,y:(x+y)).collect()

dataframe查询字段

# dataframe 查询字段
df.select("age","gender").show(5)
df.select(df.userid,df.age,df.gender).show(5)
# []查询
df[df['userid'],df['age'],df['gender']].show(5)

# 计算字段
df.select('userid','occupation','gender','age',(2016-df.age).alias('bithyaer')).show(5)

# 筛选查询
df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show(5)
df.filter((df.occupation=='technician')&(df.age==24)&(df.gender=='M')).show(5)
df.filter((df['occupation']