# 加载数据
Path='file:/home/swt/pythonwork/PythonProject/'
RawUserRDD = sc.textFile(Path+'data/u.user')
RawUserRDD.count()
# 查看RDD数据
RawUserRDD.take(5)
['1|24|M|technician|85711',
'2|53|F|other|94043',
'3|23|M|writer|32067',
'4|24|M|technician|43537',
'5|33|F|other|15213']
# 按照|分割
userRDD = RawUserRDD.map(lambda x:x.split('|'))
userRDD.take(5)
[['1', '24', 'M', 'technician', '85711'],
['2', '53', 'F', 'other', '94043'],
['3', '23', 'M', 'writer', '32067'],
['4', '24', 'M', 'technician', '43537'],
['5', '33', 'F', 'other', '15213']]
# 将RDD格式转换为Row格式
sqlContext = SparkSession.builder.getOrCreate()
from pyspark.sql import Row
user_roes = userRDD.map(lambda x:Row(userid=int(x[0]),age=int(x[1]),gender=x[2],occupation=x[3],zipcode=x[4]))
user_roes.take(5)
[Row(age=24, gender='M', occupation='technician', userid=1, zipcode='85711'),
Row(age=53, gender='F', occupation='other', userid=2, zipcode='94043'),
Row(age=23, gender='M', occupation='writer', userid=3, zipcode='32067'),
Row(age=24, gender='M', occupation='technician', userid=4, zipcode='43537'),
Row(age=33, gender='F', occupation='other', userid=5, zipcode='15213')]
# 转换成DataFrame
user_df = sqlContext.createDataFrame(user_roes)
user_df.printSchema()
user_df.show(5)
root
|-- age: long (nullable = true)
|-- gender: string (nullable = true)
|-- occupation: string (nullable = true)
|-- userid: long (nullable = true)
|-- zipcode: string (nullable = true)
+---+------+----------+------+-------+
|age|gender|occupation|userid|zipcode|
+---+------+----------+------+-------+
| 24| M|technician| 1| 85711|
| 53| F| other| 2| 94043|
| 23| M| writer| 3| 32067|
| 24| M|technician| 4| 43537|
| 33| F| other| 5| 15213|
+---+------+----------+------+-------+
only showing top 5 rows
# 给DATa Frame去别名,结果同上
df = user_df.alias("df")
df.show(5)
# 创建表
df.createOrReplaceTempView("user_table")
创建表之后就可以进行类似sql的数据操作了。。。
RDD查询数据
# RDD select
userRDDnew = userRDD.map(lambda x:(x[3],x[2],x[1]))
userRDDnew.take(5)
# 计算字段
userRDDnew1 = userRDD.map(lambda x:(x[0],x[3],x[2],x[1],2016-int(x[1])))
userRDDnew1.take(5)
# 筛选查询
userRDD.filter(lambda r:r[3]=='technician' and r[2]=='M' and r[1]=="24").take(5)
# 排序 升序
userRDD.takeOrdered(5,key=lambda x:int(x[1]))
# 降序
userRDD.takeOrdered(5,key=lambda x:-1*int(x[1]))
# 多字段排序
userRDD.takeOrdered(5,key=lambda x:(-int(x[1]),x[2]))
# 显示不重复数据
userRDD.map(lambda x:x[2]).distinct().collect()
# 分组统计
userRDD.map(lambda x:(x[2],1)).reduceByKey(lambda x,y:(x+y)).collect()
dataframe查询字段
# dataframe 查询字段
df.select("age","gender").show(5)
df.select(df.userid,df.age,df.gender).show(5)
# []查询
df[df['userid'],df['age'],df['gender']].show(5)
# 计算字段
df.select('userid','occupation','gender','age',(2016-df.age).alias('bithyaer')).show(5)
# 筛选查询
df.filter("occupation='technician'").filter("gender='M'").filter("age=24").show(5)
df.filter((df.occupation=='technician')&(df.age==24)&(df.gender=='M')).show(5)
df.filter((df['occupation']