Contents of the JSON file:
{"name":"Michael"}
{"name":"Andy", "age":30}{"name":"Justin", "age":19}
-- Load the JSON file into a DataFrame
scala> val df = sqlContext.jsonFile("/spark/json")
warning: there were 1 deprecation warning(s); re-run with -deprecation for details
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
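-- Note: jsonFile is deprecated since Spark 1.4; the non-deprecated equivalent uses the DataFrameReader API (the same API used later in this post):
val df = sqlContext.read.json("/spark/json")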
// Show the contents
scala> df.show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
-- Print the schema
scala> df.printSchema
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
-- Select by column
scala> df.select("age").show
+----+
| age|
+----+
|null|
| 30|
| 19|
+----+
scala> df.select("age","name").show
+----+-------+
| age| name|
+----+-------+
|null|Michael|
| 30| Andy|
| 19| Justin|
+----+-------+
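-- An equivalent selection using Column expressions via the $ syntax (a minimal sketch; requires the implicits import):
import sqlContext.implicits._
df.select($"age", $"name").show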
-- Operate on column values: add 2 to the age column
scala> df.select(df("name"),df("age")+2).show
+-------+---------+
| name|(age + 2)|
+-------+---------+
|Michael| null|
| Andy| 32|
| Justin| 21|
+-------+---------+
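-- The derived column can be given a readable name with alias (a sketch; "age_plus_2" is an illustrative name):
df.select(df("name"), (df("age") + 2).alias("age_plus_2")).show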
-- Filtering rows with filter
scala> df.filter(df("age")>20).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
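-- filter also accepts a SQL expression string:
df.filter("age > 20").show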
-- Grouping: groupBy must be followed by an aggregate function
scala> df.groupBy("age").count().show()
+----+-----+
| age|count|
+----+-----+
|null| 1|
| 19| 1|
| 30| 1|
+----+-----+
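-- Any aggregate from org.apache.spark.sql.functions can follow groupBy; a sketch using avg:
import org.apache.spark.sql.functions._
df.groupBy("name").agg(avg("age")).show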
-- The same workflow, written as a self-contained program:
val sc: SparkContext // An existing SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Create the DataFrame
val df = sqlContext.read.json("examples/src/main/resources/people.json")
// Show the content of the DataFrame
df.show()
// age name
// null Michael
// 30 Andy
// 19 Justin
// Print the schema in a tree format
df.printSchema()
// root
// |-- age: long (nullable = true)
// |-- name: string (nullable = true)
// Select only the "name" column
df.select("name").show()
// name
// Michael
// Andy
// Justin
// Select everybody, but increment the age by 1
df.select(df("name"), df("age") + 1).show()
// name (age + 1)
// Michael null
// Andy 31
// Justin 20
// Select people older than 21
df.filter(df("age") > 21).show()
// age name
// 30 Andy
// Count people by age
df.groupBy("age").count().show()
// age count
// null 1
// 19 1
// 30 1
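-- A DataFrame can also be queried with plain SQL after registering a temporary table (a sketch; the table name "people" is our choice):
df.registerTempTable("people")
sqlContext.sql("SELECT name FROM people WHERE age > 21").show()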
-- Read data from a Hive table into a DataFrame
scala> val df = sqlContext.sql("select * from t_hdrc_type_month");
scala> df.count
res6: Long = 121144
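-- Note: querying Hive tables requires a Hive-enabled context; in Spark 1.x that means constructing a HiveContext instead of a plain SQLContext (a sketch):
val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
val df = sqlContext.sql("select * from t_hdrc_type_month")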
-- Creating Datasets: converting between DataFrames and Datasets
-- DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name.
// Encoders for most common types are automatically provided by importing sqlContext.implicits._
val ds = Seq(1, 2, 3).toDS()
ds.map(_ + 1).collect() // Returns: Array(2, 3, 4)
// Encoders are also created for case classes.
case class Person(name: String, age: Long)
val ds = Seq(Person("Andy", 32)).toDS()
// DataFrames can be converted to a Dataset by providing a class. Mapping will be done by name.
val path = "examples/src/main/resources/people.json"
val people = sqlContext.read.json(path).as[Person]
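-- Typed operations then work directly on Person fields (a sketch; note that a null age in the JSON would fail to deserialize into the Long field):
people.filter(_.age > 20).map(_.name).collect()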
-- Converting a DataFrame to a Dataset
scala> sqlContext.read.json("hdfs://suixingpay199:9000/user/app/spark/people.json")
res6: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> res6.as[Person]
res8: org.apache.spark.sql.Dataset[Person] = [name: string, age: bigint]
-- Loading data from MySQL
val jdbcDF = sqlContext.load("jdbc", Map("url" -> "jdbc:mysql://suixingpay190:3306/azkaban?user=root&password=123", "dbtable" -> "projects"))
scala> jdbcDF.show
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
| id| name|active|modified_time| create_time|version|last_modified_by| description|enc_type| settings_blob|
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
| 1| test| true|1452602214512|1452599858502| 4| azkaban| test| 2|[31, -117, 8, 0, ...|
| 2| touch| true|1452655867334|1452601523613| 7| azkaban| touch| 2|[31, -117, 8, 0, ...|
| 3|DataWherehouse| false|1452688047362|1452662944078| 34| azkaban|datawherehouse| 2|[31, -117, 8, 0, ...|
| 4|DataWhereHouse| true|1452689575193|1452688067801| 3| azkaban|DataWhereHouse| 2|[31, -117, 8, 0, ...|
| 5| HIveAnalysis| true|1453440935221|1453440922278| 1| baige| HIveAnalysis| 2|[31, -117, 8, 0, ...|
+---+--------------+------+-------------+-------------+-------+----------------+--------------+--------+--------------------+
-- Select specific columns
jdbcDF.select("name","id").show
+--------------+---+
| name| id|
+--------------+---+
| test| 1|
| touch| 2|
|DataWherehouse| 3|
|DataWhereHouse| 4|
| HIveAnalysis| 5|
+--------------+---+
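-- sqlContext.load is deprecated since Spark 1.4; the DataFrameReader equivalent with the same connection options:
val jdbcDF = sqlContext.read.format("jdbc").options(Map(
  "url" -> "jdbc:mysql://suixingpay190:3306/azkaban?user=root&password=123",
  "dbtable" -> "projects")).load()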

This post has covered how to work with JSON files in Apache Spark: loading JSON into a DataFrame; basic DataFrame operations such as column selection, deriving column values, filtering, and grouped counts; converting a DataFrame to a Dataset; and loading data from a MySQL database.