一. Exploding a column into rows
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

case class Info(id: String, tags: String)

object SQLDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getCanonicalName)
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("warn")
    import spark.implicits._

    // Each element is "id tags", where tags is a comma-separated list
    val arr = Array("1 1,2,3", "2 2,3", "3 1,2")
    val infoRDD: RDD[Info] = spark.sparkContext.makeRDD(arr).map {
      line =>
        val fields: Array[String] = line.split("\\s+")
        Info(fields(0), fields(1))
    }
    val ds: Dataset[Info] = spark.createDataset(infoRDD)
    ds.createOrReplaceTempView("t1")
    ds.show()

    // Hive style: LATERAL VIEW + explode
    spark.sql(
      """
        |select id, tag
        |from t1
        |lateral view explode(split(tags, ",")) t2 as tag
        |""".stripMargin
    ).show()

    // Spark SQL style: explode directly in the select list
    spark.sql(
      """
        |select id, explode(split(tags, ",")) tag
        |from t1
        |""".stripMargin
    ).show()

    // Both queries return the same result, which shows that Spark SQL is
    // compatible with HQL, and that the Spark SQL form is more concise.
    spark.close()
  }
}
+---+-----+
| id| tags|
+---+-----+
| 1|1,2,3|
| 2| 2,3|
| 3| 1,2|
+---+-----+
+---+---+
| id|tag|
+---+---+
| 1| 1|
| 1| 2|
| 1| 3|
| 2| 2|
| 2| 3|
| 3| 1|
| 3| 2|
+---+---+
+---+---+
| id|tag|
+---+---+
| 1| 1|
| 1| 2|
| 1| 3|
| 2| 2|
| 2| 3|
| 3| 1|
| 3| 2|
+---+---+
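The same transformation can also be expressed with the DataFrame/Dataset API instead of SQL; a minimal sketch, assuming the ds Dataset and the implicits import from the demo above:
import org.apache.spark.sql.functions.{explode, split}
// DSL equivalent of the two SQL queries above
ds.select($"id", explode(split($"tags", ",")).as("tag")).show()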
二. Input and output
1. Reading and writing Parquet files
Read
// These snippets reuse the SparkSession `spark` from the demo above;
// they also assume: import org.apache.spark.sql.{DataFrame, SaveMode}
import spark._
// load() reads Parquet by default (a binary, columnar format)
val df1: DataFrame = spark.read.load("src/main/data/users.parquet")
df1.show()

// To query the data with SQL, register it as a table first
// Option 1: create a temp view from the DataFrame
df1.createOrReplaceTempView("t1")
// Option 2: create a temp view directly over the Parquet file
sql(
  """
    |create or replace temporary view users
    |using parquet
    |options(path "src/main/data/users.parquet")
    |""".stripMargin)
// Query the view
sql(
  """
    |select * from users
    |""".stripMargin).show()
+------+--------------+----------------+
| name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa| null| [3, 9, 15, 20]|
| Ben| red| []|
+------+--------------+----------------+
+------+--------------+----------------+
| name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa| null| [3, 9, 15, 20]|
| Ben| red| []|
+------+--------------+----------------+
Write
df1.write
  .mode(SaveMode.Overwrite)
  .save("src/main/output/")
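To verify the round trip, the Parquet output written above can be read straight back; a quick sketch reusing the same output path:
// Read back the files written by the save() call above
val dfBack: DataFrame = spark.read.parquet("src/main/output/")
dfBack.printSchema()
dfBack.show()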
2. CSV files
Read
val df2: DataFrame = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("src/main/data/people1.csv")
df2.show()

sql(
  """
    |create or replace temporary view people
    |using csv
    |options(path "src/main/data/people1.csv",
    |        header "true",
    |        inferSchema "true")
    |""".stripMargin)
sql("select * from people").show()
Write
sql("select * from people").write
.format("csv")
.mode(SaveMode.Overwrite)
.save("src/main/output/")
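Note that this write does not emit a header row, so the column names are lost. If the output should be readable with header "true" again, add the option when writing; a sketch (the output subdirectory name is only illustrative):
sql("select * from people").write
  .format("csv")
  .option("header", "true")
  .mode(SaveMode.Overwrite)
  .save("src/main/output/people_csv")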
3. JSON
Read
// Option 1: DataFrameReader
val df3: DataFrame = spark.read.format("json").load("src/main/data/emp.json")
df3.show()
// Option 2: create a temp view directly over the JSON file
sql(
  """
    |create or replace temporary view emp
    |using json
    |options(path "src/main/data/emp.json")
    |""".stripMargin)
Write
sql(
  """
    |select * from emp
    |""".stripMargin).write
  .format("json")
  .mode(SaveMode.Overwrite)
  .save("src/main/output/")
PS: these formats can be converted into one another: read in one format, then write out in another.
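For example, converting the Parquet sample to JSON is just a read followed by a write; a sketch with an illustrative output path:
spark.read.parquet("src/main/data/users.parquet")
  .write
  .mode(SaveMode.Overwrite)
  .json("src/main/output/users_json")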
4. JDBC
Read
val jdbcDF: DataFrame = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://Linux123:3306/ebiz?useSSL=false")
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "lagou_product_info")
  .load()
jdbcDF.show()
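The same read can also be written with the jdbc() shorthand and a java.util.Properties object; a sketch using the same connection settings as above:
import java.util.Properties
val connProps = new Properties()
connProps.setProperty("user", "root")
connProps.setProperty("password", "123456")
connProps.setProperty("driver", "com.mysql.jdbc.Driver")
val jdbcDF2: DataFrame = spark.read.jdbc(
  "jdbc:mysql://Linux123:3306/ebiz?useSSL=false",
  "lagou_product_info",
  connProps)
jdbcDF2.show()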
Write
We find that when Spark creates the target table automatically, its default character set is latin1:
mysql> show create table lagou_product_info_bak;
CREATE TABLE `lagou_product_info_bak` (
  `productId` bigint(20) DEFAULT NULL,
  `productName` text,
  `shopId` bigint(20) DEFAULT NULL,
  `price` decimal(11,2) DEFAULT NULL,
  `isSale` int(11) DEFAULT NULL,
  `status` int(11) DEFAULT NULL,
  `categoryId` int(11) DEFAULT NULL,
  `createTime` text,
  `modifyTime` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=latin1
1 row in set (0.01 sec)
We convert the table's character set and write again, this time adding the character-set parameter to the JDBC URL:
alter table lagou_product_info_bak convert to character set utf8;
jdbcDF.write
  .format("jdbc")
  .option("url", "jdbc:mysql://Linux123:3306/ebiz?useSSL=false&characterEncoding=utf8")
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "lagou_product_info_bak")
  .mode(SaveMode.Append)
  .save()
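The latin1 problem can also be avoided up front by telling Spark which table options to use when it creates the table, via the createTableOptions option of the JDBC data source; a sketch (the target table name lagou_product_info_utf8 is made up for illustration):
jdbcDF.write
  .format("jdbc")
  .option("url", "jdbc:mysql://Linux123:3306/ebiz?useSSL=false&characterEncoding=utf8")
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "lagou_product_info_utf8")  // hypothetical table name
  .option("createTableOptions", "ENGINE=InnoDB DEFAULT CHARSET=utf8")
  .mode(SaveMode.Overwrite)
  .save()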