- Add the spark-hive and mysql-connector-java dependencies in Maven.
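A minimal sketch of the corresponding pom.xml entries; the version numbers and the Scala-version suffix below are placeholders, adjust them to your Spark and Scala versions:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>   <!-- _2.11 suffix is an assumption; match your Scala version -->
    <version>2.4.8</version>                   <!-- placeholder version -->
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.49</version>                  <!-- placeholder version -->
</dependency>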
3. Writing the code
When initializing the SparkSession, add .enableHiveSupport() to enable Hive operations:
def initSparkSession: SparkSession = SparkSession.builder()
  .appName(Test.getClass.getSimpleName)
  .master("local")
  .config("spark.sql.warehouse.dir", warehouseLocation)
  .config("hive.exec.dynamic.partition", "true")           // allow dynamic partition inserts
  .config("hive.exec.dynamic.partition.mode", "nonstrict") // no static partition column required
  .config("hive.exec.max.dynamic.partitions", 2000)
  .enableHiveSupport()                                      // enable Hive metastore access and HiveQL
  .getOrCreate()
4. Reading from Hive
(1): Read a Hive table, approach 1:
import java.io.File
import org.apache.spark.sql.SparkSession

object Idmapping {
  // val warehouseLocation = "hdfs://hadoop-master:9000/user/hive/warehouse"
  val warehouseLocation = new File("spark-warehouse").getAbsolutePath

  def main(args: Array[String]): Unit = {
    val spark = initSparkSession
    // query the Hive table directly with SQL
    spark.sql("select * from cp_data.api_item_flow").limit(10).show()
    spark.stop()
  }
}
(2): Read a Hive table, alternative 2:
def main(args: Array[String]): Unit = {
  val spark = initSparkSession
  // register the Hive table as a temporary view, then query the view
  spark.table("cp_data.api_item_flow")
    .createOrReplaceTempView("tmp_table")
  spark.sql("select * from tmp_table").limit(10).show()
  spark.stop()
}
(3): Read a Hive table, alternative 3:
def main(args: Array[String]): Unit = {
  val spark = initSparkSession
  val sqlStr =
    """
      | select
      |   *
      | from cp_data.api_item_flow
      | limit 10
    """.stripMargin
  // import spark.sql so the query can be run as sql(...) directly
  import spark.sql
  sql(sqlStr).show()
  spark.stop()
}
5. Writing data to Hive
(1): Read local data and write it to a Hive table:
// read a local parquet file and write it to a Hive table
val dataFrame = spark.read.parquet(inpath)
dataFrame.write.saveAsTable("cp_data.api_item_flow")
// or specify the save mode and format explicitly
dataFrame.write.mode("append").format("parquet")
  .saveAsTable("cp_data.api_item_flow")
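The dynamic-partition settings configured in initSparkSession only take effect when the target table is partitioned. A minimal sketch, assuming cp_data.api_item_flow were partitioned by a hypothetical dt column (not part of the original example):
// Sketch only: assumes the table was created with PARTITIONED BY (dt string).
// insertInto matches columns by position, so dt must be the last column of dataFrame;
// the "nonstrict" dynamic partition mode lets all partition values come from the data itself.
dataFrame.write
  .mode("append")
  .insertInto("cp_data.api_item_flow")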
(2): Load local data into Hive with LOAD DATA:
import spark.sql
sql("use cp_data")
// load a local file on the driver into the Hive table, replacing its current contents
sql("load data local inpath '/opt/apps/data/xx.csv'" +
  " overwrite into table cp_data.api_item_flow")
(3): Write query results into a Hive table:
val dataFrame = spark.sql(sqlStr)
dataFrame.createOrReplaceTempView("tmp_table")
// write the query result back to Hive through SQL
val sqlStr2 =
  s"""
    | INSERT OVERWRITE TABLE cp_data.api_item_flow
    | select * from tmp_table
  """.stripMargin
spark.sql(sqlStr2)
// or, equivalently, use the DataFrame writer API instead of the INSERT statement
// dataFrame.write.mode("overwrite").insertInto("cp_data.api_item_flow")
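Note that insertInto resolves columns by position rather than by name, so the select feeding tmp_table must produce columns in exactly the order of the target table's schema; saveAsTable, by contrast, resolves columns by name.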