Connecting to a Database and Importing Data
POM dependencies:
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.29</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-common</artifactId>
    <version>1.8.2</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.3.4</version>
</dependency>
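Note: with MySQL Connector/J 8.x the driver class is com.mysql.cj.jdbc.Driver (as used in the code below), and depending on your MySQL server configuration you may also need extra URL parameters. A hedged sketch (the host, port, and database name come from the code below; the useSSL and serverTimezone parameters are assumptions to adjust for your environment):

// Connector/J 8.x sometimes requires an explicit timezone; parameters here are illustrative
val url = "jdbc:mysql://192.168.52.146:3306/etclog?useSSL=false&serverTimezone=UTC"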
Scala code:
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import java.util.Properties

object JDBCUtile {
  val url = "jdbc:mysql://192.168.52.146:3306/etclog"
  val driver = "com.mysql.cj.jdbc.Driver"
  val user = "root"
  val password = "ok"
  val frame = "frame" // default target table name

  private val prop = new Properties()
  prop.setProperty("user", user)
  prop.setProperty("password", password)
  prop.setProperty("driver", driver) // the key must be "driver" (was misspelled "dirver")

  // Save a DataFrame to MySQL: op = 0 appends, op = 1 (default) overwrites
  def dataFrameToMysql(df: DataFrame, table: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).jdbc(url, table, prop)
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).jdbc(url, table, prop)
    }
  }

  // Save a DataFrame as Parquet files under the given output path
  def dataFrameToParquet(df: DataFrame, output: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).parquet(output)
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).parquet(output)
    }
  }

  // Save a DataFrame to a Hive table (requires a Hive-enabled SparkSession)
  def dataFrameHive(df: DataFrame, table: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).saveAsTable(table) // save-mode selection
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).saveAsTable(table)
    }
  }

  // Read a MySQL table into a Spark DataFrame
  def getDataFrameFromMysql(spark: SparkSession, table: String): DataFrame = {
    spark.read.jdbc(url, table, prop)
  }

  // Read Parquet files into a Spark DataFrame
  def getDataFrameFromParquet(spark: SparkSession, path: String): DataFrame = {
    spark.read.parquet(path)
  }

  // Read a Hive table into a Spark DataFrame
  def getDataFrameFromHive(spark: SparkSession, table: String): DataFrame = {
    spark.sql("select * from " + table)
  }
}
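The Hive helpers above only work against a SparkSession that was built with Hive support; otherwise saveAsTable writes to the session-local catalog rather than the Hive metastore. A minimal sketch (the app name and warehouse path are assumptions; adjust to your environment):

import org.apache.spark.sql.SparkSession

// Build a Hive-enabled session for use with dataFrameHive / getDataFrameFromHive
val spark = SparkSession.builder()
  .appName("HiveDemo")                                       // assumed app name
  .master("local[*]")
  .config("spark.sql.warehouse.dir", "/user/hive/warehouse") // assumed warehouse path
  .enableHiveSupport()
  .getOrCreate()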
A demo of processing data imported from MySQL (the other sources work much the same way); data read from the database can be transformed and then written back into the database.
import org.apache.spark.sql.SparkSession

object aaa {
  def main(args: Array[String]): Unit = {
    // One session is enough: a second builder().getOrCreate() call
    // would return the same session anyway
    val spark = SparkSession.builder().appName("Student")
      .master("local[*]").getOrCreate()
    import spark.implicits._ // implicit conversions, enables the $"col" syntax

    val Student = JDBCUtile.getDataFrameFromMysql(spark, "Student")
    val SC = JDBCUtile.getDataFrameFromMysql(spark, "SC")

    // Score rows for course 01 and course 02
    val sc01 = SC.filter($"CId" === "01")
    val sc02 = SC.filter($"CId" === "02")

    // Students whose course-01 score is higher than their course-02 score
    val higher = sc01.join(sc02, Seq("SId"), "inner")
      .toDF("SId", "CId1", "score1", "CId2", "score2")
      .filter($"score1".cast("double") > $"score2".cast("double"))
      .select("SId")

    higher.join(Student, Seq("SId"), "inner").show()

    spark.close()
  }
}
Writing data out to MySQL, Hive, or Parquet is just as simple: pick the helper that matches the target and pass the DataFrame in.
// JDBCUtile.dataFrameToMysql(frame, JDBCUtile.frame)        // write the DataFrame to MySQL
// JDBCUtile.dataFrameToParquet(frame, "out/fullog", op = 1) // write the DataFrame to Parquet
// JDBCUtile.dataFrameHive(frame, "sss", 1)                  // write the DataFrame to Hive
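Putting it together, a minimal sketch of the read-then-write round trip, assuming the spark session from the demo above; the target table "Student_backup" and the output path "out/student" are made-up names for illustration:

// Read a table, then write it back out through the helpers above
val df = JDBCUtile.getDataFrameFromMysql(spark, "Student")
JDBCUtile.dataFrameToMysql(df, "Student_backup", op = 0) // append into a (hypothetical) MySQL table
JDBCUtile.dataFrameToParquet(df, "out/student", op = 1)  // overwrite a (hypothetical) Parquet path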