Spark: Writing a DataFrame Directly to a Hive Table

This article describes how to use Apache Spark to read data from a MySQL database and write it into a Hive table. The code example below shows how to configure the JDBC connection parameters, load the data, and save the result to Hive.
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}

object Main {
  def main(args: Array[String]): Unit = {
    // Leave the master unset so it can be supplied by spark-submit;
    // uncomment .setMaster("local") for local testing.
    val conf = new SparkConf().setAppName("datacopy") /*.setMaster("local")*/
    val sc = new SparkContext(conf)
    // HiveContext (Spark 1.x) is required so that saveAsTable writes into Hive
    val sqlContext = new HiveContext(sc)

    // JDBC connection options for the source MySQL table
    val url = "jdbc:mysql://192.168.20.29:3306/vboxDB?useUnicode=true&characterEncoding=utf-8&useSSL=false"
    val options = Map(
      "url" -> url,
      "driver" -> "com.mysql.jdbc.Driver",
      "user" -> "root",
      "password" -> "new.1234",
      "dbtable" -> "play_name"
    )

    // Load the MySQL table into a DataFrame
    val originalDF: DataFrame = sqlContext.read.format("jdbc").options(options).load()
    originalDF.show()

    // Write the DataFrame into the Hive table vboxdb.play_name, overwriting existing data
    originalDF.write.mode(SaveMode.Overwrite).saveAsTable("vboxdb.play_name")

    sc.stop()
  }
}
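
In Spark 2.x and later, HiveContext is superseded by a SparkSession built with Hive support enabled. A minimal sketch of the equivalent job (reusing the same example connection values, which are just the placeholders from the code above) might look like this:

import org.apache.spark.sql.{SaveMode, SparkSession}

object Main {
  def main(args: Array[String]): Unit = {
    // enableHiveSupport() replaces the Spark 1.x HiveContext
    val spark = SparkSession.builder()
      .appName("datacopy")
      .enableHiveSupport()
      .getOrCreate()

    // Load the MySQL table over JDBC, exactly as in the Spark 1.x version
    val originalDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://192.168.20.29:3306/vboxDB?useUnicode=true&characterEncoding=utf-8&useSSL=false")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("user", "root")
      .option("password", "new.1234")
      .option("dbtable", "play_name")
      .load()

    // Overwrite the Hive table with the loaded data
    originalDF.write.mode(SaveMode.Overwrite).saveAsTable("vboxdb.play_name")

    spark.stop()
  }
}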

One point worth noting: in spark-shell, the predefined sqlContext is an instance of HiveContext, not of plain SQLContext, so the same write-to-Hive code works directly in the shell.
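
A quick way to verify this is to inspect the runtime type of sqlContext in the shell; the expected results noted in the comments assume a Spark 1.x build with Hive support:

// Run inside spark-shell (Spark 1.x with Hive support)
println(sqlContext.getClass.getName)                            // expected: org.apache.spark.sql.hive.HiveContext
sqlContext.isInstanceOf[org.apache.spark.sql.hive.HiveContext]  // expected: true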
