The task: read a table of member IDs plus 167 other columns (about 1.24 million rows), join it against a table mapping phone numbers to member IDs (over 3.5 million rows), replace each member ID with its phone number, and export the result as a CSV file. Both datasets had already been converted to Spark's native storage format (Parquet) beforehand.
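For context, a minimal sketch of what that one-off conversion step could look like, assuming the raw tables live in a JDBC-accessible database (the table names, JDBC URL, and credentials here are illustrative, not the original project's):

import java.util.Properties
import org.apache.spark.sql.SparkSession

object ExportToParquet {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder.master("local[*]").appName("export-to-parquet").getOrCreate()
    val props = new Properties()
    props.put("user", "user")
    props.put("password", "password")
    // Dump each source table once into Parquet so later jobs never touch the database.
    session.read.jdbc("jdbc:mysql://localhost:3306/analysis", "member", props)
      .write.parquet("C:/Users/EDZ/Desktop/member")
    session.read.jdbc("jdbc:mysql://localhost:3306/analysis", "result", props)
      .write.parquet("C:/Users/EDZ/Desktop/result")
  }
}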
Enough talk; on to the code.
The Spark class:
import java.util.Properties

import com.typesafe.config.ConfigFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql._

object Spark extends java.io.Serializable {
  // Environment-specific settings are read from application.conf via Typesafe Config.
  val config = ConfigFactory.load()
  val env = config.getString("env")
  val appName = config.getString("app")
  val master = config.getString(env + ".spark.master")
  val database = config.getString(env + ".jdbc.url")
  val user = config.getString(env + ".jdbc.username")
  val password = config.getString(env + ".jdbc.password")

  val sparkConf = new SparkConf().setMaster(master).setAppName(appName)
    .set("spark.sql.crossJoin.enabled", "true")
  val session = SparkSession.builder.config(sparkConf).getOrCreate()
  val context = session.sparkContext
  val fnc = Functions // helper object defined elsewhere in the project
  def read(): DataFrameReader = {
    session.read
  }

  // Load a Parquet dataset and register it as a temp view, dropping any cached copy first.
  // Config.dataDir is defined elsewhere in the project.
  def load(tableName: String,
           filename: String,
           baseDir: String = Config.dataDir): Unit = {
    unpersist(tableName)
    Spark.read.parquet(s"${baseDir}/${filename}").createOrReplaceTempView(tableName)
  }

  // session.table throws if the view does not exist yet, hence the try/catch.
  def unpersist(tableName: String): Unit = {
    try {
      session.table(tableName).unpersist()
    } catch {
      case ex: Exception => println(ex.getMessage)
    }
  }
  /**
   * Run a query and optionally persist the result.
   *
   * @param sqlText   the query to execute
   * @param tableName the target view/table to write to
   * @param operation what to do with the result: 1 = cache in memory, 2 = write to the database, 3 = both
   * @param mode      database write mode:
   *                  - overwrite: overwrite the existing data
   *                  - append: append the data
   *                  - ignore: ignore the operation (i.e. no-op)
   *                  - error: the default, throw an exception at runtime
   * @return the resulting DataFrame
   */
  def sql(sqlText: String,
          tableName: String = null,
          operation: Int = 1,
          mode: SaveMode = SaveMode.Append): DataFrame = {
    val df = session.sql(sqlText)
    if (tableName != null && !tableName.isEmpty) {
      operation match {
        case 1 =>
          df.persist.createOrReplaceTempView(tableName)
        case 2 =>
          writeJdbc(df, tableName, mode)
        case 3 =>
          df.persist.createOrReplaceTempView(tableName)
          writeJdbc(df, tableName, mode)
      }
    }
    df
  }

  // Shared JDBC write path for operations 2 and 3.
  private def writeJdbc(df: DataFrame, tableName: String, mode: SaveMode): Unit = {
    val jdbcProperties = new Properties()
    jdbcProperties.put("user", user)
    jdbcProperties.put("password", password)
    df.write.mode(mode).jdbc(database, tableName, jdbcProperties)
  }
}
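The Spark object above expects a Typesafe Config file on the classpath. A minimal application.conf matching the keys it reads might look like this (the values are placeholders, not the original project's settings):

env = "dev"
app = "member-export"
dev {
  spark.master = "local[*]"
  jdbc {
    url = "jdbc:mysql://localhost:3306/analysis"
    username = "user"
    password = "password"
  }
}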
And the calling class:
package me.ocheng.analysis

import org.apache.spark.sql.SaveMode

object App extends Serializable {
  def main(args: Array[String]): Unit = {
    // Register both Parquet datasets as temp views.
    Spark.load("member", "member", "C:/Users/EDZ/Desktop")
    Spark.load("result", "result", "C:/Users/EDZ/Desktop")
    // group by acts as a distinct here, deduplicating the ID-to-phone mapping.
    Spark.sql("select memberBrandId,phoneNumber from member group by memberBrandId,phoneNumber")
      .createOrReplaceTempView("oc_member")
    // Join phone numbers onto the wide table and write out a single CSV file.
    Spark.sql("select m.phoneNumber,s.* from result s join oc_member m on m.memberBrandId = s._id where s.last6Month0Qty > 0 or s.recent180Day0Qty > 0")
      .repartition(1).write.format("csv").mode(SaveMode.Overwrite)
      .option("header", "true")
      .option("delimiter", ",")
      .save("C:/Users/EDZ/Desktop/oc_多维度.csv")
  }
}
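One thing worth noting: Spark's CSV writer treats the save path as a directory, so oc_多维度.csv is actually a folder containing a part-*.csv file, and repartition(1) is what guarantees there is exactly one. If the full shuffle that repartition(1) triggers is a concern, coalesce(1) merges partitions without shuffling, at the cost of running the whole final stage in a single task; a sketch of that variant:

    Spark.sql("select m.phoneNumber,s.* from result s join oc_member m on m.memberBrandId = s._id")
      .coalesce(1).write.format("csv").mode(SaveMode.Overwrite)
      .option("header", "true")
      .save("C:/Users/EDZ/Desktop/oc_多维度.csv")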
My first impression of Spark is that the syntax feels a lot like Java, but unlike a regular Java program it isn't continuously interactive: a job reads its data into memory, runs, and exits. Normally you read from Spark's own storage format, since querying a database directly is fairly expensive. For data processing it is fast: complex SQL and computations over the data run quickly, and exporting over a million rows to CSV took only a dozen or so seconds.
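On the performance point, one further knob worth knowing about when joining a wide table against a much smaller one is a broadcast join hint, which ships the small side to every executor instead of shuffling the large table. A sketch, assuming the deduplicated oc_member mapping fits in executor memory:

import org.apache.spark.sql.functions.broadcast

val mapping = Spark.session.table("oc_member")
val wide = Spark.session.table("result")
// Broadcast the narrow ID-to-phone mapping so the 167-column table is not shuffled.
val joined = wide.join(broadcast(mapping), wide("_id") === mapping("memberBrandId"))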