package etl

import java.util.Properties

import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
object EtlDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("demo").config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._

    // Load the log file, split each line on \t, keep only rows with exactly 8 fields, and wrap each row in a Row
    val rdd: RDD[Row] = sc.textFile("in/test.log")
      .map(x => x.split("\t"))
      .filter(x => x.length == 8)
      .map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))

    // Build the schema
    val log_schema = StructType(Array(
      StructField("event_time", StringType), StructField("url", StringType),
      StructField("method", StringType), StructField("status", StringType),
      StructField("sip", StringType), StructField("user_uip", StringType),
      StructField("action_prepend", StringType), StructField("action_client", StringType)))
    val logDF: DataFrame = spark.createDataFrame(rdd, log_schema)

    // Drop rows duplicated on event_time and url, keep only rows with status 200, and drop rows whose event_time is empty
    val filterLogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(x => x(3) == "200")
      .filter(x => StringUtils.isNotEmpty(x(0).toString))
    // Split each url on "?" and parse the query string on "&" and "="
    val full_log_rdd: RDD[Row] = filterLogs.map(line => {
      val str: String = line.getAs[String]("url")
      val paramsArray: Array[String] = str.split("\\?")
      // Start from an empty map so the getOrElse calls below are safe when the url carries no query string
      var paramsMap: Map[String, String] = Map()
      if (paramsArray.length == 2) {
        val strings: Array[String] = paramsArray(1).split("&")
        paramsMap = strings.map(x => x.split("="))
          .filter(x => x.length == 2)
          .map(x => (x(0), x(1)))
          .toMap
        // for (a <- paramsMap) println(a)  // debug: dump the parsed parameters
      }
      (
        line.getAs[String]("event_time"),
        paramsMap.getOrElse("userUID", ""),
        paramsMap.getOrElse("userSID", ""),
        paramsMap.getOrElse("actionBegin", ""),
        paramsMap.getOrElse("actionEnd", ""),
        paramsMap.getOrElse("actionType", ""),
        paramsMap.getOrElse("actionName", ""),
        paramsMap.getOrElse("actionValue", ""),
        paramsMap.getOrElse("actionTest", ""),
        paramsMap.getOrElse("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")
      )
    }).toDF().rdd
    // Schema for the flattened log: the original columns plus the parameters parsed out of the url
    val full_log_schema = StructType(Array(
      StructField("event_time", StringType), StructField("userUID", StringType),
      StructField("userSID", StringType), StructField("actionBegin", StringType),
      StructField("actionEnd", StringType), StructField("actionType", StringType),
      StructField("actionName", StringType), StructField("actionValue", StringType),
      StructField("actionTest", StringType), StructField("ifEquipment", StringType),
      StructField("method", StringType), StructField("status", StringType),
      StructField("sip", StringType), StructField("user_uip", StringType),
      StructField("action_prepend", StringType), StructField("action_client", StringType)))
    val full_logDF: DataFrame = spark.createDataFrame(full_log_rdd, full_log_schema)
    full_logDF.show(3, false)
    full_logDF.printSchema()

    val url = "jdbc:mysql://192.168.232.211:3306/etldemo"
    val user = "root"
    val pwd = "ok"
    val driver = "com.mysql.jdbc.Driver"
    val properties = new Properties()
    properties.setProperty("user", user)
    properties.setProperty("password", pwd)
    properties.setProperty("driver", driver)

    println("Writing filterLogs to the database")
    filterLogs.write.mode(SaveMode.Overwrite).jdbc(url, "access_log", properties)
    println("Finished writing filterLogs to the database")
    full_logDF.write.mode(SaveMode.Overwrite).jdbc(url, "full_access_log", properties)
  }
}
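As a quick sanity check (a minimal sketch, assuming the job above has just run and the same spark session, url, and properties are still in scope), the two freshly written tables can be read straight back over JDBC:

    // Verification only: read the tables back from MySQL and peek at them
    val accessLog: DataFrame = spark.read.jdbc(url, "access_log", properties)
    val fullAccessLog: DataFrame = spark.read.jdbc(url, "full_access_log", properties)
    accessLog.show(3, false)
    println(s"access_log rows: ${accessLog.count()}, full_access_log rows: ${fullAccessLog.count()}")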
Configuration file
package etl.util

import java.util.Properties

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object JdbcUtils {
  val url = "jdbc:mysql://192.168.232.211:3306/etldemo"
  val user = "root"
  val pwd = "ok"
  val driver = "com.mysql.jdbc.Driver"

  val properties = new Properties()
  properties.setProperty("user", user)
  properties.setProperty("password", pwd)
  properties.setProperty("driver", driver)

  val table_access_log: String = "access_log"
  val table_full_access_log: String = "full_access_log"
  val table_week_retention: String = "week_retention"
  val table_day_retention: String = "day_retention"
  val table_day_active: String = "day_active"

  // Read a table from MySQL
  def getDataFrameByTableName(spark: SparkSession, table: String): DataFrame = {
    spark.read.jdbc(url, table, properties)
  }

  // Write a DataFrame to MySQL: op == 0 appends, any other value overwrites
  def dataFrameToMysql(df: DataFrame, table: String, op: Int): Unit = {
    if (op == 0) df.write.mode(SaveMode.Append).jdbc(url, table, properties)
    else df.write.mode(SaveMode.Overwrite).jdbc(url, table, properties)
  }
}
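With this helper on the classpath, the jobs below no longer need to build their own Properties. A minimal usage sketch (assuming a SparkSession named spark is in scope; resultDF is a hypothetical stand-in for whatever DataFrame you want to persist):

    // Read the full access log written by EtlDemo
    val logs: DataFrame = JdbcUtils.getDataFrameByTableName(spark, JdbcUtils.table_full_access_log)
    // Persist a result table; op == 0 appends, any other value overwrites (resultDF is a placeholder)
    JdbcUtils.dataFrameToMysql(resultDF, JdbcUtils.table_day_active, 1)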
Retention rate
package etl

import java.text.SimpleDateFormat
import java.util.Properties

import etl.util.JdbcUtils
import org.apache.commons.lang.StringUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}

object Retention {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("demo").config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.pwd)
    properties.setProperty("driver", JdbcUtils.driver)

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, properties)
    logs.cache() // cached because the table is scanned several times below

    // Registrations: rows whose actionName is Registered
    val register: DataFrame = logs.filter($"actionName" === "Registered")
      .withColumnRenamed("event_time", "register_time").select("userUID", "register_time")
    // Logins: rows whose actionName is Signin
    val signin: DataFrame = logs.filter($"actionName" === "Signin")
      .withColumnRenamed("event_time", "signin_time").select("userUID", "signin_time")
    // register.show(3, false)

    val regis_sign_join: DataFrame = register.join(signin, Seq("userUID"), "left")

    val spdf = new SimpleDateFormat("yyyy-MM-dd")
    // Register a UDF: takes a string such as 2018-09-04T20:27:33+08:00 and returns the day as epoch millis (Long)
    val gszh: UserDefinedFunction = spark.udf.register("gszh", (event_time: String) => {
      if (StringUtils.isEmpty(event_time)) 0L
      else spdf.parse(event_time.substring(0, 10)).getTime
    })

    // Add the day-truncated date columns
    val joined2: DataFrame = regis_sign_join
      .withColumn("register_date", gszh($"register_time"))
      .withColumn("signin_date", gszh($"signin_time"))
    // joined2.show(2, false)

    // Users who signed in on the day after registering
    val signinNum: DataFrame = joined2.filter($"register_date" + 86400000 === $"signin_date")
      .groupBy($"register_date").agg(countDistinct($"userUID").as("signin_num"))
    // Registered users per day
    val registerNum: DataFrame = joined2.groupBy($"register_date")
      .agg(countDistinct($"userUID").as("register_num"))
    // Next-day retention rate
    val resultone: DataFrame = signinNum.join(registerNum, Seq("register_date"))
      .select($"register_date", ($"signin_num" / $"register_num").as("percent"))

    // Users who signed in within seven days of registering
    val signinNumSeven: DataFrame = joined2.filter($"register_date" + 86400000L * 7 >= $"signin_date")
      .groupBy($"register_date").agg(countDistinct($"userUID").as("signin_num_seven"))
    // Seven-day retention rate
    val resultseven: DataFrame = signinNumSeven.join(registerNum, Seq("register_date"))
      .select($"register_date", ($"signin_num_seven" / $"register_num").as("percent"))

    JdbcUtils.dataFrameToMysql(resultone, JdbcUtils.table_day_retention, 1)
  }
}
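Both retention filters hinge on gszh truncating each timestamp to midnight of its day; a small standalone sketch of that conversion, using the sample timestamp from the comment above:

    // Truncate "2018-09-04T20:27:33+08:00" to its date and convert to epoch millis
    val spdf = new java.text.SimpleDateFormat("yyyy-MM-dd")
    val registerDay: Long = spdf.parse("2018-09-04T20:27:33+08:00".substring(0, 10)).getTime
    val nextDay: Long = registerDay + 86400000L // one day later: the value the next-day filter compares signin_date against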
Active users
package etl.util

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Active {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("demo").config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._

    val logs: DataFrame = JdbcUtils.getDataFrameByTableName(spark, JdbcUtils.table_full_access_log)
    logs.cache()
    logs.show(1, false)
    logs.printSchema()

    // A user counts as active on a day if they bought a course or started learning
    val ds: Dataset[Row] = logs.filter($"actionName" === "BuyCourse" || $"actionName" === "StartLearn")
    val ds2: Dataset[(String, String)] = ds.map(x => (
      x.getAs[String]("userUID"),
      x.getAs[String]("event_time").substring(0, 10)))
    ds2.printSchema()
    ds2.show(3, false)

    import org.apache.spark.sql.functions._
    // Count the distinct active users per day and persist the result
    val frame: DataFrame = ds2.withColumnRenamed("_1", "userUID").withColumnRenamed("_2", "date")
      .groupBy($"date").agg(countDistinct($"userUID").as("activeNum"))
    JdbcUtils.dataFrameToMysql(frame, JdbcUtils.table_day_active, 1)
  }
}