Requirement: split each line of test.log and compute the daily user retention rate.
Environment: IDEA, Maven, Scala, Spark, MySQL
Key points:
read the log file into a DataFrame,
write the result DataFrame to MySQL,
keep constants in a configuration file (and read them back),
DataFrame operators used (split, filter, map, converting an RDD to a DataFrame, dropDuplicates for deduplication, removing empty values with StringUtils.isNotEmpty,
user-defined functions, aggregation, counting, join)
Sample file (shown here already split on carriage returns and tabs):
2018-09-04T20:27:31+08:00
http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540
& actionClient =Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+MetaSr+1.0
& actionEnd =1536150451668&actionName=startEval
& actionTest =0
& actionType =3
& actionValue =272090
& clientType =001_kgc
& examType =001
& ifEquipment =web
& isFromContinue =false
& skillIdCount =0
& skillLevel =0
& testType =jineng
& userSID =B842B843AE317425D53D0C567A903EF7.exam-tomcat-node3.exam-tomcat-node3
& userUID =272090
& userUIP =1.180.18.157
GET 200 192.168.168.64 - - Apache-HttpClient/4.1.2 (java 1.5)
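The cleaning job below assumes that each raw line of test.log carries exactly 8 tab-separated fields, in the order event_time, url, method, status, sip, user_uip, action_prepend, action_client. A minimal sketch (not part of the original tutorial) for spot-checking that assumption on the first line before running the full job:

import scala.io.Source

object InspectLine {
  // Field names follow the schema used later in EtlDemo.
  val fieldNames = Array("event_time", "url", "method", "status",
    "sip", "user_uip", "action_prepend", "action_client")

  def main(args: Array[String]): Unit = {
    val src = Source.fromFile("in/test.log")
    try {
      val fields = src.getLines().next().split("\t")
      println(s"field count = ${fields.length}") // expect 8
      fieldNames.zip(fields).foreach { case (name, value) =>
        println(f"$name%-15s -> $value")
      }
    } finally src.close()
  }
}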
1. Constants file
object JdbcUtils {
  // MySQL connection settings
  val driver = "com.mysql.jdbc.Driver"
  val url = "jdbc:mysql://hadoop8:3306/test"
  val user = "root"
  val password = "ok"
  // Input path, HDFS output path, and target table names
  val path = "in/test.log"
  val savepath = "hdfs://hadoop8:9000/hello.txt"
  val table_access_log = "table_access_log"
  val table_full_access_log = "table_full_access_log1"
}
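The key points above mention reading constants from a configuration file, while JdbcUtils hardcodes them. A minimal sketch of the properties-file variant, assuming a jdbc.properties file under src/main/resources; the file name and keys are illustrative, and the fallback values simply mirror the constants above:

import java.util.Properties

object JdbcConfig {
  private val props: Properties = {
    val p = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream("jdbc.properties")
    if (in != null) try p.load(in) finally in.close()
    p
  }

  // Fall back to the hardcoded values from JdbcUtils when a key is missing.
  val driver: String = props.getProperty("driver", "com.mysql.jdbc.Driver")
  val url: String = props.getProperty("url", "jdbc:mysql://hadoop8:3306/test")
  val user: String = props.getProperty("user", "root")
  val password: String = props.getProperty("password", "ok")
  val path: String = props.getProperty("path", "in/test.log")
}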
2. Data cleaning
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._

object EtlDemo {
  def main(args: Array[String]): Unit = {
    // Spark pipeline: load the raw log -- clean it -- export the result to MySQL
    // Load the data
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("hello")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val rowRDD1: RDD[String] = sc.textFile(JdbcUtils.path)

    // Data cleaning:
    // split each line on tabs and keep only the lines with exactly 8 fields
    val rowRDD: RDD[Row] = rowRDD1
      .map(x => x.split("\t"))
      .filter(x => x.length == 8)
      .map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))

    // Build the schema for the 8 raw columns
    val schema = StructType {
      Array(
        StructField("event_time", StringType),
        StructField("url", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    }
    val logDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    logDF.show(3)

    // Deduplicate on the first two columns (event_time, url),
    // drop rows whose status code is not 200,
    // and drop rows whose event_time is empty
    val filterlogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(x => x(3) == "200")
      .filter(x => StringUtils.isNotEmpty(x(0).toString))

    // Split the url on "?", then on "&" and "=", returning an RDD (no header yet)
    import spark.implicits._
    val full_logs_RDD: RDD[Row] = filterlogs.map(line => {
      val str: String = line.getAs[String]("url")
      val paramsArray: Array[String] = str.split("\\?")
      // Start from an empty map so a url without a query string does not cause a NullPointerException
      var paramsMap: Map[String, String] = Map.empty[String, String]
      if (paramsArray.length == 2) {
        paramsMap = paramsArray(1).split("&")
          .map(x => x.split("=")).filter(x => x.length == 2).map(x => (x(0), x(1))).toMap
      }
      (
        line.getAs[String]("event_time"),
        paramsMap.getOrElse[String]("userUID", ""),
        paramsMap.getOrElse[String]("userSID", ""),
        paramsMap.getOrElse[String]("actionBegin", ""),
        paramsMap.getOrElse[String]("actionEnd", ""),
        paramsMap.getOrElse[String]("actionType", ""),
        paramsMap.getOrElse[String]("actionName", ""),
        paramsMap.getOrElse[String]("actionValue", ""),
        paramsMap.getOrElse[String]("actionTest", ""),
        paramsMap.getOrElse[String]("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")
      )
    }).toDF().rdd

    // Attach a header (schema) to the RDD
    val full_logs_as_schema = StructType {
      Array(
        StructField("event_time", StringType),
        StructField("userUID", StringType),
        StructField("userSID", StringType),
        StructField("actionBegin", StringType),
        StructField("actionEnd", StringType),
        StructField("actionType", StringType),
        StructField("actionName", StringType),
        StructField("actionValue", StringType),
        StructField("actionTest", StringType),
        StructField("ifEquipment", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    }
    val full_logDF: DataFrame = spark.createDataFrame(full_logs_RDD, full_logs_as_schema)
    full_logDF.show(3)

    // Write the cleaned data to the MySQL table
    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.password)
    properties.setProperty("driver", JdbcUtils.driver)
    full_logDF.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, properties)
  }
}
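Before moving on to the analysis step, it can be worth confirming that the JDBC write actually landed. A small optional check (not part of the original tutorial) that reads the table back with the same JdbcUtils constants and prints the row count:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object VerifyLoad {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("verify").getOrCreate()
    val props = new Properties()
    props.setProperty("user", JdbcUtils.user)
    props.setProperty("password", JdbcUtils.password)
    props.setProperty("driver", JdbcUtils.driver)
    // Read the freshly written table back and show its size and schema
    val df = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, props)
    println(s"rows loaded: ${df.count()}")
    df.printSchema()
    spark.stop()
  }
}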
3. Data analysis
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Retention {
  def main(args: Array[String]): Unit = {
    // n = number of users who registered on a given day
    // m = number of those users whose IDs also appear among the users who signed in the next day
    //     (next-day retention count)
    // next-day retention rate = m / n * 100%
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("hello")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    // Read the cleaned data back from MySQL
    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.password)
    properties.setProperty("driver", JdbcUtils.driver)
    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, properties)
    logs.show(3, false)

    // Registration events: userUID + register_time
    val registered: DataFrame = logs.filter($"actionName" === "Registered")
      .withColumnRenamed("event_time", "register_time")
      .select("userUID", "register_time")
    registered.show(2)

    // Sign-in events: userUID + signin_time
    val signed: DataFrame = logs.filter($"actionName" === "Signin")
      .withColumnRenamed("event_time", "signin_time")
      .select("userUID", "signin_time")
    signed.show(2)

    // Join registrations with sign-ins; rows where the sign-in date equals the
    // register date + 1 day give the next-day retention count
    val joined: DataFrame = registered.join(signed, Seq("userUID"), "left")

    // Date handling
    val spdf = new SimpleDateFormat("yyyy-MM-dd")

    // Approach 1: compare the two dates directly with date_format / date_sub
    // val joined2: Dataset[Row] = joined
    //   .filter(date_sub(date_format($"signin_time", "yyyy-MM-dd"),
    //     1) === date_format($"register_time", "yyyy-MM-dd"))
    //
    // val count22: DataFrame = joined2.groupBy("register_time").agg(countDistinct("userUID").as("count2"))
    // val count11: DataFrame = registered.groupBy("register_time").agg(countDistinct("userUID").as("count1"))
    // count11.join(count22, Seq("register_time"), "left")
    //   .withColumn("liucunlv", round($"count2" * 100.0 / $"count1", 2) + "%").show(2)

    // Approach 2: register a UDF that converts the date string to a numeric timestamp
    // (returning 0L in the empty case so the function has a single Long return type)
    // val gszh: UserDefinedFunction = spark.udf.register("gszh", (event_time: String) => {
    //   if (StringUtils.isEmpty(event_time)) 0L
    //   else spdf.parse(event_time).getTime()
    // })
    // val joined2: DataFrame = joined.withColumn("register_date", gszh($"register_time"))
    //   .withColumn("signin_date", gszh($"signin_time"))
    //
    // val count11: DataFrame = joined2.groupBy($"register_date").agg(countDistinct("userUID").as("num1"))
    // val count22: DataFrame = joined2.filter($"register_date" + 86400000 === $"signin_date")
    //   .groupBy($"register_date").agg(countDistinct("userUID").as("num2"))
    // count11.join(count22, Seq("register_date"), "left")
    //   .withColumn("liucunlv", $"num2" / $"num1").show(2)
  }
}
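Both retention calculations above are left commented out. Below is a cleaned-up, runnable sketch of the same next-day retention idea (m / n * 100%); the column names reg_date, sign_date, total, retained and retention_rate are illustrative, and it assumes the event_time strings start with yyyy-MM-dd as in the sample log:

import java.util.Properties
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object RetentionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("retention").getOrCreate()
    import spark.implicits._

    val props = new Properties()
    props.setProperty("user", JdbcUtils.user)
    props.setProperty("password", JdbcUtils.password)
    props.setProperty("driver", JdbcUtils.driver)
    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, props)

    // Registration day per user
    val registered = logs.filter($"actionName" === "Registered")
      .select($"userUID", to_date(substring($"event_time", 1, 10)).as("reg_date"))

    // Sign-in day per user
    val signed = logs.filter($"actionName" === "Signin")
      .select($"userUID", to_date(substring($"event_time", 1, 10)).as("sign_date"))

    // n: distinct users who registered on each day
    val newUsers = registered.groupBy("reg_date")
      .agg(countDistinct("userUID").as("total"))

    // m: of those users, the ones who signed in exactly one day later
    val retained = registered.join(signed, Seq("userUID"))
      .filter($"sign_date" === date_add($"reg_date", 1))
      .groupBy("reg_date")
      .agg(countDistinct("userUID").as("retained"))

    // next-day retention rate = m / n * 100%
    newUsers.join(retained, Seq("reg_date"), "left")
      .na.fill(0L, Seq("retained"))
      .withColumn("retention_rate", round($"retained" * 100.0 / $"total", 2))
      .orderBy("reg_date")
      .show()

    spark.stop()
  }
}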