Requirement: split each line of test.log and compute the daily user retention rate.
Environment: IDEA, Maven, Scala, Spark, MySQL
Key points:
read the log file into a DataFrame,
write the result DataFrame to MySQL,
keep constants in a configuration file (and read them back),
DataFrame operators used (split, filter, map, converting an RDD to a DataFrame, dropDuplicates for deduplication, removing empty values with StringUtils.isNotEmpty,
user-defined functions, aggregation, counting, join)
Sample file (shown here already split on carriage returns and tabs):
2018-09-04T20:27:31+08:00
http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540
& actionClient =Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+MetaSr+1.0
& actionEnd =1536150451668&actionName=startEval
& actionTest =0
& actionType =3
& actionValue =272090
& clientType =001_kgc
& examType =001
& ifEquipment =web
& isFromContinue =false
& skillIdCount =0
& skillLevel =0
& testType =jineng
& userSID =B842B843AE317425D53D0C567A903EF7.exam-tomcat-node3.exam-tomcat-node3
& userUID =272090
& userUIP =1.180.18.157
GET 200 192.168.168.64 - - Apache-HttpClient/4.1.2 (java 1.5)
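The cleaning job below assumes that each raw line of test.log carries exactly 8 tab-separated fields, in the order event_time, url, method, status, sip, user_uip, action_prepend, action_client. A minimal sketch (not part of the original tutorial) for spot-checking that assumption on the first line before running the full job:

import scala.io.Source

object InspectLine {
  // Field names follow the schema used later in EtlDemo.
  val fieldNames = Array("event_time", "url", "method", "status",
    "sip", "user_uip", "action_prepend", "action_client")

  def main(args: Array[String]): Unit = {
    val src = Source.fromFile("in/test.log")
    try {
      val fields = src.getLines().next().split("\t")
      println(s"field count = ${fields.length}") // expect 8
      fieldNames.zip(fields).foreach { case (name, value) =>
        println(f"$name%-15s -> $value")
      }
    } finally src.close()
  }
}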
1. Constants file
object JdbcUtils {
  // MySQL connection settings
  val driver = "com.mysql.jdbc.Driver"
  val url = "jdbc:mysql://hadoop8:3306/test"
  val user = "root"
  val password = "ok"
  // Input path, HDFS output path, and target table names
  val path = "in/test.log"
  val savepath = "hdfs://hadoop8:9000/hello.txt"
  val table_access_log = "table_access_log"
  val table_full_access_log = "table_full_access_log1"
}
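The key points above mention reading constants from a configuration file, while JdbcUtils hardcodes them. A minimal sketch of the properties-file variant, assuming a jdbc.properties file under src/main/resources; the file name and keys are illustrative, and the fallback values simply mirror the constants above:

import java.util.Properties

object JdbcConfig {
  private val props: Properties = {
    val p = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream("jdbc.properties")
    if (in != null) try p.load(in) finally in.close()
    p
  }

  // Fall back to the hardcoded values from JdbcUtils when a key is missing.
  val driver: String = props.getProperty("driver", "com.mysql.jdbc.Driver")
  val url: String = props.getProperty("url", "jdbc:mysql://hadoop8:3306/test")
  val user: String = props.getProperty("user", "root")
  val password: String = props.getProperty("password", "ok")
  val path: String = props.getProperty("path", "in/test.log")
}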
2. Data cleaning
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._

object EtlDemo {
  def main(args: Array[String]): Unit = {
    // Spark pipeline: load the raw log -- clean it -- export the result to MySQL
    // Load the data
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("hello")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val rowRDD1: RDD[String] = sc.textFile(JdbcUtils.path)

    // Data cleaning:
    // split each line on tabs and keep only the lines with exactly 8 fields
    val rowRDD: RDD[Row] = rowRDD1
      .map(x => x.split("\t"))
      .filter(x => x.length == 8)
      .map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))

    // Build the schema for the 8 raw columns
    val schema = StructType {
      Array(
        StructField("event_time", StringType),
        StructField("url", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    }
    val logDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    logDF.show(3)

    // Deduplicate on the first two columns (event_time, url),
    // drop rows whose status code is not 200,
    // and drop rows whose event_time is empty
    val filterlogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(x => x(3) == "200")
      .filter(x => StringUtils.isNotEmpty(x(0).toString))

    // Split the url on "?", then on "&" and "=", returning an RDD (no header yet)
    import spark.implicits._
    val full_logs_RDD: RDD[Row] = filterlogs.map(line => {
      val str: String = line.getAs[String]("url")
      val paramsArray: Array[String] = str.split("\\?")
      // Start from an empty map so a url without a query string does not cause a NullPointerException
      var paramsMap: Map[String, String] = Map.empty[String, String]
      if (paramsArray.length == 2) {
        paramsMap = paramsArray(1).split("&")
          .map(x => x.split("=")).filter(x => x.length == 2).map(x => (x(0), x(1))).toMap
      }
      (
        line.getAs[String]("event_time"),
        paramsMap.getOrElse[String]("userUID", ""),
        paramsMap.getOrElse[String]("userSID", ""),
        paramsMap.getOrElse[String]("actionBegin", ""),
        paramsMap.getOrElse[String]("actionEnd", ""),
        paramsMap.getOrElse[String]("actionType", ""),
        paramsMap.getOrElse[String]("actionName", ""),
        paramsMap.getOrElse[String]("actionValue", ""),
        paramsMap.getOrElse[String]("actionTest", ""),
        paramsMap.getOrElse[String]("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")
      )
    }).toDF().rdd

    // Attach a header (schema) to the RDD
    val full_logs_as_schema = StructType {
      Array(
        StructField("event_time", StringType),
        StructField("userUID", StringType),
        StructField("userSID", StringType),
        StructField("actionBegin", StringType),
        StructField("actionEnd", StringType),
        StructField("actionType", StringType),
        StructField("actionName", StringType),
        StructField("actionValue", StringType),
        StructField("actionTest", StringType),
        StructField("ifEquipment", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    }
    val full_logDF: DataFrame = spark.createDataFrame(full_logs_RDD, full_logs_as_schema)
    full_logDF.show(3)

    // Write the cleaned data to the MySQL table
    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.password)
    properties.setProperty("driver", JdbcUtils.driver)
    full_logDF.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, properties)
  }
}
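Before moving on to the analysis step, it can be worth confirming that the JDBC write actually landed. A small optional check (not part of the original tutorial) that reads the table back with the same JdbcUtils constants and prints the row count:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object VerifyLoad {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("verify").getOrCreate()
    val props = new Properties()
    props.setProperty("user", JdbcUtils.user)
    props.setProperty("password", JdbcUtils.password)
    props.setProperty("driver", JdbcUtils.driver)
    // Read the freshly written table back and show its size and schema
    val df = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, props)
    println(s"rows loaded: ${df.count()}")
    df.printSchema()
    spark.stop()
  }
}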
3. Data analysis
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Retention {
  def main(args: Array[String]): Unit = {
    // n = number of users who registered on a given day
    // m = number of those users whose IDs also appear among the users who signed in the next day
    //     (next-day retention count)
    // next-day retention rate = m / n * 100%
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("hello")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types._

    // Read the cleaned data back from MySQL
    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.password)
    properties.setProperty("driver", JdbcUtils.driver)
    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, properties)
    logs.show(3, false)

    // Registration events: userUID + register_time
    val registered: DataFrame = logs.filter($"actionName" === "Registered")
      .withColumnRenamed("event_time", "register_time")
      .select("userUID", "register_time")
    registered.show(2)

    // Sign-in events: userUID + signin_time
    val signed: DataFrame = logs.filter($"actionName" === "Signin")
      .withColumnRenamed("event_time", "signin_time")
      .select("userUID", "signin_time")
    signed.show(2)

    // Join registrations with sign-ins; rows where the sign-in date equals the
    // register date + 1 day give the next-day retention count
    val joined: DataFrame = registered.join(signed, Seq("userUID"), "left")

    // Date handling
    val spdf = new SimpleDateFormat("yyyy-MM-dd")

    // Approach 1: compare the two dates directly with date_format / date_sub
    // val joined2: Dataset[Row] = joined
    //   .filter(date_sub(date_format($"signin_time", "yyyy-MM-dd"),
    //     1) === date_format($"register_time", "yyyy-MM-dd"))
    //
    // val count22: DataFrame = joined2.groupBy("register_time").agg(countDistinct("userUID").as("count2"))
    // val count11: DataFrame = registered.groupBy("register_time").agg(countDistinct("userUID").as("count1"))
    // count11.join(count22, Seq("register_time"), "left")
    //   .withColumn("liucunlv", round($"count2" * 100.0 / $"count1", 2) + "%").show(2)

    // Approach 2: register a UDF that converts the date string to a numeric timestamp
    // (returning 0L in the empty case so the function has a single Long return type)
    // val gszh: UserDefinedFunction = spark.udf.register("gszh", (event_time: String) => {
    //   if (StringUtils.isEmpty(event_time)) 0L
    //   else spdf.parse(event_time).getTime()
    // })
    // val joined2: DataFrame = joined.withColumn("register_date", gszh($"register_time"))
    //   .withColumn("signin_date", gszh($"signin_time"))
    //
    // val count11: DataFrame = joined2.groupBy($"register_date").agg(countDistinct("userUID").as("num1"))
    // val count22: DataFrame = joined2.filter($"register_date" + 86400000 === $"signin_date")
    //   .groupBy($"register_date").agg(countDistinct("userUID").as("num2"))
    // count11.join(count22, Seq("register_date"), "left")
    //   .withColumn("liucunlv", $"num2" / $"num1").show(2)
  }
}
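Both retention calculations above are left commented out. Below is a cleaned-up, runnable sketch of the same next-day retention idea (m / n * 100%); the column names reg_date, sign_date, total, retained and retention_rate are illustrative, and it assumes the event_time strings start with yyyy-MM-dd as in the sample log:

import java.util.Properties
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object RetentionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("retention").getOrCreate()
    import spark.implicits._

    val props = new Properties()
    props.setProperty("user", JdbcUtils.user)
    props.setProperty("password", JdbcUtils.password)
    props.setProperty("driver", JdbcUtils.driver)
    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_log, props)

    // Registration day per user
    val registered = logs.filter($"actionName" === "Registered")
      .select($"userUID", to_date(substring($"event_time", 1, 10)).as("reg_date"))

    // Sign-in day per user
    val signed = logs.filter($"actionName" === "Signin")
      .select($"userUID", to_date(substring($"event_time", 1, 10)).as("sign_date"))

    // n: distinct users who registered on each day
    val newUsers = registered.groupBy("reg_date")
      .agg(countDistinct("userUID").as("total"))

    // m: of those users, the ones who signed in exactly one day later
    val retained = registered.join(signed, Seq("userUID"))
      .filter($"sign_date" === date_add($"reg_date", 1))
      .groupBy("reg_date")
      .agg(countDistinct("userUID").as("retained"))

    // next-day retention rate = m / n * 100%
    newUsers.join(retained, Seq("reg_date"), "left")
      .na.fill(0L, Seq("retained"))
      .withColumn("retention_rate", round($"retained" * 100.0 / $"total", 2))
      .orderBy("reg_date")
      .show()

    spark.stop()
  }
}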