package sparkupdatedemo.dataset

import org.apache.spark.sql.SparkSession

/**
  * User activity analysis.
  *
  * Analyses described by this file (only analysis 6 is executed; analyses 1-5
  * are kept as commented-out reference snippets inside `main`):
  *
  *  1. Top 10 users by number of visits within a given time range
  *  2. Top 10 users by total purchase amount within a given time range
  *  3. Top 10 users whose visit count grew the most in the latest period
  *     compared with the previous period
  *  4. Top 10 users whose purchase amount grew the most in the latest period
  *     compared with the previous period
  *  5. Top 10 users, among those registered within a given period, by number
  *     of visits in the first 7 days after registration
  *  6. Top 10 users, among those registered within a given period, by purchase
  *     amount in the first 7 days after registration
  */
object UserVisistActionNanlysis {

  /**
    * One record of the user action log.
    *
    * The snippets in `main` filter visits with `actionType = 0` and purchases
    * with `actionType = 1`.
    */
  case class UserActionLog(logId: Long, userId: Long, actionTime: String, actionType: Long, purchaseMoney: Double)

  /** Log record reduced to an integral value (a +1 / -1 marker in analysis 3). */
  case class UserActionLogVO(logId: Long, userId: Long, actionValue: Long)

  /** Log record reduced to a fractional value (signed purchase amount in analysis 4). */
  case class UserActionLogMVO(logId: Long, userId: Long, actionValue: Double)

  /**
    * Builds a local SparkSession, loads the two JSON inputs and prints the
    * result of analysis 6 with `show()`.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    // Time range referenced by the commented-out analyses 1 and 2 below.
    val start_time = "2016-06-18"
    val end_time = "2016-10-18"

    // Registration window used by analysis 6.
    val registFrom = "2016-10-01"
    val registTo = "2016-10-31"

    val spark = SparkSession
      .builder()
      .appName("UserVisistActionNanlysis")
      .master("local")
      .config("spark.sql.warehouse.dir", "C:\\Users\\Administrator\\Desktop\\spark-warehouse")
      .getOrCreate()

    // Stop the session even when reading the input or running the query fails.
    try {
      import spark.implicits._
      import org.apache.spark.sql.functions._

      val log = spark.read.json("C:\\Users\\Administrator\\Desktop\\user_action_log.json")
      val userBaseInfo = spark.read.json("C:\\Users\\Administrator\\Desktop\\user_base_info.json")

      // 1. Top 10 users by number of visits within the given time range
      // log
      //   // filter needs no $ prefix here; it takes a SQL condition expression string
      //   .filter("actionTime >= '" + start_time + "' and actionTime <= '" + end_time + "' and actionType = 0")
      //   .join(userBaseInfo,log("userId") === userBaseInfo("userId"))
      //   .groupBy(userBaseInfo("userId"),userBaseInfo("username"))
      //   .agg(count(log("logId")).alias("ActionCount"))
      //   .sort($"ActionCount".desc)
      //   .limit(10)
      //   .show()

      // 2. Top 10 users by total purchase amount within the given time range
      // log
      //   .filter("actionTime >= '" + start_time + "' and actionTime <= '" + end_time + "' and actionType = 1")
      //   .join(userBaseInfo,log("userId") === userBaseInfo("userId"))
      //   .groupBy(userBaseInfo("userId"),userBaseInfo("username"))
      //   .agg(round(sum(log("purchaseMoney")),2).alias("TotalMoney")) // technique note: nested function calls (round over sum)
      //   .sort($"TotalMoney".desc)
      //   .limit(10)
      //   .show()

      // 3. Top 10 users whose visit count grew the most in the latest period
      //    compared with the previous period.
      // The periods are meant to be chosen by the user on a page.
      // Writing the analysis approach down (e.g. with a concrete example) helps when coding it.
      // Presumably a typed Dataset[T] is more efficient than Dataset[Row] (unverified).
      // val userActionLogInFirstPeriod = log.as[UserActionLog]
      //   .filter("actionTime >= '2016-10-01' and actionTime <= '2016-10-31' and actionType = 0")
      //   .map(UserActionLogET => UserActionLogVO(UserActionLogET.logId ,UserActionLogET.userId, 1))
      //
      // val userActionLogInLastPeriod = log.as[UserActionLog]
      //   .filter("actionTime >= '2016-09-01' and actionTime <= '2016-09-30' and actionType = 0")
      //   .map(UserActionLogET => UserActionLogVO(UserActionLogET.logId ,UserActionLogET.userId, -1))
      //
      // // Each record is tagged with +1 (latest period) or -1 (previous period) above;
      // // after the union, summing the tags per user gives the increase between the periods.
      // val userActionLogIncrDS = userActionLogInFirstPeriod.union(userActionLogInLastPeriod)
      //
      // userActionLogIncrDS
      //   .join(userBaseInfo,userActionLogIncrDS("userId") === userBaseInfo("userId"))
      //   .groupBy(userBaseInfo("userId"),userBaseInfo("username"))
      //   .agg(sum(userActionLogIncrDS("actionValue")).alias("actionIncr"))
      //   .sort($"actionIncr".desc)
      //   .limit(10)
      //   .show()

      // 4. Top 10 users whose purchase amount grew the most in the latest period
      //    compared with the previous period.
      // (Reuses the val names of snippet 3, so only one of the two can be enabled at a time.)
      // val userActionLogInFirstPeriod = log.as[UserActionLog]
      //   .filter("actionTime >= '2016-10-01' and actionTime <= '2016-10-31' and actionType = 1")
      //   .map(UserActionLogET => UserActionLogMVO(UserActionLogET.logId ,UserActionLogET.userId, UserActionLogET.purchaseMoney))
      //
      // val userActionLogInLastPeriod = log.as[UserActionLog]
      //   .filter("actionTime >= '2016-09-01' and actionTime <= '2016-09-30' and actionType = 1")
      //   .map(UserActionLogET => UserActionLogMVO(UserActionLogET.logId ,UserActionLogET.userId, -UserActionLogET.purchaseMoney))
      //
      // // Each record carries +purchaseMoney (latest period) or -purchaseMoney (previous period);
      // // after the union, summing per user gives the increase between the periods.
      // val userActionLogIncrDS = userActionLogInFirstPeriod.union(userActionLogInLastPeriod)
      //
      // userActionLogIncrDS
      //   .join(userBaseInfo,userActionLogIncrDS("userId") === userBaseInfo("userId"))
      //   .groupBy(userBaseInfo("userId"),userBaseInfo("username"))
      //   .agg(round(sum(userActionLogIncrDS("actionValue")),2).alias("actionIncr"))
      //   .sort($"actionIncr".desc)
      //   .limit(10)
      //   .show()

      // 5. Top 10 users, among those registered within the given period, by number
      //    of visits in the first 7 days after registration.
      // NOTE(review): unlike snippets 1 and 3 this one does not restrict to actionType = 0 - confirm.
      // log
      //   .join(userBaseInfo,userBaseInfo("userId") === log("userId"))
      //   .filter(userBaseInfo("registTime") >= "2016-10-01" &&
      //     userBaseInfo("registTime") <= "2016-10-31" &&
      //     log("actionTime") >= userBaseInfo("registTime") and
      //     log("actionTime") <= date_add(userBaseInfo("registTime"),7))
      //   .groupBy(userBaseInfo("userId"),userBaseInfo("username"))
      //   .agg(count(log("logId")).alias("ActionCount"))
      //   .sort($"ActionCount".desc)
      //   .limit(10)
      //   .show()

      // 6. Top 10 users, among those registered within the given period, by purchase
      //    amount in the first 7 days after registration.
      log
        .join(userBaseInfo, userBaseInfo("userId") === log("userId"))
        .filter(
          userBaseInfo("registTime") >= registFrom &&
            userBaseInfo("registTime") <= registTo &&
            log("actionTime") >= userBaseInfo("registTime") &&
            // NOTE(review): the upper bound is inclusive of registTime + 7 days - confirm this
            // matches the intended "first 7 days" window.
            log("actionTime") <= date_add(userBaseInfo("registTime"), 7) &&
            // Only purchase records, consistent with the purchase analyses 2 and 4.
            log("actionType") === 1)
        .groupBy(userBaseInfo("userId"), userBaseInfo("username"))
        .agg(round(sum(log("purchaseMoney")), 2).alias("TotalMoney"))
        .sort($"TotalMoney".desc)
        .limit(10)
        .show()
    } finally {
      spark.stop()
    }
  }
}