数据仓库项目笔记8

最新推荐文章于 2024-11-06 10:03:27 发布

新奇酷

最新推荐文章于 2024-11-06 10:03:27 发布

阅读量190

点赞数

分类专栏： bigdata

本文链接：https://blog.youkuaiyun.com/xinqiku/article/details/100599479

版权

bigdata 专栏收录该内容

15 篇文章

订阅专栏

路径分析-转化率概念

业务背景:公司有很多很多的各种类型的业务,而每一项业务往往能分成若干个操作环节,用户在业务的各个操作环节上进行操作，一步步走向业务目标（比如买单，比如注册成功，比如充值完成，比如进入充值页）那么，一个业务的操作环节链条，就叫做这个业务的转化路径！
转化率，漏斗模型: 路径中，每一个环节上的事件发生次数或人数，都会不同，一般是前面的环节上人数多，越往后越少，这样就引出一个概念：转化率
下图为一个业务的转换
下图为用户的行为轨迹为不同业务步骤转换打基础当join不同业务流程时可获取不同业务的步骤转换(漏斗转换率)

从报表页面原型中，分析出所要计算的数据要素

所访问的页面
第几步访问的
前一步是哪个页面
这是哪个人
这是哪个会话
维度…..

uid	sessionid	访问步骤序号	页面	前一页

ods -> dwd：筛选出 uid、sid、页面（url）以及页面类型
通过 lag() over() 获取前一个页面，通过 row_number() over() 获取步骤序号

-- Create the page-access route table: one row per page view inside a session.
drop table if exists dws_acc_route;
create table dws_acc_route (
    uid       string,  -- user identifier
    sessionid string,  -- session identifier
    url       string,  -- page that was visited
    sno       int,     -- step number of the visit
    pre_url   string   -- page visited in the previous step
)
partitioned by (dt string)
stored as parquet;


-- ETL: load one day of page-view events into dws_acc_route.
-- `insert overwrite` replaces the target partition, so re-running the job for
-- the same dt does not append a second copy of the rows.
insert overwrite table dws_acc_route partition (dt = '2019-06-16')
select
    imei as uid,
    sessionid,
    event['url'] as url,
    -- position of this page view inside the session, in commit_time order
    row_number() over (partition by imei, sessionid order by commit_time) as sno,
    -- url of the preceding page view in the same session; NULL for the first one
    lag(event['url']) over (partition by imei, sessionid order by commit_time) as pre_url
from ods_traffic_log
where dt = '2019-06-16'
  and eventtype = 'pg_view'
;

小需求: 去除重新刷新的页面

/*
   访问路径分析：dws_user_acc_route
   @src  demo_dwd_event_dtl 事件记录明细表
   
   计算的逻辑细节：
		1. 考虑用户在同一个页面多次刷新，是否要重复计数的问题（此处公司想去重）
				可以用 lead() over() 把同一会话中下一条记录的 (url, reference) 错位拼到当前行；如果同一行中当前组合与下一条组合相同（例如 B,A 与 B,A），说明当前行是重复刷新，可以去掉（即只保留连续刷新中的最后一条）
		2. 一条记录在用户会话中是第几步访问，通过对上面整理后的数据打rownumber即可
		3. 详细看下图：
*/

-- 造demo测试数据
vi /root/data/demo_dwd_event_dtl.dat
u01,s01,pg_view,A,X,1
u01,s01,pg_view,B,A,2
u01,s01,pg_view,B,A,3
u01,s01,pg_view,B,A,4
u01,s01,pg_view,C,B,5
u01,s01,pg_view,D,C,6
u01,s01,pg_view,A,D,7
u01,s01,pg_view,B,A,8
u02,s21,pg_view,A,X,1
u02,s21,pg_view,B,A,2
u02,s21,pg_view,D,B,3
u02,s21,pg_view,B,D,4
u02,s21,pg_view,E,B,5
u02,s21,pg_view,F,E,6
u02,s21,pg_view,D,F,7
u02,s21,pg_view,B,D,8

-- Demo event-detail table used to try out the refresh de-duplication logic.
drop table if exists demo_dwd_event_dtl;
create table demo_dwd_event_dtl (
    uid         string,
    sessionid   string,
    event_type  string,
    url         string,
    reference   string,
    commit_time bigint
)
row format delimited
fields terminated by ',';

-- Load the comma-separated demo rows from the local file system.
load data local inpath '/root/data/demo_dwd_event_dtl.dat' into table demo_dwd_event_dtl;

-- 逻辑示意图
uid,sessionid,event_type,url,reference ,commit_time      lead() over()
u01, s01,   pg_view, A,    X           ,  time1          B,    A
u01, s01,   pg_view, B,    A           ,  time2          B,    A
u01, s01,   pg_view, B,    A           ,  time3          B,    A
u01, s01,   pg_view, B,    A           ,  time4          C,    B
u01, s01,   pg_view, C,    B           ,  time5          D,    C
u01, s01,   pg_view, D,    C           ,  time6          A,    D
u01, s01,   pg_view, A,    D           ,  time7          B,    A
u01, s01,   pg_view, B,    A           ,  time8          


-- Destination table (@dst) for the user access route.
-- `if not exists` keeps the script re-runnable: it neither fails on, nor drops,
-- a table that has already been created and populated.
create table if not exists ads_user_acc_route (
    uid       string,  -- user identifier
    sessionid string,  -- session identifier
    step      int,     -- visit step number
    url       string,  -- page that was visited
    ref       string   -- previous page (the page the visit came from)
)
partitioned by (dt string)
stored as parquet
;
-- etl计算

-- 先过滤掉重复刷新的记录

-- CTE `tmp`: every event row, its own "url-reference" pair (tuple), and the
-- pair of the next event in the same user session (tuple2; NULL on the last row).
with tmp as (
    select
        uid,
        sessionid,
        event_type,
        url,
        reference,
        commit_time,
        concat_ws('-', url, reference) as tuple,
        lead(concat_ws('-', url, reference), 1)
            over (partition by uid, sessionid order by commit_time) as tuple2
    from demo_dwd_event_dtl
)

-- 得到结果
/*
+------+------------+-------------+------+------------+--------------+--------+---------+
| uid  | sessionid  | event_type  | url  | reference  | commit_time  | tuple  | tuple2  |
+------+------------+-------------+------+------------+--------------+--------+---------+
| u01  | s01        | pg_view     | A    | X          | 1            | A-X    | B-A     |
| u01  | s01        | pg_view     | B    | A          | 2            | B-A    | B-A     |
| u01  | s01        | pg_view     | B    | A          | 3            | B-A    | B-A     |
| u01  | s01        | pg_view     | B    | A          | 4            | B-A    | C-B     |
| u01  | s01        | pg_view     | C    | B          | 5            | C-B    | D-C     |
| u01  | s01        | pg_view     | D    | C          | 6            | D-C    | A-D     |
| u01  | s01        | pg_view     | A    | D          | 7            | A-D    | B-A     |
| u01  | s01        | pg_view     | B    | A          | 8            | B-A    | NULL    |
| u02  | s21        | pg_view     | A    | X          | 1            | A-X    | B-A     |
| u02  | s21        | pg_view     | B    | A          | 2            | B-A    | D-B     |
| u02  | s21        | pg_view     | D    | B          | 3            | D-B    | B-D     |
| u02  | s21        | pg_view     | B    | D          | 4            | B-D    | E-B     |
| u02  | s21        | pg_view     | E    | B          | 5            | E-B    | F-E     |
| u02  | s21        | pg_view     | F    | E          | 6            | F-E    | D-F     |
| u02  | s21        | pg_view     | D    | F          | 7            | D-F    | B-D     |
| u02  | s21        | pg_view     | B    | D          | 8            | B-D    | NULL    |
+------+------------+-------------+------+------------+--------------+--------+---------+
*/

-- 将上述中间结果中的tuple=tuple2的记录去除

-- Remove repeated refreshes. A row is a refresh when the next row of the same
-- session carries the same url-reference pair (tuple = tuple2).
-- The last row of a session has tuple2 = NULL and must be kept, so NULL is
-- tested explicitly; `!<==>` is not a Hive operator and does not parse.
-- Relies on the `tmp` CTE defined immediately before this query.
select
    uid,
    sessionid,
    event_type,
    url,
    reference,
    commit_time
from tmp
where tuple2 is null
   or tuple != tuple2
;

-- 并按同一个人的同一个会话的时间顺序 标记行号，对上步骤的sql略微改造一下：
-- Same de-duplication as the previous step, plus a step number per session.
-- Relies on the `tmp` CTE defined earlier (tuple / tuple2 columns).
select
    uid,
    sessionid,
    event_type,
    url,
    reference,
    commit_time,
    -- numbering happens after the where filter, so steps are gap-free
    row_number() over (partition by uid, sessionid order by commit_time) as step
from tmp
-- Keep a row when it is the last one of its session (tuple2 is NULL) or when
-- the next row is a different url-reference pair. The previous predicate used
-- `is not null`, which kept every refresh and dropped each session's last row.
where tuple2 is null
   or tuple != tuple2
;

-- 最后的完整语句

-- Complete statement: de-duplicate repeated page refreshes with lead(), then
-- number the remaining page views of each session.
with tmp as (
    select
        uid,
        sessionid,
        event_type,
        url,
        reference,
        commit_time,
        -- url-reference pair of this row
        concat_ws('-', url, reference) as tuple,
        -- url-reference pair of the next row in the session; NULL on the last row
        lead(concat_ws('-', url, reference), 1)
            over (partition by uid, sessionid order by commit_time) as tuple2
    from demo_dwd_event_dtl
)

select
    uid,
    sessionid,
    event_type,
    url,
    reference,
    commit_time,
    row_number() over (partition by uid, sessionid order by commit_time) as step
from tmp
-- `tuple != tuple2` alone evaluates to NULL when tuple2 is NULL, which would
-- silently drop the last page view of every session; keep it explicitly.
where tuple2 is null
   or tuple != tuple2
;
-- Access-route de-duplication of repeated refreshes, row_number() variant.
-- Within a session, consecutive rows with the same (url, reference) share the
-- same difference between the two row numbers below, so that difference
-- (run_id) identifies each run of refreshes.
with numbered as (
    select
        uid,
        sessionid,
        event_type,
        url,
        reference,
        commit_time,
        row_number() over (partition by uid, sessionid order by commit_time)
          - row_number() over (partition by uid, sessionid, url, reference order by commit_time)
          as run_id
    from demo_dwd_event_dtl
),
collapsed as (
    -- one row per run of refreshes, carrying the latest commit_time of the run
    select
        uid,
        sessionid,
        event_type,
        url,
        reference,
        max(commit_time) as commit_time
    from numbered
    group by uid, sessionid, url, reference, event_type, run_id
)

select
    uid,
    sessionid,
    event_type,
    url,
    reference,
    commit_time,
    row_number() over (partition by uid, sessionid order by commit_time) as step
from collapsed
;

转换率漏斗需求:不同业务流程的完成步骤的人数 (用户的行为轨迹join不同业务流程时可获取不同业务的步骤转换)

所需字段

uid	tid	step
用户id	业务id	完成步骤

路径分析-转化率漏斗: 不同业务流程的完成步骤的人数
hive解耦业务流程

流程解析

业务步骤流程由业务人员通过web控制
数据分析join业务流程表来实现业务(每天晚上获取新业务分析)

/*
   造数据1： 用户访问路径记录表，在hive数仓 dws_user_acc_route
*/
uid,sid,step,url,ref
u01,s01,1,X
u01,s01,2,Y,X
u01,s01,3,A,Y
u01,s01,4,B,A
u01,s01,5,C,B
u01,s01,6,B,C
u01,s01,7,o,B
u02,s02,1,A
u02,s02,2,C,A
u02,s02,3,A,C
u02,s02,4,B,A
u02,s02,5,D
u02,s02,6,B,D
u02,s02,7,C,B


/*
    造数据2： 业务转化路径定义表，在元数据管理中  transaction_route
       T101	1	步骤1	A	null
       T101	2	步骤2	B	A
       T101	3	步骤3	C	B
       T102	1	步骤1	D	null
       T102	2	步骤2	B	D
       T102	3	步骤3	C	B
*/


/*
	计算步骤：
		1. 加载 元数据库中  transaction_route 表，整理格式
		2. 读取 数仓中的  dws_user_acc_route 表
		3. 对用户访问路径数据，按session分组，将一个人的一次会话中的所有访问记录整合到一起
		        u01,s01,1,X
                u01,s01,2,Y,X
                u01,s01,3,A,Y
                u01,s01,4,B,A
                u01,s01,5,C,B
                u01,s01,6,B,C
                u01,s01,7,o,B
		
		4. 然后依照公司定义的规则，判断这个人是否完成了 ？ 业务路径中的 ？ 步骤，如果完成了，则输出如下数据：多次完成相同步骤忽略不计
				u01   t101   step_1  完成了1步骤
				u01   t101   step_2  完成2个步骤
				u01   t102   step_1  只完成一个步骤
				u02   t101   step_1
				u02   t102   step_1
				u02   t102   step_2		
				......
				
		5. 对上述数据，按业务id和步骤号，分组统计人数，得到结果：
				t101  step_1   2
				t101  step_2   1
				t101  step_3   1
				t102  step_1   3
				t102  step_2   2
				......
*/
/**
      * 接下来要做的事，就是去判断每个人的行为，是否满足某个业务的某个步骤定义，如果满足，则输出：
      * u_id,t_id,t_step
      * 怎么做呢？怎么做都好说，问题在于，判断标准的是什么？
      * 对于一个人的行为，是否满足某业务的步骤定义，可能有如下界定标准：
      * 比如，业务定义的步骤事件分别为： A B C D
      * 假如，某个人的行为记录为：
      * 张三： A  A  B  A  B  C
      * 李四： C  D  A  B  C  E  D
      * 王五： A  B  B  C  E  A  D
      * 赵六： B  C  E  E  D
      * 那么：这些算满足了业务定义的哪些步骤？
      * 标准1：  判断是否满足业务C步骤，必须要求前面两个近邻的事件是A B
      * 标准2：  判断是否满足业务C步骤，只要求C事件发生前，前面发生过 B ，B前面发生过A，不要求紧邻
算法实现
// Matches a user's ordered actions against the ordered steps of one business flow.
// Each step is searched for strictly after the position where the previous step
// was found (steps need not be adjacent). Returns the 1-based numbers of the steps
// that were completed, stopping at the first step that cannot be found.
def routeMatch(userActions: List[String], transSteps: List[String]): List[Int] = {
  @scala.annotation.tailrec
  def walk(stepIdx: Int, searchFrom: Int, completed: List[Int]): List[Int] =
    if (stepIdx >= transSteps.size) {
      completed.reverse
    } else {
      val hit = userActions.indexOf(transSteps(stepIdx), searchFrom)
      if (hit == -1) completed.reverse
      else walk(stepIdx + 1, hit + 1, (stepIdx + 1) :: completed)
    }

  walk(0, 0, Nil)
}
      * 标准3：  判断是否满足业务C步骤，只要发生了C事件且前面发生B就行
      * 标准4：  判断是否满足业务C步骤，只要发生了C事件就算
      *
      * 那么：在写代码时，究竟按哪个标准来计算？ 看公司开会讨论的需求！
      * 咱们下面以 标准2 为例！
      *
      * 做法：
      * 将一个用户的所有行为按时间先后顺序收集到一起：A  A  B  A  B  C
      * 将业务路径定义做成广播变量：
      * Map(
      * "t101" -> list(A,B,C,D)
      * "t102" -> list(D,B,C)
      * )
      * 然后，将这个人的行为和业务定义去对比，按标准2对比
      * 具体来说：
      * 拿业务定义中的步骤1，去用户的行为序列中搜索，如果搜索到，则继续
      * 拿步骤2，去用户行为序列种步骤1事件后面去搜索，如果搜索到，则继续
      * 以此类推!
      *
      *
      **/

/**
 * Funnel analysis: for every business flow, determine which of its steps each
 * user completed.
 *
 * Output columns: uid (user id) | tid (business flow id) | step (completed step number).
 *
 * @author dyc
 * @since 2019/9/7
 */
object FunnelAnalysis {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)

    val spark = SparkUtil.getSparkSession()

    // Load the business route definitions (metadata) over JDBC.
    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    val props = new Properties()
    props.setProperty("user","root")
    props.setProperty("password","123456")

    val routeDefine = spark.read.jdbc("jdbc:mysql://localhost:3306/test","transaction_route",props)

    // tid -> that flow's (step, event_id) pairs, sorted by step number.
    val routeMap: collection.Map[String, List[(Int, String)]] = routeDefine.rdd.map(row => {
      val t_id = row.getAs[String]("tid")
      val t_event = row.getAs[String]("event_id")
      val t_step = row.getAs[Int]("step")

      (t_id, (t_step, t_event))
    }).groupByKey().mapValues(_.toList.sortBy(_._1)).collectAsMap()
    val bc: Broadcast[collection.Map[String, List[(Int, String)]]] = spark.sparkContext.broadcast(routeMap)

    // Load the user access-route data.
    val userRoute = spark.read.option("header","true").csv("data_ware/data/dws_user_acc_route/part-00000.csv")
    import spark.implicits._
    userRoute.createTempView("t")

    // Collect each session's urls in visit order.
    // The urls are paired with their numeric step and sorted inside the
    // aggregate: an `order by` placed before `group by` does not guarantee the
    // order seen by collect_list, and `step` is read from CSV as a string, so
    // it must be cast to int to avoid "10" sorting before "2".
    val df = spark.sql(
      """
        |select
        |  uid,
        |  sid,
        |  ordered.url as url_order
        |from (
        |  select
        |    uid,
        |    sid,
        |    sort_array(collect_list(struct(cast(step as int) as step, url))) as ordered
        |  from t
        |  group by uid, sid
        |) g
      """.stripMargin)

    // ((uid, sid), ordered urls of that session).
    // The urls stay a sequence of whole strings; joining them into one string
    // and splitting it again would break every url into single characters.
    val usStep = df.map(row => {
      val uid: String = row.getAs[String]("uid")
      val sid: String = row.getAs[String]("sid")
      val urls: Seq[String] = row.getAs[Seq[String]]("url_order")
      ((uid, sid), urls)
    })

    // Match every session against every business flow and emit one
    // (uid, tid, step) row per completed step.
    val ustidstep = usStep.flatMap { case ((uid, _), urls) =>
      val userActions: List[String] = urls.toList
      bc.value.toList.flatMap { case (tid, stepDefs) =>
        val routeList: List[String] = stepDefs.map(_._2)
        TransactionRouteMatch.routeMatch(userActions, routeList).map(step => (uid, tid, step))
      }
    }
      .toDF("uid", "tid", "step")
      // A user completing the same step in several sessions counts only once.
      .distinct()
    ustidstep.show(10,false)

    spark.close()

  }

}

关于不能在foreach中调用list.remove方法
- 进行上述操作时会抛出ConcurrentModificationException，原因是迭代器在调用next方法时会调用checkForComodification方法，检查modCount（集合被修改的次数）和expectedModCount（迭代器期待的集合修改次数，获取迭代器时被赋值为modCount）是否相等；modCount为ArrayList的成员变量(继承自父类AbstractList)，expectedModCount为ArrayList的内部类Itr的成员变量；当调用ArrayList的remove方法时，只会修改modCount，而不会修改expectedModCount，所以当Itr调用next方法时，就会抛出异常；而Itr自己的remove方法中对二者进行了赋值处理，保证两者相同；
- 另外，即使是在遍历到集合的最后一个元素时执行删除，也会使Itr调用next方法，原因是Itr的hasNext方法判断的是Itr的成员变量cursor和ArrayList的成员变量size是否相等，若不相等则返回true；而通过ArrayList的remove方法删除数据时，size会被减1，但cursor(为当前遍历值索引index+1)不会更改，导致两者不相等，hasNext方法返回true，还是会调用next方法；
- 通过iterator的remove方法移除时不报错，是因为next的时候cursor被赋值为当前遍历值索引index+1，而lastRet被赋值为index；remove的时候cursor被重新赋值为lastRet，size也相应减1，同时expectedModCount被同步为modCount，保证cursor遍历完集合后hasNext(){return cursor!=size}才返回false