/**
 * Job entry point: parses the source data once and runs every partition writer on it.
 *
 * Expected arguments:
 *   - args(0): date, used only as the suffix of the Spark application name
 *   - args(1): input path passed to `dealSourceData`
 *   - args(2): base output path passed to every writer as `savepath`
 *
 * @param args command-line arguments; at least three entries are required
 * @throws IllegalArgumentException if fewer than three arguments are supplied
 */
def main(args: Array[String]): Unit = {
  // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
  require(args.length >= 3, "usage: <date> <input path> <output base path>")
  val date = args(0)
  // e.g. afs://xingtian.afs.baidu.com:9902/app/insight/lbs/lbs_mobile_matrix_user_daily/event_day=20191201/event_type=matrix_stat_daily/000099_0
  val reqdata = args(1)
  // e.g. afs://xingtian.afs.baidu.com:9902/app/insight/lbs/lbs_mobile_matrix_fact/event_day=20191201/event_type=nophoneinfo_normaltest
  val savepath = args(2)

  val conf = new SparkConf()
  conf.setAppName("Matrix_Icon_Models_" + date)
  val sc = new SparkContext(conf)
  try {
    // NOTE(review): rdd is consumed by six independent jobs below and is not
    // persisted, so the input is presumably re-read and re-parsed for each one.
    val rdd = MatrixFact.dealSourceData(reqdata, sc)
    // Write the "normal" partition (pv, uv).
    MatrixFact.Insertnophoneinfo_normal(rdd, savepath)
    // Write the "nosv_normal" partition (pv, uv).
    MatrixFact.Insertnosv_phoneinfo_normal(rdd, savepath)
    // Write the "sv_dims" partition (pv, uv).
    MatrixFact.Insertnophoneinfo_dims(rdd, savepath)
    // Write the "nosv_dims" partition (pv, uv).
    MatrixFact.Insertnosv_phoneinfo_dims(rdd, savepath)
    // Write the "dims" partitions (pv, uv).
    MatrixFact.Insertdims(rdd, savepath)
    MatrixFact.Insertnosv_dims(rdd, savepath)
  } finally {
    // Release the context whether or not a writer failed.
    sc.stop()
  }
}
/**
 * Reads the tab-separated source file and explodes it into one row per
 * (cuid, sid, custom-dimension key).
 *
 * Input columns by position: cuid, os_sv, hp_mode, user_daily_stat.
 * `user_daily_stat` is parsed with `JsonUtil.json2Map`; each of its entries is a
 * sid together with that sid's stat, which is parsed with `JsonUtil.json2Map`
 * again. Every entry of that inner map is a `dim1|dim2` key and its pv.
 *
 * Each output row is
 * `cuid \t sid \t os_sv \t hp_mode \t dim1,dim2 \t pv`.
 *
 * Rows whose cuid (first column) is empty are dropped. Rows that pass this
 * filter but have fewer than four columns still fail with an
 * ArrayIndexOutOfBoundsException, as before.
 *
 * @param reqdata path of the input text file(s)
 * @param sc      Spark context used to read the input
 * @return the exploded rows as tab-separated strings
 */
def dealSourceData(reqdata: String, sc: SparkContext): RDD[String] = {
  val perSid = sc.textFile(reqdata, 2000)
    .map(_.split("\t"))
    // Drop rows with an empty cuid. split("\t") discards trailing empty fields,
    // so a line made only of tabs yields an empty array; check before indexing.
    .filter(fields => fields.nonEmpty && fields(0).nonEmpty)
    .flatMap { fields =>
      // Fan one input row out into one row per sid.
      val cuid = fields(0)
      val osSv = fields(1)
      val hpMode = fields(2)
      // All of the user's actions for the day, keyed by sid.
      val userDailyStat = fields(3)
      JsonUtil.json2Map(userDailyStat).map { case (sid, sidStat) =>
        // sidStat holds every action under this sid, e.g.
        // {"MEpEcHArZGtqZ2lXOXcvSS9DLzlOQT09|detailpage":2,"aWVVTE5kNXV5blo5V1IvYUdYVm9Tdz09|detailpage":1}
        s"$cuid\t$sid\t$osSv\t$hpMode\t$sidStat"
      }
    }

  // The intermediate rows are re-split on tabs on purpose: this keeps the exact
  // text that the second json2Map call received in the original pipeline.
  perSid.map(_.split("\t")).flatMap { fields =>
    // Fan each per-sid row out into one row per custom-dimension key.
    val cuid = fields(0)
    val sid = fields(1)
    val osSv = fields(2)
    val hpMode = fields(3)
    val sidStat = fields(4)
    JsonUtil.json2Map(sidStat).map { case (dimKey, pv) =>
      // dimKey has four shapes; split drops trailing empty parts:
      //   "a|b" -> Array(a, b)     "a|" -> Array(a)
      //   "|b"  -> Array("", b)    "|"  -> Array()
      val (dim1, dim2) = dimKey.split("\\|") match {
        case Array()     => ("", "")
        case Array(only) => (only, "")
        case parts       => (parts(0), parts(1))
      }
      // The two dimensions are joined with "," in the output row.
      s"$cuid\t$sid\t$osSv\t$hpMode\t$dim1,$dim2\t$pv"
    }
  }
}
/**
 * Aggregates pv and uv per (sid, os_sv, dim1dim2) and writes the result as a
 * single text file under the path given by `nophoneinfo_normal_type(savepath)`.
 *
 * Input rows are tab-separated:
 * `cuid \t sid \t os_sv \t hp_mode \t dim1dim2 \t pv`
 * (columns 0..5; hp_mode is not used here).
 *
 * Output rows are `sid \t os_sv,dim1dim2 \t pv,uv`, where pv is the summed pv
 * and uv is the number of distinct cuids seen for that key.
 *
 * Counts are accumulated as Long so that large totals cannot wrap around.
 *
 * @param rdd      rows in the input layout described above
 * @param savepath base output path
 */
def Insertnophoneinfo_normal(rdd: RDD[String], savepath: String): Unit = {
  val normal_pairrdd = rdd
    .map { line =>
      val value = line.split("\t")
      // Key: (sid, "os_sv,dim1dim2", cuid); value: pv of this row.
      // A tuple key avoids joining the parts with tabs and splitting them again.
      ((value(1), value(2) + "," + value(4), value(0)), value(5).toLong)
    }
    // Total pv for each (sid, dims, cuid).
    .reduceByKey(_ + _)
    // Drop the cuid from the key; each distinct cuid contributes a uv of 1.
    .map { case ((sid, dims, _), pv) => ((sid, dims), (pv, 1L)) }
    // Sum pv and uv separately for each (sid, dims).
    .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
    .map { case ((sid, dims), (pv, uv)) => s"$sid\t$dims\t$pv,$uv" }

  normal_pairrdd.repartition(1).saveAsTextFile(MatrixFact.nophoneinfo_normal_type(savepath))
}
/**
 * Builds the output directory of the `nophoneinfo_normal` partition.
 *
 * This is the finest-grained partition: rows keyed by sid and
 * os,sv,dim1,dim2, each carrying pv,uv.
 *
 * @param savepath base output path
 * @return `savepath` with `/event_type=nophoneinfo_normal` appended
 */
def nophoneinfo_normal_type(savepath: String): String = {
  savepath + "/event_type=nophoneinfo_normal"
}