Hadoop 06

Case study: from data collection to data analysis

ETL

Simulating the ETL workflow

0. Flume collection
1. Use Flume to collect the data from the specified location into HDFS (/app-log-data/data/2019-07-*);
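A possible Flume agent configuration for this step — a sketch only; the local spooling directory, agent name, and file name are assumptions, not taken from the original post:

# app-log-hdfs.properties (hypothetical file name): spooldir source -> memory channel -> HDFS sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# read completed log files dropped into a local directory (assumed path)
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/lys/app-log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000

# write to the HDFS path used in this post, one directory per day
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /app-log-data/data/%Y-%m-%d
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.rollInterval = 600
a1.sinks.k1.hdfs.useLocalTimeStamp = true

It would be started with something like:
bin/flume-ng agent -n a1 -c conf -f conf/app-log-hdfs.properties -Dflume.root.logger=INFO,console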
2. Package the MapReduce cleaning program into a jar for later use.

// ---- AppLogDataClean.java ----
package com.initialize;


import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class AppLogDataClean {

    public static class AppLogDataCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable>{

        Text k = null;
        NullWritable v = null;
        SimpleDateFormat sdf = null;
        MultipleOutputs<Text, NullWritable> mos = null; // multiple-outputs writer

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            k = new Text();
            v = NullWritable.get();
            sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            mos = new MultipleOutputs<Text, NullWritable>(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            JSONObject jsonObj = JSON.parseObject(value.toString());

            JSONObject headerObj = jsonObj.getJSONObject(GlobalConstants.HEADER);

            /**
             * Filter out records that are missing any required field
             */
            if(null == headerObj.getString("sdk_ver") || "".equals(headerObj.getString("sdk_ver"))){return;}
            if(null == headerObj.getString("time_zone") || "".equals(headerObj.getString("time_zone"))){return;}
            if(null == headerObj.getString("commit_id") || "".equals(headerObj.getString("commit_id"))){return;}
            if(null == headerObj.getString("commit_time") || "".equals(headerObj.getString("commit_time"))){return;}
            else{
                // Logic added for this exercise: overwrite the original commit_time
                // with yesterday's date so the test data lands in yesterday's partition
                String format = sdf.format(new Date(new Date().getTime() - 24 * 60 * 60 * 1000L));
                //String format = sdf.format(new Date());
                headerObj.put("commit_time", format);
            }
            if(null == headerObj.getString("pid") || "".equals(headerObj.getString("pid"))){return;}
            if(null == headerObj.getString("app_token") || "".equals(headerObj.getString("app_token"))){return;}
            if(null == headerObj.getString("app_id") || "".equals(headerObj.getString("app_id"))){return;}
            if(null == headerObj.getString("device_id") || "".equals(headerObj.getString("device_id"))){return;}
            if(null == headerObj.getString("device_id_type") || "".equals(headerObj.getString("device_id_type"))){return;}
            if(null == headerObj.getString("release_channel") || "".equals(headerObj.getString("release_channel"))){return;}
            if(null == headerObj.getString("app_ver_name") || "".equals(headerObj.getString("app_ver_name"))){return;}
            if(null == headerObj.getString("app_ver_code") || "".equals(headerObj.getString("app_ver_code"))){return;}
            if(null == headerObj.getString("os_name") || "".equals(headerObj.getString("os_name"))){return;}
            if(null == headerObj.getString("os_ver") || "".equals(headerObj.getString("os_ver"))){return;}
            if(null == headerObj.getString("language") || "".equals(headerObj.getString("language"))){return;}
            if(null == headerObj.getString("country") || "".equals(headerObj.getString("country"))){return;}
            if(null == headerObj.getString("manufacture") || "".equals(headerObj.getString("manufacture"))){return;}
            if(null == headerObj.getString("device_model") || "".equals(headerObj.getString("device_model"))){return;}
            if(null == headerObj.getString("resolution") || "".equals(headerObj.getString("resolution"))){return;}
            if(null == headerObj.getString("net_type") || "".equals(headerObj.getString("net_type"))){return;}


            /**
             * Generate user_id
             */
            String user_id = "";
            if("android".equals(headerObj.getString("os_name").trim())){
                user_id = StringUtils.isNotBlank(headerObj.getString("android_id"))?headerObj.getString("android_id"):headerObj.getString("device_id");
            }else{
                user_id = headerObj.getString("device_id");
            }

            /**
             * Emit the result
             */
            headerObj.put("user_id", user_id);
            k.set(JsonToStringUtil.toString(headerObj));

            if("android".equals(headerObj.getString("os_name"))){
                mos.write(k, v, "android/android");
            }else {
                mos.write(k, v, "ios/ios");
            }


        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
    }

    public static void main(String[] args) throws Exception{

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(AppLogDataClean.class);

        job.setMapperClass(AppLogDataCleanMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(0);
        //job.setOutputFormatClass(TextOutputFormat.class); // default

        // Avoid creating the default part-m-00000 files, since all data is written through MultipleOutputs
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean res = job.waitForCompletion(true);
        System.exit(res? 0:1);

    }
}
// ---- GlobalConstants.java ----
package com.initialize;


public class GlobalConstants {

    public static final String HEADER = "header";

}
// ---- JsonToStringUtil.java ----
package com.initialize;


import com.alibaba.fastjson.JSONObject;

public class JsonToStringUtil {

    public static String toString(JSONObject jsonObj) {

        StringBuilder sb = new StringBuilder();

        sb.append(jsonObj.getString("sdk_ver")).append("\001")
                .append(jsonObj.getString("time_zone")).append("\001")
                .append(jsonObj.getString("commit_id")).append("\001")
                .append(jsonObj.getString("commit_time")).append("\001")
                .append(jsonObj.getString("pid")).append("\001")
                .append(jsonObj.getString("app_token")).append("\001")
                .append(jsonObj.getString("app_id")).append("\001")
                .append(jsonObj.getString("device_id")).append("\001")
                .append(jsonObj.getString("device_id_type")).append("\001")
                .append(jsonObj.getString("release_channel")).append("\001")
                .append(jsonObj.getString("app_ver_name")).append("\001")
                .append(jsonObj.getString("app_ver_code")).append("\001")
                .append(jsonObj.getString("os_name")).append("\001")
                .append(jsonObj.getString("os_ver")).append("\001")
                .append(jsonObj.getString("language")).append("\001")
                .append(jsonObj.getString("country")).append("\001")
                .append(jsonObj.getString("manufacture")).append("\001")
                .append(jsonObj.getString("device_model")).append("\001")
                .append(jsonObj.getString("resolution")).append("\001")
                .append(jsonObj.getString("net_type")).append("\001")
                .append(jsonObj.getString("account")).append("\001")
                .append(jsonObj.getString("app_device_id")).append("\001")
                .append(jsonObj.getString("mac")).append("\001")
                .append(jsonObj.getString("android_id")).append("\001")
                .append(jsonObj.getString("imei")).append("\001")
                .append(jsonObj.getString("cid_sn")).append("\001")
                .append(jsonObj.getString("build_num")).append("\001")
                .append(jsonObj.getString("mobile_data_type")).append("\001")
                .append(jsonObj.getString("promotion_channel")).append("\001")
                .append(jsonObj.getString("carrier")).append("\001")
                .append(jsonObj.getString("city")).append("\001")
                .append(jsonObj.getString("user_id"))
                ;

        return sb.toString();
    }


}

3. Write a shell script to run the ETL on a schedule (a cron entry sketch follows after the script).

#!/bin/bash
#day_str=`date +'%Y-%m-%d'`
day_str=`date +'%Y-%m-%d' -d '-1 days'` 


inpath=/app-log-data/data/$day_str
outpath=/app-log-data/clean/$day_str

echo "准备清洗$day_str 的数据......"

/appdata/hadoop/bin/hadoop jar /home/lys/app-data-24-1.0-SNAPSHOT.jar com.initialize.AppLogDataClean $inpath $outpath
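
To actually run this on a schedule, a crontab entry along the following lines would do (the script and log paths are assumptions):

# run the ETL for yesterday's data at 01:00 every day
0 1 * * * sh /home/lys/action_day.sh >> /home/lys/action_day.log 2>&1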

4. Run the script
sh ./action_day.sh

5. Results: the cleaned data appears under /app-log-data/clean/<day>/android and /app-log-data/clean/<day>/ios (screenshots omitted).

Daily active user (DAU) statistics

Dimension combination analysis:

Active users, not broken down by os_name, city, release_channel, or app_ver_name
Active users broken down by os_name; not by city, release_channel, or app_ver_name
Active users broken down by city; not by os_name, release_channel, or app_ver_name
Active users broken down by release_channel; not by os_name, city, or app_ver_name
Active users broken down by app_ver_name; not by os_name, city, or release_channel
Active users broken down by os_name and city; not by release_channel or app_ver_name

Dimension combination matrix (columns: os_name, city, release_channel, app_ver_name; 1 = broken down by that dimension, 0 = aggregated over it)

0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1

0. Create the ods_app_log table
CREATE TABLE ods_app_log (
    sdk_ver string
    ,time_zone string
    ,commit_id string
    ,commit_time string
    ,pid string
    ,app_token string
    ,app_id string
    ,device_id string
    ,device_id_type string
    ,release_channel string
    ,app_ver_name string
    ,app_ver_code string
    ,os_name string
    ,os_ver string
    ,language string
    ,country string
    ,manufacture string
    ,device_model string
    ,resolution string
    ,net_type string
    ,account string
    ,app_device_id string
    ,mac string
    ,android_id string
    ,imei string
    ,cid_sn string
    ,build_num string
    ,mobile_data_type string
    ,promotion_channel string
    ,carrier string
    ,city string
    ,user_id string
    ) partitioned BY (day string, os string) row format delimited fields terminated BY '\001';

Mount the cleaned data as a partition
alter table ods_app_log add partition(day='2019-07-21',os='android') location '/app-log-data/clean/2019-07-21/android';
Show partitions
show partitions ods_app_log;
Drop a partition
alter table ods_app_log drop partition (day='2017-09-22',os='android');
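
Since the cleaning job writes both an android/ and an ios/ subdirectory, the ios data gets a matching partition in the same way:

alter table ods_app_log add partition(day='2019-07-21',os='ios') location '/app-log-data/clean/2019-07-21/ios';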

1/ Extract the current day's active-user records into a daily-active-user table

1.1/ Create the daily-active-user table

CREATE TABLE etl_user_active_day (
    sdk_ver string
    ,time_zone string
    ,commit_id string
    ,commit_time string
    ,pid string
    ,app_token string
    ,app_id string
    ,device_id string
    ,device_id_type string
    ,release_channel string
    ,app_ver_name string
    ,app_ver_code string
    ,os_name string
    ,os_ver string
    ,language string
    ,country string
    ,manufacture string
    ,device_model string
    ,resolution string
    ,net_type string
    ,account string
    ,app_device_id string
    ,mac string
    ,android_id string
    ,imei string
    ,cid_sn string
    ,build_num string
    ,mobile_data_type string
    ,promotion_channel string
    ,carrier string
    ,city string
    ,user_id string
    ) partitioned BY (day string) row format delimited fields terminated BY '\001';

1.2 From the current day's partition of the raw ods_app_log table, extract the day's active users and insert them into the daily-active-user table etl_user_active_day.
Note: for each active user, keep only the earliest record of that day (hence the row_number() window below);

INSERT INTO TABLE etl_user_active_day PARTITION (day = '2019-07-22')
SELECT sdk_ver
    ,time_zone
    ,commit_id
    ,commit_time
    ,pid
    ,app_token
    ,app_id
    ,device_id
    ,device_id_type
    ,release_channel
    ,app_ver_name
    ,app_ver_code
    ,os_name
    ,os_ver
    ,LANGUAGE
    ,country
    ,manufacture
    ,device_model
    ,resolution
    ,net_type
    ,account
    ,app_device_id
    ,mac
    ,android_id
    ,imei
    ,cid_sn
    ,build_num
    ,mobile_data_type
    ,promotion_channel
    ,carrier
    ,city
    ,user_id
FROM (
    SELECT *
        ,row_number() OVER (
            PARTITION BY user_id ORDER BY commit_time
            ) AS rn
    FROM ods_app_log
    WHERE day = '2019-07-22'
    ) tmp
WHERE rn = 1;

Dimension statistics

Create the dimension-statistics result table dim_user_active_day

DROP TABLE dim_user_active_day;
CREATE TABLE dim_user_active_day (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts INT
    ) partitioned BY (
    day string
    ,dim string
    );

Use Hive's multi-insert syntax to compute the daily-active-user count for every dimension combination and load each result into the corresponding partition of the dimension stats table;

FROM etl_user_active_day

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0000'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1000'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (os_name)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0100'
    )
SELECT 'all'
    ,city
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (city)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0010'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (release_channel)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0001'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (app_ver_name)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1100'
    )
SELECT os_name
    ,city
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1010'
    )
SELECT os_name
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1001'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0110'
    )
SELECT 'all'
    ,city
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0101'
    )
SELECT 'all'
    ,city
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0011'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0111'
    )
SELECT 'all'
    ,city
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1011'
    )
SELECT os_name
    ,'all'
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1101'
    )
SELECT os_name
    ,city
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1110'
    )
SELECT os_name
    ,city
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1111'
    )
SELECT os_name
    ,city
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel,app_ver_name
;
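
As an aside, on Hive 0.10 or later the same 16 combinations could in principle be computed in a single pass with WITH CUBE instead of 16 separate inserts; a sketch only, not part of the original workflow (note that genuine NULL values in the data would also show up as 'all'):

SELECT nvl(os_name, 'all')
    ,nvl(city, 'all')
    ,nvl(release_channel, 'all')
    ,nvl(app_ver_name, 'all')
    ,count(1) AS cnts
FROM etl_user_active_day
WHERE day = '2019-07-21'
GROUP BY os_name
    ,city
    ,release_channel
    ,app_ver_name
WITH CUBE;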

Daily new user statistics

Daily new users: users who appear for the first time on a given day, i.e., that day's newly added users.
Approach: a. maintain a history table of all known users (storing only user_id);
b. compare the day's active users against the history table -- anyone not in it is a new user for that day;
c. append the day's new users to the history table.

History user table

create table etl_user_history(user_id string);

Daily new-user table

It stores all fields (one record per user, the earliest of the day) and has a partition column day string;
create table etl_user_new_day like etl_user_active_day;

1 Today's active users minus the history table --> today's partition of the new-user table
insert into etl_user_new_day partition(day='2019-07-21')
SELECT sdk_ver
    ,time_zone
    ,commit_id
    ,commit_time
    ,pid
    ,app_token
    ,app_id
    ,device_id
    ,device_id_type
    ,release_channel
    ,app_ver_name
    ,app_ver_code
    ,os_name
    ,os_ver
    ,LANGUAGE
    ,country
    ,manufacture
    ,device_model
    ,resolution
    ,net_type
    ,account
    ,app_device_id
    ,mac
    ,android_id
    ,imei
    ,cid_sn
    ,build_num
    ,mobile_data_type
    ,promotion_channel
    ,carrier
    ,city
    ,a.user_id
from  etl_user_active_day a left join  etl_user_history b on a.user_id = b.user_id
where a.day='2019-07-21' and b.user_id is null;

2 Append the new users' user_id values to the history table

insert into table etl_user_history
select user_id from etl_user_new_day where day='2019-07-21';

Daily new users: dimension statistics report

Approach: from the daily-new ETL table, count new users for each dimension combination.
Dimensions:
os_name city release_channel app_ver_name
Dimension combination matrix
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1

1 Daily-new dimension report -- data modeling
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

2 Daily-new dimension report SQL (multi-insert syntax)

from etl_user_new_day

insert into table dim_user_new_day partition(day='2019-07-21',dim='0000')
select 'all','all','all','all',count(1)
where day='2019-07-21'


insert into table dim_user_new_day partition(day='2019-07-21',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2019-07-21'
group by app_ver_name



insert into table dim_user_new_day partition(day='2019-07-21',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2019-07-21'
group by release_channel


insert into table dim_user_new_day partition(day='2019-07-21',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2019-07-21'
group by release_channel,app_ver_name


insert into table dim_user_new_day partition(day='2019-07-21',dim='0100')
select 'all',city,'all','all',count(1)
where day='2019-07-21'
group by city

;
(The remaining dimension combinations follow the same multi-insert pattern.)

Next-day retention analysis

Concept: users who were new yesterday and are still active today.

Logic: a user who appears in yesterday's new-user table and also in today's active-user table is one of today's "next-day retained users".

Data modeling

Create the next-day retention ETL table

It records the same fields as the active-user table:
create table etl_user_keepalive_nextday like etl_user_active_day;

ETL development
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select
     actuser.sdk_ver 
    ,actuser.time_zone 
    ,actuser.commit_id 
    ,actuser.commit_time 
    ,actuser.pid 
    ,actuser.app_token 
    ,actuser.app_id 
    ,actuser.device_id 
    ,actuser.device_id_type 
    ,actuser.release_channel 
    ,actuser.app_ver_name 
    ,actuser.app_ver_code 
    ,actuser.os_name 
    ,actuser.os_ver 
    ,actuser.language 
    ,actuser.country 
    ,actuser.manufacture 
    ,actuser.device_model 
    ,actuser.resolution 
    ,actuser.net_type 
    ,actuser.account 
    ,actuser.app_device_id 
    ,actuser.mac 
    ,actuser.android_id 
    ,actuser.imei 
    ,actuser.cid_sn 
    ,actuser.build_num 
    ,actuser.mobile_data_type 
    ,actuser.promotion_channel 
    ,actuser.carrier 
    ,actuser.city 
    ,actuser.user_id 
from etl_user_new_day newuser join etl_user_active_day actuser
on newuser.user_id = actuser.user_id
where newuser.day='2019-07-21' and actuser.day='2019-07-22';
Drop the data of a specific partition (useful when re-running the job)

ALTER TABLE etl_user_keepalive_nextday DROP IF EXISTS PARTITION(day='2019-07-22');

A LEFT SEMI JOIN is slightly more efficient:
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select 
 sdk_ver 
,time_zone 
,commit_id 
,commit_time 
,pid 
,app_token 
,app_id 
,device_id 
,device_id_type 
,release_channel 
,app_ver_name 
,app_ver_code 
,os_name 
,os_ver 
,language 
,country 
,manufacture 
,device_model 
,resolution 
,net_type 
,account 
,app_device_id 
,mac 
,android_id 
,imei 
,cid_sn 
,build_num 
,mobile_data_type 
,promotion_channel 
,carrier 
,city 
,user_id 
from etl_user_new_day a left semi join etl_user_active_day b
on a.user_id = b.user_id and a.day='2019-07-21' and b.day='2019-07-22';

Note: with a LEFT SEMI JOIN, the right table (b) cannot be referenced in the WHERE clause; that is why the day filters are placed in the ON clause above.

Dimension statistics

Use the multi-insert syntax, following the same pattern as the daily-active dimension report; a sketch follows below.
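
A minimal sketch, assuming a result table dim_user_keepalive_nextday modeled after dim_user_active_day (this table is not defined in the original post):

create table dim_user_keepalive_nextday (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts int
    ) partitioned by (day string, dim string);

from etl_user_keepalive_nextday

insert into table dim_user_keepalive_nextday partition(day='2019-07-22',dim='0000')
select 'all','all','all','all',count(1)
where day='2019-07-22'

insert into table dim_user_keepalive_nextday partition(day='2019-07-22',dim='1000')
select os_name,'all','all','all',count(1)
where day='2019-07-22'
group by os_name
;

(The remaining dimension combinations follow the same pattern.)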

Version upgrade trajectory analysis

Solve it with Hive's window (analytic) functions. Sample data (day, user_id, app_token, release_channel, city, app_ver_name):
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,小米应用,天津,v2.0

Solution:

create table t_lag_test(day string,user_id string,app_token string,release_channel string,city string,app_ver_name string)
row format delimited fields terminated by ',';

load data local inpath '/home/lys/ver.test' into table t_lag_test;

select 
day,user_id,app_token,release_channel,city,ver_2,app_ver_name
from
(
select
day,user_id,app_token,release_channel,city,app_ver_name,
lag(app_ver_name,1,null) over(partition by user_id order by app_ver_name) as ver_2
from t_lag_test) tmp
where ver_2 is not null and app_ver_name>ver_2 
;

Supplement: Hive window (analytic) functions

Test data

+----------------+---------------------------------+-----------------------+--------------+--+
|  t_access.ip   |          t_access.url           | t_access.access_time  | t_access.dt  |
+----------------+---------------------------------+-----------------------+--------------+--+
| 192.168.33.3   | http://www.edu360.cn/stu        | 2017-08-04 15:30:20   | 20170804     |
| 192.168.33.3   | http://www.edu360.cn/teach      | 2017-08-04 15:35:20   | 20170804     |
| 192.168.33.4   | http://www.edu360.cn/stu        | 2017-08-04 15:30:20   | 20170804     |
| 192.168.33.4   | http://www.edu360.cn/job        | 2017-08-04 16:30:20   | 20170804     |
| 192.168.33.5   | http://www.edu360.cn/job        | 2017-08-04 15:40:20   | 20170804     |
| 192.168.33.3   | http://www.edu360.cn/stu        | 2017-08-05 15:30:20   | 20170805     |
| 192.168.44.3   | http://www.edu360.cn/teach      | 2017-08-05 15:35:20   | 20170805     |
| 192.168.33.44  | http://www.edu360.cn/stu        | 2017-08-05 15:30:20   | 20170805     |
| 192.168.33.46  | http://www.edu360.cn/job        | 2017-08-05 16:30:20   | 20170805     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-05 15:40:20   | 20170805     |
| 192.168.133.3  | http://www.edu360.cn/register   | 2017-08-06 15:30:20   | 20170806     |
| 192.168.111.3  | http://www.edu360.cn/register   | 2017-08-06 15:35:20   | 20170806     |
| 192.168.34.44  | http://www.edu360.cn/pay        | 2017-08-06 15:30:20   | 20170806     |
| 192.168.33.46  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
| 192.168.33.46  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.25  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
| 192.168.33.36  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
+----------------+---------------------------------+-----------------------+--------------+--+

View the test data:
select * from t_access;

The LAG function

What it does: within each partition (defined by PARTITION BY) and ordering, lag(col, n, default) returns the value of col from n rows earlier; the first n rows of the partition, which have no earlier row, get the default value instead.
Typical use: version upgrade trajectory queries (compare each record's version with the previous one).

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;


The LEAD function

The opposite of LAG: it returns the value from n rows later in the partition.

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time) as next_access_time
from t_access;


The FIRST_VALUE function

Example: the first page each user (IP) visited

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as first_url
from t_access;


The LAST_VALUE function

Example: the last page each user (IP) visited

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as last_url
from t_access;


The sum() over() function

Cumulative report -- window-function implementation

select id
,month
,sum(amount) over(partition by id order by month rows between unbounded preceding and current row)
from
(select id,month,
sum(fee) as amount
from t_test
group by id,month) tmp;
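
The query above assumes a fee table that is not shown in the post; a minimal sketch of that assumption:

create table t_test(id string, month string, fee int)
row format delimited fields terminated by ',';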

Ranking / sequence-number functions

row_number() over(): rows that compare equal still get distinct, arbitrarily ordered sequence numbers.
rank() over(): rows that compare equal share the same rank, and the following ranks are skipped.
dense_rank() over(): rows that compare equal share the same rank, and the following ranks are not skipped.
ntile(n) over(): splits the ordered rows into n buckets of roughly equal size and numbers them 1..n -- useful, for example, to find the people in the top 1/3 of a salary ranking (see the sketch after the table below).

score  row_number  rank  dense_rank  ntile(3)
89     1           1     1           1
90     2           2     2           1
90     3           2     2           1
91     4           4     3           2
92     5           5     4           2
92     6           5     4           2
93     7           7     5           3
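
For the "top 1/3 of the salary ranking" question above, a minimal sketch assuming a hypothetical table t_salary(name string, salary int):

select name, salary
from (
    select name, salary,
        ntile(3) over(order by salary desc) as nt
    from t_salary
    ) tmp
where nt = 1;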
