Hadoop 06

Case study: from data collection to data analysis

ETL

Simulating the ETL workflow

0. Flume collection
1. Use Flume to collect the data from the specified location into HDFS (/app-log-data/data/2019-07-*);
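A possible Flume agent configuration for this step — a sketch only; the local spooling directory, agent name, and file name are assumptions, not taken from the original post:

# app-log-hdfs.properties (hypothetical file name): spooldir source -> memory channel -> HDFS sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1

# read completed log files dropped into a local directory (assumed path)
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/lys/app-log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000

# write to the HDFS path used in this post, one directory per day
a1.sinks.k1.type = hdfs
a1.sinks.k1.channel = c1
a1.sinks.k1.hdfs.path = /app-log-data/data/%Y-%m-%d
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.rollInterval = 600
a1.sinks.k1.hdfs.useLocalTimeStamp = true

It would be started with something like:
bin/flume-ng agent -n a1 -c conf -f conf/app-log-hdfs.properties -Dflume.root.logger=INFO,console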
2. Package the MapReduce cleaning program into a jar for later use.

// ---- AppLogDataClean.java ----
package com.initialize;


import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class AppLogDataClean {

    public static class AppLogDataCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable>{

        Text k = null;
        NullWritable v = null;
        SimpleDateFormat sdf = null;
        MultipleOutputs<Text, NullWritable> mos = null; // multiple-outputs writer

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            k = new Text();
            v = NullWritable.get();
            sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            mos = new MultipleOutputs<Text, NullWritable>(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            JSONObject jsonObj = JSON.parseObject(value.toString());

            JSONObject headerObj = jsonObj.getJSONObject(GlobalConstants.HEADER);

            /**
             * Filter out records that are missing any required field
             */
            if(null == headerObj.getString("sdk_ver") || "".equals(headerObj.getString("sdk_ver"))){return;}
            if(null == headerObj.getString("time_zone") || "".equals(headerObj.getString("time_zone"))){return;}
            if(null == headerObj.getString("commit_id") || "".equals(headerObj.getString("commit_id"))){return;}
            if(null == headerObj.getString("commit_time") || "".equals(headerObj.getString("commit_time"))){return;}
            else{
                // Logic added for this exercise: overwrite the original commit_time
                // with yesterday's date so the test data lands in yesterday's partition
                String format = sdf.format(new Date(new Date().getTime() - 24 * 60 * 60 * 1000L));
                //String format = sdf.format(new Date());
                headerObj.put("commit_time", format);
            }
            if(null == headerObj.getString("pid") || "".equals(headerObj.getString("pid"))){return;}
            if(null == headerObj.getString("app_token") || "".equals(headerObj.getString("app_token"))){return;}
            if(null == headerObj.getString("app_id") || "".equals(headerObj.getString("app_id"))){return;}
            if(null == headerObj.getString("device_id") || "".equals(headerObj.getString("device_id"))){return;}
            if(null == headerObj.getString("device_id_type") || "".equals(headerObj.getString("device_id_type"))){return;}
            if(null == headerObj.getString("release_channel") || "".equals(headerObj.getString("release_channel"))){return;}
            if(null == headerObj.getString("app_ver_name") || "".equals(headerObj.getString("app_ver_name"))){return;}
            if(null == headerObj.getString("app_ver_code") || "".equals(headerObj.getString("app_ver_code"))){return;}
            if(null == headerObj.getString("os_name") || "".equals(headerObj.getString("os_name"))){return;}
            if(null == headerObj.getString("os_ver") || "".equals(headerObj.getString("os_ver"))){return;}
            if(null == headerObj.getString("language") || "".equals(headerObj.getString("language"))){return;}
            if(null == headerObj.getString("country") || "".equals(headerObj.getString("country"))){return;}
            if(null == headerObj.getString("manufacture") || "".equals(headerObj.getString("manufacture"))){return;}
            if(null == headerObj.getString("device_model") || "".equals(headerObj.getString("device_model"))){return;}
            if(null == headerObj.getString("resolution") || "".equals(headerObj.getString("resolution"))){return;}
            if(null == headerObj.getString("net_type") || "".equals(headerObj.getString("net_type"))){return;}


            /**
             * Generate user_id
             */
            String user_id = "";
            if("android".equals(headerObj.getString("os_name").trim())){
                user_id = StringUtils.isNotBlank(headerObj.getString("android_id"))?headerObj.getString("android_id"):headerObj.getString("device_id");
            }else{
                user_id = headerObj.getString("device_id");
            }

            /**
             * Emit the result
             */
            headerObj.put("user_id", user_id);
            k.set(JsonToStringUtil.toString(headerObj));

            if("android".equals(headerObj.getString("os_name"))){
                mos.write(k, v, "android/android");
            }else {
                mos.write(k, v, "ios/ios");
            }


        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
    }

    public static void main(String[] args) throws Exception{

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(AppLogDataClean.class);

        job.setMapperClass(AppLogDataCleanMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(0);
        //job.setOutputFormatClass(TextOutputFormat.class); // default

        // Avoid creating the default part-m-00000 files, since all data is written through MultipleOutputs
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean res = job.waitForCompletion(true);
        System.exit(res? 0:1);

    }
}
// ---- GlobalConstants.java ----
package com.initialize;


public class GlobalConstants {

    public static final String HEADER = "header";

}
// ---- JsonToStringUtil.java ----
package com.initialize;


import com.alibaba.fastjson.JSONObject;

public class JsonToStringUtil {

    public static String toString(JSONObject jsonObj) {

        StringBuilder sb = new StringBuilder();

        sb.append(jsonObj.getString("sdk_ver")).append("\001")
                .append(jsonObj.getString("time_zone")).append("\001")
                .append(jsonObj.getString("commit_id")).append("\001")
                .append(jsonObj.getString("commit_time")).append("\001")
                .append(jsonObj.getString("pid")).append("\001")
                .append(jsonObj.getString("app_token")).append("\001")
                .append(jsonObj.getString("app_id")).append("\001")
                .append(jsonObj.getString("device_id")).append("\001")
                .append(jsonObj.getString("device_id_type")).append("\001")
                .append(jsonObj.getString("release_channel")).append("\001")
                .append(jsonObj.getString("app_ver_name")).append("\001")
                .append(jsonObj.getString("app_ver_code")).append("\001")
                .append(jsonObj.getString("os_name")).append("\001")
                .append(jsonObj.getString("os_ver")).append("\001")
                .append(jsonObj.getString("language")).append("\001")
                .append(jsonObj.getString("country")).append("\001")
                .append(jsonObj.getString("manufacture")).append("\001")
                .append(jsonObj.getString("device_model")).append("\001")
                .append(jsonObj.getString("resolution")).append("\001")
                .append(jsonObj.getString("net_type")).append("\001")
                .append(jsonObj.getString("account")).append("\001")
                .append(jsonObj.getString("app_device_id")).append("\001")
                .append(jsonObj.getString("mac")).append("\001")
                .append(jsonObj.getString("android_id")).append("\001")
                .append(jsonObj.getString("imei")).append("\001")
                .append(jsonObj.getString("cid_sn")).append("\001")
                .append(jsonObj.getString("build_num")).append("\001")
                .append(jsonObj.getString("mobile_data_type")).append("\001")
                .append(jsonObj.getString("promotion_channel")).append("\001")
                .append(jsonObj.getString("carrier")).append("\001")
                .append(jsonObj.getString("city")).append("\001")
                .append(jsonObj.getString("user_id"))
                ;

        return sb.toString();
    }


}

3. Write a shell script to run the ETL on a schedule (a cron entry sketch follows after the script).

#!/bin/bash
#day_str=`date +'%Y-%m-%d'`
day_str=`date +'%Y-%m-%d' -d '-1 days'` 


inpath=/app-log-data/data/$day_str
outpath=/app-log-data/clean/$day_str

echo "准备清洗$day_str 的数据......"

/appdata/hadoop/bin/hadoop jar /home/lys/app-data-24-1.0-SNAPSHOT.jar com.initialize.AppLogDataClean $inpath $outpath
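
To actually run this on a schedule, a crontab entry along the following lines would do (the script and log paths are assumptions):

# run the ETL for yesterday's data at 01:00 every day
0 1 * * * sh /home/lys/action_day.sh >> /home/lys/action_day.log 2>&1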

4. Run the script
sh ./action_day.sh

5. Results: the cleaned data appears under /app-log-data/clean/<day>/android and /app-log-data/clean/<day>/ios (screenshots omitted).

Daily active user (DAU) statistics

Dimension combination analysis:

Active users, not broken down by os_name, city, release_channel, or app_ver_name
Active users broken down by os_name; not by city, release_channel, or app_ver_name
Active users broken down by city; not by os_name, release_channel, or app_ver_name
Active users broken down by release_channel; not by os_name, city, or app_ver_name
Active users broken down by app_ver_name; not by os_name, city, or release_channel
Active users broken down by os_name and city; not by release_channel or app_ver_name

Dimension combination matrix (columns: os_name, city, release_channel, app_ver_name; 1 = broken down by that dimension, 0 = aggregated over it)

0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1

0. Create the ods_app_log table
CREATE TABLE ods_app_log (
    sdk_ver string
    ,time_zone string
    ,commit_id string
    ,commit_time string
    ,pid string
    ,app_token string
    ,app_id string
    ,device_id string
    ,device_id_type string
    ,release_channel string
    ,app_ver_name string
    ,app_ver_code string
    ,os_name string
    ,os_ver string
    ,language string
    ,country string
    ,manufacture string
    ,device_model string
    ,resolution string
    ,net_type string
    ,account string
    ,app_device_id string
    ,mac string
    ,android_id string
    ,imei string
    ,cid_sn string
    ,build_num string
    ,mobile_data_type string
    ,promotion_channel string
    ,carrier string
    ,city string
    ,user_id string
    ) partitioned BY (day string, os string) row format delimited fields terminated BY '\001';

Mount the cleaned data as a partition
alter table ods_app_log add partition(day='2019-07-21',os='android') location '/app-log-data/clean/2019-07-21/android';
Show partitions
show partitions ods_app_log;
Drop a partition
alter table ods_app_log drop partition (day='2017-09-22',os='android');
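
Since the cleaning job writes both an android/ and an ios/ subdirectory, the ios data gets a matching partition in the same way:

alter table ods_app_log add partition(day='2019-07-21',os='ios') location '/app-log-data/clean/2019-07-21/ios';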

1/ Extract the current day's active-user records into a daily-active-user table

1.1/ Create the daily-active-user table

CREATE TABLE etl_user_active_day (
    sdk_ver string
    ,time_zone string
    ,commit_id string
    ,commit_time string
    ,pid string
    ,app_token string
    ,app_id string
    ,device_id string
    ,device_id_type string
    ,release_channel string
    ,app_ver_name string
    ,app_ver_code string
    ,os_name string
    ,os_ver string
    ,language string
    ,country string
    ,manufacture string
    ,device_model string
    ,resolution string
    ,net_type string
    ,account string
    ,app_device_id string
    ,mac string
    ,android_id string
    ,imei string
    ,cid_sn string
    ,build_num string
    ,mobile_data_type string
    ,promotion_channel string
    ,carrier string
    ,city string
    ,user_id string
    ) partitioned BY (day string) row format delimited fields terminated BY '\001';

1.2 From the current day's partition of the raw ods_app_log table, extract the day's active users and insert them into the daily-active-user table etl_user_active_day.
Note: for each active user, keep only the earliest record of that day (hence the row_number() window below);

INSERT INTO TABLE etl_user_active_day PARTITION (day = '2019-07-22')
SELECT sdk_ver
    ,time_zone
    ,commit_id
    ,commit_time
    ,pid
    ,app_token
    ,app_id
    ,device_id
    ,device_id_type
    ,release_channel
    ,app_ver_name
    ,app_ver_code
    ,os_name
    ,os_ver
    ,LANGUAGE
    ,country
    ,manufacture
    ,device_model
    ,resolution
    ,net_type
    ,account
    ,app_device_id
    ,mac
    ,android_id
    ,imei
    ,cid_sn
    ,build_num
    ,mobile_data_type
    ,promotion_channel
    ,carrier
    ,city
    ,user_id
FROM (
    SELECT *
        ,row_number() OVER (
            PARTITION BY user_id ORDER BY commit_time
            ) AS rn
    FROM ods_app_log
    WHERE day = '2019-07-22'
    ) tmp
WHERE rn = 1;

Dimension statistics

Create the dimension-statistics result table dim_user_active_day

DROP TABLE dim_user_active_day;
CREATE TABLE dim_user_active_day (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts INT
    ) partitioned BY (
    day string
    ,dim string
    );

Use Hive's multi-insert syntax to compute the daily-active-user count for every dimension combination and load each result into the corresponding partition of the dimension stats table;

FROM etl_user_active_day

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0000'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1000'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (os_name)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0100'
    )
SELECT 'all'
    ,city
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (city)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0010'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (release_channel)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0001'
    )
SELECT 'all'
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY (app_ver_name)

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1100'
    )
SELECT os_name
    ,city
    ,'all'
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1010'
    )
SELECT os_name
    ,'all'
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1001'
    )
SELECT os_name
    ,'all'
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0110'
    )
SELECT 'all'
    ,city
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0101'
    )
SELECT 'all'
    ,city
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0011'
    )
SELECT 'all'
    ,'all'
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '0111'
    )
SELECT 'all'
    ,city
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1011'
    )
SELECT os_name
    ,'all'
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1101'
    )
SELECT os_name
    ,city
    ,'all'
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,app_ver_name

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1110'
    )
SELECT os_name
    ,city
    ,release_channel
    ,'all'
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel

INSERT INTO TABLE dim_user_active_day PARTITION (
    day = '2019-07-21'
    ,dim = '1111'
    )
SELECT os_name
    ,city
    ,release_channel
    ,app_ver_name
    ,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel,app_ver_name
;
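
As an aside, on Hive 0.10 or later the same 16 combinations could in principle be computed in a single pass with WITH CUBE instead of 16 separate inserts; a sketch only, not part of the original workflow (note that genuine NULL values in the data would also show up as 'all'):

SELECT nvl(os_name, 'all')
    ,nvl(city, 'all')
    ,nvl(release_channel, 'all')
    ,nvl(app_ver_name, 'all')
    ,count(1) AS cnts
FROM etl_user_active_day
WHERE day = '2019-07-21'
GROUP BY os_name
    ,city
    ,release_channel
    ,app_ver_name
WITH CUBE;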

Daily new user statistics

Daily new users: users who appear for the first time on a given day, i.e., that day's newly added users.
Approach: a. maintain a history table of all known users (storing only user_id);
b. compare the day's active users against the history table -- anyone not in it is a new user for that day;
c. append the day's new users to the history table.

History user table

create table etl_user_history(user_id string);

Daily new-user table

It stores all fields (one record per user, the earliest of the day) and has a partition column day string;
create table etl_user_new_day like etl_user_active_day;

1 Today's active users minus the history table --> today's partition of the new-user table
insert into etl_user_new_day partition(day='2019-07-21')
SELECT sdk_ver
    ,time_zone
    ,commit_id
    ,commit_time
    ,pid
    ,app_token
    ,app_id
    ,device_id
    ,device_id_type
    ,release_channel
    ,app_ver_name
    ,app_ver_code
    ,os_name
    ,os_ver
    ,LANGUAGE
    ,country
    ,manufacture
    ,device_model
    ,resolution
    ,net_type
    ,account
    ,app_device_id
    ,mac
    ,android_id
    ,imei
    ,cid_sn
    ,build_num
    ,mobile_data_type
    ,promotion_channel
    ,carrier
    ,city
    ,a.user_id
from  etl_user_active_day a left join  etl_user_history b on a.user_id = b.user_id
where a.day='2019-07-21' and b.user_id is null;

2 Append the new users' user_id values to the history table

insert into table etl_user_history
select user_id from etl_user_new_day where day='2019-07-21';

Daily new users: dimension statistics report

Approach: from the daily-new ETL table, count new users for each dimension combination.
Dimensions:
os_name city release_channel app_ver_name
Dimension combination matrix
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1

1 Daily-new dimension report -- data modeling
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);

2 Daily-new dimension report SQL (multi-insert syntax)

from etl_user_new_day

insert into table dim_user_new_day partition(day='2019-07-21',dim='0000')
select 'all','all','all','all',count(1)
where day='2019-07-21'


insert into table dim_user_new_day partition(day='2019-07-21',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2019-07-21'
group by app_ver_name



insert into table dim_user_new_day partition(day='2019-07-21',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2019-07-21'
group by release_channel


insert into table dim_user_new_day partition(day='2019-07-21',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2019-07-21'
group by release_channel,app_ver_name


insert into table dim_user_new_day partition(day='2019-07-21',dim='0100')
select 'all',city,'all','all',count(1)
where day='2019-07-21'
group by city

;
(The remaining dimension combinations follow the same multi-insert pattern.)

Next-day retention analysis

Concept: users who were new yesterday and are still active today.

Logic: a user who appears in yesterday's new-user table and also in today's active-user table is one of today's "next-day retained users".

Data modeling

Create the next-day retention ETL table

It records the same fields as the active-user table:
create table etl_user_keepalive_nextday like etl_user_active_day;

ETL development
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select
     actuser.sdk_ver 
    ,actuser.time_zone 
    ,actuser.commit_id 
    ,actuser.commit_time 
    ,actuser.pid 
    ,actuser.app_token 
    ,actuser.app_id 
    ,actuser.device_id 
    ,actuser.device_id_type 
    ,actuser.release_channel 
    ,actuser.app_ver_name 
    ,actuser.app_ver_code 
    ,actuser.os_name 
    ,actuser.os_ver 
    ,actuser.language 
    ,actuser.country 
    ,actuser.manufacture 
    ,actuser.device_model 
    ,actuser.resolution 
    ,actuser.net_type 
    ,actuser.account 
    ,actuser.app_device_id 
    ,actuser.mac 
    ,actuser.android_id 
    ,actuser.imei 
    ,actuser.cid_sn 
    ,actuser.build_num 
    ,actuser.mobile_data_type 
    ,actuser.promotion_channel 
    ,actuser.carrier 
    ,actuser.city 
    ,actuser.user_id 
from etl_user_new_day newuser join etl_user_active_day actuser
on newuser.user_id = actuser.user_id
where newuser.day='2019-07-21' and actuser.day='2019-07-22';
Drop the data of a specific partition (useful when re-running the job)

ALTER TABLE etl_user_keepalive_nextday DROP IF EXISTS PARTITION(day='2019-07-22');

A LEFT SEMI JOIN is slightly more efficient:
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select 
 sdk_ver 
,time_zone 
,commit_id 
,commit_time 
,pid 
,app_token 
,app_id 
,device_id 
,device_id_type 
,release_channel 
,app_ver_name 
,app_ver_code 
,os_name 
,os_ver 
,language 
,country 
,manufacture 
,device_model 
,resolution 
,net_type 
,account 
,app_device_id 
,mac 
,android_id 
,imei 
,cid_sn 
,build_num 
,mobile_data_type 
,promotion_channel 
,carrier 
,city 
,user_id 
from etl_user_new_day a left semi join etl_user_active_day b
on a.user_id = b.user_id and a.day='2019-07-21' and b.day='2019-07-22';

Note: with a LEFT SEMI JOIN, the right table (b) cannot be referenced in the WHERE clause; that is why the day filters are placed in the ON clause above.

Dimension statistics

Use the multi-insert syntax, following the same pattern as the daily-active dimension report; a sketch follows below.
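
A minimal sketch, assuming a result table dim_user_keepalive_nextday modeled after dim_user_active_day (this table is not defined in the original post):

create table dim_user_keepalive_nextday (
    os_name string
    ,city string
    ,release_channel string
    ,app_ver_name string
    ,cnts int
    ) partitioned by (day string, dim string);

from etl_user_keepalive_nextday

insert into table dim_user_keepalive_nextday partition(day='2019-07-22',dim='0000')
select 'all','all','all','all',count(1)
where day='2019-07-22'

insert into table dim_user_keepalive_nextday partition(day='2019-07-22',dim='1000')
select os_name,'all','all','all',count(1)
where day='2019-07-22'
group by os_name
;

(The remaining dimension combinations follow the same pattern.)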

Version upgrade trajectory analysis

Solve it with Hive's window (analytic) functions. Sample data (day, user_id, app_token, release_channel, city, app_ver_name):
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,小米应用,天津,v2.0

Solution:

create table t_lag_test(day string,user_id string,app_token string,release_channel string,city string,app_ver_name string)
row format delimited fields terminated by ',';

load data local inpath '/home/lys/ver.test' into table t_lag_test;

select 
day,user_id,app_token,release_channel,city,ver_2,app_ver_name
from
(
select
day,user_id,app_token,release_channel,city,app_ver_name,
lag(app_ver_name,1,null) over(partition by user_id order by app_ver_name) as ver_2
from t_lag_test) tmp
where ver_2 is not null and app_ver_name>ver_2 
;

Supplement: Hive window (analytic) functions

Test data

+----------------+---------------------------------+-----------------------+--------------+--+
|  t_access.ip   |          t_access.url           | t_access.access_time  | t_access.dt  |
+----------------+---------------------------------+-----------------------+--------------+--+
| 192.168.33.3   | http://www.edu360.cn/stu        | 2017-08-04 15:30:20   | 20170804     |
| 192.168.33.3   | http://www.edu360.cn/teach      | 2017-08-04 15:35:20   | 20170804     |
| 192.168.33.4   | http://www.edu360.cn/stu        | 2017-08-04 15:30:20   | 20170804     |
| 192.168.33.4   | http://www.edu360.cn/job        | 2017-08-04 16:30:20   | 20170804     |
| 192.168.33.5   | http://www.edu360.cn/job        | 2017-08-04 15:40:20   | 20170804     |
| 192.168.33.3   | http://www.edu360.cn/stu        | 2017-08-05 15:30:20   | 20170805     |
| 192.168.44.3   | http://www.edu360.cn/teach      | 2017-08-05 15:35:20   | 20170805     |
| 192.168.33.44  | http://www.edu360.cn/stu        | 2017-08-05 15:30:20   | 20170805     |
| 192.168.33.46  | http://www.edu360.cn/job        | 2017-08-05 16:30:20   | 20170805     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-05 15:40:20   | 20170805     |
| 192.168.133.3  | http://www.edu360.cn/register   | 2017-08-06 15:30:20   | 20170806     |
| 192.168.111.3  | http://www.edu360.cn/register   | 2017-08-06 15:35:20   | 20170806     |
| 192.168.34.44  | http://www.edu360.cn/pay        | 2017-08-06 15:30:20   | 20170806     |
| 192.168.33.46  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
| 192.168.33.46  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.25  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
| 192.168.33.36  | http://www.edu360.cn/excersize  | 2017-08-06 16:30:20   | 20170806     |
| 192.168.33.55  | http://www.edu360.cn/job        | 2017-08-06 15:40:20   | 20170806     |
+----------------+---------------------------------+-----------------------+--------------+--+

View the test data:
select * from t_access;

The LAG function

What it does: within each partition (defined by PARTITION BY) and ordering, lag(col, n, default) returns the value of col from n rows earlier; the first n rows of the partition, which have no earlier row, get the default value instead.
Typical use: version upgrade trajectory queries (compare each record's version with the previous one).

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;


The LEAD function

The opposite of LAG: it returns the value from n rows later in the partition.

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time) as next_access_time
from t_access;


The FIRST_VALUE function

Example: the first page each user (IP) visited

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as first_url
from t_access;


The LAST_VALUE function

Example: the last page each user (IP) visited

select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as last_url
from t_access;


The sum() over() function

Cumulative report -- window-function implementation

select id
,month
,sum(amount) over(partition by id order by month rows between unbounded preceding and current row)
from
(select id,month,
sum(fee) as amount
from t_test
group by id,month) tmp;
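
The query above assumes a fee table that is not shown in the post; a minimal sketch of that assumption:

create table t_test(id string, month string, fee int)
row format delimited fields terminated by ',';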

Ranking / sequence-number functions

row_number() over(): rows that compare equal still get distinct, arbitrarily ordered sequence numbers.
rank() over(): rows that compare equal share the same rank, and the following ranks are skipped.
dense_rank() over(): rows that compare equal share the same rank, and the following ranks are not skipped.
ntile(n) over(): splits the ordered rows into n buckets of roughly equal size and numbers them 1..n -- useful, for example, to find the people in the top 1/3 of a salary ranking (see the sketch after the table below).

score  row_number  rank  dense_rank  ntile(3)
89     1           1     1           1
90     2           2     2           1
90     3           2     2           1
91     4           4     3           2
92     5           5     4           2
92     6           5     4           2
93     7           7     5           3
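
For the "top 1/3 of the salary ranking" question above, a minimal sketch assuming a hypothetical table t_salary(name string, salary int):

select name, salary
from (
    select name, salary,
        ntile(3) over(order by salary desc) as nt
    from t_salary
    ) tmp
where nt = 1;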
