hive视频日志转换格式

最新推荐文章于 2024-11-29 08:54:00 发布

原创最新推荐文章于 2024-11-29 08:54:00 发布

· 156 阅读

0 ·

版权

文章标签：

#hive

hive 专栏收录该内容

17 篇文章

订阅专栏

# Open the current user's crontab for editing.
crontab -e
# Crontab entries. Crontab has no trailing-comment syntax: text after the command would be
# passed to it as arguments, so every note lives on its own '#' line.
# Minute 1 of every hour: run the copy script.
1 */1 * * * sh /opt/cp_movie_data.sh
# Minute 3 of every hour: run the upload script, appending stdout and stderr to the log.
# It normally depends on the copy job above having finished.
3 */1 * * * nohup sh /opt/up_movie_data.sh >> /opt/local/log.log 2>&1 &

# Create the upload script in /opt; the lines after `vi test.sh` are presumably the file's contents.
cd /opt
vi test.sh
#!/bin/bash
# NOTE(review): the line below looks truncated by text extraction (empty `datetime=` assignment,
# unbalanced double quote). Presumably it originally set `datetime` from the `date` command and
# ran a Hive load into `$shujuku.table_name partition(dt=$datetime)` — TODO restore from the
# original script before using it.
datetime= $shujuku.table_name partition(dt=$ datetime) "
作用：每过一小时就上传一下，把同一时间段的作为一个分区

-- Movie log table that receives the periodic uploads.
-- Each upload is loaded with `partition(dt=...)`, so `dt` has to be declared as a partition
-- column rather than a regular column: Hive rejects a partition spec on a non-partitioned
-- table, and a partition column may not also appear in the regular column list.
create table if not exists movie (
    stat_data    string,
    userid       string,
    uid          string,
    version      string,
    country      string,
    province     string,
    movie_tryvv  int,
    movie_sucvv  int,
    movie_ptime  int
)
partitioned by (dt string);

1.首先put数据，用自己写的shell(假设：因为是网站每隔1小时就刷新一下)
#!/bin/bash
# NOTE(review): the line below looks truncated by text extraction (empty `datetime=` assignment,
# unbalanced double quote). Presumably it originally set `datetime` from the `date` command and
# ran a Hive load into `$shujuku.table_name partition(dt=$datetime)` — TODO restore from the
# original script before using it.
datetime= $shujuku.table_name partition(dt=$ datetime) "

2.需求：每3天的平均消费，其输出结果格式为：2019-01-01~2019-01-03 20（下文的演示以7天为一个周期、以2019-6-1为基准日期；若按3天统计，把语句中的 /7、*7 换成 /3、*3，把 +6 换成 +2 即可）
table_name: test
date_time cost

datatime //这条是不规范数据
2019/1/1,10
2019/1/2,20
2019/1/3,30
2019/1/4,40
2019/1/5,50
2019/1/6,60
…

思路：
//首先split
-- split() requires the delimiter as its second argument and returns an array of strings.
select split('2019/1/1', '/'); -- => ["2019","1","1"]
//然后concat_ws
-- concat_ws() takes the array returned by split() directly; a nested `select` is not valid
-- inside a function call.
select concat_ws('-', split('2019/1/1', '/')); -- => 2019-1-1

datediff:日期比较函数
语法：datediff(string enddate,string startdate)
返回值：int
说明：返回结束日期减去开始日期的天数
date_add --加
语法：date_add(string startdate,int days)
返回值：string
说明：返回开始日期加上days天后的日期

步骤：
-- Rewrite 'yyyy/m/d' as 'yyyy-m-d': split on '/', then re-join the parts with '-'.
-- String literals use plain single quotes; typographic quotes are not valid SQL.
select concat_ws('-', split(date_time, '/')) from test limit 2;
2019-6-1 //这里数据格式为2019-6-1 如果为 2019-06-01,后面就会出现这种格式2019-06-01
2019-6-2

-- Number of days between each row's date and the base date 2019-6-1.
-- A row whose date_time is not a date (e.g. a header line in the data) yields null.
select datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') from test limit 10;
null //数据中的不规范数据造成的 —>解决：先过滤，后计算
0
1
2
…
8

-- Dividing the day offset by 7 gives a fractional value whose integer part is the
-- 0-based index of the 7-day period the row falls into.
select datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7 from test limit 10;
null
0.0
0.1428…
0.2857…
0.4285…
0.5714…
0.7142…
0.8571…
1.0
1.1428…
…

//注意 floor(double a)的参数是double,返回的是bigint
-- floor() turns the quotient into the 0-based period index.
-- The base date is 2019-6-1, the same one every other step uses; with a different base date
-- the first week of the data would not get index 0.
select floor(datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7) from test limit 10;
null
0
0
0
0
0
0
0
1
1
…

//转换数据类型为int：cast(表达式 as int)（floor返回bigint，而date_add的days参数需要int）
-- First day of the 7-day period each row belongs to: base date + period_index * 7 days.
-- The conversion function is cast(expr as int), not case(); the whole offset expression
-- goes inside the cast because floor() yields a bigint and date_add() expects an int.
select date_add('2019-6-1', cast(floor(datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7) * 7 as int)) from test limit 10;

null
2019-6-1
2019-6-1
2019-6-1
2019-6-1
2019-6-1
2019-6-1
2019-6-1
2019-6-8
2019-6-8
…

-- Label every row with its 7-day period as 'start~end'.
-- start = base date + period_index * 7 days, end = start + 6 days.
-- Offsets are wrapped in cast(... as int) because floor() yields a bigint.
select concat(
    date_add('2019-6-1', cast(floor(datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7) * 7 as int)),
    '~',
    date_add('2019-6-1', cast(floor(datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7) * 7 + 6 as int))
) as date_range
from test;

null
2019-06-01~2019-06-07
…
2019-06-01~2019-06-07
2019-06-08~2019-06-14
…
2019-06-08~2019-06-14
2019-06-15~2019-06-21
…
2019-06-15~2019-06-21
2019-06-22~2019-06-28
…
2019-06-22~2019-06-28
2019-06-29~2019-07-05
…
2019-06-29~2019-07-05
…

//如果加上cost求平均的话
-- Average cost per 7-day period, one output row per period, labelled 'start~end'.
-- The period index is computed once and reused, instead of repeating the full expression
-- in both the select list and the group by.
with indexed as (
    -- 0-based index of the 7-day period, counted from the base date 2019-6-1.
    -- Cast to int because floor() yields a bigint and date_add() expects an int offset.
    select
        cast(floor(datediff(concat_ws('-', split(date_time, '/')), '2019-6-1') / 7) as int) as period_idx,
        cost
    from test
),
labelled as (
    select
        concat(
            date_add('2019-6-1', period_idx * 7),
            '~',
            date_add('2019-6-1', period_idx * 7 + 6)
        ) as date_range,
        cost
    from indexed
    -- Filter first, compute afterwards: rows whose date_time is not a valid date get a
    -- null index and would otherwise form a spurious null group.
    where period_idx is not null
)
select
    date_range,
    avg(cost) as avg_cost
from labelled
group by date_range;

//假设用的数据格式为2019-06-01，这样最终输出数据好看
2019-06-01~2019-06-07 12.26263 //avg_cost数值可以使用round保留自己想要的位数
2019-06-08~2019-06-14 15.51532 //计算avg的语句可以改成
2019-06-15~2019-06-21 18.91532 //round(avg(cost),2)，结果如下
2019-06-22~2019-06-28 15.0
2019-06-29~2019-07-05 15.25132
…

2019-06-01~2019-06-07 12.26
2019-06-08~2019-06-14 15.52
2019-06-15~2019-06-21 18.92
…

//取整函数round
语法：round(double a)
返回值：bigint
说明：返回double类型的整数值部分（遵循四舍五入）
举例：
select round(3.1415);
3
select round(3.5415);
4

//指定精度取整函数：round
语法：round(double a,int d)
返回值：double
说明：返回指定精度d的double类型
举例：
-- Round to 4 decimal places (a stray character after the terminator was removed).
select round(3.1415926, 4);
3.1416