HiveSQL基础之常用函数_sql exception hive when parsing datetime string in-优快云博客

本文链接：https://blog.youkuaiyun.com/FDFCJBAFSA/article/details/99683164

1. 如何把时间戳转换成日期

语法

from_unixtime(bigint unixtime, string format)

format

1. yyyy-MM-dd hh:mm:ss

2. yyyy-MM-dd hh

3. yyyy-MM-dd hh:mm

4. yyyyMMdd

举例

SELECT 
from_unixtime(paytime,'yyyy-MM-dd hh:mm:ss')
FROM user_trade
WHERE dt='2019-04-09';

拓展：把日期转换为时间戳
unix_timestamp

SELECT 
unix_timestamp(firstactivetime)
FROM user_info
WHERE level=5;

2. 如何计算日期间隔

语法

datediff(string enddate, string startdate)

举例：用户首次激活时间，与2019年5月1日的日期间隔

SELECT user_name,
	   datediff('2019-05-01', to_date(firstactivetime))
FROM user_info
LIMIT 10;

拓展：日期增加，减少函数
语法

date_add(string startdate, int days)
date_sub (string startdate, int days)

举例

SELECT user_name,
	   date_add(firstactivetime,7)
FROM user_info
LIMIT 10;

3. case when 条件函数

统计以下4个年龄段20岁以下，20-30岁，30-40岁，40岁以上的用户分布

SELECT case when age<20 then '20岁以下'
			when age<30 then '20-30岁'
			when age<40 then '30-40岁'
			else '40岁以上' end,
		count(user_id) as user_num
FROM user_info
GROUP BY case when age<20 then '20岁以下'
			when age<30 then '20-30岁'
			when age<40 then '30-40岁'
			else '40岁以上' end;

4. if函数

统计每个性别用户等级高低的分布情况

SELECT sex,
	   if(level>5,'高','低'),
	   count(user_id) as user_num
FROM user_info
GROUP BY sex,
		 if(level>5,'高','低');

5. 字符串函数

语法

substr(string A, int start, int len)

每个月新激活用户数

SELECT substr(firstactivetime,1,7) as month,
	   count(user_id) as user_num
FROM user_info
GROUP BY substr(firstactivetime,1,7);

语法

get_json_object(string json_string, string path)

不同手机品牌的用户数

SELECT get_json_object(extra1,'$.phonebrand') as phone_brand,
	   count(user_id) as user_num
FROM user_info
GROUP BY get_json_object(extra1,'$.phonebrand');

不同手机品牌学历分布用户数

SELECT extra2['phonebrand'] as phone_brand,
	   extra2['education'] as education,
	   count(user_id) as user_num
FROM user_info
GROUP BY extra2['phonebrand'],extra2['education'];

6. 聚合统计函数

ELLA用户2018年平均支付金额，以及2018年最大支付日期和最小支付日期的时间间隔

SELECT avg(pay_amount) as avg_amount,
	   datediff(max(from_unixtime(pay_time,'yy-MM-dd')),
	   min(from_unixtime(pay_time,'yy-MM-dd')))
FROM user_trade
WHERE year(dt)='2018'
	  and user_name='ELLA';

# 另一种写法
SELECT avg(pay_amount) as avg_amount,
	   datediff(from_unixtime(max(pay_time),'yy-MM-dd'),
	   from_unixtime(min(pay_time),'yy-MM-dd'))
FROM user_trade
WHERE year(dt)='2018'
	  and user_name='ELLA';

7. 综合练习

2018年购买的商品品类在两个以上的用户数

SELECT count(a.user_name)
FROM (SELECT user_name,
	   count(distinct goods_category) as category_num
FROM user_trade
WHERE year(dt)='2018'
GROUP BY user_name HAVING count(distinct goods_category) > 2) as a;

用户激活时间在2018年，年龄段在20-30岁和30-40岁的婚姻状况分布

SELECT a.age_type, 
	   if(marriage_status=1,'已婚','未婚'),
	   count(a.user_id)
FROM
	(SELECT case when age<20 then '20岁以下'
				when age<30 then '20-30岁'
		        when age<40 then '30-40岁'
		        else '40岁以上' end as age_type,
		   get_json_object(extra1, '$.marriage_status') as marriage_status,
		   user_id
	FROM user_info
	WHERE year(firstactivetime)='2018') as a
WHERE a.age_type in ('20-30岁', '30-40岁')
GROUP BY a.age_type, 
	   if(marriage_status=1,'已婚','未婚');

激活天数距今超过300天的男女分布情况

SELECT sex,
	   count(user_id)
FROM user_info
WHERE datediff('2019-08-07', substr(firstactivetime,1,10))>300
GROUP BY sex;

不同性别，教育程度的分布情况

SELECT sex,
	   get_json_object(extra1,'$.education'),
	   count(user_id)
FROM user_info
GROUP BY sex,
	     get_json_object(extra1,'$.education');

2019年1月1日到2019年4月30日，每个时段的不同品类购买金额分布

select substr(from_unixtime(pay_time,'yyyy-MM-dd hh'),12),
       goods_category,
       sum(pay_amount)
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by substr(from_unixtime(pay_time,'yyyy-MM-dd hh'),12),
         goods_category;