hive中常用函数汇总

你补药过来啊

于 2025-03-12 11:58:26 发布

阅读量1.8k

点赞数 36

文章标签： hive hadoop 数据仓库 sql 大数据

本文链接：https://blog.youkuaiyun.com/weixin_45943747/article/details/146199868

版权

持续更新中…

0.数据准备

首先为后面测试函数准备数据表

-- todo 数据准备-----
-- 创建数据库
create database hql_function;
-- 使用数据库
use hql_function;
-- 创建表格
create table tab_function
(
    id int,
    name string,
    price float,
    category int,
    dates date,
    years array<string>
);

INSERT INTO tab_function (id, name, price, category, dates, years)
VALUES
    (1, '苹果', 99.55, 1, '2024-01-01', array('2024')),
    (2, '电视机', 2999.99, 2, '2024-02-03', array('2023', '2024')),
    (3, '冰箱', 4599.60, 1, '2023-03-15', array('2022', '2023', '2024')),
    (4, '洗衣机', 1889.77, 3, '2025-04-12', array('2024', '2025', '2026')),
    (5, '电饭煲', 299.99, 2, '2022-05-20', array('2022')),
    (6, '微波炉', 499.50, 1, '2024-06-25', array('2023', '2024', '2025')),
    (7, '空调', 3599.30, 3, '2023-07-11', array('2021', '2022', '2023', '2024')),
    (8, '耳机', 159.25, 2, '2025-08-05', array('2025', '2026')),
    (9, '笔记本电脑', 4999.99, 1, '2022-09-19', array('2021', '2022', '2023')),
    (10, '手机', 2499.75, 3, '2024-10-06', array('2023', '2024')),
    (11, '扫地机器人', 899.99, 2, '2025-11-03', array('2023', '2024', '2025')),
    (12, '投影仪', 1499.45, 1, '2023-12-17', array('2022', '2023')),
    (13, '饮水机', 399.60, 3, '2024-01-22', array('2024')),
    (14, '电动牙刷', 199.99, 2, '2025-02-27', array('2024', '2025', '2026')),
    (15, '智能手表', 1299.80, 1, '2022-03-29', array('2021', '2022', '2023')),
    (16, '电风扇', 159.99, 3, '2024-04-16', array('2023')),
    (17, '咖啡机', 899.90, 2, '2023-05-13', array('2023', '2024')),
    (18, '路由器', 239.75, 1, '2025-06-21', array('2025', '2026', '2027')),
    (19, '蓝牙音响', 350.99, 3, '2022-07-30', array('2021')),
    (20, '游戏主机', 2899.60, 2, '2024-08-14', array('2023', '2024', '2025', '2026'));
-- 查看数据
select * from tab_function;

1.查看函数基本信息

查看hive中所有函数

-- 查看hive中所有函数
show functions ;

查看指定函数的信息

-- 查看指定函数的信息
desc function split;

查看指定函数详细扩展信息

-- 查看指定函数详细扩展信息(有对应示例)
desc function extended split;
SELECT split('one,two,three', ',');

2.字符串操作函数

字符串常见的函数:
split: 分割字符串（按指定字符/正则）
concat: 字符串紧凑拼接到一起生成新字符串
concat_ws: 字符串用指定分隔符拼接到一起生成新字符串
length: 获取字符串长度
lower: 把字符串中的字母全部变成小写
upper: 把字符串中的字母全部变成大写
trim: 把字符串两端的空白去除
拓展字符串函数
substr: 截取字符串
replace: 替换字符串
regexp_replace: 正则替换字符串
parse_url: 解析url(统一资源定位符) 组成: 协议/主机地址:端口号/资源路径?查询参数
get_json_object: 获取json对象解析对应数据

-- 已知字符串'苹果-香蕉-西瓜-哈密瓜-火龙果-榴莲'要求切割放到容器中（分割函数）
select split('苹果-香蕉-西瓜-哈密瓜-火龙果-榴莲', '-'); -- ["苹果","香蕉","西瓜","哈密瓜","火龙果","榴莲"]
-- concat: 字符串紧凑拼接到一起生成新字符串
select concat('abc', '666', '888');
-- concat_ws: 字符串用指定分隔符拼接到一起生成新字符串
select concat_ws('-', 'abc', '666', '888');
-- length: 获取字符串长度
select length('abc-666');
-- lower: 把字符串中的字母全部变成小写
select lower('AbC-666');
-- upper: 把字符串中的字母全部变成大写
select upper('aBc-666');
-- trim: 把字符串两端的空白去除
select trim('   abc 666  ');

-- substr(字符串,开始索引,截取长度): 截取字符串
-- 注意: 正索引从1开始正着数  负索引从-1开始负着数
select substr('abcd666',1,2);
select substr('abcd666',1); -- 默认从1索引位置到最后
select substr('abcd666',-4);-- 默认从-4索引位置到最后

-- 已知'2023-05-21'要求分别截取年月日
select substr('2023-05-21',1,4); -- 结果2023
select substr('2023-05-21',6,2); -- 结果05
select substr('2023-05-21',-2,2); -- 结果21

-- replace(大字符串,敏感词,替换后的内容):替换字符串
select replace('你TMD哦','TMD','***');

--正则表达式替换函数：regexp_replace(str, regexp, rep)
select regexp_replace('abcd-666', '\\d+', 'num');
--正则表达式解析函数：regexp_extract(str, regexp[, idx])
-- 正则中()代表分组,自动从1开始生成编号,提取正则匹配到的指定组内容
select regexp_extract('abc-666-888-def-111', '(\\d+)-(\\d+)', 0); -- 0:输出全部 666-888
select regexp_extract('abc-666-888-def-111', '(\\d+)-(\\d+)', 1); -- 1:输出第一个 666
select regexp_extract('abc-666-888-def-111', '(\\d+)-(\\d+)', 2); -- 888

--URL解析函数：parse_url 注意要想一次解析出多个 可以使用parse_url_tuple这个UDTF函数
-- URL: 统一资源定位符 也就是咱们常说的网址   组成: 协议 主机地址:端口号 资源路径 查询参数
select parse_url('http://www.baidu.com/path/aaa.html?user=aaa&pwd=123', 'HOST');
select parse_url('http://www.baidu.com/path/aaa.html?user=aaa&pwd=123', 'PATH');
select parse_url('http://www.baidu.com/path/aaa.html?user=aaa&pwd=123', 'QUERY');
select parse_url('http://www.baidu.com/path/aaa.html?user=aaa&pwd=123', 'QUERY', 'user');
select parse_url('http://www.baidu.com/path/aaa.html?user=aaa&pwd=123', 'QUERY', 'pwd');

-- json解析函数：get_json_object(json_txt, path),
-- 细节: 整个json字符串用单引号'包裹, json字符串中的键, 值用双引号"包裹.
-- json字符串的格式: {键:值, 键: 值}
-- json数组的格式: [{键:值, 键: 值}, {键:值, 键: 值}, {键:值, 键: 值}]      -- 索引从 0 开始.
select get_json_object('{"name":"杨过", "age":"18"}', '$.name');      -- 杨过, $表示json对象
select get_json_object('[{"name":"杨过", "age":"18"}, {"name":"小龙女", "age":"26"}]', '$.[0]'); -- {"name":"杨过", "age":"18"}
select get_json_object('[{"name":"杨过", "age":"18"}, {"name":"小龙女", "age":"26"}]', '$.[1].name'); -- 小龙女,   $表示json对象

3.复杂函数

hive复杂类型: array struct map
array类型: 又叫数组类型,存储同类型的单数据的集合
----取值: 字段名[索引] 注意: 索引从0开始
----获取长度的函数: size(array字段名) 常用
----判断是否包含某个数据的函数: array_contains(字段名,某数据) 常用
----对数组进行排序的函数: sort_array(数组)

struct类型: 又叫结构类型,可以存储不同类型单数据的集合
-----取值: 字段名.子字段名n

map类型: 又叫映射类型,存储键值对数据的映射(根据key找value)
----取值: 字段名[key]
----获取长度的函数: size(字段名) 常用
----获取所有key的函数: map_keys() 常用
----获取所有value的函数: map_values() 常用

-- 创建数组类型
select array('binzi','666','888');
-- 统计长度
select size(array('binzi','666','888'));
-- 判断某个成员在不在数组中
select array_contains(array('binzi','666','888'),'binzi');
-- 排序
select sort_array(array(3,1,5,2,4)); -- [1,2,3,4,5]

-- 创建map(键值对形式)
select map('a',1,'b',2,'c',3);
-- 统计长度
select size(map('a',1,'b',2,'c',3));
-- 获取键的列表
select map_keys(map('a',1,'b',2,'c',3));-- ["a","b","c"]
-- 获取值的列表
select map_values(map('a',1,'b',2,'c',3));-- [1,2,3]

4.日期操作函数

unix_timestamp: 获取unix时间戳(时间原点到现在的秒/毫秒)
----可以使用yyyyMMdd HH:mm:ss进行格式化转换
current_timestamp: 获取时间原点到现在的秒/毫秒,底层自动转换方便查看的日期格式
to_date: 字符串格式时间戳转日期(年月日)
current_date: 获取当前日期(年月日)

year: 获取指定日期时间中的年
quarter: 获取指定日期所属的季度
month:获取指定日期时间中的月
day:获取指定日期时间中的日
hour:获取指定日期时间中的时
minute:获取指定日期时间中的分
second:获取指定日期时间中的秒

dayofmonth: 获取指定日期时间中的月中第几天
dayofweek:获取指定日期时间中的周中第几天
quarter:获取指定日期时间中的所属季度
weekofyear:获取指定日期时间中的年中第几周

datediff: 获取两个指定时间的差值
date_add: 在指定日期时间上加几天
date_sub: 在指定日期时间上减几天

from_unixtime: 把unix时间戳转换为日期格式的时间
---- 如果传入的参数是0,获取的是时间原点1970-01-01 00:00:00

-- 获取当前时间戳(时间原点到现在的秒/毫秒)
select unix_timestamp();    -- 1741695416
select current_timestamp(); -- 自动转换 2025-03-11 20:16:34.608000000
-- 获取当前日期
select current_date(); -- 2025-03-11
-- 字符串格式时间戳转日期
select to_date('2025-03-11 20:16:34.608000000');
select to_date(current_timestamp());
-- 依次获取年月日时分秒
select year('2025-03-11 20:16:34.608000000');
select month('2025-03-11 20:16:34.608000000');
select day('2025-03-11 20:16:34.608000000');

select hour('2025-03-11 20:16:34.608000000');
select minute('2025-03-11 20:16:34.608000000');
select second('2025-03-11 20:16:34.608000000');
-- 依次获取月中第几天,周中第几天,季度,年中第几周
select dayofmonth('2025-03-11 20:16:34.608000000');
select dayofweek('2025-03-11 20:16:34.608000000');  -- 周日为一周中的第一天
select quarter('2025-03-11 20:16:34.60800000');
select weekofyear('2025-03-11 20:16:34.608000000');

-- 计算时间差
select datediff('2025-03-11', '2023-05-21'); -- 前减后

-- 获取明天的日期
select date_add(current_timestamp(),1);
select date_sub(current_timestamp(),-1);
-- 获取昨天的日期
select date_sub(current_timestamp(),1);
select date_add(current_timestamp(),-1);

-- 拓展
-- UNIX时间戳函为1970-1-1距今的秒数
--获取当前UNIX时间戳函数: unix_timestamp
select unix_timestamp();
--字符串日期转UNIX时间戳函数: unix_timestamp
select unix_timestamp("2025-03-11 20:27:07");

--指定格式日期转UNIX时间戳函数: unix_timestamp
select unix_timestamp('20250311 20:28:56','yyyyMMdd HH:mm:ss');

--UNIX时间戳转日期函数: from_unixtime
select from_unixtime(1741724936); -- 2025-03-11 20:28:56
-- 获取时间原点日期
select from_unixtime(0); -- 1970-01-01 00:00:00

5.数字操作函数

pi: 生成π结果
round: 指定小数保留位数常用
rand: 生成0-1的随机数
ceil: 向上取整
floor: 向下取整

-- 随机数
select rand();  -- 0-1之间
-- 获取π值
select pi();
-- 四舍五入设置保留位数
select round(pi(),4);
-- 向上取整
select ceil(pi());
-- 向下取整
select floor(pi());

6.条件函数

if(参数1,参数2,参数3): 如果参数1表达式结果为true,就执行参数2内容,否则执行参数3的内容
case…when.then…end: 条件判断类似于编程语言中的if…else if …else…
isnull(数据) : 为空null: true 不为空:false
isnotnull(数据): 不为空: true 为空null:false
nvl(数据,参数2): 如果数据不为空就保留数据本身,如果数据为空null那就用第二个参数替换常用
coalesce(参数1,参数2…): 从左到右依次查找,返回第一个不是null的值,如果找到最后都是null,就返回null

-- if(条件判断,true的时候执行此处,false的时候执行此处)
select if(10 > 5, '真', '假');
select if(10 < 5, '真', '假');
--条件转换函数格式1: CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
select
       case 7
           when 1 then '周一上班'
           when 2 then '周二上班'
           when 3 then '周三上班'
           when 4 then '周四上班'
           when 5 then '周五上班'
           when 6 then '周六休息'
           when 7 then '周日休息'
           else '老弟啊,你是外星人吗?'
       end;

-- 条件转换函数格式2:CASE  WHEN a==b THEN a==c [WHEN a==d THEN a==e]* [ELSE f] END
select
       case
           when 7==1 then '周一上班'
           when 7==2 then '周二上班'
           when 7==3 then '周三上班'
           when 7==4 then '周四上班'
           when 7==5 then '周五上班'
           when 7==6 then '周六休息'
           when 7==7 then '周日休息'
           else '老弟啊,你是外星人吗?'
       end;

-- isnull(数据) 为空: true 不为空:false
select isnull(null); -- true
-- isnotnull(数据) 不为空: true 为空:false
select isnotnull('abab'); -- true

-- nvl(数据,前面的数据是null的时候执行此处): 如果数据不为空打印数据,为空打印第二个参数
select nvl('abab','666');  -- 输出 abab
select nvl(null,'666');  -- 输出 666
-- coalesce(v1,v2...): 从左到右依次查找,返回第一个不是null的值,如果找到最后都是null,就返回null
select coalesce(null,11,22,33);-- 11
select coalesce(null,null,22,33);--22
select coalesce(null,null,null,33);--33
select coalesce(null,null,null,0);--0
select coalesce(null,null,null,null);--null

-- 注意: null不能和其他数值相加,结果会返回null,这时候就需要用nvl或者coalesce把null转为0
select 10+null;
select 10+nvl(null,0);
select 10+coalesce(null,0);

7.类型转换函数

cast(数据 as 要转换的类型) ：十分常用

-- cast: 主要用于类型转换 注意: 转换失败返回null
select cast(3.14 as int); -- 3
select cast(3.14 as string) ; -- '3.14'
select cast('3.14' as float); -- 3.14
select cast('3.14' as int); -- 3
select cast('abc' as int); -- null

-- -- 注意: 很多时候底层都默认做了自动转换
select '3'+3; -- 6

-- 实际应用场景:concat_ws要求被连接的必须是字符串,如果直接用666就报错
select concat_ws('_','abc',666,'888'); --此行报错,因为concat_ws只能拼接字符串类型
select concat_ws('_','abc',cast(666 as string),'888'); -- abc_666_888

8.数据脱敏函数

mask(string, a, b, c): 对string内容进行屏蔽,规则为：大写字母替换为a, 小写字母替换为b, 数字替换为c。 abc默认分别为Xxn
mask_first_n(string, n, a, b, c): 对前n个进行脱敏替换大写字母转换为a，小写字母转换为b，数字转换为c。abc默认分别为Xxn
mask_last_n(string, n, a, b, c):对后n个进行脱敏替换大写字母转换为a，小写字母转换为b，数字转换为c。abc默认分别为Xxn
mask_show_first_n(string, n, a, b, c):除了前n个字符，其余进行掩码处理。大写字母转换为a，小写字母转换为b，数字转换为c。abc默认分别为Xxn
mask_show_last_n(string, n, a, b, c):除了后n个字符，其余进行掩码处理。大写字母转换为a，小写字母转换为b，数字转换为c。abc默认分别为Xxn

--将查询回的数据，大写字母转换为X，小写字母转换为x，数字转换为n。
select mask("abc123DEF");
--自定义替换的字母: 依次为大写小写数字
select mask("abc123DEF",'大','小','数');
select mask("abc123DEF",'/','.','%');

--mask_first_n(string str[, int n]
--对前n个进行脱敏替换 大写字母转换为X，小写字母转换为x，数字转换为n。
select mask_first_n("abc123DEF",8, 'a', 'b', 'c');

--mask_last_n(string str[, int n])
--对后n个进行脱敏替换 大写字母转换为X，小写字母转换为x，数字转换为n。
select mask_last_n("abc123DEF",6);

--mask_show_first_n(string str[, int n])
--除了前n个字符，其余进行掩码处理
select mask_show_first_n("2999.99",2);
--除了后n个字符，其余进行掩码处理
--mask_show_last_n(string str[, int n])
select mask_show_last_n("abc123DEF",6);

9.开窗函数

开窗函数格式:
select … 开窗函数 over(partition by 分组字段名 order by 排序字段名 asc|desc) … from 表名;

聚合开窗函数: 聚合函数(max,min,sum,count,avg) 配合over()使用的时候,这些聚合函数也可以叫窗口函数

排序开窗函数: row_number dense_rank rank
----row_number: 特点: 唯一且连续
----dense_rank: 特点: 并列且连续
----rank : 巧记: 特点: 并列不连续

开窗函数: hive和mysql8都能使用
开窗函数本质在表后新增了一列
聚合开窗函数: max min sum avg count
聚合函数配合over()使用,也可以叫开窗函数

其他开窗函数: ntile lag和lead first_value和last_value

ntile(x)功能: 将分组排序之后的数据分成指定的x个部分（x个桶）
注意ntile规则:尽量平均分配，优先满足最小(编号1)的桶，彼此最多不相差1个。

lag: 用于统计窗口内往上第n行值
lead:用于统计窗口内往下第n行值

first_value: 取分组内排序后，截止到当前行，第一个值
last_value : 取分组内排序后，截止到当前行，最后一个值

[开窗函数控制范围]
开窗函数控制范围: rows | range(默认) between X and Y
X，Y取值有:
- x preceding:往前x行
- x following:往后x行
- current row:当前行（Y默认值）
- unbounded: 起点
- unbounded preceding :表示从前面的起点第一行（X默认值）
- unbounded following :表示到后面的终点最后一行

-- 查看最高价
select *, max(price) over() from tab_function;
-- 开窗函数分组+聚合函数 -- 查看每个分类中的最高价
select *, max(price) over(partition by category) from tab_function;
-- 开窗函数分组排序 + 聚合函数 -- 查看每个分类中的最高价，同时按分类id排序
select *, max(price) over(partition by category order by category) from tab_function;

-- 按日期排序后，计算当前商品及之前的价格总和
select
    name, price, dates,
    sum(price) over (order by dates rows between unbounded preceding and current row )
from tab_function;

-- 按id排序，计算当前商品和上一个、下一个商品的差值
select
    id, name, price,
    lag(price, 1) over (order by id) pre_price,
    lead(price, 1) over (order by id) next_price,
    price - lag(price, 1) over (order by id) pre_diff,
    price - lead(price, 1) over (order by id) next_diff
from tab_function;

-- 按id排序，计算当前商品及前两个商品的价格总和
select
    id, name, price,
    sum(price) over (order by price rows between 2 preceding and current row) sum_price
from tab_function;

-- 按id排序，计算当前商品和上一个、下一个商品的差值
select
    id, name, price,
    lag(price, 1) over (order by id) pre_price,
    lead(price, 1) over (order by id) next_price,
    price - lag(price, 1) over (order by id) pre_diff,
    price - lead(price, 1) over (order by id) next_diff
from tab_function;

-- 取分组内排序后，从开始计算,截止到当前行，第一个值
select id, name, category, price,
    row_number() over (partition by category order by price) rn,
    first_value(name) over (partition by category order by price) fv
from tab_function;

10. 其他函数

炸裂函数explode
哈希函数hash、mask_hash
加密函数md5、sha1、sha2等
查看当前信息current_user
…

-- 已知array容器中"苹果","香蕉","西瓜","哈密瓜","火龙果","榴莲"要求炸开（炸裂函数）
select explode(array("苹果", "香蕉", "西瓜", "哈密瓜", "火龙果", "榴莲"));

-- mask_hash:  返回指定字符串的hash编码
select mask_hash('aaa');

--取哈希值函数:hash
select hash("aaa");

--MD5加密: md5(string/binary)
select md5("aaa"); -- 32位   47bce5c74f589f4867dbd57e9ca9f808
select length('47bce5c74f589f4867dbd57e9ca9f808');

--SHA-1加密: sha1(string/binary)
select sha1("aaa"); -- 40位 7e240de74fb1ed08fa08d38063f6a6a91462a815
select length('7e240de74fb1ed08fa08d38063f6a6a91462a815');

--SHA-2家族算法加密：sha2(string/binary, int)  (SHA-224, SHA-256, SHA-384, SHA-512)
select sha2("aaa",224); -- ed782653bfec275cf37d027511a68cece08d1e53df1360c762ce043a
select sha2("aaa",512);

--crc32加密:
select crc32("aaa"); -- 4027020077

-- 当前环境相关的
--      当前用户        登录用户            当前数据库           hive版本
select current_user(),logged_in_user(),current_database(),version();

11.实战

-- 商品价格打码
select id, name,
    -- float->int->string-> 除第一位，其余不显示 -> 将不显示的符号替换为x -> 起别名
    replace(mask_show_first_n(cast(cast(price as int) as string), 1), 'n', 'x') show_price,
    category, dates
from tab_function;

-- 展开年份(years), lateral view 为侧视图，类似于join
select * from tab_function t1
lateral view explode(t1.years) t2 as years;

-- 给价格小于1000的标记为‘贵’，大于等于1000的标记为‘真的贵’
select price, if(price<1000, '贵', '真的贵') from tab_function;

-- 将category为1的标记为‘日用品‘， 2的标记为’夜用品‘， 其他类别标记为’狗都不用‘
select
    category,
    case category
        when 1 then '日用品'
        when 2 then '夜用品'
        else '狗都不用'
    end
from tab_function;
-- 另一种实现方式
select
    category,
    case
        when category==1 then '日用品'
        when category==2 then '夜用品'
        else '狗都不用'
    end
from tab_function;

-- 查看每个分类中最贵和最便宜商品的差值
select
    category, max(price) `max`, min(price) `min`,
    max(price) - min(price) diff
from tab_function
group by category;
-- 查看每个分类中最贵和当前商品的差值
select
    category, price, max(price) over (partition by category) `max`,
    max(price) over (partition by category) - price diff
from tab_function;