Hive进阶-----------------------函数篇（常用函数）

最新推荐文章于 2023-08-02 14:01:51 发布
原创最新推荐文章于 2023-08-02 14:01:51 发布 · 330 阅读
1 ·
CC 4.0 BY-SA版权
hive 专栏收录该内容
5 篇文章
订阅专栏
博客聚焦Hive进阶中的常用函数。Hive是大数据开发领域重要工具，掌握其常用函数能提升数据处理效率，在数据查询、分析等操作中发挥关键作用。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
1. 类型转换函数
    select cast("5" as int) ;
    select cast("2017-08-03" as date) ;
    select cast(current_timestamp as date);
2. 数学运算函数
    select round(5.4);   ## 5  四舍五入
    select round(5.1345,3) ;  ##5.135
    select ceil(5.4) ; // select ceiling(5.4) from dual;   ## 6  向上取整
    select floor(5.4);  ## 5  向下取整
    select abs(-5.4) ;  ## 5.4  绝对值
    select greatest(3,5,6) ;  ## 6  
    select least(3,5,6) from dual;  ##求多个输入参数中的最小值
3. 字符串函数
    substr(string str, int start)   ## 截取子串
    substring(string str, int start)
    示例：select substr("abcdefg",2) from dual;
    substr(string, int start, int len) 
    substring(string, int start, int len)
    示例：select substr("abcdefg",2,3) from dual;

    concat(string A, string B...)  ## 拼接字符串
    concat_ws(string SEP, string A, string B...)
    示例：select concat("ab","xy") from dual;  ## abxy
    select concat_ws(".","192","168","33","44") from dual; ## 192.168.33.44

    length(string A)
    示例：select length("192.168.33.44") from dual;  ## 13

    split(string str, string pat)
    示例：select split("192.168.33.44",".") from dual; 错误的，因为.号是正则语法中的特定字符
    select split("192.168.33.44","\\.") from dual;

    upper(string str) ##转大写
    lower(string str)

4. 时间函数
    select current_timestamp; ## 获取当前的时间戳(详细时间信息)
    select current_date;   ## 获取当前的日期

    ## 取当前时间的秒数时间戳--(距离格林威治时间1970-1-1 0：0：0秒的差距) 
    select unix_timestamp();

    ## unix时间戳转字符串
    from_unixtime(bigint unixtime[, string format])
    示例：select from_unixtime(unix_timestamp());
    select from_unixtime(unix_timestamp(),"yyyy/MM/dd HH:mm:ss");

    ## 字符串转unix时间戳
    unix_timestamp(string date, string pattern)
    示例： select unix_timestamp("2017-08-10 17:50:30");
    select unix_timestamp("2017-08-10 17:50:30","yyyy-MM-dd HH:mm:ss");

    ## 将字符串转成日期date
    select to_date("2017-09-17 16:58:32");
5. 条件控制函数
    a. case when
        select id,name,
        case
        when age<28 then 'youngth'
        when age>27 and age<40 then 'zhongnian'
        else 'old'
        end
        from t_user;
    b. if
        select id,if(age>25,'working','worked') from t_user;

        select moive_name,if(array_contains(actors,'吴刚'),'好电影',’烂片儿’) 
        from t_movie;
6. 集合函数
    a. array_contains(Array<T>,value)返回boolean值
    select moive_name,array_contains(actors,'吴刚') from t_movie;
    select array_contains(array('a','b','c'),'c') from dual;
    b. sort_array(Array<T>)返回排序后的数组
        select sort_array(array('c','b','a')) from dual;
        select 'haha',sort_array(array('c','b','a')) as xx from (select 0) tmp;
    c. size(Array<T>)  返回一个集合的长度，int值
        i. select moive_name,size(actors) as actor_number from t_movie;
7. 行转列函数：explode()
    a. 数据：
        1,zhangsan,化学:物理:数学:语文
        2,lisi,化学:数学:生物:生理:卫生
        3,wangwu,化学:语文:英语:体育:生物
    b. 创建表
        create table t_stu_subject(id int,name string,subjects array<string>)
        row format delimited fields terminated by ','
        collection items terminated by ':';
    c. 使用explode()对数组字段"炸裂"
        i. select explode(subjects) as sub from t_stu_subject
        ii. 利用这个结果，求去重的课程
        select distinct tmp.sub
        from 
        (select explode(subjects) as sub from t_stu_subject) tmp;
8. 表生成函数
    select id,name,tmp.sub 
    from t_stu_subject lateral view explode(subjects) tmp as sub;
    理解： lateral view 相当于两个表在join
    左表：是原表
    右表：是explode(某个集合字段)之后产生的表
    而且：这个join只在同一行的数据间进行

    那样，可以方便做更多的查询：
    比如，查询选修了生物课的同学
    select a.id,a.name,a.sub from 
    (select id,name,tmp.sub as sub from t_stu_subject lateral view explode(subjects) tmp as sub) a
    where sub='生物';
9. json解析函数
    a. 创建一个原始表用来对应原始的json数据
        create table t_json(json string);
        load data local inpath ‘/root/rating.json’ into table t_json;
    b. 利用json_tube进行json数据解析
        i. select json_tuple(json,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_json limit 10;
    c. 真正解析整张json表，将解析结果数据插入一张新表
        create table t_movie_rate
        as
        select json_tuple(json,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_json;
    d. 利用json_tuple从原始json数据表中，etl出一个详细信息表：
        create table t_rate 
        as 
        select 
        uid,
        movie,
        rate,
        year(from_unixtime(cast(ts as bigint))) as year,
        month(from_unixtime(cast(ts as bigint))) as month,
        day(from_unixtime(cast(ts as bigint))) as day,
        hour(from_unixtime(cast(ts as bigint))) as hour,
        minute(from_unixtime(cast(ts as bigint))) as minute,
        from_unixtime(cast(ts as bigint)) as ts
        from 
        (select 
        json_tuple(rateinfo,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid)
        from t_json) tmp
        ;
10. 分析函数
    a. 数据：
        1,18,a,male
        2,19,b,male
        3,22,c,female
        4,16,d,female
        5,30,e,male
        6,26,f,female
    b. 详解：
        row_number() OVER (PARTITION BY COL1 ORDERBY COL2)表示根据COL1分组，在分组内部根据COL2排序，而此函数计算的值就表示每组内部排序后的顺序编号（该编号在组内是连续并且唯一的)。

    c. 需要查询出每种性别中年龄最大的2条数据（创建表）
        create table t_rn(id int,age int,name string,sex string)
        row format delimited fields terminated by ‘,’;

    d. 实现(打标号)
        使用row_number函数，对表中的数据按照性别分组，按照年龄倒序排序并进行标记

        hql代码：
        select id,age,name,sex,
        row_number() over(partition by sex order by age desc) as rank
        from t_rn
        i. 利用上述结果操作，满足最终要求
            select id,age,name,sex
            from 
            (select id,age,name,sex,
            row_number() over(partition by sex order by age desc) as rank
            from t_rn) tmp
            where rank<=2;
11. 分析函数---->sum聚合，避免join
    select username,month,m_sale,
    sum(m_sale) over (partition by username order by month rows between unbounded preceding and current row)
    from
    (select username,substr(day,1,7) as month ,sum(msale) as m_sale
    from t_sale
    group by username,substr(day,1,7))tmp; 

    a. 按照partition后面的字符分组，order by后面的字符排序，求和，（