Hivesql10题

1.找出所有科目成绩都大于某一学科平均成绩的学生

数据集如下

uid  subject_id  score 
1001	01		 90
1001	02		 90
1001	03		 90
1002	01		 85
1002	02		 85
1002	03		 70
1003	01		 70
1003	02		 70
1003	03		 85
select
	uid
from
	(
	select
		uid
		,if(score>avg_score,0,1) flag
	from
		(
		select
			uid
			,subject_id
			,avg(score) over(partition by subject_id) avg_score
		from
			score) t1) t2
where
group by
	uid
having
	sum(flag)= 0;
2.统计出每个用户的累积访问次数

用户访问数据

userId	visitDate	visitCount
u01		2017/1/21	5
u02		2017/1/23	6
u03		2017/1/22	8
u04		2017/1/20	3
u01		2017/1/23	6
u01		2017/2/21	8
u02		2017/1/23	6
u01		2017/2/22	4

最后结果如下所示

用户id	月份		小计	累积
u01		2017-01	 11	 11
u01		2017-02	 12	 23
u02		2017-01	 12	 12
u03		2017-01	 8	 8
u04		2017-01	 3	 3
select
    userId,
    mn,
    mn_count,
    sum(mn_count) over(partition by userId order by mn)
from 
(   select
        userId,
        mn,
        sum(visitCount) mn_count
    from
         (select
             userId,
             date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
             visitCount
         from
             action)t1
group by userId,mn)t2;
3.有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:

1)每个店铺的UV(访客数)

select shop
		,count(distinct user_id)
from visit
group by shop;

2)每个店铺访问次数top3的访客信息,输出店铺名称、访客id、访问次数。

select shop
	 	,user_id
	 	,ct
from (select shop
		,user_id
		,ct
		,rank() over(partition by shop order by ct) rk
from 
	(select shop
	   		,user_id
	   		,count(*) ct
	from visit
	group by shop,user_id) t1
 ) t2
where rk<=3;		
4.已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。(表名统一用order_tab)
1)给出 2017年每个月的订单数、用户数、总成交金额。
select
   date_format(dt,'yyyy-MM'),
   count(order_id),
   count(distinct user_id),
   sum(amount)
from
   order_tab
where
   date_format(dt,'yyyy')='2017'
group by
   date_format(dt,'yyyy-MM');
2)给出2017年11月的新客数(指在11月才有第一笔订单)
select
   count(user_id)
from
   order_tab
group by
   user_id
having
   date_format(min(dt),'yyyy-MM')='2017-11';
5.有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)日期 用户 年龄

数据如下所示,表名user_age

dt		user_id   age
2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
-- 求得所有用户和活跃用户的总数及平均年龄。
-- (活跃用户指连续两天都有访问记录的用户)

-- 1.按日期以及用户分组,按日期排序并给出排名
select
	dt,
	user_id,
	min(age) age,
	rank() over(partition by user_id order by dt) rk
from
	user_age
group by
	dt,
	user_id
-- 2.计算日期和排名的差值
select
	user_id,
	age,
	date_sub(dt, rk) flag
from
	(
	select
		dt,
		user_id,
		min(age) age,
		rank() over(partition by user_id order by dt) rk
	from
		user_age
	group by
		dt,
		user_id) t1
-- 3.过滤出差值大于2的,即为连续两天活跃的用户
select
	user_id,
	min(age) age
from
	(
	select
		user_id,
		age,
		date_sub(dt, rk) flag
	from
		(
		select
			dt,
			user_id,
			min(age) age,
			rank() over(partition by user_id order by dt) rk
		from
			user_age
		group by
			dt,
			user_id) t1
		)t2
group by
	user_id,
	flag
having
	count(*)>= 2
-- 4.对数据进行去重处理
select
	user_id,
	min(age) age
from
	(select
	user_id,
	min(age) age
from
	(
	select
		user_id,
		age,
		date_sub(dt, rk) flag
	from
		(
		select
			dt,
			user_id,
			min(age) age,
			rank() over(partition by user_id order by dt) rk
		from
			user_age
		group by
			dt,
			user_id) t1
		)t2
group by
	user_id,
	flag
having
	count(*)>= 2)t3
group by
	user_id
-- 5.计算活跃用户的人数以及平均年龄
select
	count(*) ct,
	cast(sum(age)/ count(*) as decimal(10, 2))
from
	(
	select
		user_id,
		min(age) age
	from
		(
		select
			user_id,
			min(age) age
		from
			(
			select
				user_id,
				age,
				date_sub(dt, rk) flag
			from
				(
				select
					dt,
					user_id,
					min(age) age,
					rank() over(partition by user_id order by dt) rk
				from
					user_age
				group by
					dt,
					user_id) t1
		)t2
		group by
			user_id,
			flag
		having
			count(*)>= 2)t3
	group by
		user_id)t4;
-- 6.对全量数据进行去重(按用户去重)
select
	user_id,
	min(age) age
from
	user_age
group by
	user_id;

-- 7.计算所有用户的数量以及平均年龄
select
	count(*) user_count,
	cast((sum(age)/ count(*)) as decimal(10, 1))
from
	(
	select
		user_id,
		min(age) age
	from
		user_age
	group by
		user_id)t5;
-- 8.把4和7的数据进行union
select
	0 user_total_count,
	0 user_total_avg_age,
	count(*) twice_count,
	cast(sum(age)/ count(*) as decimal(10, 2)) twice_count_avg_age
from
	(
	select
		user_id,
		min(age) age
	from
		(
		select
			user_id,
			min(age) age
		from
			(
			select
				user_id,
				age,
				date_sub(dt, rk) flag
			from
				(
				select
					dt,
					user_id,
					min(age) age,
					 rank() over(partition by user_id order by dt) rk
				from
					user_age
				group by
					dt,
					user_id
    ) t1
    ) t2
		group by
			user_id,
			flag
		having
			count(*)>= 2)t3
	group by
		user_id 
) t4
union all

select
	count(*) user_total_count,
	cast((sum(age)/ count(*)) as decimal(10, 1)),
	0 twice_count,
	0 twice_count_avg_age
from
	(
	select
		user_id,
		min(age) age
	from
		user_age
	group by
		user_id
   ) t5
-- 9.最终整合
select
	sum(user_total_count),
	sum(user_total_avg_age),
	sum(twice_count),
	sum(twice_count_avg_age)
from
	(
	select
		0 user_total_count,
		0 user_total_avg_age,
		count(*) twice_count,
		cast(sum(age)/ count(*) as decimal(10, 2)) twice_count_avg_age
	from
		(
		select
			user_id,
			min(age) age
		from
			(
			select
				user_id,
				min(age) age
			from
				(
				select
					user_id,
					age,
					date_sub(dt, rk) flag
				from
					(
					select
						dt,
						user_id,
						min(age) age,
						rank() over(partition by user_id order by dt) rk
					from
						user_age
					group by
						dt,
						user_id
    				) t1
    			) t2
			group by
				user_id,
				flag
			having count(*)>= 2
			) t3
		group by
			user_id 
		) t4
union all
 select
		count(*) user_total_count,
		cast((sum(age)/ count(*)) as decimal(10, 1)),
		0 twice_count,
		0 twice_count_avg_age
	from
		(
		select
			user_id,
			min(age) age
		from
			user_age
		group by
			user_id
   ) t5
) t6;
6.请用sql写出所有用户中在今年10月份第一次购买商品的金额,表名ordertable,字段(购买用户:userid,金额:money,购买时间:paymenttime(格式:2017-10-01),订单id:orderid)
-- 1.
select 
		userid,
		min(paymenttime) paymenttime
from ordertable
where date_format(paymenttime,'yyyy-MM')='2017-10'
group by userid
-- 2.
select 
		t1.userid,
		t1.paymenttime,
		ot.money
from (select 
		userid,
		min(paymenttime) paymenttime
from ordertable
where date_format(paymenttime,'yyyy-MM')='2017-10'
group by userid) t1
join ordertable ot 
on t1.userid=ot.userid and t1.paymenttime=ot.paymenttime;
7.求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址

有一个线上服务器访问日志格式如下
时间 --------------------- 接口 ------------ ip地址
2016-11-09 11:22:05 /api/user/login 110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16

2016-11-09 23:59:40 /api/user/login 200.6.5.166
求11月9号下午14点(14-15点),访问api/user/login接口的top10的ip地址

还在完善中…(有时间就弄完…)

### 大数据开发 HiveSQL 笔试题目及答案 #### 题目 1: 自连接实现 自连接是一种特殊的连接方式,表与其自身进行连接。这通常用于查找具有层次结构的数据。 ```sql SELECT e.employee_id, m.manager_name FROM employees AS e JOIN employees AS m ON e.manager_id = m.employee_id; ``` 此查询展示了如何通过员工表找到每位员工对应的经理名称[^1]。 #### 题目 2: 窗口分析函数实现 窗口函数允许执行聚合计算而不减少输入行的数量。下面的例子展示了一个典型的排名场景: ```sql SELECT employee_id, salary, DENSE_RANK() OVER (PARTITION BY department ORDER BY salary DESC) as rank FROM salaries; ``` 这段代码按部门分区并根据薪水降序排列来分配密集等级。 #### 题目 3: Top N 查询 为了获取每个类别的前几名记录,可以使用窗口函数配合 `LIMIT` 或者子查询的方法: ```sql WITH RankedSales AS ( SELECT product_category, sales_amount, ROW_NUMBER() OVER(PARTITION BY product_category ORDER BY sales_amount DESC) rn FROM sales_data ) SELECT * FROM RankedSales WHERE rn <= 5; -- 获取每种类目的前五名销售量商品 ``` 这里利用了CTE(公用表表达式)以及ROW_NUMBER窗口函数实现了分类下的top n选取. #### 题目 4: 使用 CONCAT 和 CONCAT_WS 函数 这两个字符串拼接函数的区别在于后者可以在多个参数间加入指定分隔符。 ```sql -- 单纯地把两列内容连在一起 SELECT concat(firstname,' ',lastname) full_name FROM users; -- 加入逗号作为分隔符链接多列 SELECT concat_ws(',', firstname, lastname, email) info_string FROM users; ``` 上述例子分别演示了两种不同情况下应该如何选用合适的字符串组合方法[^3]. #### 题目 5: LOAD DATA 命令语法 当需要加载外部文件到Hive表时,可采用如下命令格式: ```sql LOAD DATA LOCAL INPATH '/path/to/local/file' INTO TABLE my_table; ``` 这条语句指定了本地路径中的文件被导入至名为my_table的目标表格中[^4]. #### 题目 6: WHERE vs HAVING 的区别 理解两者之间的差异对于写出高效的查询至关重要。WHERE适用于过滤单条记录;而HAVING则针对已经完成GROUP BY操作后的结果集做进一步筛选。 ```sql -- 正确用法示范 SELECT region, COUNT(*) num_stores FROM stores WHERE open_date >= '2020-01-01' GROUP BY region HAVING COUNT(*) > 10; ``` 该片段说明了何时应该应用WHERE条件而非HAVING来进行初步的数据裁剪[^5].
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值