实现需求:
1) city_info表存在MySQL
2) product_info表存在MySQL
3) user_click数据导入Hive
4) 三表的join 取TOP3(按区域进行分组)按天分区表
5) 查询结果导入到MySQL数据库
最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-------------------------------------------------------------------------------------
一、创建ruozedata_job数据库
-- Create the working database; IF NOT EXISTS makes the script rerunnable.
create database if not exists ruozedata_job;
二、hive创建用户点击表
-- Recreate the raw user-click fact table, partitioned by day.
-- IF EXISTS added so the DROP does not error on a fresh database.
-- NOTE(review): the partition column is named `data`, almost certainly a
-- typo for `date` — kept as-is so the LOAD below still matches; if you fix
-- it, fix the DDL and the LOAD together.
drop table if exists user_click;
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
)
partitioned by (data string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Load the local click log into the 2018-06-20 partition, replacing any
-- previous contents of that partition (OVERWRITE).
load data local inpath '/home/hadoop/data/user_click.txt' overwrite into table user_click partition(data='2018-06-20');
三、导入城市信息表到hive
-- Hive-side city dimension table. No ROW FORMAT clause on purpose: Hive's
-- default field delimiter (\001) matches what sqoop --hive-import writes.
-- Fixes: DROP guarded with IF EXISTS; missing statement terminator added.
drop table if exists city_info;
create table city_info
(
city_id int,
city_name string,
area string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table city_info -m 1 \
--mapreduce-job-name Hive_city_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table city_info \
--hive-overwrite \
--hive-import
四、导入产品信息表到hive
-- Hive-side product dimension table; DROP guarded with IF EXISTS.
-- NOTE(review): declared EXTERNAL (unlike the managed city_info) but with
-- no LOCATION, so it still lives under the default warehouse path and DROP
-- leaves the old data files behind. Consider a managed table for symmetry
-- — confirm nothing else shares this path before changing it.
drop table if exists product_info;
create external table product_info(
product_id int,
product_name string,
extend_info string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table product_info -m 1 \
--mapreduce-job-name Hive_product_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table product_info \
--hive-import \
--hive-overwrite
五、最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-- SQL语句
use ruozedata_job;
统计结果:
Total MapReduce CPU Time Spent: 7 seconds 400 msec
OK
7 product7 CC 39 1 2018-06-21
26 product26 CC 39 2 2018-06-21
70 product70 CC 38 3 2018-06-21
4 product4 EC 40 1 2018-06-21
96 product96 EC 32 2 2018-06-21
5 product5 EC 31 3 2018-06-21
40 product40 NC 16 1 2018-06-21
9 product9 NC 16 2 2018-06-21
5 product5 NC 13 3 2018-06-21
56 product56 NW 20 1 2018-06-21
67 product67 NW 20 2 2018-06-21
48 product48 NW 19 3 2018-06-21
38 product38 SC 35 1 2018-06-21
98 product98 SC 34 2 2018-06-21
88 product88 SC 34 3 2018-06-21
16 product16 SW 20 1 2018-06-21
60 product60 SW 19 2 2018-06-21
95 product95 SW 19 3 2018-06-21
Time taken: 57.804 seconds, Fetched: 18 row(s)
六、Hive创建复制表及表数据
七、Hive导入到MySQL
-- MySQL target table for the per-area top-3 result.
-- `rank` is a reserved word in MySQL 8.0+, so it (and `day`) are quoted;
-- the statement also gained its missing terminating semicolon.
CREATE TABLE product_hot
(
product_id int ,
product_name varchar(255),
area varchar(255),
click_count int,
`rank` int,
`day` varchar(255)
);
-- 导入MySQL数据库
sqoop export \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--mapreduce-job-name FromHDFSToMySQL3 \
--table product_hot \
-m 2 \
--export-dir /user/hive/warehouse/ruozedata_job.db/product_hot/* \
--fields-terminated-by '\001'
MySQL查看结果:
1) city_info表存在MySQL
2) product_info表存在MySQL
3) user_click数据导入Hive
4) 三表的join 取TOP3(按区域进行分组)按天分区表
5) 查询结果导入到MySQL数据库
最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-------------------------------------------------------------------------------------
一、创建ruozedata_job数据库
-- Create the working database; IF NOT EXISTS makes the script rerunnable.
create database if not exists ruozedata_job;
二、hive创建用户点击表
-- Recreate the raw user-click fact table, partitioned by day.
-- IF EXISTS added so the DROP does not error on a fresh database.
-- NOTE(review): the partition column is named `data`, almost certainly a
-- typo for `date` — kept as-is so the LOAD below still matches; if you fix
-- it, fix the DDL and the LOAD together.
drop table if exists user_click;
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
)
partitioned by (data string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Load the local click log into the 2018-06-20 partition, replacing any
-- previous contents of that partition (OVERWRITE).
load data local inpath '/home/hadoop/data/user_click.txt' overwrite into table user_click partition(data='2018-06-20');
三、导入城市信息表到hive
-- Hive-side city dimension table. No ROW FORMAT clause on purpose: Hive's
-- default field delimiter (\001) matches what sqoop --hive-import writes.
-- Fixes: DROP guarded with IF EXISTS; missing statement terminator added.
drop table if exists city_info;
create table city_info
(
city_id int,
city_name string,
area string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table city_info -m 1 \
--mapreduce-job-name Hive_city_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table city_info \
--hive-overwrite \
--hive-import
四、导入产品信息表到hive
-- Hive-side product dimension table; DROP guarded with IF EXISTS.
-- NOTE(review): declared EXTERNAL (unlike the managed city_info) but with
-- no LOCATION, so it still lives under the default warehouse path and DROP
-- leaves the old data files behind. Consider a managed table for symmetry
-- — confirm nothing else shares this path before changing it.
drop table if exists product_info;
create external table product_info(
product_id int,
product_name string,
extend_info string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table product_info -m 1 \
--mapreduce-job-name Hive_product_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table product_info \
--hive-import \
--hive-overwrite
五、最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-- SQL语句
use ruozedata_job;
-- Per-area top-3 products by click count, stamped with the run date.
-- Fixes: stripped the stray "- " prefix on every line (the statement was
-- unparseable as written); dropped the redundant innermost subquery and its
-- ORDER BY (useless before a window function); added the terminating ';'.
-- NOTE(review): `rank` is a reserved word in newer Hive releases — rename
-- the alias if this fails to parse after an upgrade.
SELECT
    t2.*,
    current_date AS day
FROM
(
    SELECT
        t1.product_id,
        t1.product_name,
        t1.area,
        t1.click_count,
        row_number() OVER (PARTITION BY t1.area ORDER BY t1.click_count DESC) AS rank
    FROM
    (
        -- Click counts per (area, product). The original LEFT JOINs were
        -- immediately filtered with IS NOT NULL — effective inner joins —
        -- so INNER JOIN states the intent directly; result is identical.
        SELECT
            pi.product_id,
            pi.product_name,
            ci.area,
            COUNT(uc.session_id) AS click_count
        FROM user_click uc
        INNER JOIN city_info ci ON uc.city_id = ci.city_id
        INNER JOIN product_info pi ON uc.product_id = pi.product_id
        WHERE pi.product_name IS NOT NULL
          AND ci.area IS NOT NULL
        GROUP BY ci.area, pi.product_name, pi.product_id
    ) t1
) t2
WHERE t2.rank <= 3;
统计结果:
Total MapReduce CPU Time Spent: 7 seconds 400 msec
OK
7 product7 CC 39 1 2018-06-21
26 product26 CC 39 2 2018-06-21
70 product70 CC 38 3 2018-06-21
4 product4 EC 40 1 2018-06-21
96 product96 EC 32 2 2018-06-21
5 product5 EC 31 3 2018-06-21
40 product40 NC 16 1 2018-06-21
9 product9 NC 16 2 2018-06-21
5 product5 NC 13 3 2018-06-21
56 product56 NW 20 1 2018-06-21
67 product67 NW 20 2 2018-06-21
48 product48 NW 19 3 2018-06-21
38 product38 SC 35 1 2018-06-21
98 product98 SC 34 2 2018-06-21
88 product88 SC 34 3 2018-06-21
16 product16 SW 20 1 2018-06-21
60 product60 SW 19 2 2018-06-21
95 product95 SW 19 3 2018-06-21
Time taken: 57.804 seconds, Fetched: 18 row(s)
六、Hive创建复制表及表数据
-- Materialize the per-area top-3 result as a Hive table (text format with
-- the default \001 field delimiter) so sqoop can export it to MySQL.
-- Fixes: stripped the stray "- " prefix on every line (the statement was
-- unparseable as written); dropped the redundant innermost subquery and its
-- ORDER BY (useless before a window function); added the terminating ';'.
-- NOTE(review): `rank` is a reserved word in newer Hive releases — rename
-- the alias if this fails to parse after an upgrade.
create table product_hot as
SELECT
    t2.*,
    current_date AS day
FROM
(
    SELECT
        t1.product_id,
        t1.product_name,
        t1.area,
        t1.click_count,
        row_number() OVER (PARTITION BY t1.area ORDER BY t1.click_count DESC) AS rank
    FROM
    (
        -- Click counts per (area, product). The original LEFT JOINs were
        -- immediately filtered with IS NOT NULL — effective inner joins —
        -- so INNER JOIN states the intent directly; result is identical.
        SELECT
            pi.product_id,
            pi.product_name,
            ci.area,
            COUNT(uc.session_id) AS click_count
        FROM user_click uc
        INNER JOIN city_info ci ON uc.city_id = ci.city_id
        INNER JOIN product_info pi ON uc.product_id = pi.product_id
        WHERE pi.product_name IS NOT NULL
          AND ci.area IS NOT NULL
        GROUP BY ci.area, pi.product_name, pi.product_id
    ) t1
) t2
WHERE t2.rank <= 3;
七、Hive导入到MySQL
-- MySQL target table for the per-area top-3 result.
-- `rank` is a reserved word in MySQL 8.0+, so it (and `day`) are quoted;
-- the statement also gained its missing terminating semicolon.
CREATE TABLE product_hot
(
product_id int ,
product_name varchar(255),
area varchar(255),
click_count int,
`rank` int,
`day` varchar(255)
);
-- 导入MySQL数据库
sqoop export \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--mapreduce-job-name FromHDFSToMySQL3 \
--table product_hot \
-m 2 \
--export-dir /user/hive/warehouse/ruozedata_job.db/product_hot/* \
--fields-terminated-by '\001'
MySQL查看结果:

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/31441024/viewspace-2156457/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/31441024/viewspace-2156457/