实现需求:
1) city_info表存在MySQL
2) product_info表存在MySQL
3) user_click数据导入Hive
4) 三表的join 取TOP3(按区域进行分组)按天分区表
5) 查询结果导入到MySQL数据库
最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-------------------------------------------------------------------------------------
一、创建ruozedata_job数据库
-- Create the working database; IF NOT EXISTS makes the script rerunnable.
create database if not exists ruozedata_job;
二、hive创建用户点击表
-- Recreate the raw user-click fact table, partitioned by day.
-- IF EXISTS added so the DROP does not error on a fresh database.
-- NOTE(review): the partition column is named `data`, almost certainly a
-- typo for `date` — kept as-is so the LOAD below still matches; if you fix
-- it, fix the DDL and the LOAD together.
drop table if exists user_click;
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
)
partitioned by (data string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Load the local click log into the 2018-06-20 partition, replacing any
-- previous contents of that partition (OVERWRITE).
load data local inpath '/home/hadoop/data/user_click.txt' overwrite into table user_click partition(data='2018-06-20');
三、导入城市信息表到hive
-- Hive-side city dimension table. No ROW FORMAT clause on purpose: Hive's
-- default field delimiter (\001) matches what sqoop --hive-import writes.
-- Fixes: DROP guarded with IF EXISTS; missing statement terminator added.
drop table if exists city_info;
create table city_info
(
city_id int,
city_name string,
area string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table city_info -m 1 \
--mapreduce-job-name Hive_city_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table city_info \
--hive-overwrite \
--hive-import
四、导入产品信息表到hive
-- Hive-side product dimension table; DROP guarded with IF EXISTS.
-- NOTE(review): declared EXTERNAL (unlike the managed city_info) but with
-- no LOCATION, so it still lives under the default warehouse path and DROP
-- leaves the old data files behind. Consider a managed table for symmetry
-- — confirm nothing else shares this path before changing it.
drop table if exists product_info;
create external table product_info(
product_id int,
product_name string,
extend_info string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table product_info -m 1 \
--mapreduce-job-name Hive_product_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table product_info \
--hive-import \
--hive-overwrite
五、最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-- SQL语句
use ruozedata_job;
统计结果:
Total MapReduce CPU Time Spent: 7 seconds 400 msec
OK
7 product7 CC 39 1 2018-06-21
26 product26 CC 39 2 2018-06-21
70 product70 CC 38 3 2018-06-21
4 product4 EC 40 1 2018-06-21
96 product96 EC 32 2 2018-06-21
5 product5 EC 31 3 2018-06-21
40 product40 NC 16 1 2018-06-21
9 product9 NC 16 2 2018-06-21
5 product5 NC 13 3 2018-06-21
56 product56 NW 20 1 2018-06-21
67 product67 NW 20 2 2018-06-21
48 product48 NW 19 3 2018-06-21
38 product38 SC 35 1 2018-06-21
98 product98 SC 34 2 2018-06-21
88 product88 SC 34 3 2018-06-21
16 product16 SW 20 1 2018-06-21
60 product60 SW 19 2 2018-06-21
95 product95 SW 19 3 2018-06-21
Time taken: 57.804 seconds, Fetched: 18 row(s)
六、Hive创建复制表及表数据
七、Hive导入到MySQL
-- MySQL target table for the per-area top-3 result.
-- `rank` is a reserved word in MySQL 8.0+, so it (and `day`) are quoted;
-- the statement also gained its missing terminating semicolon.
CREATE TABLE product_hot
(
product_id int ,
product_name varchar(255),
area varchar(255),
click_count int,
`rank` int,
`day` varchar(255)
);
-- 导入MySQL数据库
sqoop export \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--mapreduce-job-name FromHDFSToMySQL3 \
--table product_hot \
-m 2 \
--export-dir /user/hive/warehouse/ruozedata_job.db/product_hot/* \
--fields-terminated-by '\001'
MySQL查看结果:
1) city_info表存在MySQL
2) product_info表存在MySQL
3) user_click数据导入Hive
4) 三表的join 取TOP3(按区域进行分组)按天分区表
5) 查询结果导入到MySQL数据库
最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-------------------------------------------------------------------------------------
一、创建ruozedata_job数据库
-- Create the working database; IF NOT EXISTS makes the script rerunnable.
create database if not exists ruozedata_job;
二、hive创建用户点击表
-- Recreate the raw user-click fact table, partitioned by day.
-- IF EXISTS added so the DROP does not error on a fresh database.
-- NOTE(review): the partition column is named `data`, almost certainly a
-- typo for `date` — kept as-is so the LOAD below still matches; if you fix
-- it, fix the DDL and the LOAD together.
drop table if exists user_click;
create table user_click(
user_id int,
session_id string,
action_time string,
city_id int,
product_id int
)
partitioned by (data string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- Load the local click log into the 2018-06-20 partition, replacing any
-- previous contents of that partition (OVERWRITE).
load data local inpath '/home/hadoop/data/user_click.txt' overwrite into table user_click partition(data='2018-06-20');
三、导入城市信息表到hive
-- Hive-side city dimension table. No ROW FORMAT clause on purpose: Hive's
-- default field delimiter (\001) matches what sqoop --hive-import writes.
-- Fixes: DROP guarded with IF EXISTS; missing statement terminator added.
drop table if exists city_info;
create table city_info
(
city_id int,
city_name string,
area string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table city_info -m 1 \
--mapreduce-job-name Hive_city_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table city_info \
--hive-overwrite \
--hive-import
四、导入产品信息表到hive
-- Hive-side product dimension table; DROP guarded with IF EXISTS.
-- NOTE(review): declared EXTERNAL (unlike the managed city_info) but with
-- no LOCATION, so it still lives under the default warehouse path and DROP
-- leaves the old data files behind. Consider a managed table for symmetry
-- — confirm nothing else shares this path before changing it.
drop table if exists product_info;
create external table product_info(
product_id int,
product_name string,
extend_info string
);
-- MySQL导入Hive
sqoop import \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--table product_info -m 1 \
--mapreduce-job-name Hive_product_info \
--delete-target-dir \
--hive-database ruozedata_job \
--hive-table product_info \
--hive-import \
--hive-overwrite
五、最终的统计结果字段如下:
product_id 商品ID
product_name 商品名称
area 区域
click_count 点击数/访问量
rank 排名
day 时间
-- SQL语句
use ruozedata_job;
-- Per-area top-3 products by click count, stamped with the run date.
-- Fixes: stripped the stray "- " prefix on every line (the statement was
-- unparseable as written); dropped the redundant innermost subquery and its
-- ORDER BY (useless before a window function); added the terminating ';'.
-- NOTE(review): `rank` is a reserved word in newer Hive releases — rename
-- the alias if this fails to parse after an upgrade.
SELECT
    t2.*,
    current_date AS day
FROM
(
    SELECT
        t1.product_id,
        t1.product_name,
        t1.area,
        t1.click_count,
        row_number() OVER (PARTITION BY t1.area ORDER BY t1.click_count DESC) AS rank
    FROM
    (
        -- Click counts per (area, product). The original LEFT JOINs were
        -- immediately filtered with IS NOT NULL — effective inner joins —
        -- so INNER JOIN states the intent directly; result is identical.
        SELECT
            pi.product_id,
            pi.product_name,
            ci.area,
            COUNT(uc.session_id) AS click_count
        FROM user_click uc
        INNER JOIN city_info ci ON uc.city_id = ci.city_id
        INNER JOIN product_info pi ON uc.product_id = pi.product_id
        WHERE pi.product_name IS NOT NULL
          AND ci.area IS NOT NULL
        GROUP BY ci.area, pi.product_name, pi.product_id
    ) t1
) t2
WHERE t2.rank <= 3;
统计结果:
Total MapReduce CPU Time Spent: 7 seconds 400 msec
OK
7 product7 CC 39 1 2018-06-21
26 product26 CC 39 2 2018-06-21
70 product70 CC 38 3 2018-06-21
4 product4 EC 40 1 2018-06-21
96 product96 EC 32 2 2018-06-21
5 product5 EC 31 3 2018-06-21
40 product40 NC 16 1 2018-06-21
9 product9 NC 16 2 2018-06-21
5 product5 NC 13 3 2018-06-21
56 product56 NW 20 1 2018-06-21
67 product67 NW 20 2 2018-06-21
48 product48 NW 19 3 2018-06-21
38 product38 SC 35 1 2018-06-21
98 product98 SC 34 2 2018-06-21
88 product88 SC 34 3 2018-06-21
16 product16 SW 20 1 2018-06-21
60 product60 SW 19 2 2018-06-21
95 product95 SW 19 3 2018-06-21
Time taken: 57.804 seconds, Fetched: 18 row(s)
六、Hive创建复制表及表数据
-- Materialize the per-area top-3 result as a Hive table (text format with
-- the default \001 field delimiter) so sqoop can export it to MySQL.
-- Fixes: stripped the stray "- " prefix on every line (the statement was
-- unparseable as written); dropped the redundant innermost subquery and its
-- ORDER BY (useless before a window function); added the terminating ';'.
-- NOTE(review): `rank` is a reserved word in newer Hive releases — rename
-- the alias if this fails to parse after an upgrade.
create table product_hot as
SELECT
    t2.*,
    current_date AS day
FROM
(
    SELECT
        t1.product_id,
        t1.product_name,
        t1.area,
        t1.click_count,
        row_number() OVER (PARTITION BY t1.area ORDER BY t1.click_count DESC) AS rank
    FROM
    (
        -- Click counts per (area, product). The original LEFT JOINs were
        -- immediately filtered with IS NOT NULL — effective inner joins —
        -- so INNER JOIN states the intent directly; result is identical.
        SELECT
            pi.product_id,
            pi.product_name,
            ci.area,
            COUNT(uc.session_id) AS click_count
        FROM user_click uc
        INNER JOIN city_info ci ON uc.city_id = ci.city_id
        INNER JOIN product_info pi ON uc.product_id = pi.product_id
        WHERE pi.product_name IS NOT NULL
          AND ci.area IS NOT NULL
        GROUP BY ci.area, pi.product_name, pi.product_id
    ) t1
) t2
WHERE t2.rank <= 3;
七、Hive导入到MySQL
-- MySQL target table for the per-area top-3 result.
-- `rank` is a reserved word in MySQL 8.0+, so it (and `day`) are quoted;
-- the statement also gained its missing terminating semicolon.
CREATE TABLE product_hot
(
product_id int ,
product_name varchar(255),
area varchar(255),
click_count int,
`rank` int,
`day` varchar(255)
);
-- 导入MySQL数据库
sqoop export \
--connect jdbc:mysql://localhost:3306/ruozedata \
--username root --password root \
--mapreduce-job-name FromHDFSToMySQL3 \
--table product_hot \
-m 2 \
--export-dir /user/hive/warehouse/ruozedata_job.db/product_hot/* \
--fields-terminated-by '\001'
MySQL查看结果:

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/31441024/viewspace-2156457/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/31441024/viewspace-2156457/