I. Overview
1. Compute PV and UV for each hourly time slot of the day
(1) PV: the total number of page views
(2) UV: the total count after deduplicating on guid (unique visitors)
(3) Extract the date and hour from the time field and build a partitioned table on them
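In Hive terms, the per-hour metrics reduce to a single aggregation over the partitioned table built below (shown here only as a preview; the table and column names are the ones created later in this note):
select `date`, hour, count(url) pv, count(distinct guid) uv
from track_log.yhd_part
group by `date`, hour;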
2. Sample records from the site's log data (two hourly files covering hours 18 and 19 of 2015-08-28, matching the files loaded below)
121508281810000000 http://www.yhd.com/?union_ref=7&cp=0 3 PR4E9HWE38DMN4Z6HUG667SCJNZXMHSPJRER VFA5QRQ1N4UJNS9P6MH6HPA76SXZ737P 10977119545 124.65.159.122 unionKey:10977119545 2015-08-28 18:10:00 50116447 http://image.yihaodianimg.com/virtual-web_static/virtual_yhd_iframe_index_widthscreen.html?randid=2015828 6 1000 Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0 Win32 lunbo_tab_3 北京市 2 北京市 1 1 1 1 1440*900 1440756285639
121508281810000001 http://my.yhd.com/order/finishOrder.do?orderCode=5435446505152 http://buy.yhd.com/checkoutV3/index.do 3 YJ25S3QAVPAS31PHSB3HFGZ1E5AYMKX9XUTX 6W26QM41DM6HHND3R4FP42YYXXE1NKGA 222.73.202.251 2015-08-28 18:10:00 85133152 http://www.haosou.com/s?src=new_isearch&q=1%E5%8F%B7%E5%BA%97 25 0 1 Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 Win32 MY_ORDERCOMPLETION_EDITADDRESS 上海市 1 上海市 2058 0 2058 0 1366*768 1440756699916
II. Implementation
1. Data collection
(1) Connect to Hive
Start the server:
bin/hiveserver2 &
Start the client:
bin/beeline -u jdbc:hive2://bigdata.ibeifeng.com:10000 -n hadoop -p 123456
(2) Create the source table and load the data
create database track_log;
use track_log;  -- switch to the new database so yhd_source is created inside it
create table yhd_source(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by "\t";
load data local inpath '/opt/datas/2015082819' into table yhd_source;
load data local inpath '/opt/datas/2015082818' into table yhd_source;
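A quick sanity check after loading, to confirm the rows landed and parse cleanly (counts naturally depend on your copy of the data):
select count(*) from track_log.yhd_source;
select id, trackTime, guid from track_log.yhd_source limit 3;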
2. Data cleaning
(1) Create a cleaned table holding only the fields the analysis needs
create table track_log.yhd_qingxi(
id string,
url string,
guid string,
`date` string,
hour string
)
row format delimited fields terminated by "\t";
(2) Insert data
insert into table track_log.yhd_qingxi
select id,url,guid,substring(trackTime,9,2) `date`,substring(trackTime,12,2) hour from yhd_source;
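trackTime has the form "2015-08-28 18:10:00" and Hive's substring() is 1-based, so substring(trackTime,9,2) picks out the two-digit day and substring(trackTime,12,2) the hour. A quick check from beeline:
select substring('2015-08-28 18:10:00', 9, 2);   -- returns '28' (day of month)
select substring('2015-08-28 18:10:00', 12, 2);  -- returns '18' (hour)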
3. Create the partitioned tables and load data
3.1 Static partitioning
(1) Partition by the time fields (static partitions)
create table track_log.yhd_part(
id string,
url string,
guid string
)partitioned by (`date` string ,hour string)
row format delimited fields terminated by "\t";
Insert data into the partitioned table:
insert into table track_log.yhd_part partition (`date`='20150828',hour='18')
select id,url,guid from track_log.yhd_qingxi where `date`='28' and hour='18' ;
insert into table track_log.yhd_part partition (`date`='20150828',hour='19')
select id,url,guid from track_log.yhd_qingxi where `date`='28' and hour='19' ;
(2) Query test:
select id,url,guid from track_log.yhd_part where `date`='20150828' and hour='19' limit 10;
View the partitions:
show partitions track_log.yhd_part;
Result:
partition
date=20150828/hour=18
date=20150828/hour=19
Time taken: 0.3 seconds, Fetched: 2 row(s)
3.2 Dynamic partitioning
(1) Add the following settings to hive-site.xml
-> Maximum number of dynamic partitions per node:
<property>
<name>hive.exec.max.dynamic.partitions.pernode</name>
<value>100</value>
<description>Maximum number of dynamic partitions allowed to be created in each mapper/reducer node.</description>
</property>
-> Maximum total number of dynamic partitions:
<property>
<name>hive.exec.max.dynamic.partitions</name>
<value>1000</value>
<description>Maximum number of dynamic partitions allowed to be created in total.</description>
</property>
-> Whether dynamic partitioning is enabled:
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
-> To rely on dynamic partitions alone, the mode must be changed to non-strict:
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
<description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.</description>
</property>
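If editing hive-site.xml is inconvenient, the same parameters can also be set just for the current session from the beeline prompt (these are standard Hive settings, identical to the XML above):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.max.dynamic.partitions.pernode=100;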
(2) Create a second partitioned table for testing
create table track_log.yhd_part2(
id string,
url string,
guid string
)partitioned by (`date` string ,hour string)
row format delimited fields terminated by "\t";
(3) Insert data
insert into table track_log.yhd_part2 partition (`date`,hour)
select * from track_log.yhd_qingxi;
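Hive maps the trailing columns of the SELECT to the partition columns in order, so select * only works here because `date` and hour happen to be the last two columns of yhd_qingxi. An equivalent statement that spells the mapping out explicitly:
insert into table track_log.yhd_part2 partition (`date`,hour)
select id, url, guid, `date`, hour from track_log.yhd_qingxi;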
(4) Test. Note that dynamic partition values come straight from the data in yhd_qingxi, so `date` here is the two-digit day '28', not '20150828' as in the static table:
select id,url,guid from track_log.yhd_part2 where `date`='28' and hour='19' limit 10;
Result:
id url guid
71508281814590031 http://s.yhd.com/?tc=0.0.12.2704_13852075_5.13&tp=1.1.16.0.5.KlBK557-10-6z7Ct B8SUYA48VDXS8KAE9Y1SVBC3HVBS3JRV9VZX
71508281814590032 http://1mall.yhd.com/16/?uid=97663602576&tracker_u=1787&website_id=516686 XUZT4771JABDUNN245167YC9ZADP3DU9B9V6
71508281814590033 http://item.m.yhd.com/item/41296336?tc=3.0.5.41296336.3&tp=5009.1668.196.0.3.KxnnONn-11-F5B4q 8XWH2FF3RFSY22SUPB7M57CBZ9B7QAAN3YFG
4. Data analysis
(1) PV
select `date`,hour,count(url) pv from track_log.yhd_part group by `date`,hour;
Result:
+-----------+-------+--------+--+
| date | hour | pv |
+-----------+-------+--------+--+
| 20150828 | 18 | 64972 |
| 20150828 | 19 | 61162 |
+-----------+-------+--------+--+
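If a per-day total is wanted on top of the hourly breakdown, Hive's GROUP BY ... WITH ROLLUP can add it in the same pass (a sketch; the rollup rows show NULL in the hour column):
select `date`, hour, count(url) pv
from track_log.yhd_part
group by `date`, hour with rollup;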
(2) UV
select `date`,hour,count(distinct guid) uv from track_log.yhd_part group by `date`,hour;
Result:
+-----------+-------+--------+--+
| date | hour | uv |
+-----------+-------+--------+--+
| 20150828 | 18 | 23938 |
| 20150828 | 19 | 22330 |
+-----------+-------+--------+--+
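count(distinct guid) forces each group's deduplication through a single reducer, which can become a bottleneck on large partitions. A common rewrite deduplicates first and then counts (same answer, usually better parallelism):
select `date`, hour, count(*) uv
from (select distinct `date`, hour, guid from track_log.yhd_part) t
group by `date`, hour;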
(3) Store the final result (PV and UV together) in a table
create table track_log.result as
select `date`,hour,count(url) pv,count(distinct guid) uv from track_log.yhd_part group by `date`,hour;
(4) Check the result
select * from track_log.result;
Result:
+--------------+--------------+------------+------------+--+
| result.date | result.hour | result.pv | result.uv |
+--------------+--------------+------------+------------+--+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+--------------+--------------+------------+------------+--+
5. Data export
(1) Create the target table in MySQL
create table hive_result(
date varchar(30),
hour varchar(30),
pv varchar(30),
uv varchar(30),
primary key(date,hour)
);
(2) Export with Sqoop. The result table was created with CTAS and no ROW FORMAT clause, so Hive stored its data with the default \001 field delimiter; that is why --input-fields-terminated-by '\001' is needed:
bin/sqoop export \
--connect jdbc:mysql://bigdata.ibeifeng.com:3306/sqoop \
--username root \
--password 123456 \
--table hive_result \
--export-dir /user/hive/warehouse/track_log.db/result \
-m 1 \
--input-fields-terminated-by '\001'
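Note that sqoop export issues plain INSERTs, so re-running the command will collide with the (date,hour) primary key. If re-runs are expected, Sqoop's upsert flags can be added (a sketch; --update-mode allowinsert relies on the MySQL connector's ON DUPLICATE KEY UPDATE support):
bin/sqoop export \
--connect jdbc:mysql://bigdata.ibeifeng.com:3306/sqoop \
--username root \
--password 123456 \
--table hive_result \
--export-dir /user/hive/warehouse/track_log.db/result \
-m 1 \
--input-fields-terminated-by '\001' \
--update-key date,hour \
--update-mode allowinsert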
(3) Check in MySQL:
mysql> select * from hive_result;
Result:
+----------+------+-------+-------+
| date | hour | pv | uv |
+----------+------+-------+-------+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+----------+------+-------+-------+