ZJF
1. Data Collection
1.1 What to Collect
Application user operation logs.
1.2 Data Source
Log production
When a user visits the home page and clicks an image they are interested in, an operation log line is produced, e.g.:
不忍直视.jpg this'is'zjf~log
不忍直视.jpg this'is'zjf~log
不忍直视.jpg this'is'zjf~log
FrameworkServlet 'dispatcherServlet': initialization started
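As a hedged illustration of how such lines might be produced, the sketch below assumes a log4j Logger inside the image-click handler; the class and method names are hypothetical and not part of the original project.

import org.apache.log4j.Logger;

// Hypothetical helper showing how the application could emit the operation
// log lines shown above whenever a user clicks an image.
public class ImageClickLogger {

    private static final Logger LOG = Logger.getLogger(ImageClickLogger.class);

    // Called from the controller that handles an image click.
    public static void logClick(String imgName) {
        // The marker this'is'zjf~log tags business logs so the cleaning job
        // can separate them from framework output such as the dispatcherServlet line.
        LOG.info(imgName + " this'is'zjf~log");
    }
}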
Log storage
log4j.rootLogger=INFO,stdout,file
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Threshold=INFO
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%m%n
# roll to a new log file according to the DatePattern (here, every minute)
log4j.appender.file=org.apache.log4j.DailyRollingFileAppender
log4j.appender.file.Threshold=INFO
log4j.appender.file.File=/home/logs/access.tmp
log4j.appender.file.DatePattern=yyyy-MM-dd-hh-mm'.log'
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%m%n
From the configuration above, logs are written under /home/logs/: the file for the current minute is access.tmp, and it is rolled to a dated .log file according to the DatePattern.
Collection method
Logs are collected with Flume.
Collection plan
TAILDIR source
memory channel
HDFS sink
Architecture
A single Flume agent (single node) is used for now.
Flume configuration
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /home/logs/.*log.*
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path =hdfs://192.168.64.30:9000/access_log/%y-%m-%d/%H-%M
a1.sinks.k1.hdfs.filePrefix = access_log
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.batchSize= 5
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat =Text
# roll: rules controlling when the sink rolls to a new HDFS file
a1.sinks.k1.hdfs.rollSize = 100
a1.sinks.k1.hdfs.rollCount = 1000000
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
2. Data Cleaning
2.1 Data to Be Cleaned
不忍直视.jpg this'is'zjf~log
不忍直视.jpg this'is'zjf~log
不忍直视.jpg this'is'zjf~log
FrameworkServlet 'dispatcherServlet': initialization started
Pull out the lines carrying the this'is'zjf~log marker, strip the marker, and prepend a UUID as the unique identifier of each cleaned line.
2.2 How to Clean
public static class CleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Keep only the business log lines that carry the marker.
        boolean contains = value.toString().contains("this'is'zjf~log");
        if (contains) {
            // Strip the marker and prepend a UUID as the unique id of the line.
            String imgName = value.toString().split("this'is'zjf~log")[0].trim();
            context.write(NullWritable.get(), new Text(java.util.UUID.randomUUID().toString() + " " + imgName));
        }
    }
}
public static void main(String[] args) throws Exception {
    // Prepare the Configuration object
    Configuration configuration = new Configuration();
    configuration.set("mapreduce.app-submission.cross-platform", "true");
    System.setProperty("HADOOP_USER_NAME", "root");
    configuration.addResource("core-site.xml");
    configuration.addResource("hdfs-site.xml");
    configuration.addResource("yarn-site.xml");
    configuration.addResource("mapred-site.xml");
    configuration.set(MRJobConfig.JAR, "E:\\大数据\\自己\\IDEA_WSCode\\MR\\target\\MR-0.0.1-SNAPSHOT.jar");
    // Prepare the Job object
    Job job = Job.getInstance(configuration);
    job.setJarByClass(App.class);
    // Configure the input format
    //job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(CombineTextInputFormat.class);
    CombineTextInputFormat.setMinInputSplitSize(job, 4194304); // 4 MB
    // Configure the output format
    job.setOutputFormatClass(TextOutputFormat.class);
    // job.setCombinerClass(WCReducer.class);
    // Input path
    TextInputFormat.addInputPath(job, new Path("/access_log/19-07-31/23-13"));
    // Output path
    TextOutputFormat.setOutputPath(job, new Path("/access_log/19-07-30clean3"));
    // Set the mapper class (no custom reducer is needed for the cleaning job)
    job.setMapperClass(CleanMapper.class);
    // Set the Mapper and job output key/value types
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    //job.submit();
    job.waitForCompletion(true);
}
Cleaned output
6ba75815-ba8e-47b0-80d4-9a281d1755ab 不忍直视.jpg
c8d8b2b6-479c-4963-9781-2955b4a79fc9 不忍直视.jpg
8b53f1d9-43bf-48b5-bb4e-beccc59723f1 不忍直视.jpg
31bc16f5-2ead-4fb8-bc25-1e76062ebb49 惊悚.jpg
18cd2d83-cf93-4768-9874-f5a3b2de7d3e 美艳.jpg
3. Data Storage
3.1 Java Code: Copying the Cleaned HDFS Files into HBase
package com.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import java.io.IOException;
public class HDFSStore {

    public static class MyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Pass every cleaned line through unchanged.
            context.write(NullWritable.get(), value);
        }
    }

    public static class MyReducer extends TableReducer<NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(NullWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text log : values) {
                // Cleaned line format: "<uuid> <imgName>"
                String[] lines = log.toString().split(" ");
                Put put = new Put(lines[0].getBytes()); // uuid as the row key
                //put.addColumn("cf1".getBytes(), "imgId".getBytes(), lines[1].getBytes());
                put.addColumn("cf1".getBytes(), "imgName".getBytes(), lines[1].getBytes());
                context.write(NullWritable.get(), put);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        // 1. Build the job configuration
        Configuration conf = new Configuration();
        conf.addResource("core-site.xml");
        conf.addResource("hdfs-site.xml");
        conf.addResource("yarn-site.xml");
        conf.addResource("mapred-site.xml");
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("hbase.zookeeper.quorum", "Hadoop00");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set(MRJobConfig.JAR, "E:\\大数据\\自己\\IDEA_WSCode\\MR\\target\\MR-0.0.1-SNAPSHOT.jar");
        Job job = Job.getInstance(conf);
        // 2. Set the input format and input path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("/access_log/19-07-31clean"));
        // Set the mapper and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(TableOutputFormat.class);
        // Target HBase table for the reducer output
        TableMapReduceUtil.initTableReducerJob(
                "zjf:t_log",
                MyReducer.class,
                job);
        // Submit the job and wait for completion
        //job.submit();
        job.waitForCompletion(true);
    }
}
3.2 Preparing the Data
Create the HBase namespace and table (a Java sketch follows this list):
zjf namespace
t_log table
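A minimal Java sketch for creating this namespace and table, assuming an HBase 2.x client API; the equivalent HBase shell commands would be create_namespace 'zjf' and create 'zjf:t_log', 'cf1'.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;

public class CreateLogTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "Hadoop00");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            // Namespace zjf and table zjf:t_log with column family cf1,
            // matching what the store job writes to.
            admin.createNamespace(NamespaceDescriptor.create("zjf").build());
            admin.createTable(TableDescriptorBuilder
                    .newBuilder(TableName.valueOf("zjf:t_log"))
                    .setColumnFamily(ColumnFamilyDescriptorBuilder.of("cf1"))
                    .build());
        }
    }
}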
HDFS data: the cleaned output in /access_log/19-07-31clean
3.3 Submit the Job and Check the Result
Executed successfully.
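To confirm the rows landed in HBase, a small scan sketch like the one below could be used (assuming an HBase 2.x client; Scan.setLimit is 2.x-only). It prints the UUID row keys and the cf1:imgName values stored by the job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class TLogScanCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "Hadoop00");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("zjf:t_log"));
             ResultScanner scanner = table.getScanner(new Scan().setLimit(5))) {
            for (Result r : scanner) {
                // Row key is the UUID; cf1:imgName holds the image name.
                byte[] img = r.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("imgName"));
                System.out.println(Bytes.toString(r.getRow()) + "  " + Bytes.toString(img));
            }
        }
    }
}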
4. Data Analysis
What to compute
- Count the clicks on each image to analyze user preferences.
How to compute
Use Hive for the statistics.
Advantages:
Clear logic
SQL is easy to understand and learn
Low cost
Hive statistics
Data extraction (Extract)
That is, clean and extract the data needed for the analysis.
Create the base storage table
drop table if exists mylog;
create table mylog(
    uuid string,
    imgname string
)
partitioned by (day string)
row format delimited fields terminated by ' '
lines terminated by '\n';
Load the data
load data inpath '/access_log/19-07-31clean' into table mylog partition(day='19-07-30');
Data Transform
Count the clicks on each image
Because the data volume is small, mylog is used directly as the source for the single-dimension table.
Create the dimension table
create table dim_img_go
(
    imgName string,
    count int
)
partitioned by (day string, dim string)
row format delimited fields terminated by ' ';
Per-dimension statistics
from mylog
insert into table dim_img_go partition (day = '19-08-04', dim = '00')
select 'all', count(1)
where day = '19-08-04'
insert into table dim_img_go partition (day = '19-08-04', dim = '01')
select imgName, count(1)
where day = '19-08-04'
group by imgName;
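To read the per-image counts (dim = '01') back out of Hive for a quick sanity check, a minimal Java sketch over Hive JDBC could look like the following; the HiveServer2 host, port 10000, and credentials are assumptions.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class DimImgGoReader {
    public static void main(String[] args) throws Exception {
        // Hive JDBC driver; host, port, and credentials are assumptions.
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://192.168.64.30:10000/default", "root", "");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery(
                 "select imgName, `count` from dim_img_go "
               + "where day = '19-08-04' and dim = '01'")) {
            while (rs.next()) {
                // One line per image: name and click count.
                System.out.println(rs.getString(1) + "\t" + rs.getInt(2));
            }
        }
    }
}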
Statistics result
5. Data Export
Data extraction
Details to watch for with partitions and multi-level partitions:
1) Flatten the two-level partition of the image-click table into a single-level partition.
create table dim_img_go_1p
(
    imgName string,
    count int,
    dayString string,
    dimString string
)
partitioned by (day string)
row format delimited fields terminated by ' ';
2) Load the data
insert into table dim_img_go_1p partition (day = '19-08-04')
select
    imgName,
    count,
    day,
    dim
from dim_img_go
where day = '19-08-04';
Export the data & result
./sqoop export \
--connect "jdbc:mysql://192.168.64.30:3306/mylog?useUnicode=true&characterEncoding=UTF-8" \
--username root \
--password root \
--table Mylog \
--export-dir /user/hive/warehouse/dim_img_go_1p/day=19-08-04 \
--input-fields-terminated-by ' ' \
;
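After the export, the rows can be spot-checked from MySQL. Below is a minimal JDBC sketch; the MySQL JDBC driver on the classpath and the column layout of the Mylog table are assumptions, so the result set is printed generically.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.Statement;

public class ExportCheck {
    public static void main(String[] args) throws Exception {
        // Connection settings match the sqoop export command above.
        String url = "jdbc:mysql://192.168.64.30:3306/mylog?useUnicode=true&characterEncoding=UTF-8";
        try (Connection conn = DriverManager.getConnection(url, "root", "root");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select * from Mylog")) {
            ResultSetMetaData meta = rs.getMetaData();
            while (rs.next()) {
                // Print every column of every exported row, whatever the schema is.
                StringBuilder row = new StringBuilder();
                for (int i = 1; i <= meta.getColumnCount(); i++) {
                    row.append(rs.getString(i)).append("\t");
                }
                System.out.println(row.toString().trim());
            }
        }
    }
}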
6. Data Visualization