A Data Analysis Platform Based on the Hadoop Ecosystem
I. Project Design
Architecture Diagram
Design Goals
- Analyze the system's daily page views (PV, Page View)
- Analyze the visit count of each system module (MV, Module View), as sketched below
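As a rough, standalone illustration of the two metrics (not part of the pipeline built below), both can be computed directly from an nginx access log with awk. The log path is a hypothetical example, and the sketch assumes the default combined log format, with the module name taken as the first segment of the request URI.
# Hypothetical archived log produced by the rotation script described later
LOG=/usr/local/nginx/access_log/access_2017-01-01_00-00-00.log
# PV: every request line counts as one page view
awk 'END { print "PV:", NR }' "$LOG"
# MV: group requests by the first path segment of the URI (field $7 in the combined format)
awk '{ split($7, p, "/"); mv[p[2]]++ } END { for (m in mv) print m, mv[m] }' "$LOG"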
II. Environment Setup
Install Nginx and Configure Log Rotation
# Install gcc and other basic build dependencies
yum install gcc-c++ perl-devel pcre-devel openssl-devel zlib-devel wget
# Extract the nginx source
tar -zxvf nginx-1.11.1.tar.gz
# Configure nginx (run from inside the extracted source directory)
cd nginx-1.11.1
./configure --prefix=/usr/local/nginx
# Compile and install
make && make install
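After installation, nginx can be started from the prefix chosen above and given a quick smoke test (assuming the default configuration listening on port 80):
# Start nginx and verify that it answers requests
/usr/local/nginx/sbin/nginx
curl -I http://localhost/
# With the default configuration, access logs land in /usr/local/nginx/logs/access.log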
Log Rotation
[root@hadoop ~]# vi echo.sh
[root@hadoop ~]# chmod 777 echo.sh
#!/bin/bash
# nginx log rotation script
# Directory where nginx writes its access log
logs_path="/usr/local/nginx/logs/"
# Path of the nginx pid file
pid_path="/usr/local/nginx/logs/nginx.pid"
# Archive the current access log into the directory watched by Flume
mv ${logs_path}access.log /usr/local/nginx/access_log/access_$(date -d "yesterday" +"%Y-%m-%d_%H-%M-%S").log
# Send USR1 to the nginx master process so it reopens its log files
kill -USR1 `cat ${pid_path}`
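Before wiring the script into cron, it can be run once by hand to create the archive directory and confirm that a rotated file appears (the mkdir is only needed the first time):
# One-off test of the rotation script; assumes nginx is running and has logged some requests
mkdir -p /usr/local/nginx/access_log
/root/echo.sh
ls -l /usr/local/nginx/access_log/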
Define a Linux Cron Job
Periodically move the nginx access.log file into the archive directory; nginx then records subsequent user visits in a fresh access.log file.
[root@hadoop ~]# crontab -e
* * * * * /root/echo.sh
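The entry above runs the script every minute, which is handy for testing the pipeline; a production setup would more likely rotate once a day, for example:
# Hypothetical daily schedule: rotate the log at midnight
0 0 * * * /root/echo.sh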
Collect Nginx Access Logs with Flume
Agent Configuration
Source: SpoolDir
Channel: Kafka Channel
Sink: HDFS
[root@hadoop apache-flume-1.7.0-bin]# vi conf/nginxLog.properties
# nginxLog.properties: a single-node Flume configuration for collecting nginx access logs
# Name the components on this agent
a1.sources = r1
a1.sinks = k2
a1.channels = c2
# Describe/configure the source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /usr/local/nginx/access_log/
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = timestamp
# Describe the sink
#a1.sinks.k1.type = logger
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = /flume/events/%y-%m-%d/
a1.sinks.k2.hdfs.filePrefix = events-
a1.sinks.k2.hdfs.useLocalTimeStamp = true
#a1.sinks.k2.hdfs.round = true
#a1.sinks.k2.hdfs.roundValue = 10
#a1.sinks.k2.hdfs.roundUnit = minute
a1.sinks.k2.hdfs.fileType=DataStream
a1.channels.c2.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c2.kafka.bootstrap.servers = hadoop:9092
a1.channels.c2.kafka.topic = nginx
a1.channels.c2.kafka.consumer.group.id = flume-consumer
# Bind the source and sink to the channel
a1.sources.r1.channels = c2
a1.sinks.k2.channel = c2
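With the properties file in place (and the Kafka broker configured below already running, since the KafkaChannel connects to it at startup), the agent can be launched with the standard flume-ng command; console logging is enabled here only to make the first run easier to debug:
# Start the Flume agent "a1" defined in nginxLog.properties
[root@hadoop apache-flume-1.7.0-bin]# bin/flume-ng agent --conf conf --conf-file conf/nginxLog.properties --name a1 -Dflume.root.logger=INFO,console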
Install Kafka
The Flume agent uses a KafkaChannel, so a running Kafka environment is required.
Make sure ZooKeeper is running first.
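A quick way to confirm ZooKeeper is up before starting the broker (the zkServer.sh path below is an assumption; adjust it to wherever ZooKeeper is installed):
# A ZooKeeper server process shows up in jps as QuorumPeerMain
jps | grep QuorumPeerMain
# Or ask ZooKeeper for its status directly (install path is assumed)
/usr/local/zookeeper/bin/zkServer.sh status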
# Extract the Kafka distribution
[root@hadoop usr]# tar -zxvf kafka_2.11-0.11.0.0.tgz
# Edit the configuration file
[root@hadoop kafka_2.11-0.11.0.0]# vi config/server.properties
############################# Server Basics #############################
# The id of the broker. This must be set to a unique integer for each broker.
broker.id=0
# Switch to enable topic deletion or not, default value is false
#delete.topic.enable=true
############################# Socket Server Settings #############################
# The address the socket server listens on. It will get the value returned from
# java.net.InetAddress.getCanonicalHostName() if not configured.
# FORMAT:
# listeners = listener_name://host_name:port
# EXAMPLE:
# listeners = PLAINTEXT://your.host.name:9092
listeners=PLAINTEXT://hadoop:9092
# Hostname and port the broker will advertise to producers and consumers. If not set,
# it uses the value for "listeners" if configured.
############################# Zookeeper #############################
# Zookeeper connection string (see zookeeper docs for details).
# This is a comma separated host:port pairs, each corresponding to a zk
# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
# You can also append an optional chroot string to the urls to specify the
# root directory for all kafka znodes.
zookeeper.connect=hadoop:2181
# Timeout in ms for connecting to zookeeper
zookeeper.connection.timeout.ms=6000
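Once server.properties has been adjusted, a minimal sketch for bringing up the broker and creating the nginx topic used by the Flume KafkaChannel looks like this (run from the Kafka directory; a replication factor of 1 matches the single-broker setup above):
# Start the Kafka broker in the background
[root@hadoop kafka_2.11-0.11.0.0]# bin/kafka-server-start.sh -daemon config/server.properties
# Create the "nginx" topic written to by the Flume channel, then verify it exists
[root@hadoop kafka_2.11-0.11.0.0]# bin/kafka-topics.sh --create --zookeeper hadoop:2181 --replication-factor 1 --partitions 1 --topic nginx
[root@hadoop kafka_2.11-0.11.0.0]# bin/kafka-topics.sh --list --zookeeper hadoop:2181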