1. Environment setup
Install the JDK and the Hadoop 2.5 client; everything after that is plain configuration, and the config below is enough to get it running.
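A quick way to verify the environment before wiring up Flume (a minimal sketch; the JAVA_HOME path is an assumed example, adjust to the actual install location):

java -version      # Flume 1.6 needs a JDK; 1.7 is typical
hadoop version     # confirm the Hadoop 2.5 client is on the PATH
export JAVA_HOME=${JAVA_HOME:-/usr/java/default}   # assumed install path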
2. Configuration
a1.sources = r1
a1.channels = c2
a1.sinks = k2
a1.sources.r1.type = exec
a1.sources.r1.command = tail -n +0 -F /opt/nginx/logs/link.log
a1.sources.r1.channels = c2
a1.sources.r1.batchSize = 200
#a1.sources.r1.useHost = true
# Option 1: memory channel
a1.channels.c2.type = memory
a1.channels.c2.capacity = 2000000
a1.channels.c2.keep-alive = 6
a1.channels.c2.transactionCapacity = 20000
# Option 2: Kafka channel. Enable either this or the memory channel above, never both;
# duplicate keys in a properties file silently override, so leave one block commented out.
#a1.channels.c2.type = org.apache.flume.channel.kafka.KafkaChannel
#a1.channels.c2.capacity = 10000
#a1.channels.c2.brokerList = broker1:9092,broker2ip:9092,broker3ip:9092
#a1.channels.c2.topic = ifoxchannel
#a1.channels.c2.groupId = ifox
#a1.channels.c2.zookeeperConnect = z1:2181,z2:2181,z3:2181
#a1.channels.c2.transactionCapacity = 1000
a1.sinks.k2.type = hdfs
a1.sinks.k2.channel = c2
a1.sinks.k2.hdfs.path = hdfs://buffercluster1/user/hive/warehouse/log_text/%Y%m%d/%Y%m%d%H
a1.sinks.k2.hdfs.useLocalTimeStamp = true
a1.sinks.k2.hdfs.filePrefix = logs_bx_14_244-%Y%m%d%H
# default fileType is SequenceFile; DataStream writes plain text
a1.sinks.k2.hdfs.fileType = DataStream
a1.sinks.k2.hdfs.writeFormat = Text
a1.sinks.k2.hdfs.round = true
a1.sinks.k2.hdfs.roundValue = 1
a1.sinks.k2.hdfs.roundUnit = hour
# callTimeout defaults to 10 seconds; raise it if HDFS call timeout errors appear
a1.sinks.k2.hdfs.callTimeout = 30000
# do not set rollInterval too low, or Flume will produce lots of small HDFS files
a1.sinks.k2.hdfs.rollInterval = 3600
a1.sinks.k2.hdfs.rollSize = 0
a1.sinks.k2.hdfs.rollCount = 0
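With the %Y%m%d/%Y%m%d%H escapes and the hourly round above, files land in one directory per hour. A sketch of the expected layout (the date and the counter suffix are hypothetical; the HDFS sink appends a counter to the prefix, and the default .tmp suffix marks the file still being written):

hdfs dfs -ls /user/hive/warehouse/log_text/20150801/2015080113
# logs_bx_14_244-2015080113.1438398000000        <- closed after rollInterval (3600 s)
# logs_bx_14_244-2015080113.1438398000001.tmp    <- currently open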
3. Startup
SCRIPT_NAME=$(readlink -f "$0")
dir=$(dirname "${SCRIPT_NAME}")
cd "$dir"
if [[ -z $1 ]]
then
echo "error! must give the name of the config file!"
exit 1
else
# config file name without the .conf suffix, e.g. "agent1" for agent1.conf
con_file=$1
fi
log_dir=/opt/hadoop/flume/user_logs
mkdir -p "${log_dir}"
nohup /opt/hadoop/flume/bin/flume-ng agent --conf /opt/hadoop/flume/conf -f /opt/hadoop/flume/conf/${con_file}.conf -Dflume.root.logger=INFO,console -n a1 > ${log_dir}/${con_file}.log 2>&1 &
echo $! > ${log_dir}/${con_file}.pid  # save the PID so the agent can be stopped later
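For shutdown, the saved PID file can drive a matching stop script (a minimal sketch; the script itself and its error handling are assumptions, not part of the original setup):

con_file=$1
log_dir=/opt/hadoop/flume/user_logs
pid_file=${log_dir}/${con_file}.pid
if [[ -f ${pid_file} ]]
then
kill $(cat ${pid_file}) && rm -f ${pid_file}   # terminate the agent and clean up
else
echo "no pid file found for ${con_file}"
exit 1
fi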
4. New features in 1.6.0
1.6.0 adds support for the Kafka channel, which makes it easy to integrate Flume, Kafka, and HDFS in one pipeline; worth digging into when time permits.
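One pattern this enables: the collector agent writes events into the Kafka channel, and a second, sink-only agent on the Hadoop side drains the same topic to HDFS, so Kafka itself is the buffer between the tiers. A sketch (agent name a2, group id ifox-hdfs, and the sink-only layout are illustrative assumptions):

a2.channels = c1
a2.sinks = k1
# no source: the Kafka channel pulls events straight from the topic
a2.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a2.channels.c1.brokerList = broker1:9092,broker2ip:9092,broker3ip:9092
a2.channels.c1.topic = ifoxchannel
a2.channels.c1.groupId = ifox-hdfs
a2.channels.c1.zookeeperConnect = z1:2181,z2:2181,z3:2181
a2.sinks.k1.type = hdfs
a2.sinks.k1.channel = c1
a2.sinks.k1.hdfs.path = hdfs://buffercluster1/user/hive/warehouse/log_text/%Y%m%d
a2.sinks.k1.hdfs.useLocalTimeStamp = true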
5. Starting Kafka
bin/kafka-server-start.sh -daemon config/server.properties
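The Kafka channel's topic has to exist before the agent starts (unless the broker auto-creates topics). It can be created up front; the partition and replica counts below are assumptions:

bin/kafka-topics.sh --create --zookeeper z1:2181,z2:2181,z3:2181 --topic ifoxchannel --partitions 3 --replication-factor 2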
6. Stopping Kafka
bin/kafka-server-stop.sh