The agent in all of the examples below is named a1.
#taildir source
a1.sources = r1
a1.sources.r1.type = TAILDIR # use the TAILDIR source
a1.sources.r1.filegroups = f1 # space-separated list of file groups; multiple groups allow monitoring multiple directories
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.* # regex matching the files to tail (here: every app.* file in the log directory)
a1.sources.r1.positionFile = /opt/module/flume/taildir_position.json # position file recording read offsets, so tailing resumes where it left off after a restart
a1.sources.r1.interceptors = i1 # interceptor chain; custom interceptors are allowed
a1.sources.r1.interceptors.i1.type = com.xigema.flume.interceptor.ETLInterceptor$Builder # fully qualified name of the custom interceptor's inner Builder class
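The config above names a custom interceptor class. As a point of reference, here is a minimal sketch of what such an ETL interceptor could look like: it simply drops events whose body is not valid JSON. Only the class and Builder names come from the config above; the body, including the use of fastjson for validation, is an assumption rather than the original implementation.

package com.xigema.flume.interceptor;

import com.alibaba.fastjson.JSON;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class ETLInterceptor implements Interceptor {

    @Override
    public void initialize() { }

    // Return the event unchanged if its body is valid JSON, null otherwise.
    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        try {
            JSON.parseObject(body); // validation only; assumes fastjson is on the classpath
            return event;
        } catch (Exception e) {
            return null; // returning null drops the event
        }
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        events.removeIf(event -> intercept(event) == null);
        return events;
    }

    @Override
    public void close() { }

    // Flume instantiates interceptors through an inner Builder class,
    // which is why the config refers to ETLInterceptor$Builder.
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new ETLInterceptor();
        }

        @Override
        public void configure(Context context) { }
    }
}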
#=====================================================================
#kafka source
a1.sources = r1
a1.sources.r1.type = org.apache.flume.source.kafka.KafkaSource # use the Kafka source
a1.sources.r1.kafka.bootstrap.servers = ip1:9092,ip2:9092,ip3:9092 # Kafka bootstrap servers; one address is enough to connect, but listing several tolerates broker failures
a1.sources.r1.kafka.topics = topic_log # Kafka topic(s) to consume, comma-separated
a1.sources.r1.interceptors = i1 # interceptor chain
a1.sources.r1.interceptors.i1.type = com.xigema.flume.interceptor.TimeStampInterceptor$Builder # custom timestamp interceptor (inner Builder class)
a1.sources.r1.batchSize = 5000 # max number of events written to the channel per batch
a1.sources.r1.batchDurationMillis = 2000 # max time (ms) to wait before writing a batch, even if batchSize is not reached
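A timestamp interceptor like the one referenced above is typically used to copy the event time out of the log body into the event's "timestamp" header, so that the HDFS sink's %Y-%m-%d path escape (see the hdfs sink section below) partitions by event time rather than arrival time. A minimal sketch under that assumption follows; the JSON field name "ts" is an assumption as well.

package com.xigema.flume.interceptor;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;
import java.util.List;

public class TimeStampInterceptor implements Interceptor {

    @Override
    public void initialize() { }

    // Copy the event-time field from the JSON body into the "timestamp"
    // header, which the HDFS sink uses to resolve time escapes in hdfs.path.
    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        try {
            JSONObject json = JSON.parseObject(body);
            String ts = json.getString("ts"); // the "ts" field name is an assumption
            if (ts != null) {
                event.getHeaders().put("timestamp", ts);
            }
        } catch (Exception e) {
            // leave the event unchanged if the body is not valid JSON
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() { }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new TimeStampInterceptor();
        }

        @Override
        public void configure(Context context) { }
    }
}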
#=====================================================================
#kafka channel
a1.channels = c1
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel # use the Kafka channel
a1.channels.c1.kafka.bootstrap.servers = ip1:9092,ip2:9092 # Kafka cluster to connect to
a1.channels.c1.kafka.topic = topic_log # Kafka topic backing the channel
a1.channels.c1.parseAsFlumeEvent = false # store the raw event body in Kafka instead of an Avro-serialized Flume Event, so consumers see the original message without Flume headers
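A Kafka channel is often used with no sink at all: the source writes straight into the Kafka topic, and a separate downstream agent (or any Kafka consumer) reads it from there. A minimal sketch combining the taildir source above with this channel; the host names are placeholders.

a1.sources = r1
a1.channels = c1
a1.sources.r1.type = TAILDIR
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /opt/module/applog/log/app.*
a1.sources.r1.positionFile = /opt/module/flume/taildir_position.json
a1.channels.c1.type = org.apache.flume.channel.kafka.KafkaChannel
a1.channels.c1.kafka.bootstrap.servers = ip1:9092,ip2:9092
a1.channels.c1.kafka.topic = topic_log
a1.channels.c1.parseAsFlumeEvent = false
a1.sources.r1.channels = c1
# no sink: downstream consumers read topic_log directly from Kafka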
#=====================================================================
#memory channel
a1.channels = c1
a1.channels.c1.type = memory # use the memory channel
a1.channels.c1.capacity = 1000 # max number of events held in memory
a1.channels.c1.transactionCapacity = 100 # max events per transaction; must be <= capacity and >= the batch size of attached sources/sinks
#=====================================================================
#file channel
a1.channels = c1
a1.channels.c1.type = file # use the file channel
a1.channels.c1.checkpointDir = /opt/module/flume/checkpoint/behavior1 # directory where the checkpoint is stored
a1.channels.c1.dataDirs = /opt/module/flume/data/behavior1/ # directory (or comma-separated list of directories) where event data files are stored
a1.channels.c1.maxFileSize = 2146435071 # max size (bytes) of a single data file, just under 2 GB here
a1.channels.c1.capacity = 1000000 # max number of events the channel can hold
a1.channels.c1.keep-alive = 6 # seconds a put waits for free space before failing
When restarting Flume it is sometimes necessary to delete checkpointDir and dataDirs (for example, after a corrupted checkpoint); note that this also discards any events still buffered in the channel.
#=====================================================================
#hdfs sink
a1.sinks.k1.type = hdfs # use the HDFS sink
a1.sinks.k1.hdfs.path = /origin_data/gmall/log/topic_log/%Y-%m-%d # target HDFS path; %Y-%m-%d is filled in from the event's timestamp header (which the timestamp interceptor above provides)
a1.sinks.k1.hdfs.filePrefix = log- # prefix for the generated file names
a1.sinks.k1.hdfs.round = false # do not round the timestamp down when resolving the time escapes in the path
# the three parameters that control when a new file is rolled
a1.sinks.k1.hdfs.rollInterval = 1800 # roll after 1800 s (30 min)
a1.sinks.k1.hdfs.rollSize = 134217728 # roll after 128 MB, just under the typical HDFS block size
a1.sinks.k1.hdfs.rollCount = 0 # never roll based on event count
## write compressed output instead of a plain data stream
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop # compression codec; lzop requires the LZO libraries on the cluster
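Note that the fragments above omit the lines that bind sources and sinks to channels; an agent will not start without them. As a sketch, a downstream agent combining the kafka source, file channel, and hdfs sink exactly as configured above would additionally need:

a1.sources = r1
a1.channels = c1
a1.sinks = k1
# ... kafka source, file channel and hdfs sink properties as above ...
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1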