Setting up the Flume runtime environment
Extract the installation package
[root@CentOS ~]# tar -zxf apache-flume-1.9.0-bin.tar.gz -C /usr/
Configuration file structure
# Declare the components
<Agent>.sources = <Source1> <Source2>
<Agent>.sinks = <Sink1> <Sink2>
<Agent>.channels = <Channel1> <Channel2>
# Configure each component
<Agent>.sources.<Source>.<someProperty> = <someValue>
<Agent>.channels.<Channel>.<someProperty> = <someValue>
<Agent>.sinks.<Sink>.<someProperty> = <someValue>
# Wire the components together
<Agent>.sources.<Source>.channels = <Channel1> <Channel2> ...
<Agent>.sinks.<Sink>.channel = <Channel1>
- First target machine
[root@CentOS apache-flume-1.9.0-bin]# vi conf/demo01.properties
# Declare the components
a1.sources = s1
a1.sinks = sk1
a1.channels = c1
# Configure each component
a1.sources.s1.type = TAILDIR
a1.sources.s1.filegroups = f1
a1.sources.s1.filegroups.f1 = /root/logs/userLoginFile.*
a1.channels.c1.type = memory
a1.sinks.sk1.type = avro
a1.sinks.sk1.hostname = 192.168.111.133
a1.sinks.sk1.port = 44444
# Wire the components together
a1.sources.s1.channels = c1
a1.sinks.sk1.channel = c1
- Second machine (192.168.111.133)
[root@CentOS apache-flume-1.9.0-bin]# vi conf/demo01.properties
# Declare the components
a1.sources = s1
a1.sinks = sk1
a1.channels = c1
# Configure each component
a1.sources.s1.type = avro
a1.sources.s1.bind = 192.168.111.133
a1.sources.s1.port = 44444
a1.channels.c1.type = memory
a1.sinks.sk1.type = file_roll
a1.sinks.sk1.sink.directory = /root/file_roll
a1.sinks.sk1.sink.rollInterval = 0
# Wire the components together
a1.sources.s1.channels = c1
a1.sinks.sk1.channel = c1
Startup
- Start the second machine first
[root@CentOS apache-flume-1.9.0-bin]# ./bin/flume-ng agent --conf conf/ --conf-file conf/demo01.properties --name a1
- Then start the first machine
[root@CentOS apache-flume-1.9.0-bin]# ./bin/flume-ng agent --conf conf/ --conf-file conf/demo01.properties --name a1
View the output log: tail -f <filename>
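To push some test traffic through this pipeline, append lines to a file matching /root/logs/userLoginFile.* on the first machine; a minimal sketch (the class name, file name, and message format are illustrative assumptions):

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class GenerateLoginLog {
    public static void main(String[] args) throws IOException {
        // Append a few lines to a file the TAILDIR source is watching
        try (PrintWriter out = new PrintWriter(new FileWriter("/root/logs/userLoginFile.log", true))) {
            for (int i = 0; i < 10; i++) {
                out.println("user" + i + " login " + System.currentTimeMillis());
            }
        }
    }
}

The collected events should then show up under /root/file_roll on the second machine.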
Avro Source (important)
- An Avro Sink is typically used to write events directly into an Avro Source. This setup usually means Flume is collecting local log files; the architecture is generally the one shown in the figure above, and the application server normally has to be deployed on the same physical host as the agent (server-side log collection).
- Clients call the SDK exposed by Flume and send data directly to the Avro Source (client/mobile-side collection).
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-sdk</artifactId>
<version>1.9.0</version>
</dependency>
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.api.RpcClient;
import org.apache.flume.api.RpcClientFactory;
import org.apache.flume.event.EventBuilder;

import java.util.Properties;

public class FlumeSdkClient {
    public static void main(String[] args) throws EventDeliveryException {
        Properties props = new Properties();
        // Use the load-balancing RPC client and register the candidate hosts
        props.put("client.type", "default_loadbalance");
        props.put("hosts", "h1 h2 h3");
        props.put("hosts.h1", "192.168.111.133:44444");
        props.put("hosts.h2", "192.168.111.133:44444");
        props.put("hosts.h3", "192.168.111.133:44444");
        props.put("host-selector", "random"); // or round_robin
        RpcClient client = RpcClientFactory.getInstance(props);
        Event event = EventBuilder.withBody("1 zhangsan true 28".getBytes());
        client.append(event);
        client.close();
    }
}
Avro Source | memory channel | Kafka Sink
[root@CentOS apache-flume-1.9.0-bin]# vi conf/demo02.properties
# Declare the components
a1.sources = s1
a1.sinks = sk1
a1.channels = c1
# Configure each component
a1.sources.s1.type = avro
a1.sources.s1.bind = 192.168.111.132
a1.sources.s1.port = 44444
a1.channels.c1.type = memory
a1.sinks.sk1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.sk1.kafka.bootstrap.servers = 192.168.111.132:9092
a1.sinks.sk1.kafka.topic = topic01
a1.sinks.sk1.flumeBatchSize = 20
a1.sinks.sk1.kafka.producer.acks = 1
a1.sinks.sk1.kafka.producer.linger.ms = 1
# Wire the components together
a1.sources.s1.channels = c1
a1.sinks.sk1.channel = c1
[root@CentOS apache-flume-1.9.0-bin]# ./bin/flume-ng agent --conf conf/ --conf-file conf/demo02.properties --name a1
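To verify this pipeline end to end, send a few events to the Avro source (for example with the SDK client above) and read them back from topic01; a minimal consumer sketch, assuming the kafka-clients dependency is on the classpath (class name and group id are illustrative):

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class Topic01Consumer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.111.132:9092");
        props.put("group.id", "flume-verify");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("topic01"));
            while (true) {
                // Print every event the Kafka sink has written to topic01
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.value());
                }
            }
        }
    }
}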
Integrating Flume with log4j
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-sdk</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.flume.flume-ng-clients</groupId>
<artifactId>flume-ng-log4jappender</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.5</version>
</dependency>
- log4j.properties
log4j.appender.flume = org.apache.flume.clients.log4jappender.LoadBalancingLog4jAppender
log4j.appender.flume.Hosts = 192.168.111.132:44444 192.168.111.132:44444 192.168.111.132:44444
log4j.appender.flume.Selector = ROUND_ROBIN
log4j.appender.flume.MaxBackoff = 30000
log4j.logger.com.baizhi = DEBUG,flume
- Test code
// The log4j config above routes loggers under com.baizhi to the flume appender
package com.baizhi;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class TestLog {
    private static Log log = LogFactory.getLog(TestLog.class);

    public static void main(String[] args) {
        log.debug("你好!_debug");
        log.info("你好!_info");
        log.warn("你好!_warn");
        log.error("你好!_error");
    }
}
Integrating Spring Boot, Flume, and logback
- Add logback.xml to the Spring Boot project
<?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true" scanPeriod="60 seconds" debug="false">
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender" >
<encoder>
<pattern>%p %c#%M %d{yyyy-MM-dd HH:mm:ss} %m%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<fileNamePattern>logs/userLoginFile-%d{yyyyMMdd}.log</fileNamePattern>
<maxHistory>30</maxHistory>
</rollingPolicy>
<encoder>
<pattern>%p %c#%M %d{yyyy-MM-dd HH:mm:ss} %m%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- log level for console output -->
<root level="ERROR">
<appender-ref ref="STDOUT" />
</root>
<!-- with additivity=false, log events are not propagated to the parent loggers' appenders -->
<logger name="com.baizhi.tests" level="INFO" additivity="false">
<appender-ref ref="flume" /><!-- send to the flume appender defined in the section below -->
<appender-ref ref="STDOUT" /><!-- also print to the console -->
</logger>
</configuration>
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class TestSpringBootLog {
    private static final Logger LOG = LoggerFactory.getLogger(TestSpringBootLog.class);
    public static void main(String[] args) {
        LOG.info("-----------------------");
    }
}
Integrating Flume + logback
- Find https://github.com/gilt/logback-flume-appender on GitHub
- Copy its source code into your project
- Add the Flume SDK matching the current Flume version to the Spring Boot project
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-sdk</artifactId>
<version>1.9.0</version>
</dependency>
- Add the flume appender to the project's logback.xml
<appender name="flume" class="com.gilt.logback.flume.FlumeLogstashV1Appender">
<flumeAgents>
192.168.111.132:44444,
192.168.111.132:44444,
192.168.111.132:44444
</flumeAgents>
<flumeProperties>
connect-timeout=4000;
request-timeout=8000
</flumeProperties>
<batchSize>1</batchSize>
<reportingWindow>1000</reportingWindow>
<additionalAvroHeaders>
myHeader=myValue
</additionalAvroHeaders>
<application>sampleapp</application>
<layout class="ch.qos.logback.classic.PatternLayout">
<pattern>%p %c#%M %d{yyyy-MM-dd HH:mm:ss} %m%n</pattern>
</layout>
</appender>
Writing your own custom appender
import ch.qos.logback.classic.spi.ILoggingEvent;
import ch.qos.logback.core.Layout;
import ch.qos.logback.core.UnsynchronizedAppenderBase;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.api.RpcClient;
import org.apache.flume.api.RpcClientFactory;
import org.apache.flume.event.EventBuilder;

import java.nio.charset.Charset;
import java.util.Properties;

public class BZFlumeLogAppender extends UnsynchronizedAppenderBase<ILoggingEvent> {

    // Comma-separated host:port list, injected from the <flumeAgents> element in logback.xml
    private String flumeAgents;
    protected Layout<ILoggingEvent> layout;
    private static RpcClient rpcClient;

    @Override
    protected void append(ILoggingEvent eventObject) {
        // Format the event with the configured layout, or fall back to the raw message
        String body = layout != null ? layout.doLayout(eventObject) : eventObject.getFormattedMessage();
        if (rpcClient == null) {
            rpcClient = buildRpcClient();
        }
        Event event = EventBuilder.withBody(body, Charset.forName("UTF-8"));
        try {
            rpcClient.append(event);
        } catch (EventDeliveryException e) {
            e.printStackTrace();
        }
    }

    public void setFlumeAgents(String flumeAgents) {
        this.flumeAgents = flumeAgents;
    }

    public void setLayout(Layout<ILoggingEvent> layout) {
        this.layout = layout;
    }

    private RpcClient buildRpcClient() {
        Properties props = new Properties();
        int i = 0;
        // Register every agent as hosts.h0, hosts.h1, ... (trim() strips whitespace from the XML value)
        for (String agent : flumeAgents.split(",")) {
            String[] tokens = agent.trim().split(":");
            props.put("hosts.h" + (i++), tokens[0] + ':' + tokens[1]);
        }
        StringBuffer buffer = new StringBuffer(i * 4);
        for (int j = 0; j < i; j++) {
            buffer.append("h").append(j).append(" ");
        }
        props.put("hosts", buffer.toString());
        // Only switch to the load-balancing client when more than one agent is configured
        if (i > 1) {
            props.put("client.type", "default_loadbalance");
            props.put("host-selector", "round_robin");
        }
        props.put("backoff", "true");
        props.put("maxBackoff", "10000");
        return RpcClientFactory.getInstance(props);
    }
}
<appender name="bz" class="com.baizhi.flume.BZFlumeLogAppender">
<flumeAgents>
192.168.111.132:44444,192.168.111.132:44444
</flumeAgents>
<layout class="ch.qos.logback.classic.PatternLayout">
<pattern>%p %c#%M %d{yyyy-MM-dd HH:mm:ss} %m</pattern>
</layout>
</appender>
Flume to HDFS (static batch processing)
Collect the log files in a directory into HDFS, and delete each log file once it has been fully collected (a batch-style job).
spooldir source, jdbc channel, HDFS sink
# Declare the components
a1.sources = s1
a1.sinks = sk1
a1.channels = c1
# Configure each component
a1.sources.s1.type = spooldir
a1.sources.s1.spoolDir = /root/spooldir
a1.sources.s1.deletePolicy = immediate
a1.sources.s1.includePattern = ^.*\.log$
a1.channels.c1.type = jdbc
a1.sinks.sk1.type = hdfs
a1.sinks.sk1.hdfs.path= hdfs:///flume/%y-%m-%d/
a1.sinks.sk1.hdfs.filePrefix = events-
a1.sinks.sk1.hdfs.useLocalTimeStamp = true
a1.sinks.sk1.hdfs.rollInterval = 0
a1.sinks.sk1.hdfs.rollSize = 0
a1.sinks.sk1.hdfs.rollCount = 0
a1.sinks.sk1.hdfs.fileType = DataStream
# Wire the components together
a1.sources.s1.channels = c1
a1.sinks.sk1.channel = c1
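The Spooling Directory Source only processes files that are complete and no longer being written, so test data should first be written under a name that does not match includePattern and then renamed; a minimal sketch (class and file names are illustrative assumptions):

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;

public class SpoolDirFeeder {
    public static void main(String[] args) throws Exception {
        // The .tmp name does not match ^.*\.log$, so the source ignores it until the rename
        Path tmp = Paths.get("/root/spooldir/events.tmp");
        List<String> lines = Arrays.asList("user1 login SUCCESS", "user2 login SUCCESS");
        Files.write(tmp, lines, StandardCharsets.UTF_8);
        // Renaming to a .log file hands the completed file over to the spooldir source
        Files.move(tmp, Paths.get("/root/spooldir/events-001.log"));
    }
}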
Log splitting (routing) example:
Requirement: collect only the user module's log stream; send records that need evaluation to the evaluatetopic topic, and all other user-module records to the usertopic topic.
# Declare the components
a1.sources = s1
a1.sinks = sk1 sk2
a1.channels = c1 c2
# Configure each component
a1.sources.s1.type = avro
a1.sources.s1.bind = 192.168.111.132
a1.sources.s1.port = 44444
# Interceptors
a1.sources.s1.interceptors = i1 i2
a1.sources.s1.interceptors.i1.type = regex_filter
a1.sources.s1.interceptors.i1.regex = .*UserController.*
a1.sources.s1.interceptors.i1.excludeEvents = false
a1.sources.s1.interceptors.i2.type = regex_extractor
a1.sources.s1.interceptors.i2.regex = .*(EVALUATE|SUCCESS).*
a1.sources.s1.interceptors.i2.serializers = s1
a1.sources.s1.interceptors.i2.serializers.s1.name = type
a1.channels.c1.type = memory
a1.channels.c2.type = memory
a1.sinks.sk1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.sk1.kafka.bootstrap.servers = 192.168.111.132:9092
a1.sinks.sk1.kafka.topic = evaluatetopic
a1.sinks.sk1.flumeBatchSize = 20
a1.sinks.sk1.kafka.producer.acks = 1
a1.sinks.sk1.kafka.producer.linger.ms = 1
a1.sinks.sk2.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.sk2.kafka.bootstrap.servers = 192.168.111.132:9092
a1.sinks.sk2.kafka.topic = usertopic
a1.sinks.sk2.flumeBatchSize = 20
a1.sinks.sk2.kafka.producer.acks = 1
a1.sinks.sk2.kafka.producer.linger.ms = 1
# Channel selector for splitting the stream
a1.sources.s1.selector.type = multiplexing
a1.sources.s1.selector.header = type
a1.sources.s1.selector.mapping.EVALUATE = c1
a1.sources.s1.selector.mapping.SUCCESS = c2
a1.sources.s1.selector.default = c2
# Wire the components together
a1.sources.s1.channels = c1 c2
a1.sinks.sk1.channel = c1
a1.sinks.sk2.channel = c2
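The regex_extractor interceptor stores the first capture group of its regex in the event header named type, and the multiplexing selector uses that header to pick a channel; a rough java.util.regex equivalent for illustration (the sample log lines are illustrative):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TypeHeaderDemo {
    public static void main(String[] args) {
        // Same pattern as interceptor i2: group 1 becomes the "type" header
        Pattern p = Pattern.compile(".*(EVALUATE|SUCCESS).*");
        String[] bodies = {
                "Mintzzz UserController Rage范 EVALUATE",
                "haonan UserController enming SUCCESS",
                "some other log line"};
        for (String body : bodies) {
            Matcher m = p.matcher(body);
            // No match -> no header -> the event falls through to selector.default (c2)
            String type = m.matches() ? m.group(1) : null;
            System.out.println(body + " -> type=" + type);
        }
    }
}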
Test:
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class TestLog {
    private static Log log = LogFactory.getLog(TestLog.class);

    public static void main(String[] args) {
        log.debug("你好!_debug");
        log.info("你好!_info");
        log.warn("Mintzzz UserController Rage范 EVALUATE");
        log.error("haonan UserController enming SUCCESS");
    }
}
Sink Processor
# Declare the components
a1.sources = s1
a1.sinks = sk1 sk2
a1.channels = c1
# Group sk1 and sk2 into one sink group
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = sk1 sk2
a1.sinkgroups.g1.processor.type = load_balance
a1.sinkgroups.g1.processor.backoff = true
a1.sinkgroups.g1.processor.selector = round_robin
# Configure the source properties
a1.sources.s1.type = avro
a1.sources.s1.bind = 192.168.111.132
a1.sources.s1.port = 44444
# Configure the sink properties
a1.sinks.sk1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.sk1.kafka.bootstrap.servers = 192.168.111.132:9092
a1.sinks.sk1.kafka.topic = evaluatetopic
a1.sinks.sk1.flumeBatchSize = 20
a1.sinks.sk1.kafka.producer.acks = 1
a1.sinks.sk1.kafka.producer.linger.ms = 1
a1.sinks.sk2.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.sk2.kafka.bootstrap.servers = 192.168.111.132:9092
a1.sinks.sk2.kafka.topic = usertopic
a1.sinks.sk2.flumeBatchSize = 20
a1.sinks.sk2.kafka.producer.acks = 1
a1.sinks.sk2.kafka.producer.linger.ms = 1
# Configure the channel properties
a1.channels.c1.type = memory
a1.channels.c1.transactionCapacity = 1
# Connect the source to the channel
a1.sources.s1.channels = c1
a1.sinks.sk1.channel = c1
a1.sinks.sk2.channel = c1