Flume 拦截器
时间戳拦截器
flume-timestamp.conf
#1.定义agent名, source、channel、sink的名称
a4.sources = r1
a4.channels = c1
a4.sinks = k1
#2.具体定义source
a4.sources.r1.type = spooldir
a4.sources.r1.spoolDir = /opt/wind
#定义拦截器,为文件最后添加时间戳
a4.sources.r1.interceptors = timestamp
a4.sources.r1.interceptors.timestamp.type = org.apache.flume.interceptor.TimestampInterceptor$Builder
#具体定义channel
a4.channels.c1.type = memory
a4.channels.c1.capacity = 10000
a4.channels.c1.transactionCapacity = 100
#具体定义sink
a4.sinks.k1.type = hdfs
a4.sinks.k1.hdfs.path = hdfs://bigdata111:9000/flume-interceptors/%H
a4.sinks.k1.hdfs.filePrefix = events-
a4.sinks.k1.hdfs.fileType = DataStream
#不按照条数生成文件
a4.sinks.k1.hdfs.rollCount = 0
#HDFS上的文件达到128M时生成一个文件
a4.sinks.k1.hdfs.rollSize = 134217728
#HDFS上的文件达到60秒生成一个文件
a4.sinks.k1.hdfs.rollInterval = 60
#组装source、channel、sink
a4.sources.r1.channels = c1
a4.sinks.k1.channel = c1
启动命令
/opt/module/flume-1.8.0/bin/flume-ng agent -n a4 \
-f /opt/module/flume-1.8.0/myconf/flume-timestamp.conf \
-c /opt/module/flume-1.8.0/conf \
-Dflume.root.logger=INFO,console
主机名拦截器
flume-host.conf
#1.定义agent
a1.sources= r1
a1.sinks = k1
a1.channels = c1
#2.定义source
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
#拦截器
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = host
#参数为true时用IP192.168.1.111,参数为false时用主机名,默认为true
a1.sources.r1.interceptors.i1.useIP = false
a1.sources.r1.interceptors.i1.hostHeader = agentHost
#3.定义sinks
a1.sinks.k1.type=hdfs
a1.sinks.k1.hdfs.path = hdfs://bigdata111:9000/flumehost/%{agentHost}
a1.sinks.k1.hdfs.filePrefix = plus_%{agentHost}
#往生成的文件加后缀名.log
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.useLocalTimeStamp = true
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
UUID拦截器
flume-uuid.conf
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
a1.sources.r1.interceptors = i1
#type的参数不能写成uuid,得写具体,否则找不到类
a1.sources.r1.interceptors.i1.type = org.apache.flume.sink.solr.morphline.UUIDInterceptor$Builder
#如果UUID头已经存在,它应该保存
a1.sources.r1.interceptors.i1.preserveExisting = true
a1.sources.r1.interceptors.i1.prefix = UUID_
#如果sink类型改为HDFS,那么在HDFS的文本中没有headers的信息数据
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
查询替换拦截器
flume-search_replace.conf
#1 agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#2 source
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = search_replace
#遇到数字改成wind,A123会替换为Awind
a1.sources.r1.interceptors.i1.searchPattern = [0-9]+
a1.sources.r1.interceptors.i1.replaceString = wind
a1.sources.r1.interceptors.i1.charset = UTF-8
#3 sink
a1.sinks.k1.type = logger
#4 Chanel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
#5 bind
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
正则过滤拦截器
flume-filter.conf
#1 agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#2 source
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_filter
a1.sources.r1.interceptors.i1.regex = ^A.*
#如果excludeEvents设为false,表示过滤掉不是以A开头的events。如果excludeEvents设为true,则表示过滤掉以A开头的events。
a1.sources.r1.interceptors.i1.excludeEvents = true
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
正则抽取拦截器
flume-extractor.conf
#1 agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#2 source
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = regex_extractor
# hostname is bigdata111 ip is 192.168.20.111
a1.sources.r1.interceptors.i1.regex = hostname is (.*?) ip is (.*)
a1.sources.r1.interceptors.i1.serializers = s1 s2
#hostname(自定义)= (.*?)->bigdata111
a1.sources.r1.interceptors.i1.serializers.s1.name = hostname
#ip(自定义) = (.*)->192.168.20.111
a1.sources.r1.interceptors.i1.serializers.s2.name = ip
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
自定义拦截器
字母小写变大写
第一步:添加 flume 依赖
<dependencies>
<!-- flume核心依赖 -->
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.8.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- 打包插件 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
<mainClass></mainClass>
</manifest>
</archive>
</configuration>
</plugin>
<!-- 编译插件 -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>utf-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
第二步:IDEA 自定义实现拦截器
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.util.ArrayList;
import java.util.List;
public class MyInterceptor implements Interceptor {
public void initialize() {
}
/**
* 拦截器实现方法
* @param event 一个事件 封装了一行数据,有header和body(具体的数据)
* */
public Event intercept(Event event) {
//获取带body
byte[] oldbody = event.getBody();
//将数据转换为大写
byte[] bytes = new String(oldbody).toUpperCase().getBytes();
//将转换后的数据分装成一个event,返回
event.setBody(bytes);
return event;
}
public List<Event> intercept(List<Event> list) {
ArrayList<Event> events = new ArrayList<Event>();
//循环每个事件的数据转换成大写
for(Event event : list){
events.add(intercept(event));
}
return events;
}
public void close() {
}
public static class Builder implements Interceptor.Builder{
public Interceptor build() {
return new MyInterceptor();
}
public void configure(Context context) {
}
}
}

创建 flume-myInterceptor.conf
#1 agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
#2 source
a1.sources.r1.type = exec
a1.sources.r1.channels = c1
a1.sources.r1.command = tail -F /opt/wind
#定义拦截器规则
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = flume.MyInterceptor$Builder
a1.sinks.k1.type = logger
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

本文介绍了Flume的几种常用拦截器,包括时间戳拦截器、主机名拦截器、UUID拦截器、查询替换拦截器和正则过滤抽取拦截器。还详细讲解了如何自定义拦截器,如将字母转为大写,并提供了相应的配置文件示例。
3564

被折叠的 条评论
为什么被折叠?



