从文件名中提取日期、小时信息,以决定数据写入HDFS上哪一天、哪一小时的分区目录。
为此需要自定义一个拦截器:
package interceptor;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.apache.flume.interceptor.RegexExtractorInterceptorPassThroughSerializer;
import org.apache.flume.interceptor.RegexExtractorInterceptorSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
/**
 * Interceptor that extracts matches using a specified regular expression and
 * appends the matches to the event headers using the specified serializers.
 * <p>
 * Note that all regular expression matching occurs through Java's built in
 * java.util.regex package.
 * <p>
 * Unlike a body-only regex extractor, this interceptor can be configured to
 * match against the value of an event header instead of the event body.
 * <p>
 * Properties:
 * <ul>
 * <li>regex: The regex to use.</li>
 * <li>serializers: Specifies the group the serializer will be applied to, and
 * the name of the header that will be added. The n-th serializer is applied to
 * the n-th capture group. If no type is specified for a serializer the default
 * {@link RegexExtractorInterceptorPassThroughSerializer} will be used. Groups
 * beyond the number of configured serializers are skipped.</li>
 * <li>extractorHeader: When true the regex is matched against the value of
 * the header named by extractorHeaderKey; when false (the default) it is
 * matched against the event body decoded as UTF-8.</li>
 * <li>extractorHeaderKey: Name of the header to match against. Required when
 * extractorHeader is true.</li>
 * </ul>
 * <p>
 * Sample config:
 * <pre>{@code
 * agent.sources.r1.channels = c1
 * agent.sources.r1.type = SEQ
 * agent.sources.r1.interceptors = i1
 * agent.sources.r1.interceptors.i1.type = interceptor.RegexExtractorExtInterceptor$Builder
 * agent.sources.r1.interceptors.i1.regex = (WARNING)|(ERROR)|(FATAL)
 * agent.sources.r1.interceptors.i1.serializers = s1 s2
 * agent.sources.r1.interceptors.i1.serializers.s1.type = com.blah.SomeSerializer
 * agent.sources.r1.interceptors.i1.serializers.s1.name = warning
 * agent.sources.r1.interceptors.i1.serializers.s2.type = org.apache.flume.interceptor.RegexExtractorInterceptorTimestampSerializer
 * agent.sources.r1.interceptors.i1.serializers.s2.name = error
 * agent.sources.r1.interceptors.i1.serializers.s2.dateFormat = yyyy-MM-dd
 * }</pre>
 * <p>
 * Example 1:
 * <pre>{@code
 * EventBody: 1:2:3.4foobar5
 *
 * Configuration:
 * agent.sources.r1.interceptors.i1.regex = (\\d):(\\d):(\\d)
 * agent.sources.r1.interceptors.i1.serializers = s1 s2 s3
 * agent.sources.r1.interceptors.i1.serializers.s1.name = one
 * agent.sources.r1.interceptors.i1.serializers.s2.name = two
 * agent.sources.r1.interceptors.i1.serializers.s3.name = three
 *
 * results in an event with the following
 *
 * body: 1:2:3.4foobar5 headers: one=>1, two=>2, three=>3
 * }</pre>
 * <p>
 * Example 2:
 * <pre>{@code
 * EventBody: 1:2:3.4foobar5
 *
 * Configuration:
 * agent.sources.r1.interceptors.i1.regex = (\\d):(\\d):(\\d)
 * agent.sources.r1.interceptors.i1.serializers = s1 s2
 * agent.sources.r1.interceptors.i1.serializers.s1.name = one
 * agent.sources.r1.interceptors.i1.serializers.s2.name = two
 *
 * results in an event with the following
 *
 * body: 1:2:3.4foobar5 headers: one=>1, two=>2
 * }</pre>
 * <p>
 * Example 3 (matching a header instead of the body):
 * <pre>{@code
 * Header: basename=data.log.20151111
 *
 * Configuration:
 * agent.sources.r1.interceptors.i1.regex = (.*)\\.(.*)\\.(.*)
 * agent.sources.r1.interceptors.i1.extractorHeader = true
 * agent.sources.r1.interceptors.i1.extractorHeaderKey = basename
 * agent.sources.r1.interceptors.i1.serializers = s1 s2 s3
 * agent.sources.r1.interceptors.i1.serializers.s1.name = one
 * agent.sources.r1.interceptors.i1.serializers.s2.name = two
 * agent.sources.r1.interceptors.i1.serializers.s3.name = three
 *
 * results in an event with the following added headers
 *
 * one=>data, two=>log, three=>20151111
 * }</pre>
 */
public class RegexExtractorExtInterceptor implements Interceptor {

  static final String REGEX = "regex";
  static final String SERIALIZERS = "serializers";

  // Added code begins: configuration keys for header-based extraction.
  static final String EXTRACTOR_HEADER = "extractorHeader";
  static final boolean DEFAULT_EXTRACTOR_HEADER = false;
  static final String EXTRACTOR_HEADER_KEY = "extractorHeaderKey";
  // Added code ends.

  private static final Logger logger = LoggerFactory
      .getLogger(RegexExtractorExtInterceptor.class);

  private final Pattern regex;
  private final List<NameAndSerializer> serializers;

  // Added code begins: whether to match a header value, and which header.
  private final boolean extractorHeader;
  private final String extractorHeaderKey;
  // Added code ends.

  /**
   * Instances are created through {@link Builder#build()}.
   *
   * @param regex              compiled pattern applied to each event
   * @param serializers        serializers in capture-group order
   * @param extractorHeader    true to match a header value instead of the body
   * @param extractorHeaderKey name of the header to match; only read when
   *                           {@code extractorHeader} is true
   */
  private RegexExtractorExtInterceptor(Pattern regex,
      List<NameAndSerializer> serializers, boolean extractorHeader,
      String extractorHeaderKey) {
    this.regex = regex;
    this.serializers = serializers;
    this.extractorHeader = extractorHeader;
    this.extractorHeaderKey = extractorHeaderKey;
  }

  @Override
  public void initialize() {
    // NO-OP...
  }

  @Override
  public void close() {
    // NO-OP...
  }

  /**
   * Matches the configured regex against either the configured header value
   * or the UTF-8 decoded body, and stores each serialized capture group in
   * the event headers under the corresponding serializer's header name.
   *
   * @param event the event to inspect; its header map is modified in place
   * @return the same event instance; it is returned unchanged when the regex
   *         does not match or when the configured source header is absent
   */
  @Override
  public Event intercept(Event event) {
    Map<String, String> headers = event.getHeaders();

    String input;
    if (extractorHeader) {
      input = headers.get(extractorHeaderKey);
      if (input == null) {
        // Pattern.matcher(null) throws NullPointerException, so an event
        // that lacks the source header is passed through untouched.
        logger.debug("Header {} is absent, leaving event unchanged",
            extractorHeaderKey);
        return event;
      }
    } else {
      input = new String(event.getBody(), Charsets.UTF_8);
    }

    Matcher matcher = regex.matcher(input);
    if (matcher.find()) {
      for (int group = 0, count = matcher.groupCount(); group < count; group++) {
        // Capture groups are 1-based while the serializer list is 0-based.
        int groupIndex = group + 1;
        if (groupIndex > serializers.size()) {
          logger.debug("Skipping group {} to {} due to missing serializer",
              group, count);
          break;
        }
        NameAndSerializer serializer = serializers.get(group);
        logger.debug("Serializing {} using {}", serializer.headerName,
            serializer.serializer);
        headers.put(serializer.headerName,
            serializer.serializer.serialize(matcher.group(groupIndex)));
      }
    }
    return event;
  }

  /**
   * Applies {@link #intercept(Event)} to every event of the batch.
   *
   * @param events the events to process
   * @return a new list holding the non-null results in the original order
   */
  @Override
  public List<Event> intercept(List<Event> events) {
    List<Event> intercepted = Lists.newArrayListWithCapacity(events.size());
    for (Event event : events) {
      Event interceptedEvent = intercept(event);
      if (interceptedEvent != null) {
        intercepted.add(interceptedEvent);
      }
    }
    return intercepted;
  }

  /**
   * Builder that reads the interceptor configuration from a Flume
   * {@link Context} and creates {@link RegexExtractorExtInterceptor}
   * instances.
   */
  public static class Builder implements Interceptor.Builder {

    private Pattern regex;
    private List<NameAndSerializer> serializerList;

    // Added code begins: header-extraction settings.
    private boolean extractorHeader;
    private String extractorHeaderKey;
    // Added code ends.

    private final RegexExtractorInterceptorSerializer defaultSerializer =
        new RegexExtractorInterceptorPassThroughSerializer();

    /**
     * Reads {@code regex}, {@code serializers}, {@code extractorHeader} and
     * {@code extractorHeaderKey} from the context.
     *
     * @param context interceptor configuration
     * @throws IllegalArgumentException if the regex or the serializer list is
     *         empty, a serializer has no name, or {@code extractorHeader} is
     *         true while {@code extractorHeaderKey} is empty
     */
    @Override
    public void configure(Context context) {
      String regexString = context.getString(REGEX);
      Preconditions.checkArgument(!StringUtils.isEmpty(regexString),
          "Must supply a valid regex string");
      // Pattern.compile rejects an invalid expression with an exception.
      regex = Pattern.compile(regexString);

      configureSerializers(context);

      // Added code begins.
      extractorHeader = context.getBoolean(EXTRACTOR_HEADER,
          DEFAULT_EXTRACTOR_HEADER);
      if (extractorHeader) {
        extractorHeaderKey = context.getString(EXTRACTOR_HEADER_KEY);
        Preconditions.checkArgument(
            !StringUtils.isEmpty(extractorHeaderKey),
            "必须指定要抽取内容的header key");
      }
      // Added code ends.
    }

    /**
     * Builds the ordered serializer list from the whitespace-separated
     * {@code serializers} property and the per-serializer sub-properties
     * {@code type} (defaults to the pass-through serializer) and
     * {@code name} (mandatory header name).
     */
    private void configureSerializers(Context context) {
      String serializerListStr = context.getString(SERIALIZERS);
      Preconditions.checkArgument(
          !StringUtils.isEmpty(serializerListStr),
          "Must supply at least one name and serializer");

      String[] serializerNames = serializerListStr.split("\\s+");
      Context serializerContexts = new Context(
          context.getSubProperties(SERIALIZERS + "."));

      serializerList = Lists
          .newArrayListWithCapacity(serializerNames.length);
      for (String serializerName : serializerNames) {
        Context serializerContext = new Context(
            serializerContexts.getSubProperties(serializerName + "."));
        String type = serializerContext.getString("type", "DEFAULT");
        String name = serializerContext.getString("name");
        Preconditions.checkArgument(!StringUtils.isEmpty(name),
            "Supplied name cannot be empty.");

        if ("DEFAULT".equals(type)) {
          serializerList.add(new NameAndSerializer(name,
              defaultSerializer));
        } else {
          serializerList.add(new NameAndSerializer(name,
              getCustomSerializer(type, serializerContext)));
        }
      }
    }

    /**
     * Instantiates the serializer class with the given name through its
     * no-argument constructor and configures it.
     *
     * @param clazzName fully qualified serializer class name
     * @param context   sub-context holding the serializer's own properties
     * @return the configured serializer
     * @throws RuntimeException wrapping any failure to load, instantiate or
     *         configure the serializer
     */
    private RegexExtractorInterceptorSerializer getCustomSerializer(
        String clazzName, Context context) {
      try {
        RegexExtractorInterceptorSerializer serializer =
            (RegexExtractorInterceptorSerializer) Class
                .forName(clazzName).newInstance();
        serializer.configure(context);
        return serializer;
      } catch (Exception e) {
        logger.error("Could not instantiate event serializer.", e);
        // propagate() always throws; the explicit throw tells the compiler
        // so, making a fallback return value unnecessary.
        throw Throwables.propagate(e);
      }
    }

    /**
     * Creates the interceptor from the previously configured values.
     *
     * @return a new interceptor instance
     * @throws IllegalArgumentException if no regex or no serializer has been
     *         configured
     */
    @Override
    public Interceptor build() {
      Preconditions.checkArgument(regex != null,
          "Regex pattern was misconfigured");
      Preconditions.checkArgument(serializerList.size() > 0,
          "Must supply a valid group match id list");
      return new RegexExtractorExtInterceptor(regex, serializerList,
          extractorHeader, extractorHeaderKey);
    }
  }

  /** Pairs a target header name with the serializer producing its value. */
  static class NameAndSerializer {

    private final String headerName;
    private final RegexExtractorInterceptorSerializer serializer;

    public NameAndSerializer(String headerName,
        RegexExtractorInterceptorSerializer serializer) {
      this.headerName = headerName;
      this.serializer = serializer;
    }
  }
}
项目的pom.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the jar that contains the custom Flume interceptor. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates; no <packaging> is declared. -->
<groupId>com.flume-dev</groupId>
<artifactId>com.flume-dev</artifactId>
<name>com.flume-dev</name>
<version>1.0.0</version>
<build>
<plugins>
<!-- maven-jar-plugin is declared without a version or any configuration. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
</plugin>
</plugins>
</build>
<dependencies>
<!-- Flume 1.5.0 artifacts; the interceptor source imports org.apache.flume.* classes. -->
<!-- NOTE(review): the interceptor also imports commons-lang (StringUtils) and Guava, -->
<!-- which are not declared here; presumably resolved transitively via Flume (confirm). -->
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-sdk</artifactId>
<version>1.5.0</version>
</dependency>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.5.0</version>
</dependency>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-configuration</artifactId>
<version>1.5.0</version>
</dependency>
<!-- Logging API; the interceptor uses org.slf4j.Logger and LoggerFactory. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.6.1</version>
</dependency>
<!-- JUnit is limited to the test scope. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
创建flume的插件存放目录
# Create the plugins.d layout for the custom interceptor under the Flume home.
# Quoting keeps a FLUME_HOME containing spaces intact; mkdir -p creates the
# missing parent (plugins.d) and does not fail when a directory already exists,
# so the commands can be re-run safely.
cd "$FLUME_HOME"
mkdir -p plugins.d/RegexExtractorExtInterceptor
cd plugins.d/RegexExtractorExtInterceptor
mkdir -p lib libext native
然后将自己的jar包扔进lib目录,再编写flume数据流配置文件
fileheader.properties
# Flume agent "agent-1": spooling-directory source -> file channel -> HDFS sink.
# The target HDFS directory is chosen from a header extracted from the file name.

# File channel ch-1 with its checkpoint and data directories.
agent-1.channels.ch-1.type = file
agent-1.channels.ch-1.checkpointDir= /root/temp/fileheader/checkpoint
agent-1.channels.ch-1.dataDirs= /root/temp/fileheader/data

# Spooling-directory source reading /root/test and writing to ch-1.
agent-1.sources.src-1.type = spooldir
agent-1.sources.src-1.channels = ch-1
agent-1.sources.src-1.spoolDir = /root/test
agent-1.sources.src-1.deletePolicy= never
# fileHeader / basenameHeader: presumably add the file path and the file's base
# name as event headers; the interceptor below reads the "basename" header
# (confirm the default header keys against the Flume user guide).
agent-1.sources.src-1.fileHeader = true
agent-1.sources.src-1.basenameHeader = true

# Interceptor chain: i1 (custom regex extractor) followed by hosti (host).
agent-1.sources.src-1.interceptors =i1 hosti
agent-1.sources.src-1.interceptors.i1.type = interceptor.RegexExtractorExtInterceptor$Builder
# Three capture groups separated by literal dots. Assuming standard .properties
# escaping, "\\" is read as one backslash, giving the regex (.*)\.(.*)\.(.*)
agent-1.sources.src-1.interceptors.i1.regex=(.*)\\.(.*)\\.(.*)
# Match the regex against the "basename" header instead of the event body.
agent-1.sources.src-1.interceptors.i1.extractorHeader=true
agent-1.sources.src-1.interceptors.i1.extractorHeaderKey=basename
# Serializers are applied in order: group 1 -> header "one", group 2 -> "two",
# group 3 -> "three". No type is given, so the default serializer is used.
agent-1.sources.src-1.interceptors.i1.serializers=s1 s2 s3
agent-1.sources.src-1.interceptors.i1.serializers.s1.name=one
agent-1.sources.src-1.interceptors.i1.serializers.s2.name=two
agent-1.sources.src-1.interceptors.i1.serializers.s3.name=three
# Host interceptor; presumably provides the %{host} header used by filePrefix
# below (confirm the default header name against the Flume user guide).
agent-1.sources.src-1.interceptors.hosti.type = host
agent-1.sources.src-1.interceptors.hosti.useIP=false

# HDFS sink reading from ch-1. "xxx:port" is a placeholder for the NameNode
# address. %{three} is replaced by the "three" header, i.e. the third regex group.
agent-1.sinks.sink_hdfs.channel = ch-1
agent-1.sinks.sink_hdfs.type = hdfs
agent-1.sinks.sink_hdfs.hdfs.path = hdfs://xxx:port/tmp/events110/fileheader/%{three}
agent-1.sinks.sink_hdfs.hdfs.filePrefix = logs.%{host}
agent-1.sinks.sink_hdfs.hdfs.inUsePrefix = .
# Roll settings: interval 30, size 0, count 0. Presumably time-based rolling
# only, with 0 disabling the size and count triggers (confirm in the HDFS sink docs).
agent-1.sinks.sink_hdfs.hdfs.rollInterval = 30
agent-1.sinks.sink_hdfs.hdfs.rollSize = 0
agent-1.sinks.sink_hdfs.hdfs.rollCount = 0
agent-1.sinks.sink_hdfs.hdfs.batchSize = 100
agent-1.sinks.sink_hdfs.hdfs.writeFormat = text
agent-1.sinks.sink_hdfs.hdfs.fileType = DataStream
# Disabled alternative: compressed output with the lzop codec.
#agent-1.sinks.sink_hdfs.hdfs.fileType = CompressedStream
#agent-1.sinks.sink_hdfs.hdfs.codeC = lzop

# Component declarations for agent-1.
agent-1.channels = ch-1
agent-1.sources = src-1
agent-1.sinks = sink_hdfs
如文件名为data.log.20151111,则数据写入分区20151111。
最后执行以下命令启动agent:
bin/flume-ng agent -c /usr/local/flume/conf -f /usr/local/flume/conf/fileheader.properties -n agent-1 -Dflume.root.logger=INFO,console
参考 http://blog.youkuaiyun.com/xiao_jun_0820/article/details/38333171