Continuing from the previous post: the file sink needs to write files under a layout like /data/log/%{dayStr}/log-%{hourStr}%{minStr}-. For the file sink to resolve tags such as %{dayStr}, the corresponding key-value pairs have to be added to each event's headers while the data is in transit. Flume-NG provides a convenient mechanism for exactly this: the Interceptor.
The interceptor implemented below works as follows: it first matches each nginx log line against a regular expression; if the match succeeds, it takes the captured data, processes the parameters in the URL, and stores all of the log fields in a Map. It then looks up the values for the keys requested in the configuration file and writes them out, in order, as one CSV-formatted line.
Raw log line:
112.245.239.72 - - [29/Dec/2012:15:00:00 +0800] "GET /p.gif?a=1&b=2 HTTP/1.1" 200 0 "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; AskTbPTV2/5.9.1.14019; 4399Box.1357)"
Resulting output:
1,2
The Flume configuration:
agent.sources = source
agent.channels = channel
agent.sinks = sink

agent.sources.source.type = exec
#agent.sources.source.command = tail -n +0 -F /data/tmp/accesspvpb_2012-11-18.log
agent.sources.source.command = cat /opt/nginx/logs/vvaccess_log_pipe
agent.sources.source.interceptors = logformat
agent.sources.source.interceptors.logformat.type = org.apache.flume.interceptor.LogFormatInterceptor$Builder
agent.sources.source.interceptors.logformat.confpath = /usr/programs/flume/conf/logformat_vv.properties
agent.sources.source.interceptors.logformat.dynamicprop = true
agent.sources.source.interceptors.logformat.hostname = vv111
agent.sources.source.interceptors.logformat.prop.monitor.rollInterval = 100000

# The channel can be defined as follows.
agent.sources.source.channels = channel

agent.sinks.sink.type = avro
agent.sinks.sink.hostname = 192.168.0.100
agent.sinks.sink.port = 44444
agent.sinks.sink.channel = channel

# Each channel's type is defined.
agent.channels.channel.type = file
agent.channels.channel.checkpointDir = /data/tmpc/checkpoint
agent.channels.channel.dataDirs = /data/tmpc/data
agent.channels.channel.transactionCapacity = 15000
The contents of /usr/programs/flume/conf/logformat_vv.properties (note the doubled backslashes: java.util.Properties treats a single backslash as an escape character, so \\s in the file is loaded as \s):
keys=a,b
regexp=([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})\\s-\\s-\\s\\[([^]]+)\\]\\s\"GET\\s/p.gif\\?(.+)\\s.*\"\\s[0-9]+\\s[0-9]+\\s\"(.+)\"
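To see how the regexp's capture groups map onto the output keys, here is a small standalone sketch (the class name LogFormatDemo and the shortened User-Agent string are illustrative only; it assumes the Jakarta ORO jar is on the classpath, which the interceptor below needs anyway). It applies the same pattern to the sample log line and prints the CSV row for keys=a,b:

import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;

public class LogFormatDemo {
    public static void main(String[] args) throws Exception {
        // The loaded regexp (single backslashes) written as a Java string literal
        String regexp = "([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3})\\s-\\s-\\s"
                + "\\[([^]]+)\\]\\s\"GET\\s/p.gif\\?(.+)\\s.*\"\\s[0-9]+\\s[0-9]+\\s\"(.+)\"";
        String body = "112.245.239.72 - - [29/Dec/2012:15:00:00 +0800] "
                + "\"GET /p.gif?a=1&b=2 HTTP/1.1\" 200 0 \"Mozilla/4.0 (compatible; MSIE 8.0)\"";

        Pattern pattern = new Perl5Compiler().compile(regexp);
        Perl5Matcher matcher = new Perl5Matcher();
        if (matcher.contains(body, pattern)) {
            MatchResult result = matcher.getMatch();
            // group(1)=ip, group(2)=timestamp, group(3)=query string, group(4)=user agent
            String query = result.group(3);                       // "a=1&b=2"
            StringBuilder csv = new StringBuilder();
            for (String key : new String[] { "a", "b" }) {        // the keys from the properties file
                for (String param : query.split("&")) {
                    String[] p = param.split("=");
                    if (p.length == 2 && p[0].equals(key)) {
                        csv.append(p[1]).append(",");
                    }
                }
            }
            if (csv.length() > 0) {
                csv.deleteCharAt(csv.length() - 1);
            }
            System.out.println(csv);                              // prints: 1,2
        }
    }
}

In the real interceptor the query parameters end up in an index map alongside host, ip, loc_time and browser, and keys with no value are emitted as ~.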
The interceptor code:
package org.apache.flume.interceptor;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.CONF_PATH;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.DYNAMICPROP;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.DYNAMICPROP_DFLT;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.HOSTNAME;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.HOSTNAME_DFLT;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.PROPMONITORINTERVAL;
import static org.apache.flume.interceptor.LogFormatInterceptor.Constants.PROPMONITORINTERVAL_DFLT;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.oro.text.regex.MalformedPatternException;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
import org.apache.oro.text.regex.PatternMatcher;
import org.apache.oro.text.regex.Perl5Compiler;
import org.apache.oro.text.regex.Perl5Matcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class LogFormatInterceptor implements Interceptor {

    private static final Logger logger = LoggerFactory.getLogger(LogFormatInterceptor.class);

    private String conf_path = null;
    private boolean dynamicProp = false;
    private String hostname = null;
    private long propLastModify = 0;
    private long propMonitorInterval;
    private String regexp = null;
    private List<String> keys = null;
    private Pattern pattern = null;
    private PatternCompiler compiler = null;
    private PatternMatcher matcher = null;
    private SimpleDateFormat sdf = null;
    private SimpleDateFormat sd = null;
    private SimpleDateFormat sh = null;
    private SimpleDateFormat sm = null;
    private SimpleDateFormat sdfAll = null;
    private long eventCount = 0l;

    public LogFormatInterceptor(String conf_path, boolean dynamicProp,
            String hostname, long propMonitorInterval) {
        this.conf_path = conf_path;
        this.dynamicProp = dynamicProp;
        this.hostname = hostname;
        this.propMonitorInterval = propMonitorInterval;
    }
    @Override
    public void close() {
    }

    @Override
    public void initialize() {
        try {
            // Read the conf file and initialise the regular expression and the list of output keys
            File file = new File(conf_path);
            propLastModify = file.lastModified();
            Properties props = new Properties();
            FileInputStream fis = new FileInputStream(file);
            props.load(fis);
            regexp = props.getProperty("regexp");
            String strKey = props.getProperty("keys");
            if (strKey != null) {
                String[] strkeys = strKey.split(",");
                keys = new LinkedList<String>();
                for (String key : strkeys) {
                    keys.add(key);
                }
            }
            if (keys == null) {
                logger.error("====================keys is null====================");
            } else {
                logger.info("keys=" + keys);
            }
            if (regexp == null) {
                logger.error("====================regexp is null====================");
            } else {
                logger.info("regexp=" + regexp);
            }
            // Initialise the pattern matcher and the date formatters
            compiler = new Perl5Compiler();
            pattern = compiler.compile(regexp);
            matcher = new Perl5Matcher();
            sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", java.util.Locale.US);
            sd = new SimpleDateFormat("yyyyMMdd");
            sh = new SimpleDateFormat("HH");
            sm = new SimpleDateFormat("mm");
            sdfAll = new SimpleDateFormat("yyyyMMddHHmmss");
        } catch (MalformedPatternException e) {
            logger.error("Could not compile pattern!", e);
        } catch (FileNotFoundException e) {
            logger.error("conf file is not found!", e);
        } catch (IOException e) {
            logger.error("conf file can not be read!", e);
        }
    }
    @Override
    public Event intercept(Event event) {
        ++eventCount;
        try {
            // When dynamicprop is enabled, periodically re-read the conf file and pick up a longer key list
            if (dynamicProp && eventCount > propMonitorInterval) {
                File file = new File(conf_path);
                if (file.lastModified() > propLastModify) {
                    propLastModify = file.lastModified();
                    Properties props = new Properties();
                    FileInputStream fis = new FileInputStream(file);
                    props.load(fis);
                    String strKey = props.getProperty("keys");
                    if (strKey != null) {
                        String[] strkeys = strKey.split(",");
                        List<String> keystmp = new LinkedList<String>();
                        for (String key : strkeys) {
                            keystmp.add(key);
                        }
                        if (keystmp.size() > keys.size()) {
                            keys = keystmp;
                            logger.info("dynamicProp status updated = " + keys);
                        } else {
                            logger.error("dynamicProp status new keys size less than old, so status update fail = " + keys);
                        }
                    } else {
                        logger.error("dynamicProp status get keys fail, so status update fail = " + keys);
                    }
                }
            }
            Map<String, String> headers = event.getHeaders();
            headers.put("host", hostname);
            String body = new String(event.getBody());
            if (pattern != null) {
                StringBuffer stringBuffer = new StringBuffer();
                Date date = null;
                Map<String, String> index = new HashMap<String, String>();
                if (matcher.contains(body, pattern)) {
                    // Collect every field into the index map: host, ip, loc_time, url parameters, browser
                    index.put("host", hostname);
                    MatchResult result = matcher.getMatch();
                    index.put("ip", result.group(1));
                    try {
                        date = sdf.parse(result.group(2));
                        index.put("loc_time", sdfAll.format(date));
                    } catch (ParseException e1) {
                    }
                    String url = result.group(3).replaceAll(",", "|");
                    String[] params = url.split("&");
                    for (String param : params) {
                        String[] p = param.split("=");
                        if (p.length == 2) {
                            index.put(p[0], p[1]);
                        }
                    }
                    index.put("browser", result.group(4).replaceAll(",", "|"));
                    // Emit the configured keys, in order, as one csv line; missing keys become "~"
                    for (String key : keys) {
                        if (index.containsKey(key)) {
                            stringBuffer.append(index.get(key) + ",");
                        } else {
                            stringBuffer.append("~,");
                        }
                    }
                    if (stringBuffer.length() > 0) {
                        stringBuffer.deleteCharAt(stringBuffer.length() - 1);
                    } else {
                        stringBuffer.append("error=" + body);
                    }
                    // Headers consumed by the file sink's path template: dayStr, hourStr and the 5-minute bucket minStr
                    if (date != null) {
                        headers.put("dayStr", sd.format(date));
                        headers.put("hourStr", sh.format(date));
                        Integer m = Integer.parseInt(sm.format(date));
                        String min = "";
                        if (m >= 0 && m < 10) {
                            min = "0" + (m / 5) * 5;
                        } else {
                            min = (m / 5) * 5 + "";
                        }
                        headers.put("minStr", min);
                    } else {
                        headers.put("dayStr", "errorLog");
                    }
                    Event e = EventBuilder.withBody(stringBuffer.toString().getBytes(), headers);
                    return e;
                }
            }
        } catch (Exception e) {
            logger.error("LogFormat error!", e);
        }
        return null;
    }
    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> list = new LinkedList<Event>();
        for (Event event : events) {
            Event e = intercept(event);
            if (e != null) {
                list.add(e);
            }
        }
        return list;
    }
    /**
     * Builder which builds new instances of the LogFormatInterceptor.
     */
    public static class Builder implements Interceptor.Builder {

        private String confPath;
        private boolean dynamicProp;
        private String hostname;
        private long propMonitorInterval;

        @Override
        public Interceptor build() {
            return new LogFormatInterceptor(confPath, dynamicProp, hostname,
                    propMonitorInterval);
        }

        @Override
        public void configure(Context context) {
            confPath = context.getString(CONF_PATH);
            dynamicProp = context.getBoolean(DYNAMICPROP, DYNAMICPROP_DFLT);
            hostname = context.getString(HOSTNAME, HOSTNAME_DFLT);
            propMonitorInterval = context.getLong(PROPMONITORINTERVAL,
                    PROPMONITORINTERVAL_DFLT);
        }
    }
    public static class Constants {
        public static String CONF_PATH = "confpath";
        public static String DYNAMICPROP = "dynamicprop";
        public static boolean DYNAMICPROP_DFLT = false;
        public static String HOSTNAME = "hostname";
        public static String HOSTNAME_DFLT = "hostname";
        public static String PROPMONITORINTERVAL = "prop.monitor.rollInterval";
        public static long PROPMONITORINTERVAL_DFLT = 500000l;
    }
}
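One detail worth calling out: the minStr header written by intercept() is not the raw minute but the minute rounded down to a five-minute bucket and left-padded to two digits, so a sink path template like /data/log/%{dayStr}/log-%{hourStr}%{minStr}- rolls over to a new file every five minutes. A minimal sketch of that arithmetic (the class name and sample values are illustrative only):

public class MinuteBucketDemo {
    public static void main(String[] args) {
        // A timestamp of 29/Dec/2012:15:07:41 gives dayStr=20121229, hourStr=15 and minStr as below
        int m = 7;                                   // minute parsed from the event timestamp
        String min = (m >= 0 && m < 10)
                ? "0" + (m / 5) * 5                  // single-digit minutes: 7 -> "05"
                : String.valueOf((m / 5) * 5);       // e.g. 23 -> "20", 57 -> "55"
        System.out.println(min);                     // prints: 05
    }
}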
With this in place, the whole pipeline is complete: nginx logs are collected, cleaned and formatted, shipped to the collector machine, and written out under the formatted directory and file names.