Scenario: nginx logs from several servers need to be collected in real time onto a single machine. The collected output has to be written into a directory per day and rolled into a new file every 5 minutes; for example, a log line from 12:26 on 2012-12-29 should end up in a file under /data/log/20121229/ whose name starts with log-1225-. To do this I implemented my own file sink, modeled on the hdfs-sink of flume-og and flume-ng.
The configuration I use looks like this:
agent.sources = source
agent.channels = channel
agent.sinks = sink

agent.sources.source.type = avro
agent.sources.source.bind = 192.168.0.100
agent.sources.source.port = 44444
agent.sources.source.channels = channel

agent.sinks.sink.type = org.apache.flume.sink.FileSink
agent.sinks.sink.file.path = /data/log/%{dayStr}
agent.sinks.sink.file.filePrefix = log-%{hourStr}%{minStr}-
agent.sinks.sink.file.txnEventMax = 10000
agent.sinks.sink.file.maxOpenFiles = 5
agent.sinks.sink.channel = channel

agent.channels.channel.type = memory
agent.channels.channel.capacity = 100000
agent.channels.channel.transactionCapacity = 100000
agent.channels.channel.keep-alive = 60
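Note that the sink does not compute %{dayStr}, %{hourStr} or %{minStr} itself: BucketPath.escapeString() fills them in from the event headers, so whatever sends events to the avro source has to stamp those headers on every event (in practice an interceptor on the sending agent, or the client code). As a minimal sketch, assuming the flume-ng-sdk Avro RpcClient and a made-up class name NginxLogClient, the headers behind the /data/log/20121229/log-1225- example could be produced like this:

package org.example.flume; // hypothetical package, not part of the sink

import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.api.RpcClient;
import org.apache.flume.api.RpcClientFactory;
import org.apache.flume.event.EventBuilder;

public class NginxLogClient {
    public static void main(String[] args) throws EventDeliveryException {
        // Connect to the avro source configured above.
        RpcClient client = RpcClientFactory.getDefaultInstance("192.168.0.100", 44444);
        try {
            Date now = new Date();
            Map<String, String> headers = new HashMap<String, String>();
            headers.put("dayStr", new SimpleDateFormat("yyyyMMdd").format(now)); // e.g. 20121229
            headers.put("hourStr", new SimpleDateFormat("HH").format(now));      // e.g. 12
            // Round the minute down to its 5-minute bucket: 26 -> 25.
            int minute = Integer.parseInt(new SimpleDateFormat("mm").format(now));
            headers.put("minStr", String.format("%02d", minute / 5 * 5));
            Event event = EventBuilder.withBody(
                    "one nginx log line".getBytes(Charset.forName("UTF-8")), headers);
            client.append(event);
        } finally {
            client.close();
        }
    }
}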
The jars it depends on are:
jakarta-oro-2.0.1.jar
flume-ng-core-1.3.0-SNAPSHOT.jar
flume-ng-sdk-1.3.0-SNAPSHOT.jar
flume-ng-configuration-1.3.0-SNAPSHOT.jar
slf4j-log4j12-1.6.1.jar
slf4j-api-1.6.1.jar
guava-10.0.1.jar
The code is as follows:
FileSink.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

public class FileSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(FileSink.class);

    private static final String defaultFileName = "FlumeData";
    private static final int defaultMaxOpenFiles = 50;

    private String path;
    private long txnEventMax;
    private FileWriterLinkedHashMap sfWriters;

    private String serializerType;
    private Context serializerContext;

    private boolean needRounding = false;
    private int roundUnit = Calendar.SECOND;
    private int roundValue = 1;

    private SinkCounter sinkCounter;

    private int maxOpenFiles;
    private ScheduledExecutorService timedRollerPool;
    private long rollInterval;

    @Override
    public void configure(Context context) {
        String directory = Preconditions.checkNotNull(
                context.getString("file.path"), "file.path is required");
        String fileName = context.getString("file.filePrefix", defaultFileName);
        this.path = directory + "/" + fileName;

        maxOpenFiles = context.getInteger("file.maxOpenFiles", defaultMaxOpenFiles);

        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(
                context.getSubProperties(EventSerializer.CTX_PREFIX));

        txnEventMax = context.getLong("file.txnEventMax", 1l);
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }

        rollInterval = context.getLong("file.rollInterval", 30l);
        String rollerName = "hdfs-" + getName() + "-roll-timer-%d";
        timedRollerPool = Executors.newScheduledThreadPool(maxOpenFiles,
                new ThreadFactoryBuilder().setNameFormat(rollerName).build());
    }

    @Override
    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        List<BucketFileWriter> writers = Lists.newArrayList();
        transaction.begin();
        try {
            Event event = null;
            int txnEventCount = 0;
            for (txnEventCount = 0; txnEventCount < txnEventMax; txnEventCount++) {
                event = channel.take();
                if (event == null) {
                    break;
                }

                // reconstruct the path name by substituting place holders
                String realPath = BucketPath.escapeString(path, event.getHeaders(),
                        needRounding, roundUnit, roundValue);
                BucketFileWriter bucketFileWriter = sfWriters.get(realPath);

                // we haven't seen this file yet, so open it and cache the handle
                if (bucketFileWriter == null) {
                    bucketFileWriter = new BucketFileWriter();
                    bucketFileWriter.open(realPath, serializerType,
                            serializerContext, rollInterval, timedRollerPool,
                            sfWriters);
                    sfWriters.put(realPath, bucketFileWriter);
                }

                // track the buckets getting written in this transaction
                if (!writers.contains(bucketFileWriter)) {
                    writers.add(bucketFileWriter);
                }

                // Write the data to File
                bucketFileWriter.append(event);
            }

            if (txnEventCount == 0) {
                sinkCounter.incrementBatchEmptyCount();
            } else if (txnEventCount == txnEventMax) {
                sinkCounter.incrementBatchCompleteCount();
            } else {
                sinkCounter.incrementBatchUnderflowCount();
            }

            // flush all pending buckets before committing the transaction
            for (BucketFileWriter bucketFileWriter : writers) {
                if (!bucketFileWriter.isBatchComplete()) {
                    flush(bucketFileWriter);
                }
            }
            transaction.commit();
            if (txnEventCount > 0) {
                sinkCounter.addToEventDrainSuccessCount(txnEventCount);
            }
            if (event == null) {
                return Status.BACKOFF;
            }
            return Status.READY;
        } catch (IOException eIO) {
            transaction.rollback();
            logger.warn("File IO error", eIO);
            return Status.BACKOFF;
        } catch (Throwable th) {
            transaction.rollback();
            logger.error("process failed", th);
            if (th instanceof Error) {
                throw (Error) th;
            } else {
                throw new EventDeliveryException(th);
            }
        } finally {
            transaction.close();
        }
    }

    private void flush(BucketFileWriter bucketFileWriter) throws IOException {
        bucketFileWriter.flush();
    }

    @Override
    public synchronized void start() {
        super.start();
        this.sfWriters = new FileWriterLinkedHashMap(maxOpenFiles);
        sinkCounter.start();
    }
}
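One thing the sink does not do is clean up on shutdown: start() is overridden but stop() is not, so the roller pool keeps running and any open .tmp files are left unrenamed when the agent stops. The following stop() override is only a sketch of my own (it assumes the fields above plus an import of java.util.Map.Entry), not part of the original sink:

    @Override
    public synchronized void stop() {
        // Close every cached writer so buffered data is flushed and .tmp files are renamed.
        if (sfWriters != null) {
            for (Entry<String, BucketFileWriter> entry : sfWriters.entrySet()) {
                try {
                    entry.getValue().close();
                } catch (IOException e) {
                    logger.warn("Exception while closing " + entry.getKey(), e);
                } catch (InterruptedException e) {
                    logger.warn("Interrupted while closing " + entry.getKey(), e);
                    Thread.currentThread().interrupt();
                }
            }
            sfWriters.clear();
        }
        // Stop the scheduled roll tasks and the counter, then the sink itself.
        timedRollerPool.shutdown();
        sinkCounter.stop();
        super.stop();
    }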
BucketFileWriter.java
package org.apache.flume.sink;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BucketFileWriter {

    private static final Logger logger = LoggerFactory
            .getLogger(BucketFileWriter.class);

    private static final String IN_USE_EXT = ".tmp";

    /** Millisecond timestamp used as a unique file-name suffix for this writer. */
    private final AtomicLong fileExtensionCounter;

    private OutputStream outputStream;
    private EventSerializer serializer;
    private String filePath;

    public BucketFileWriter() {
        fileExtensionCounter = new AtomicLong(System.currentTimeMillis());
    }

    public void open(final String filePath, String serializerType,
            Context serializerContext, final long rollInterval,
            final ScheduledExecutorService timedRollerPool,
            final FileWriterLinkedHashMap sfWriters) throws IOException {
        this.filePath = filePath;
        File file = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        file.getParentFile().mkdirs();
        outputStream = new BufferedOutputStream(new FileOutputStream(file));
        logger.info("filename = " + file.getAbsolutePath());
        serializer = EventSerializerFactory.getInstance(serializerType,
                serializerContext, outputStream);
        serializer.afterCreate();
        if (rollInterval > 0) {
            Callable<Void> action = new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    logger.debug(
                            "Rolling file ({}): Roll scheduled after {} sec elapsed.",
                            filePath + fileExtensionCounter + IN_USE_EXT,
                            rollInterval);
                    if (sfWriters.containsKey(filePath)) {
                        sfWriters.remove(filePath);
                    }
                    close();
                    return null;
                }
            };
            timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
        }
    }

    public void append(Event event) throws IOException {
        serializer.write(event);
    }

    public boolean isBatchComplete() {
        return true;
    }

    public void flush() throws IOException {
        serializer.flush();
        outputStream.flush();
    }

    /**
     * Rename bucketPath file from .tmp to permanent location.
     */
    private void renameBucket() {
        File srcPath = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        File dstPath = new File(filePath + fileExtensionCounter);
        if (srcPath.exists()) {
            srcPath.renameTo(dstPath);
            logger.info("Renaming " + srcPath + " to " + dstPath);
        }
    }

    /**
     * Close the file handle and rename the temp file to the permanent
     * filename. Safe to call multiple times.
     */
    public synchronized void close() throws IOException, InterruptedException {
        if (outputStream != null) {
            outputStream.flush();
            outputStream.close();
        }
        renameBucket();
    }
}
FileWriterLinkedHashMap.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileWriterLinkedHashMap extends
        LinkedHashMap<String, BucketFileWriter> {

    private static final Logger logger = LoggerFactory
            .getLogger(FileWriterLinkedHashMap.class);

    private static final long serialVersionUID = -7860596835613215998L;

    private final int maxOpenFiles;

    public FileWriterLinkedHashMap(int maxOpenFiles) {
        super(16, 0.75f, true); // stock initial capacity/load factor, access-ordered
        this.maxOpenFiles = maxOpenFiles;
    }

    @Override
    protected boolean removeEldestEntry(Entry<String, BucketFileWriter> eldest) {
        if (size() > maxOpenFiles) {
            // If we have more than maxOpenFiles open, close the eldest writer
            // and let LinkedHashMap evict it.
            try {
                eldest.getValue().close();
            } catch (IOException e) {
                logger.warn(eldest.getKey().toString(), e);
            } catch (InterruptedException e) {
                logger.warn(eldest.getKey().toString(), e);
                Thread.currentThread().interrupt();
            }
            return true;
        } else {
            return false;
        }
    }
}
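FileWriterLinkedHashMap is simply an access-ordered LinkedHashMap whose removeEldestEntry hook closes the least-recently-used writer once more than maxOpenFiles buckets are open. A standalone illustration of that LinkedHashMap behaviour (plain integers instead of writers; the class name LruDemo is mine):

import java.util.LinkedHashMap;
import java.util.Map;

public class LruDemo {
    public static void main(String[] args) {
        final int maxOpen = 2; // plays the role of maxOpenFiles
        Map<String, Integer> cache = new LinkedHashMap<String, Integer>(16, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, Integer> eldest) {
                // Evict (in the sink: close) the least recently accessed entry.
                return size() > maxOpen;
            }
        };
        cache.put("/data/log/20121229/log-1220-", 1);
        cache.put("/data/log/20121229/log-1225-", 2);
        cache.get("/data/log/20121229/log-1220-");    // touch: log-1225- is now eldest
        cache.put("/data/log/20121229/log-1230-", 3); // evicts log-1225-
        System.out.println(cache.keySet());           // the log-1220- and log-1230- keys remain
    }
}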