Scenario: nginx logs from several servers need to be collected in real time onto a single machine. The collected output has to be written into a directory per day and rolled into a new file every 5 minutes; for example, a log line from 12:26 on 2012-12-29 should end up in a file under /data/log/20121229/ whose name starts with log-1225-. To do this I implemented my own file sink, modeled on the hdfs-sink of flume-og and flume-ng.
The configuration I use looks like this:
agent.sources = source
agent.channels = channel
agent.sinks = sink

agent.sources.source.type = avro
agent.sources.source.bind = 192.168.0.100
agent.sources.source.port = 44444
agent.sources.source.channels = channel

agent.sinks.sink.type = org.apache.flume.sink.FileSink
agent.sinks.sink.file.path = /data/log/%{dayStr}
agent.sinks.sink.file.filePrefix = log-%{hourStr}%{minStr}-
agent.sinks.sink.file.txnEventMax = 10000
agent.sinks.sink.file.maxOpenFiles = 5
agent.sinks.sink.channel = channel

agent.channels.channel.type = memory
agent.channels.channel.capacity = 100000
agent.channels.channel.transactionCapacity = 100000
agent.channels.channel.keep-alive = 60
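Note that the sink does not compute %{dayStr}, %{hourStr} or %{minStr} itself: BucketPath.escapeString() fills them in from the event headers, so whatever sends events to the avro source has to stamp those headers on every event (in practice an interceptor on the sending agent, or the client code). As a minimal sketch, assuming the flume-ng-sdk Avro RpcClient and a made-up class name NginxLogClient, the headers behind the /data/log/20121229/log-1225- example could be produced like this:

package org.example.flume; // hypothetical package, not part of the sink

import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.api.RpcClient;
import org.apache.flume.api.RpcClientFactory;
import org.apache.flume.event.EventBuilder;

public class NginxLogClient {
    public static void main(String[] args) throws EventDeliveryException {
        // Connect to the avro source configured above.
        RpcClient client = RpcClientFactory.getDefaultInstance("192.168.0.100", 44444);
        try {
            Date now = new Date();
            Map<String, String> headers = new HashMap<String, String>();
            headers.put("dayStr", new SimpleDateFormat("yyyyMMdd").format(now)); // e.g. 20121229
            headers.put("hourStr", new SimpleDateFormat("HH").format(now));      // e.g. 12
            // Round the minute down to its 5-minute bucket: 26 -> 25.
            int minute = Integer.parseInt(new SimpleDateFormat("mm").format(now));
            headers.put("minStr", String.format("%02d", minute / 5 * 5));
            Event event = EventBuilder.withBody(
                    "one nginx log line".getBytes(Charset.forName("UTF-8")), headers);
            client.append(event);
        } finally {
            client.close();
        }
    }
}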
The jars it depends on are:
jakarta-oro-2.0.1.jar
flume-ng-core-1.3.0-SNAPSHOT.jar
flume-ng-sdk-1.3.0-SNAPSHOT.jar
flume-ng-configuration-1.3.0-SNAPSHOT.jar
slf4j-log4j12-1.6.1.jar
slf4j-api-1.6.1.jar
guava-10.0.1.jar
The code is as follows:
FileSink.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.formatter.output.BucketPath;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

public class FileSink extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(FileSink.class);

    private static final String defaultFileName = "FlumeData";
    private static final int defaultMaxOpenFiles = 50;

    private String path;
    private long txnEventMax;
    private FileWriterLinkedHashMap sfWriters;

    private String serializerType;
    private Context serializerContext;

    private boolean needRounding = false;
    private int roundUnit = Calendar.SECOND;
    private int roundValue = 1;

    private SinkCounter sinkCounter;

    private int maxOpenFiles;
    private ScheduledExecutorService timedRollerPool;
    private long rollInterval;

    @Override
    public void configure(Context context) {
        String directory = Preconditions.checkNotNull(
                context.getString("file.path"), "file.path is required");
        String fileName = context.getString("file.filePrefix", defaultFileName);
        this.path = directory + "/" + fileName;

        maxOpenFiles = context.getInteger("file.maxOpenFiles", defaultMaxOpenFiles);

        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(
                context.getSubProperties(EventSerializer.CTX_PREFIX));

        txnEventMax = context.getLong("file.txnEventMax", 1l);
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }

        rollInterval = context.getLong("file.rollInterval", 30l);
        String rollerName = "hdfs-" + getName() + "-roll-timer-%d";
        timedRollerPool = Executors.newScheduledThreadPool(maxOpenFiles,
                new ThreadFactoryBuilder().setNameFormat(rollerName).build());
    }

    @Override
    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        List<BucketFileWriter> writers = Lists.newArrayList();
        transaction.begin();
        try {
            Event event = null;
            int txnEventCount = 0;
            for (txnEventCount = 0; txnEventCount < txnEventMax; txnEventCount++) {
                event = channel.take();
                if (event == null) {
                    break;
                }

                // reconstruct the path name by substituting place holders
                String realPath = BucketPath.escapeString(path, event.getHeaders(),
                        needRounding, roundUnit, roundValue);
                BucketFileWriter bucketFileWriter = sfWriters.get(realPath);

                // we haven't seen this file yet, so open it and cache the handle
                if (bucketFileWriter == null) {
                    bucketFileWriter = new BucketFileWriter();
                    bucketFileWriter.open(realPath, serializerType,
                            serializerContext, rollInterval, timedRollerPool,
                            sfWriters);
                    sfWriters.put(realPath, bucketFileWriter);
                }

                // track the buckets getting written in this transaction
                if (!writers.contains(bucketFileWriter)) {
                    writers.add(bucketFileWriter);
                }

                // Write the data to File
                bucketFileWriter.append(event);
            }

            if (txnEventCount == 0) {
                sinkCounter.incrementBatchEmptyCount();
            } else if (txnEventCount == txnEventMax) {
                sinkCounter.incrementBatchCompleteCount();
            } else {
                sinkCounter.incrementBatchUnderflowCount();
            }

            // flush all pending buckets before committing the transaction
            for (BucketFileWriter bucketFileWriter : writers) {
                if (!bucketFileWriter.isBatchComplete()) {
                    flush(bucketFileWriter);
                }
            }
            transaction.commit();
            if (txnEventCount > 0) {
                sinkCounter.addToEventDrainSuccessCount(txnEventCount);
            }
            if (event == null) {
                return Status.BACKOFF;
            }
            return Status.READY;
        } catch (IOException eIO) {
            transaction.rollback();
            logger.warn("File IO error", eIO);
            return Status.BACKOFF;
        } catch (Throwable th) {
            transaction.rollback();
            logger.error("process failed", th);
            if (th instanceof Error) {
                throw (Error) th;
            } else {
                throw new EventDeliveryException(th);
            }
        } finally {
            transaction.close();
        }
    }

    private void flush(BucketFileWriter bucketFileWriter) throws IOException {
        bucketFileWriter.flush();
    }

    @Override
    public synchronized void start() {
        super.start();
        this.sfWriters = new FileWriterLinkedHashMap(maxOpenFiles);
        sinkCounter.start();
    }
}
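One thing the sink does not do is clean up on shutdown: start() is overridden but stop() is not, so the roller pool keeps running and any open .tmp files are left unrenamed when the agent stops. The following stop() override is only a sketch of my own (it assumes the fields above plus an import of java.util.Map.Entry), not part of the original sink:

    @Override
    public synchronized void stop() {
        // Close every cached writer so buffered data is flushed and .tmp files are renamed.
        if (sfWriters != null) {
            for (Entry<String, BucketFileWriter> entry : sfWriters.entrySet()) {
                try {
                    entry.getValue().close();
                } catch (IOException e) {
                    logger.warn("Exception while closing " + entry.getKey(), e);
                } catch (InterruptedException e) {
                    logger.warn("Interrupted while closing " + entry.getKey(), e);
                    Thread.currentThread().interrupt();
                }
            }
            sfWriters.clear();
        }
        // Stop the scheduled roll tasks and the counter, then the sink itself.
        timedRollerPool.shutdown();
        sinkCounter.stop();
        super.stop();
    }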
BucketFileWriter.java
package org.apache.flume.sink;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class BucketFileWriter {

    private static final Logger logger = LoggerFactory
            .getLogger(BucketFileWriter.class);

    private static final String IN_USE_EXT = ".tmp";

    /** Millisecond timestamp used as a unique file-name suffix for this writer. */
    private final AtomicLong fileExtensionCounter;

    private OutputStream outputStream;
    private EventSerializer serializer;
    private String filePath;

    public BucketFileWriter() {
        fileExtensionCounter = new AtomicLong(System.currentTimeMillis());
    }

    public void open(final String filePath, String serializerType,
            Context serializerContext, final long rollInterval,
            final ScheduledExecutorService timedRollerPool,
            final FileWriterLinkedHashMap sfWriters) throws IOException {
        this.filePath = filePath;
        File file = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        file.getParentFile().mkdirs();
        outputStream = new BufferedOutputStream(new FileOutputStream(file));
        logger.info("filename = " + file.getAbsolutePath());
        serializer = EventSerializerFactory.getInstance(serializerType,
                serializerContext, outputStream);
        serializer.afterCreate();
        if (rollInterval > 0) {
            Callable<Void> action = new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    logger.debug(
                            "Rolling file ({}): Roll scheduled after {} sec elapsed.",
                            filePath + fileExtensionCounter + IN_USE_EXT,
                            rollInterval);
                    if (sfWriters.containsKey(filePath)) {
                        sfWriters.remove(filePath);
                    }
                    close();
                    return null;
                }
            };
            timedRollerPool.schedule(action, rollInterval, TimeUnit.SECONDS);
        }
    }

    public void append(Event event) throws IOException {
        serializer.write(event);
    }

    public boolean isBatchComplete() {
        return true;
    }

    public void flush() throws IOException {
        serializer.flush();
        outputStream.flush();
    }

    /**
     * Rename bucketPath file from .tmp to permanent location.
     */
    private void renameBucket() {
        File srcPath = new File(filePath + fileExtensionCounter + IN_USE_EXT);
        File dstPath = new File(filePath + fileExtensionCounter);
        if (srcPath.exists()) {
            srcPath.renameTo(dstPath);
            logger.info("Renaming " + srcPath + " to " + dstPath);
        }
    }

    /**
     * Close the file handle and rename the temp file to the permanent
     * filename. Safe to call multiple times.
     */
    public synchronized void close() throws IOException, InterruptedException {
        if (outputStream != null) {
            outputStream.flush();
            outputStream.close();
        }
        renameBucket();
    }
}
FileWriterLinkedHashMap.java
package org.apache.flume.sink;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileWriterLinkedHashMap extends
        LinkedHashMap<String, BucketFileWriter> {

    private static final Logger logger = LoggerFactory
            .getLogger(FileWriterLinkedHashMap.class);

    private static final long serialVersionUID = -7860596835613215998L;

    private final int maxOpenFiles;

    public FileWriterLinkedHashMap(int maxOpenFiles) {
        super(16, 0.75f, true); // stock initial capacity/load factor, access-ordered
        this.maxOpenFiles = maxOpenFiles;
    }

    @Override
    protected boolean removeEldestEntry(Entry<String, BucketFileWriter> eldest) {
        if (size() > maxOpenFiles) {
            // If we have more than maxOpenFiles open, close the eldest writer
            // and let LinkedHashMap evict it.
            try {
                eldest.getValue().close();
            } catch (IOException e) {
                logger.warn(eldest.getKey().toString(), e);
            } catch (InterruptedException e) {
                logger.warn(eldest.getKey().toString(), e);
                Thread.currentThread().interrupt();
            }
            return true;
        } else {
            return false;
        }
    }
}
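FileWriterLinkedHashMap is simply an access-ordered LinkedHashMap whose removeEldestEntry hook closes the least-recently-used writer once more than maxOpenFiles buckets are open. A standalone illustration of that LinkedHashMap behaviour (plain integers instead of writers; the class name LruDemo is mine):

import java.util.LinkedHashMap;
import java.util.Map;

public class LruDemo {
    public static void main(String[] args) {
        final int maxOpen = 2; // plays the role of maxOpenFiles
        Map<String, Integer> cache = new LinkedHashMap<String, Integer>(16, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<String, Integer> eldest) {
                // Evict (in the sink: close) the least recently accessed entry.
                return size() > maxOpen;
            }
        };
        cache.put("/data/log/20121229/log-1220-", 1);
        cache.put("/data/log/20121229/log-1225-", 2);
        cache.get("/data/log/20121229/log-1220-");    // touch: log-1225- is now eldest
        cache.put("/data/log/20121229/log-1230-", 3); // evicts log-1225-
        System.out.println(cache.keySet());           // the log-1220- and log-1230- keys remain
    }
}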