Evaluating Flink's Async I/O Feature

This post looks at the Async I/O feature introduced in Flink 1.2, which raises stream-processing throughput by making external I/O operations asynchronous. In a simulated-I/O benchmark over roughly one million words, Async I/O ran nearly 5x faster than traditional synchronous I/O.



Background: Starting with Flink 1.2, costly external I/O in stream processing can be optimized with the Async IO feature, which makes I/O calls asynchronous and yields a large efficiency gain.
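The core API is the AsyncFunction interface plus the AsyncDataStream helper. Below is a minimal sketch against the Flink 1.2 API used by the full test code later in this post (asyncClient and its callback are hypothetical placeholders for a real non-blocking client; imports are the same as in the full listing):

// Minimal Async IO sketch (Flink 1.2 style: AsyncFunction + AsyncCollector).
// "asyncClient" is a hypothetical non-blocking client; the test code below
// simulates it with a thread pool and Thread.sleep().
AsyncFunction<String, String> lookup = new AsyncFunction<String, String>() {
	@Override
	public void asyncInvoke(String key, AsyncCollector<String> collector) throws Exception {
		asyncClient.get(key, value ->
			collector.collect(Collections.singletonList(value)));
	}
};

// Attach the async operator: at most 20 requests in flight, 10 s timeout per request.
DataStream<String> enriched = AsyncDataStream.unorderedWait(
	inputStream, lookup, 10000, TimeUnit.MILLISECONDS, 20);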


Test environment:

Java HotSpot(TM) 64-Bit Server VM 1.8.0_112-b16 on Mac OS X 10.11.6

Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz

 

Test benchmark: Take a fairly large txt file of about one million words, map over it, simulating an external I/O call for each word (via sleep, averaging 1 ms per call), and then run a standard WORD COUNT stream job.
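A quick sanity check on the expected cost (assuming roughly 1,000,000 tokens and at least 1 ms per simulated call): a fully serial run would spend about 1,000,000 × 1 ms = 1,000 s ≈ 16.7 min just sleeping, so any measured time below that reflects parallelism, namely the environment's default operator parallelism in the blocking case, and the queue of in-flight requests plus the worker thread pool in the Async IO case.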


Test code:

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.examples.async;

import com.google.common.base.Stopwatch;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.async.AsyncFunction;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.apache.flink.streaming.api.functions.async.collector.AsyncCollector;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import static java.lang.Thread.sleep;

/**
 * Example illustrating how to use {@link AsyncFunction}.
 */
public class AsyncIOExample {

	private static final Logger LOG = LoggerFactory.getLogger(AsyncIOExample.class);

	private static final String EXACTLY_ONCE_MODE = "exactly_once";
	private static final String EVENT_TIME = "EventTime";
	private static final String INGESTION_TIME = "IngestionTime";
	private static final String ORDERED = "ordered";


	/**
	 * A sample {@link AsyncFunction} that uses a pool of worker threads
	 * to simulate multiple concurrent async operations.
	 * <p>
	 * In a real production use case, the thread pool would typically live inside
	 * the async client.
	 */
	private static class SampleAsyncFunction extends RichAsyncFunction<String, String> {
		private static final long serialVersionUID = 2098635244857937717L;

		private static ExecutorService executorService;
		private static Random random;

		private int counter;

		/**
		 * The result of multiplying sleepFactor with a random float is used to pause
		 * the working thread in the thread pool, simulating a time consuming async operation.
		 */
		private final long sleepFactor;

		/**
		 * The ratio to generate an exception to simulate an async error. For example, the error
		 * may be a TimeoutException while visiting HBase.
		 */
		private final float failRatio;

		private final long shutdownWaitTS;

		SampleAsyncFunction(long sleepFactor, float failRatio, long shutdownWaitTS) {
			this.sleepFactor = sleepFactor;
			this.failRatio = failRatio;
			this.shutdownWaitTS = shutdownWaitTS;
		}


		@Override
		public void open(Configuration parameters) throws Exception {
			super.open(parameters);

			synchronized (SampleAsyncFunction.class) {
				if (counter == 0) {
					executorService = Executors.newFixedThreadPool(30);

					random = new Random();
				}

				++counter;
			}
		}

		@Override
		public void close() throws Exception {
			super.close();

			synchronized (SampleAsyncFunction.class) {
				--counter;

				if (counter == 0) {
					executorService.shutdown();

					try {
						if (!executorService.awaitTermination(shutdownWaitTS, TimeUnit.MILLISECONDS)) {
							executorService.shutdownNow();
						}
					} catch (InterruptedException e) {
						executorService.shutdownNow();
					}
				}
			}
		}

		@Override
		public void asyncInvoke(final String input, final AsyncCollector<String> collector) throws Exception {
			executorService.submit(new Runnable() {
				@Override
				public void run() {
					// wait for a while to simulate the async operation
					//long sleep = (long) (random.nextFloat() * sleepFactor);
					try {
						sleep(1); 				// simulate IO operation
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
					collector.collect(
						Collections.singletonList("key-" + input));
				}
			});
		}
	}

	private static void printUsage() {
		System.out.println("To customize example, use: AsyncIOExample [--fsStatePath <path to fs state>] " +
				"[--checkpointMode <exactly_once or at_least_once>] " +
				"[--maxCount <max number of input from source, -1 for infinite input>] " +
				"[--sleepFactor <interval to sleep for each stream element>] [--failRatio <possibility to throw exception>] " +
				"[--waitMode <ordered or unordered>] [--waitOperatorParallelism <parallelism for async wait operator>] " +
				"[--eventType <EventTime or IngestionTime>] [--shutdownWaitTS <milli sec to wait for thread pool>]" +
				"[--timeout <Timeout for the asynchronous operations>]");
	}

	public static void main(String[] args) throws Exception {

		// obtain execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		// parse parameters
		final ParameterTool params = ParameterTool.fromArgs(args);

		final String statePath;
		final String cpMode;
		final int maxCount;
		final long sleepFactor;
		final float failRatio;
		final String mode;
		final int taskNum;
		final String timeType;
		final long shutdownWaitTS;
		final long timeout;

		try {
			// check the configuration for the job
			statePath = params.get("fsStatePath", null);
			cpMode = params.get("checkpointMode", "exactly_once");
			maxCount = params.getInt("maxCount", 100000);
			sleepFactor = params.getLong("sleepFactor", 100);
			failRatio = params.getFloat("failRatio", 0.001f);
			mode = params.get("waitMode", "ordered");
			taskNum = params.getInt("waitOperatorParallelism", 1);
			timeType = params.get("eventType", "EventTime");
			shutdownWaitTS = params.getLong("shutdownWaitTS", 20000);
			timeout = params.getLong("timeout", 10000L);
		} catch (Exception e) {
			printUsage();

			throw e;
		}

		StringBuilder configStringBuilder = new StringBuilder();

		final String lineSeparator = System.getProperty("line.separator");

		configStringBuilder
			.append("Job configuration").append(lineSeparator)
			.append("FS state path=").append(statePath).append(lineSeparator)
			.append("Checkpoint mode=").append(cpMode).append(lineSeparator)
			.append("Max count of input from source=").append(maxCount).append(lineSeparator)
			.append("Sleep factor=").append(sleepFactor).append(lineSeparator)
			.append("Fail ratio=").append(failRatio).append(lineSeparator)
			.append("Waiting mode=").append(mode).append(lineSeparator)
			.append("Parallelism for async wait operator=").append(taskNum).append(lineSeparator)
			.append("Event type=").append(timeType).append(lineSeparator)
			.append("Shutdown wait timestamp=").append(shutdownWaitTS);

		LOG.info(configStringBuilder.toString());

		if (statePath != null) {
			// setup state and checkpoint mode
			env.setStateBackend(new FsStateBackend(statePath));
		}

		// enable watermark or not
		if (EVENT_TIME.equals(timeType)) {
			env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		}
		else if (INGESTION_TIME.equals(timeType)) {
			env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
		}

		final Stopwatch stopwatch = Stopwatch.createStarted();

		// read the input text file and tokenize it into a stream of words
		// DataStream<String> inputStream = env.fromElements(WordCountData.WORDS).flatMap(new Tokenizer());
		DataStream<String> inputStream = env.readTextFile("big.txt").flatMap(new Tokenizer());

		DataStream<String> result;

		// Benchmark 1: traditional blocking I/O (uncomment to run instead of Benchmark 2)
		// result = inputStream.map(new LegacyRedisWriter());

		// Benchmark 2: Async I/O
		result = asyncIOApproach(sleepFactor, failRatio, mode, taskNum, shutdownWaitTS, timeout, inputStream);


		// add a reduce to get the sum of each keys.
		result.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
			private static final long serialVersionUID = -938116068682344455L;

			@Override
			public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
				out.collect(new Tuple2<>(value, 1));
			}
		}).keyBy(0).sum(1).print();

		// execute the program
		env.execute("Async IO Example");

		stopwatch.stop(); //optional

		System.out.println("Elapsed time ==> " + stopwatch);
	}

	private static DataStream<String> asyncIOApproach(long sleepFactor, float failRatio, String mode, int taskNum, long shutdownWaitTS, long timeout, DataStream<String> inputStream) {
		// create async function, which will *wait* for a while to simulate the process of async i/o
		AsyncFunction<String, String> function =
				new SampleAsyncFunction(sleepFactor, failRatio, shutdownWaitTS);

		// add async operator to streaming job
		DataStream<String> result;
		if (ORDERED.equals(mode)) {
			result = AsyncDataStream.orderedWait(
				inputStream,
				function,
				timeout,
				TimeUnit.MILLISECONDS,
				20).setParallelism(taskNum);
		}
		else {
			result = AsyncDataStream.unorderedWait(
				inputStream,
				function,
				timeout,
				TimeUnit.MILLISECONDS,
				20).setParallelism(taskNum);
		}
		return result;
	}

	private static class Tokenizer implements FlatMapFunction<String,String> {
		@Override
		public void flatMap(String value, Collector<String> out) throws Exception {
			String[] tokens = value.toLowerCase().split("\\W+");

			// emit the pairs
			for (String token : tokens) {
				if (token.length() > 0) {
					out.collect(token);
				}
			}
		}
	}

	private static class LegacyRedisWriter implements org.apache.flink.api.common.functions.MapFunction<String,String> {
		@Override
		public String map(String input) throws Exception {
			String key = "key-" + input;
			sleep(1); // simulate IO operation
			/*Jedis jedis = new Jedis();
			jedis.set(key, String.valueOf(System.currentTimeMillis() / 1000L));
			jedis.close();
			*/
			return key;
		}
	}
}


Text data download: http://norvig.com/big.txt
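To reproduce, put big.txt in the working directory (the job reads it via env.readTextFile("big.txt")) and run the class directly from the IDE, or submit it to a local Flink cluster. A sketch of the latter (the jar path is illustrative; the arguments are the ones listed in printUsage):

bin/flink run -c org.apache.flink.streaming.examples.async.AsyncIOExample \
	path/to/flink-examples-streaming.jar \
	--waitMode unordered --waitOperatorParallelism 1 --timeout 10000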

 

Test results:

1) Traditional blocking synchronous I/O inside the map operator, total execution time: Elapsed time ==> 6.317 min

2) With the Async IO feature enabled (non-blocking asynchronous I/O), total execution time: Elapsed time ==> 1.448 min


Conclusions:

1) With traditional synchronous I/O in stream processing, every call blocks its subtask; reaching comparable throughput requires much higher parallelism and therefore more task overhead (see the sketch after these conclusions);

2) By making the I/O asynchronous, Async IO achieved close to a 5x performance gain in our one-million-word simulated-I/O test.
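As a rough illustration of point 1: to match the async operator's 20 in-flight requests with the blocking approach, you would have to raise the map operator's parallelism instead. A sketch reusing the LegacyRedisWriter from the listing above (the value 20 mirrors the capacity passed to AsyncDataStream; each extra subtask occupies a task slot only to block in sleep):

// Hypothetical brute-force alternative to Async IO: 20 blocked subtasks
// instead of one async subtask with a capacity of 20 in-flight requests.
DataStream<String> result = inputStream
	.map(new LegacyRedisWriter())
	.setParallelism(20);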

