Overview
A common mixing algorithm multiplies each input's amplitude by a weight and sums the results. For a two-input mix, with a1 and a2 the weights of source audio 1 and source audio 2, the output is out = a1 × s1 + a2 × s2, with the requirement that a1 + a2 = 1.
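A minimal sketch of this weighted sum, assuming float samples and hypothetical buffer names:

// Two-input weighted-sum mix: out = a1*s1 + a2*s2.
// Keeping a1 + a2 = 1 keeps the result within the original amplitude range.
static void mix2(const float *s1, const float *s2, float *out,
                 int nb_samples, float a1, float a2)
{
    for (int i = 0; i < nb_samples; i++)
        out[i] = a1 * s1[i] + a2 * s2[i];
}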
Introduction to af_amix
The design of af_amix in FFmpeg largely follows this common algorithm.
By default, af_amix gives every input the same weight (1/n, where n is the number of inputs being mixed). To emphasize one input, you can supply a weights array (space-separated); for a three-input mix, the array "8 1 1" yields effective weights of 0.8, 0.1 and 0.1, replacing the default 1/3. (See the excerpted code below for how this is computed.)
In addition, since the sources may differ in duration, af_amix watches for changes in the number of active inputs and updates the weights dynamically. For a three-input mix where the first two inputs are 60 seconds long and the third is 30 seconds long, only two inputs remain after the 30-second mark, and the weights are updated to 8/9 and 1/9.
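The normalization is simply each weight divided by the sum of the active inputs' weights:

weights = "8 1 1": sum = 10, scales = 8/10, 1/10, 1/10 = 0.8, 0.1, 0.1
after input 3 ends: sum = 9, scales = 8/9 ≈ 0.889, 1/9 ≈ 0.111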
At this point the reader should have a working picture of FFmpeg's af_amix. The rest of this article focuses on its problems and presents the results of one solution for readers to evaluate. (See the waveform figures below; the source audio and mixed audio can be downloaded from the Baidu Cloud link for listening tests, and the share also contains an MFC tool for 64-bit Windows that you are welcome to try.)
Link: https://pan.baidu.com/s/1dS3PzQwmbF3vONswsmLRwg
Extraction code: amix
Below are several functions excerpted from FFmpeg for reference.
1. The weights array parsing function
static void parse_weights(AVFilterContext *ctx)
{// hybase@qq.com 2022.05.20
    MixContext *s = ctx->priv;
    float last_weight = 1.f;
    char *p;
    int i;

    s->weight_sum = 0.f;
    p = s->weights_str;
    av_log(ctx, AV_LOG_VERBOSE, "[parse_weights] weights_str=%s\n", p);
    for (i = 0; i < s->nb_inputs; i++) {
        last_weight = av_strtod(p, &p);
        s->weights[i] = last_weight;
        s->weight_sum += FFABS(last_weight);
        av_log(ctx, AV_LOG_VERBOSE, "[parse_weights] weights[%d]=%f, weight_sum=%f\n", i, s->weights[i], s->weight_sum);
        if (p && *p) {
            p++;
        } else {
            i++;
            break;
        }
    }

    for (; i < s->nb_inputs; i++) {
        s->weights[i] = last_weight;
        s->weight_sum += FFABS(last_weight);
        av_log(ctx, AV_LOG_VERBOSE, "[parse_weights] weights[%d]=%f, weight_sum=%f\n", i, s->weights[i], s->weight_sum);
    }
}
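Note the second loop: when fewer weights than inputs are supplied, the remaining inputs inherit the last parsed weight, so with four inputs and weights "2 1" the effective weights are 2, 1, 1, 1 (scales 2/5, 1/5, 1/5, 1/5). On the command line the option is passed like this (illustrative file names):

ffmpeg -i a.wav -i b.wav -i c.wav -filter_complex "amix=inputs=3:weights='8 1 1'" out.wav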
2. The function that checks input state and dynamically computes the actual scale values
static void calculate_scales(MixContext *s, int nb_samples)
{// hybase@qq.com 2022.05.20
    float weight_sum = 0.f;
    int i;

    /* sum of |weights| over the inputs that are still active */
    for (i = 0; i < s->nb_inputs; i++)
        if (s->input_state[i] & INPUT_ON)
            weight_sum += FFABS(s->weights[i]);

    /* ramp scale_norm toward its new target over dropout_transition seconds */
    for (i = 0; i < s->nb_inputs; i++) {
        if (s->input_state[i] & INPUT_ON) {
            if (s->scale_norm[i] > weight_sum / FFABS(s->weights[i])) {
                s->scale_norm[i] -= ((s->weight_sum / FFABS(s->weights[i])) / s->nb_inputs) *
                                    nb_samples / (s->dropout_transition * s->sample_rate);
                s->scale_norm[i] = FFMAX(s->scale_norm[i], weight_sum / FFABS(s->weights[i]));
            }
        }
    }

    for (i = 0; i < s->nb_inputs; i++) {
        if (s->input_state[i] & INPUT_ON) {
            if (!s->normalize)
                s->input_scale[i] = FFABS(s->weights[i]);
            else
                s->input_scale[i] = 1.0f / s->scale_norm[i] * FFSIGN(s->weights[i]);
        } else {
            s->input_scale[i] = 0.0f;
        }
    }
}
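To make the ramp concrete: with normalize on, input i's effective scale is 1/scale_norm[i]. For three equal-weight inputs each scale starts at 1/3 (scale_norm = 3); when one input ends, the target scale becomes 1/2 (scale_norm = 2), and the middle loop above walks scale_norm from 3 down to 2 in per-frame steps of roughly nb_samples / (dropout_transition × sample_rate), so the full transition takes dropout_transition seconds rather than jumping at once.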
3. The function that performs the mixing
static int output_frame(AVFilterLink *outlink)
{// hybase@qq.com 2022.05.20
    AVFilterContext *ctx = outlink->src;
    MixContext *s = ctx->priv;
    AVFrame *out_buf, *in_buf;
    int nb_samples, ns, i;

    if (s->input_state[0] & INPUT_ON) {
        /* first input live: use the corresponding frame size */
        nb_samples = frame_list_next_frame_size(s->frame_list);
        for (i = 1; i < s->nb_inputs; i++) {
            if (s->input_state[i] & INPUT_ON) {
                ns = av_audio_fifo_size(s->fifos[i]);
                if (ns < nb_samples) {
                    if (!(s->input_state[i] & INPUT_EOF))
                        /* unclosed input with not enough samples */
                        return 0;
                    /* closed input to drain */
                    nb_samples = ns;
                    av_log(ctx, AV_LOG_DEBUG, "[output_frame] av_audio_fifo_size() input_%d: update to nb_samples=%d\n", i, nb_samples);
                }
            }
        }
    } else {
        /* first input closed: use the available samples */
        nb_samples = INT_MAX;
        for (i = 1; i < s->nb_inputs; i++) {
            if (s->input_state[i] & INPUT_ON) {
                ns = av_audio_fifo_size(s->fifos[i]);
                nb_samples = FFMIN(nb_samples, ns);
                av_log(ctx, AV_LOG_DEBUG, "[output_frame] av_audio_fifo_size() input_%d, update to nb_samples=%d\n", i, nb_samples);
            }
        }
        if (nb_samples == INT_MAX) {
            ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
            return 0;
        }
    }

    frame_list_remove_samples(s->frame_list, nb_samples);

    calculate_scales(s, nb_samples);// hybase@qq.com 2022.05.20

    if (nb_samples == 0)
        return 0;

    out_buf = ff_get_audio_buffer(outlink, nb_samples);
    if (!out_buf)
        return AVERROR(ENOMEM);

    in_buf = ff_get_audio_buffer(outlink, nb_samples);
    if (!in_buf) {
        av_frame_free(&out_buf);
        return AVERROR(ENOMEM);
    }

    for (i = 0; i < s->nb_inputs; i++) {
        if (s->input_state[i] & INPUT_ON) {
            int planes, plane_size, p;

            av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
                               nb_samples);

            planes     = s->planar ? s->nb_channels : 1;
            plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
            plane_size = FFALIGN(plane_size, 16);

            if (out_buf->format == AV_SAMPLE_FMT_FLT ||
                out_buf->format == AV_SAMPLE_FMT_FLTP) {
                for (p = 0; p < planes; p++) {
                    s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
                                                (float *) in_buf->extended_data[p],
                                                s->input_scale[i], plane_size);
                }
            } else {
                for (p = 0; p < planes; p++) {
                    s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
                                                (double *) in_buf->extended_data[p],
                                                s->input_scale[i], plane_size);
                }
            }
        }
    }

    av_frame_free(&in_buf);

    out_buf->pts = s->next_pts;
    if (s->next_pts != AV_NOPTS_VALUE)
        s->next_pts += nb_samples;

    return ff_filter_frame(outlink, out_buf);
}
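The accumulation itself happens in the fdsp multiply-accumulate primitives: vector_fmac_scalar() computes dst[i] += src[i] * mul, so each active input is scaled by input_scale[i] and added into the output buffer. A scalar reference of the float version:

// Scalar equivalent of what fdsp->vector_fmac_scalar() does per plane.
static void fmac_scalar_ref(float *dst, const float *src, float mul, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] += src[i] * mul;
}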
Shortcomings of af_amix
af_amix covers most use cases, especially scenarios that are not sensitive to reduced volume. Pairing it with the loudnorm or dynaudnorm audio filters to lift loudness afterwards can also mitigate the quieter output after mixing.
A concrete special case: when three inputs are mixed with equal weights and two of them happen to be silent, the mixed result is only one third of the third input's amplitude. With more inputs, the output becomes quieter still.
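In numbers: with three equal scales of 1/3, out = (1/3)·0 + (1/3)·0 + (1/3)·s3 = s3/3; in general, with n inputs of which n−1 are silent, the one audible input comes through at 1/n of its original amplitude.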
Below is a five-input mixing example; the script is as follows:
#!/bin/sh
# audio filter amix (ffmpeg af_amix)
# 5 inputs (hybase@qq.com)
#
ulimit -c 9999999
date1="`date +"%Y-%m-%d %H:%M:%S"`"
input_file=" -i /audio/t001.wav -i /audio/t002.wav -i /audio/t003.wav -i /audio/t004.wav -i /audio/t005.wav"
echo "==== start ===="
ffmpeg -loglevel verbose -y -fflags +genpts $input_file \
    -filter_complex \
    "amix=inputs=5:duration=first:dropout_transition=2" \
    -acodec mp3 -ar 44100 -ac 2 -ab 192k \
    amix_00.mp3
echo "==== end ===="
exit
Waveform of source audio 1 (30 seconds)
Waveform of source audio 2 (18 seconds)
Waveform of source audio 3 (30 seconds)
Waveform of source audio 4 (33 seconds)
Waveform of source audio 5 (8.5 seconds)
Waveform of FFmpeg's native amix mixing the five inputs with equal weights
Comparing the waveforms, two problems stand out:
1) The mixed waveform's amplitude is visibly much lower, and the problem clearly worsens as the number of inputs grows.
2) The five inputs differ in duration. Because the algorithm recounts the active inputs and recomputes the weights dynamically, the remaining inputs grow louder step by step as others end; the volume is unstable, audibly rising from quiet to loud. (See the arithmetic below, and listen to "amix_ffmpeg_音量变大情况.mp3" in the Baidu Cloud share.)
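A quick calculation of the second problem: with five equal-weight inputs each scale is 1/5; when input 5 ends at 8.5 s, the surviving scales ramp to 1/4 (each remaining input becomes 25% louder), then to 1/3 when input 2 ends at 18 s, and so on, each step smeared over dropout_transition seconds.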
In short, af_amix cannot deliver good output volume in the situations above, and the listening experience suffers.
The improved approach
First, the waveform produced by the improved algorithm (balanced mix):
Excerpted code for reference:
1. Added options
static const AVOption amix_options[] = {
    { ..., // hybase@qq.com unchanged entries omitted
#if USE_MY_AMIX
    { "use_my_amix", "my amix use flag",
            OFFSET(use_my_amix), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, A|F|T },
    { "volume_scalse", "Set a volume scale for each input (valid only when use_my_amix is set)",
            OFFSET(volume_scalse_str), AV_OPT_TYPE_STRING, {.str="1 1"}, 0, 0, A|F|T },
#endif
    { NULL }
};
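Assuming volume_scalse is parsed like weights, as a space-separated list (my reading of the "1 1" default string, not confirmed by the excerpt), the new options might be used like this:

amix=inputs=3:use_my_amix=1:volume_scalse='1 2 1'

which would keep inputs 1 and 3 at their original volume and double input 2.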
2. Custom functions
/**
 * return > 0 success, <= 0 failure
 */
static int hybase_proc_mix(SPcmInput *pInputs, size_t nInputCount, uint8_t *pOutBuf, size_t nOutBufSize, int nBaseAudio)
{...}

// read input plane by plane
static int hybase_mix_read(AVFilterContext *ctx, MixContext *s, AVFrame *in_buf, int format, int nb_samples)
{...}

static int hybase_mix_plane(AVFilterContext *ctx, MixContext *s)
{
    int p;
    int planes = s->planar ? s->nb_channels : 1;

    for (p = 0; p < planes; p++) {
        int base_audio = -1;
        int i, ret, nb_valid;
        uint8_t *mixed_buf = s->mixed_frame->data[p];
        int mixed_buf_size = s->mixed_frame->linesize[0];

        /* collect the active inputs for this plane */
        nb_valid = 0;
        for (i = 0; i < s->nb_inputs; i++) {
            if (s->input_state[i] & INPUT_ON) {
                SPcmInput *input = &s->pcm_inputs[nb_valid];
                input->pBuf  = s->input_frames[i]->data[p];
                input->nSize = s->input_frames[i]->linesize[0];
                if (fabs(s->volume_scalse[i] - 1.0000) < 0.001)
                    input->nVolume = 100; // keep original volume
                else
                    input->nVolume = (int)(100 * s->volume_scalse[i]);
                if (base_audio == -1)
                    base_audio = nb_valid;
                nb_valid++;
            }
        }
        // hybase@qq.com mix process
        ret = hybase_proc_mix(s->pcm_inputs, nb_valid, mixed_buf, mixed_buf_size, base_audio);
        if (ret <= 0)
            av_log(ctx, AV_LOG_ERROR, "[hybase_mix_plane] !=== hybase_proc_mix() failed. "
                   "ret=%d, nb_valid=%d, p=%d(planes=%d), mixed_buf=%p(size=%d)\n",
                   ret, nb_valid, p, planes, mixed_buf, mixed_buf_size);
    }
    return 0;
}

static void hybase_mix_output(AVFilterContext *ctx, MixContext *s, AVFrame *out_buf, int nb_samples)
{...}
3. Changes to the main execution function
/**
 * Read samples from the input FIFOs, mix, and write to the output link.
 */
static int output_frame(AVFilterLink *outlink)
{
    ... // hybase@qq.com unchanged code omitted here

    in_buf = ff_get_audio_buffer(outlink, nb_samples);
    if (!in_buf) {
        av_frame_free(&out_buf);
        return AVERROR(ENOMEM);
    }

#if USE_MY_AMIX
    if (s->use_my_amix) {
        // grow the scratch buffers if this frame is larger than any seen so far
        if (nb_samples > s->max_nb_samples) {
            int ret = reset_audio_buffer(ctx, s, in_buf->format, in_buf->channels, nb_samples);
            if (ret != 0) {
                av_frame_free(&in_buf);
                av_frame_free(&out_buf);
                return ret;
            }
        }
        // read the inputs; they are saved to s->input_frames[i]->data
        hybase_mix_read(ctx, s, in_buf, in_buf->format, nb_samples);
        //! do mix
        hybase_mix_plane(ctx, s);
        //! output
        hybase_mix_output(ctx, s, out_buf, nb_samples);
    } else {
#endif
    for (i = 0; i < s->nb_inputs; i++) {
        if (s->input_state[i] & INPUT_ON) {
            int planes, plane_size, p;

            av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
                               nb_samples);

            planes     = s->planar ? s->nb_channels : 1;
            plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
            plane_size = FFALIGN(plane_size, 16);

            if (out_buf->format == AV_SAMPLE_FMT_FLT ||
                out_buf->format == AV_SAMPLE_FMT_FLTP) {
                for (p = 0; p < planes; p++) {
                    s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
                                                (float *) in_buf->extended_data[p],
                                                s->input_scale[i], plane_size);
                }
            } else {
                for (p = 0; p < planes; p++) {
                    s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
                                                (double *) in_buf->extended_data[p],
                                                s->input_scale[i], plane_size);
                }
            }
        }
    }
#if USE_MY_AMIX
    }
#endif

    ... // hybase@qq.com unchanged code omitted here
}
4. The script
#!/bin/sh
# audio filter amix (ffmpeg af_amix and my amix)
# 5 inputs (hybase@qq.com)
#
ulimit -c 9999999
date1="`date +"%Y-%m-%d %H:%M:%S"`"
input_file=" -i /audio/t001.wav -i /audio/t002.wav -i /audio/t003.wav -i /audio/t004.wav -i /audio/t005.wav"
echo "==== start ===="
ffmpeg -loglevel verbose -y -fflags +genpts $input_file \
    -filter_complex \
    "amix=inputs=5:duration=first:dropout_transition=2:use_my_amix=1" \
    -acodec mp3 -ar 44100 -ac 2 -ab 192k \
    my_amix_00.mp3
echo "==== end ===="
exit
In summary, the improved algorithm is particularly well suited to video conferencing and multi-party voice calls. It is no longer the multiply-by-weight-and-sum approach described above, so it avoids the amplitude reduction by design. It can still emphasize or attenuate individual inputs on demand, and its computational cost is very small; it works in floating point and is equally suited to fixed-point platforms.
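The core of hybase_proc_mix() is not published. Purely as a hypothetical sketch of one well-known family of non-attenuating mixers, and explicitly not the author's algorithm: sum the inputs at unity gain and pull an adaptive gain down only when the sum would clip, letting it recover between peaks. Integer-only variants of the same idea exist, which is consistent with the fixed-point claim above.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical sketch only; NOT the author's hybase_proc_mix().
 * Sum 16-bit PCM inputs at unity gain; when the sum would clip,
 * shrink an adaptive gain so the peak just fits, then let the gain
 * recover slowly toward 1.0 between peaks. */
void mix_no_attenuation(const int16_t **in, size_t n_in,
                        int16_t *out, size_t n_samples, float *gain)
{
    for (size_t i = 0; i < n_samples; i++) {
        int32_t sum = 0;
        for (size_t k = 0; k < n_in; k++)
            sum += in[k][i];

        float v = sum * *gain;
        if (v > 32767.0f)  { *gain = 32767.0f / (float)sum;  v = 32767.0f; }
        if (v < -32768.0f) { *gain = -32768.0f / (float)sum; v = -32768.0f; }

        out[i] = (int16_t)v;
        if (*gain < 1.0f)   /* recover toward unity between peaks */
            *gain += (1.0f - *gain) / 4096.0f;
    }
}

Here *gain should be initialized to 1.0f and kept across calls so the recovery is continuous from frame to frame.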
The algorithm is available commercially; serious inquiries only: hybase@qq.com
To judge the results yourself, download the mp3 files from the link below.
File details
The verification tool
About the verification tool
A sample run of the verification tool
Link: https://pan.baidu.com/s/1dS3PzQwmbF3vONswsmLRwg
Extraction code: amix