FFMPEG 之 parser 一文搞懂FFMPEG 分帧、组帧Framing

FFMPEG 之 parse_packet

前言

现实世界中的声音图像采样后经过音视频压缩技术压缩而成的码流称为ES流（Elementary Stream），ES流中包含有解码器解码文件必须的信息，比如视频宽高，采样格式，声音的采样率，声道等等。为了方便传输，播放，将音视频ES数据打包到一个文件中，这个文件称之为音视频ES流的封装，常见的音视频封装格式有：MP3，MP4，AVI，MKV，FLV，RMVB，TS，PS等等。
解码器源源不断地读取ES流，解码输出，音乐美妙，画面流畅，这一切看起来都是那么的和谐。播放的时候，为了音视频同步，引入了PTS和DTS，即Presentation Time Stamp和Decode Time Stamp 。有的解码器要求将PTS放在ES流的前面几个字节，有的解码器要求将PTS通过描述信息，用描述信息头的方式传入，颇似带内和带外传输。
这就要求将ES流分成一段一段的，用来将PTS和该段的ES流对应起来，这就是所称的分帧，组帧Framing操作。音频的Framing较为简单，SYNC WORD加上两个SYNC WORD之间的部分就是一帧，SYNC WORD也相对固定。图像的Framing相对复杂，除上述原因之外，有的解码器要求严格按照图像的边界来描述ES流，如H264的流必须以00 00 00 01 或 00 00 01开始。再者，有些文件在打包的时候，只包含了ES流，无封装格式，比如一些MPEG格式的视频。这些MPEG格式的视频文件后缀可以是mpgv，mpv，mp1v，m1v，mp2v，m2v。所以，这才有了本文需要讲述的packet parse。本文分析基于FFMPEG4.2.2。

数据流向图

为了兼容各种类型的解码器，播放器在读取原始文件的ES流的时候，在ES流的边界上将文件里面的数据流分成一帧一帧的。这个过程叫做parse。不同类型的ES流，帧的边界不一样，因此parser也不同。在parser设计的时候，切忌将不同类型的parser集中设计，宜采用分治的思路，将不同类型的ES流的parse功能按编码类型设计到独立文件中。本文也认为parser属于adec，vdec的功能，可以将parser放到不同类型的独立性的adec，vdec文件中，这样，包括像frame rate估计，视频宽高等等基本信息的获取，这些工作也可以在parser中完成，实际上FFMPEG中视频基本信息就是在parser中完成的。下面是MPEG VIDEO的帧边界示意图。分析软件为：Elecard Stream Analyzer v.2.3

Parse的过程就是将这些元素切割，分开来。我们把这个切割分开的一段数据叫帧。

parse 流解决的几个问题：

相关的数据结构

与FFMPEG Framing相关的重要结构体有两个，AVCodecParserContext和AVCodecParser，说明如下：

AVCodecParserContext

/*
* Parser state used while framing one stream (excerpt: members omitted by the
* article are marked with "……").
*/
typedef struct AVCodecParserContext {
/* Framing differs from format to format, so a codec parser may need extra
* state of its own to keep its framing context;
* priv_data points to that codec-specific context. */
void *priv_data;
/* Codec-specific parser providing the parse hooks
* (av_parser_parse2 calls s->parser->parser_parse). */
struct AVCodecParser *parser;
/* offset of the current frame,
* NOTE(review): per the article, with AVSTREAM_PARSE_FULL_RAW this is the
* position of the current frame in the file; otherwise it is the offset of
* the output frame.
* frame_offset, cur_offset and next_frame_offset are all plain offsets.
* In the article author's reading, these three values only relate to file
* positions in the raw case.
*/
int64_t frame_offset;
/*
* Offset of the current packet's ES data within all ES data demuxed so far.
* av_parser_parse2 initialises it to the pos of the first packet and then
* advances it by the number of bytes consumed on each call:
*
*   | pkt1 | pkt2 | pkt3 | ... | pktn |   <- packets forming one frame
*   ^ initial cur_offset               ^ cur_offset after the frame
*                                        (initial cur_offset + frame length)
*
* - At the start: the file position of the first packet.
* - Once the next frame header is found: the position of that header
*   relative to the first packet (cur_offset += frame1_len); from then on it
*   has no direct correspondence to a file position.
* - While no frame header has been found: how far parsing has progressed,
*   relative to the first packet.
* It can be read as a position within the demuxed ES data and so indirectly
* describes the length of the ES stream.
*/
int64_t cur_offset;
/* Offset of the next frame to be output
* (av_parser_parse2 sets it to cur_offset + index when a frame is output). */
int64_t next_frame_offset; /* offset of the next frame */
/* video info: picture type of the frame to be output (I, P, B, S, ...) */
int pict_type; /* XXX: Put it back in AVCodecContext. */
/*
* Used to compute the frame duration:
* frame_duration = (1 + repeat_pict) * time_base
* It is used by codecs like H.264 to display telecined material.
*/
int repeat_pict; /* XXX: Put it back in AVCodecContext. */
int64_t pts; /* pts of the current frame */
int64_t dts; /* dts of the current frame */

/* private data */
int64_t last_pts;
int64_t last_dts;
/* Set to 1 by av_parser_parse2 after a frame has been output, so that the
* next call fetches the timestamps of the following frame.
* NOTE(review): the article says it is also set on first use; that
* initialisation is not part of this excerpt. */
int fetch_timestamp;

#define AV_PARSER_PTS_NB 4
/* The parser remembers AV_PARSER_PTS_NB (4) packets so that the frame being
* output can be given a suitable pos, pts and dts.
* cur_frame_start_index is the slot of the most recently recorded packet. */
int cur_frame_start_index;
/* Start offset of each recorded packet, relative to the start of the ES stream */
int64_t cur_frame_offset[AV_PARSER_PTS_NB];
int64_t cur_frame_pts[AV_PARSER_PTS_NB];
int64_t cur_frame_dts[AV_PARSER_PTS_NB];

int flags;
#define PARSER_FLAG_COMPLETE_FRAMES 0x0001
#define PARSER_FLAG_ONCE 0x0002
/// Set if the parser has a valid file offset
#define PARSER_FLAG_FETCHED_OFFSET 0x0004
#define PARSER_FLAG_USE_CODEC_TS 0x1000

/* ff_fetch_timestamp sets this to next_frame_offset - cur_frame_offset[i]
* for the packet the timestamps were taken from; the article describes it
* as roughly the frame length. */
int64_t offset; ///< byte offset from starting packet start
/* End offset of each recorded packet (cur_offset + buf_size when recorded) */
int64_t cur_frame_end[AV_PARSER_PTS_NB];

/**
* Set by parser to 1 for key frames and 0 for non-key frames.
* It is initialized to -1, so if the parser doesn't set this flag,
* old-style fallback using AV_PICTURE_TYPE_I picture type as key frames
* will be used.
*/
int key_frame;

……

/**
* Position of the packet in file.
* Unlike the *_offset members, this one is the actual file position.
* Analogous to cur_frame_pts/dts
*/
int64_t cur_frame_pos[AV_PARSER_PTS_NB];

/**
* Byte position of currently parsed frame in stream.
*/
int64_t pos;

/**
* Previous frame byte position.
*/
int64_t last_pos;

……
} AVCodecParserContext;

ParseContext

/*
* Buffering state shared by the codec parsers while frames are assembled.
* NOTE(review): the field descriptions below are the article author's reading
* of ff_combine_frame, which is not shown in this excerpt.
*/
typedef struct ParseContext{
/*
* Holds the data parsed so far. Once a frame has been found, parse_packet
* calls av_packet_make_refcounted to create a new buffer and copies the
* data out of this one.
*/
uint8_t *buffer;
/* End of the used part of buffer; reset to 0 once a frame has been found */
int index;
/* The index value used the previous time */
int last_index;
/* Allocated size of buffer; may differ from the amount of data held in it */
unsigned int buffer_size;
/* The last 4 bytes of the stream, most significant byte first */
uint32_t state;
/* For use by the individual codec parsers */
int frame_start_found;
/*
* Number of bytes belonging to the next frame that were unavoidably read in.
* That data may have been read on the parser side, outside the control of
* read_frame_internal, hence this bookkeeping.
*/
int overread;
/* Index in buffer of the over-read data left over after a frame is output */
int overread_index;
/* The last 8 bytes of the stream, most significant byte first */
uint64_t state64;
} ParseContext;

几个相关函数：

read_frame_internal

/*
* All FFmpeg parsers are driven from this one place: parsing happens while
* a frame is being read.
* Excerpt: declarations and other code omitted by the article are marked
* with "……" / "......".
*/
static int read_frame_internal(AVFormatContext *s, AVPacket *pkt)
{
/* Keep reading until a packet is ready or the parse queue is non-empty. */
while(!got_packet && !s->internal->parse_queue) {
/* read next packet */
ret = ff_read_packet(s, &cur_pkt);
if(ret < 0) {
/* flush the parsers */
for(i = 0; i < s->nb_streams; i++) {
st = s->streams[i];
/* No more data can be read from the stream, so try to output the last frame */
if(st->parser && st->need_parsing) {
parse_packet(s, NULL, st->index);
}
}
/* all remaining packets are now in parse_queue =>
* really terminate parsing */
break;
}
/* close parser, because it depends on the codec: the parser is tied to
* the codec, so it has to be replaced when the codec changes */
if(st->parser && st->internal->avctx->codec_id != st->codecpar->codec_id) {
av_parser_close(st->parser);
st->parser = NULL;
}
......
/* A parser is needed but none exists yet: initialise one. */
if (st->need_parsing && !st->parser && !(s->flags & AVFMT_FLAG_NOPARSE)) {
st->parser = av_parser_init(st->codecpar->codec_id);
if (!st->parser) {
/* no parser available: just output the raw packets */
st->need_parsing = AVSTREAM_PARSE_NONE;
} else if (st->need_parsing == AVSTREAM_PARSE_HEADERS)
st->parser->flags |= PARSER_FLAG_COMPLETE_FRAMES;
else if (st->need_parsing == AVSTREAM_PARSE_FULL_ONCE)
st->parser->flags |= PARSER_FLAG_ONCE;
else if (st->need_parsing == AVSTREAM_PARSE_FULL_RAW)
st->parser->flags |= PARSER_FLAG_USE_CODEC_TS;
}
if(!st->need_parsing || !st->parser) {
......
/* Streams that need no parsing are output directly */
got_packet = 1;
} else if(st->discard < AVDISCARD_ALL) {
/* The stream is not marked for discarding, so parse it */
if((ret = parse_packet(s, &cur_pkt, cur_pkt.stream_index)) < 0) {
return ret;
}
/* Copy the stream parameters from the internal codec context back into codecpar */
st->codecpar->sample_rate = st->internal->avctx->sample_rate;
st->codecpar->bit_rate = st->internal->avctx->bit_rate;
st->codecpar->channels = st->internal->avctx->channels;
st->codecpar->channel_layout = st->internal->avctx->channel_layout;
st->codecpar->codec_id = st->internal->avctx->codec_id;
} else {
/* Streams marked AVDISCARD_ALL simply have their pkt released. free packet */
av_packet_unref(&cur_pkt);
}
}
……
/* No packet was produced and the parse queue is not empty:
* take a packet from the parse queue and output it. */
if (!got_packet && s->internal->parse_queue)
ret = ff_packet_list_get(&s->internal->parse_queue, &s->internal->parse_queue_end, pkt);

return ret;
}

parse_packet

函数原型：static int parse_packet(AVFormatContext *s, AVPacket *pkt, int stream_index)

详细分析如下：

/*
* Feed one demuxed packet (or a flush request) to the stream's parser and
* append every complete frame it produces to the parse queue.
* Excerpt: code omitted by the article is marked with "……".
*/
static int parse_packet(AVFormatContext *s, AVPacket *pkt, int stream_index)
{
……
/* size > 0: either this is the first pass and pkt carries data, or a frame
* has already been found and data is left over, so framing can continue.
* (pkt == &flush_pkt && got_output): nothing more can be read from the
* stream; the data buffered earlier has to be processed to see whether it
* can be output as the last frame.
*/
while (size > 0 || (pkt == &flush_pkt && got_output)) {
……
/* Parse the size bytes that data points to; the returned len is the number
* of bytes consumed.
* out_pkt.size is non-zero and out_pkt.data non-NULL only once a frame
* boundary has been found.
*/
len = av_parser_parse2(st->parser, st->internal->avctx,
&out_pkt.data, &out_pkt.size, data, size,
pkt->pts, pkt->dts, pkt->pos);

……
got_output = !!out_pkt.size;
/* No frame boundary found: keep looking. If size has already reached 0 the
* loop exits, and the data handled so far stays in the buffer allocated by
* the parser.
* Per the article, that buffer is not yet the final output buffer: FFmpeg's
* final output buffer needs AV_INPUT_BUFFER_PADDING_SIZE on top of the data
* size and is reallocated.
* In both cases the parser-side buffer in use is pc->buffer.
*/
if (!out_pkt.size)
continue;
……
if (pkt->buf && out_pkt.data == pkt->data) {
/* body not shown in this excerpt */
} else {
/* Copy the parser-managed data that out_pkt.data points to into a newly
* allocated reference-counted buffer and make out_pkt.data point to it.
* The parser's own buffer pc->buffer is not freed; per the article, the
* next ff_combine_frame shrinks it on realloc because less data is held.
*/
ret = av_packet_make_refcounted(&out_pkt);
}

……

out_pkt.stream_index = st->index;
out_pkt.pts = st->parser->pts;
out_pkt.dts = st->parser->dts;
out_pkt.pos = st->parser->pos;
out_pkt.flags |= pkt->flags & AV_PKT_FLAG_DISCARD;

/* Per the article, once a frame has been collected its information is
* parsed again (code elided above). Here out_pkt, with its pts and file
* position filled in, is appended to the parse queue. */
ret = ff_packet_list_put(&s->internal->parse_queue,
&s->internal->parse_queue_end,
&out_pkt, 0);
if (ret < 0) {
/* The data of this packet is dropped */
av_packet_unref(&out_pkt);
goto fail;
}
}

/* end of the stream => close and free the parser */
if (pkt == &flush_pkt) {
av_parser_close(st->parser);
st->parser = NULL;
}

fail:
/* Done with this pkt: release the AVBufferRef it holds and clear its fields.
* The buffer holding the data is only freed once it has no users left!
*/
av_packet_unref(pkt);
return ret;
}

av_parser_parse2

/*
* Run the codec-specific parser over one input buffer and update the offset
* and timestamp bookkeeping in the parser context.
*
* @param s            parser context
* @param avctx        codec context; its codec_id must be one the parser supports
* @param poutbuf      passed through to the codec parser, which sets the output frame data
* @param poutbuf_size passed through to the codec parser; non-zero afterwards
*                     means a frame is being output
* @param buf          input data
* @param buf_size     input size; 0 makes the function parse a zeroed padding buffer instead
* @param pts          pts recorded for a newly seen packet
* @param dts          dts recorded for a newly seen packet
* @param pos          pos recorded for a newly seen packet; also the initial
*                     offset on the first call
* @return the index returned by the codec parser, clamped to be non-negative
*         (parse_packet treats it as the number of input bytes used)
*/
int av_parser_parse2(AVCodecParserContext *s, AVCodecContext *avctx,
uint8_t **poutbuf, int *poutbuf_size,
const uint8_t *buf, int buf_size,
int64_t pts, int64_t dts, int64_t pos)
{
int index, i;
uint8_t dummy_buf[AV_INPUT_BUFFER_PADDING_SIZE];

av_assert1(avctx->codec_id != AV_CODEC_ID_NONE);

/* Parsers only work for the specified codec ids. */
av_assert1(avctx->codec_id == s->parser->codec_ids[0] ||
avctx->codec_id == s->parser->codec_ids[1] ||
avctx->codec_id == s->parser->codec_ids[2] ||
avctx->codec_id == s->parser->codec_ids[3] ||
avctx->codec_id == s->parser->codec_ids[4]);
/* On the first call flags is 0, so this branch runs and sets the offsets
* to the pos of the current pkt */
if (!(s->flags & PARSER_FLAG_FETCHED_OFFSET)) {
s->next_frame_offset =
s->cur_offset = pos;
s->flags |= PARSER_FLAG_FETCHED_OFFSET;
}

if (buf_size == 0) {
/* padding is always necessary even if EOF, so we add it here;
* per the article, handling it this way makes all buffered data get output.
*/
memset(dummy_buf, 0, sizeof(dummy_buf));
buf = dummy_buf;
/* (else branch) a new packet has arrived, rather than data left unused by
* the previous framing pass */
} else if (s->cur_offset + buf_size != s->cur_frame_end[s->cur_frame_start_index]) { /* skip remainder packets */
/* add a new packet descriptor */
i = (s->cur_frame_start_index + 1) & (AV_PARSER_PTS_NB - 1);
s->cur_frame_start_index = i;
s->cur_frame_offset[i] = s->cur_offset;
s->cur_frame_end[i] = s->cur_offset + buf_size;
s->cur_frame_pts[i] = pts;
s->cur_frame_dts[i] = dts;
s->cur_frame_pos[i] = pos;
}
/* fetch_timestamp is set to 1 after the previous frame was output;
* now look up the timestamps for the next frame. */
if (s->fetch_timestamp) {
s->fetch_timestamp = 0;
s->last_pts = s->pts;
s->last_dts = s->dts;
s->last_pos = s->pos;
ff_fetch_timestamp(s, 0, 0, 0);
}
/* WARNING: the returned index can be negative.
* NOTE(review): the article reads a negative value as "all data of the
* current packet belongs to the previous frame", and a positive value as
* either the offset within buf of the frame header at the next start code
* or the size of the current pkt (the latter again meaning the whole packet
* belongs to the previous frame). The parser_parse implementations are not
* shown here, so confirm against the codec parser in question.
*/
index = s->parser->parser_parse(s, avctx, (const uint8_t **) poutbuf,
poutbuf_size, buf, buf_size);
av_assert0(index > -0x20000000); // The API does not allow returning AVERROR codes
#define FILL(name) if(s->name > 0 && avctx->name <= 0) avctx->name = s->name
if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
FILL(field_order);
}

/* update the file pointer; a non-zero size means the next frame header has
* been found */
if (*poutbuf_size) {
/* fill the data for the current frame */
s->frame_offset = s->next_frame_offset;

/* offset of the next frame */
s->next_frame_offset = s->cur_offset + index;
s->fetch_timestamp = 1;
}
/*
* A negative index is clamped to 0, so cur_offset is left unchanged in that
* case; otherwise cur_offset advances by index.
* Article's observation: when no next frame header is found one might expect
* the codec parser to update cur_offset itself, but FFmpeg does not do that;
* instead the parser returns the size of the current packet when the whole
* packet belongs to the previous frame. s->cur_offset therefore represents
* the relative position parsed so far.
*/
if (index < 0)
index = 0;
s->cur_offset += index;
return index;
}

ff_fetch_timestamp

/*
* Fetch the timestamps (pts, dts) and file position for the current frame
* from the recorded packet descriptors.
* @param s parser context whose pts/dts/pos/offset members are updated
* @param off offset of the frame's next start code relative to cur_offset
* @param remove Found timestamps will be removed if set to 1, kept if set to 0.
* @param fuzzy Only use found value if it is more informative than what we already have
*/
void ff_fetch_timestamp(AVCodecParserContext *s, int off, int remove, int fuzzy)
{
int i;

/* Unless fuzzy, start from "unknown" so stale values are never kept. */
if (!fuzzy) {
s->dts =
s->pts = AV_NOPTS_VALUE;
s->pos = -1;
s->offset = 0;
}
for (i = 0; i < AV_PARSER_PTS_NB; i++) {
/* Make sure the current frame lies within the range covered by the
* recorded packet information */
if (s->cur_offset + off >= s->cur_frame_offset[i] &&
(s->frame_offset < s->cur_frame_offset[i] ||
(!s->frame_offset && !s->next_frame_offset)) && // first field/frame
// check disabled since MPEG-TS does not send complete PES packets
/*s->next_frame_offset + off <*/ s->cur_frame_end[i]){

/* In fuzzy mode only take the entry if it carries a valid dts. */
if (!fuzzy || s->cur_frame_dts[i] != AV_NOPTS_VALUE) {
s->dts = s->cur_frame_dts[i];
s->pts = s->cur_frame_pts[i];
s->pos = s->cur_frame_pos[i];
s->offset = s->next_frame_offset - s->cur_frame_offset[i];
}
/* INT64_MAX as start offset keeps this entry from matching again. */
if (remove)
s->cur_frame_offset[i] = INT64_MAX;
/* This packet has not been fully consumed yet. */
if (s->cur_offset + off < s->cur_frame_end[i])
break;
}
}
}

快进时的parser处理

由于快进后parser中的内容已经没有意义，所以需要将parser中的数据丢掉，ffmpeg是这样做的：
avformat_seek_file->av_seek_frame->seek_frame_internal->ff_read_frame_flush

/*
* Drop all read-ahead state after a seek: the queued packets and every
* stream's parser.
* Excerpt: code omitted by the article is marked with "……".
*/
void ff_read_frame_flush(AVFormatContext *s)
{
……
/* Discard the queued packets */
flush_packet_queue(s);

/* Reset read state for each stream. */
for (i = 0; i < s->nb_streams; i++) {
st = s->streams[i];
/* Close the parser and set it to NULL; read_frame_internal checks whether
* the stream needs a parser and sets one up again */
if (st->parser) {
av_parser_close(st->parser);
st->parser = NULL;
}
……
}
}