AOF持久化记录所有服务器对数据库的写操作,并在服务器启动时,用这些命令来还原数据集。AOF文件中的命令都以Redis协议的格式来记录,新命令会被追加到文件的末尾。Redis还支持后台对AOF文件进行重写,防止其超过AOF文件限制的大小
AOF持久化的优缺点
优点
·使用AOF持久化使Redis变得更耐久你可以设置不同的 fsync 策略,比如无 fsync ,每秒钟一次 fsync ,或者每次执行写入命令时 fsync 。AOF 的默认策略为每秒钟
fsync 一次,在这种配置下,Redis 仍然可以保持良好的性能,并且就算发生故障停机,也最多只会丢失2秒钟的数据(fsync 会在后台线程执行,所以主线程可以继续努力地处理命令请求)
·Redis 可以在 AOF 文件体积变得过大时,自动地在后台对 AOF 进行重写:重写后的新 AOF 文件包含了恢复当前数据集所需的最小命令集合。整个重写操作是绝对安全的,因为 Redis 在创建新 AOF文件的过程中,会继续将命令追加到现有的 AOF 文件里面,即使重写过程中发生停机,现有的 AOF文件也不会丢失。而一旦新 AOF 文件创建完毕,Redis 就会从旧 AOF 文件切换到新 AOF 文件,并开始对新 AOF 文件进行追加操作
·AOF 文件有序地保存了对数据库执行的所有写入操作,这些写入操作以 Redis 协议的格式保存,因此 AOF 文件的内容非常容易被人读懂,对文件进行分析(parse)也很轻松。导出(export)AOF 文件也非常简单:举个例子,如果你不小心执行了FLUSHALL 命令,但只要 AOF 文件未被重写,那么只要停止服务器,移除 AOF 文件末尾的FLUSHALL 命令,并重启 Redis ,就可以将数据集恢复到FLUSHALL 执行之前的状态
缺点
·对于同样的数据集来说,AOF文件的体积要大于RDB文件的体积
·根据所使用的 fsync 策略,AOF 的速度可能会慢于 RDB 。在一般情况下,每秒 fsync 的性能依然非常高,而关闭 fsync 可以让 AOF 的速度和 RDB 一样快,即使在高负荷之下也是如此。不过在处理巨大的写入载入时,RDB 可以提供更有保证的最大延迟时间
AOF持久化的实现
当在配置文件中打开了AOF持久化功能后:
appendonly yes
从现在开始,每当一个Redis命令改变数据集时,这个命令都会被追加到AOF文件末尾
相关结构体
AOF相关变量在全局server中的定义:
1: struct redisServer {
2: ……3: int aof_state; /* AOF状态 REDIS_AOF_(ON|OFF|WAIT_REWRITE) */4: int aof_fsync; /* fsync()策略 */5: char *aof_filename; /* AOF文件名 */6: int aof_no_fsync_on_rewrite; /* 在rewrite期间是否fsync */7: int aof_rewrite_perc; /* M 当AOF文件达到上次rewrite后文件大小的M倍后触发rewrite */8: off_t aof_rewrite_min_size; /* AOF文件rewrite最小大小 */
9: off_t aof_rewrite_base_size; /* 上一次rewrite后的AOF文件大小 */
10: off_t aof_current_size; /* 现在AOF文件大小 */
11: int aof_rewrite_scheduled; /* 当bgsave结束后开始rewrite */12: pid_t aof_child_pid; /* rewrite进程的pid */
13: list *aof_rewrite_buf_blocks; /* rewrite期间的AOF缓冲 */
14: sds aof_buf; /* AOF缓冲,需要在事件循环中被fsync同步到硬盘上 */
15: int aof_fd; /* 现在AOF文件的fd */16: int aof_selected_db; /* AOF文件现在指定的DB编号 */17: time_t aof_flush_postponed_start; /* 上一次推迟fsync的时间 */
18: time_t aof_last_fsync; /* 上一次fsync的时间 */
19: time_t aof_rewrite_time_last; /* 上一次rewrite时间 */
20: time_t aof_rewrite_time_start; /* 这次rewrite的开始时间 */
21: int aof_lastbgrewrite_status; /* REDIS_OK or REDIS_ERR */22: unsigned long aof_delayed_fsync; /* fsync拖延次数 */23: ……24: }
AOF过程详情
每个命令执行时都会调用call(),而在call函数中,如果该命令涉及到写操作,那么会调用propagate()来传播写操作到AOF和slaves。在propagate中,通过调用feedAppendOnlyFile()来完成对AOF的追加
1: /* Propagate the command into the AOF and replication link */
2: // 传播命令到 AOF 和附属节点
3: if (flags & REDIS_CALL_PROPAGATE) {
4: int flags = REDIS_PROPAGATE_NONE;
5:6: if (c->cmd->flags & REDIS_CMD_FORCE_REPLICATION)
7: flags |= REDIS_PROPAGATE_REPL;8:9: if (dirty)
10: flags |= (REDIS_PROPAGATE_REPL | REDIS_PROPAGATE_AOF);11:12: if (flags != REDIS_PROPAGATE_NONE)
13: propagate(c->cmd,c->db->id,c->argv,c->argc,flags);14: }
1: /* Propagate the specified command (in the context of the specified database id)
2: * to AOF and Slaves.3: *4: * 传播给定命令到 AOF 或附属节点5: *6: * flags are an xor between:7: * + REDIS_PROPAGATE_NONE (no propagation of command at all)8: * + REDIS_PROPAGATE_AOF (propagate into the AOF file if is enabled)9: * + REDIS_PROPAGATE_REPL (propagate into the replication link)10: */11: void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,12: int flags)
13: {14: if (server.aof_state != REDIS_AOF_OFF && flags & REDIS_PROPAGATE_AOF)
15: feedAppendOnlyFile(cmd,dbid,argv,argc);16: if (flags & REDIS_PROPAGATE_REPL && listLength(server.slaves))
17: replicationFeedSlaves(server.slaves,dbid,argv,argc);18: }
1:2: /*
3: * 将给定命令追加到 AOF 文件/缓存中4: */5: void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {6: sds buf = sdsempty();7: robj *tmpargv[3];8:9: /* The DB this command was targetting is not the same as the last command
10: * we appendend. To issue a SELECT command is needed. */11: // 当前 db 不是指定的 aof db,
12: // 通过创建 SELECT 命令来切换数据库
13: if (dictid != server.aof_selected_db) {
14: char seldb[64];
15:16: // 让 AOF 文件切换 DB
17: snprintf(seldb,sizeof(seldb),"%d",dictid);18: buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
19: (unsigned long)strlen(seldb),seldb);20:21: // 程序切换 DB
22: server.aof_selected_db = dictid;23: }24:25: // 将 EXPIRE / PEXPIRE / EXPIREAT 命令翻译为 PEXPIREAT 命令
26: if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
27: cmd->proc == expireatCommand) {28: /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
29: buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);30:31: // 将 SETEX / PSETEX 命令翻译为 SET 和 PEXPIREAT 命令
32: } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {33: /* Translate SETEX/PSETEX to SET and PEXPIREAT */
34: tmpargv[0] = createStringObject("SET",3);
35: tmpargv[1] = argv[1];36: tmpargv[2] = argv[3];37: buf = catAppendOnlyGenericCommand(buf,3,tmpargv);38: decrRefCount(tmpargv[0]);39: buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);40:41: // 其他命令直接追加到 buf 末尾
42: } else {
43: /* All the other commands don't need translation or need the
44: * same translation already operated in the command vector45: * for the replication itself. */46: buf = catAppendOnlyGenericCommand(buf,argc,argv);47: }48:49: /* Append to the AOF buffer. This will be flushed on disk just before
50: * of re-entering the event loop, so before the client will get a51: * positive reply about the operation performed. */52: // 将 buf 追加到服务器的 aof_buf 末尾
53: // 下次 AOF 写入执行时,这些数据就会被写入
54: if (server.aof_state == REDIS_AOF_ON)
55: server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));56:57: /* If a background append only file rewriting is in progress we want to
58: * accumulate the differences between the child DB and the current one59: * in a buffer, so that when the child process will do its work we60: * can append the differences to the new append only file. */61: // 如果 AOF 重写正在执行,那么也将新 buf 追加到 AOF 重写缓存中
62: // 等 AOF 重写完之前的数据之后,新输入的命令也会追加到新 AOF 文件中。
63: if (server.aof_child_pid != -1)
64: aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));65:66: sdsfree(buf);67: }
在这一步,命令会先添加到aof_buf这个缓冲中,同步到硬盘需要另外一个过程进行。首先,我们需要确定目前AOF文件中最后的命令是不是与该命令的DB编号一致,如果不一致,我们还需要将select命令写入AOF文件。然后格式化该命令,当AOF机制打开时,将命令写入aof_buf。如果此时有rewrite进程在运行,那么还需要将命令写入server.aof_rewrite_buf_blocks(此结构在下面的AOF重写中介绍)
1: /* Append data to the AOF rewrite buffer, allocating new blocks if needed.
2: *3: * 将数组 s 追加到 AOF 缓存的末尾。4: * 如果有需要的话,分配一个新的缓存块。5: */6: void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {7: listNode *ln = listLast(server.aof_rewrite_buf_blocks);8: aofrwblock *block = ln ? ln->value : NULL;9:10: while(len) {
11: /* If we already got at least an allocated block, try appending
12: * at least some piece into it. */13: // 将数据保存到最后一个块里
14: // 数据保存的数量是不定的,可能需要创建一个新块来保存数据
15: if (block) {
16: unsigned long thislen = (block->free < len) ? block->free : len;17: if (thislen) { /* The current block is not already full. */18: memcpy(block->buf+block->used, s, thislen);19: block->used += thislen;20: block->free -= thislen;
21: s += thislen;22: len -= thislen;23: }24: }25:26: // 分配第一个块,
27: // 或者创建另一个块来保存数据
28: if (len) { /* First block to allocate, or need another block. */29: int numblocks;
30:31: block = zmalloc(sizeof(*block));
32: block->free = AOF_RW_BUF_BLOCK_SIZE;
33: block->used = 0;34: listAddNodeTail(server.aof_rewrite_buf_blocks,block);35:36: /* Log every time we cross more 10 or 100 blocks, respectively
37: * as a notice or warning. */38: numblocks = listLength(server.aof_rewrite_buf_blocks);39: if (((numblocks+1) % 10) == 0) {
40: int level = ((numblocks+1) % 100) == 0 ? REDIS_WARNING :
41: REDIS_NOTICE;42: redisLog(level,"Background AOF buffer size: %lu MB",
43: aofRewriteBufferSize()/(1024*1024));44: }45: }46: }47: }
每当服务器常规任务函数被执行、或者事件处理器被执行时,aof.c/flushAppendOnlyFile 函数都会被调用,这个函数执行以下两个工作:
WRITE:根据条件,将 aof_buf 中的缓存写入到 AOF 文件。
SAVE:根据条件,调用 fsync 或 fdatasync 函数,将 AOF 文件保存到磁盘中
在命令添加到aof_buf内后,每次事件循环开始,会调用flushAppendOnlyFile(int force)来将aof_buf写到硬盘上。force参数主要是用于当异步IO线程在进行fsync()并且fsync策略是每秒同步时,是否继续写入或者拖延写入。如果在serverCron发现上次flush操作是拖延的,那么继续尝试flush到硬盘
1: void beforeSleep(struct aeEventLoop *eventLoop) {2: ……3: /* Write the AOF buffer on disk */
4: flushAppendOnlyFile(0);5: ……6: }7:8: int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {9: ……10: if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
11: ……12: }
然后来看flushAppendOnlyFile(),首先,bioPendingJobsOfType()调用会检测异步IO线程是否在进行同步到硬盘操作。如果fsync策略是AOF_FSYNC_EVERYSEC,那么如果有异步IO线程在进行,那么会判断上次尝试flush是否拖延,如果上次尝试不拖延或者拖延时间没超过两秒,那么继续拖延。否则继续执行函数
1:2: /* Write the append only file buffer on disk.
3: *4: * 将 AOF 缓存写到文件中5: *6: * Since we are required to write the AOF before replying to the client,7: * and the only way the client socket can get a write is entering when the8: * the event loop, we accumulate all the AOF writes in a memory9: * buffer and write it on disk using this function just before entering10: * the event loop again.11: *12: * 因为程序需要在会回复客户端之前对 AOF 执行写操作。13: * 而客户端能执行写操作的唯一机会就是在事件 loop 中,14: * 因为程序将所有 AOF 写保存到缓存中,15: * 并在进入事件 loop 之前,将缓存写入到文件中。16: *17: * About the 'force' argument:18: *19: * 关于 force 参数:20: *21: * When the fsync policy is set to 'everysec' we may delay the flush if there22: * is still an fsync() going on in the background thread, since for instance23: * on Linux write(2) will be blocked by the background fsync anyway.24: * When this happens we remember that there is some aof buffer to be25: * flushed ASAP, and will try to do that in the serverCron() function.26: *27: * 当 fsync 策略是“每秒进行一次 fsync”时,28: * 后台队列里可能会有 fsync 等待执行并阻塞,29: * 这些 fsync 会在 serverCron() 中执行。30: *31: * However if force is set to 1 we'll write regardless of the background32: * fsync.33: *34: * 但是,如果 force 为 1 ,那么不管后台任务是否在 fsync ,35: * 程序都直接执行 fsync 。36: */37: void flushAppendOnlyFile(int force) {38: ssize_t nwritten;39: int sync_in_progress = 0;
40:41: // 没有缓存等待写入,直接返回
42: if (sdslen(server.aof_buf) == 0) return;43:44: // 返回后台正在等待执行的 fsync 数量
45: if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
46: sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;47:48: // AOF 模式为每秒 fsync ,并且 force 不为 1
49: // 如果可以的话,推延冲洗
50: if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
51: /* With this append fsync policy we do background fsyncing.
52: * If the fsync is still in progress we can try to delay53: * the write for a couple of seconds. */54: // 如果 aof_fsync 队列里已经有正在等待的任务
55: if (sync_in_progress) {
56:57: // 推迟 aof 重写 ...
58:59: if (server.aof_flush_postponed_start == 0) {
60: // 上一次没有推迟冲洗过,记录推延的当前时间,然后返回
61: /* No previous write postponinig, remember that we are
62: * postponing the flush and return. */63: server.aof_flush_postponed_start = server.unixtime;64: return;
65:66: } else if (server.unixtime - server.aof_flush_postponed_start < 2) {67: // 允许在两秒之内的推延冲洗
68: /* We were already waiting for fsync to finish, but for less
69: * than two seconds this is still ok. Postpone again. */70: return;
71: }72: /* Otherwise fall trough, and go write since we can't wait
73: * over two seconds. */74: // 记录冲洗推延次数
75: server.aof_delayed_fsync++;76: redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
77: }78: }79:80: // 到达这一步,冲洗已经不能推迟了,将属性设为 0
81: /* If you are following this code path, then we are going to write so
82: * set reset the postponed flush sentinel to zero. */83: server.aof_flush_postponed_start = 0;84:85: /* We want to perform a single write. This should be guaranteed atomic
86: * at least if the filesystem we are writing is a real physical one.87: * While this will save us against the server being killed I don't think88: * there is much to do about the whole server stopping for power problems89: * or alike */90: // 将 AOF 缓存写入到文件,如果一切幸运的话,写入会原子性地完成
91: nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
92: // 写入出错,停止 Redis 并报告错误
93: if (nwritten != (signed)sdslen(server.aof_buf)) {94: /* Ooops, we are in troubles. The best thing to do for now is
95: * aborting instead of giving the illusion that everything is96: * working as expected. */97: if (nwritten == -1) {
98: redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
99: } else {
100: redisLog(REDIS_WARNING,"Exiting on short write while writing to "
101: "the append-only file: %s (nwritten=%ld, "
102: "expected=%ld)",
103: strerror(errno),104: (long)nwritten,
105: (long)sdslen(server.aof_buf));
106:107: if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
108: redisLog(REDIS_WARNING, "Could not remove short write "
109: "from the append-only file. Redis may refuse "
110: "to load the AOF the next time it starts. "
111: "ftruncate: %s", strerror(errno));
112: }113: }114: exit(1);
115: }116: // 更新 AOF 文件的当前大小
117: server.aof_current_size += nwritten;118:119: /* Re-use AOF buffer when it is small enough. The maximum comes from the
120: * arena size of 4k minus some overhead (but is otherwise arbitrary). */121: // 如果 aof 缓存不是太大,那么重用它,否则,清空 aof 缓存
122: if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
123: sdsclear(server.aof_buf);124: } else {
125: sdsfree(server.aof_buf);126: server.aof_buf = sdsempty();127: }128:129: /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
130: * children doing I/O in the background. */131: // 以下条件发生时,直接返回,不执行后面的 fsnyc :
132: // 不允许在 AOF 重写时写入 AOF 文件 并且
133: // REWRITEAOF 正在执行 或者 BGSAVE 正在进行
134: if (server.aof_no_fsync_on_rewrite &&
135: (server.aof_child_pid != -1 || server.rdb_child_pid != -1))136: return;
137:138: /* Perform the fsync if needed. */
139: // 如果有需要,执行 fsync
140:141: // AOF 模式为总是 fsync ,那么执行 fsync
142: if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
143: /* aof_fsync is defined as fdatasync() for Linux in order to avoid
144: * flushing metadata. */145: aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
146: // 更新对 AOF 文件最后一次进行 fsync 的时间
147: server.aof_last_fsync = server.unixtime;148:149: // AOF 模式为每秒一次,并且距离上次写 AOF 文件已经超过 1 秒
150: } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&151: server.unixtime > server.aof_last_fsync)) {152: // 仅在没有 fsync 在后台进行时,才将新的 fsync 任务放到后台执行
153: if (!sync_in_progress) aof_background_fsync(server.aof_fd);
154: // 更新对 AOF 文件最后一次进行 fsync 时间
155: server.aof_last_fsync = server.unixtime;156: }157: }
AOF 保存模式
Redis 目前支持三种 AOF 保存模式,它们分别是:
1. AOF_FSYNC_NO :不保存。
2. AOF_FSYNC_EVERYSEC :每一秒钟保存一次。
3. AOF_FSYNC_ALWAYS :每执行一个命令保存一次
不保存
在这种模式下,每次调用 flushAppendOnlyFile 函数,WRITE 都会被执行,但 SAVE 会被
略过。在这种模式下,SAVE 只会在以下任意一种情况中被执行:
• Redis 被关闭
• AOF 功能被关闭
• 系统的写缓存被刷新(可能是缓存已经被写满,或者定期保存操作被执行)
这三种情况下的 SAVE 操作都会引起 Redis 主进程阻塞
每一秒钟保存一次
在这种模式中,SAVE 原则上每隔一秒钟就会执行一次,因为 SAVE 操作是由后台子线程调用的,所以它不会引起服务器主进程阻塞
注意,在上一句的说明里面使用了词语“原则上” ,在实际运行中,程序在这种模式下对 fsync或 fdatasync 的调用并不是每秒一次,它和调用 flushAppendOnlyFile 函数时 Redis 所处的状态有关
每当 flushAppendOnlyFile 函数被调用时,可能会出现以下四种情况:
• 子线程正在执行 SAVE ,并且:
1. 这个 SAVE 的执行时间未超过 2 秒,那么程序直接返回,并不执行 WRITE 或新的
SAVE 。
2. 这个 SAVE 已经执行超过 2 秒,那么程序执行 WRITE ,但不执行新的 SAVE
注意,因为这时 WRITE 的写入必须等待子线程先完成(旧的)SAVE ,因此这里
WRITE 会比平时阻塞更长时间
• 子线程没有在执行 SAVE ,并且:
3. 上次成功执行 SAVE 距今不超过 1 秒,那么程序执行 WRITE ,但不执行 SAVE
4. 上次成功执行 SAVE 距今已经超过 1 秒,那么程序执行 WRITE 和 SAVE
根据以上说明可以知道,在“每一秒钟保存一次”模式下,如果在情况 1 中发生故障停机,那么用户最多损失小于 2 秒内所产生的所有数据。
如果在情况 2 中发生故障停机,那么用户损失的数据是可以超过 2 秒的
Redis 官网上所说的,AOF 在“每一秒钟保存一次”时发生故障,只丢失 1 秒钟数据的说法,实际上并不准确
每执行一个命令保存一次
在这种模式下,每次执行完一个命令之后,WRITE 和 SAVE 都会被执行
另外,因为 SAVE 是由 Redis 主进程执行的,所以在 SAVE 执行期间,主进程会被阻塞,不能接受命令请求
AOF重写
因为AOF的运作方式是不断的向AOF文件中追加命令,所以随着写入命令的不断增加,AOF文件的体积会变得越来越大
举个例子,如果你对一个计数器调用了 100 次INCR ,那么仅仅是为了保存这个计数器的当前值,AOF 文件就需要使用 100 条记录(entry)
然而在实际上,只使用一条SET 命令已经足以保存计数器的当前值了,其余 99 条记录实际上都是多余的。
为了处理这种情况,Redis 支持一种有趣的特性:可以在不打断服务客户端的情况下,对 AOF 文件进行重写(rewrite)
执行BGREWRITEAOF 命令,Redis 将生成一个新的 AOF 文件,这个文件包含重建当前数据集所需的最少命令
后台重写
AOF 重写程序可以很好地完成创建一个新 AOF 文件的任务,但是,在执行这个程序的时候,调用者线程会被阻塞
很明显,作为一种辅佐性的维护手段,Redis 不希望 AOF 重写造成服务器无法处理请求,所以Redis 决定将 AOF 重写程序放到(后台)子进程里执行,这样处理的最大好处是:
1. 子进程进行 AOF 重写期间,主进程可以继续处理命令请求
2. 子进程带有主进程的数据副本,使用子进程而不是线程,可以在避免锁的情况下,保证数据的安全性不过,使用子进程也有一个问题需要解决:因为子进程在进行 AOF 重写期间,主进程还需要继续处理命令,而新的命令可能对现有的数据进行修改,这会让当前数据库的数据和重写后的AOF 文件中的数据不一致
为了解决这个问题,Redis 增加了一个 AOF 重写缓存,这个缓存在 fork 出子进程之后开始启用,Redis 主进程在接到新的写命令之后,除了会将这个写命令的协议内容追加到现有的 AOF文件之外,还会追加到这个缓存中:
这个缓存就是上文提到的server.aof_rewrite_buf_blocks
换言之,当子进程在执行 AOF 重写时,主进程需要执行以下三个工作:
1. 处理命令请求
2. 将写命令追加到现有的 AOF 文件中
3. 将写命令追加到 AOF 重写缓存中
这样一来可以保证:
1. 现有的 AOF 功能会继续执行,即使在 AOF 重写期间发生停机,也不会有任何数据丢失
2. 所有对数据库进行修改的命令都会被记录到 AOF 重写缓存中
当子进程完成 AOF 重写之后,它会向父进程发送一个完成信号,父进程在接到完成信号之后,会调用一个信号处理函数,并完成以下工作:
1. 将 AOF 重写缓存中的内容全部写入到新 AOF 文件中
2. 对新的 AOF 文件进行改名,覆盖原有的 AOF 文件
当步骤 1 执行完毕之后,现有 AOF 文件、新 AOF 文件和数据库三者的状态就完全一致
当步骤 2 执行完毕之后,程序就完成了新旧两个 AOF 文件的交替
这个信号处理函数执行完毕之后,主进程就可以继续像往常一样接受命令请求了。在整个 AOF后台重写过程中,只有最后的写入缓存和改名操作会造成主进程阻塞,在其他时候,AOF 后台重写都不会对主进程造成阻塞,这将 AOF 重写对性能造成的影响降到了最低
AOF 后台重写的触发条件
服务器在 AOF 功能开启的情况下,会维持以下三个变量:
• 记录当前 AOF 文件大小的变量 aof_current_size
• 记录最后一次 AOF 重写之后,AOF 文件大小的变量 aof_rewrite_base_size
• 增长百分比变量 aof_rewrite_perc
每次当 serverCron 函数执行时,它都会检查以下条件是否全部满足,如果是的话,就会触发自动的 AOF 重写:
1. 没有 BGSAVE 命令在进行
2. 没有 BGREWRITEAOF 在进行
3. 当前 AOF 文件大小大于 server.aof_rewrite_min_size (默认值为 1 MB)
4. 当前 AOF 文件大小和最后一次 AOF 重写后的大小之间的比率大于等于指定的增长百分比
默认情况下,增长百分比为 100% ,也即是说,如果前面三个条件都已经满足,并且当前 AOF文件大小比最后一次 AOF 重写时的大小要大一倍的话,那么触发自动 AOF 重写
AOF重写实现
rewrite过程主要是调用函数rewriteAppendOnlyFileBackground()来实现的,下面具体分析此函数具体做了什么工作
1:2: /* This is how rewriting of the append only file in background works:
3: *4: * 以下是后台重写 AOF 文件的工作步骤:5: *6: * 1) The user calls BGREWRITEAOF7: * 用户调用 BGREWRITEAOF8: *9: * 2) Redis calls this function, that forks():10: * Redis 调用这个函数,它执行 fork() :11: *12: * 2a) the child rewrite the append only file in a temp file.13: * 子进程在临时文件中对 AOF 文件进行重写14: *15: * 2b) the parent accumulates differences in server.aof_rewrite_buf.16: * 父进程将新输入的命令追加到 server.aof_rewrite_buf 中17: *18: * 3) When the child finished '2a' exists.19: * 当步骤 2a 执行完之后,子进程结束20: *21: * 4) The parent will trap the exit code, if it's OK, will append the22: * data accumulated into server.aof_rewrite_buf into the temp file, and23: * finally will rename(2) the temp file in the actual file name.24: * The the new file is reopened as the new append only file. Profit!25: *26: * 如果子进程的退出状态是 OK 的话,那么父进程将新输入命令写入到临时文件,27: * 然后对临时文件改名,用它代替旧的 AOF 文件,至此,后台 AOF 重写完成。28: */29: int rewriteAppendOnlyFileBackground(void) {30: pid_t childpid;31: long long start;32:33: // 后台重写正在执行
34: if (server.aof_child_pid != -1) return REDIS_ERR;35:36: // 开始时间
37: start = ustime();38: if ((childpid = fork()) == 0) {
39: char tmpfile[256];
40:41: /* Child */
42: // 关闭网络连接
43: if (server.ipfd > 0) close(server.ipfd);44: if (server.sofd > 0) close(server.sofd);45:46: // 创建临时文件
47: snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());48: // 重写
49: if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
50: size_t private_dirty = zmalloc_get_private_dirty();51:52: if (private_dirty) {
53: redisLog(REDIS_NOTICE,54: "AOF rewrite: %lu MB of memory used by copy-on-write",
55: private_dirty/(1024*1024));56: }57: // 向父进程发送信号, exitFromChild 定义于 redis.c
58: exitFromChild(0);59: } else {
60: exitFromChild(1);61: }62: } else {
63: /* Parent */
64: server.stat_fork_time = ustime()-start;65:66: // 如果创建子进程失败,直接返回
67: if (childpid == -1) {
68: redisLog(REDIS_WARNING,69: "Can't rewrite append only file in background: fork: %s",
70: strerror(errno));71: return REDIS_ERR;
72: }73:74: // 报告客户端,后台重写正在进行
75: redisLog(REDIS_NOTICE,76: "Background append only file rewriting started by pid %d",childpid);
77:78: // 更新服务器状态
79: server.aof_rewrite_scheduled = 0;80: server.aof_rewrite_time_start = time(NULL);
81: server.aof_child_pid = childpid;82: // 关闭 key space 的 rehash ,避免写时复制
83: updateDictResizePolicy();84: /* We set appendseldb to -1 in order to force the next call to the
85: * feedAppendOnlyFile() to issue a SELECT command, so the differences86: * accumulated by the parent into server.aof_rewrite_buf will start87: * with a SELECT statement and it will be safe to merge. */88: server.aof_selected_db = -1;89: return REDIS_OK;
90: }91: return REDIS_OK; /* unreached */92: }
rewriteAppendOnlyFileBackground()的工作主要是fork出一个子进程,然后对父进程进行AOF状态的更新。rewrite任务就交给子进程运行rewriteAppendOnlyFile()解决
1:2: /* Write a sequence of commands able to fully rebuild the dataset into
3: * "filename". Used both by REWRITEAOF and BGREWRITEAOF.4: *5: * 写一串足以还原数据集的命令到给定文件里。6: * 被 REWRITEAOF 和 BGREWRITEAOF 所使用。7: *8: * In order to minimize the number of commands needed in the rewritten9: * log Redis uses variadic commands when possible, such as RPUSH, SADD10: * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time11: * are inserted using a single command.12: *13: * 为了减少重建数据集所需命令的数量,14: * 在可能时,Redis 会使用可变参数命令,比如 RPUSH 、 SADD 和 ZADD 。15: * 不过这些命令每次最多添加的元素不会超过 REDIS_AOF_REWRITE_ITEMS_PER_CMD 。16: *17: * 重写失败返回 REDIS_ERR ,成功返回 REDIS_OK 。18: */19: int rewriteAppendOnlyFile(char *filename) {20: dictIterator *di = NULL;21: dictEntry *de;22: rio aof;23: FILE *fp;24: char tmpfile[256];
25: int j;
26: long long now = mstime();27:28: /* Note that we have to use a different temp name here compared to the
29: * one used by rewriteAppendOnlyFileBackground() function. */30: snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());31: fp = fopen(tmpfile,"w");
32: if (!fp) {
33: redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
34: return REDIS_ERR;
35: }36:37: // 初始化文件流
38: rioInitWithFile(&aof,fp);39: // 遍历所有数据库
40: for (j = 0; j < server.dbnum; j++) {
41: char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";42: redisDb *db = server.db+j;43: dict *d = db->dict;44: if (dictSize(d) == 0) continue;45: di = dictGetSafeIterator(d);46: if (!di) {
47: fclose(fp);48: return REDIS_ERR;
49: }50:51: /* SELECT the new DB */
52: // 切换到合适的数据库上
53: if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;54: if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;55:56: /* Iterate this DB writing every entry */
57: // 遍历数据库的所有 key-value 对
58: while((de = dictNext(di)) != NULL) {
59: sds keystr;60: robj key, *o;61: long long expiretime;62:63: keystr = dictGetKey(de);64: o = dictGetVal(de);65: initStaticStringObject(key,keystr);66:67: expiretime = getExpire(db,&key);68:69: /* Save the key and associated value */
70: // 保存 key 和 value
71: if (o->type == REDIS_STRING) {
72: /* Emit a SET command */
73: char cmd[]="*3\r\n$3\r\nSET\r\n";74: if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;75: /* Key and value */
76: if (rioWriteBulkObject(&aof,&key) == 0) goto werr;77: if (rioWriteBulkObject(&aof,o) == 0) goto werr;78: } else if (o->type == REDIS_LIST) {79: if (rewriteListObject(&aof,&key,o) == 0) goto werr;80: } else if (o->type == REDIS_SET) {81: if (rewriteSetObject(&aof,&key,o) == 0) goto werr;82: } else if (o->type == REDIS_ZSET) {83: if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;84: } else if (o->type == REDIS_HASH) {85: if (rewriteHashObject(&aof,&key,o) == 0) goto werr;86: } else {
87: redisPanic("Unknown object type");
88: }89: /* Save the expire time */
90: // 保存可能有的过期时间
91: if (expiretime != -1) {
92: char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";93:94: /* If this key is already expired skip it
95: *96: * 如果键已经过期,那么不写入它的过期时间97: */98: if (expiretime < now) continue;99:100: if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;101: if (rioWriteBulkObject(&aof,&key) == 0) goto werr;102: if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;103: }104: }105: dictReleaseIterator(di);106: }107:108: /* Make sure data will not remain on the OS's output buffers */
109: // 重新文件流
110: fflush(fp);111: // sync
112: aof_fsync(fileno(fp));113: // 关闭
114: fclose(fp);115:116: /* Use RENAME to make sure the DB file is changed atomically only
117: * if the generate DB file is ok. */118: // 通过更名,用重写后的新 AOF 文件代替旧的 AOF 文件
119: if (rename(tmpfile,filename) == -1) {120: redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
121: unlink(tmpfile);122: return REDIS_ERR;
123: }124: redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
125: return REDIS_OK;
126:127: werr:128: fclose(fp);129: unlink(tmpfile);130: redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
131: if (di) dictReleaseIterator(di);
132: return REDIS_ERR;
133: }
rewriteAppendOnlyFile()的工作就是遍历数据库,将kv键值对格式化为标准的Redis命令写入临时文件。然后强制刷到硬盘上,重命名临时文件为参数规定的名字,最后子进程退出。这里的关注点在于kv键值格式化的结果。rioWriteBulkObject,rewriteListObject,rewriteSetObject,rewriteSortedSetObject,rewriteHashObject的工作就是将5种类型对象分别解析,然后写入到硬盘
在子进程完成rewrite过程后,主进程会在serverCron中收到信号,然后调用backgroundRewriteDoneHandler()处理
backgroundRewriteDoneHandler()首先判断子进程退出是否正常或者是被信号打断,然后打开刚刚rewrite的文件,将aof_rewrite_buf_blocks中的缓冲添加到文件里
1: void backgroundRewriteDoneHandler(int exitcode, int bysignal) {2: if (!bysignal && exitcode == 0) {
3: int newfd, oldfd;
4: char tmpfile[256];
5: long long now = ustime();6:7: redisLog(REDIS_NOTICE,8: "Background AOF rewrite terminated with success");
9:10: /* Flush the differences accumulated by the parent to the
11: * rewritten AOF. */12: snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
13: (int)server.aof_child_pid);
14: newfd = open(tmpfile,O_WRONLY|O_APPEND);
15: if (newfd == -1) {
16: redisLog(REDIS_WARNING,17: "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
18: goto cleanup;
19: }20:21: if (aofRewriteBufferWrite(newfd) == -1) {
22: redisLog(REDIS_WARNING,23: "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
24: close(newfd);
25: goto cleanup;
26: }
接下来就分为两种情况,如果AOF是打开的,那么rewrite在rename后关闭原来的AOF文件是会阻塞的。如果AOF是关闭的,但是原来路径存在AOF文件,那么rename时unlink原来文件也会阻塞。这里Redis给出的方案是不管AOF是不是打开,如果原来的文件存在,都先打开原来文件。那么rename后,因为原来的文件是打开的,所以不会unlink。将unlink推迟到关闭原来文件的描述符时。最后,将close()操作放到异步IO线程执行
1: if (server.aof_fd == -1) {
2: /* AOF disabled */
3:4: /* Don't care if this fails: oldfd will be -1 and we handle that.
5: * One notable case of -1 return is if the old file does6: * not exist. */7: oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
8: } else {
9: /* AOF enabled */
10: oldfd = -1; /* We'll set this to the current AOF filedes later. */
11: }12:13: /* Rename the temporary file. This will not unlink the target file if
14: * it exists, because we reference it with "oldfd". */15: if (rename(tmpfile,server.aof_filename) == -1) {16: redisLog(REDIS_WARNING,17: "Error trying to rename the temporary AOF file: %s", strerror(errno));
18: close(newfd);
19: if (oldfd != -1) close(oldfd);20: goto cleanup;
21: }22:23: if (server.aof_fd == -1) {
24: /* AOF disabled, we don't need to set the AOF file descriptor
25: * to this new file, so we can close it. */26: close(newfd);
27: } else {
28: /* AOF enabled, replace the old fd with the new one. */
29: oldfd = server.aof_fd;30: server.aof_fd = newfd;31: if (server.aof_fsync == AOF_FSYNC_ALWAYS)
32: aof_fsync(newfd);33: else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)34: aof_background_fsync(newfd);35: server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
36: aofUpdateCurrentSize();37: server.aof_rewrite_base_size = server.aof_current_size;38:39: /* Clear regular AOF buffer since its contents was just written to
40: * the new AOF from the background rewrite buffer. */41: sdsfree(server.aof_buf);42: server.aof_buf = sdsempty();43: }44:45: server.aof_lastbgrewrite_status = REDIS_OK;46:47: redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
48: /* Change state from WAIT_REWRITE to ON if needed */
49: if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
50: server.aof_state = REDIS_AOF_ON;51:52: /* Asynchronously close the overwritten AOF. */
53: if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);54:55: redisLog(REDIS_VERBOSE,56: "Background AOF rewrite signal handler took %lldus", ustime()-now);
57: } else if (!bysignal && exitcode != 0) {58: server.aof_lastbgrewrite_status = REDIS_ERR;59:60: redisLog(REDIS_WARNING,61: "Background AOF rewrite terminated with error");
62: } else {
63: server.aof_lastbgrewrite_status = REDIS_ERR;64:65: redisLog(REDIS_WARNING,66: "Background AOF rewrite terminated by signal %d", bysignal);
67: }68: }