Redis AOF Persistence

This article takes a detailed look at Redis's AOF persistence mechanism: how the AOF file records every write operation the server applies to the database, and how those operations are replayed at startup to rebuild the dataset. It also covers the strengths and limitations of AOF, such as data durability, the AOF rewrite mechanism, and its impact on performance.


AOF persistence records every write operation the server applies to the database, and replays those commands at server startup to rebuild the dataset. Commands in the AOF file are stored in the Redis protocol format, and new commands are appended to the end of the file. Redis can also rewrite the AOF file in the background so that it does not grow beyond a configured size limit.
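
For example, a simple write such as SET msg hello is appended to the AOF in exactly the protocol form the client sent it (a minimal sketch; the key and value are made up, and every line in the actual file is terminated by \r\n):

*3
$3
SET
$3
msg
$5
hello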


Advantages and Disadvantages of AOF Persistence

Advantages

·AOF persistence makes Redis much more durable. You can choose among different fsync policies: no fsync at all, fsync once per second, or fsync on every write command. With the default policy of fsync once per second, Redis still performs well, and even if the server crashes you lose at most about two seconds of data (the fsync runs on a background thread, so the main thread can keep processing command requests).

·Redis can automatically rewrite the AOF in the background once its size grows too large: the rewritten AOF contains the minimal set of commands needed to rebuild the current dataset. The whole rewrite is safe, because while Redis is creating the new AOF file it keeps appending commands to the existing one, so even if the server stops during the rewrite the existing AOF file is not lost. Once the new AOF file is ready, Redis switches from the old file to the new one and starts appending to it.

·The AOF file is an ordered log of every write operation performed on the database, stored in the Redis protocol format, so its contents are easy for humans to read and easy to parse. Exporting data from an AOF file is straightforward too: for example, if you accidentally run FLUSHALL, then as long as the AOF has not been rewritten you can stop the server, remove the trailing FLUSHALL command from the AOF file, and restart Redis to restore the dataset to the state it was in just before the FLUSHALL, as sketched below.
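
A sketch of that recovery procedure (hypothetical paths; it assumes the AOF has not been rewritten since the mistake):

redis-cli SHUTDOWN NOSAVE          # stop the server before anything rewrites the AOF
# edit appendonly.aof and delete the trailing FLUSHALL entry, which looks like:
#   *1
#   $8
#   FLUSHALL
redis-server /path/to/redis.conf   # restart; the dataset is back to its pre-FLUSHALL state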

Disadvantages

·For the same dataset, the AOF file is larger than the corresponding RDB file.

·Depending on the fsync policy in use, AOF can be slower than RDB. In general, fsync once per second still gives very good performance, and with fsync disabled AOF can be as fast as RDB even under heavy load. Under a huge write load, however, RDB can give firmer guarantees about the maximum latency.

How AOF Persistence Is Implemented

Once AOF persistence has been enabled in the configuration file:
appendonly yes

then from that point on, every Redis command that changes the dataset is appended to the end of the AOF file.
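
The other AOF-related directives live in the same configuration file; a typical set of values looks like this (shown for orientation, tune them for your own deployment):

appendonly yes                    # enable AOF persistence
appendfsync everysec              # fsync policy: always | everysec | no
no-appendfsync-on-rewrite no      # keep fsyncing even while a rewrite or BGSAVE is running
auto-aof-rewrite-percentage 100   # growth over the last rewrite size that triggers an automatic rewrite
auto-aof-rewrite-min-size 64mb    # minimum AOF size before automatic rewrites kick in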

Related structures

The AOF-related fields defined in the global server struct:

struct redisServer {
    ……
    int aof_state;                  /* AOF state: REDIS_AOF_(ON|OFF|WAIT_REWRITE) */
    int aof_fsync;                  /* fsync() policy */
    char *aof_filename;             /* Name of the AOF file */
    int aof_no_fsync_on_rewrite;    /* Don't fsync while a rewrite is in progress */
    int aof_rewrite_perc;           /* Rewrite the AOF once it has grown this percentage beyond its size after the last rewrite */
    off_t aof_rewrite_min_size;     /* Minimum AOF file size to trigger a rewrite */
    off_t aof_rewrite_base_size;    /* AOF file size after the last rewrite */
    off_t aof_current_size;         /* Current AOF file size */
    int aof_rewrite_scheduled;      /* Start a rewrite once the running BGSAVE terminates */
    pid_t aof_child_pid;            /* PID of the rewrite child process */
    list *aof_rewrite_buf_blocks;   /* AOF buffer accumulated during a rewrite */
    sds aof_buf;      /* AOF buffer, fsynced to disk from the event loop */
    int aof_fd;       /* fd of the currently open AOF file */
    int aof_selected_db; /* DB currently SELECTed in the AOF */
    time_t aof_flush_postponed_start; /* Time at which the last flush was postponed */
    time_t aof_last_fsync;            /* Time of the last fsync() */
    time_t aof_rewrite_time_last;   /* Time of the last rewrite */
    time_t aof_rewrite_time_start;  /* Start time of the current rewrite */
    int aof_lastbgrewrite_status;   /* REDIS_OK or REDIS_ERR */
    unsigned long aof_delayed_fsync;  /* Number of delayed fsyncs */
    ……
}

The AOF Process in Detail

Every command invocation goes through call(). If the command performs a write, call() invokes propagate() to propagate the write to the AOF and to the slaves; for the AOF part, propagate() calls feedAppendOnlyFile().

 /* Propagate the command into the AOF and replication link */
    if (flags & REDIS_CALL_PROPAGATE) {
        int flags = REDIS_PROPAGATE_NONE;

        if (c->cmd->flags & REDIS_CMD_FORCE_REPLICATION)
            flags |= REDIS_PROPAGATE_REPL;

        if (dirty)
            flags |= (REDIS_PROPAGATE_REPL | REDIS_PROPAGATE_AOF);

        if (flags != REDIS_PROPAGATE_NONE)
            propagate(c->cmd,c->db->id,c->argv,c->argc,flags);
    }

/* Propagate the specified command (in the context of the specified database id)
 * to AOF and Slaves.
 *
 * flags are an xor between:
 * + REDIS_PROPAGATE_NONE (no propagation of command at all)
 * + REDIS_PROPAGATE_AOF (propagate into the AOF file if is enabled)
 * + REDIS_PROPAGATE_REPL (propagate into the replication link)
 */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    if (server.aof_state != REDIS_AOF_OFF && flags & REDIS_PROPAGATE_AOF)
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    if (flags & REDIS_PROPAGATE_REPL && listLength(server.slaves))
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
}

/*
 * Append the given command to the AOF file / buffer.
 */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds buf = sdsempty();
    robj *tmpargv[3];

    /* The DB this command was targetting is not the same as the last command
     * we appendend. To issue a SELECT command is needed. */
    // The current DB is not the DB the AOF is positioned on,
    // so emit a SELECT command to switch databases
    if (dictid != server.aof_selected_db) {
        char seldb[64];

        snprintf(seldb,sizeof(seldb),"%d",dictid);
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);

        // Record the DB the AOF is now positioned on
        server.aof_selected_db = dictid;
    }

    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);

    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        tmpargv[0] = createStringObject("SET",3);
        tmpargv[1] = argv[1];
        tmpargv[2] = argv[3];
        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
        decrRefCount(tmpargv[0]);
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);

    } else {
        /* All the other commands don't need translation or need the
         * same translation already operated in the command vector
         * for the replication itself. */
        // All other commands are appended to buf as-is
        buf = catAppendOnlyGenericCommand(buf,argc,argv);
    }

    /* Append to the AOF buffer. This will be flushed on disk just before
     * of re-entering the event loop, so before the client will get a
     * positive reply about the operation performed. */
    // Append buf to the server's aof_buf;
    // it will be written out by the next AOF flush
    if (server.aof_state == REDIS_AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. */
    // If an AOF rewrite is in progress, also append buf to the rewrite buffer,
    // so the new commands end up in the new AOF file as well
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));

    sdsfree(buf);
}

At this stage the command has only been appended to the aof_buf buffer; syncing it to disk is a separate step. feedAppendOnlyFile() first checks whether the command targets the same DB as the last command written to the AOF; if not, it also writes a SELECT command into the buffer. It then formats the command and, if AOF is enabled, appends it to aof_buf. If a rewrite child is running at the same time, the command is also appended to server.aof_rewrite_buf_blocks (this structure is described below in the AOF rewrite section).
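
For instance, if server.aof_selected_db is still 0 and a client executes SET counter 10 against DB 1, feedAppendOnlyFile() appends roughly the following to aof_buf (the key and value are made up; every line ends with \r\n in the buffer):

*2
$6
SELECT
$1
1
*3
$3
SET
$7
counter
$2
10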

/* Append data to the AOF rewrite buffer, allocating new blocks if needed. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *ln = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *block = ln ? ln->value : NULL;

    while(len) {
        /* If we already got at least an allocated block, try appending
         * at least some piece into it. */
        // Copy as much data as fits into the last block;
        // anything left over goes into a newly allocated block
        if (block) {
            unsigned long thislen = (block->free < len) ? block->free : len;
            if (thislen) {  /* The current block is not already full. */
                memcpy(block->buf+block->used, s, thislen);
                block->used += thislen;
                block->free -= thislen;
                s += thislen;
                len -= thislen;
            }
        }

        // Allocate the first block, or another block for the remaining data
        if (len) { /* First block to allocate, or need another block. */
            int numblocks;

            block = zmalloc(sizeof(*block));
            block->free = AOF_RW_BUF_BLOCK_SIZE;
            block->used = 0;
            listAddNodeTail(server.aof_rewrite_buf_blocks,block);

            /* Log every time we cross more 10 or 100 blocks, respectively
             * as a notice or warning. */
            numblocks = listLength(server.aof_rewrite_buf_blocks);
            if (((numblocks+1) % 10) == 0) {
                int level = ((numblocks+1) % 100) == 0 ? REDIS_WARNING :
                                                         REDIS_NOTICE;
                redisLog(level,"Background AOF buffer size: %lu MB",
                    aofRewriteBufferSize()/(1024*1024));
            }
        }
    }
}

Whenever the server's periodic task function runs, or an event handler runs, aof.c/flushAppendOnlyFile is called. This function does two jobs:
WRITE: depending on the conditions, write the buffered data in aof_buf to the AOF file.
SAVE: depending on the conditions, call fsync or fdatasync to flush the AOF file to disk.

After commands have been added to aof_buf, flushAppendOnlyFile(int force) is called at the start of every event loop iteration to write aof_buf to disk. The force argument mainly controls whether, when the background I/O thread is still running an fsync() and the policy is fsync-every-second, the write should proceed anyway or be postponed. If serverCron later notices that the previous flush was postponed, it retries the flush.

void beforeSleep(struct aeEventLoop *eventLoop) {
    ……
    /* Write the AOF buffer on disk */
    flushAppendOnlyFile(0);
    ……
}

int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ……
    if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
    ……
}

Now look at flushAppendOnlyFile() itself. First, bioPendingJobsOfType() checks whether the background I/O thread is currently syncing to disk. If the fsync policy is AOF_FSYNC_EVERYSEC and a background fsync is in progress, the function checks whether the previous flush attempt was already postponed: if it was not, or if it has been postponed for less than two seconds, the flush is postponed again; otherwise the function carries on and writes.

/* Write the append only file buffer on disk.
 *
 * Since we are required to write the AOF before replying to the client,
 * and the only way the client socket can get a write is entering when the
 * the event loop, we accumulate all the AOF writes in a memory
 * buffer and write it on disk using this function just before entering
 * the event loop again.
 *
 * About the 'force' argument:
 *
 * When the fsync policy is set to 'everysec' we may delay the flush if there
 * is still an fsync() going on in the background thread, since for instance
 * on Linux write(2) will be blocked by the background fsync anyway.
 * When this happens we remember that there is some aof buffer to be
 * flushed ASAP, and will try to do that in the serverCron() function.
 *
 * However if force is set to 1 we'll write regardless of the background
 * fsync. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;

    // Nothing buffered, nothing to do
    if (sdslen(server.aof_buf) == 0) return;

    // Check whether an fsync is already queued on the background I/O thread
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

    // Policy is "everysec" and force is 0:
    // postpone the flush if possible
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {

            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponinig, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;

            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall trough, and go write since we can't wait
             * over two seconds. */
            // Record that an fsync was delayed
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* If you are following this code path, then we are going to write so
     * set reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    // On a write error, stop Redis and report the problem
    if (nwritten != (signed)sdslen(server.aof_buf)) {
        /* Ooops, we are in troubles. The best thing to do for now is
         * aborting instead of giving the illusion that everything is
         * working as expected. */
        if (nwritten == -1) {
            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
        } else {
            redisLog(REDIS_WARNING,"Exiting on short write while writing to "
                                   "the append-only file: %s (nwritten=%ld, "
                                   "expected=%ld)",
                                   strerror(errno),
                                   (long)nwritten,
                                   (long)sdslen(server.aof_buf));

            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                redisLog(REDIS_WARNING, "Could not remove short write "
                         "from the append-only file.  Redis may refuse "
                         "to load the AOF the next time it starts.  "
                         "ftruncate: %s", strerror(errno));
            }
        }
        exit(1);
    }
    // Update the recorded size of the AOF file
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    // Return without fsyncing when fsync-on-rewrite is disabled
    // and a BGREWRITEAOF or BGSAVE is in progress
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */

    // Policy is "always": fsync right away
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        // Remember when the AOF was last fsynced
        server.aof_last_fsync = server.unixtime;

    // Policy is "everysec" and more than one second has passed since the last fsync
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        // Only queue a new background fsync if none is already in progress
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

AOF save modes

Redis currently supports three AOF save modes:
1. AOF_FSYNC_NO: never save.
2. AOF_FSYNC_EVERYSEC: save once per second.
3. AOF_FSYNC_ALWAYS: save after every command.

Never save

In this mode, every call to flushAppendOnlyFile performs the WRITE step but skips SAVE. SAVE happens only in one of the following situations:
• Redis is shut down
• the AOF feature is turned off
• the operating system flushes its write cache (either because the cache is full or because a periodic flush kicks in)
In all three of these cases the SAVE blocks the Redis main process.

Save once per second

In this mode, SAVE is in principle performed once per second; because the SAVE is issued from a background thread, it does not block the server's main process.
Note the words "in principle": in practice, the fsync or fdatasync calls in this mode are not made exactly once per second; they depend on the state Redis is in when flushAppendOnlyFile is called.
Each time flushAppendOnlyFile is called, one of the following four situations can occur:
• A background thread is executing SAVE, and:
1. that SAVE has been running for less than 2 seconds, so the function returns immediately without performing WRITE or a new SAVE;
2. that SAVE has been running for more than 2 seconds, so the function performs WRITE but not a new SAVE.
Note that in this case the WRITE has to wait for the background thread to finish the (old) SAVE first, so the WRITE blocks for longer than usual.
• No background thread is executing SAVE, and:
3. the last successful SAVE was less than 1 second ago, so the function performs WRITE but not SAVE;
4. the last successful SAVE was more than 1 second ago, so the function performs both WRITE and SAVE.


From this it follows that, in "save once per second" mode, if a crash happens in situation 1, the user loses at most a little under 2 seconds of data.
If the crash happens in situation 2, the lost data can exceed 2 seconds.
So the claim on the Redis web site that, with AOF saving once per second, at most 1 second of data is lost on failure is not strictly accurate.

Save after every command

In this mode, both WRITE and SAVE are executed after every command.
Moreover, because the SAVE is performed by the Redis main process, the main process is blocked during the SAVE and cannot accept command requests.

AOF Rewrite

Because AOF works by continuously appending commands to the AOF file, the file grows larger and larger as write commands accumulate.

For example, if you call INCR on a counter 100 times, the AOF file needs 100 entries just to record the counter's current value.
In practice, a single SET command is enough to record the current value; the other 99 entries are redundant.
To deal with this, Redis supports an interesting feature: it can rewrite the AOF file without interrupting service to clients.
Running BGREWRITEAOF makes Redis generate a new AOF file that contains the minimal set of commands needed to rebuild the current dataset.
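
Concretely (a sketch): before the rewrite the AOF contains 100 separate INCR counter entries; after BGREWRITEAOF the new file only needs the equivalent of SET counter 100, i.e. a single entry such as:

*3
$3
SET
$7
counter
$3
100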

Background rewrite

The AOF rewrite routine does a fine job of building a new AOF file, but while it runs, the calling thread is blocked.
Clearly, since rewriting is only an auxiliary maintenance task, Redis does not want it to leave the server unable to process requests, so the rewrite is performed in a (background) child process. This has two major benefits:
1. while the child performs the AOF rewrite, the main process can keep handling command requests;
2. the child works on a copy of the main process's data, so using a process rather than a thread keeps the data safe without any locking.
Using a child process does introduce one problem, however: while the child is rewriting, the main process keeps executing commands, and new commands may modify the existing data, so the current contents of the database can diverge from the data in the rewritten AOF file.

To solve this, Redis adds an AOF rewrite buffer that is activated right after the child is forked. From then on, whenever the main process receives a write command, in addition to appending the command's protocol text to the existing AOF file, it also appends it to this buffer:


This buffer is the server.aof_rewrite_buf_blocks list mentioned earlier.
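
For reference, each node on that list holds one fixed-size chunk of buffered command data; in the Redis source of this era the block type is declared roughly as follows (a sketch from memory, so check it against your version):

/* One node of server.aof_rewrite_buf_blocks: a fixed-size chunk of
 * accumulated command data (AOF_RW_BUF_BLOCK_SIZE is 10 MB in this era). */
typedef struct aofrwblock {
    unsigned long used, free;        /* bytes used / bytes still free in buf */
    char buf[AOF_RW_BUF_BLOCK_SIZE];
} aofrwblock;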

In other words, while the child is performing the AOF rewrite, the main process has three jobs:
1. process command requests
2. append write commands to the existing AOF file
3. append write commands to the AOF rewrite buffer

This guarantees that:
1. the existing AOF keeps working, so even if the server stops during the rewrite no data is lost;
2. every command that modifies the database is recorded in the AOF rewrite buffer.
When the child finishes the AOF rewrite, it sends a completion signal to the parent. On receiving the signal, the parent calls a signal handler that:
1. writes everything in the AOF rewrite buffer into the new AOF file;
2. renames the new AOF file over the old one.
After step 1, the old AOF file, the new AOF file, and the database are all in the same state.
After step 2, the switch from the old AOF file to the new one is complete.
Once this signal handler has finished, the main process goes back to accepting command requests as usual. During the whole background rewrite, only this final buffer flush and rename block the main process; at all other times the background rewrite does not block it, which keeps the performance impact of AOF rewriting to a minimum.

When a background AOF rewrite is triggered

With AOF enabled, the server maintains three variables:
• aof_current_size, the current size of the AOF file
• aof_rewrite_base_size, the size of the AOF file right after the last rewrite
• aof_rewrite_perc, the growth percentage
Every time serverCron runs, it checks whether all of the following conditions hold; if they do, an automatic AOF rewrite is triggered:
1. no BGSAVE is in progress
2. no BGREWRITEAOF is in progress
3. the current AOF file size is greater than server.aof_rewrite_min_size (1 MB by default)
4. the ratio between the current AOF size and the size after the last rewrite is at least the configured growth percentage
By default the growth percentage is 100%, which means that if the first three conditions hold and the current AOF file is at least twice as large as it was after the last rewrite, an automatic AOF rewrite is triggered.
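
Putting those conditions together, the check in serverCron looks roughly like this (simplified from the Redis source of this era; treat it as a sketch rather than verbatim code):

/* Simplified sketch of the automatic AOF rewrite trigger in serverCron() */
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1 &&   /* no BGSAVE and no BGREWRITEAOF running */
    server.aof_rewrite_perc &&                                    /* automatic rewrite is enabled */
    server.aof_current_size > server.aof_rewrite_min_size)        /* the file is big enough to bother */
{
    long long base = server.aof_rewrite_base_size ?
                     server.aof_rewrite_base_size : 1;
    long long growth = (server.aof_current_size*100/base) - 100;  /* growth in percent since the last rewrite */
    if (growth >= server.aof_rewrite_perc) {
        redisLog(REDIS_NOTICE,
            "Starting automatic rewriting of AOF on %lld%% growth",growth);
        rewriteAppendOnlyFileBackground();
    }
}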

How the AOF rewrite is implemented

The rewrite is driven by rewriteAppendOnlyFileBackground(); let's look at what this function actually does.

/* This is how rewriting of the append only file in background works:
 *
 * 1) The user calls BGREWRITEAOF
 *
 * 2) Redis calls this function, that forks():
 *
 *    2a) the child rewrite the append only file in a temp file.
 *
 *    2b) the parent accumulates differences in server.aof_rewrite_buf.
 *
 * 3) When the child finished '2a' exists.
 *
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.aof_rewrite_buf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The the new file is reopened as the new append only file. Profit!
 */
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;
    long long start;

    // A background rewrite is already in progress
    if (server.aof_child_pid != -1) return REDIS_ERR;

    // Record the start time (used to measure the cost of fork())
    start = ustime();
    if ((childpid = fork()) == 0) {
        char tmpfile[256];

        /* Child */
        // Close the listening sockets inherited from the parent
        if (server.ipfd > 0) close(server.ipfd);
        if (server.sofd > 0) close(server.sofd);

        // Rewrite into a temporary file
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            size_t private_dirty = zmalloc_get_private_dirty();

            if (private_dirty) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %lu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            // Signal the result to the parent; exitFromChild is defined in redis.c
            exitFromChild(0);
        } else {
            exitFromChild(1);
        }
    } else {
        /* Parent */
        server.stat_fork_time = ustime()-start;

        // fork() failed, return an error right away
        if (childpid == -1) {
            redisLog(REDIS_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }

        // Log that the background rewrite has started
        redisLog(REDIS_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);

        // Update the server state
        server.aof_rewrite_scheduled = 0;
        server.aof_rewrite_time_start = time(NULL);
        server.aof_child_pid = childpid;
        // Disable key space rehashing to limit copy-on-write
        updateDictResizePolicy();
        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.aof_rewrite_buf will start
         * with a SELECT statement and it will be safe to merge. */
        server.aof_selected_db = -1;
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

rewriteAppendOnlyFileBackground() mainly forks a child process and then updates the AOF-related state in the parent. The actual rewriting is done by the child, which runs rewriteAppendOnlyFile().

/* Write a sequence of commands able to fully rebuild the dataset into
 * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
 *
 * In order to minimize the number of commands needed in the rewritten
 * log Redis uses variadic commands when possible, such as RPUSH, SADD
 * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
 * are inserted using a single command.
 *
 * Returns REDIS_ERR on failure, REDIS_OK on success.
 */
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return REDIS_ERR;
    }

    // Initialize the rio file stream
    rioInitWithFile(&aof,fp);
    // Iterate over all databases
    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;
        di = dictGetSafeIterator(d);
        if (!di) {
            fclose(fp);
            return REDIS_ERR;
        }

        /* SELECT the new DB */
        if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
            sds keystr;
            robj key, *o;
            long long expiretime;

            keystr = dictGetKey(de);
            o = dictGetVal(de);
            initStaticStringObject(key,keystr);

            expiretime = getExpire(db,&key);

            /* Save the key and associated value */
            if (o->type == REDIS_STRING) {
                /* Emit a SET command */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                /* Key and value */
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(&aof,o) == 0) goto werr;
            } else if (o->type == REDIS_LIST) {
                if (rewriteListObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_SET) {
                if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_ZSET) {
                if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_HASH) {
                if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
            } else {
                redisPanic("Unknown object type");
            }
            /* Save the expire time */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";

                /* If this key is already expired skip it */
                if (expiretime < now) continue;

                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
            }
        }
        dictReleaseIterator(di);
    }

    /* Make sure data will not remain on the OS's output buffers */
    // Flush the stdio stream, fsync the file, then close it
    fflush(fp);
    aof_fsync(fileno(fp));
    fclose(fp);

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    // Atomically replace the old AOF with the rewritten one
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;

werr:
    fclose(fp);
    unlink(tmpfile);
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}

rewriteAppendOnlyFile() walks every database, formats each key-value pair as standard Redis commands, and writes them to a temporary file. It then forces the file to disk, renames the temporary file to the requested name, and the child process exits. The interesting part is how the key-value pairs are formatted: rioWriteBulkObject, rewriteListObject, rewriteSetObject, rewriteSortedSetObject and rewriteHashObject each take one of the five object types, turn it into commands, and write it out.
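
As an example of the output (a sketch; the key and elements are made up): a list mylist holding the elements a, b and c is rewritten as a single variadic RPUSH, which in the file looks like:

*5
$5
RPUSH
$6
mylist
$1
a
$1
b
$1
c

Sets, sorted sets and hashes are handled the same way with SADD, ZADD and HMSET, each emitted command carrying at most REDIS_AOF_REWRITE_ITEMS_PER_CMD elements.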

Once the child has finished the rewrite, the main process is notified via a signal in serverCron and calls backgroundRewriteDoneHandler() to handle it.
backgroundRewriteDoneHandler() first checks whether the child exited normally or was killed by a signal, then opens the freshly rewritten file and appends the contents of aof_rewrite_buf_blocks to it.

void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();

        redisLog(REDIS_NOTICE,
            "Background AOF rewrite terminated with success");

        /* Flush the differences accumulated by the parent to the
         * rewritten AOF. */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            redisLog(REDIS_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        if (aofRewriteBufferWrite(newfd) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }
What happens next depends on whether AOF is enabled. If AOF is enabled, closing the old AOF file after the rename would block; if AOF is disabled but an AOF file already exists at the target path, the unlink performed during the rename would block instead. Redis's solution is the same in both cases: whether or not AOF is enabled, if the old file exists, hold an open descriptor on it first. After the rename, because the old file is still referenced by that open descriptor, its data is not unlinked; the release is deferred until the descriptor is closed. Finally, the close() is handed off to the background I/O thread.

        if (server.aof_fd == -1) {
            /* AOF disabled */

            /* Don't care if this fails: oldfd will be -1 and we handle that.
             * One notable case of -1 return is if the old file does
             * not exist. */
            oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". */
        if (rename(tmpfile,server.aof_filename) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to rename the temporary AOF file: %s", strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }

        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. */
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. */
            oldfd = server.aof_fd;
            server.aof_fd = newfd;
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            aofUpdateCurrentSize();
            server.aof_rewrite_base_size = server.aof_current_size;

            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. */
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = REDIS_OK;

        redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
        /* Change state from WAIT_REWRITE to ON if needed */
        if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
            server.aof_state = REDIS_AOF_ON;

        /* Asynchronously close the overwritten AOF. */
        if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        redisLog(REDIS_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    } else if (!bysignal && exitcode != 0) {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated with error");
    } else {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }
}
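
The effect described above can be reproduced with a tiny standalone program (a sketch, not Redis code; the file names are made up for the demo): as long as a descriptor to the old file stays open, rename(2) only replaces the directory entry, and the old file's data is freed only when that descriptor is finally closed, which is the potentially slow step Redis pushes onto the background I/O thread.

/* Standalone demo: rename(2) over a file that is held open does not free its
 * data; the expensive release happens only at the final close(). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    int oldfd = open("appendonly.aof", O_RDONLY);             /* keep the old AOF referenced */
    if (rename("temp-rewriteaof-bg.aof", "appendonly.aof") == -1)
        perror("rename");                                     /* atomic name swap, no data freed yet */
    /* ... a server would keep handling requests here ... */
    if (oldfd != -1) close(oldfd);                            /* only now is the old file's data released */
    return 0;
}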