Redis

最新推荐文章于 2025-08-19 12:18:01 发布

春泥面包

最新推荐文章于 2025-08-19 12:18:01 发布

阅读量1.6k

点赞数

CC 4.0 BY-SA版权

分类专栏： Redis 文章标签： redis

本文链接：https://blog.youkuaiyun.com/huntinux/article/details/51517473

Redis 专栏收录该内容

1 篇文章

订阅专栏

本文深入探讨Redis内部实现机制，覆盖数据结构如简单动态字符串(SDS)、链表、字典、跳跃表等，以及对象系统、内存管理机制、数据库和持久化策略等内容。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

主要参考：
Redis设计与实现链接: http://pan.baidu.com/s/1jH45VMI 密码: nxhb <请大家支持正版, 电子版可以先看看>
http://debugo.com/python-redis/
http://my.oschina.net/fuckphp/blog/270258
http://www.heychinaski.com/blog/2013/10/14/a-look-at-the-redis-source-code/
http://my.oschina.net/fuckphp/blog/277407
知乎上关于epoll的讨论，值得一看：http://www.zhihu.com/question/20122137

源码下载，我下载的是3.0分支：

git clone -b 3.0 https://github.com/antirez/redis.git

Redis数据库中每个键值对儿的键总是一个字符串，而值可以为：字符串、列表、哈希、集合、有序集合。

分析工具

vim+ctags+grep 我是不是很土 >_<

sds 简单动态字符串 Simple Dynamic String

/* An sds value is a plain char pointer that points at sdshdr.buf. */
typedef char *sds;

/* Header stored immediately in front of the string bytes. */
struct sdshdr {
    unsigned int len;  /* current length of the string */
    unsigned int free; /* amount of space still available */
    char buf[];        /* string bytes */
};

/* Return the length recorded in the header that precedes s.
 * The header address is obtained by stepping back sizeof(struct sdshdr)
 * bytes from s. */
static inline size_t sdslen(const sds s) {
    const char *header_start = s - sizeof(struct sdshdr);
    const struct sdshdr *hdr = (const void *)header_start;

    return hdr->len;
}

len属性，可以以O(1)返回字符串长度。
free属性保存可用空间大小。内存分配策略为空间预分配和惰性释放，以减少内存重分配次数、提高效率。同时可以杜绝缓冲区溢出：对sds进行修改时会先检查是否有足够的空间。
二进制安全（binary-safe），C字符串只能保存文本数据，遇到’\0’则认为字符串结束。sds 的API以二进制方式处理放在buf的数据。可以看到源码中比较操作用的是memcmp，而不是strcmp

// sds.c
/* Compare two sds strings byte-wise with memcmp() over their common prefix.
 *
 * Returns a positive value if s1 > s2, a negative value if s1 < s2 and 0 if
 * both strings hold exactly the same bytes. When one string is a prefix of
 * the other, the longer string is considered the greater one. */
int sdscmp(const sds s1, const sds s2) {
    size_t l1, l2, minlen;
    int cmp;

    l1 = sdslen(s1);
    l2 = sdslen(s2);
    minlen = (l1 < l2) ? l1 : l2;
    cmp = memcmp(s1,s2,minlen);
    /* Do not return l1-l2 here: the size_t difference would be converted to
     * int, which can produce the wrong sign (or 0) for large differences. */
    if (cmp == 0) return l1>l2? 1: (l1<l2? -1: 0);
    return cmp;
}

链表 adlist

// Node of the doubly linked list
typedef struct listNode {
    struct listNode *prev;
    struct listNode *next;
    void *value;
} listNode;

// List iterator -- one of the more interesting parts of adlist
typedef struct listIter {
    listNode *next;
    int direction; // iteration direction
} listIter;

// The list itself: head/tail pointers, per-list callbacks and a length field
typedef struct list {
    listNode *head;
    listNode *tail;
    void *(*dup)(void *ptr); // function pointers give support for different value types (polymorphism); lighttpd uses the same technique
    void (*free)(void *ptr);
    int (*match)(void *ptr, void *key);
    unsigned long len;
} list;

字典

字典在Redis中应用相当广泛，Redis的数据库的底层实现就是使用的字典。

哈希表

// A hash table node: one key-value pair
typedef struct dictEntry {
    void *key; // key
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v; // value
    struct dictEntry *next; // resolves key collisions: colliding entries are chained into a linked list (separate chaining, as opposed to open addressing)
} dictEntry;

// Hash table
typedef struct dictht {
    dictEntry **table;      // bucket array; every slot is a pointer to a dictEntry, and a dictEntry holds one key-value pair
    unsigned long size;     // size of the hash table
    unsigned long sizemask; // mask used to compute the bucket index; always equal to size-1
    unsigned long used;     // number of entries currently stored
} dictht;

字典

/* Table of type-specific callbacks for a dict (referenced through dict.type).
 * Every callback except hashFunction takes a privdata pointer as its first
 * argument. */
typedef struct dictType {
    unsigned int (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

// Dictionary
typedef struct dict {
    dictType *type; // type-specific functions, e.g. the hash function
    void *privdata; 
    dictht ht[2];   // hash tables: ht[0] is used by default, ht[1] is used while rehashing
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    int iterators; /* number of iterators currently running */
} dict;

Redis使用MurmurHash2算法来计算键的hash值。（还有djb hash）

rehash

随着操作不断进行，哈希表中保存的键值对逐渐地增大或减少，为了让哈希表的负载因子（load factor）维持在一个合理的范围之内，当哈希表保存的键值对数量太多或太少时，程序需要对哈希表进行扩展或收缩（通过rehash来完成）。
load factor = ht[0].used / ht[0].size
rehash是渐进式的，这是为了避免对服务器性能造成影响，所以分多次、渐进的将ht[0]里面的键值对慢慢地rehash到ht[1]。

渐进式rehash过程：

为ht[1]分配空间
rehashidx = 0，表示rehash从index=0的地方开始
在rehash进行期间，程序每次对字典执行添加、删除、查找或者更新操作时，除了执行指定的操作外，还会顺带将ht[0]中rehashidx索引上的所有键值对rehash到ht[1]，然后执行rehashidx++
基本思想是将rehash分散到对字典的每个添加、删除、查找和更新操作上，从而避免了集中式rehash带来的庞大计算量。
其中，在进行渐进式rehash的过程中，字典的删除、查找、更新等操作会在两个哈希表上进行。例如，查找一个键时，会先在ht[0]里面查找，如果没有找到，会继续到ht[1]里进行查找。
而rehash期间的添加操作一律被保存在ht[1]中，ht[0]中只减不增，随着rehash的进行，最终变为空表。

跳跃表 skiplist

http://www.cnblogs.com/xuqiang/archive/2011/05/22/2053516.html
http://blog.youkuaiyun.com/haidao2009/article/details/8206856
Skip List是一种随机化的数据结构，基于并联的链表，其效率可比拟于二叉查找树（对于大多数操作需要O(log n)平均时间）。基本上，跳跃列表是对有序的链表增加上附加的前进链接，增加是以随机化的方式进行的，所以在列表中的查找可以快速的跳过部分列表(因此得名)。所有操作都以对数随机化的时间进行。Skip List可以很好解决有序链表查找特定值的困难。

它是一种有序数据结构，通过在每个节点中维持多个指向其他节点的指针，从而达到快速访问节点的目的。支持平均O(logN)、最坏O(N)复杂度的节点查找，大部分情况下效率可以与平衡树媲美，并且因为实现更为简单，所以有不少程序使用跳跃表来代替平衡树。

Redis只在两个地方用到了跳跃表

有序集合
集群节点中用作内部数据结构

整数集合 intset

整数集合是集合的底层实现之一，当一个集合只包含整数值元素，并且这个集合的元素数量不多时，Redis就会使用整数集合作为集合的底层实现。

127.0.0.1:6379> sadd num 1 2 3 4 
(integer) 4
127.0.0.1:6379> TYPE num
set
127.0.0.1:6379> OBJECT ENCODING num
"intset"
127.0.0.1:6379> sadd stuff 1 "hello"
(integer) 2
127.0.0.1:6379> OBJECT encoding stuff
"hashtable"

可以看到当集合中都是整数元素时，底层是使用intset实现的；否则为hashtable。
这里贴出intset的一个内部函数 intsetSearch。
可以看到该函数的注释清晰、逻辑严谨，代码命名和排版精美，简直就是艺术品。^_^

/* Locate "value" inside the intset using a binary search.
 *
 * Returns 1 and stores the index of the match in "pos" when the value is
 * present. Returns 0 and stores in "pos" the index at which the value
 * could be inserted when it is absent. "pos" may be NULL, in which case
 * only the return value is produced. */
static uint8_t intsetSearch(intset *is, int64_t value, uint32_t *pos) {
    int lo = 0, hi = intrev32ifbe(is->length)-1, probe = -1;
    int64_t seen = -1;

    /* Nothing can be found in an empty set. */
    if (intrev32ifbe(is->length) == 0) {
        if (pos) *pos = 0;
        return 0;
    }

    /* A value beyond either end cannot be present, but its insert
     * position is already known. */
    if (value > _intsetGet(is,intrev32ifbe(is->length)-1)) {
        if (pos) *pos = intrev32ifbe(is->length);
        return 0;
    }
    if (value < _intsetGet(is,0)) {
        if (pos) *pos = 0;
        return 0;
    }

    /* Binary search over the inclusive range [lo, hi]. */
    while (lo <= hi) {
        probe = ((unsigned int)lo + (unsigned int)hi) >> 1;
        seen = _intsetGet(is,probe);
        if (value == seen) break;
        if (value > seen)
            lo = probe+1;
        else
            hi = probe-1;
    }

    if (value != seen) {
        if (pos) *pos = lo; /* not found: report the insert position */
        return 0;
    }
    if (pos) *pos = probe; /* found: report the element's position */
    return 1;
}

该函数首先对于特殊条件下的解进行了处理，然后进行折半查找。

压缩列表 ziplist

ziplist是列表和哈希的底层实现之一。当一个列表只包含少量列表项，并且每个列表项要么是小整数值，要么是长度比较短的字符串，那么Redis就会用ziplist来做列表的实现。
当一个哈希表只包含少量的键值对，而且每个键值对的键和值要么就是小整数值、要么就是较短的字符串，Redis就会使用ziplist来实现哈希表。

对象

前面学习了Redis中的数据结构，如SDS、双端链表、字典、压缩列表、整数集合等。
Redis并没有直接使用这些数据结构来实现键值对数据库，而是基于这些数据结构创建了一个对象系统，这个系统包含字符串对象、列表对象、哈希对象、集合对象和有序集合对象这5种类型的对象，每种对象都用到了至少一种前面所介绍的数据结构。
通过这五种不同类型的对象，Redis可以在执行命令之前，根据对象的类型来判断一个对象是否可以执行给定的命令。使用对象的另一个好处是，我们可以针对不同的使用场景，为对象设置多种不同的数据结构实现，从而优化对象在不同场景下的使用效率。
除此之外，Redis的对象系统还实现了基于引用计数技术的内存回收机制，当程序不再使用某个对象的时候，这个对象所占用的内存就会被自动释放；另外，Redis还通过引用计数技术实现了对象共享机制，这一机制可以在适当的条件下，通过让多个数据库键共享同一个对象来节约内存。
最后，Redis的对象带有访问时间记录信息，该信息可以用于计算数据库键的空转时长，在服务器启用了maxmemory功能的情况下，空转时长较大的那些键可能会优先被服务器删除。

例如，下面的命令在数据库中创建了一个新的键值对，其中键值对的键是一个包含了字符串值"msg"的对象，而键值对的值则是一个包含了字符串值"Hello"的对象。

127.0.0.1:6379> set msg "Hello"
OK

Redis中的每个对象由一个redisObject结构表示。

/* A redis object, that is a type able to hold a string / list / set */

/* The actual Redis Object */
#define REDIS_LRU_BITS 24
#define REDIS_LRU_CLOCK_MAX ((1<<REDIS_LRU_BITS)-1) /* Max value of obj->lru */
#define REDIS_LRU_CLOCK_RESOLUTION 1000 /* LRU clock resolution in ms */
typedef struct redisObject {
    unsigned type:4;     // object type
    unsigned encoding:4; // which data structure implements the object underneath
    unsigned lru:REDIS_LRU_BITS; /* lru time (relative to server.lruclock) */
    int refcount; // reference count
    void *ptr;    // pointer to the underlying data structure
} robj;

其中类型可以为：

/* Object types */
#define REDIS_STRING 0 // string object
#define REDIS_LIST 1   // list object
#define REDIS_SET 2    // set object
#define REDIS_ZSET 3   // sorted set object
#define REDIS_HASH 4   // hash object

编码表示对象的底层实现是使用的哪种数据结构:

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of this fields for this object. */
#define REDIS_ENCODING_RAW 0     /* Raw representation */
#define REDIS_ENCODING_INT 1     /* Encoded as integer */
#define REDIS_ENCODING_HT 2      /* Encoded as hash table */
#define REDIS_ENCODING_ZIPMAP 3  /* Encoded as zipmap */
#define REDIS_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */
#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define REDIS_ENCODING_INTSET 6  /* Encoded as intset */
#define REDIS_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
#define REDIS_ENCODING_EMBSTR 8  /* Embedded sds string encoding */

类型和编码
使用OBJECT ENCODING命令可以查看一个数据库键对应的值对象的编码（即底层实现是用的什么数据结构）
命令TYPE用于查看一个数据库键对应的值对象的类型是五种类型中的哪一个。

引用计数和共享

redisObject结构体中有一个refcount表示的就是该对象的引用计数，当引用计数为0时可以回收对象。基于此，可以实现对象的共享。例如，Redis会在服务器初始化时创建0~9999所对应的10000个字符串，达到共享的目的，从而节约内存。可以看到下面对1000的引用计数是2。

127.0.0.1:6379> SET A 1000
OK
127.0.0.1:6379> OBJECT REFCOUNT A
(integer) 2

源码见redis.c/initServer->createSharedObjects;

对象的空转时长

redisObject结构体中有一个lru属性，该属性记录了对象最后一次被访问的时间。
OBJECT IDLETIME命令可以打印出给定键的空转时长，这是通过用当前时间减去lru得到的。
除了可以被OBJECT IDLETIME命令打印出来之外，键的空转时长还有另外一项作用：如果服务器打开了maxmemory选项，并且服务器用于回收内存的算法为volatile-lru或者allkeys-lru，那么当服务器占用的内存数超过了maxmemory选项所设置的上限值时，空转时长较高的那部分键会优先被服务器释放，从而回收内存。
配置文件的maxmemory选项和maxmemory-policy选项的说明介绍了关于这方面的更多信息。

内存回收调用过程


// 当有客户端连接请求时，即监听套接字可读时，设置处理函数为acceptTcpHandler
initServer
---->aeCreateFileEvent(..., AE_READABLE, acceptTcpHandler);

// 事件处理主循环
main
---->aeMain
-------->aeProcessEvents(eventLoop, AE_ALL_EVENTS) 

// 接受client的连接，并设置已连接套接字cfd上的可读事件处理函数为readQueryFromClient
acceptTcpHandler
---->cfd = anetTcpAccept(..., listenfd, ...)；
---->acceptCommonHandler(cfd,0);
-------->createClient
------------>aeCreateFileEvent(...,cfd,AE_READABLE,readQueryFromClient)

// 当client有可读事件发生时，即client发送了命令给server，通过事件循环的调用过程如下
readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask)
---->processInputBuffer(redisClient *c)
-------->processCommand(redisClient *c)
------------>freeMemoryIfNeeded(void) //如果需要的话，就释放内存

具体过程应该是通过某种I/O复用技术（如epoll）来实现的事件的监听和处理（如epoll_wait）

数据库

服务器端在redisServer结构中保存数据库。

struct redisServer {
    ...
    redisDb *db; // array of databases; its size comes from the `databases` option in redis.conf (default 16)
    ...
};

Redis客户端选择服务器端某个数据库作为操作对象。默认选择下标为0的数据库。切换数据库使用SELECT命令。
服务器内部使用redisClient的db属性记录客户端选择的数据库是哪个。
redisDb表示一个数据库，其中的dict保存了数据库中所有的键值对，将这个字典称为键空间（Key Space）

/* Redis database representation. There are multiple databases identified
 * by integers from 0 (the default database) up to the max configured
 * database. The database number is the 'id' field in the structure. */
typedef struct redisDb {
    dict *dict;                 /* The keyspace for this DB */
    dict *expires;              /* Timeout of keys with a timeout set */
    dict *blocking_keys;        /* Keys with clients waiting for data (BLPOP) */
    dict *ready_keys;           /* Blocked keys that received a PUSH */
    dict *watched_keys;         /* WATCHED keys for MULTI/EXEC CAS */
    struct evictionPoolEntry *eviction_pool;    /* Eviction pool of keys */
    int id;                     /* Database ID */
    long long avg_ttl;          /* Average TTL, just for stats */
} redisDb;

如前所述，数据库的键空间的每个键都是一个字符串对象；每个值可以是字符串对象、列表对象、哈希对象、集合对象、有序集合对象。

过期时间 expires

可以设置一个键的过期时间，到达过期时间时数据库会删除这个键。redisDb结构的expires字典保存了数据库中所有键的过期时间。定义见上面。
键空间（dict）的键和过期字典（expires）的键都指向同一个对象，所以不会出现任何重复对象，也不会浪费任何空间。

过期删除策略

定时删除，设置键的过期时间的同时，创建一个定时器
惰性删除，键过期时不是马上删除，每次从键空间获取键时，都检查取得的键是否过期，如果过期就删除键（db.c/expireIfNeeded）
定期删除，每隔一段时间对数据库进行一次检查，删除过期键。(redis.c/activeExpireCycle)

/* 定期删除策略分析 */
/* 添加周期任务，serverCron */
initServer
---->aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL)

serverCron
---->databasesCron(); /* 数据库任务 */
-------->activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); /* 定期过期删除 */
/*
activeExpireCycle内部的静态变量current_db会记录当前检查的进度，下次调用activeExpireCycle时会从该位置继续处理。
*/

持久化

个人粗暴理解，就是将内存中的数据库保存在硬盘上的策略

RDB：在不同时间点，生成数据库的快照保存在磁盘等介质上。
AOF (Append Only File): 将操作记录在文件里，下次启动Redis时，只要再执行一下这些操作就可以实现数据恢复了。

Redis事件模型

引自 Linux高性能服务器编程

作为一个服务器，通常要处理3类事件：

I/O事件
定时器事件
信号
- SIGHUP 当进程的控制终端被挂起时引发SIGHUP，对于没有控制终端而在后台运行的服务器程序来说，通常用SIGHUP来强制重新读取配置文件。
- SIGPIPE 向一个读端关闭的管道或socket写入数据时引发该信号
- SIGURG 发送带外数据的一种方法，另一种方法是使用I/O复用

通常，使用统一信号源的技术统一处理这三类事件，如libevent。
下面开始分析Redis的事件模型。（基于Reactor模型）

I/O多路复用中，套接字在什么时候是可读的？什么时候是可写的？

对于监听套接字，当有客户端进行connect时，监听套接字可读；对于已连接套接字，客户端对套接字执行write操作或执行close操作时，已连接套接字可读
客户端对套接字执行read操作，套接字可写
附赠《Linux高性能服务器编程》中的总结，这很重要（请叫我盗图党）：

事件类型：

#define AE_NONE 0   
#define AE_READABLE 1   /* readable event */
#define AE_WRITABLE 2   /* writable event */

文件事件结构体。(几乎不用写注释，代码本身已经很meaningful了。)

/* Prototype of a file event handler */
typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);

/* File event structure */
typedef struct aeFileEvent {
    int mask; /* one of AE_(READABLE|WRITABLE) */
    aeFileProc *rfileProc; /* handler called when a readable event fires */
    aeFileProc *wfileProc; /* handler called when a writable event fires */
    void *clientData;
} aeFileEvent;

事件处理框架

事件循环的状态由aeEventLoop结构体来表示：

/* State of an event based program */
typedef struct aeEventLoop {
    int maxfd;   /* highest file descriptor currently registered */
    int setsize; /* max number of file descriptors tracked */
    long long timeEventNextId; /* id handed to the next time event created */
    time_t lastTime;     /* Used to detect system clock skew */
    aeFileEvent *events; /* Registered events */
    aeFiredEvent *fired; /* Fired events */
    aeTimeEvent *timeEventHead; /* head of the linked list of time events */
    int stop; /* aeMain() keeps looping while this is 0 */
    void *apidata; /* This is used for polling API specific data */
    aeBeforeSleepProc *beforesleep; /* optional hook aeMain() calls before each aeProcessEvents() */
} aeEventLoop;

redisServer结构中的el代表事件循环

// redis.h
/* Excerpt of the server state: only the event loop member is shown. */
struct redisServer {
    //...
    aeEventLoop *el; /* event loop */
    //...
};

redis.c/main函数会调用aeMain(server.el);进入事件循环（el本身是在initServer中通过aeCreateEventLoop创建并初始化的）。

// redis.c

/* Global vars */
struct redisServer server; /* server global state */

int main(){
    aeMain(server.el);
}

那么aeMain函数做了什么呢？如下所示：调用aeProcessEvents函数对事件进行处理。

/* Run the event loop: clear the stop flag, then keep calling the optional
 * beforesleep hook followed by aeProcessEvents() until stop becomes
 * non-zero. */
void aeMain(aeEventLoop *eventLoop) {
    for (eventLoop->stop = 0; !eventLoop->stop; ) {
        if (eventLoop->beforesleep)
            eventLoop->beforesleep(eventLoop);
        /* Handle all kinds of pending events. */
        aeProcessEvents(eventLoop, AE_ALL_EVENTS);
    }
}

所以事件处理的框架是这样的，很简单：

// 事件处理主循环
main
---->aeMain
-------->aeProcessEvents(eventLoop, AE_ALL_EVENTS)

事件在哪里被注册的？

对于监听套接字的事件是在initServer函数中注册的，处理函数为acceptTcpHandler
对于已连接套接字，是在acceptTcpHandler函数中注册的，处理函数为readQueryFromClient

/* 监听套接字的可读事件注册 */
main
---->initServer
-------->信号处理
-------->server.el = aeCreateEventLoop(server.maxclients+REDIS_EVENTLOOP_FDSET_INCR);// 初始化server.el
-------->listenToPort(server.port,server.ipfd,&server.ipfd_count);//创建监听套接字，保存在server.ipfd中
-------->aeCreateFileEvent(server.el, server.ipfd[j], AE_READABLE,acceptTcpHandler,NULL) // 重点来了，这里注册了文件事件，意思是当监听套接字可读时，使用函数acceptTcpHandler来处理
---->aeMain(); /* 注册完之后，进入事件处理循环，当有事件发生时，调用注册时指定的函数来处理 */

/* 已连接套接字事件的注册 */
acceptTcpHandler
---->cfd = anetTcpAccept(..., listenfd, ...)；
---->acceptCommonHandler(cfd,0);
-------->createClient
------------>aeCreateFileEvent(...,cfd,AE_READABLE,readQueryFromClient)

事件循环aeProcessEvents是怎么实现的？

对于不同的平台，IO复用技术有所不同，Linux下有select、poll、epoll；FreeBSD下的kqueue等。这里只分析一下epoll相关的代码。关于epoll的基本用法，见这里。

首先分析在注册事件时是怎么调用到epoll_ctl的。以对监听套接字注册可读事件过程为例子，注册文件事件的API为：

/* Register proc as the handler of the events in mask for descriptor fd.
 * Returns AE_ERR when aeApiAddEvent() fails, AE_OK otherwise.
 * (Excerpt: the "..." marks code left out by the article.) */
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
        aeFileProc *proc, void *clientData)
{
    ...

    // The events array of aeEventLoop stores file events indexed by fd;
    // use fd as the index to get a pointer to the matching event
    aeFileEvent *fe = &eventLoop->events[fd];

    // Call aeApiAddEvent to register the event
    if (aeApiAddEvent(eventLoop, fd, mask) == -1)
        return AE_ERR;
    fe->mask |= mask;
    if (mask & AE_READABLE) fe->rfileProc = proc; // install the handler
    if (mask & AE_WRITABLE) fe->wfileProc = proc; // install the handler
    fe->clientData = clientData;
    if (fd > eventLoop->maxfd)
        eventLoop->maxfd = fd;
    return AE_OK;
}

epoll实现的aeApiAddEvent。（kqueue实现见ae_kqueue.c 以此类推）

// ae_epoll.c
/* Register interest in the events of mask for fd with epoll.
 * Returns 0 on success and -1 when epoll_ctl() fails. */
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
    aeApiState *api = eventLoop->apidata;
    int registered = eventLoop->events[fd].mask;
    struct epoll_event ev;
    int op;

    /* A descriptor that is already monitored needs a MOD operation,
     * a new one needs an ADD operation. */
    if (registered == AE_NONE)
        op = EPOLL_CTL_ADD;
    else
        op = EPOLL_CTL_MOD;

    mask |= registered; /* Merge old events */

    ev.events = 0;
    if (mask & AE_READABLE) ev.events |= EPOLLIN;
    if (mask & AE_WRITABLE) ev.events |= EPOLLOUT;
    ev.data.u64 = 0; /* avoid valgrind warning */
    ev.data.fd = fd;

    /* Hand the registration over to epoll_ctl. */
    return (epoll_ctl(api->epfd,op,fd,&ev) == -1) ? -1 : 0;
}

对于使用epoll的redis来说，注册事件在底层调用的是epoll_ctl。那么事件循环在底层当然使用的是epoll_wait。

/* Excerpt of the file-event part of aeProcessEvents(); the "..." lines
 * mark code left out by the article. */
int aeProcessEvents(aeEventLoop *eventLoop, int flags) {
    ...
    // Fetch the events that fired, then call the handler registered for each one according to its type (readable/writable).
        numevents = aeApiPoll(eventLoop, tvp);
        for (j = 0; j < numevents; j++) {
            aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
            int mask = eventLoop->fired[j].mask;
            int fd = eventLoop->fired[j].fd;
            int rfired = 0;

        /* note the fe->mask & mask & ... code: maybe an already processed
             * event removed an element that fired and we still didn't
             * processed, so we check if the event is still valid. */
            if (fe->mask & mask & AE_READABLE) {
                rfired = 1;
                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
            }
            if (fe->mask & mask & AE_WRITABLE) {
                /* Skip the write handler when it is the same function that
                 * already ran as the read handler for this event. */
                if (!rfired || fe->wfileProc != fe->rfileProc)
                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
            }
            processed++;
        }
    ...
}

用epoll实现的aeApiPoll是这样的，调用了epoll_wait

/* Wait for events with epoll_wait() and record the ones that fired in
 * eventLoop->fired. A NULL tvp means wait without a timeout. Returns the
 * number of fired events, or 0 when epoll_wait() reports none. */
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *api = eventLoop->apidata;
    int timeout_ms = tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1;
    int ready, i;

    ready = epoll_wait(api->epfd,api->events,eventLoop->setsize,timeout_ms);
    if (ready <= 0) return 0;

    for (i = 0; i < ready; i++) {
        struct epoll_event *ev = &api->events[i];
        int mask = 0;

        /* EPOLLIN maps to a readable event; EPOLLOUT, EPOLLERR and
         * EPOLLHUP all map to a writable event. */
        if (ev->events & EPOLLIN) mask |= AE_READABLE;
        if (ev->events & (EPOLLOUT|EPOLLERR|EPOLLHUP)) mask |= AE_WRITABLE;

        eventLoop->fired[i].fd = ev->data.fd;
        eventLoop->fired[i].mask = mask;
    }
    return ready;
}

时间事件

定时事件
周期性事件

时间事件结构体：

/* Time event structure */
typedef struct aeTimeEvent {
    long long id; /* identifier of the time event */
    long when_sec; /* seconds */
    long when_ms; /* milliseconds */
    aeTimeProc *timeProc; /* handler of the time event */
    aeEventFinalizerProc *finalizerProc;
    void *clientData;
    struct aeTimeEvent *next;
} aeTimeEvent;

Redis服务器将所有时间事件放在一个无序链表中（aeEventLoop.timeEventHead），当时间事件执行器运行时，会遍历整个链表，找到已达到的时间事件调用相应的事件处理函数。

向事件循环添加一个时间事件，保存在server.el.timeEventHead所指向的链表中（头插法）：

/* Create a time event due `milliseconds` from now and push it onto the
 * front of the loop's time event list.
 * Returns the id of the new event, or AE_ERR when allocation fails. */
long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
        aeTimeProc *proc, void *clientData,
        aeEventFinalizerProc *finalizerProc)
{
    long long new_id = eventLoop->timeEventNextId++;
    aeTimeEvent *ev = zmalloc(sizeof(*ev));

    if (!ev) return AE_ERR;

    ev->id = new_id;
    aeAddMillisecondsToNow(milliseconds,&ev->when_sec,&ev->when_ms);
    ev->timeProc = proc;
    ev->finalizerProc = finalizerProc;
    ev->clientData = clientData;

    /* Insert at the head of the time event list. */
    ev->next = eventLoop->timeEventHead;
    eventLoop->timeEventHead = ev;

    return new_id;
}

时间事件在哪里被注册的？

在服务器初始化函数，使用aeCreateTimeEvent注册了一个时间事件，处理函数为serverCron。

void initServer(void) {
    ...
    /* Create the serverCron() time event, that's our main way to process
     * background operations. */
    if(aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) 
    {
        redisPanic("Can't create the serverCron time event.");
        exit(1);
    }
    ...
}

来看看serverCron干了些什么（从名字上感觉跟linux下的crontab是一个意思，看人家这命名水平）, 函数太长了，这里只贴出注释。

/* This is our timer interrupt, called server.hz times per second.
 * Here is where we do a number of things that need to be done asynchronously.
 * For instance:
 *
 * - Active expired keys collection (it is also performed in a lazy way on
 *   lookup).
 * - Software watchdog.
 * - Update some statistic.
 * - Incremental rehashing of the DBs hash tables.
 * - Triggering BGSAVE / AOF rewrite, and handling of terminated children.
 * - Clients timeout of different kinds.
 * - Replication reconnection.
 * - Many more...
 *
 * Everything directly called here will be called server.hz times per second,
 * so in order to throttle execution of things we want to do less frequently
 * a macro is used: run_with_period(milliseconds) { .... }
 *
 * The function is registered as a time event handler, so its return value
 * is consumed by processTimeEvents(): AE_NOMORE deletes the event, any
 * other value is the delay in milliseconds before the next invocation.
 */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     ...
}

时间事件在哪里被触发的？

Of course, 应该是在上面说到的aeProcessEvents中：

/* Excerpt of aeProcessEvents() showing only where time events are handled;
 * the "..." lines mark code left out by the article. */
int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
    ...

    /* Check time events: only done when the caller set AE_TIME_EVENTS */
    if (flags & AE_TIME_EVENTS)
        processed += processTimeEvents(eventLoop);

    ...
}

/* Process time events.
 *
 * Walks the time event list and calls timeProc for every event whose
 * scheduled time has been reached. When a handler returns AE_NOMORE its
 * event is deleted; any other return value is used as the number of
 * milliseconds until the event is due again. Returns the number of
 * handlers that were invoked. */
static int processTimeEvents(aeEventLoop *eventLoop) {
    int processed = 0;
    aeTimeEvent *te;
    long long maxId;
    time_t now = time(NULL);

    /* If the system clock is moved to the future, and then set back to the
     * right value, time events may be delayed in a random way. Often this
     * means that scheduled operations will not be performed soon enough.
     *
     * Here we try to detect system clock skews, and force all the time
     * events to be processed ASAP when this happens: the idea is that
     * processing events earlier is less dangerous than delaying them
     * indefinitely, and practice suggests it is. */
    if (now < eventLoop->lastTime) {
        te = eventLoop->timeEventHead;
        while(te) {
            te->when_sec = 0;
            te = te->next;
        }
    }
    eventLoop->lastTime = now;

    te = eventLoop->timeEventHead;
    maxId = eventLoop->timeEventNextId-1;
    while(te) {
        long now_sec, now_ms;
        long long id;

        /* Skip events created after this pass started (id above maxId). */
        if (te->id > maxId) {
            te = te->next;
            continue;
        }
        aeGetTime(&now_sec, &now_ms);
        if (now_sec > te->when_sec ||
            (now_sec == te->when_sec && now_ms >= te->when_ms))
        {
            int retval;

            id = te->id;
            /* Call the handler that was given at registration time */
            retval = te->timeProc(eventLoop, id, te->clientData);
            processed++;
            /* After an event is processed our time event list may
             * no longer be the same, so we restart from head.
             * Still we make sure to don't process events registered
             * by event handlers itself in order to don't loop forever.
             * To do so we saved the max ID we want to handle.
             *
             * FUTURE OPTIMIZATIONS:
             * Note that this is NOT great algorithmically. Redis uses
             * a single time event so it's not a problem but the right
             * way to do this is to add the new elements on head, and
             * to flag deleted elements in a special way for later
             * deletion (putting references to the nodes to delete into
             * another linked list). */
            if (retval != AE_NOMORE) {
                aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
            } else {
                aeDeleteTimeEvent(eventLoop, id);
            }
            te = eventLoop->timeEventHead;
        } else {
            te = te->next;
        }
    }
    return processed;
}

Redis服务器处理文件事件、时间事件的框架

/* Pseudo-code outline of how the server wires up and runs file events and
 * time events; indentation shows which function each call is made from. */
int main()
{
    // 1. Server initialization
    initServer()
        // 1.1 Create the event loop
        server.el = aeCreateEventLoop(server.maxclients+REDIS_EVENTLOOP_FDSET_INCR);

        // 1.2 Register a time event on the event loop: a new time event is created and added to the time event list of eventLoop, to be processed later
        aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL)

        // 1.3 Register a file event: when the listening socket becomes readable, i.e. a client connects, acceptTcpHandler handles it.
        // After accepting the connection, acceptTcpHandler creates a new file event for the connected socket,
        // with readQueryFromClient as the handler, i.e. readQueryFromClient is called when the client sends data
        aeCreateFileEvent(server.el, server.ipfd[j], AE_READABLE,
            acceptTcpHandler,NULL)

    // 2. Enter the event loop and process events: file events / time events
    aeMain(server.el)
        aeProcessEvents(eventLoop, AE_ALL_EVENTS);
            ...
            // 2.1 Process file events
            numevents = aeApiPoll(eventLoop, tvp);// calls epoll, kqueue, etc. underneath
            for (j = 0; j < numevents; j++) {
                ...
            }
            // 2.2 Process time events
            if (flags & AE_TIME_EVENTS)
                processed += processTimeEvents(eventLoop);

    // 3. Finally, delete the event loop
    aeDeleteEventLoop(server.el);
    return 0;
}

Redis客户端发送命令，Redis服务器执行命令的流程

配色有点丑，见谅。
这里写图片描述

其中，向client返回命令执行结果是通过addReply函数完成的，更准确的说是sendReplyToClient：

addReply(redisClient *c, robj *obj) 
---->prepareClientToWrite(c)
-------->aeCreateFileEvent(server.el, c->fd, AE_WRITABLE, sendReplyToClient, c) // 添加关于已连接套接字的可写事件到事件循环中

//addReply下面的写操作只是将数据放到client的发送缓冲区buf中，真正的发送是在写事件就绪时，由函数sendReplyToClient完成的。

sendReplyToClient函数：

// 暂时没有分析

注意一下，使用epoll的情况下，Redis-Server使用的是非阻塞socket + epoll LT（水平触发）模式

监听套接字是nonblock的

listenToPort--->anetNonBlock(listenfd)

客户端的已连接套接字也是nonblock的

createClient--->anetNonBlock(connfd)

并没有使用nonblock socket + epoll ET。

再谈Redis的网络框架

Redis的网络框架在底层可以通过epoll、kqueue、select等IO复用技术实现。Redis抽象出了一组统一的接口，可以使用epoll、kqueue等技术实现这些接口，从而达到支持多种IO复用技术的目的。
以文件事件为例，下面是Redis网络框架中使用的API（ae.h）：

// 创建事件循环，会调用aeApiCreate
aeEventLoop *aeCreateEventLoop(int setsize); 

// 创建一个文件事件，会调用aeApiAddEvent
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
        aeFileProc *proc, void *clientData);

// 处理发生的事件，会调用aeApiPoll查询发生了什么事件
int aeProcessEvents(aeEventLoop *eventLoop, int flags);

所以对于epoll,kqueue等来说，它们需要实现的接口有：

static int aeApiCreate(aeEventLoop *eventLoop);
static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask);
static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp);
static char *aeApiName(void);
static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask);
static void aeApiFree(aeEventLoop *eventLoop);
static int aeApiResize(aeEventLoop *eventLoop, int setsize);

此外，还需要创建一个api相关的结构体，对于epoll来说，该结构体用来存放epollfd和发生的事件。该结构保存在aeEventLoop.apidata中。

// apidata used by the epoll backend
typedef struct aeApiState {
    int epfd; // epoll file descriptor
    struct epoll_event *events; // array of events
} aeApiState;

Redis的服务器框架会不断的查询有没有事件发生，发生了就使用相应的函数进行处理。为了统一接口，包容epoll,kqueue等技术，Redis对事件进行了封装。

对于文件事件，事件类型只有两种：可读（AE_READABLE）、可写（AE_WRITABLE）。前面已经总结了可读、可写事件具体指的是什么。

文件事件结构体中有两个处理函数与这两类事件相对应，即rfileProc和wfileProc。clientData存放的是client的信息，包括clientid，接收/发送缓冲区，使用的db等。该信息是在创建事件时放入的。

/* File event structure */
typedef struct aeFileEvent {
    int mask; /* one of AE_(READABLE|WRITABLE) */
    aeFileProc *rfileProc; /* handler for readable events */
    aeFileProc *wfileProc; /* handler for writable events */
    void *clientData; 
} aeFileEvent;

注意事件循环在底层可能由epoll、kqueue等api实现，因此在aeEventLoop结构体中使用apidata成员保存与api相关的数据。对于epoll，这块区域保存的就是epollfd和epoll_event数组（发生了哪些事件）。这块区域会在调用aeApiCreate时进行初始化。

/* State of an event based program */
typedef struct aeEventLoop {
    int maxfd;   /* highest file descriptor currently registered */
    int setsize; /* max number of file descriptors tracked */
    long long timeEventNextId;
    time_t lastTime;     /* Used to detect system clock skew */
    aeFileEvent *events; /* Registered events */
    aeFiredEvent *fired; /* Fired events */
    aeTimeEvent *timeEventHead;
    int stop;
    void *apidata; /* polling-API specific data; for epoll: the epoll fd and the epoll_event array */
    aeBeforeSleepProc *beforesleep;
} aeEventLoop;

aeCreateFileEvent、aeProcessEvents分析：

/* Register proc as the handler of the events in mask for descriptor fd.
 *
 * The file event is stored in eventLoop->events, indexed by fd, and the
 * registration is forwarded to the polling backend through aeApiAddEvent()
 * (epoll_ctl for the epoll backend). clientData is kept alongside the
 * handlers in the file event.
 *
 * Returns AE_OK on success. Returns AE_ERR when fd is outside the range of
 * descriptors tracked by the loop or when aeApiAddEvent() fails. */
int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
        aeFileProc *proc, void *clientData)
{
    aeFileEvent *fe;

    /* events[] is indexed by fd and the loop tracks at most setsize
     * descriptors: refuse anything that would index outside of it. */
    if (fd < 0 || fd >= eventLoop->setsize)
        return AE_ERR;

    /* Slot holding the handlers and event mask for this fd. */
    fe = &eventLoop->events[fd];

    /* For epoll this ends up calling epoll_ctl to register the fd. */
    if (aeApiAddEvent(eventLoop, fd, mask) == -1)
        return AE_ERR;

    /* Record the registered event types and their handlers. */
    fe->mask |= mask;
    if (mask & AE_READABLE) fe->rfileProc = proc;
    if (mask & AE_WRITABLE) fe->wfileProc = proc;
    fe->clientData = clientData;
    if (fd > eventLoop->maxfd)
        eventLoop->maxfd = fd;
    return AE_OK;
}

int aeProcessEvents(aeEventLoop *eventLoop, int flags){
        ...
        // 将发生的事件对应的fd保存在eventLoop.fired数组中。
        numevents = aeApiPoll(eventLoop, tvp); 
        for (j = 0; j < numevents; j++) {
            // 根据fd取得对应的aeFileEvent结构体，该结构体保存了该fd对应的事件类型和处理函数。
            aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];  
            if (fe->mask & mask & AE_READABLE) { 
                // 发生了读事件，那么就使用rfileProc来处理
                rfired = 1;
                fe->rfileProc(eventLoop,fd,fe->clientData,mask);
            }
            if (fe->mask & mask & AE_WRITABLE) {
                // 发生了写事件，那么就使用wfileProc来处理
                if (!rfired || fe->wfileProc != fe->rfileProc)
                    fe->wfileProc(eventLoop,fd,fe->clientData,mask);
            }
            ...
}