When a packet arrives at the switch, the kernel datapath extracts a flow key and looks it up in its flow table. On a hit it executes the corresponding actions; on a miss it packs the packet into Netlink attributes and sends it up to the corresponding userspace process, where vswitchd handles it in handle_upcalls() (ofproto/ofproto-dpif.c). The main call path, roughly, is:
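(Only functions that appear later in this section are listed; the file names are their locations in the OVS source tree.)

handle_upcalls()                  ofproto/ofproto-dpif.c
  -> dpif_recv()                  lib/dpif.c
       -> dpif_linux_recv()       lib/dpif-linux.c
            -> nl_sock_recv()     lib/netlink-socket.c
            -> parse_odp_packet() lib/dpif-linux.c
  -> handle_miss_upcalls()        ofproto/ofproto-dpif.c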
struct dpif_upcall (lib/dpif.h) represents a single packet passed up from the datapath to userspace. (If 'key' or 'actions' is nonnull, the data it points to belongs to this packet's buffer, so it must not be freed separately.)
struct dpif_upcall {
    /* All types. */
    enum dpif_upcall_type type;
    struct ofpbuf *packet;      /* Packet data: the complete frame handed up by the
                                 * datapath (see parse_odp_packet() below). */
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */

    /* DPIF_UC_ACTION only. */
    uint64_t userdata;          /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
The possible upcall types are:
enum dpif_upcall_type {
    DPIF_UC_MISS,               /* Miss in flow table. */
    DPIF_UC_ACTION,             /* OVS_ACTION_ATTR_USERSPACE action. */
    DPIF_N_UC_TYPES
};
static int handle_upcalls(struct ofproto_dpif *ofproto, unsigned int max_batch) {
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses], sizeof miss_buf_stubs[n_misses]);
        /* ofpbuf_use_stub() sets up an ofpbuf backed by a stack buffer at 'base' (a uint32_t or
         * uint64_t array guarantees suitable alignment); if the ofpbuf later needs to grow, the
         * stub's contents are copied into a malloc()'d buffer, so ofpbuf_uninit() must eventually
         * be called to free that possible heap copy. */
        error = dpif_recv(ofproto->dpif, upcall, buf);
        /* dpif_recv() dispatches to the recv method of the concrete dpif class; here that is
         * dpif_linux_class, which talks to the local kernel datapath over Netlink
         * (dpif_netdev_class, by contrast, implements the datapath in userspace). */
        if (error) {
            ofpbuf_uninit(buf);     /* nothing (more) to receive, e.g. EAGAIN */
            break;
        }
        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            n_misses++;             /* Handle it later. */
            break;

        case SFLOW_UPCALL:
            if (ofproto->sflow) {
                handle_sflow_upcall(ofproto, upcall);
                /* For SFLOW_UPCALL and BAD_UPCALL the upcall is handled right here and the buffer
                 * holding the upcall message is freed; MISS_UPCALL packets are instead batched and
                 * later handed to handle_miss_upcalls().  (sFlow is a sampling-based monitoring
                 * protocol; sampled packets reach userspace as DPIF_UC_ACTION upcalls.) */
            }
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }

    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(ofproto, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }

    return n_processed;
}
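As an aside, the stub-buffer pattern used for 'miss_bufs' above reduces to the sketch below. It uses the real OVS helpers ofpbuf_use_stub() and ofpbuf_uninit() (lib/ofpbuf.c), but the wrapper function itself is illustrative only and assumes it is compiled inside the OVS tree:

#include "ofpbuf.h"                     /* OVS lib/ofpbuf.h */

static void
stub_buffer_example(void)
{
    uint64_t stub[4096 / 8];            /* stack storage; uint64_t keeps it aligned */
    struct ofpbuf b;

    ofpbuf_use_stub(&b, stub, sizeof stub);   /* backed by 'stub', nothing malloc()'d yet */
    /* ... fill 'b'; if it ever outgrows 'stub', its contents are copied into a
     * malloc()'d buffer and growth continues on the heap ... */
    ofpbuf_uninit(&b);                  /* frees the heap copy, if one was made */
}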
dpif_recv() polls 'dpif' for an upcall. If one is available it is described in '*upcall', with its data stored in 'buf'; the caller must previously have enabled receiving packets from the dpif with dpif_recv_set(). 'upcall->packet' and 'upcall->key' both point into data held by the caller-provided 'buf', so they must not be freed separately.
int dpif_recv(struct dpif *dpif, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    int error = dpif->dpif_class->recv(dpif, upcall, buf);
    /* The concrete implementation (here dpif_linux_recv(), shown below) does the actual work. */
    if (!error && !VLOG_DROP_DBG(&dpmsg_rl)) {
        struct ds flow;     /* dynamic string (lib/dynamic-string.h) */
        char *packet;

        /* ofp_packet_to_string() (lib/ofp-print.c) renders the Ethernet frame as a string. */
        packet = ofp_packet_to_string(upcall->packet->data, upcall->packet->size);

        ds_init(&flow);
        odp_flow_key_format(upcall->key, upcall->key_len, &flow);
        /* odp_flow_key_format() renders the OVS_KEY_ATTR_* attributes in upcall->key into
         * 'flow' as text, purely for this log message. */
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dpif_name(dpif), dpif_upcall_type_to_string(upcall->type),
                 ds_cstr(&flow), packet);
        /* These debug messages show up when the 'dpif' log module is set to debug level
         * (e.g. "ovs-appctl vlog/set dpif:dbg"), subject to the dpmsg_rl rate limiter. */
        ds_destroy(&flow);
        free(packet);
    } else if (error && error != EAGAIN) {
        log_operation(dpif, "recv", error);
    }
    return error;
}
Next the recv method of dpif_linux_class runs, fetching the message from the kernel datapath (lib/dpif-linux.c):
const struct dpif_class dpif_linux_class = {
"system",
dpif_linux_enumerate,
dpif_linux_open,
dpif_linux_close,
dpif_linux_destroy,
dpif_linux_run,
dpif_linux_wait,
dpif_linux_get_stats,
dpif_linux_port_add,
dpif_linux_port_del,
dpif_linux_port_query_by_number,
dpif_linux_port_query_by_name,
dpif_linux_get_max_ports,
dpif_linux_port_get_pid,
dpif_linux_port_dump_start,
dpif_linux_port_dump_next,
dpif_linux_port_dump_done,
dpif_linux_port_poll,
dpif_linux_port_poll_wait,
dpif_linux_flow_get,
dpif_linux_flow_put,
dpif_linux_flow_del,
dpif_linux_flow_flush,
dpif_linux_flow_dump_start,
dpif_linux_flow_dump_next,
dpif_linux_flow_dump_done,
dpif_linux_execute,
dpif_linux_operate,
dpif_linux_recv_set,
dpif_linux_queue_to_priority,
dpif_linux_recv,
dpif_linux_recv_wait,
dpif_linux_recv_purge,
};
/* Datapath interface for the openvswitch Linux kernel module. */
struct dpif_linux {
    struct dpif dpif;
    int dp_ifindex;

    /* Upcall messages. */
    struct dpif_channel channels[N_CHANNELS];  /* Netlink channels carrying upcalls from the
                                                * kernel (see struct dpif_channel below). */
    uint32_t ready_mask;        /* 1-bit for each sock with unread messages. */
    int epoll_fd;               /* epoll fd that includes channel socks. */
    long long int next_scale;   /* Next time to scale down the sketches. */

    /* Change notification. */
    struct sset changed_ports;  /* Ports that have changed. */
    struct nln_notifier *port_notifier;
    bool change_error;

    /* Port number allocation. */
    uint16_t alloc_port_no;
};
Each dpif_channel is one Netlink channel between the kernel and userspace:
struct dpif_channel {
    struct nl_sock *sock;                     /* Netlink socket. */
    struct dpif_sketch sketches[N_SKETCHES];  /* From max to min 'hits'. */
    long long int last_poll;                  /* Last time this channel was polled. */
};
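dpif_linux_cast(), called at the top of dpif_linux_recv() below, is the usual container_of idiom (OVS provides a CONTAINER_OF macro in lib/util.h): given a pointer to the embedded 'struct dpif dpif' member, it recovers the enclosing struct dpif_linux. A self-contained toy illustration of the idea, with made-up struct names (not OVS code):

#include <stddef.h>     /* offsetof */
#include <stdio.h>

#define CONTAINER_OF(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

struct inner { int x; };
struct outer {
    char name[8];
    struct inner in;    /* embedded member, like 'struct dpif dpif' above */
};

int main(void)
{
    struct outer o = { "demo", { 42 } };
    struct inner *ip = &o.in;                          /* all the callee is handed */
    struct outer *op = CONTAINER_OF(ip, struct outer, in);

    printf("%s %d\n", op->name, op->in.x);             /* prints "demo 42" */
    return 0;
}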
static int dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    struct dpif_linux *dpif = dpif_linux_cast(dpif_);  /* CONTAINER_OF: recover the enclosing
                                                        * dpif_linux from the embedded dpif. */
    int read_tries = 0;

    if (dpif->epoll_fd < 0) {
        return EAGAIN;
    }

    if (!dpif->ready_mask) {
        struct epoll_event events[N_CHANNELS];
        int retval;
        int i;

        do {
            retval = epoll_wait(dpif->epoll_fd, events, N_CHANNELS, 0);
        } while (retval < 0 && errno == EINTR);
        if (retval < 0) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", strerror(errno));
        }

        for (i = 0; i < retval; i++) {
            dpif->ready_mask |= 1u << events[i].data.u32;   /* This channel has unread messages:
                                                             * set its bit in the mask. */
        }
    }

    while (dpif->ready_mask) {
        int indx = ffs(dpif->ready_mask) - 1;   /* Lowest set bit (minus one) indexes the ready channel. */
        struct dpif_channel *ch = &dpif->channels[indx];

        dpif->ready_mask &= ~(1u << indx);      /* Clear that bit so later iterations service other channels. */

        for (;;) {
            int dp_ifindex;
            int error;

            if (++read_tries > 50) {
                return EAGAIN;
            }

            error = nl_sock_recv(ch->sock, buf, false);
            if (error == ENOBUFS) {
                /* ENOBUFS typically means that we've received so many packets that the buffer
                 * overflowed.  Try again immediately because there's almost certainly a packet
                 * waiting for us. */
                report_loss(dpif_, ch);
                continue;
            }

            ch->last_poll = time_msec();
            if (error) {
                if (error == EAGAIN) {
                    break;
                }
                return error;
            }

            error = parse_odp_packet(buf, upcall, &dp_ifindex);
            /* The raw kernel message is now in 'buf'; parse_odp_packet() has decoded it, according
             * to the Netlink attribute policy, into '*upcall'. */
            if (!error && dp_ifindex == dpif->dp_ifindex) {
                const struct nlattr *in_port;

                in_port = nl_attr_find__(upcall->key, upcall->key_len, OVS_KEY_ATTR_IN_PORT);
                if (in_port) {
                    update_sketch(ch, nl_attr_get_u32(in_port));  /* Record that this channel received
                                                                   * a packet on this input port. */
                }
                return 0;
            }
            if (error) {
                return error;
            }
        }
    }

    return EAGAIN;
}
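The ready_mask handling above is a small bit-set dispatch: epoll_wait() reports which channels are readable, one bit per channel index is set in the mask, and ffs() then services them lowest-index-first. A self-contained toy version of just that pattern (illustrative only, not OVS code):

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
    unsigned int ready_mask = 0;

    /* Pretend epoll_wait() reported events on channels 1 and 4. */
    ready_mask |= 1u << 1;
    ready_mask |= 1u << 4;

    while (ready_mask) {
        int indx = ffs(ready_mask) - 1;   /* lowest set bit -> channel index */

        ready_mask &= ~(1u << indx);      /* clear it so the loop terminates */
        printf("servicing channel %d\n", indx);
    }
    return 0;
}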
nl_sock_recv() receives a Netlink message from the kernel over 'sock' into an ofpbuf. If 'wait' is true it blocks until a message arrives; otherwise it returns EAGAIN when nothing is ready. The caller must supply a buffer with at least NLMSG_HDRLEN bytes allocated, and ordinarily should allocate enough space for a typical message; on success the received message is left in the ofpbuf, which is expanded if necessary.
int nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    int error = nl_sock_cow__(sock);
    /* If this socket is in the middle of a dump operation, nl_sock_cow__() swaps in a freshly
     * cloned socket so that the receive we are about to do does not disturb the dump. */
    if (error) {
        return error;
    }
    return nl_sock_recv__(sock, buf, wait);
}

static int nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    struct nlmsghdr *nlmsghdr;
    uint8_t tail[65536];
    /* The exact size of the incoming message cannot be predicted, so reserve 64 kB of extra
     * scratch space just in case. */
    struct iovec iov[2];
    struct msghdr msg;
    ssize_t retval;

    assert(buf->allocated >= sizeof *nlmsghdr);
    ofpbuf_clear(buf);

    iov[0].iov_base = buf->base;
    iov[0].iov_len = buf->allocated;
    iov[1].iov_base = tail;
    iov[1].iov_len = sizeof tail;

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iov;
    msg.msg_iovlen = 2;
    /* recvmsg() takes a struct msghdr describing where to place the data; note how these
     * structures fit together: whatever arrives through 'msg' ends up in the memory that
     * 'buf' points to, with any excess spilling into 'tail'. */
    do {
        retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
    } while (retval < 0 && errno == EINTR);
    if (retval < 0) {
        int error = errno;
        if (error == ENOBUFS) {
            COVERAGE_INC(netlink_overflow);  /* The kernel dropped messages because the socket's
                                              * receive queue overflowed before we could read them. */
        }
        return error;
    }

    if (msg.msg_flags & MSG_TRUNC) {
        /* Unlike ENOBUFS above (messages lost in the socket queue), MSG_TRUNC means this one
         * message was larger than the space we supplied ('buf' plus the 64 kB 'tail') and was
         * cut short. */
        VLOG_ERR_RL(&rl, "truncated message (longer than %zu bytes)", sizeof tail);
        return E2BIG;
    }

    nlmsghdr = buf->data;
    if (retval < sizeof *nlmsghdr || nlmsghdr->nlmsg_len < sizeof *nlmsghdr || nlmsghdr->nlmsg_len > retval) {
        VLOG_ERR_RL(&rl, "received invalid nlmsg (%zd bytes < %zu)", retval, sizeof *nlmsghdr);
        return EPROTO;
    }

    if (STRESS(netlink_overflow)) {
        return ENOBUFS;
    }

    buf->size = MIN(retval, buf->allocated);
    if (retval > buf->allocated) {
        /* The message spilled past 'buf' into 'tail'; append the overflow so 'buf' holds it all. */
        COVERAGE_INC(netlink_recv_jumbo);
        ofpbuf_put(buf, tail, retval - buf->allocated);
    }

    log_nlmsg(__func__, 0, buf->data, buf->size, sock->protocol);
    COVERAGE_INC(netlink_received);
    return 0;
}
struct ofpbuf is a general-purpose buffer structure; it is reallocated (grown) when necessary:
struct ofpbuf {
    void *base;                 /* First byte of allocated space. */
    size_t allocated;           /* Number of bytes allocated. */
    enum ofpbuf_source source;  /* Source of memory allocated as 'base'. */
    void *data;                 /* First byte actually in use. */
    size_t size;                /* Number of bytes in use. */
    void *l2;                   /* Link-level header. */
    void *l3;                   /* Network-level header. */
    void *l4;                   /* Transport-level header. */
    void *l7;                   /* Application data. */
    struct list list_node;      /* Private list element for use by owner. */
    void *private_p;            /* Private pointer for use by owner. */
};
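parse_odp_packet() below peels the fixed Netlink headers off the buffer with ofpbuf_try_pull(). The real helper lives in lib/ofpbuf.c; conceptually it behaves roughly like this simplified sketch, which uses only the 'data' and 'size' fields shown above:

/* Simplified sketch of ofpbuf_try_pull(): hand back the current front of the
 * buffer and advance past it, or return NULL if not enough bytes remain. */
static void *
ofpbuf_try_pull_sketch(struct ofpbuf *b, size_t size)
{
    if (b->size < size) {
        return NULL;
    } else {
        void *header = b->data;
        b->data = (char *) b->data + size;
        b->size -= size;
        return header;
    }
}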
static int parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall, int *dp_ifindex)
{
    static const struct nl_policy ovs_packet_policy[] = {
        /* Always present. */
        [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC, .min_len = ETH_HEADER_LEN },
        [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },

        /* OVS_PACKET_CMD_ACTION only. */
        [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
    };

    struct ovs_header *ovs_header;
    struct nlattr *a[ARRAY_SIZE(ovs_packet_policy)];
    struct nlmsghdr *nlmsg;
    struct genlmsghdr *genl;
    struct ofpbuf b;
    int type;

    ofpbuf_use_const(&b, buf->data, buf->size);  /* Wrap the received message in a read-only
                                                  * ofpbuf so it can be parsed in place. */
    nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    genl = ofpbuf_try_pull(&b, sizeof *genl);
    ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
    /* Following the layout of the Netlink message, ofpbuf_try_pull() (lib/ofpbuf.c) peels off,
     * in turn, the nlmsghdr, the genlmsghdr and the ovs_header. */
    if (!nlmsg || !genl || !ovs_header || nlmsg->nlmsg_type != ovs_packet_family
        || !nl_policy_parse(&b, 0, ovs_packet_policy, a, ARRAY_SIZE(ovs_packet_policy))) {
        return EINVAL;
    }

    type = (genl->cmd == OVS_PACKET_CMD_MISS ? DPIF_UC_MISS
            : genl->cmd == OVS_PACKET_CMD_ACTION ? DPIF_UC_ACTION
            : -1);
    if (type < 0) {
        return EINVAL;
    }

    memset(upcall, 0, sizeof *upcall);
    upcall->type = type;
    upcall->packet = buf;
    upcall->packet->data = CONST_CAST(struct nlattr *, nl_attr_get(a[OVS_PACKET_ATTR_PACKET]));
    upcall->packet->size = nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]);
    upcall->key = CONST_CAST(struct nlattr *, nl_attr_get(a[OVS_PACKET_ATTR_KEY]));
    upcall->key_len = nl_attr_get_size(a[OVS_PACKET_ATTR_KEY]);
    upcall->userdata = (a[OVS_PACKET_ATTR_USERDATA]
                        ? nl_attr_get_u64(a[OVS_PACKET_ATTR_USERDATA])
                        : 0);
    *dp_ifindex = ovs_header->dp_ifindex;

    return 0;
}
static void handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls, size_t n_upcalls) {
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;
    /* Construct a to-do list: extract the flow from each packet, and gather packets that share
     * the same flow into a single "flow_miss" structure so that they can be processed together. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        /* Much like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key() (ofproto/ofproto-dpif.c)
         * converts the 'key_len' bytes of OVS_KEY_ATTR_* attributes in 'key' into a struct flow,
         * returning an ODP_FIT_* value that describes how well upcall->key matched our expectations. */
        miss->key_fitness = ofproto_dpif_extract_flow_key(ofproto, upcall->key, upcall->key_len,
                                                          &flow, &miss->initial_tci, upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }
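        /* (For reference: besides ODP_FIT_ERROR, the odp_key_fitness values defined in
         * lib/odp-util.h are, roughly, ODP_FIT_PERFECT, ODP_FIT_TOO_MUCH and
         * ODP_FIT_TOO_LITTLE, describing how well the datapath's key matches what
         * userspace expects; only ODP_FIT_ERROR aborts processing here.) */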
        /* flow_extract() (lib/flow.c) fills in the fields of 'miss->flow' from the packet contents
         * plus 'skb_priority', 'skb_mark', the tunnel metadata and the input port (important), and
         * as a side effect makes the layer pointers inside 'packet' valid (e.g. packet->l4). */
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, &flow.tunnel, flow.in_port, &miss->flow);
        /* Add the packet to the to-do list, keyed by a hash of the flow stored in struct flow_miss. */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);

            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }
    /* Now walk the to-do list and handle each flow miss; depending on whether the flow already
     * matches a facet, handle_flow_miss() dispatches to handle_flow_miss_without_facet() or
     * handle_flow_miss_with_facet(). */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
    /* dpif_operate() dispatches on each dpif_op's type to dpif_flow_put/del/execute__() (lib/dpif.c). */

    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }

        free(op->garbage);
    }
    hmap_destroy(&todo);
}
The details of how these upcalls are then processed are covered later.