When a packet arrives at the switch, the kernel datapath extracts a flow key and looks it up in its flow table. On a hit it executes the corresponding actions; on a miss it packs the packet into Netlink attributes and sends it up to the corresponding userspace process, where vswitchd handles it in handle_upcalls() (ofproto/ofproto-dpif.c). The main call path, roughly, is:
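(Only functions that appear later in this section are listed; the file names are their locations in the OVS source tree.)

handle_upcalls()                  ofproto/ofproto-dpif.c
  -> dpif_recv()                  lib/dpif.c
       -> dpif_linux_recv()       lib/dpif-linux.c
            -> nl_sock_recv()     lib/netlink-socket.c
            -> parse_odp_packet() lib/dpif-linux.c
  -> handle_miss_upcalls()        ofproto/ofproto-dpif.c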
struct dpif_upcall (lib/dpif.h) represents a single packet passed up from the datapath to userspace. (If 'key' or 'actions' is nonnull, the data it points to belongs to this packet's buffer, so it must not be freed separately.)
struct dpif_upcall {
    /* All types. */
    enum dpif_upcall_type type;
    struct ofpbuf *packet;      /* Packet data: the complete frame handed up by the
                                 * datapath (see parse_odp_packet() below). */
    struct nlattr *key;         /* Flow key. */
    size_t key_len;             /* Length of 'key' in bytes. */

    /* DPIF_UC_ACTION only. */
    uint64_t userdata;          /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
The possible upcall types are:
enum dpif_upcall_type {
    DPIF_UC_MISS,               /* Miss in flow table. */
    DPIF_UC_ACTION,             /* OVS_ACTION_ATTR_USERSPACE action. */
    DPIF_N_UC_TYPES
};
static int handle_upcalls(struct ofproto_dpif *ofproto, unsigned int max_batch) {
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses], sizeof miss_buf_stubs[n_misses]);
        /* ofpbuf_use_stub() sets up an ofpbuf backed by a stack buffer at 'base' (a uint32_t or
         * uint64_t array guarantees suitable alignment); if the ofpbuf later needs to grow, the
         * stub's contents are copied into a malloc()'d buffer, so ofpbuf_uninit() must eventually
         * be called to free that possible heap copy. */
        error = dpif_recv(ofproto->dpif, upcall, buf);
        /* dpif_recv() dispatches to the recv method of the concrete dpif class; here that is
         * dpif_linux_class, which talks to the local kernel datapath over Netlink
         * (dpif_netdev_class, by contrast, implements the datapath in userspace). */
        if (error) {
            ofpbuf_uninit(buf);     /* nothing (more) to receive, e.g. EAGAIN */
            break;
        }
        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            n_misses++;             /* Handle it later. */
            break;

        case SFLOW_UPCALL:
            if (ofproto->sflow) {
                handle_sflow_upcall(ofproto, upcall);
                /* For SFLOW_UPCALL and BAD_UPCALL the upcall is handled right here and the buffer
                 * holding the upcall message is freed; MISS_UPCALL packets are instead batched and
                 * later handed to handle_miss_upcalls().  (sFlow is a sampling-based monitoring
                 * protocol; sampled packets reach userspace as DPIF_UC_ACTION upcalls.) */
            }
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }

    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(ofproto, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }

    return n_processed;
}
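As an aside, the stub-buffer pattern used for 'miss_bufs' above reduces to the sketch below. It uses the real OVS helpers ofpbuf_use_stub() and ofpbuf_uninit() (lib/ofpbuf.c), but the wrapper function itself is illustrative only and assumes it is compiled inside the OVS tree:

#include "ofpbuf.h"                     /* OVS lib/ofpbuf.h */

static void
stub_buffer_example(void)
{
    uint64_t stub[4096 / 8];            /* stack storage; uint64_t keeps it aligned */
    struct ofpbuf b;

    ofpbuf_use_stub(&b, stub, sizeof stub);   /* backed by 'stub', nothing malloc()'d yet */
    /* ... fill 'b'; if it ever outgrows 'stub', its contents are copied into a
     * malloc()'d buffer and growth continues on the heap ... */
    ofpbuf_uninit(&b);                  /* frees the heap copy, if one was made */
}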
dpif_recv() polls 'dpif' for an upcall. If one is available it is described in '*upcall', with its data stored in 'buf'; the caller must previously have enabled receiving packets from the dpif with dpif_recv_set(). 'upcall->packet' and 'upcall->key' both point into data held by the caller-provided 'buf', so they must not be freed separately.
int dpif_recv(struct dpif *dpif, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    int error = dpif->dpif_class->recv(dpif, upcall, buf);
    /* The concrete implementation (here dpif_linux_recv(), shown below) does the actual work. */
    if (!error && !VLOG_DROP_DBG(&dpmsg_rl)) {
        struct ds flow;     /* dynamic string (lib/dynamic-string.h) */
        char *packet;

        /* ofp_packet_to_string() (lib/ofp-print.c) renders the Ethernet frame as a string. */
        packet = ofp_packet_to_string(upcall->packet->data, upcall->packet->size);

        ds_init(&flow);
        odp_flow_key_format(upcall->key, upcall->key_len, &flow);
        /* odp_flow_key_format() renders the OVS_KEY_ATTR_* attributes in upcall->key into
         * 'flow' as text, purely for this log message. */
        VLOG_DBG("%s: %s upcall:\n%s\n%s", dpif_name(dpif), dpif_upcall_type_to_string(upcall->type),
                 ds_cstr(&flow), packet);
        /* These debug messages show up when the 'dpif' log module is set to debug level
         * (e.g. "ovs-appctl vlog/set dpif:dbg"), subject to the dpmsg_rl rate limiter. */
        ds_destroy(&flow);
        free(packet);
    } else if (error && error != EAGAIN) {
        log_operation(dpif, "recv", error);
    }
    return error;
}
Next the recv method of dpif_linux_class runs, fetching the message from the kernel datapath (lib/dpif-linux.c):
const struct dpif_class dpif_linux_class = {
"system",
dpif_linux_enumerate,
dpif_linux_open,
dpif_linux_close,
dpif_linux_destroy,
dpif_linux_run,
dpif_linux_wait,
dpif_linux_get_stats,
dpif_linux_port_add,
dpif_linux_port_del,
dpif_linux_port_query_by_number,
dpif_linux_port_query_by_name,
dpif_linux_get_max_ports,
dpif_linux_port_get_pid,
dpif_linux_port_dump_start,
dpif_linux_port_dump_next,
dpif_linux_port_dump_done,
dpif_linux_port_poll,
dpif_linux_port_poll_wait,
dpif_linux_flow_get,
dpif_linux_flow_put,
dpif_linux_flow_del,
dpif_linux_flow_flush,
dpif_linux_flow_dump_start,
dpif_linux_flow_dump_next,
dpif_linux_flow_dump_done,
dpif_linux_execute,
dpif_linux_operate,
dpif_linux_recv_set,
dpif_linux_queue_to_priority,
dpif_linux_recv,
dpif_linux_recv_wait,
dpif_linux_recv_purge,
};
/* Datapath interface for the openvswitch Linux kernel module. */
struct dpif_linux {
    struct dpif dpif;
    int dp_ifindex;

    /* Upcall messages. */
    struct dpif_channel channels[N_CHANNELS];  /* Netlink channels carrying upcalls from the
                                                * kernel (see struct dpif_channel below). */
    uint32_t ready_mask;        /* 1-bit for each sock with unread messages. */
    int epoll_fd;               /* epoll fd that includes channel socks. */
    long long int next_scale;   /* Next time to scale down the sketches. */

    /* Change notification. */
    struct sset changed_ports;  /* Ports that have changed. */
    struct nln_notifier *port_notifier;
    bool change_error;

    /* Port number allocation. */
    uint16_t alloc_port_no;
};
Each dpif_channel is one Netlink channel between the kernel and userspace:
struct dpif_channel {
    struct nl_sock *sock;                     /* Netlink socket. */
    struct dpif_sketch sketches[N_SKETCHES];  /* From max to min 'hits'. */
    long long int last_poll;                  /* Last time this channel was polled. */
};
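dpif_linux_cast(), called at the top of dpif_linux_recv() below, is the usual container_of idiom (OVS provides a CONTAINER_OF macro in lib/util.h): given a pointer to the embedded 'struct dpif dpif' member, it recovers the enclosing struct dpif_linux. A self-contained toy illustration of the idea, with made-up struct names (not OVS code):

#include <stddef.h>     /* offsetof */
#include <stdio.h>

#define CONTAINER_OF(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

struct inner { int x; };
struct outer {
    char name[8];
    struct inner in;    /* embedded member, like 'struct dpif dpif' above */
};

int main(void)
{
    struct outer o = { "demo", { 42 } };
    struct inner *ip = &o.in;                          /* all the callee is handed */
    struct outer *op = CONTAINER_OF(ip, struct outer, in);

    printf("%s %d\n", op->name, op->in.x);             /* prints "demo 42" */
    return 0;
}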
static int dpif_linux_recv(struct dpif *dpif_, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    struct dpif_linux *dpif = dpif_linux_cast(dpif_);  /* CONTAINER_OF: recover the enclosing
                                                        * dpif_linux from the embedded dpif. */
    int read_tries = 0;

    if (dpif->epoll_fd < 0) {
        return EAGAIN;
    }

    if (!dpif->ready_mask) {
        struct epoll_event events[N_CHANNELS];
        int retval;
        int i;

        do {
            retval = epoll_wait(dpif->epoll_fd, events, N_CHANNELS, 0);
        } while (retval < 0 && errno == EINTR);
        if (retval < 0) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", strerror(errno));
        }

        for (i = 0; i < retval; i++) {
            dpif->ready_mask |= 1u << events[i].data.u32;   /* This channel has unread messages:
                                                             * set its bit in the mask. */
        }
    }

    while (dpif->ready_mask) {
        int indx = ffs(dpif->ready_mask) - 1;   /* Lowest set bit (minus one) indexes the ready channel. */
        struct dpif_channel *ch = &dpif->channels[indx];

        dpif->ready_mask &= ~(1u << indx);      /* Clear that bit so later iterations service other channels. */

        for (;;) {
            int dp_ifindex;
            int error;

            if (++read_tries > 50) {
                return EAGAIN;
            }

            error = nl_sock_recv(ch->sock, buf, false);
            if (error == ENOBUFS) {
                /* ENOBUFS typically means that we've received so many packets that the buffer
                 * overflowed.  Try again immediately because there's almost certainly a packet
                 * waiting for us. */
                report_loss(dpif_, ch);
                continue;
            }

            ch->last_poll = time_msec();
            if (error) {
                if (error == EAGAIN) {
                    break;
                }
                return error;
            }

            error = parse_odp_packet(buf, upcall, &dp_ifindex);
            /* The raw kernel message is now in 'buf'; parse_odp_packet() has decoded it, according
             * to the Netlink attribute policy, into '*upcall'. */
            if (!error && dp_ifindex == dpif->dp_ifindex) {
                const struct nlattr *in_port;

                in_port = nl_attr_find__(upcall->key, upcall->key_len, OVS_KEY_ATTR_IN_PORT);
                if (in_port) {
                    update_sketch(ch, nl_attr_get_u32(in_port));  /* Record that this channel received
                                                                   * a packet on this input port. */
                }
                return 0;
            }
            if (error) {
                return error;
            }
        }
    }

    return EAGAIN;
}
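The ready_mask handling above is a small bit-set dispatch: epoll_wait() reports which channels are readable, one bit per channel index is set in the mask, and ffs() then services them lowest-index-first. A self-contained toy version of just that pattern (illustrative only, not OVS code):

#include <stdio.h>
#include <strings.h>    /* ffs() */

int main(void)
{
    unsigned int ready_mask = 0;

    /* Pretend epoll_wait() reported events on channels 1 and 4. */
    ready_mask |= 1u << 1;
    ready_mask |= 1u << 4;

    while (ready_mask) {
        int indx = ffs(ready_mask) - 1;   /* lowest set bit -> channel index */

        ready_mask &= ~(1u << indx);      /* clear it so the loop terminates */
        printf("servicing channel %d\n", indx);
    }
    return 0;
}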
nl_sock_recv() receives a Netlink message from the kernel over 'sock' into an ofpbuf. If 'wait' is true it blocks until a message arrives; otherwise it returns EAGAIN when nothing is ready. The caller must supply a buffer with at least NLMSG_HDRLEN bytes allocated, and ordinarily should allocate enough space for a typical message; on success the received message is left in the ofpbuf, which is expanded if necessary.
int nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    int error = nl_sock_cow__(sock);
    /* If this socket is in the middle of a dump operation, nl_sock_cow__() swaps in a freshly
     * cloned socket so that the receive we are about to do does not disturb the dump. */
    if (error) {
        return error;
    }
    return nl_sock_recv__(sock, buf, wait);
}

static int nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
{
    struct nlmsghdr *nlmsghdr;
    uint8_t tail[65536];
    /* The exact size of the incoming message cannot be predicted, so reserve 64 kB of extra
     * scratch space just in case. */
    struct iovec iov[2];
    struct msghdr msg;
    ssize_t retval;

    assert(buf->allocated >= sizeof *nlmsghdr);
    ofpbuf_clear(buf);

    iov[0].iov_base = buf->base;
    iov[0].iov_len = buf->allocated;
    iov[1].iov_base = tail;
    iov[1].iov_len = sizeof tail;

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iov;
    msg.msg_iovlen = 2;
    /* recvmsg() takes a struct msghdr describing where to place the data; note how these
     * structures fit together: whatever arrives through 'msg' ends up in the memory that
     * 'buf' points to, with any excess spilling into 'tail'. */
    do {
        retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
    } while (retval < 0 && errno == EINTR);
    if (retval < 0) {
        int error = errno;
        if (error == ENOBUFS) {
            COVERAGE_INC(netlink_overflow);  /* The kernel dropped messages because the socket's
                                              * receive queue overflowed before we could read them. */
        }
        return error;
    }

    if (msg.msg_flags & MSG_TRUNC) {
        /* Unlike ENOBUFS above (messages lost in the socket queue), MSG_TRUNC means this one
         * message was larger than the space we supplied ('buf' plus the 64 kB 'tail') and was
         * cut short. */
        VLOG_ERR_RL(&rl, "truncated message (longer than %zu bytes)", sizeof tail);
        return E2BIG;
    }

    nlmsghdr = buf->data;
    if (retval < sizeof *nlmsghdr || nlmsghdr->nlmsg_len < sizeof *nlmsghdr || nlmsghdr->nlmsg_len > retval) {
        VLOG_ERR_RL(&rl, "received invalid nlmsg (%zd bytes < %zu)", retval, sizeof *nlmsghdr);
        return EPROTO;
    }

    if (STRESS(netlink_overflow)) {
        return ENOBUFS;
    }

    buf->size = MIN(retval, buf->allocated);
    if (retval > buf->allocated) {
        /* The message spilled past 'buf' into 'tail'; append the overflow so 'buf' holds it all. */
        COVERAGE_INC(netlink_recv_jumbo);
        ofpbuf_put(buf, tail, retval - buf->allocated);
    }

    log_nlmsg(__func__, 0, buf->data, buf->size, sock->protocol);
    COVERAGE_INC(netlink_received);
    return 0;
}
struct ofpbuf is a general-purpose buffer structure; it is reallocated (grown) when necessary:
struct ofpbuf {
    void *base;                 /* First byte of allocated space. */
    size_t allocated;           /* Number of bytes allocated. */
    enum ofpbuf_source source;  /* Source of memory allocated as 'base'. */
    void *data;                 /* First byte actually in use. */
    size_t size;                /* Number of bytes in use. */
    void *l2;                   /* Link-level header. */
    void *l3;                   /* Network-level header. */
    void *l4;                   /* Transport-level header. */
    void *l7;                   /* Application data. */
    struct list list_node;      /* Private list element for use by owner. */
    void *private_p;            /* Private pointer for use by owner. */
};
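parse_odp_packet() below peels the fixed Netlink headers off the buffer with ofpbuf_try_pull(). The real helper lives in lib/ofpbuf.c; conceptually it behaves roughly like this simplified sketch, which uses only the 'data' and 'size' fields shown above:

/* Simplified sketch of ofpbuf_try_pull(): hand back the current front of the
 * buffer and advance past it, or return NULL if not enough bytes remain. */
static void *
ofpbuf_try_pull_sketch(struct ofpbuf *b, size_t size)
{
    if (b->size < size) {
        return NULL;
    } else {
        void *header = b->data;
        b->data = (char *) b->data + size;
        b->size -= size;
        return header;
    }
}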
static int parse_odp_packet(struct ofpbuf *buf, struct dpif_upcall *upcall, int *dp_ifindex)
{
    static const struct nl_policy ovs_packet_policy[] = {
        /* Always present. */
        [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC, .min_len = ETH_HEADER_LEN },
        [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },

        /* OVS_PACKET_CMD_ACTION only. */
        [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_U64, .optional = true },
    };

    struct ovs_header *ovs_header;
    struct nlattr *a[ARRAY_SIZE(ovs_packet_policy)];
    struct nlmsghdr *nlmsg;
    struct genlmsghdr *genl;
    struct ofpbuf b;
    int type;

    ofpbuf_use_const(&b, buf->data, buf->size);  /* Wrap the received message in a read-only
                                                  * ofpbuf so it can be parsed in place. */
    nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
    genl = ofpbuf_try_pull(&b, sizeof *genl);
    ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
    /* Following the layout of the Netlink message, ofpbuf_try_pull() (lib/ofpbuf.c) peels off,
     * in turn, the nlmsghdr, the genlmsghdr and the ovs_header. */
    if (!nlmsg || !genl || !ovs_header || nlmsg->nlmsg_type != ovs_packet_family
        || !nl_policy_parse(&b, 0, ovs_packet_policy, a, ARRAY_SIZE(ovs_packet_policy))) {
        return EINVAL;
    }

    type = (genl->cmd == OVS_PACKET_CMD_MISS ? DPIF_UC_MISS
            : genl->cmd == OVS_PACKET_CMD_ACTION ? DPIF_UC_ACTION
            : -1);
    if (type < 0) {
        return EINVAL;
    }

    memset(upcall, 0, sizeof *upcall);
    upcall->type = type;
    upcall->packet = buf;
    upcall->packet->data = CONST_CAST(struct nlattr *, nl_attr_get(a[OVS_PACKET_ATTR_PACKET]));
    upcall->packet->size = nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]);
    upcall->key = CONST_CAST(struct nlattr *, nl_attr_get(a[OVS_PACKET_ATTR_KEY]));
    upcall->key_len = nl_attr_get_size(a[OVS_PACKET_ATTR_KEY]);
    upcall->userdata = (a[OVS_PACKET_ATTR_USERDATA]
                        ? nl_attr_get_u64(a[OVS_PACKET_ATTR_USERDATA])
                        : 0);
    *dp_ifindex = ovs_header->dp_ifindex;

    return 0;
}
static void handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls, size_t n_upcalls) {
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;
    /* Construct a to-do list: extract the flow from each packet, and gather packets that share
     * the same flow into a single "flow_miss" structure so that they can be processed together. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        /* Much like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key() (ofproto/ofproto-dpif.c)
         * converts the 'key_len' bytes of OVS_KEY_ATTR_* attributes in 'key' into a struct flow,
         * returning an ODP_FIT_* value that describes how well upcall->key matched our expectations. */
        miss->key_fitness = ofproto_dpif_extract_flow_key(ofproto, upcall->key, upcall->key_len,
                                                          &flow, &miss->initial_tci, upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }
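        /* (For reference: besides ODP_FIT_ERROR, the odp_key_fitness values defined in
         * lib/odp-util.h are, roughly, ODP_FIT_PERFECT, ODP_FIT_TOO_MUCH and
         * ODP_FIT_TOO_LITTLE, describing how well the datapath's key matches what
         * userspace expects; only ODP_FIT_ERROR aborts processing here.) */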
        /* flow_extract() (lib/flow.c) fills in the fields of 'miss->flow' from the packet contents
         * plus 'skb_priority', 'skb_mark', the tunnel metadata and the input port (important), and
         * as a side effect makes the layer pointers inside 'packet' valid (e.g. packet->l4). */
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, &flow.tunnel, flow.in_port, &miss->flow);
        /* Add the packet to the to-do list, keyed by a hash of the flow stored in struct flow_miss. */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);

            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }
    /* Now walk the to-do list and handle each flow miss; depending on whether the flow already
     * matches a facet, handle_flow_miss() dispatches to handle_flow_miss_without_facet() or
     * handle_flow_miss_with_facet(). */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
    /* dpif_operate() dispatches on each dpif_op's type to dpif_flow_put/del/execute__() (lib/dpif.c). */

    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }

        free(op->garbage);
    }
    hmap_destroy(&todo);
}
The details of how these upcalls are then processed are covered later.