/*
* Vhost-user RDMA device : init and packets forwarding
*
* Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
*
* Author: Xiong Weimin <xiongweimin@kylinos.cn>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
#include <rte_ethdev.h>
#include <rte_spinlock.h>
#include <rte_malloc.h>
#include "vhost_rdma.h"
#include "vhost_rdma_ib.h"
#include "vhost_rdma_log.h"
#include "vhost_rdma_pkt.h"
#define CHK_IOVEC(tp, iov) \
do { \
if ((iov)->iov_len < sizeof(*(tp))) { \
RDMA_LOG_ERR("%s: " #iov " iovec is too small: %zu, %zu", __func__, sizeof(*(tp)), (iov)->iov_len); \
return -1; \
} \
tp = (iov)->iov_base; \
} while (0)
#define DEFINE_VIRTIO_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd}
#define CTRL_NO_CMD __rte_unused struct iovec *__in
#define CTRL_NO_RSP __rte_unused struct iovec *__out
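/*
* The three macros above shape every control-queue handler: CHK_IOVEC()
* checks that an iovec is large enough for the expected struct and binds a
* typed pointer to it, DEFINE_VIRTIO_RDMA_CMD() fills one slot of the
* dispatch table with a handler and its printable name, and
* CTRL_NO_CMD/CTRL_NO_RSP name unused request/response parameters.
*
* A minimal handler sketch, assuming a hypothetical request struct
* vhost_rdma_cmd_example (illustrative only, not part of this driver):
*
*	static int
*	vhost_rdma_example(struct vhost_rdma_device *dev, struct iovec *in,
*			   CTRL_NO_RSP)
*	{
*		struct vhost_rdma_cmd_example *cmd;
*
*		CHK_IOVEC(cmd, in);	// returns -1 if in->iov_len is too small
*		(void)dev;
*		return 0;
*	}
*/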
/**
* @brief Free resources held by a response entry in the RDMA responder path.
*
* Cleans up the mbuf (for ATOMIC) or drops the MR reference (for RDMA READ),
* then resets the resource type to mark the entry free.
*
* @param qp Queue Pair owning the resource (used to drop the MR device reference)
* @param res Response resource to free (in/out)
*/
void
free_rd_atomic_resource(struct vhost_rdma_qp *qp,
struct vhost_rdma_resp_res *res)
{
if (!res) {
RDMA_LOG_ERR("Cannot free NULL response resource");
return;
}
switch (res->type) {
case VHOST_ATOMIC_MASK: {
struct rte_mbuf *mbuf = res->atomic.mbuf;
if (mbuf) {
RDMA_LOG_DEBUG("Freeing mbuf=%p from ATOMIC response", mbuf);
rte_pktmbuf_free(mbuf);
res->atomic.mbuf = NULL;
}
break;
}
case VHOST_READ_MASK: {
struct vhost_rdma_mr *mr = res->read.mr;
if (mr) {
RDMA_LOG_DEBUG("Dropping MR reference %p from RDMA READ response", mr);
vhost_rdma_drop_ref(mr, qp->dev, mr);
res->read.mr = NULL;
}
break;
}
case 0:
/* Already freed — silent no-op */
break;
default:
RDMA_LOG_ERR("Unknown response resource type %u (possible memory corruption)", res->type);
break;
}
/* Reset type to mark as free */
res->type = 0;
}
/**
* @brief Free all RD/Atomic response resources allocated for a Queue Pair.
*
* Iterates through the pre-allocated array of response tracking entries
* (used for RDMA READ and ATOMIC operations), frees associated mbufs or MRs,
* then releases the entire array memory.
*
* Safe to call multiple times (idempotent).
*
* @param qp Pointer to the Queue Pair whose response resources should be freed
*/
void
free_rd_atomic_resources(struct vhost_rdma_qp *qp)
{
if (!qp) {
RDMA_LOG_ERR("Cannot free response resources: qp is NULL");
return;
}
if (!qp->resp.resources) {
RDMA_LOG_DEBUG("No response resources to free for QP %u", qp->qpn);
return;
}
const uint32_t max_ops = qp->attr.max_dest_rd_atomic;
RDMA_LOG_DEBUG("Freeing %u RD/Atomic response resources for QP %u",
max_ops, qp->qpn);
for (uint32_t i = 0; i < max_ops; i++) {
struct vhost_rdma_resp_res *res = &qp->resp.resources[i];
/* Frees internal resources (mbuf or mr) and resets type */
free_rd_atomic_resource(qp, res);
}
/* Now free the entire array */
rte_free(qp->resp.resources);
qp->resp.resources = NULL;
RDMA_LOG_DEBUG("Successfully freed response resource array for QP %u", qp->qpn);
}
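/*
* Because free_rd_atomic_resource() resets res->type and this function sets
* qp->resp.resources to NULL, teardown paths may call it repeatedly without
* double-freeing, e.g. (illustrative only):
*
*	free_rd_atomic_resources(qp);
*	free_rd_atomic_resources(qp);	// second call just logs and returns
*/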
/**
* @brief Clean up a vhost RDMA queue.
*/
void
vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue)
{
if (!queue)
return;
if (queue->cb && qp)
rte_intr_callback_unregister(&queue->intr_handle, queue->cb, qp);
rte_free(queue->data);
queue->data = NULL;
}
/**
* @brief Cleanup callback for MR: reset type.
*/
void
vhost_rdma_mr_cleanup(void *arg)
{
struct vhost_rdma_mr *mr = arg;
if (mr)
mr->type = VHOST_MR_TYPE_NONE;
}
/**
* @brief Cleanup callback for QP: drop references and free resources.
*/
void
vhost_rdma_qp_cleanup(void *arg)
{
struct vhost_rdma_qp *qp = arg;
if (!qp)
return;
if (qp->scq) {
vhost_rdma_drop_ref(qp->scq, qp->dev, cq);
qp->scq = NULL;
}
if (qp->rcq) {
vhost_rdma_drop_ref(qp->rcq, qp->dev, cq);
qp->rcq = NULL;
}
if (qp->pd) {
vhost_rdma_drop_ref(qp->pd, qp->dev, pd);
qp->pd = NULL;
}
if (qp->resp.mr) {
vhost_rdma_drop_ref(qp->resp.mr, qp->dev, mr);
qp->resp.mr = NULL;
}
free_rd_atomic_resources(qp);
}
void
vhost_rdma_init_ib(struct vhost_rdma_device *dev)
{
uint32_t qpn;
if (!dev) {
return;
}
/* Initialize device attributes (virtio-rdma IB capability) */
dev->attr.max_qps = 64;
dev->attr.max_cqs = 64;
dev->attr.max_mr_size = UINT64_MAX;
dev->attr.page_size_cap = 0xFFFFF000U;
dev->attr.max_qp_wr = 1024;
dev->attr.device_cap_flags = VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN;
dev->attr.max_send_sge = 32;
dev->attr.max_recv_sge = 32;
dev->attr.max_sge_rd = 32;
dev->attr.max_cqe = 1024;
dev->attr.max_mr = 0x00001000;
dev->attr.max_mw = 0;
dev->attr.max_pd = 0x7FFC;
dev->attr.max_qp_rd_atom = 128;
dev->attr.max_qp_init_rd_atom = 128;
dev->attr.max_ah = 100;
dev->attr.max_fast_reg_page_list_len = 512;
dev->attr.local_ca_ack_delay = 15;
/* Point to the RDMA configuration structure for cleaner assignment */
struct vhost_rdma_config *cfg = &dev->rdma_config;
/* Copy basic limits from device attributes */
cfg->max_qp = dev->attr.max_qps;
cfg->max_cq = dev->attr.max_cqs;
cfg->max_mr = dev->attr.max_mr;
cfg->max_pd = dev->attr.max_pd;
cfg->max_ah = dev->attr.max_ah;
cfg->max_cqe = dev->attr.max_cqe;
cfg->max_qp_wr = dev->attr.max_qp_wr;
cfg->max_send_sge = dev->attr.max_send_sge;
cfg->max_recv_sge = dev->attr.max_recv_sge;
cfg->max_sge_rd = dev->attr.max_sge_rd;
cfg->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
cfg->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
cfg->max_mr_size = dev->attr.max_mr_size;
cfg->max_mw = dev->attr.max_mw;
cfg->max_fast_reg_page_list_len = dev->attr.max_fast_reg_page_list_len;
cfg->page_size_cap = dev->attr.page_size_cap;
cfg->device_cap_flag = dev->attr.device_cap_flags;
cfg->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
cfg->phys_port_cnt = 1;
cfg->max_pkeys = 1;
cfg->vendor_id = 0x1AF4;
cfg->vendor_part_id = 0x0042;
cfg->sys_image_guid = 1;
/* Derived capabilities */
cfg->max_res_rd_atom = cfg->max_qp_rd_atom * cfg->max_qp;
cfg->max_total_mcast_qp_attach = 8192UL * 56UL;
cfg->max_pi_fast_reg_page_list_len = cfg->max_fast_reg_page_list_len / 2;
/* Inline data and MTU settings */
dev->max_inline_data = dev->attr.max_send_sge * sizeof(struct vhost_user_rdma_sge);
dev->mtu_cap = ib_mtu_enum_to_int(DEFAULT_IB_MTU);
/* Reset port counters */
dev->port_attr.bad_pkey_cntr = 0;
dev->port_attr.qkey_viol_cntr = 0;
/* Initialize GID table (illegal by default) */
for (int i = 0; i < VHOST_MAX_GID_TBL_LEN; i++) {
dev->gid_tbl[i].type = VHOST_RDMA_GID_TYPE_ILLIGAL;
}
/* Set up virtqueue mappings:
* rdma_vqs[0] is the control queue (see vhost_rdma_handle_ctrl_vq),
* cq_vqs starts at index 1,
* qp_vqs follows after all CQ virtqueues.
*/
dev->cq_vqs = &dev->rdma_vqs[1];
dev->qp_vqs = &dev->rdma_vqs[1 + dev->attr.max_cqs];
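/*
* Resulting index map with the attribute values set above (max_cqs = 64),
* for reference:
*
*	rdma_vqs[0]        control virtqueue (see vhost_rdma_handle_ctrl_vq)
*	rdma_vqs[1..64]    completion-queue virtqueues (cq_vqs[0..63])
*	rdma_vqs[65..]     queue-pair virtqueues (qp_vqs[0..])
*/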
/* Initialize resource pools */
vhost_rdma_pool_init(&dev->pd_pool, "pd_pool", dev->attr.max_pd,
sizeof(struct vhost_rdma_pd), false, NULL);
vhost_rdma_pool_init(&dev->mr_pool, "mr_pool", dev->attr.max_mr,
sizeof(struct vhost_rdma_mr), false, vhost_rdma_mr_cleanup);
vhost_rdma_pool_init(&dev->cq_pool, "cq_pool", dev->attr.max_cqs,
sizeof(struct vhost_rdma_cq), true, NULL);
vhost_rdma_pool_init(&dev->qp_pool, "qp_pool", dev->attr.max_qps,
sizeof(struct vhost_rdma_qp), false, vhost_rdma_qp_cleanup);
vhost_rdma_pool_init(&dev->ah_pool, "ah_pool", dev->attr.max_ah,
sizeof(struct vhost_rdma_av), false, NULL);
/* Allocate the special GSI QP (QP number 1), which carries general services management datagrams */
dev->qp_gsi = vhost_rdma_pool_alloc(&dev->qp_pool, &qpn);
if (!dev->qp_gsi) {
RDMA_LOG_ERR("Failed to allocate GSI QP");
return;
}
vhost_rdma_add_ref(dev->qp_gsi); /* Hold a reference */
assert(qpn == 1); /* GSI must be assigned QPN 1 */
}
/**
* @brief Destroy and clean up all RDMA resources associated with the device.
*
* This function safely releases all allocated QPs, CQs, MRs, PDs, and AVs,
* then destroys their respective memory pools.
*
* Note: It assumes no external references exist to these objects.
*/
void
vhost_rdma_destroy_ib(struct vhost_rdma_device *dev)
{
struct vhost_rdma_mr *mr;
struct vhost_rdma_pd *pd;
struct vhost_rdma_cq *cq;
struct vhost_rdma_qp *qp;
struct vhost_rdma_av *av;
uint32_t i;
if (!dev) {
return;
}
/* Clean up Memory Regions (MR): cleanup callback may have already reset state */
for (i = 0; i < dev->attr.max_mr; i++) {
mr = vhost_rdma_pool_get(&dev->mr_pool, i);
if (mr) {
vhost_rdma_pool_free(&dev->mr_pool, i); /* Triggers cleanup if registered */
}
}
/* Clean up Protection Domains (PD) */
for (i = 0; i < dev->attr.max_pd; i++) {
pd = vhost_rdma_pool_get(&dev->pd_pool, i);
if (pd) {
vhost_rdma_pool_free(&dev->pd_pool, i);
}
}
/* Clean up Completion Queues (CQ) */
for (i = 0; i < dev->attr.max_cqs; i++) {
cq = vhost_rdma_pool_get(&dev->cq_pool, i);
if (cq) {
vhost_rdma_pool_free(&dev->cq_pool, i);
}
}
/* Clean up Queue Pairs (QP): must drain SQ/RQ before freeing */
for (i = 0; i < dev->attr.max_qps; i++) {
qp = vhost_rdma_pool_get(&dev->qp_pool, i);
if (qp) {
/* Cleanup send and receive queues (e.g., unregister intr handlers, free ring buffers) */
vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
vhost_rdma_queue_cleanup(qp, &qp->rq.queue);
/* Now free the QP from the pool (triggers vhost_rdma_qp_cleanup if set) */
vhost_rdma_pool_free(&dev->qp_pool, i);
}
}
/* Clean up Address Handles (AH / AV) */
for (i = 0; i < dev->attr.max_ah; i++) {
av = vhost_rdma_pool_get(&dev->ah_pool, i);
if (av) {
vhost_rdma_pool_free(&dev->ah_pool, i);
}
}
/*
* Destroy resource pools.
* This frees internal pool metadata and backing arrays.
* Pools should be empty at this point.
*/
vhost_rdma_pool_destroy(&dev->mr_pool);
vhost_rdma_pool_destroy(&dev->pd_pool);
vhost_rdma_pool_destroy(&dev->cq_pool);
vhost_rdma_pool_destroy(&dev->qp_pool);
vhost_rdma_pool_destroy(&dev->ah_pool);
}
/**
* @brief Convert a guest physical address payload into iovec entries.
*
* This function translates a contiguous memory region (starting at 'payload'
* with length 'remaining') into one or more iovecs by looking up the virtual
* address via gpa_to_vva(). The resulting iovecs are stored in 'iovs', and
* 'iov_index' is updated accordingly.
*
* @param mem Pointer to vhost memory structure for GPA->VVA translation.
* @param iovs Array of iovec structures to fill.
* @param iov_index Current index in the iovs array (updated on success).
* @param payload Guest physical address (GPA) of the data.
* @param remaining Total number of bytes left to translate.
* @param num_iovs Maximum number of iovecs allowed.
* @return 0 on success, -1 on error (e.g., translation failure or overflow).
*/
static int
desc_payload_to_iovs(struct rte_vhost_memory *mem,
struct iovec *iovs,
uint32_t *iov_index,
uintptr_t payload,
uint64_t remaining,
uint16_t num_iovs)
{
void *vva;
uint64_t len;
do {
if (*iov_index >= num_iovs) {
RDMA_LOG_ERR("MAX_IOVS reached");
return -1;
}
len = remaining;
vva = (void *)(uintptr_t)gpa_to_vva(mem, payload, &len);
if (!vva || !len) {
RDMA_LOG_ERR("failed to translate desc address.");
return -1;
}
iovs[*iov_index].iov_base = vva;
iovs[*iov_index].iov_len = len;
payload += len;
remaining -= len;
(*iov_index)++;
} while (remaining);
return 0;
}
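/*
* Worked example (numbers are illustrative only): if 'payload' addresses 8 KiB
* of guest memory that crosses a memory-region boundary after 3 KiB,
* gpa_to_vva() caps the first translation at 3 KiB, so the loop above emits
* two iovecs (3 KiB + 5 KiB), advancing 'payload' and 'remaining' each pass.
*/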
/**
* @brief Set up iovecs from vring descriptors for a given request.
*
* Parses the descriptor chain starting at 'req_idx'. Handles both direct and
* indirect descriptors. Fills the provided 'iovs' array with valid memory
* regions derived from GPA-to-VVA translation. Also counts input/output descriptors.
*
* @param mem Vhost memory configuration for address translation.
* @param vq Virtual queue containing the descriptor ring.
* @param req_idx Index of the first descriptor in the chain.
* @param iovs Pre-allocated iovec array to populate.
* @param num_iovs Size of the iovs array (maximum entries).
* @param num_in Output: number of writable (input) descriptors.
* @param num_out Output: number of readable (output) descriptors.
* @return Number of filled iovecs on success, -1 on error.
*/
int
setup_iovs_from_descs(struct rte_vhost_memory *mem,
struct vhost_user_queue *vq,
uint16_t req_idx,
struct iovec *iovs,
uint16_t num_iovs,
uint16_t *num_in,
uint16_t *num_out)
{
struct vring_desc *desc = &vq->vring.desc[req_idx];
struct vring_desc *desc_table;
uint32_t iovs_idx = 0;
uint64_t len;
uint16_t in = 0, out = 0;
/* Handle indirect descriptors */
if (desc->flags & VRING_DESC_F_INDIRECT) {
len = desc->len;
desc_table = (struct vring_desc *)(uintptr_t)gpa_to_vva(mem, desc->addr, &len);
if (!desc_table || !len) {
RDMA_LOG_ERR("failed to translate desc address.");
return -1;
}
assert(len == desc->len);
desc = desc_table;
} else {
desc_table = vq->vring.desc;
}
/* Walk through descriptor chain */
do {
if (iovs_idx >= num_iovs) {
RDMA_LOG_ERR("MAX_IOVS reached\n");
return -1;
}
if (desc->flags & VRING_DESC_F_WRITE) {
in++; /* Descriptor allows write from device perspective (input) */
} else {
out++; /* Descriptor allows read (output) */
}
/* Translate payload (address + length) into iovec(s) */
if (desc_payload_to_iovs(mem, iovs,
&iovs_idx,
desc->addr,
desc->len,
num_iovs) != 0) {
RDMA_LOG_ERR("Failed to convert desc payload to iovs");
return -1;
}
/* Move to next descriptor in chain */
desc = vhost_rdma_vring_get_next_desc(desc_table, desc);
} while (desc != NULL);
*num_in = in;
*num_out = out;
return iovs_idx;
}
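/*
* Typical caller pattern (mirrors vhost_rdma_handle_ctrl_vq below): since the
* virtio convention places device-readable descriptors before device-writable
* ones in a chain, the request and response views are plain offsets into the
* filled array. Sketch, with names matching that caller:
*
*	struct iovec iovs[4];
*	uint16_t num_in, num_out;
*	int n = setup_iovs_from_descs(dev->mem, vq, desc_idx, iovs, 4,
*				      &num_in, &num_out);
*	if (n < 0)
*		return;				// translation failed
*	struct iovec *out_iovs = iovs;		// device reads these
*	struct iovec *in_iovs = &iovs[num_out];	// device writes these
*/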
static int
vhost_rdma_query_device(struct vhost_rdma_device *dev, CTRL_NO_CMD,
struct iovec *out)
{
struct vhost_rdma_ack_query_device *rsp;
CHK_IOVEC(rsp, out);
rsp->max_mr_size = dev->attr.max_mr_size;
rsp->page_size_cap = dev->attr.page_size_cap;
rsp->max_qp_wr = dev->attr.max_qp_wr;
rsp->device_cap_flags = dev->attr.device_cap_flags;
rsp->max_send_sge = dev->attr.max_send_sge;
rsp->max_recv_sge = dev->attr.max_recv_sge;
rsp->max_sge_rd = dev->attr.max_sge_rd;
rsp->max_cqe = dev->attr.max_cqe;
rsp->max_mr = dev->attr.max_mr;
rsp->max_pd = dev->attr.max_pd;
rsp->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
rsp->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
rsp->max_ah = dev->attr.max_ah;
rsp->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
return 0;
}
static int
vhost_rdma_query_port(__rte_unused struct vhost_rdma_device *dev,
CTRL_NO_CMD,
struct iovec *out)
{
struct vhost_rdma_ack_query_port *rsp;
CHK_IOVEC(rsp, out);
rsp->gid_tbl_len = VHOST_MAX_GID_TBL_LEN;
rsp->max_msg_sz = 0x800000;
rsp->active_mtu = VHOST_RDMA_IB_MTU_256;
rsp->phys_mtu = VHOST_RDMA_IB_MTU_256;
rsp->port_cap_flags = 65536UL;
rsp->bad_pkey_cntr = 0UL;
rsp->phys_state = VHOST_RDMA_IB_PORT_PHYS_STATE_POLLING;
rsp->pkey_tbl_len = 1UL;
rsp->qkey_viol_cntr = 0UL;
rsp->state = VHOST_RDMA_IB_PORT_DOWN;
rsp->active_speed = 1UL;
rsp->active_width = VHOST_RDMA_IB_WIDTH_1X;
rsp->max_mtu = VHOST_RDMA_IB_MTU_4096;
return 0;
}
/* Command handler table declaration */
struct {
int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
const char *name; /* Name of the command (for logging) */
} cmd_tbl[] = {
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port),
};
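/*
* Adding a control command is one table entry here plus a handler above, e.g.
* (assuming a VHOST_RDMA_CTRL_ROCE_CREATE_CQ opcode and matching handler
* exist; shown for illustration only):
*
*	DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq),
*
* The designated initializer keeps the table indexed by opcode, and the #cmd
* stringification supplies the name used by the dispatch log below.
*/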
/**
* @brief Main handler for control virtqueue events.
*
* Processes incoming requests from the control virtual queue. Waits for kick
* notification via eventfd, then processes available descriptor chains.
* Each chain contains a header followed by optional input/output data.
* Executes corresponding handler based on command ID.
*
* @param arg Pointer to vhost_rdma_device instance.
*/
void
vhost_rdma_handle_ctrl_vq(void *arg)
{
struct vhost_rdma_device *dev = arg;
struct vhost_rdma_ctrl_hdr *hdr;
struct vhost_user_queue *ctrl_vq = &dev->rdma_vqs[0];
struct iovec data_iovs[4]; /* Fixed-size iovec buffer */
struct iovec *in_iovs, *out_iovs;
uint16_t desc_idx, num_in, num_out;
uint8_t *status;
int kick_fd, nbytes, i, in_len;
kick_fd = ctrl_vq->vring.kickfd;
/* Wait until we get a valid kick (notification) */
do {
uint64_t kick_data;
nbytes = eventfd_read(kick_fd, &kick_data);
if (nbytes < 0) {
if (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN) {
continue; /* Retry on transient errors */
}
RDMA_LOG_ERR("Failed to read kickfd of ctrl virtq: %s", strerror(errno));
}
break;
} while (1);
/* Process all available requests in the control queue */
while (vhost_rdma_vq_is_avail(ctrl_vq)) {
desc_idx = vhost_rdma_vq_get_desc_idx(ctrl_vq);
/* Build iovecs from descriptor chain */
if (setup_iovs_from_descs(dev->mem, ctrl_vq,
desc_idx, data_iovs, 4,
&num_in, &num_out) < 0) {
RDMA_LOG_ERR("read from desc failed");
break;
}
/* Split iovecs into output (device reads) and input (device writes) */
out_iovs = data_iovs;
in_iovs = &data_iovs[num_out];
in_len = 0;
/* Calculate total input data length */
for (i = 0; i < num_in; i++) {
in_len += in_iovs[i].iov_len;
}
/* First output iovec should contain the control header */
hdr = (struct vhost_rdma_ctrl_hdr *)out_iovs[0].iov_base;
status = (uint8_t *)in_iovs[0].iov_base;
/* Validate header size */
if (out_iovs[0].iov_len != sizeof(*hdr)) {
RDMA_LOG_ERR("invalid header");
*status = VIRTIO_NET_ERR;
goto pushq;
}
/* Check if command ID is within valid range */
if (hdr->cmd >= (sizeof(cmd_tbl) / sizeof(cmd_tbl[0])) || cmd_tbl[hdr->cmd].handler == NULL) {
RDMA_LOG_ERR("unknown cmd %d", hdr->cmd);
*status = VIRTIO_NET_ERR;
goto pushq;
}
/* Dispatch command handler; set status based on result */
*status = (cmd_tbl[hdr->cmd].handler(dev,
num_out > 1 ? &out_iovs[1] : NULL,
num_in > 1 ? &in_iovs[1] : NULL) == 0)
? VIRTIO_NET_OK
: VIRTIO_NET_ERR;
pushq:
/* Log command execution result; guard the name lookup against out-of-range commands */
RDMA_LOG_INFO("cmd=%d %s status: %d",
hdr->cmd,
(hdr->cmd < (sizeof(cmd_tbl) / sizeof(cmd_tbl[0])) && cmd_tbl[hdr->cmd].name) ? cmd_tbl[hdr->cmd].name : "unknown",
*status);
/* Return used descriptor to the avail ring and notify frontend */
vhost_rdma_queue_push(ctrl_vq, desc_idx, in_len);
vhost_rdma_queue_notify(dev->vid, ctrl_vq);
}
}
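/**
* @brief Data-path task scheduler entry point.
*
* The signature matches an rte_eal remote-launch function; the current
* implementation performs no work and returns 0.
*/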
int
vhost_rdma_task_scheduler(void *arg)
{
return 0;
}