#define SIG_ERR (void(*)())-1 的问题

本文详细解析了C/C++中signal函数如何处理信号处理函数指针,包括SIG_ERR, SIG_DFL, SIG_IGN宏的作用及内部逻辑,以及如何在函数中正确使用这些宏。

整理自:http://topic.youkuaiyun.com/u/20091107/09/b7841460-1c2a-4927-a4b3-3d11ac948460.html


  在学习APUE时遇到

#define SIG_ERR (void(*)())-1

#define SIG_DFL (void(*)())0

#define SIG_IGN (void(*)())1


signal 的第二个参数是函数指针类型,直接传整数 -1、0、1 会因整数与函数指针类型不匹配而产生编译诊断(C 中属于约束违例,C++ 中直接报错),
所以宏里先把这些整数值强制转换成函数指针类型,signal 的实现内部再根据传入指针的具体取值作相应处理,我觉得

他这么做是为了让 -1、0、1 这些特殊值能以函数指针类型传进 signal;signal 内部只会把传入的指针值与这几个宏比较来判断调用者的意图,绝不会对 SIG_ERR/SIG_DFL/SIG_IGN 这几个特殊值进行函数调用——它们并不指向有效代码,调用必然出错。普通的处理函数指针也只是被登记下来,等信号真正发生时才由系统异步调用。

C/C++ code

/* Shorthand for the handler-pointer type used by signal().
 * NOTE(review): historical APUE style with an empty parameter list;
 * POSIX declares the handler as void (*)(int). */
typedef void (*sighandler_t)();
/* Special sentinel values: integers cast to the handler-pointer type.
 * They are compared against, never called. */
#define SIG_ERR (sighandler_t)-1   /* returned by signal() on error */
#define SIG_DFL (sighandler_t)0    /* request the default disposition */
#define SIG_IGN (sighandler_t)1    /* request that the signal be ignored */


而signal里面会有类似如下的一些逻辑

C/C++ code

/*
 * Illustrative sketch of signal()'s internal logic.
 *
 * The sentinel macros are only *compared* against the incoming pointer —
 * they are never invoked. An ordinary handler pointer is not invoked here
 * either: it is merely recorded, and the system calls it asynchronously
 * when the signal is actually delivered. The real signal() returns the
 * previously installed handler (or SIG_ERR on failure), so the sketch
 * uses an explicit return type instead of the pre-C99 implicit int.
 */
sighandler_t signal(int sig, sighandler_t handler)
{
    if (handler == SIG_ERR) { /* invalid request: report failure */ }
    else if (handler == SIG_DFL) { /* restore the default disposition */ }
    else if (handler == SIG_IGN) { /* mark the signal as ignored */ }
    else { /* record `handler` for later asynchronous delivery — NOT called now */ }
    return SIG_ERR; /* sketch only: real code returns the previous handler */
}







/* * Vhost-user RDMA device : init and packets forwarding * * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved. * * Author: Xiong Weimin <xiongweimin@kylinos.cn> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #ifndef __VHOST_RDMA_IB_H__ #define __VHOST_RDMA_IB_H__ #include <netinet/in.h> #include <linux/virtio_net.h> #include <rte_spinlock.h> #include <rte_atomic.h> #include <rte_timer.h> #include "vhost_rdma.h" #include "eal_interrupts.h" #define OPCODE_NONE (-1) #define VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN (1 << 0) #define VHOST_USER_MEMORY_MAX_NREGIONS 8 #define VHOST_USER_MAX_CONFIG_SIZE 256 #define VHOST_RDMA_CTRL_ROCE 6 #define VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE 0 #define VHOST_RDMA_CTRL_ROCE_QUERY_PORT 1 #define VHOST_RDMA_CTRL_ROCE_CREATE_CQ 2 #define VHOST_RDMA_CTRL_ROCE_DESTROY_CQ 3 #define VHOST_RDMA_CTRL_ROCE_CREATE_PD 4 #define VHOST_RDMA_CTRL_ROCE_DESTROY_PD 5 #define VHOST_RDMA_CTRL_ROCE_GET_DMA_MR 6 #define VHOST_RDMA_CTRL_ROCE_ALLOC_MR 7 #define VHOST_RDMA_CTRL_ROCE_REG_USER_MR 9 #define VHOST_RDMA_CTRL_ROCE_MAP_MR_SG 8 #define VHOST_RDMA_CTRL_ROCE_DEREG_MR 10 #define VHOST_RDMA_CTRL_ROCE_CREATE_QP 11 #define VHOST_RDMA_CTRL_ROCE_MODIFY_QP 12 #define VHOST_RDMA_CTRL_ROCE_QUERY_QP 13 #define VHOST_RDMA_CTRL_ROCE_DESTROY_QP 14 #define VHOST_RDMA_CTRL_ROCE_QUERY_PKEY 15 // #define VHOST_RDMA_CTRL_ROCE_CREATE_AH 13 // #define VHOST_RDMA_CTRL_ROCE_DESTROY_AH 14 #define VHOST_RDMA_CTRL_ROCE_ADD_GID 16 #define VHOST_RDMA_CTRL_ROCE_DEL_GID 17 #define VHOST_RDMA_CTRL_ROCE_REQ_NOTIFY_CQ 18 enum vhost_rdma_ib_qp_state { VHOST_RDMA_IB_QPS_RESET, VHOST_RDMA_IB_QPS_INIT, VHOST_RDMA_IB_QPS_RTR, VHOST_RDMA_IB_QPS_RTS, VHOST_RDMA_IB_QPS_SQD, VHOST_RDMA_IB_QPS_SQE, VHOST_RDMA_IB_QPS_ERR }; enum vhost_rdma_ib_mtu { VHOST_RDMA_IB_MTU_256 = 1, 
VHOST_RDMA_IB_MTU_512 = 2, VHOST_RDMA_IB_MTU_1024 = 3, VHOST_RDMA_IB_MTU_2048 = 4, VHOST_RDMA_IB_MTU_4096 = 5 }; enum vhost_rdma_ib_wc_status { /* Operation completed successfully */ VHOST_RDMA_IB_WC_SUCCESS, /* Local Length Error */ VHOST_RDMA_IB_WC_LOC_LEN_ERR, /* Local QP Operation Error */ VHOST_RDMA_IB_WC_LOC_QP_OP_ERR, /* Local Protection Error */ VHOST_RDMA_IB_WC_LOC_PROT_ERR, /* Work Request Flushed Error */ VHOST_RDMA_IB_WC_WR_FLUSH_ERR, /* Bad Response Error */ VHOST_RDMA_IB_WC_BAD_RESP_ERR, /* Local Access Error */ VHOST_RDMA_IB_WC_LOC_ACCESS_ERR, /* Remote Invalid Request Error */ VHOST_RDMA_IB_WC_REM_INV_REQ_ERR, /* Remote Access Error */ VHOST_RDMA_IB_WC_REM_ACCESS_ERR, /* Remote Operation Error */ VHOST_RDMA_IB_WC_REM_OP_ERR, /* Transport Retry Counter Exceeded */ VHOST_RDMA_IB_WC_RETRY_EXC_ERR, /* RNR Retry Counter Exceeded */ VHOST_RDMA_IB_WC_RNR_RETRY_EXC_ERR, /* Remote Aborted Error */ VHOST_RDMA_IB_WC_REM_ABORT_ERR, /* Fatal Error */ VHOST_RDMA_IB_WC_FATAL_ERR, /* Response Timeout Error */ VHOST_RDMA_IB_WC_RESP_TIMEOUT_ERR, /* General Error */ VHOST_RDMA_IB_WC_GENERAL_ERR }; enum vhost_rdma_res_state { VHOST_RDMA_RES_STATE_NEXT, VHOST_RDMA_RES_STATE_NEW, VHOST_RDMA_RES_STATE_REPLAY, }; enum vhost_user_rdma_request { VHOST_USER_NONE = 0, VHOST_USER_GET_FEATURES = 1, VHOST_USER_SET_FEATURES = 2, VHOST_USER_SET_OWNER = 3, VHOST_USER_RESET_OWNER = 4, VHOST_USER_SET_MEM_TABLE = 5, VHOST_USER_SET_LOG_BASE = 6, VHOST_USER_SET_LOG_FD = 7, VHOST_USER_SET_VRING_NUM = 8, VHOST_USER_SET_VRING_ADDR = 9, VHOST_USER_SET_VRING_BASE = 10, VHOST_USER_GET_VRING_BASE = 11, VHOST_USER_SET_VRING_KICK = 12, VHOST_USER_SET_VRING_CALL = 13, VHOST_USER_SET_VRING_ERR = 14, VHOST_USER_GET_PROTOCOL_FEATURES = 15, VHOST_USER_SET_PROTOCOL_FEATURES = 16, VHOST_USER_GET_QUEUE_NUM = 17, VHOST_USER_SET_VRING_ENABLE = 18, VHOST_USER_GET_CONFIG = 24, VHOST_USER_SET_CONFIG = 25, VHOST_USER_MAX }; struct vhost_rdma_qp_cap { uint32_t max_send_wr; uint32_t max_send_sge; uint32_t 
max_recv_wr; uint32_t max_recv_sge; uint32_t max_inline_data; }; struct vhost_rdma_global_route { /* Destination GID or MGID */ uint8_t dgid[16]; /* Flow label */ uint32_t flow_label; /* Source GID index */ uint8_t sgid_index; /* Hop limit */ uint8_t hop_limit; /* Traffic class */ uint8_t traffic_class; }; struct vhost_rdma_ah_attr { /* Global Routing Header (GRH) attributes */ struct vhost_rdma_global_route grh; uint8_t sl; uint8_t static_rate; uint8_t port_num; uint8_t ah_flags; /* Destination MAC address */ uint8_t dmac[6]; }; struct vhost_rdma_qp_attr { enum vhost_rdma_ib_qp_state qp_state; enum vhost_rdma_ib_qp_state cur_qp_state; enum vhost_rdma_ib_mtu path_mtu; uint32_t qkey; uint32_t rq_psn; uint32_t sq_psn; uint32_t dest_qp_num; uint32_t qp_access_flags; uint8_t sq_draining; uint8_t max_rd_atomic; uint8_t max_dest_rd_atomic; uint8_t min_rnr_timer; uint8_t timeout; uint8_t retry_cnt; uint8_t rnr_retry; uint32_t rate_limit; struct vhost_rdma_qp_cap cap; struct vhost_rdma_ah_attr ah_attr; }; struct vhost_rdma_pd { struct vhost_rdma_device *dev; uint32_t pdn; rte_atomic32_t refcnt; }; struct vhost_rdma_queue { struct vhost_queue *vq; void *data; size_t elem_size; size_t num_elems; uint16_t consumer_index; uint16_t producer_index; struct rte_intr_handle intr_handle; rte_intr_callback_fn cb; }; /** Fixed-size vhost_memory struct */ struct vhost_memory_padded { uint32_t nregions; uint32_t padding; struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS]; }; /** Get/set config msg payload */ struct vhost_user_rdma_config { uint32_t offset; uint32_t size; uint32_t flags; uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; }; struct vhost_user_rdma_msg { enum vhost_user_rdma_request request; #define VHOST_USER_VERSION_MASK 0x3 #define VHOST_USER_REPLY_MASK (0x1 << 2) uint32_t flags; uint32_t size; /**< the following payload size */ union { #define VHOST_USER_VRING_IDX_MASK 0xff #define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) uint64_t u64; struct vhost_vring_state 
state; struct vhost_vring_addr addr; struct vhost_memory_padded memory; struct vhost_user_rdma_config cfg; } payload; } __rte_packed; struct vhost_rdma_cq { struct vhost_queue *vq; rte_spinlock_t cq_lock; uint8_t notify; bool is_dying; uint32_t cqn; rte_atomic32_t refcnt; }; struct vhost_rdma_sq { rte_spinlock_t lock; /* guard queue */ struct vhost_rdma_queue queue; }; struct vhost_rdma_rq { rte_spinlock_t lock; /* guard queue */ struct vhost_rdma_queue queue; }; struct vhost_rdma_av { /* From RXE_NETWORK_TYPE_* */ uint8_t network_type; uint8_t dmac[6]; struct vhost_rdma_global_route grh; union { struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; }; struct vhost_rdma_task { char name[8]; int state; bool destroyed; rte_atomic16_t sched; rte_spinlock_t state_lock; /* spinlock for task state */ struct rte_ring *task_ring; int (*func)(void *arg); void *arg; int ret; }; struct vhost_rdma_req_info { enum vhost_rdma_ib_qp_state state; int wqe_index; uint32_t psn; int opcode; rte_atomic32_t rd_atomic; int wait_fence; int need_rd_atomic; int wait_psn; int need_retry; int noack_pkts; struct vhost_rdma_task task; }; struct vhost_rdma_comp_info { uint32_t psn; int opcode; int timeout; int timeout_retry; int started_retry; uint32_t retry_cnt; uint32_t rnr_retry; struct vhost_rdma_task task; }; struct vhost_rdma_sge { __le64 addr; __le32 length; __le32 lkey; }; struct vhost_rdma_dma_info { uint32_t length; uint32_t resid; uint32_t cur_sge; uint32_t num_sge; uint32_t sge_offset; uint32_t reserved; union { uint8_t *inline_data; struct vhost_rdma_sge *sge; void *raw; }; }; struct vhost_rdma_recv_wqe { __aligned_u64 wr_id; __u32 num_sge; __u32 padding; struct vhost_rdma_dma_info dma; }; enum vhost_rdma_mr_type { VHOST_MR_TYPE_NONE, VHOST_MR_TYPE_DMA, VHOST_MR_TYPE_MR, }; enum vhost_rdma_mr_state { VHOST_MR_STATE_ZOMBIE, VHOST_MR_STATE_INVALID, VHOST_MR_STATE_FREE, VHOST_MR_STATE_VALID, }; struct vhost_rdma_mr { struct vhost_rdma_pd *pd; enum 
vhost_rdma_mr_type type; enum vhost_rdma_mr_state state; uint64_t va; uint64_t iova; size_t length; uint32_t offset; int access; uint32_t lkey; uint32_t rkey; uint32_t npages; uint32_t max_pages; uint64_t *pages; uint32_t mrn; rte_atomic32_t refcnt; }; struct vhost_rdma_resp_res { int type; int replay; uint32_t first_psn; uint32_t last_psn; uint32_t cur_psn; enum vhost_rdma_res_state state; union { struct { struct rte_mbuf *mbuf; } atomic; struct { struct vhost_rdma_mr *mr; uint64_t va_org; uint32_t rkey; uint32_t length; uint64_t va; uint32_t resid; } read; }; }; struct vhost_rdma_resp_info { enum vhost_rdma_ib_qp_state state; uint32_t msn; uint32_t psn; uint32_t ack_psn; int opcode; int drop_msg; int goto_error; int sent_psn_nak; enum vhost_rdma_ib_wc_status status; uint8_t aeth_syndrome; /* Receive only */ struct vhost_rdma_recv_wqe *wqe; /* RDMA read / atomic only */ uint64_t va; uint64_t offset; struct vhost_rdma_mr *mr; uint32_t resid; uint32_t rkey; uint32_t length; uint64_t atomic_orig; /* Responder resources. It's a circular list where the oldest * resource is dropped first. */ struct vhost_rdma_resp_res *resources; unsigned int res_head; unsigned int res_tail; struct vhost_rdma_resp_res *res; struct vhost_rdma_task task; }; struct vhost_rdma_qp { struct vhost_rdma_device *dev; struct vhost_rdma_qp_attr attr; uint32_t qpn; uint8_t type; unsigned int valid; unsigned int mtu; struct vhost_rdma_pd *pd; struct vhost_rdma_cq *scq; struct vhost_rdma_cq *rcq; uint8_t sq_sig_all; struct vhost_rdma_sq sq; struct vhost_rdma_rq rq; void *srq; // reversed uint32_t dst_cookie; uint16_t src_port; struct vhost_rdma_av av; struct rte_ring *req_pkts; struct rte_mbuf *req_pkts_head; // use this to support peek struct rte_ring *resp_pkts; struct vhost_rdma_req_info req; struct vhost_rdma_comp_info comp; struct vhost_rdma_resp_info resp; rte_atomic32_t ssn; rte_atomic32_t mbuf_out; int need_req_mbuf; /* Timer for retranmitting packet when ACKs have been lost. RC * only. 
The requester sets it when it is not already * started. The responder resets it whenever an ack is * received. */ struct rte_timer retrans_timer; uint64_t qp_timeout_ticks; /* Timer for handling RNR NAKS. */ struct rte_timer rnr_nak_timer; rte_spinlock_t state_lock; /* guard requester and completer */ rte_atomic32_t refcnt; }; struct vhost_user_rdma_sge { uint64_t addr; uint32_t length; uint32_t lkey; }; static inline int ib_mtu_enum_to_int(enum vhost_rdma_ib_mtu mtu) { switch (mtu) { case VHOST_RDMA_IB_MTU_256: return 256; case VHOST_RDMA_IB_MTU_512: return 512; case VHOST_RDMA_IB_MTU_1024: return 1024; case VHOST_RDMA_IB_MTU_2048: return 2048; case VHOST_RDMA_IB_MTU_4096: return 4096; default: return -1; } } void vhost_rdma_init_ib(struct vhost_rdma_device *dev); void vhost_rdma_destroy_ib(struct vhost_rdma_device *dev); void vhost_rdma_handle_ctrl_vq(void* arg); int vhost_rdma_task_scheduler(void *arg); void free_rd_atomic_resource(struct vhost_rdma_qp *qp, struct vhost_rdma_resp_res *res); void free_rd_atomic_resources(struct vhost_rdma_qp *qp); void vhost_rdma_mr_cleanup(void* arg); void vhost_rdma_qp_cleanup(void* arg); void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue* queue); #endif 也改一下
10-10
From 7264a58404b1b24464fac6c564dbb94e8d137a30 Mon Sep 17 00:00:00 2001 From: xiongweimin <xiongweimin@kylinos.cn> Date: Tue, 16 Dec 2025 10:19:42 +0800 Subject: [PATCH 5/8] examples/vhost_user_rdma: implement advanced completer engine with reliability features This commit adds the completer engine for RDMA operations with: 1. State machine for ACK packet processing 2. PSN-based sequence validation 3. Reliability mechanisms (retry, RNR backoff) 4. Atomic operation execution 5. Comprehensive error handling 6. Performance counters for diagnostics Key features: - 11-state processing pipeline for response handling - Dynamic retransmission timer management - RNR NAK timer for flow control - Packet lifetime tracking (mbuf release) - Work completion error propagation - Congestion-aware task scheduling Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> Change-Id: I12a7baf03edffcd66da7bdc84218001c6bf3a0de examples/vhost_user_rdma: implement P_Key query operation with default partition key This commit adds support for the IB_QUERY_PKEY command: 1. Implements mandatory InfiniBand partition key query 2. Provides default full-membership P_Key (0xFFFF) 3. Includes I/O vector safety validation 4. Maintains compatibility with standard IB management tools Key features: - Hardcoded default P_Key for simplified management - Buffer size validation using CHK_IOVEC macro - Zero-copy response writing via iovec - Minimal overhead for frequent management operations Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> Change-Id: Ibc7be3488989285da205aff7400be38995a435fd examples/vhost_user_rdma: implement request notification for completion queues This commit adds CQ notification management: 1. Processes IB_REQ_NOTIFY commands 2. Configures event generation policy 3. Validates CQ existence via resource pool 4. 
Supports edge-triggered and conditional notification modes Key features: - CQ lookup via centralized resource pool - Dynamic notification mode switching - Error logging for invalid CQ references - Lightweight control path operation Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> Change-Id: I13e240ee71f1e1530d41875564da989ddb0fef86 --- examples/vhost_user_rdma/meson.build | 51 +- .../vhost_user_rdma/vhost_rdma_complete.c | 850 ++++++++++++++++++ examples/vhost_user_rdma/vhost_rdma_ib.c | 66 +- examples/vhost_user_rdma/vhost_rdma_ib.h | 4 + examples/vhost_user_rdma/vhost_rdma_opcode.h | 437 +++++---- examples/vhost_user_rdma/vhost_rdma_queue.c | 6 - examples/vhost_user_rdma/vhost_rdma_queue.h | 5 + 7 files changed, 1176 insertions(+), 243 deletions(-) create mode 100644 examples/vhost_user_rdma/vhost_rdma_complete.c diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build index 2a0a6ffc15..89ff4fbbf1 100644 --- a/examples/vhost_user_rdma/meson.build +++ b/examples/vhost_user_rdma/meson.build @@ -7,8 +7,8 @@ # DPDK instance, use 'make' if not is_linux - build = false - subdir_done() + build = false + subdir_done() endif deps += ['vhost', 'timer'] @@ -16,34 +16,35 @@ deps += ['vhost', 'timer'] allow_experimental_apis = true cflags_options = [ - '-std=c11', - '-Wno-strict-prototypes', - '-Wno-pointer-arith', - '-Wno-maybe-uninitialized', - '-Wno-discarded-qualifiers', - '-Wno-old-style-definition', - '-Wno-sign-compare', - '-Wno-stringop-overflow', - '-O3', - '-g', - '-DALLOW_EXPERIMENTAL_API', - '-DDEBUG_RDMA', - '-DDEBUG_RDMA_DP', + '-std=c11', + '-Wno-strict-prototypes', + '-Wno-pointer-arith', + '-Wno-maybe-uninitialized', + '-Wno-discarded-qualifiers', + '-Wno-old-style-definition', + '-Wno-sign-compare', + '-Wno-stringop-overflow', + '-O3', + '-g', + '-DALLOW_EXPERIMENTAL_API', + '-DDEBUG_RDMA', + '-DDEBUG_RDMA_DP', ] foreach option:cflags_options - if cc.has_argument(option) - cflags += option - endif + if 
cc.has_argument(option) + cflags += option + endif endforeach sources = files( - 'main.c', - 'vhost_rdma.c', - 'vhost_rdma_ib.c', - 'vhost_rdma_queue.c', - 'vhost_rdma_opcode.c', - 'vhost_rdma_pkt.c', - 'vhost_rdma_crc.c', + 'main.c', + 'vhost_rdma.c', + 'vhost_rdma_ib.c', + 'vhost_rdma_queue.c', + 'vhost_rdma_opcode.c', + 'vhost_rdma_pkt.c', + 'vhost_rdma_crc.c', + 'vhost_rdma_complete.c', ) diff --git a/examples/vhost_user_rdma/vhost_rdma_complete.c b/examples/vhost_user_rdma/vhost_rdma_complete.c new file mode 100644 index 0000000000..623b8dd2a0 --- /dev/null +++ b/examples/vhost_user_rdma/vhost_rdma_complete.c @@ -0,0 +1,850 @@ +/* + * Vhost-user RDMA device: Completion Queue Handler (Completer) + * + * This module handles the completion of Send Queue Work Queue Entries (WQEs) + * based on incoming response packets such as ACKs, Read Responses, or NAKs. + * It ensures reliable delivery for RC QPs by checking PSN, handling retries, + * and posting completions to the CQ. + * + * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved. + * + * Author: Xiong Weimin <xiongweimin@kylinos.cn> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <rte_mbuf.h> +#include <rte_timer.h> +#include <rte_atomic.h> +#include <rte_log.h> + +#include "vhost_rdma_opcode.h" +#include "vhost_rdma_ib.h" +#include "vhost_rdma_queue.h" +#include "vhost_rdma.h" +#include "vhost_rdma_pkt.h" + +/** + * enum comp_state - State machine for RDMA completer + * + * The completer processes incoming responses using a state machine to handle: + * - Packet validation (PSN, opcode) + * - Retry logic (timeout, RNR NAK) + * - Data operations (READ, ATOMIC) + * - Completion generation + */ +enum comp_state { + VHOST_RDMA_COMPST_GET_ACK, + VHOST_RDMA_COMPST_GET_WQE, + VHOST_RDMA_COMPST_COMP_WQE, + VHOST_RDMA_COMPST_COMP_ACK, + VHOST_RDMA_COMPST_CHECK_PSN, + VHOST_RDMA_COMPST_CHECK_ACK, + VHOST_RDMA_COMPST_READ, + VHOST_RDMA_COMPST_ATOMIC, + VHOST_RDMA_COMPST_WRITE_SEND, + VHOST_RDMA_COMPST_UPDATE_COMP, + VHOST_RDMA_COMPST_ERROR_RETRY, + VHOST_RDMA_COMPST_RNR_RETRY, + VHOST_RDMA_COMPST_ERROR, + VHOST_RDMA_COMPST_EXIT, + VHOST_RDMA_COMPST_DONE, +}; + +/* Human-readable state names for debugging */ +static const char *comp_state_name[] = { + [VHOST_RDMA_COMPST_GET_ACK] = "GET ACK", + [VHOST_RDMA_COMPST_GET_WQE] = "GET WQE", + [VHOST_RDMA_COMPST_COMP_WQE] = "COMP WQE", + [VHOST_RDMA_COMPST_COMP_ACK] = "COMP ACK", + [VHOST_RDMA_COMPST_CHECK_PSN] = "CHECK PSN", + [VHOST_RDMA_COMPST_CHECK_ACK] = "CHECK ACK", + [VHOST_RDMA_COMPST_READ] = "READ", + [VHOST_RDMA_COMPST_ATOMIC] = "ATOMIC", + [VHOST_RDMA_COMPST_WRITE_SEND] = "WRITE/SEND", + [VHOST_RDMA_COMPST_UPDATE_COMP] = "UPDATE COMP", + [VHOST_RDMA_COMPST_ERROR_RETRY] = "ERROR RETRY", + [VHOST_RDMA_COMPST_RNR_RETRY] = "RNR RETRY", + [VHOST_RDMA_COMPST_ERROR] = "ERROR", + [VHOST_RDMA_COMPST_EXIT] = "EXIT", + [VHOST_RDMA_COMPST_DONE] = "DONE", +}; + +/** + * enum ib_rnr_timeout - Backoff values for RNR NAK timer + * + * These define exponential backoff delays when receiver is not ready. + * Expressed in microseconds via rnrnak_usec[] table. 
+ */ +enum ib_rnr_timeout { + IB_RNR_TIMER_655_36 = 0, + IB_RNR_TIMER_000_01 = 1, + IB_RNR_TIMER_000_02 = 2, + IB_RNR_TIMER_000_03 = 3, + IB_RNR_TIMER_000_04 = 4, + IB_RNR_TIMER_000_06 = 5, + IB_RNR_TIMER_000_08 = 6, + IB_RNR_TIMER_000_12 = 7, + IB_RNR_TIMER_000_16 = 8, + IB_RNR_TIMER_000_24 = 9, + IB_RNR_TIMER_000_32 = 10, + IB_RNR_TIMER_000_48 = 11, + IB_RNR_TIMER_000_64 = 12, + IB_RNR_TIMER_000_96 = 13, + IB_RNR_TIMER_001_28 = 14, + IB_RNR_TIMER_001_92 = 15, + IB_RNR_TIMER_002_56 = 16, + IB_RNR_TIMER_003_84 = 17, + IB_RNR_TIMER_005_12 = 18, + IB_RNR_TIMER_007_68 = 19, + IB_RNR_TIMER_010_24 = 20, + IB_RNR_TIMER_015_36 = 21, + IB_RNR_TIMER_020_48 = 22, + IB_RNR_TIMER_030_72 = 23, + IB_RNR_TIMER_040_96 = 24, + IB_RNR_TIMER_061_44 = 25, + IB_RNR_TIMER_081_92 = 26, + IB_RNR_TIMER_122_88 = 27, + IB_RNR_TIMER_163_84 = 28, + IB_RNR_TIMER_245_76 = 29, + IB_RNR_TIMER_327_68 = 30, + IB_RNR_TIMER_491_52 = 31 +}; + +/** + * rnrnak_usec - Microsecond delay lookup for RNR timeout codes + * + * Indexed by enum ib_rnr_timeout. Used to schedule RNR retry timers. 
+ */ +static unsigned long rnrnak_usec[32] = { + [IB_RNR_TIMER_655_36] = 655360, + [IB_RNR_TIMER_000_01] = 10, + [IB_RNR_TIMER_000_02] = 20, + [IB_RNR_TIMER_000_03] = 30, + [IB_RNR_TIMER_000_04] = 40, + [IB_RNR_TIMER_000_06] = 60, + [IB_RNR_TIMER_000_08] = 80, + [IB_RNR_TIMER_000_12] = 120, + [IB_RNR_TIMER_000_16] = 160, + [IB_RNR_TIMER_000_24] = 240, + [IB_RNR_TIMER_000_32] = 320, + [IB_RNR_TIMER_000_48] = 480, + [IB_RNR_TIMER_000_64] = 640, + [IB_RNR_TIMER_000_96] = 960, + [IB_RNR_TIMER_001_28] = 1280, + [IB_RNR_TIMER_001_92] = 1920, + [IB_RNR_TIMER_002_56] = 2560, + [IB_RNR_TIMER_003_84] = 3840, + [IB_RNR_TIMER_005_12] = 5120, + [IB_RNR_TIMER_007_68] = 7680, + [IB_RNR_TIMER_010_24] = 10240, + [IB_RNR_TIMER_015_36] = 15360, + [IB_RNR_TIMER_020_48] = 20480, + [IB_RNR_TIMER_030_72] = 30720, + [IB_RNR_TIMER_040_96] = 40960, + [IB_RNR_TIMER_061_44] = 61410, + [IB_RNR_TIMER_081_92] = 81920, + [IB_RNR_TIMER_122_88] = 122880, + [IB_RNR_TIMER_163_84] = 163840, + [IB_RNR_TIMER_245_76] = 245760, + [IB_RNR_TIMER_327_68] = 327680, + [IB_RNR_TIMER_491_52] = 491520, +}; + +/** + * vhost_rdma_get_wqe - Retrieve head WQE from send queue + * @qp: Queue pair + * @pkt: Incoming packet (may be NULL) + * @wqe_p: Output pointer to current WQE + * + * Returns next state depending on WQE state and presence of packet. + */ +static __rte_always_inline enum comp_state +vhost_rdma_get_wqe(struct vhost_rdma_qp *qp, struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe **wqe_p) +{ + struct vhost_rdma_send_wqe *wqe; + + wqe = queue_head(&qp->sq.queue); + *wqe_p = wqe; + + /* No WQE available or requester hasn't started processing */ + if (!wqe || wqe->state == WQE_STATE_POSTED) + return pkt ? 
VHOST_RDMA_COMPST_DONE : VHOST_RDMA_COMPST_EXIT; + + /* Already completed locally */ + if (wqe->state == WQE_STATE_DONE) + return VHOST_RDMA_COMPST_COMP_WQE; + + /* WQE previously failed */ + if (wqe->state == WQE_STATE_ERROR) + return VHOST_RDMA_COMPST_ERROR; + + /* Valid WQE exists — proceed to PSN check if packet exists */ + return pkt ? VHOST_RDMA_COMPST_CHECK_PSN : VHOST_RDMA_COMPST_EXIT; +} + +/** + * reset_retry_counters - Reset retry counters after successful ACK + * @qp: Queue pair whose attributes are used + */ +static __rte_always_inline void +reset_retry_counters(struct vhost_rdma_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; +} + +/** +* vhost_rdma_check_psn - Validate packet sequence number against expected +* @qp: Queue pair +* @pkt: Response packet +* @wqe: Current WQE +* +* Checks whether PSN is valid, detects retransmissions, timeouts, or gaps. +*/ +static __rte_always_inline enum comp_state +vhost_rdma_check_psn(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + int32_t diff; + + /* Check if this response is newer than last segment of current WQE */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == WQE_STATE_PENDING) { + /* Unexpected late arrival — likely timeout occurred */ + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return VHOST_RDMA_COMPST_ERROR_RETRY; + + /* Reset retry count on new transaction */ + reset_retry_counters(qp); + return VHOST_RDMA_COMPST_COMP_WQE; + } else { + return VHOST_RDMA_COMPST_DONE; + } + } + + /* Compare with expected PSN at completer */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* Retransmitted packet — complete only if matches WQE */ + if (pkt->psn == wqe->last_psn) + return VHOST_RDMA_COMPST_COMP_ACK; + else + return VHOST_RDMA_COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + /* Out-of-order 
read/atomic response — skip */ + return VHOST_RDMA_COMPST_DONE; + } else { + return VHOST_RDMA_COMPST_CHECK_ACK; + } +} + +/** + * vhost_rdma_check_ack - Validate response opcode and AETH status + * @qp: Queue pair + * @pkt: Incoming packet + * @wqe: Associated WQE + */ +static __rte_always_inline enum comp_state +vhost_rdma_check_ack(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + struct vhost_rdma_device *dev = qp->dev; + unsigned int mask = pkt->mask; + uint8_t syn; + + /* Handle initial opcode expectations */ + switch (qp->comp.opcode) { + case -1: + /* Expecting start of message */ + if (!(mask & VHOST_START_MASK)) + return VHOST_RDMA_COMPST_ERROR; + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* Allow retry from first or only segment */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + + return VHOST_RDMA_COMPST_ERROR; + } + break; + default: + RDMA_LOG_ERR("Invalid comp opcode state: %d", qp->comp.opcode); + return VHOST_RDMA_COMPST_ERROR; + } + + /* Parse AETH syndrome for ACK/NAK types */ + syn = aeth_syn(pkt); + + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return VHOST_RDMA_COMPST_ERROR; + /* Fall through */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr->opcode != VHOST_RDMA_IB_WR_RDMA_READ) { + wqe->status = VHOST_RDMA_IB_WC_FATAL_ERR; + return VHOST_RDMA_COMPST_ERROR; + } + reset_retry_counters(qp); + return VHOST_RDMA_COMPST_READ; + + case IB_OPCODE_RC_ACKNOWLEDGE: + switch (syn & AETH_TYPE_MASK) { + 
case AETH_ACK: + reset_retry_counters(qp); + return VHOST_RDMA_COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RCV_RNR); + return VHOST_RDMA_COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + int diff; + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff > 0) { + vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RCV_SEQ_ERR); + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + vhost_rdma_run_task(&qp->req.task, 0); + } + } + return VHOST_RDMA_COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = VHOST_RDMA_IB_WC_REM_INV_REQ_ERR; + return VHOST_RDMA_COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = VHOST_RDMA_IB_WC_REM_ACCESS_ERR; + return VHOST_RDMA_COMPST_ERROR; + + case AETH_NAK_REM_OP_ERR: + wqe->status = VHOST_RDMA_IB_WC_REM_OP_ERR; + return VHOST_RDMA_COMPST_ERROR; + + default: + RDMA_LOG_ERR("Unexpected NAK type: 0x%x", syn); + wqe->status = VHOST_RDMA_IB_WC_REM_OP_ERR; + return VHOST_RDMA_COMPST_ERROR; + } + + default: + RDMA_LOG_ERR("Unknown AETH type: 0x%x", syn); + return VHOST_RDMA_COMPST_ERROR; + } + break; + + default: + RDMA_LOG_ERR("Unexpected opcode: %u", pkt->opcode); + return VHOST_RDMA_COMPST_ERROR; + } +} + +/** + * vhost_rdma_do_read - Copy data from read response into local buffer + * @qp: Queue pair + * @pkt: Read response packet + * @wqe: Corresponding WQE + */ +static __rte_always_inline enum comp_state +vhost_rdma_do_read(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + int ret; + + ret = copy_data(qp->pd, VHOST_RDMA_IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), VHOST_RDMA_TO_MR_OBJ, NULL); + if (ret) { + wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR; + return VHOST_RDMA_COMPST_ERROR; + } + + /* Final packet? 
Complete now */ + if (wqe->dma.resid == 0 && (pkt->mask & VHOST_END_MASK)) + return VHOST_RDMA_COMPST_COMP_ACK; + + return VHOST_RDMA_COMPST_UPDATE_COMP; +} + +/** + * vhost_rdma_do_atomic - Handle atomic acknowledgment with original value + * @qp: Queue pair + * @pkt: Atomic ACK packet + * @wqe: WQE + */ +static __rte_always_inline enum comp_state +vhost_rdma_do_atomic(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + int ret; + uint64_t atomic_orig = atmack_orig(pkt); + + ret = copy_data(qp->pd, VHOST_RDMA_IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(uint64_t), VHOST_RDMA_TO_MR_OBJ, NULL); + if (ret) { + wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR; + return VHOST_RDMA_COMPST_ERROR; + } + + return VHOST_RDMA_COMPST_COMP_ACK; +} + +/** + * wr_to_wc_opcode - Convert Work Request opcode to Work Completion opcode + * @opcode: WR opcode + * + * Returns corresponding WC opcode or 0xff on error. + */ +static enum vhost_rdma_ib_wc_opcode +wr_to_wc_opcode(enum vhost_rdma_ib_wr_opcode opcode) +{ + switch (opcode) { + case VHOST_RDMA_IB_WR_RDMA_WRITE: + case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM: + return VHOST_RDMA_IB_WC_RDMA_WRITE; + case VHOST_RDMA_IB_WR_SEND: + case VHOST_RDMA_IB_WR_SEND_WITH_IMM: + return VHOST_RDMA_IB_WC_SEND; + case VHOST_RDMA_IB_WR_RDMA_READ: + return VHOST_RDMA_IB_WC_RDMA_READ; + default: + return 0xff; + } +} + +/** + * make_send_cqe - Build a completion queue entry from WQE + * @qp: Queue pair + * @wqe: Completed WQE + * @cqe: Output CQE + */ +static void +make_send_cqe(struct vhost_rdma_qp *qp, + struct vhost_rdma_send_wqe *wqe, + struct vhost_rdma_cq_req *cqe) +{ + memset(cqe, 0, sizeof(*cqe)); + + cqe->wr_id = wqe->wr->wr_id; + cqe->status = wqe->status; + cqe->opcode = wr_to_wc_opcode(wqe->wr->opcode); + + if (wqe->wr->opcode == VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr->opcode == VHOST_RDMA_IB_WR_SEND_WITH_IMM) + cqe->wc_flags |= VHOST_RDMA_WC_WITH_IMM; + + cqe->byte_len 
= wqe->dma.length; + cqe->qp_num = qp->qpn; +} + +/** + * advance_consumer - Advance SQ consumer index and notify virtqueue + * @q: Queue structure + */ +static __rte_always_inline void +advance_consumer(struct vhost_rdma_queue *q) +{ + uint16_t cons_idx; + uint16_t desc_idx; + + assert(q->consumer_index == q->vq->last_avail_idx); + + cons_idx = q->consumer_index & (q->num_elems - 1); + desc_idx = q->vq->vring.avail->ring[cons_idx]; + + vhost_rdma_queue_push(q->vq, desc_idx, 0); + + q->consumer_index++; + q->vq->last_avail_idx++; +} + +/** + * vhost_rdma_do_complete - Complete a WQE and post CQE if needed + * @qp: Queue pair + * @wqe: WQE to complete + * + * Per IB spec, even unsignaled WQEs must generate CQE on error. + */ +static void +vhost_rdma_do_complete(struct vhost_rdma_qp *qp, + struct vhost_rdma_send_wqe *wqe) +{ + struct vhost_rdma_device *dev = qp->dev; + struct vhost_rdma_cq_req cqe; + bool post; + + post = (qp->sq_sig_all || + (wqe->wr->send_flags & VHOST_RDMA_IB_SEND_SIGNALED) || + wqe->status != VHOST_RDMA_IB_WC_SUCCESS); + + if (post) + make_send_cqe(qp, wqe, &cqe); + + advance_consumer(&qp->sq.queue); + + if (post) + vhost_rdma_cq_post(dev, qp->scq, &cqe, 0); + + if (wqe->wr->opcode == VHOST_RDMA_IB_WR_SEND || + wqe->wr->opcode == VHOST_RDMA_IB_WR_SEND_WITH_IMM) + vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RDMA_SEND); + + /* Wake up requester if waiting for fence or PSN */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + vhost_rdma_run_task(&qp->req.task, 0); + } +} + +/** + * vhost_rdma_complete_wqe - Mark WQE as completed and update PSN + * @qp: Queue pair + * @pkt: Response packet (may be NULL) + * @wqe: WQE + */ +static __rte_always_inline enum comp_state +vhost_rdma_complete_wqe(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + if (pkt && wqe->state == WQE_STATE_PENDING) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & VHOST_RDMA_PSN_MASK; 
+ qp->comp.opcode = -1; + } + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + vhost_rdma_run_task(&qp->req.task, 1); + } + } + + vhost_rdma_do_complete(qp, wqe); + return VHOST_RDMA_COMPST_GET_WQE; +} + +/** + * vhost_rdma_rnr_nak_timer - Callback when RNR backoff timer expires + * @timer: Timer instance + * @arg: Pointer to QP + */ +static void +vhost_rdma_rnr_nak_timer(__rte_unused struct rte_timer *timer, void *arg) +{ + struct vhost_rdma_qp *qp = arg; + + RDMA_LOG_DEBUG_DP("QP#%d RNR NAK timer expired", qp->qpn); + vhost_rdma_run_task(&qp->req.task, 1); +} + +/** + * vhost_rdma_complete_ack - Handle ACK completion including RD_ATOMICS sync + * @qp: Queue pair + * @pkt: ACK packet + * @wqe: WQE + */ +static __rte_always_inline enum comp_state +vhost_rdma_complete_ack(struct vhost_rdma_qp *qp, + struct vhost_rdma_pkt_info *pkt, + struct vhost_rdma_send_wqe *wqe) +{ + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + rte_atomic32_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + vhost_rdma_run_task(&qp->req.task, 0); + } + } + + /* Handle SQ drain transition */ + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + rte_spinlock_lock(&qp->state_lock); + if (qp->req.state == QP_STATE_DRAIN && + qp->comp.psn == qp->req.psn) { + qp->req.state = QP_STATE_DRAINED; + rte_spinlock_unlock(&qp->state_lock); + + // TODO: Trigger IB_EVENT_SQ_DRAINED + } else { + rte_spinlock_unlock(&qp->state_lock); + } + } + + vhost_rdma_do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return VHOST_RDMA_COMPST_UPDATE_COMP; + else + return VHOST_RDMA_COMPST_DONE; +} + +/** + * free_pkt - Release packet reference and free mbuf + * @pkt: Packet info to release + */ +static __rte_always_inline void +free_pkt(struct vhost_rdma_pkt_info *pkt) +{ + struct rte_mbuf *mbuf = PKT_TO_MBUF(pkt); + + vhost_rdma_drop_ref(pkt->qp, pkt->qp->dev, qp); + rte_pktmbuf_free(mbuf); +} + +/** + * rnrnak_ticks - 
Convert RNR timeout code to timer ticks + * @timeout: Timeout code + */ +static __rte_always_inline unsigned long +rnrnak_ticks(uint8_t timeout) +{ + uint64_t ticks_per_us = rte_get_timer_hz() / 1000000; + return RTE_MAX(rnrnak_usec[timeout] * ticks_per_us, 1UL); +} + +/** + * vhost_rdma_drain_resp_pkts - Flush all pending response packets + * @qp: Queue pair + * @notify: Whether to signal flush error + */ +static void +vhost_rdma_drain_resp_pkts(struct vhost_rdma_qp *qp, bool notify) +{ + struct rte_mbuf *mbuf; + struct vhost_rdma_send_wqe *wqe; + struct vhost_rdma_queue *q = &qp->sq.queue; + + while (rte_ring_dequeue(qp->resp_pkts, (void **)&mbuf) == 0) { + vhost_rdma_drop_ref(qp, qp->dev, qp); + rte_pktmbuf_free(mbuf); + } + + while ((wqe = queue_head(q))) { + if (notify) { + wqe->status = VHOST_RDMA_IB_WC_WR_FLUSH_ERR; + vhost_rdma_do_complete(qp, wqe); + } else { + advance_consumer(q); + } + } +} + +/** + * vhost_rdma_completer - Main completer function (run per QP) + * @arg: Pointer to vhost_rdma_qp + * + * Processes incoming response packets and completes WQEs accordingly. + * Implements reliability mechanisms: retry, RNR backoff, PSN tracking. 
+ * + * Return: 0 on success, -EAGAIN if needs rescheduling + */ +int +vhost_rdma_completer(void *arg) +{ + struct vhost_rdma_qp *qp = arg; + struct vhost_rdma_device *dev = qp->dev; + struct vhost_rdma_send_wqe *wqe = NULL; + struct rte_mbuf *mbuf = NULL; + struct vhost_rdma_pkt_info *pkt = NULL; + enum comp_state state; + int ret = 0; + + vhost_rdma_add_ref(qp); + + if (!qp->valid || qp->req.state == QP_STATE_ERROR || + qp->req.state == QP_STATE_RESET) { + vhost_rdma_drain_resp_pkts(qp, qp->valid && + qp->req.state == QP_STATE_ERROR); + ret = -EAGAIN; + goto done; + } + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) { + ret = -EAGAIN; + goto done; + } + + state = VHOST_RDMA_COMPST_GET_ACK; + + while (1) { + RDMA_LOG_DEBUG_DP("QP#%d state=%s", qp->qpn, comp_state_name[state]); + + switch (state) { + case VHOST_RDMA_COMPST_GET_ACK: + if (rte_ring_dequeue(qp->resp_pkts, (void **)&mbuf) == 0) { + pkt = MBUF_TO_PKT(mbuf); + qp->comp.timeout_retry = 0; + } else { + mbuf = NULL; + } + state = VHOST_RDMA_COMPST_GET_WQE; + break; + + case VHOST_RDMA_COMPST_GET_WQE: + state = vhost_rdma_get_wqe(qp, pkt, &wqe); + break; + + case VHOST_RDMA_COMPST_CHECK_PSN: + state = vhost_rdma_check_psn(qp, pkt, wqe); + break; + + case VHOST_RDMA_COMPST_CHECK_ACK: + state = vhost_rdma_check_ack(qp, pkt, wqe); + break; + + case VHOST_RDMA_COMPST_READ: + state = vhost_rdma_do_read(qp, pkt, wqe); + break; + + case VHOST_RDMA_COMPST_ATOMIC: + state = vhost_rdma_do_atomic(qp, pkt, wqe); + break; + + case VHOST_RDMA_COMPST_WRITE_SEND: + if (wqe && wqe->state == WQE_STATE_PENDING && + wqe->last_psn == pkt->psn) + state = VHOST_RDMA_COMPST_COMP_ACK; + else + state = VHOST_RDMA_COMPST_UPDATE_COMP; + break; + + case VHOST_RDMA_COMPST_COMP_ACK: + state = vhost_rdma_complete_ack(qp, pkt, wqe); + break; + + case VHOST_RDMA_COMPST_COMP_WQE: + state = vhost_rdma_complete_wqe(qp, pkt, wqe); + break; + 
+ case VHOST_RDMA_COMPST_UPDATE_COMP: + if (pkt->mask & VHOST_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & VHOST_RDMA_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + vhost_rdma_run_task(&qp->req.task, 1); + } + state = VHOST_RDMA_COMPST_DONE; + break; + + case VHOST_RDMA_COMPST_DONE: + goto done; + + case VHOST_RDMA_COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = VHOST_RDMA_COMPST_ERROR_RETRY; + break; + } + + /* Restart retransmit timer if conditions met */ + if ((qp->type == VHOST_RDMA_IB_QPT_RC) && + (qp->req.state == QP_STATE_READY) && + (psn_compare(qp->req.psn, qp->comp.psn) > 0) && + qp->qp_timeout_ticks) { + rte_timer_reset(&qp->retrans_timer, + qp->qp_timeout_ticks, + SINGLE, rte_lcore_id(), + retransmit_timer, qp); + } + ret = -EAGAIN; + goto done; + + case VHOST_RDMA_COMPST_ERROR_RETRY: + if (!wqe || wqe->state == WQE_STATE_POSTED) + goto done; + + if (qp->comp.started_retry && !qp->comp.timeout_retry) + goto done; + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + if (psn_compare(qp->req.psn, qp->comp.psn) > 0) { + vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_COMP_RETRY); + qp->req.need_retry = 1; + qp->comp.started_retry = 1; + vhost_rdma_run_task(&qp->req.task, 0); + } + goto done; + } else { + vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_RETRY_EXCEEDED); + wqe->status = VHOST_RDMA_IB_WC_RETRY_EXC_ERR; + state = VHOST_RDMA_COMPST_ERROR; + } + break; + + case VHOST_RDMA_COMPST_RNR_RETRY: + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + qp->req.need_retry = 1; + RDMA_LOG_DEBUG_DP("QP#%d setting RNR NAK timer", qp->qpn); + rte_timer_reset(&qp->rnr_nak_timer, + rnrnak_ticks(aeth_syn(pkt) & ~AETH_TYPE_MASK), + SINGLE, rte_lcore_id(), + vhost_rdma_rnr_nak_timer, qp); + ret = -EAGAIN; + goto done; + } else { + vhost_rdma_counter_inc(dev, 
VHOST_RDMA_CNT_RNR_RETRY_EXCEEDED); + wqe->status = VHOST_RDMA_IB_WC_RNR_RETRY_EXC_ERR; + state = VHOST_RDMA_COMPST_ERROR; + } + break; + + case VHOST_RDMA_COMPST_ERROR: + RDMA_LOG_ERR_DP("WQE Error: %u", wqe->status); + vhost_rdma_do_complete(qp, wqe); + vhost_rdma_qp_error(qp); + ret = -EAGAIN; + goto done; + } + } + +done: + if (pkt) + free_pkt(pkt); + vhost_rdma_drop_ref(qp, dev, qp); + + return ret; +} diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c index aac5c28e9a..3776297a2f 100644 --- a/examples/vhost_user_rdma/vhost_rdma_ib.c +++ b/examples/vhost_user_rdma/vhost_rdma_ib.c @@ -36,7 +36,7 @@ tp = iov->iov_base; \ } while(0); \ -#define DEFINE_VIRTIO_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd} +#define DEFINE_VHOST_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd} #define CTRL_NO_CMD __rte_unused struct iovec *__in #define CTRL_NO_RSP __rte_unused struct iovec *__out @@ -1089,25 +1089,61 @@ vhost_rdma_destroy_qp(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_R return 0; } +static int +vhost_rdma_query_pkey(__rte_unused struct vhost_rdma_device *dev, + CTRL_NO_CMD, struct iovec *out) +{ + struct vhost_rdma_cmd_query_pkey *pkey_rsp; + uint16_t pkey = IB_DEFAULT_PKEY_FULL; + + CHK_IOVEC(pkey_rsp, out); + + pkey_rsp->pkey = pkey; + + return 0; + +} + +static int +vhost_rdma_req_notify(struct vhost_rdma_device *dev, struct iovec *in, CTRL_NO_RSP) +{ + struct vhost_rdma_cmd_req_notify *cmd; + struct vhost_rdma_cq *cq; + + CHK_IOVEC(cmd, in); + + cq = vhost_rdma_pool_get(&dev->cq_pool, cmd->cqn); + if (unlikely(cq == NULL)) { + RDMA_LOG_ERR("cq not found"); + return -EINVAL; + } + + cq->notify = cmd->flags; + + return 0; +} + /* Command handler table declaration */ struct { int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out); const char *name; /* Name of the command (for logging) */ } cmd_tbl[] = { - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, 
vhost_rdma_query_device), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_GET_DMA_MR, vhost_rdma_get_dma_mr), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_QP, vhost_rdma_create_qp), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_MODIFY_QP, vhost_rdma_modify_qp), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_QP, vhost_rdma_query_qp), - DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_QP, vhost_rdma_destroy_qp), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT, vhost_rdma_query_port), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_CQ, vhost_rdma_create_cq), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_CQ, vhost_rdma_destroy_cq), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_PD, vhost_rdma_create_pd), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_PD, vhost_rdma_destroy_pd), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_GET_DMA_MR, vhost_rdma_get_dma_mr), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_ALLOC_MR, vhost_rdma_alloc_mr), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REG_USER_MR, vhost_rdma_reg_user_mr), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DEREG_MR, vhost_rdma_dereg_mr), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_CREATE_QP, vhost_rdma_create_qp), + 
DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_MODIFY_QP, vhost_rdma_modify_qp), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_QP, vhost_rdma_query_qp), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_DESTROY_QP, vhost_rdma_destroy_qp), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PKEY, vhost_rdma_query_pkey), + DEFINE_VHOST_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_REQ_NOTIFY_CQ, vhost_rdma_req_notify), }; /** diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h index 79575e735c..5a1787fabe 100644 --- a/examples/vhost_user_rdma/vhost_rdma_ib.h +++ b/examples/vhost_user_rdma/vhost_rdma_ib.h @@ -957,6 +957,10 @@ struct vhost_rdma_cmd_destroy_qp { uint32_t qpn; }; +struct vhost_rdma_cmd_query_pkey{ + uint16_t pkey; +}; + /** * @brief Convert IB MTU enum to byte size * @param mtu The MTU enum value diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.h b/examples/vhost_user_rdma/vhost_rdma_opcode.h index 6c3660f36b..0c2961d5cd 100644 --- a/examples/vhost_user_rdma/vhost_rdma_opcode.h +++ b/examples/vhost_user_rdma/vhost_rdma_opcode.h @@ -27,28 +27,28 @@ #include "vhost_rdma_pkt.h" /** Maximum number of QP types supported for WR mask dispatching */ -#define WR_MAX_QPT 8 +#define WR_MAX_QPT 8 /** Total number of defined opcodes (must be power-of-2 >= 256) */ -#define VHOST_NUM_OPCODE 256 +#define VHOST_NUM_OPCODE 256 #ifndef BIT #define BIT(x) (1 << (x)) #endif /* Invalid opcode marker */ -#define OPCODE_NONE (-1) +#define OPCODE_NONE (-1) #define VHOST_RDMA_SE_MASK (0x80) #define VHOST_RDMA_MIG_MASK (0x40) #define VHOST_RDMA_PAD_MASK (0x30) -#define VHOST_RDMA_TVER_MASK (0x0f) -#define VHOST_RDMA_FECN_MASK (0x80000000) -#define VHOST_RDMA_BECN_MASK (0x40000000) -#define VHOST_RDMA_RESV6A_MASK (0x3f000000) +#define VHOST_RDMA_TVER_MASK (0x0f) +#define VHOST_RDMA_FECN_MASK (0x80000000) +#define VHOST_RDMA_BECN_MASK (0x40000000) +#define VHOST_RDMA_RESV6A_MASK (0x3f000000) #define VHOST_RDMA_QPN_MASK (0x00ffffff) #define 
VHOST_RDMA_ACK_MASK (0x80000000) -#define VHOST_RDMA_RESV7_MASK (0x7f000000) +#define VHOST_RDMA_RESV7_MASK (0x7f000000) #define VHOST_RDMA_PSN_MASK (0x00ffffff) /** @@ -56,19 +56,19 @@ * @{ */ enum vhost_rdma_hdr_type { - VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */ - VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */ - VHOST_RDMA_BTH, /**< Base Transport Header */ - VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */ - VHOST_RDMA_AETH, /**< Acknowledge/Error Header */ - VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */ - VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */ - VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */ - VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */ - VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */ - VHOST_RDMA_IMMDT, /**< Immediate Data Header */ - VHOST_RDMA_PAYLOAD, /**< Payload section */ - NUM_HDR_TYPES /**< Number of known header types */ + VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */ + VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */ + VHOST_RDMA_BTH, /**< Base Transport Header */ + VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */ + VHOST_RDMA_AETH, /**< Acknowledge/Error Header */ + VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */ + VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */ + VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */ + VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */ + VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */ + VHOST_RDMA_IMMDT, /**< Immediate Data Header */ + VHOST_RDMA_PAYLOAD, /**< Payload section */ + NUM_HDR_TYPES /**< Number of known header types */ }; /** @@ -76,50 +76,50 @@ enum vhost_rdma_hdr_type { * @{ */ enum vhost_rdma_hdr_mask { - VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH), - VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH), - VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH), - VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT), - VHOST_RETH_MASK = 
BIT(VHOST_RDMA_RETH), - VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH), - VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH), - VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK), - VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH), - VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH), - VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH), - VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD), - - /* Semantic packet type flags */ - VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */ - VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */ - VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */ - VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */ - VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */ - VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */ - - /* Packet fragmentation flags */ - VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */ - VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */ - - VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */ - VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */ - VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */ - - VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */ - - /* Composite masks */ - VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK), - VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK), + VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH), + VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH), + VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH), + VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT), + VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH), + VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH), + VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH), + VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK), + VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH), + VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH), + VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH), + VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD), + + /* Semantic packet type flags */ + VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< 
Request packet */ + VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */ + VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */ + VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */ + VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */ + VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */ + + /* Packet fragmentation flags */ + VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */ + VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */ + + VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */ + VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */ + VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */ + + VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */ + + /* Composite masks */ + VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK), + VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK), }; /** * @brief Per-opcode metadata for parsing and validation */ struct vhost_rdma_opcode_info { - const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */ - int length; /**< Fixed payload length (if any) */ - int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */ - enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */ + const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */ + int length; /**< Fixed payload length (if any) */ + int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */ + enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */ }; /* Global opcode info table (indexed by IB opcode byte) */ @@ -146,8 +146,8 @@ static inline uint8_t bth_pad(struct vhost_rdma_pkt_info *pkt) } struct vhost_deth { - rte_be32_t qkey; - rte_be32_t sqp; + rte_be32_t qkey; + rte_be32_t sqp; }; #define GSI_QKEY (0x80010000) @@ -206,7 +206,7 @@ static inline void deth_set_sqp(struct vhost_rdma_pkt_info *pkt, uint32_t sqp) } struct 
vhost_immdt { - rte_be32_t imm; + rte_be32_t imm; }; static inline rte_be32_t __immdt_imm(void *arg) @@ -236,9 +236,9 @@ static inline void immdt_set_imm(struct vhost_rdma_pkt_info *pkt, rte_be32_t imm } struct vhost_reth { - rte_be64_t va; - rte_be32_t rkey; - rte_be32_t len; + rte_be64_t va; + rte_be32_t rkey; + rte_be32_t len; }; static inline uint64_t __reth_va(void *arg) @@ -323,35 +323,65 @@ struct vhost_aeth { rte_be32_t smsn; }; +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, + AETH_NAK_INV_RD_REQ = 0x64, +}; + +static inline uint8_t __aeth_syn(void *arg) +{ + struct vhost_aeth *aeth = arg; + + return (AETH_SYN_MASK & rte_be_to_cpu_32(aeth->smsn)) >> 24; +} + +static inline uint8_t aeth_syn(struct vhost_rdma_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + + vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_AETH]); +} + struct vhost_atmack { - rte_be64_t orig; + rte_be64_t orig; }; struct vhost_atmeth { - rte_be64_t va; - rte_be32_t rkey; - rte_be64_t swap_add; - rte_be64_t comp; + rte_be64_t va; + rte_be32_t rkey; + rte_be64_t swap_add; + rte_be64_t comp; } __rte_packed; struct vhost_ieth { - rte_be32_t rkey; + rte_be32_t rkey; }; struct vhost_rdeth { - rte_be32_t een; + rte_be32_t een; }; enum vhost_rdma_hdr_length { - VHOST_BTH_BYTES = sizeof(struct vhost_bth), - VHOST_DETH_BYTES = sizeof(struct vhost_deth), - VHOST_IMMDT_BYTES = sizeof(struct vhost_immdt), - VHOST_RETH_BYTES = sizeof(struct vhost_reth), - VHOST_AETH_BYTES = sizeof(struct vhost_aeth), - VHOST_ATMACK_BYTES = sizeof(struct vhost_atmack), - VHOST_ATMETH_BYTES = sizeof(struct vhost_atmeth), - VHOST_IETH_BYTES = sizeof(struct vhost_ieth), - VHOST_RDETH_BYTES = sizeof(struct vhost_rdeth), + 
VHOST_BTH_BYTES = sizeof(struct vhost_bth), + VHOST_DETH_BYTES = sizeof(struct vhost_deth), + VHOST_IMMDT_BYTES = sizeof(struct vhost_immdt), + VHOST_RETH_BYTES = sizeof(struct vhost_reth), + VHOST_AETH_BYTES = sizeof(struct vhost_aeth), + VHOST_ATMACK_BYTES = sizeof(struct vhost_atmack), + VHOST_ATMETH_BYTES = sizeof(struct vhost_atmeth), + VHOST_IETH_BYTES = sizeof(struct vhost_ieth), + VHOST_RDETH_BYTES = sizeof(struct vhost_rdeth), }; /** @@ -360,8 +390,8 @@ enum vhost_rdma_hdr_length { * Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST` */ #define IB_OPCODE(transport, op) \ - IB_OPCODE_ ## transport ## _ ## op = \ - (IB_OPCODE_ ## transport + IB_OPCODE_ ## op) + IB_OPCODE_ ## transport ## _ ## op = \ + (IB_OPCODE_ ## transport + IB_OPCODE_ ## op) /** * @defgroup ib_opcodes InfiniBand OpCode Definitions @@ -371,105 +401,105 @@ enum vhost_rdma_hdr_length { */ enum { - /* Transport types (base values) */ - IB_OPCODE_RC = 0x00, /**< Reliable Connection */ - IB_OPCODE_UC = 0x20, /**< Unreliable Connection */ - IB_OPCODE_RD = 0x40, /**< Reliable Datagram */ - IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */ - IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */ - IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */ - - /* Operation subtypes */ - IB_OPCODE_SEND_FIRST = 0x00, - IB_OPCODE_SEND_MIDDLE = 0x01, - IB_OPCODE_SEND_LAST = 0x02, - IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, - IB_OPCODE_SEND_ONLY = 0x04, - IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, - IB_OPCODE_RDMA_WRITE_FIRST = 0x06, - IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, - IB_OPCODE_RDMA_WRITE_LAST = 0x08, - IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, - IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, - IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, - IB_OPCODE_RDMA_READ_REQUEST = 0x0c, - IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, - IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, - IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, - IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, - 
IB_OPCODE_ACKNOWLEDGE = 0x11, - IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, - IB_OPCODE_COMPARE_SWAP = 0x13, - IB_OPCODE_FETCH_ADD = 0x14, - /* 0x15 is reserved */ - IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, - IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, - - /* Real opcodes generated via IB_OPCODE() macro */ - IB_OPCODE(RC, SEND_FIRST), - IB_OPCODE(RC, SEND_MIDDLE), - IB_OPCODE(RC, SEND_LAST), - IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), - IB_OPCODE(RC, SEND_ONLY), - IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), - IB_OPCODE(RC, RDMA_WRITE_FIRST), - IB_OPCODE(RC, RDMA_WRITE_MIDDLE), - IB_OPCODE(RC, RDMA_WRITE_LAST), - IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), - IB_OPCODE(RC, RDMA_WRITE_ONLY), - IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), - IB_OPCODE(RC, RDMA_READ_REQUEST), - IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), - IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), - IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), - IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), - IB_OPCODE(RC, ACKNOWLEDGE), - IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), - IB_OPCODE(RC, COMPARE_SWAP), - IB_OPCODE(RC, FETCH_ADD), - IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), - IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), - - /* UC opcodes */ - IB_OPCODE(UC, SEND_FIRST), - IB_OPCODE(UC, SEND_MIDDLE), - IB_OPCODE(UC, SEND_LAST), - IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), - IB_OPCODE(UC, SEND_ONLY), - IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), - IB_OPCODE(UC, RDMA_WRITE_FIRST), - IB_OPCODE(UC, RDMA_WRITE_MIDDLE), - IB_OPCODE(UC, RDMA_WRITE_LAST), - IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), - IB_OPCODE(UC, RDMA_WRITE_ONLY), - IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), - - /* RD opcodes */ - IB_OPCODE(RD, SEND_FIRST), - IB_OPCODE(RD, SEND_MIDDLE), - IB_OPCODE(RD, SEND_LAST), - IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), - IB_OPCODE(RD, SEND_ONLY), - IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), - IB_OPCODE(RD, RDMA_WRITE_FIRST), - IB_OPCODE(RD, RDMA_WRITE_MIDDLE), - IB_OPCODE(RD, RDMA_WRITE_LAST), - IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), - 
IB_OPCODE(RD, RDMA_WRITE_ONLY), - IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), - IB_OPCODE(RD, RDMA_READ_REQUEST), - IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), - IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), - IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), - IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), - IB_OPCODE(RD, ACKNOWLEDGE), - IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), - IB_OPCODE(RD, COMPARE_SWAP), - IB_OPCODE(RD, FETCH_ADD), - - /* UD opcodes */ - IB_OPCODE(UD, SEND_ONLY), - IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) + /* Transport types (base values) */ + IB_OPCODE_RC = 0x00, /**< Reliable Connection */ + IB_OPCODE_UC = 0x20, /**< Unreliable Connection */ + IB_OPCODE_RD = 0x40, /**< Reliable Datagram */ + IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */ + IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */ + IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */ + + /* Operation subtypes */ + IB_OPCODE_SEND_FIRST = 0x00, + IB_OPCODE_SEND_MIDDLE = 0x01, + IB_OPCODE_SEND_LAST = 0x02, + IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IB_OPCODE_SEND_ONLY = 0x04, + IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IB_OPCODE_RDMA_WRITE_FIRST = 0x06, + IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IB_OPCODE_RDMA_WRITE_LAST = 0x08, + IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IB_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IB_OPCODE_RDMA_READ_REQUEST = 0x0c, + IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IB_OPCODE_ACKNOWLEDGE = 0x11, + IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IB_OPCODE_COMPARE_SWAP = 0x13, + IB_OPCODE_FETCH_ADD = 0x14, + /* 0x15 is reserved */ + IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, + IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + + /* Real opcodes generated via IB_OPCODE() macro */ + IB_OPCODE(RC, SEND_FIRST), + IB_OPCODE(RC, SEND_MIDDLE), + IB_OPCODE(RC, SEND_LAST), + IB_OPCODE(RC, 
SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, SEND_ONLY), + IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_FIRST), + IB_OPCODE(RC, RDMA_WRITE_MIDDLE), + IB_OPCODE(RC, RDMA_WRITE_LAST), + IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_WRITE_ONLY), + IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RC, RDMA_READ_REQUEST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RC, ACKNOWLEDGE), + IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RC, COMPARE_SWAP), + IB_OPCODE(RC, FETCH_ADD), + IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), + IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + + /* UC opcodes */ + IB_OPCODE(UC, SEND_FIRST), + IB_OPCODE(UC, SEND_MIDDLE), + IB_OPCODE(UC, SEND_LAST), + IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, SEND_ONLY), + IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_FIRST), + IB_OPCODE(UC, RDMA_WRITE_MIDDLE), + IB_OPCODE(UC, RDMA_WRITE_LAST), + IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(UC, RDMA_WRITE_ONLY), + IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD opcodes */ + IB_OPCODE(RD, SEND_FIRST), + IB_OPCODE(RD, SEND_MIDDLE), + IB_OPCODE(RD, SEND_LAST), + IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, SEND_ONLY), + IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_FIRST), + IB_OPCODE(RD, RDMA_WRITE_MIDDLE), + IB_OPCODE(RD, RDMA_WRITE_LAST), + IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_WRITE_ONLY), + IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IB_OPCODE(RD, RDMA_READ_REQUEST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IB_OPCODE(RD, ACKNOWLEDGE), + IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IB_OPCODE(RD, COMPARE_SWAP), + IB_OPCODE(RD, 
FETCH_ADD), + + /* UD opcodes */ + IB_OPCODE(UD, SEND_ONLY), + IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) }; /** @} */ @@ -478,17 +508,17 @@ enum { * @{ */ enum vhost_rdma_wr_mask { - WR_INLINE_MASK = BIT(0), /**< WR contains inline data */ - WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */ - WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */ - WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */ - WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */ - WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */ - - WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, - WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, - WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, - WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, + WR_INLINE_MASK = BIT(0), /**< WR contains inline data */ + WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */ + WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */ + WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */ + WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */ + WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */ + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, }; /** @@ -497,8 +527,8 @@ enum vhost_rdma_wr_mask { * Used to determine which operations are valid per QP type. 
*/ struct vhost_rdma_wr_opcode_info { - const char *name; /**< Human-readable name */ - enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */ + const char *name; /**< Human-readable name */ + enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */ }; /* Extern declaration of global opcode metadata table */ @@ -510,8 +540,21 @@ static inline unsigned int wr_opcode_mask(int opcode, struct vhost_rdma_qp *qp) return vhost_rdma_wr_opcode_info[opcode].mask[qp->type]; } +static inline uint64_t __atmack_orig(void *arg) +{ + struct vhost_atmack *atmack = arg; + + return rte_be_to_cpu_64(atmack->orig); +} + +static inline uint64_t atmack_orig(struct vhost_rdma_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + + vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_ATMACK]); +} + int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp, - struct vhost_rdma_send_wqe *wqe, - uint32_t opcode); + struct vhost_rdma_send_wqe *wqe, + uint32_t opcode); #endif \ No newline at end of file diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.c b/examples/vhost_user_rdma/vhost_rdma_queue.c index 7d0c45592c..5f9f7fd3c7 100644 --- a/examples/vhost_user_rdma/vhost_rdma_queue.c +++ b/examples/vhost_user_rdma/vhost_rdma_queue.c @@ -1388,12 +1388,6 @@ int vhost_rdma_requester(void *arg) return -EAGAIN; } -int vhost_rdma_completer(void* arg) -{ - //TODO: handle complete - return 0; -} - int vhost_rdma_responder(void* arg) { //TODO: handle response diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.h b/examples/vhost_user_rdma/vhost_rdma_queue.h index fb5a90235f..d8af86cdf2 100644 --- a/examples/vhost_user_rdma/vhost_rdma_queue.h +++ b/examples/vhost_user_rdma/vhost_rdma_queue.h @@ -24,6 +24,11 @@ #include "vhost_rdma.h" #include "vhost_rdma_log.h" +#define PKT_TO_MBUF(p) ((struct rte_mbuf *) \ + (RTE_PTR_SUB(p, sizeof(struct rte_mbuf)))) +#define MBUF_TO_PKT(m) ((struct vhost_rdma_pkt_info *) \ + (RTE_PTR_ADD(m, sizeof(struct rte_mbuf)))) + #define 
QP_OPCODE_INVAILD (-1) /****************************************************************************** -- 2.43.0 为这段patch生成一个英文版的Commit信息
[CSDN page metadata captured by extraction — "最新发布" (latest post), dated 12-19; not part of the patch above or the patch below]
From a84bacb935142ee96b3732fa910bd89122b95092 Mon Sep 17 00:00:00 2001 From: xiongweimin <xiongweimin@kylinos.cn> Date: Wed, 24 Sep 2025 18:20:36 +0800 Subject: [PATCH 09/10] RDMA/vrdma: Implement P_Key query verb support This commit adds P_Key table query capability to the virtio RDMA driver: 1. P_Key management: - Implements IB_QUERY_PKEY verb for partition key retrieval - Handles endianness conversion for cross-platform compatibility - Provides complete error handling for device communication failures 2. Resource management: - Safe memory allocation with guaranteed cleanup - Efficient virtqueue I/O using scatterlists - Detailed error logging for debugging Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> vrdma: Implement QP query verb support This commit adds QP attribute query functionality to the virtio RDMA driver: 1. Complete QP attribute retrieval: - Implements VIRTIO_RDMA_CMD_QUERY_QP command - Handles all standard QP attributes including capabilities - Includes Address Handle (AH) attribute conversion 2. Robust memory management: - Guaranteed buffer cleanup in all execution paths - Byte order handling for cross-platform compatibility 3. Init attribute preservation: - Maintains creation-time attributes for compatibility - Ensures consistent QP context across operations Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> vrdma: Implement user memory region registration support This commit adds user-space memory registration functionality to the virtio RDMA driver: 1. Core memory management: - User-space memory pinning via ib_umem_get() - DMA-safe page table construction - Full memory registration lifecycle handling 2. Robust resource management: - Guaranteed cleanup on all error paths - Strict memory boundary validation - Safe DMA allocation practices 3. Performance optimizations: - Bulk page table transfer to device - Zero-copy scatterlist processing - Large page support infrastructure 4. 
Security enhancements: - Proper address validation - Memory access flag enforcement - Secure DMA address handling Signed-off-by: Xiong Weimin <xiongweimin@kylinos.cn> --- .../infiniband/hw/virtio/vrdma_dev_api.h | 35 ++ .../drivers/infiniband/hw/virtio/vrdma_ib.c | 333 +++++++++++++++++- 2 files changed, 367 insertions(+), 1 deletion(-) diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h index d0ce02601..86b5ecade 100644 --- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h +++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h @@ -225,6 +225,41 @@ struct vrdma_rsp_modify_qp { __u32 qpn; }; +struct vrdma_cmd_query_pkey { + __u32 port; + __u16 index; +}; + +struct vrdma_rsp_query_pkey { + __u16 pkey; +}; + +struct vrdma_cmd_query_qp { + __u32 qpn; + __u32 attr_mask; +}; + +struct vrdma_rsp_query_qp { + struct vrdma_qp_attr attr; +}; + +struct vrdma_cmd_reg_user_mr { + __u32 pdn; + __u32 access_flags; + __u64 start; + __u64 length; + __u64 virt_addr; + + __u64 pages; + __u32 npages; +}; + +struct vrdma_rsp_reg_user_mr { + __u32 mrn; + __u32 lkey; + __u32 rkey; +}; + #define VRDMA_CTRL_OK 0 #define VRDMA_CTRL_ERR 1 diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c index f9b129774..b1429e072 100644 --- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c +++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c @@ -23,6 +23,7 @@ #include "vrdma_queue.h" #define VRTIO_RDMA_PAGE_PER_TBL 512 +#define VRDMA_MAX_PAGES 512 * 512 /** * cmd_str - String representation of virtio RDMA control commands @@ -86,6 +87,36 @@ static void rdma_ah_attr_to_vrdma(struct vrdma_ah_attr *dst, memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr)); } +static void vrdma_to_ib_global_route(struct ib_global_route *dst, + const struct vrdma_global_route *src) +{ + dst->dgid = src->dgid; + dst->flow_label = src->flow_label; 
+ dst->sgid_index = src->sgid_index; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; +} + +static void vrdma_to_ib_qp_cap(struct ib_qp_cap *dst, const struct vrdma_qp_cap *src) +{ + dst->max_send_wr = src->max_send_wr; + dst->max_recv_wr = src->max_recv_wr; + dst->max_send_sge = src->max_send_sge; + dst->max_recv_sge = src->max_recv_sge; + dst->max_inline_data = src->max_inline_data; +} + +static void vrdma_to_rdma_ah_attr(struct rdma_ah_attr *dst, + const struct vrdma_ah_attr *src) +{ + vrdma_to_ib_global_route(rdma_ah_retrieve_grh(dst), &src->grh); + rdma_ah_set_sl(dst, src->sl); + rdma_ah_set_static_rate(dst, src->static_rate); + rdma_ah_set_port_num(dst, src->port_num); + rdma_ah_set_ah_flags(dst, src->ah_flags); + memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr)); +} + /** * vrdma_exec_verbs_cmd - Execute a verbs command via control virtqueue * @vrdev: VRDMA device @@ -2521,6 +2552,303 @@ static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, return rc; } +/** + * vrdma_query_pkey - Query Partition Key (P_Key) at given index + * @ibdev: Verbs device (vRDMA virtual device) + * @port: Port number (1-indexed) + * @index: P_Key table index + * @pkey: Output buffer to store the P_Key value + * + * Queries the P_Key from the backend via virtqueue command. + * Only meaningful for IB-style ports (not RoCE). + * + * Context: Process context (may sleep). Can be called from user IOCTL path. 
+ * Return: + * * 0 on success + * * -ENOMEM if command allocation fails + * * -EIO or other negative errno on communication failure + */ +static int vrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + struct vrdma_dev *vdev = to_vdev(ibdev); + struct vrdma_cmd_query_pkey *cmd; + struct vrdma_rsp_query_pkey *rsp; + struct scatterlist in, out; + int rc; + + /* Allocate command and response buffers */ + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) { + kfree(cmd); + return -ENOMEM; + } + + /* Fill input parameters */ + cmd->port = cpu_to_le32(port); + cmd->index = cpu_to_le16(index); + + /* Prepare scatterlists for virtqueue I/O */ + sg_init_one(&in, cmd, sizeof(*cmd)); + sg_init_one(&out, rsp, sizeof(*rsp)); + + /* Execute command */ + rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_PKEY, &in, &out); + if (rc) { + dev_err(&vdev->vdev->dev, + "VIRTIO_RDMA_CMD_QUERY_PKEY failed: port=%u idx=%u err=%d\n", + port, index, rc); + goto out_free; + } + + /* Copy result to user */ + *pkey = le16_to_cpu(rsp->pkey); + +out_free: + kfree(rsp); + kfree(cmd); + return rc; +} + +/** + * vrdma_query_qp - Query QP attributes from the backend + * @ibqp: Queue pair to query + * @attr: Output structure for QP attributes + * @attr_mask: Which fields are requested (ignored by some backends) + * @init_attr: Output structure for init-time attributes + * + * Queries the QP state and configuration via a control virtqueue command. + * This is a synchronous operation. 
+ * + * Context: Process context (can sleep) + * Return: + * * 0 on success + * * -ENOMEM if allocation fails + * * -EIO or other negative errno on communication failure + */ +static int vrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct vrdma_qp *vqp = to_vqp(ibqp); + struct vrdma_dev *vdev = to_vdev(ibqp->device); + struct vrdma_cmd_query_qp *cmd; + struct vrdma_rsp_query_qp *rsp; + struct scatterlist in, out; + int rc; + + /* Allocate command and response buffers */ + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) { + kfree(cmd); + return -ENOMEM; + } + + /* Fill input parameters */ + cmd->qpn = cpu_to_le32(vqp->qp_handle); + cmd->attr_mask = cpu_to_le32(attr_mask); /* Optional optimization */ + + sg_init_one(&in, cmd, sizeof(*cmd)); + sg_init_one(&out, rsp, sizeof(*rsp)); + + /* Execute command over control virtqueue */ + rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_QP, &in, &out); + if (rc) { + dev_err(&vdev->vdev->dev, + "VIRTIO_RDMA_CMD_QUERY_QP failed: qpn=0x%x err=%d\n", + vqp->qp_handle, rc); + goto out_free; + } + + /* Only copy results on success */ + attr->qp_state = rsp->attr.qp_state; + attr->cur_qp_state = rsp->attr.cur_qp_state; + attr->path_mtu = rsp->attr.path_mtu; + attr->path_mig_state = rsp->attr.path_mig_state; + attr->qkey = le32_to_cpu(rsp->attr.qkey); + attr->rq_psn = le32_to_cpu(rsp->attr.rq_psn); + attr->sq_psn = le32_to_cpu(rsp->attr.sq_psn); + attr->dest_qp_num = le32_to_cpu(rsp->attr.dest_qp_num); + attr->qp_access_flags = le32_to_cpu(rsp->attr.qp_access_flags); + attr->pkey_index = le16_to_cpu(rsp->attr.pkey_index); + attr->alt_pkey_index = le16_to_cpu(rsp->attr.alt_pkey_index); + attr->en_sqd_async_notify = rsp->attr.en_sqd_async_notify; + attr->sq_draining = rsp->attr.sq_draining; + attr->max_rd_atomic = rsp->attr.max_rd_atomic; + attr->max_dest_rd_atomic = 
rsp->attr.max_dest_rd_atomic; + attr->min_rnr_timer = rsp->attr.min_rnr_timer; + attr->port_num = rsp->attr.port_num; + attr->timeout = rsp->attr.timeout; + attr->retry_cnt = rsp->attr.retry_cnt; + attr->rnr_retry = rsp->attr.rnr_retry; + attr->alt_port_num = rsp->attr.alt_port_num; + attr->alt_timeout = rsp->attr.alt_timeout; + attr->rate_limit = le32_to_cpu(rsp->attr.rate_limit); + + /* Copy capabilities */ + vrdma_to_ib_qp_cap(&attr->cap, &rsp->attr.cap); + + /* Convert AH attributes (contains GRH + DIP) */ + vrdma_to_rdma_ah_attr(&attr->ah_attr, &rsp->attr.ah_attr); + vrdma_to_rdma_ah_attr(&attr->alt_ah_attr, &rsp->attr.alt_ah_attr); + + /* Fill init attributes (mostly static) */ + init_attr->event_handler = vqp->ibqp.event_handler; + init_attr->qp_context = vqp->ibqp.qp_context; + init_attr->send_cq = vqp->ibqp.send_cq; + init_attr->recv_cq = vqp->ibqp.recv_cq; + init_attr->srq = vqp->ibqp.srq; + init_attr->xrcd = NULL; /* Not supported in vRDMA */ + init_attr->cap = attr->cap; + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; /* Or driver default */ + init_attr->qp_type = vqp->ibqp.qp_type; + init_attr->create_flags = 0; + init_attr->port_num = vqp->port; + +out_free: + kfree(rsp); + kfree(cmd); + return rc; +} + +/** + * vrdma_reg_user_mr - Register a user memory region + * @pd: Protection domain + * @start: User virtual address of memory to register + * @length: Length of memory region + * virt_addr: Optional virtual address for rkey access (often same as start) + * @access_flags: Access permissions (IB_ACCESS_xxx) + * @udata: User data (optional, unused here) + * + * Locks down user pages, builds page table, and registers MR with backend. + * Returns pointer to ib_mr or ERR_PTR on failure. 
+ * + * Context: Process context (may sleep during ib_umem_get) + * Return: + * * Pointer to &mr->ibmr on success + * * ERR_PTR(-errno) on failure + */ +static struct ib_mr *vrdma_reg_user_mr(struct ib_pd *pd, u64 start, + u64 length, u64 virt_addr, + int access_flags, + struct ib_udata *udata) +{ + struct vrdma_dev *dev = to_vdev(pd->device); + struct vrdma_cmd_reg_user_mr *cmd; + struct vrdma_rsp_reg_user_mr *rsp; + struct vrdma_mr *mr; + struct ib_umem *umem; + struct sg_dma_page_iter sg_iter; + struct scatterlist in, out; + int rc = 0; + unsigned npages; + dma_addr_t *pages_flat = NULL; + + /* Step 1: Pin user memory pages */ + umem = ib_umem_get(pd->device, start, length, access_flags); + if (IS_ERR(umem)) { + dev_err(&dev->vdev->dev, "Failed to pin user memory: va=0x%llx len=%llu\n", + start, length); + return ERR_CAST(umem); + } + + npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > VRDMA_MAX_PAGES) { // e.g., VRDMA_MAX_PAGES = 512*512 + dev_err(&dev->vdev->dev, "Invalid number of pages: %u\n", npages); + rc = -EINVAL; + goto err_umem; + } + + /* Allocate command/response structures (GFP_KERNEL ok in process context) */ + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!cmd || !rsp || !mr) { + rc = -ENOMEM; + goto err_alloc; + } + + /* Initialize MR structure */ + mr->umem = umem; + mr->size = length; + mr->iova = virt_addr; + mr->max_pages = npages; + + /* Allocate contiguous DMA-mapped array for page addresses */ + pages_flat = dma_alloc_coherent(&dev->vdev->dev, + npages * sizeof(dma_addr_t), + &mr->dma_pages, GFP_KERNEL); + if (!pages_flat) { + dev_err(&dev->vdev->dev, "Failed to allocate DMA memory for page table\n"); + rc = -ENOMEM; + goto err_alloc; + } + mr->pages_k = &pages_flat; /* Treat as 2D: [i/512][i%512] */ + + /* Fill page table from ib_umem scatterlist */ + mr->npages = 0; + for_each_sg_dma_page(umem->sgt_append.sgt.sgl, &sg_iter, 
umem->sgt_append.sgt.nents, 0) { + dma_addr_t addr = sg_page_iter_dma_address(&sg_iter); + pages_flat[mr->npages++] = addr; + } + + /* Sanity check: should match ib_umem_num_pages() */ + WARN_ON(mr->npages != npages); + + /* Prepare command */ + cmd->pdn = cpu_to_le32(to_vpd(pd)->pd_handle); + cmd->start = cpu_to_le64(start); + cmd->length = cpu_to_le64(length); + cmd->virt_addr = cpu_to_le64(virt_addr); + cmd->access_flags = cpu_to_le32(access_flags); + cmd->pages = cpu_to_le64(mr->dma_pages); /* DMA address of page array */ + cmd->npages = cpu_to_le32(npages); + + sg_init_one(&in, cmd, sizeof(*cmd)); + sg_init_one(&out, rsp, sizeof(*rsp)); + + /* Send command to backend */ + rc = vrdma_exec_verbs_cmd(dev, VIRTIO_RDMA_CMD_REG_USER_MR, &in, &out); + if (rc) { + dev_err(&dev->vdev->dev, "Backend failed to register MR: %d\n", rc); + goto err_cmd; + } + + /* Copy results from response */ + mr->mr_handle = le32_to_cpu(rsp->mrn); + mr->ibmr.lkey = le32_to_cpu(rsp->lkey); + mr->ibmr.rkey = le32_to_cpu(rsp->rkey); + + /* Cleanup temporary allocations */ + kfree(cmd); + kfree(rsp); + + /* Link MR to PD if needed, initialize other fields */ + mr->ibmr.pd = pd; + mr->ibmr.device = pd->device; + mr->ibmr.type = IB_MR_TYPE_MEM_REG; + mr->ibmr.length = length; + + return &mr->ibmr; + +err_cmd: + dma_free_coherent(&dev->vdev->dev, npages * sizeof(dma_addr_t), + pages_flat, mr->dma_pages); +err_alloc: + kfree(mr); + kfree(rsp); + kfree(cmd); +err_umem: + ib_umem_release(umem); + return ERR_PTR(rc); +} + static const struct ib_device_ops vrdma_dev_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION, @@ -2554,7 +2882,10 @@ static const struct ib_device_ops vrdma_dev_ops = { .modify_qp = vrdma_modify_qp, .poll_cq = vrdma_poll_cq, .post_recv = vrdma_post_recv, - .post_send = vrdma_post_send, + .post_send = vrdma_post_send, + .query_pkey = vrdma_query_pkey, + .query_qp = vrdma_query_qp, + .reg_user_mr = vrdma_reg_user_mr, }; /** -- 2.43.0 还有这个
12-19
// SPDX-License-Identifier: GPL-2.0-only /* * MTD Oops/Panic logger * * Copyright © 2007 Nokia Corporation. All rights reserved. * * Author: Richard Purdie <rpurdie@openedhand.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/timekeeping.h> #include <linux/mtd/mtd.h> #include <linux/kmsg_dump.h> #include <linux/reboot.h> #include <linux/platform_device.h> #include <linux/io.h> /* Maximum MTD partition size */ #define MTDOOPS_MAX_MTD_SIZE (16 * 1024 * 1024) static unsigned long record_size = 4096; module_param(record_size, ulong, 0400); MODULE_PARM_DESC(record_size, "record size for MTD OOPS pages in bytes (default 4096)"); static char mtddev[80]; module_param_string(mtddev, mtddev, 80, 0400); MODULE_PARM_DESC(mtddev, "name or index number of the MTD device to use"); static int dump_oops = 1; module_param(dump_oops, int, 0600); MODULE_PARM_DESC(dump_oops, "set to 1 to dump oopses, 0 to only dump panics (default 1)"); static unsigned long lkmsg_record_size = 512 * 1024; extern struct raw_notifier_head pwrkey_irq_notifier_list; #define MAX_CMDLINE_PARAM_LEN 256 static char build_fingerprint[MAX_CMDLINE_PARAM_LEN] = {0}; module_param_string(fingerprint, build_fingerprint, MAX_CMDLINE_PARAM_LEN,0644); static int boot_mode = 0; module_param(boot_mode, int, 0600); MODULE_PARM_DESC(boot_mode, "boot_mode (default 0)"); #define MTDOOPS_KERNMSG_MAGIC_v1 0x5d005d00 /* Original */ #define MTDOOPS_KERNMSG_MAGIC_v2 0x5d005e00 /* Adds the timestamp */ #define MTDOOPS_HEADER_SIZE 8 enum mtd_dump_reason { MTD_DUMP_UNDEF, MTD_DUMP_PANIC, MTD_DUMP_OOPS, MTD_DUMP_EMERG, MTD_DUMP_SHUTDOWN, MTD_DUMP_RESTART, MTD_DUMP_POWEROFF, MTD_DUMP_LONG_PRESS, MTD_DUMP_MAX }; static char *kdump_reason[8] = { "Unknown", "Kernel Panic", "Oops!", 
"Emerg", "Shut Down", "Restart", "PowerOff", "Long Press" }; enum mtdoops_log_type { MTDOOPS_TYPE_UNDEF, MTDOOPS_TYPE_DMESG, MTDOOPS_TYPE_PMSG, }; static char *log_type[4] = { "Unknown", "LAST KMSG", "LAST LOGCAT" }; struct pmsg_buffer_hdr { uint32_t sig; atomic_t start; atomic_t size; uint8_t data[0]; }; struct pmsg_platform_data { unsigned long mem_size; phys_addr_t mem_address; unsigned long console_size; unsigned long pmsg_size; }; struct mtdoops_hdr { u32 seq; u32 magic; ktime_t timestamp; } __packed; static struct mtdoops_context { struct kmsg_dumper dump; struct notifier_block reboot_nb; struct notifier_block pwrkey_long_press_nb; struct pmsg_platform_data pmsg_data; int mtd_index; struct work_struct work_erase; struct work_struct work_write; struct mtd_info *mtd; int oops_pages; int nextpage; int nextcount; unsigned long *oops_page_used; unsigned long oops_buf_busy; void *oops_buf; } oops_cxt; static void mark_page_used(struct mtdoops_context *cxt, int page) { set_bit(page, cxt->oops_page_used); } static void mark_page_unused(struct mtdoops_context *cxt, int page) { clear_bit(page, cxt->oops_page_used); } static int page_is_used(struct mtdoops_context *cxt, int page) { return test_bit(page, cxt->oops_page_used); } static int mtdoops_erase_block(struct mtdoops_context *cxt, int offset) { struct mtd_info *mtd = cxt->mtd; u32 start_page_offset = mtd_div_by_eb(offset, mtd) * mtd->erasesize; u32 start_page = start_page_offset / record_size; u32 erase_pages = mtd->erasesize / record_size; struct erase_info erase; int ret; int page; erase.addr = offset; erase.len = mtd->erasesize; ret = mtd_erase(mtd, &erase); if (ret) { pr_warn("erase of region [0x%llx, 0x%llx] on \"%s\" failed\n", (unsigned long long)erase.addr, (unsigned long long)erase.len, mtddev); return ret; } /* Mark pages as unused */ for (page = start_page; page < start_page + erase_pages; page++) mark_page_unused(cxt, page); return 0; } static void mtdoops_erase(struct mtdoops_context *cxt) { struct 
mtd_info *mtd = cxt->mtd; int i = 0, j, ret, mod; /* We were unregistered */ if (!mtd) return; mod = (cxt->nextpage * record_size) % mtd->erasesize; if (mod != 0) { cxt->nextpage = cxt->nextpage + ((mtd->erasesize - mod) / record_size); if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; } while ((ret = mtd_block_isbad(mtd, cxt->nextpage * record_size)) > 0) { badblock: pr_warn("bad block at %08lx\n", cxt->nextpage * record_size); i++; cxt->nextpage = cxt->nextpage + (mtd->erasesize / record_size); if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; if (i == cxt->oops_pages / (mtd->erasesize / record_size)) { pr_err("all blocks bad!\n"); return; } } if (ret < 0) { pr_err("mtd_block_isbad failed, aborting\n"); return; } for (j = 0, ret = -1; (j < 3) && (ret < 0); j++) ret = mtdoops_erase_block(cxt, cxt->nextpage * record_size); if (ret >= 0) { pr_debug("ready %d, %d\n", cxt->nextpage, cxt->nextcount); return; } if (ret == -EIO) { ret = mtd_block_markbad(mtd, cxt->nextpage * record_size); if (ret < 0 && ret != -EOPNOTSUPP) { pr_err("block_markbad failed, aborting\n"); return; } } goto badblock; } /* Scheduled work - when we can't proceed without erasing a block */ static void mtdoops_workfunc_erase(struct work_struct *work) { struct mtdoops_context *cxt = container_of(work, struct mtdoops_context, work_erase); mtdoops_erase(cxt); } static void mtdoops_inc_counter(struct mtdoops_context *cxt, int panic) { cxt->nextpage++; if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; cxt->nextcount++; if (cxt->nextcount == 0xffffffff) cxt->nextcount = 0; if (page_is_used(cxt, cxt->nextpage)) { pr_debug("not ready %d, %d (erase %s)\n", cxt->nextpage, cxt->nextcount, panic ? 
"immediately" : "scheduled"); if (panic) { /* In case of panic, erase immediately */ mtdoops_erase(cxt); } else { /* Otherwise, schedule work to erase it "nicely" */ schedule_work(&cxt->work_erase); } } else { pr_debug("ready %d, %d (no erase)\n", cxt->nextpage, cxt->nextcount); } } static void mtdoops_write(struct mtdoops_context *cxt, int panic) { struct mtd_info *mtd = cxt->mtd; size_t retlen; struct mtdoops_hdr *hdr; int ret; if (test_and_set_bit(0, &cxt->oops_buf_busy)) return; /* Add mtdoops header to the buffer */ hdr = (struct mtdoops_hdr *)cxt->oops_buf; hdr->seq = cxt->nextcount; hdr->magic = MTDOOPS_KERNMSG_MAGIC_v2; hdr->timestamp = ktime_get_real(); if (panic) { ret = mtd_panic_write(mtd, cxt->nextpage * record_size, record_size, &retlen, cxt->oops_buf); if (ret == -EOPNOTSUPP) { pr_err("Cannot write from panic without panic_write\n"); goto out; } } else ret = mtd_write(mtd, cxt->nextpage * record_size, record_size, &retlen, cxt->oops_buf); if (retlen != record_size || ret < 0) pr_err("write failure at %ld (%td of %ld written), error %d\n", cxt->nextpage * record_size, retlen, record_size, ret); mark_page_used(cxt, cxt->nextpage); // memset(cxt->oops_buf, 0xff, record_size); // mtdoops_inc_counter(cxt, panic); out: clear_bit(0, &cxt->oops_buf_busy); } static void mtdoops_workfunc_write(struct work_struct *work) { struct mtdoops_context *cxt = container_of(work, struct mtdoops_context, work_write); mtdoops_write(cxt, 0); } static void find_next_position(struct mtdoops_context *cxt) { struct mtd_info *mtd = cxt->mtd; struct mtdoops_hdr hdr; int ret, page, maxpos = 0; u32 maxcount = 0xffffffff; size_t retlen; for (page = 0; page < cxt->oops_pages; page++) { if (mtd_block_isbad(mtd, page * record_size)) continue; /* Assume the page is used */ mark_page_used(cxt, page); ret = mtd_read(mtd, page * record_size, sizeof(hdr), &retlen, (u_char *)&hdr); if (retlen != sizeof(hdr) || (ret < 0 && !mtd_is_bitflip(ret))) { pr_err("read failure at %ld (%zu of %zu 
read), err %d\n", page * record_size, retlen, sizeof(hdr), ret); continue; } if (hdr.seq == 0xffffffff && hdr.magic == 0xffffffff) mark_page_unused(cxt, page); if (hdr.seq == 0xffffffff || (hdr.magic != MTDOOPS_KERNMSG_MAGIC_v1 && hdr.magic != MTDOOPS_KERNMSG_MAGIC_v2)) continue; if (maxcount == 0xffffffff) { maxcount = hdr.seq; maxpos = page; } else if (hdr.seq < 0x40000000 && maxcount > 0xc0000000) { maxcount = hdr.seq; maxpos = page; } else if (hdr.seq > maxcount && hdr.seq < 0xc0000000) { maxcount = hdr.seq; maxpos = page; } else if (hdr.seq > maxcount && hdr.seq > 0xc0000000 && maxcount > 0x80000000) { maxcount = hdr.seq; maxpos = page; } } if (maxcount == 0xffffffff) { cxt->nextpage = cxt->oops_pages - 1; cxt->nextcount = 0; } else { cxt->nextpage = maxpos; cxt->nextcount = maxcount; } mtdoops_inc_counter(cxt, 0); } static void mtdoops_add_reason(char *oops_buf, int reason, enum mtdoops_log_type type, int index, int nextpage) { char str_buf[512] = {0}; int ret_len = 0; struct timespec64 now; struct tm ts; char temp_buf[32] = {0}; int temp_len = 0; char BootMode[20] = {0}; unsigned long local_time; ktime_get_coarse_real_ts64(&now); /*set title time to UTC+8*/ local_time = (unsigned long)(now.tv_sec + 8 * 60 * 60); time64_to_tm(local_time, 0, &ts); if (boot_mode == 0) { strcpy(BootMode, "normal"); } else if (boot_mode == 1) { strcpy(BootMode, "recovery"); } else if (boot_mode == 2) { strcpy(BootMode, "poweroff_charger"); } temp_len = snprintf(temp_buf, 32,"\n ---mtdoops report start--- \n"); memcpy(oops_buf, temp_buf, temp_len); ret_len = snprintf(str_buf, 200, "\n```\n## Oops_Index: %d\n### Build: %s\n## REASON: %s\n#### LOG TYPE:%s\n## BOOT MODE:%s\n##### %04ld-%02d-%02d %02d:%02d:%02d\n```c\n", index, build_fingerprint, kdump_reason[reason], log_type[type], BootMode, ts.tm_year+1900, ts.tm_mon + 1, ts.tm_mday, ts.tm_hour, ts.tm_min, ts.tm_sec); if(ret_len >= sizeof(str_buf)) ret_len = sizeof(str_buf); memcpy(oops_buf + temp_len, str_buf, ret_len); } static 
void mtdoops_add_pmsg_head(char *oops_buf, enum mtdoops_log_type type) { char str_buf[80] = {0}; int ret_len = 0; struct timespec64 now; struct tm ts; unsigned long local_time; ktime_get_coarse_real_ts64(&now); local_time = (unsigned long)(now.tv_sec + 8 * 60 * 60); time64_to_tm(local_time, 0, &ts); ret_len = snprintf(str_buf, 80, "\n```\n#### LOG TYPE:%s\n#####%04ld-%02d-%02d %02d:%02d:%02d\n```\n", log_type[type], ts.tm_year + 1900, ts.tm_mon + 1, ts.tm_mday, ts.tm_hour, ts.tm_min, ts.tm_sec); memcpy(oops_buf, str_buf, ret_len); } static void mtdoops_do_dump(struct kmsg_dumper *dumper, enum mtd_dump_reason reason) { struct mtdoops_context *cxt = container_of(dumper, struct mtdoops_context, dump); struct kmsg_dump_iter iter; size_t ret_len = 0; void *pmsg_buffer_start = NULL; struct pmsg_buffer_hdr *p_hdr = NULL; int j = 0; int ret = 0; static int do_dump_count = 0; if(cxt->mtd == NULL) return; if(reason == KMSG_DUMP_SHUTDOWN || reason == KMSG_DUMP_EMERG) return; /* Only dump oopses if dump_oops is set */ if (reason == KMSG_DUMP_OOPS && !dump_oops) return; do_dump_count++; pr_err("%s start , count = %d , page = %d, reason = %d, dump_count = %d\n", __func__, cxt->nextcount, cxt->nextpage, reason, do_dump_count); if(do_dump_count>1) { for (j = 0, ret = -1; (j < 3) && (ret < 0); j++) ret = mtdoops_erase_block(cxt, cxt->nextpage * record_size); } kmsg_dump_rewind(&iter); if (test_and_set_bit(0, &cxt->oops_buf_busy)) return; kmsg_dump_get_buffer(&iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, lkmsg_record_size - MTDOOPS_HEADER_SIZE, &ret_len); clear_bit(0, &cxt->oops_buf_busy); mtdoops_add_reason(cxt->oops_buf + MTDOOPS_HEADER_SIZE, reason, MTDOOPS_TYPE_DMESG, cxt->nextcount, cxt->nextpage); pmsg_buffer_start = ioremap( ((cxt->pmsg_data.mem_address + cxt->pmsg_data.mem_size)- cxt->pmsg_data.pmsg_size), cxt->pmsg_data.mem_size); if (!device_base) { printk(KERN_ERR "ioremap failed!\n"); } p_hdr = (struct pmsg_buffer_hdr *)pmsg_buffer_start; pr_err("mtdoops_do_dump pmsg 
paddr = 0x%p \n", pmsg_buffer_start); if(p_hdr->sig == 0x43474244) { void *oopsbuf = cxt->oops_buf + (MTDOOPS_HEADER_SIZE + ret_len); uint8_t *p_buff_end = (uint8_t *)p_hdr->data + atomic_read(&p_hdr->size); int pmsg_cp_size = 0; int pstart = p_hdr->start.counter; int psize = p_hdr->size.counter; pmsg_cp_size = (record_size - (ret_len + MTDOOPS_HEADER_SIZE)); if (psize <= pmsg_cp_size) pmsg_cp_size = psize; if (pstart >= pmsg_cp_size) { memcpy(oopsbuf, p_hdr->data, pmsg_cp_size); } else { memcpy(oopsbuf, p_buff_end - (pmsg_cp_size - pstart), pmsg_cp_size - pstart); memcpy(oopsbuf + (pmsg_cp_size - pstart), p_hdr->data, pstart); } mtdoops_add_pmsg_head(cxt->oops_buf + (MTDOOPS_HEADER_SIZE + ret_len), MTDOOPS_TYPE_PMSG); } else pr_err("mtdoops: read pmsg failed sig = 0x%x \n", p_hdr->sig); if (reason == KMSG_DUMP_OOPS || reason == KMSG_DUMP_PANIC) { /* Panics must be written immediately */ mtdoops_write(cxt, 1); } else { /*we should write log immediately , if use work to write, *ufs will shutdown before write log finish */ mtdoops_write(cxt, 0); } pr_err("mtdoops_do_dump() finish \n"); } static int mtdoops_reboot_nb_handle(struct notifier_block *this, unsigned long event, void *ptr) { enum mtd_dump_reason reason; struct mtdoops_context *cxt = &oops_cxt; if (event == SYS_RESTART) reason = MTD_DUMP_RESTART; else if(event == SYS_POWER_OFF) reason = MTD_DUMP_POWEROFF; else return NOTIFY_OK; mtdoops_do_dump(&cxt->dump, reason); return NOTIFY_OK; } static int pwrkey_long_press_irq_event(struct notifier_block *this, unsigned long event, void *ptr) { struct mtdoops_context *cxt = &oops_cxt; mtdoops_do_dump(&cxt->dump, MTD_DUMP_LONG_PRESS); return NOTIFY_DONE; } static void mtdoops_do_null(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { return; } static void mtdoops_notify_add(struct mtd_info *mtd) { struct mtdoops_context *cxt = &oops_cxt; u64 mtdoops_pages = div_u64(mtd->size, record_size); int err; if (!strcmp(mtd->name, mtddev)) cxt->mtd_index = mtd->index; if 
(mtd->index != cxt->mtd_index || cxt->mtd_index < 0) return; if (mtd->size < mtd->erasesize * 2) { pr_err("MTD partition %d not big enough for mtdoops\n", mtd->index); return; } if (mtd->erasesize < record_size) { pr_err("eraseblock size of MTD partition %d too small\n", mtd->index); return; } if (mtd->size > MTDOOPS_MAX_MTD_SIZE) { pr_err("mtd%d is too large (limit is %d MiB)\n", mtd->index, MTDOOPS_MAX_MTD_SIZE / 1024 / 1024); return; } /* oops_page_used is a bit field */ cxt->oops_page_used = vmalloc(array_size(sizeof(unsigned long), DIV_ROUND_UP(mtdoops_pages, BITS_PER_LONG))); if (!cxt->oops_page_used) { pr_err("could not allocate page array\n"); return; } cxt->dump.max_reason = KMSG_DUMP_MAX; cxt->dump.dump = mtdoops_do_null; err = kmsg_dump_register(&cxt->dump); if (err) { pr_err("registering kmsg dumper failed, error %d\n", err); vfree(cxt->oops_page_used); cxt->oops_page_used = NULL; return; } /*for restart and power off*/ cxt->reboot_nb.notifier_call = mtdoops_reboot_nb_handle; cxt->reboot_nb.priority = 255; register_reboot_notifier(&cxt->reboot_nb); cxt->pwrkey_long_press_nb.notifier_call = pwrkey_long_press_irq_event; cxt->pwrkey_long_press_nb.priority = 255; raw_notifier_chain_register(&pwrkey_irq_notifier_list, &cxt->pwrkey_long_press_nb); cxt->mtd = mtd; cxt->oops_pages = (int)mtd->size / record_size; find_next_position(cxt); pr_info("Attached to MTD device %d\n", mtd->index); } static void mtdoops_notify_remove(struct mtd_info *mtd) { struct mtdoops_context *cxt = &oops_cxt; if (mtd->index != cxt->mtd_index || cxt->mtd_index < 0) return; if (kmsg_dump_unregister(&cxt->dump) < 0) pr_warn("could not unregister kmsg_dumper\n"); unregister_reboot_notifier(&cxt->reboot_nb); cxt->mtd = NULL; flush_work(&cxt->work_erase); flush_work(&cxt->work_write); } static struct mtd_notifier mtdoops_notifier = { .add = mtdoops_notify_add, .remove = mtdoops_notify_remove, }; static int mtdoops_parse_dt_u32(struct platform_device *pdev, const char *propname, u32 
default_value, u32 *value) { u32 val32 = 0; int ret; ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); if (ret == -EINVAL) { /* field is missing, use default value. */ val32 = default_value; } else if (ret < 0) { pr_err("failed to parse property %s: %d\n", propname, ret); return ret; } /* Sanity check our results. */ if (val32 > INT_MAX) { pr_err("%s %u > INT_MAX\n", propname, val32); return -EOVERFLOW; } *value = val32; return 0; } static int mtdoops_pmsg_probe(struct platform_device *pdev) { struct mtdoops_context *cxt = &oops_cxt; struct resource *res; u32 value; int ret; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { pr_err("failed to locate DT /reserved-memory resource\n"); return -EINVAL; } cxt->pmsg_data.mem_size = resource_size(res); cxt->pmsg_data.mem_address = res->start; #define parse_u32(name, field, default_value) { \ ret = mtdoops_parse_dt_u32(pdev, name, default_value, \ &value); \ if (ret < 0) \ return ret; \ field = value; \ } parse_u32("console-size", cxt->pmsg_data.console_size, 0); parse_u32("pmsg-size", cxt->pmsg_data.pmsg_size, 0); #undef parse_u32 pr_err( "pares mtd_dt, mem_address =0x%llx, mem_size =0x%lx \n", cxt->pmsg_data.mem_address, cxt->pmsg_data.mem_size); pr_err( "pares mtd_dt, pmsg_size =0x%lx, console-size =0x%lx \n", cxt->pmsg_data.pmsg_size, cxt->pmsg_data.console_size); return 0; } static const struct of_device_id dt_match[] = { { .compatible = "xiaomi,mtdoops_pmsg" }, {} }; static struct platform_driver mtdoops_pmsg_driver = { .probe = mtdoops_pmsg_probe, .driver = { .name = "mtdoops_pmsg", .of_match_table = dt_match, }, }; static int __init mtdoops_init(void) { struct mtdoops_context *cxt = &oops_cxt; int mtd_index; char *endp; if (strlen(mtddev) == 0) { pr_err("mtd device (mtddev=name/number) must be supplied\n"); return -EINVAL; } if ((record_size & 4095) != 0) { pr_err("record_size must be a multiple of 4096\n"); return -EINVAL; } if (record_size < 4096) { pr_err("record_size must be over 
4096 bytes\n"); return -EINVAL; } /* Setup the MTD device to use */ cxt->mtd_index = -1; mtd_index = simple_strtoul(mtddev, &endp, 0); if (*endp == '\0') cxt->mtd_index = mtd_index; cxt->oops_buf = kmalloc(record_size, GFP_KERNEL); if (!cxt->oops_buf) return -ENOMEM; memset(cxt->oops_buf, 0xff, record_size); cxt->oops_buf_busy = 0; INIT_WORK(&cxt->work_erase, mtdoops_workfunc_erase); INIT_WORK(&cxt->work_write, mtdoops_workfunc_write); platform_driver_register(&mtdoops_pmsg_driver); register_mtd_user(&mtdoops_notifier); return 0; } static void __exit mtdoops_exit(void) { struct mtdoops_context *cxt = &oops_cxt; unregister_mtd_user(&mtdoops_notifier); kfree(cxt->oops_buf); vfree(cxt->oops_page_used); } module_init(mtdoops_init); module_exit(mtdoops_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Richard Purdie <rpurdie@openedhand.com>"); MODULE_DESCRIPTION("MTD Oops/Panic console logger/driver"); 问题堆栈对应mtdoops.c文件上传,怎么修复这个问题
10-30
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值