diff mbox series

[RFC,1/9] RDMA/rv: Public interface for the RDMA Rendezvous module

Message ID 20210319125635.34492-2-kaike.wan@intel.com (mailing list archive)
State RFC
Headers show
Series A rendezvous module | expand

Commit Message

Wan, Kaike March 19, 2021, 12:56 p.m. UTC
From: Kaike Wan <kaike.wan@intel.com>

The RDMA Rendezvous (rv) module provides an interface for HPC
middleware to improve performance by caching memory region
registration, and to improve the scalability of RDMA transactions
through connection management between nodes. This mechanism
is implemented through the following ioctl requests:
- ATTACH: to attach to an RDMA device.
- REG_MEM: to register a user/kernel memory region.
- DEREG_MEM: to release application use of MR, allowing it to
             remain in cache.
- GET_CACHE_STATS: to get cache statistics.
- CONN_CREATE: to create an RC connection.
- CONN_CONNECT: to start the connection.
- CONN_GET_CONN_COUNT: to use as part of error recovery from lost
                       messages in the application.
- CONN_GET_STATS: to get connection statistics.
- GET_EVENT_STATS: to get the RDMA event statistics.
- POST_RDMA_WR_IMMED: to post an RDMA WRITE WITH IMMED request.

Signed-off-by: Todd Rimmer <todd.rimmer@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
---
 include/uapi/rdma/rv_user_ioctls.h | 558 +++++++++++++++++++++++++++++
 1 file changed, 558 insertions(+)
 create mode 100644 include/uapi/rdma/rv_user_ioctls.h

Comments

Jason Gunthorpe March 19, 2021, 4 p.m. UTC | #1
On Fri, Mar 19, 2021 at 08:56:27AM -0400, kaike.wan@intel.com wrote:
> From: Kaike Wan <kaike.wan@intel.com>
> 
> The RDMA Rendezvous (rv) module provides an interface for HPC
> middleware to improve performance by caching memory region
> registration, and to improve the scalability of RDMA transactions
> through connection management between nodes. This mechanism
> is implemented through the following ioctl requests:
> - ATTACH: to attach to an RDMA device.
> - REG_MEM: to register a user/kernel memory region.
> - DEREG_MEM: to release application use of MR, allowing it to
>              remain in cache.
> - GET_CACHE_STATS: to get cache statistics.
> - CONN_CREATE: to create an RC connection.
> - CONN_CONNECT: to start the connection.
> - CONN_GET_CONN_COUNT: to use as part of error recovery from lost
>                        messages in the application.
> - CONN_GET_STATS: to get connection statistics.
> - GET_EVENT_STATS: to get the RDMA event statistics.
> - POST_RDMA_WR_IMMED: to post an RDMA WRITE WITH IMMED request.
> 
> Signed-off-by: Todd Rimmer <todd.rimmer@intel.com>
> Signed-off-by: Kaike Wan <kaike.wan@intel.com>
>  include/uapi/rdma/rv_user_ioctls.h | 558 +++++++++++++++++++++++++++++
>  1 file changed, 558 insertions(+)
>  create mode 100644 include/uapi/rdma/rv_user_ioctls.h
> 
> diff --git a/include/uapi/rdma/rv_user_ioctls.h b/include/uapi/rdma/rv_user_ioctls.h
> new file mode 100644
> index 000000000000..97e35b722443
> +++ b/include/uapi/rdma/rv_user_ioctls.h
> @@ -0,0 +1,558 @@
> +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
> +/*
> + * Copyright(c) 2020 - 2021 Intel Corporation.
> + */
> +#ifndef __RV_USER_IOCTL_H__
> +#define __RV_USER_IOCTL_H__
> +#include <rdma/rdma_user_ioctl.h>
> +#include <rdma/ib_user_sa.h>
> +#include <rdma/ib_user_verbs.h>
> +
> +/* Checking /Documentation/userspace-api/ioctl/ioctl-number.rst */
> +#define RV_MAGIC RDMA_IOCTL_MAGIC

No.

> +#define RV_FILE_NAME "/dev/rv"

No.

> +
> +/*
> + * Handles are opaque to application; they are meaningful only to the
> + * RV driver
> + */
> +
> +/* this version of ABI */
> +#define RV_ABI_VER_MAJOR 1
> +#define RV_ABI_VER_MINOR 0

No, see my remarks to your other intel colleagues doing the ioasid
stuff.

> +struct rv_query_params_out {
> +		/* ABI version */
> +	__u16 major_rev;
> +	__u16 minor_rev;
> +	__u32 resv1;
> +	__aligned_u64 capability;
> +	__aligned_u64 resv2[6];

No pre-adding reserved stuff

> +};
> +
> +#define RV_IOCTL_QUERY _IOR(RV_MAGIC, 0xFC, struct rv_query_params_out)
> +
> +/* Mode for use of rv module by PSM */
> +#define RV_RDMA_MODE_USER 0	/* user MR caching only */
> +#define RV_RDMA_MODE_KERNEL 1	/* + kernel RC QPs with kernel MR caching */

Huh? Mode sounds bad.

> +/*
> + * mr_cache_size is in MBs and if 0 will use module param as default
> + * num_conn - number of QPs between each pair of nodes
> + * loc_addr - used to select client/listen vs rem_addr
> + * index_bits - num high bits of immed data with rv index
> + * loc_gid_index - SGID for client connections
> + * loc_gid[16] - to double check gid_index unchanged
> + * job_key[RV_MAX_JOB_KEY_LEN] = unique uuid per job
> + * job_key_len - len, if 0 matches jobs with len==0 only
> + * q_depth - size of QP and per QP CQs
> + * reconnect_timeout - in seconds from loss to restoration
> + * hb_interval - in milliseconds between heartbeats
> + */
> +struct rv_attach_params_in {
> +	char dev_name[RV_MAX_DEV_NAME_LEN];

Guessing this is a no too.

> +	__u32 mr_cache_size;
> +	__u8 rdma_mode;
> +
> +	/* additional information for RV_RDMA_MODE_KERNEL */
> +	__u8 port_num;
> +	__u8 num_conn;

Lots of alignment holes, don't do that either.

> +struct rv_attach_params {
> +	union {
> +		struct rv_attach_params_in in;
> +		struct rv_attach_params_out out;
> +	};
> +};

Yikes, no

> +/* The buffer is used to register a kernel mr */
> +#define IBV_ACCESS_KERNEL 0x80000000

Huh? WTF on on many levels

> +/*
> + * ibv_pd_handle - user space appl allocated pd
> + * ulen - driver_udata inlen
> + * *udata - driver_updata inbuf
> + */
> +struct rv_mem_params_in {
> +	__u32 ibv_pd_handle;
> +	__u32 cmd_fd_int;

Really? You want to reach into the command FD from a ULP and extract
objects? Double yikes. Did you do this properly, taking care of every
lifetime rule and handling disassociation?

> +	__aligned_u64 addr;
> +	__aligned_u64 length;
> +	__u32 access;
> +	size_t ulen;
> +	void *udata;

'void *' is wrong for ioctls.

> +struct rv_conn_get_stats_params_out {

Too many stats, don't you think? Most of the header seems to be stats
of one thing or another.

> +/*
> + * events placed on ring buffer for delivery to user space.
> + * Carefully sized to be a multiple of 64 bytes for cache alignment.
> + * Must pack to get good field alignment and desired 64B overall size
> + * Unlike verbs, all rv_event fields are defined even when
> + * rv_event.wc.status != IB_WC_SUCCESS. Only sent writes can report bad status.
> + * event_type - enum rv_event_type
> + * wc - send or recv work completions
> + *	status - ib_wc_status
> + *	resv1 - alignment
> + *	imm_data - for RV_WC_RECV_RDMA_WITH_IMM only
> + *	wr_id - PSM wr_id for RV_WC_RDMA_WRITE only
> + *	conn_handle - conn handle. For efficiency in completion processing, this
> + *		handle is the rv_conn handle, not the rv_user_conn.
> + *		Main use is sanity checks.  On Recv PSM must use imm_data to
> + *		efficiently identify source.
> + *	byte_len - unlike verbs API, this is always valid
> + *	resv2 - alignment
> + * cache_align -  not used, but forces overall struct to 64B size
> + */
> +struct rv_event {
> +	__u8		event_type;
> +	union {
> +		struct {
> +			__u8		status;
> +			__u16	resv1;
> +			__u32	imm_data;
> +			__aligned_u64	wr_id;
> +			__aligned_u64	conn_handle;
> +			__u32	byte_len;
> +			__u32	resv2;
> +		} __attribute__((__packed__)) wc;
> +		struct {
> +			__u8 pad[7];
> +			uint64_t pad2[7];
> +		} __attribute__((__packed__)) cache_align;
> +	};
> +} __attribute__((__packed__));

Uhh, mixing packed and aligned_u64 is pretty silly. I don't think this
needs to be packed or written in this tortured way.

> +
> +/*
> + * head - consumer removes here
> + * tail - producer adds here
> + * overflow_cnt - number of times producer overflowed ring and discarded
> + * pad - 64B cache alignment for entries
> + */
> +struct rv_ring_header {
> +	volatile __u32 head;
> +	volatile __u32 tail;
> +	volatile __u64 overflow_cnt;

No on volatile, and missed a __aligned here

This uapi needs to be much better. It looks like the mess from the PSM
char dev just re-imported here.

At the very least split the caching from the other operations and
follow normal ioctl design

And you need to rethink the uverbs stuff.

Jason
diff mbox series

Patch

diff --git a/include/uapi/rdma/rv_user_ioctls.h b/include/uapi/rdma/rv_user_ioctls.h
new file mode 100644
index 000000000000..97e35b722443
--- /dev/null
+++ b/include/uapi/rdma/rv_user_ioctls.h
@@ -0,0 +1,558 @@ 
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2020 - 2021 Intel Corporation.
+ */
+#ifndef __RV_USER_IOCTL_H__
+#define __RV_USER_IOCTL_H__
+#include <rdma/rdma_user_ioctl.h>
+#include <rdma/ib_user_sa.h>
+#include <rdma/ib_user_verbs.h>
+
+/* Checking /Documentation/userspace-api/ioctl/ioctl-number.rst */
+#define RV_MAGIC RDMA_IOCTL_MAGIC
+#define RV_FILE_NAME "/dev/rv"
+
+/*
+ * Handles are opaque to application; they are meaningful only to the
+ * RV driver
+ */
+
+/* this version of ABI */
+#define RV_ABI_VER_MAJOR 1
+#define RV_ABI_VER_MINOR 0
+
+struct rv_query_params_out {
+		/* ABI version */
+	__u16 major_rev;
+	__u16 minor_rev;
+	__u32 resv1;
+	__aligned_u64 capability;
+	__aligned_u64 resv2[6];
+};
+
+#define RV_IOCTL_QUERY _IOR(RV_MAGIC, 0xFC, struct rv_query_params_out)
+
+/* Mode for use of rv module by PSM */
+#define RV_RDMA_MODE_USER 0	/* user MR caching only */
+#define RV_RDMA_MODE_KERNEL 1	/* + kernel RC QPs with kernel MR caching */
+#define RV_RDMA_MODE_MAX 1
+
+#define RV_MAX_DEV_NAME_LEN IB_DEVICE_NAME_MAX
+#define RV_MAX_NUM_CONN 16
+#define RV_MAX_INDEX_BITS 12
+#define RV_MAX_JOB_KEY_LEN 16
+#define RV_MAX_CQ_ENTRIES 10000
+
+/*
+ * mr_cache_size is in MBs and if 0 will use module param as default
+ * num_conn - number of QPs between each pair of nodes
+ * loc_addr - used to select client/listen vs rem_addr
+ * index_bits - num high bits of immed data with rv index
+ * loc_gid_index - SGID for client connections
+ * loc_gid[16] - to double check gid_index unchanged
+ * job_key[RV_MAX_JOB_KEY_LEN] = unique uuid per job
+ * job_key_len - len, if 0 matches jobs with len==0 only
+ * q_depth - size of QP and per QP CQs
+ * reconnect_timeout - in seconds from loss to restoration
+ * hb_interval - in milliseconds between heartbeats
+ */
+struct rv_attach_params_in {
+	char dev_name[RV_MAX_DEV_NAME_LEN];
+	__u32 mr_cache_size;
+	__u8 rdma_mode;
+
+	/* additional information for RV_RDMA_MODE_KERNEL */
+	__u8 port_num;
+	__u8 num_conn;
+	__u32 loc_addr;
+	__u8 index_bits;
+	__u16 loc_gid_index;
+	__u8 loc_gid[16];
+	__u8 job_key[RV_MAX_JOB_KEY_LEN];
+	__u8 job_key_len;
+	__aligned_u64 service_id;
+	__aligned_u64 context;
+	__u32 cq_entries;
+	__u32 q_depth;
+	__u32 reconnect_timeout;
+	__u32 hb_interval;
+};
+
+/*
+ * rv_index - unique within job on given NIC
+ * mr_cache_size - in MBs
+ * q_depth - size of QP and per QP CQs
+ * reconnect_timeout - value being used
+ */
+struct rv_attach_params_out {
+	__u32 rv_index;
+	__u32 mr_cache_size;
+	__u32 q_depth;
+	__u32 reconnect_timeout;
+};
+
+struct rv_attach_params {
+	union {
+		struct rv_attach_params_in in;
+		struct rv_attach_params_out out;
+	};
+};
+
+#define RV_IOCTL_ATTACH		_IOWR(RV_MAGIC, 0xF1, struct rv_attach_params)
+
+/* The buffer is used to register a kernel mr */
+#define IBV_ACCESS_KERNEL 0x80000000
+
+/*
+ * ibv_pd_handle - user space appl allocated pd
+ * ulen - driver_udata inlen
+ * *udata - driver_updata inbuf
+ */
+struct rv_mem_params_in {
+	__u32 ibv_pd_handle;
+	__u32 cmd_fd_int;
+	__aligned_u64 addr;
+	__aligned_u64 length;
+	__u32 access;
+	size_t ulen;
+	void *udata;
+};
+
+struct rv_mem_params_out {
+	__aligned_u64 mr_handle;
+	__aligned_u64 iova;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct rv_mem_params {
+	union {
+		struct rv_mem_params_in in;
+		struct rv_mem_params_out out;
+	};
+};
+
+#define RV_IOCTL_REG_MEM	_IOWR(RV_MAGIC, 0xF3, struct rv_mem_params)
+
+struct rv_dereg_params_in {
+	__aligned_u64 mr_handle;
+	__aligned_u64 addr;
+	__aligned_u64 length;
+	__u32 access;
+};
+
+#define RV_IOCTL_DEREG_MEM	_IOW(RV_MAGIC, 0xF4, struct rv_dereg_params_in)
+
+/*
+ * MR cache statistics are as follows:
+ *   cache_size - Current memory in the cache in bytes
+ *   max_cache_size - Maximum of cache_size in bytes
+ *   limit_cache_size - Maximum allowed cache_size in MB
+ *   count - Current number of MRs in the cache
+ *   max_count - Maximum of count
+ *   inuse - Current number of MRs in the cache with refcount > 0
+ *   max_inuse - Maximum of inuse
+ *   inuse_bytes - Current number of bytes in cache for MRs with refcount > 0
+ *   max_inuse_bytes - Maximum of inuse_bytes
+ *   max_refcount - Maximum of refcount for an MR
+ *   Event counts:
+ *      hit - Cache hit
+ *      miss - Cache miss and added
+ *      full - Cache miss but can't add since full
+ *      failed - Cache miss but can't add since reg_mr failed
+ *      remove - Refcount == 0 and removed by mmu notifier event or cache deinit
+ *      evict - Removed from cache due to lack of cache space
+ *   Number of valid IOCTL_REG_MEM calls = hit+miss+full+failed
+ *      (counts omit some EINVAL and EFAULT REG_MEM use cases)
+ */
+
+struct rv_cache_stats_params_out {
+	__aligned_u64 cache_size;
+	__aligned_u64 max_cache_size;
+	__u32 limit_cache_size;
+	__u32 count;
+	__u32 max_count;
+	__u32 inuse;
+	__u32 max_inuse;
+	__aligned_u64 inuse_bytes;
+	__aligned_u64 max_inuse_bytes;
+	__u32 max_refcount;
+	__aligned_u64 hit;
+	__aligned_u64 miss;
+	__aligned_u64 full;
+	__aligned_u64 failed;
+	__aligned_u64 remove;
+	__aligned_u64 evict;
+};
+
+#define RV_IOCTL_GET_CACHE_STATS _IOR(RV_MAGIC, 0xF7, \
+				      struct rv_cache_stats_params_out)
+
+/*
+ * The create provides an ah_attr.  Field use is as follows:
+ * sanity checked with attach:
+ *	grh.sgid_index, port_num
+ *	possibly in future: src_path_bits
+ * identify QPs which can be shared
+ *	rem_addr, ah.is_global, grh.dgid or dlid
+ *	possibly in future: sl, traffic_class, flow_label, maybe static_rate
+ * sanity checked with connect path:
+ *	dlid, grh.dgid
+ *	could check: sl
+ *	don't want to check: static_rate (could negotiate down in path)
+ * validated with inbound REQ
+ *	port_num, grh.dgid or dlid
+ * Not used: hop_limit (locally resolve)
+ * rem_addr - used to select client/listen vs loc_addr
+ */
+struct rv_conn_create_params_in {
+	struct ib_uverbs_ah_attr ah;
+	__u32 rem_addr;
+	__aligned_u64 context;
+};
+
+/*
+ * handle - rv_user_conn for future conn/discon calls
+ * conn_handle - rv_conn for completions only
+ */
+struct rv_conn_create_params_out {
+	__aligned_u64 handle;
+	__aligned_u64 conn_handle;
+};
+
+struct rv_conn_create_params {
+	union {
+		struct rv_conn_create_params_in in;
+		struct rv_conn_create_params_out out;
+	};
+};
+
+#define RV_IOCTL_CONN_CREATE	_IOWR(RV_MAGIC, 0xF8, \
+				      struct rv_conn_create_params)
+
+struct rv_conn_connect_params_in {
+	__aligned_u64 handle;
+	struct ib_user_path_rec path;
+};
+
+#define RV_IOCTL_CONN_CONNECT	_IOW(RV_MAGIC, 0xF9, \
+				     struct rv_conn_connect_params_in)
+
+struct rv_conn_connected_params_in {
+	__aligned_u64 handle;
+};
+
+#define RV_IOCTL_CONN_CONNECTED	_IOW(RV_MAGIC, 0xFA, \
+				     struct rv_conn_connected_params_in)
+
+/*
+ * get connection count for a specific sconn
+ * returns:
+ *	0 - count returned
+ *	EIO - connection lost and unrecoverable
+ *	EINVAL - invalid handle and/or index
+ * A 32b count is sufficient to handle constant RV reconnects, with a
+ * 100ms delay between each established connection, for up to 13 years.
+ */
+struct rv_conn_get_conn_count_params_in {
+	__aligned_u64 handle;
+	__u8 index;
+};
+
+/* we return count as an output parameter (vs ioctl ret) so can use full 32b */
+struct rv_conn_get_conn_count_params_out {
+	__u32 count;
+};
+
+struct rv_conn_get_conn_count_params {
+	union {
+		struct rv_conn_get_conn_count_params_in in;
+		struct rv_conn_get_conn_count_params_out out;
+	};
+};
+
+#define RV_IOCTL_CONN_GET_CONN_COUNT	_IOWR(RV_MAGIC, 0xFF, \
+					   struct rv_conn_get_conn_count_params)
+
+/* index to get agg of sconn in given conn */
+#define RV_CONN_STATS_AGGREGATE 255
+
+/*
+ * handle - if 0, aggregate of all rv's sconn returned
+ * index - ignored if !handle, otherwise specific sconn index
+ */
+struct rv_conn_get_stats_params_in {
+	__aligned_u64 handle;
+	__u8 index;
+};
+
+/*
+ * flags can be combined when get aggregate results
+ * so separate bits for client vs server
+ */
+#define RV_CONN_STAT_FLAG_SERVER 0x01
+#define RV_CONN_STAT_FLAG_CLIENT 0x02
+#define RV_CONN_STAT_FLAG_WAS_CONNECTED 0x04
+
+/*
+ * index - mimics input value
+ * flags
+ * num_conn - total QPs included
+ * CM events
+ *	req_error - IB_CM_REQ_ERROR
+ *	req_recv - IB_CM_REQ_RECEIVED
+ *	rep_error - IB_CM_REP_ERROR
+ *	rep_recv - IB_CM_REP_RECEIVED
+ *	rtu_recv - IB_CM_RTU_RECEIVED
+ *	established - IB_CM_USER_ESTABLISHED - via ib_cm_notify
+ *	dreq_error - IB_CM_DREQ_ERROR
+ *	dreq_recv - IB_CM_DREQ_RECEIVED
+ *	drep_recv - IB_CM_DREP_RECEIVED
+ *	timewait - IB_CM_TIMEWAIT_EXIT
+ *	mra_recv - IB_CM_MRA_RECEIVED
+ *	rej_recv - IB_CM_REJ_RECEIVED
+ *	lap_error - IB_CM_LAP_ERROR
+ *	lap_recv - IB_CM_LAP_RECEIVED
+ *	apr_recv - IB_CM_APR_RECEIVED
+ *	unexp_event - SIDR and any others
+ * outbound CM messages
+ *	req_sent - CM REQ
+ *	rep_sent - CM REP
+ *	rtu_sent - CM RTU
+ *	rej_sent - CM REJ
+ *	dreq_sent - CM DREQ
+ *	drep_sent - CM DREP
+ *	(re)connect time does not include wait nor resolver time
+ *	wait_time - microseconds for initial connect
+ *	resolve_time - microseconds for initial connect
+ *	connect_time - microseconds for initial connect
+ *	connected_time - microseconds were connected
+ *	resolve - attempts at resolving
+ *	resolve_fail - hard failures
+ *	conn_recovery - # times recovered connection
+ *	rewait_time - microseconds for connect recovery
+ *	reresolve_time - microseconds for connect recovery
+ *	reconnect_time - microseconds for connect recovery
+ *	max_rewait_time - microseconds for connect recovery
+ *	max_reresolve_time - microseconds for connect recovery
+ *	max_reconnect_time - microseconds for connect recovery
+ *	reresolve - attempts at resolving
+ *	reresolve_fail - hard failures
+ *	post_write - successful post_rdma_write
+ *	post_write_fail - failed at time of posting
+ *	post_write_bytes - for successful post
+ *	outstand_send_write - sent RDMA Write waiting for CQE
+ *	send_write_cqe - successful sent RDMA Write CQE
+ *	send_write_cqe_fail - sent RDMA Write CQE with bad status
+ *	recv_write_cqe - successful recv RDMA Write CQE
+ *	recv_write_bytes - successful recv RDMA Write
+ *	recv_cqe_fail - recv CQE with bad status
+ *	post_hb - successful post of heartbeat
+ *	post_hb_fail - failed at time of posting
+ *	send_hb_cqe - successful sent heartbeat CQE
+ *	send_hb_cqe_fail - sent heartbeat CQE with bad status
+ *	recv_hb_cqe - successful recv heartbeat CQE
+ */
+struct rv_conn_get_stats_params_out {
+	__u8 index;
+	__u8 flags;
+	__u32 num_conn;
+
+	/* CM events */
+	__u32 req_error;
+	__u32 req_recv;
+	__u32 rep_error;
+	__u32 rep_recv;
+	__u32 rtu_recv;
+	__u32 established;
+	__u32 dreq_error;
+	__u32 dreq_recv;
+	__u32 drep_recv;
+	__u32 timewait;
+	__u32 mra_recv;
+	__u32 rej_recv;
+	__u32 lap_error;
+	__u32 lap_recv;
+	__u32 apr_recv;
+	__u32 unexp_event;
+
+	/* outbound CM messages */
+	__u32 req_sent;
+	__u32 rep_sent;
+	__u32 rtu_sent;
+	__u32 rej_sent;
+	__u32 dreq_sent;
+	__u32 drep_sent;
+	__aligned_u64 wait_time;
+	__aligned_u64 resolve_time;
+	__aligned_u64 connect_time;
+	__aligned_u64 connected_time;
+	__u32 resolve;
+	__u32 resolve_fail;
+	__u32 conn_recovery;
+	__aligned_u64 rewait_time;
+	__aligned_u64 reresolve_time;
+	__aligned_u64 reconnect_time;
+	__aligned_u64 max_rewait_time;
+	__aligned_u64 max_reresolve_time;
+	__aligned_u64 max_reconnect_time;
+	__u32 reresolve;
+	__u32 reresolve_fail;
+
+	__aligned_u64 post_write;
+	__aligned_u64 post_write_fail;
+	__aligned_u64 post_write_bytes;
+	__u32 outstand_send_write;
+	__aligned_u64 send_write_cqe;
+	__aligned_u64 send_write_cqe_fail;
+
+	__aligned_u64 recv_write_cqe;
+	__aligned_u64 recv_write_bytes;
+	__aligned_u64 recv_cqe_fail;
+
+	__aligned_u64 post_hb;
+	__aligned_u64 post_hb_fail;
+	__aligned_u64 send_hb_cqe;
+	__aligned_u64 send_hb_cqe_fail;
+	__aligned_u64 recv_hb_cqe;
+};
+
+struct rv_conn_get_stats_params {
+	union {
+		struct rv_conn_get_stats_params_in in;
+		struct rv_conn_get_stats_params_out out;
+	};
+};
+
+#define RV_IOCTL_CONN_GET_STATS	_IOWR(RV_MAGIC, 0xFD, \
+				      struct rv_conn_get_stats_params)
+
+/*
+ * send_write_cqe - successful sent RDMA Write CQE
+ * send_write_cqe_fail - sent RDMA Write CQE with bad status
+ * send_write_bytes - for successful send
+ * recv_write_cqe - successful recv RDMA Write CQE
+ * recv_write_cqe_fail - recv RDMA Write CQE with bad status
+ * recv_write_bytes - successful recv RDMA Write
+ */
+struct rv_event_stats_params_out {
+	__aligned_u64 send_write_cqe;
+	__aligned_u64 send_write_cqe_fail;
+	__aligned_u64 send_write_bytes;
+
+	__aligned_u64 recv_write_cqe;
+	__aligned_u64 recv_write_cqe_fail;
+	__aligned_u64 recv_write_bytes;
+};
+
+#define RV_IOCTL_GET_EVENT_STATS _IOR(RV_MAGIC, 0xFE, \
+				      struct rv_event_stats_params_out)
+
+/*
+ * handle - from create_conn
+ * application source buffer and a kernel lkey for it
+ *	loc_addr
+ *	loc_mr_handle
+ * local MR - selected by 3-tuple (addr, len, access)
+ *	loc_mr_addr
+ *	loc_mr_length
+ *	loc_mr_access
+ * remote application dest buffer and a kernel rkey for it
+ *	rem_addr
+ *	rkey
+ * length
+ * wr_id - application context, included in RV SQ completion events
+ * immed
+ */
+struct rv_post_write_params_in {
+	__aligned_u64 handle;
+	__aligned_u64 loc_addr;
+	__aligned_u64 loc_mr_handle;
+	__aligned_u64 loc_mr_addr;
+	__aligned_u64 loc_mr_length;
+	__u32 loc_mr_access;
+	__u32 rkey;
+	__aligned_u64 rem_addr;
+	__aligned_u64 length;
+	__aligned_u64 wr_id;
+	__u32 immed;
+};
+
+struct rv_post_write_params_out {
+	__u8 sconn_index;
+	__u32 conn_count;
+};
+
+struct rv_post_write_params {
+	union {
+		struct rv_post_write_params_in in;
+		struct rv_post_write_params_out out;
+	};
+};
+
+#define RV_IOCTL_POST_RDMA_WR_IMMED _IOWR(RV_MAGIC, 0xFB, \
+					  struct rv_post_write_params)
+
+enum rv_event_type {
+	RV_WC_RDMA_WRITE,		/* send RDMA Write CQE */
+	RV_WC_RECV_RDMA_WITH_IMM,	/* recv RDMA Write w/immed CQE */
+};
+
+/*
+ * events placed on ring buffer for delivery to user space.
+ * Carefully sized to be a multiple of 64 bytes for cache alignment.
+ * Must pack to get good field alignment and desired 64B overall size
+ * Unlike verbs, all rv_event fields are defined even when
+ * rv_event.wc.status != IB_WC_SUCCESS. Only sent writes can report bad status.
+ * event_type - enum rv_event_type
+ * wc - send or recv work completions
+ *	status - ib_wc_status
+ *	resv1 - alignment
+ *	imm_data - for RV_WC_RECV_RDMA_WITH_IMM only
+ *	wr_id - PSM wr_id for RV_WC_RDMA_WRITE only
+ *	conn_handle - conn handle. For efficiency in completion processing, this
+ *		handle is the rv_conn handle, not the rv_user_conn.
+ *		Main use is sanity checks.  On Recv PSM must use imm_data to
+ *		efficiently identify source.
+ *	byte_len - unlike verbs API, this is always valid
+ *	resv2 - alignment
+ * cache_align -  not used, but forces overall struct to 64B size
+ */
+struct rv_event {
+	__u8		event_type;
+	union {
+		struct {
+			__u8		status;
+			__u16	resv1;
+			__u32	imm_data;
+			__aligned_u64	wr_id;
+			__aligned_u64	conn_handle;
+			__u32	byte_len;
+			__u32	resv2;
+		} __attribute__((__packed__)) wc;
+		struct {
+			__u8 pad[7];
+			uint64_t pad2[7];
+		} __attribute__((__packed__)) cache_align;
+	};
+} __attribute__((__packed__));
+
+/*
+ * head - consumer removes here
+ * tail - producer adds here
+ * overflow_cnt - number of times producer overflowed ring and discarded
+ * pad - 64B cache alignment for entries
+ */
+struct rv_ring_header {
+	volatile __u32 head;
+	volatile __u32 tail;
+	volatile __u64 overflow_cnt;
+	__aligned_u64 pad[6];
+	struct rv_event entries[];
+};
+
+#define RV_RING_ALLOC_LEN(num_entries) \
+	((__u32)(((num_entries) * sizeof(struct rv_event)) + \
+	       sizeof(struct rv_ring_header)))
+
+#endif /* __RV_USER_IOCTL_H__ */