| Message ID | 20190131130850.6850-4-yuval.shaia@oracle.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Misc fixes to pvrdma device |
Hi Yuval,

On 1/31/19 3:08 PM, Yuval Shaia wrote:
> To protect against CPU over utilization when guest performs unneeded
> busy waiting loop on an empty CQ.
>
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> ---
>  hw/rdma/rdma_backend.c      | 11 +++++++----
>  hw/rdma/rdma_backend.h      |  2 +-
>  hw/rdma/rdma_rm.c           |  1 +
>  hw/rdma/rdma_rm_defs.h      |  6 +++++-
>  hw/rdma/vmw/pvrdma_qp_ops.c | 24 +++++++++++++++++++++++-
>  5 files changed, 37 insertions(+), 7 deletions(-)
>
> diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
> index 2f6372f8f0..b7d6afb5da 100644
> --- a/hw/rdma/rdma_backend.c
> +++ b/hw/rdma/rdma_backend.c
> @@ -64,9 +64,9 @@ static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
>      comp_handler(ctx, &wc);
>  }
>
> -static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
> +static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
>  {
> -    int i, ne;
> +    int i, ne, total_ne = 0;
>      BackendCtx *bctx;
>      struct ibv_wc wc[2];
>
> @@ -76,6 +76,7 @@ static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
>          trace_rdma_poll_cq(ne, ibcq);
>
>          for (i = 0; i < ne; i++) {
> +            total_ne++;

It seems 'i' and 'total_ne' hold the same value, do you need them both?

>              bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
>              if (unlikely(!bctx)) {
>                  rdma_error_report("No matching ctx for req %"PRId64,
> @@ -93,6 +94,8 @@ static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
>      if (ne < 0) {
>          rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
>      }
> +
> +    return total_ne;
>  }
>
>  static void *comp_handler_thread(void *arg)
> @@ -267,9 +270,9 @@ int rdma_backend_query_port(RdmaBackendDev *backend_dev,
>      return 0;
>  }
>
> -void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
> +int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
>  {
> -    rdma_poll_cq(rdma_dev_res, cq->ibcq);
> +    return rdma_poll_cq(rdma_dev_res, cq->ibcq);
>  }
>
>  static GHashTable *ah_hash;
> diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
> index 5114c90e67..36305cd148 100644
> --- a/hw/rdma/rdma_backend.h
> +++ b/hw/rdma/rdma_backend.h
> @@ -85,7 +85,7 @@ void rdma_backend_destroy_mr(RdmaBackendMR *mr);
>  int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
>                             int cqe);
>  void rdma_backend_destroy_cq(RdmaBackendCQ *cq);
> -void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
> +int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
>
>  int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
>                             RdmaBackendPD *pd, RdmaBackendCQ *scq,
> diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
> index 64c6ea1a4e..1ba77ac42c 100644
> --- a/hw/rdma/rdma_rm.c
> +++ b/hw/rdma/rdma_rm.c
> @@ -261,6 +261,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
>      if (!cq) {
>          return -ENOMEM;
>      }
> +    atomic_set(&cq->missing_cqe, 0);
>
>      cq->opaque = opaque;
>      cq->notify = CNT_CLEAR;
> diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
> index 0ba61d1838..08692e87d4 100644
> --- a/hw/rdma/rdma_rm_defs.h
> +++ b/hw/rdma/rdma_rm_defs.h
> @@ -34,7 +34,9 @@
>  #define MAX_QP_INIT_RD_ATOM 16
>  #define MAX_AH 64
>
> -#define MAX_RM_TBL_NAME 16
> +#define MAX_RM_TBL_NAME 16
> +#define MAX_CONSEQ_EMPTY_POLL_CQ 2048 /* considered as error above this */
> +
>  typedef struct RdmaRmResTbl {
>      char name[MAX_RM_TBL_NAME];
>      QemuMutex lock;
> @@ -59,6 +61,8 @@ typedef struct RdmaRmCQ {
>      RdmaBackendCQ backend_cq;
>      void *opaque;
>      CQNotificationType notify;
> +    int missing_cqe;

Maybe cq_empty_hit_cnt? We don't really have a missing cqe.

> +    int conseq_empty_poll;
>  } RdmaRmCQ;
>
>  /* MR (DMA region) */
> diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
> index 16db726dac..5d650a4943 100644
> --- a/hw/rdma/vmw/pvrdma_qp_ops.c
> +++ b/hw/rdma/vmw/pvrdma_qp_ops.c
> @@ -60,6 +60,8 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
>          return -EINVAL;
>      }
>
> +    atomic_dec(&cq->missing_cqe);
> +

Should we set it to 0 here? (If we are counting cq-empty hits)

>      ring = (PvrdmaRing *)cq->opaque;
>
>      /* Step #1: Put CQE on CQ ring */
> @@ -141,12 +143,15 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
>      PvrdmaRing *ring;
>      int sgid_idx;
>      union ibv_gid *sgid;
> +    RdmaRmCQ *cq;
>
>      qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
>      if (unlikely(!qp)) {
>          return;
>      }
>
> +    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->send_cq_handle);
> +
>      ring = (PvrdmaRing *)qp->opaque;
>
>      wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring);
> @@ -186,6 +191,7 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
>              continue;
>          }
>
> +        atomic_inc(&cq->missing_cqe);
>          rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type,
>                                 (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
>                                 sgid_idx, sgid,
> @@ -204,12 +210,15 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
>      RdmaRmQP *qp;
>      PvrdmaRqWqe *wqe;
>      PvrdmaRing *ring;
> +    RdmaRmCQ *cq;
>
>      qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
>      if (unlikely(!qp)) {
>          return;
>      }
>
> +    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->recv_cq_handle);
> +
>      ring = &((PvrdmaRing *)qp->opaque)[1];
>
>      wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring);
> @@ -231,6 +240,7 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
>              continue;
>          }
>
> +        atomic_inc(&cq->missing_cqe);
>          rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
>                                 &qp->backend_qp, qp->qp_type,
>                                 (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
> @@ -245,11 +255,23 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
>  void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
>  {
>      RdmaRmCQ *cq;
> +    int polled;
>
>      cq = rdma_rm_get_cq(dev_res, cq_handle);
>      if (!cq) {
>          return;
>      }
>
> -    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> +    polled = rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> +    if (!polled) {
> +        if (cq->conseq_empty_poll == MAX_CONSEQ_EMPTY_POLL_CQ) {
> +            rdma_warn_report("%d consequtive empty polls from CQ %d, missing cqe %d",
> +                             cq->conseq_empty_poll, cq_handle,
> +                             atomic_read(&cq->missing_cqe));
> +            cq->conseq_empty_poll = 0;
> +        }
> +        cq->conseq_empty_poll++;
> +    } else {
> +        cq->conseq_empty_poll = 0;
> +    }
>  }

So we don't really protect against high CPU usage, we only warn.
Are both counters interesting?

Thanks,
Marcel
On Wed, Feb 06, 2019 at 12:14:24PM +0200, Marcel Apfelbaum wrote:
> Hi Yuval,
>
> On 1/31/19 3:08 PM, Yuval Shaia wrote:
> > To protect against CPU over utilization when guest performs unneeded
> > busy waiting loop on an empty CQ.
> >
> > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>

[...]

> > @@ -76,6 +76,7 @@ static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
> >          trace_rdma_poll_cq(ne, ibcq);
> >
> >          for (i = 0; i < ne; i++) {
> > +            total_ne++;
>
> It seems 'i' and 'total_ne' hold the same value, do you need them both?

The scope is different: while 'i' is limited to one ibv_poll_cq call,
total_ne keeps counting until the whole ibv_poll_cq loop is done.

[...]

> > @@ -59,6 +61,8 @@ typedef struct RdmaRmCQ {
> >      RdmaBackendCQ backend_cq;
> >      void *opaque;
> >      CQNotificationType notify;
> > +    int missing_cqe;
>
> Maybe cq_empty_hit_cnt? We don't really have a missing cqe.

The idea here is to count the CQEs we expect to get but that are still
missing, so the name is fine.

> > +    int conseq_empty_poll;
> >  } RdmaRmCQ;

[...]

> > @@ -60,6 +60,8 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
> >          return -EINVAL;
> >      }
> >
> > +    atomic_dec(&cq->missing_cqe);
> > +
>
> Should we set it to 0 here? (If we are counting cq-empty hits)

[...]

> > -    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> > +    polled = rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> > +    if (!polled) {
> > +        if (cq->conseq_empty_poll == MAX_CONSEQ_EMPTY_POLL_CQ) {
> > +            rdma_warn_report("%d consequtive empty polls from CQ %d, missing cqe %d",
> > +                             cq->conseq_empty_poll, cq_handle,
> > +                             atomic_read(&cq->missing_cqe));
> > +            cq->conseq_empty_poll = 0;
> > +        }
> > +        cq->conseq_empty_poll++;
> > +    } else {
> > +        cq->conseq_empty_poll = 0;
> > +    }
> >  }
>
> So we don't really protect against high CPU usage, we only warn.

Correct, just a warning. But I accept your suggestion to add this as
another counter and get rid of the warning.

> Are both counters interesting?

Both? The 'conseq' and 'missing' ones? The idea is that 'missing'
counts the total missing CQEs, while 'conseq' (short for consecutive)
counts the number of times the guest issues consecutive polls on an
empty CQ (note how it is zeroed when poll_cq returns something).

> Thanks,
> Marcel
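To make the scope answer concrete, here is a minimal standalone sketch of the loop shape being described. It is a reconstruction: the do/while surrounding the quoted hunks is assumed from the reply, and `poll_cq_sketch` is a simplified stand-in, not the actual QEMU function.

```c
#include <infiniband/verbs.h>

/* Sketch of the rdma_poll_cq() loop shape: 'i' and 'ne' are per-call,
 * total_ne accumulates across all ibv_poll_cq() calls. */
static int poll_cq_sketch(struct ibv_cq *ibcq)
{
    int i, ne, total_ne = 0;
    struct ibv_wc wc[2];

    do {
        /* Drain up to 2 CQEs per call; 'i' resets on every iteration. */
        ne = ibv_poll_cq(ibcq, 2, wc);
        for (i = 0; i < ne; i++) {
            total_ne++;            /* keeps counting across calls */
            /* ... handle wc[i] (completion-context lookup etc.) ... */
        }
    } while (ne > 0);

    return total_ne;               /* total CQEs drained this invocation */
}
```

So on any single pass 'i' never exceeds 1, while total_ne can grow as large as the number of CQEs sitting in the queue.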
> > @@ -60,6 +60,8 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
> >          return -EINVAL;
> >      }
> >
> > +    atomic_dec(&cq->missing_cqe);
> > +
>
> Should we set it to 0 here? (If we are counting cq-empty hits)

No, this counter just counts the number of missing CQEs. It is
increased on every WQE we post and decreased on every CQE we receive;
what remains is the number of missing CQEs.

> >      ring = (PvrdmaRing *)cq->opaque;
> >
> >      /* Step #1: Put CQE on CQ ring */
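A hypothetical standalone model of this accounting, using C11 atomics in place of QEMU's atomic_inc()/atomic_dec() helpers (`wqe_posted` and `cqe_delivered` are illustrative names, not functions from the patch):

```c
#include <stdatomic.h>

static atomic_int missing_cqe;   /* completions expected but not yet seen */

/* Called where a WQE is posted (pvrdma_qp_send()/pvrdma_qp_recv()):
 * one more completion is now expected. */
static void wqe_posted(void)
{
    atomic_fetch_add(&missing_cqe, 1);
}

/* Called where a CQE reaches the guest (pvrdma_post_cqe()):
 * one expected completion has arrived. */
static void cqe_delivered(void)
{
    atomic_fetch_sub(&missing_cqe, 1);
}

/* At any instant: missing_cqe == WQEs posted - CQEs delivered,
 * i.e. completions still outstanding -- which is why it is
 * decremented rather than reset to 0. */
```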
diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 2f6372f8f0..b7d6afb5da 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -64,9 +64,9 @@ static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
     comp_handler(ctx, &wc);
 }
 
-static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
+static int rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
 {
-    int i, ne;
+    int i, ne, total_ne = 0;
     BackendCtx *bctx;
     struct ibv_wc wc[2];
 
@@ -76,6 +76,7 @@ static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
         trace_rdma_poll_cq(ne, ibcq);
 
         for (i = 0; i < ne; i++) {
+            total_ne++;
             bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
             if (unlikely(!bctx)) {
                 rdma_error_report("No matching ctx for req %"PRId64,
@@ -93,6 +94,8 @@ static void rdma_poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
     if (ne < 0) {
         rdma_error_report("ibv_poll_cq fail, rc=%d, errno=%d", ne, errno);
     }
+
+    return total_ne;
 }
 
 static void *comp_handler_thread(void *arg)
@@ -267,9 +270,9 @@ int rdma_backend_query_port(RdmaBackendDev *backend_dev,
     return 0;
 }
 
-void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
+int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
 {
-    rdma_poll_cq(rdma_dev_res, cq->ibcq);
+    return rdma_poll_cq(rdma_dev_res, cq->ibcq);
 }
 
 static GHashTable *ah_hash;
diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 5114c90e67..36305cd148 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -85,7 +85,7 @@ void rdma_backend_destroy_mr(RdmaBackendMR *mr);
 int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                            int cqe);
 void rdma_backend_destroy_cq(RdmaBackendCQ *cq);
-void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
+int rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq);
 
 int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                            RdmaBackendPD *pd, RdmaBackendCQ *scq,
diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 64c6ea1a4e..1ba77ac42c 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -261,6 +261,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
     if (!cq) {
         return -ENOMEM;
     }
+    atomic_set(&cq->missing_cqe, 0);
 
     cq->opaque = opaque;
     cq->notify = CNT_CLEAR;
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 0ba61d1838..08692e87d4 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -34,7 +34,9 @@
 #define MAX_QP_INIT_RD_ATOM 16
 #define MAX_AH 64
 
-#define MAX_RM_TBL_NAME 16
+#define MAX_RM_TBL_NAME 16
+#define MAX_CONSEQ_EMPTY_POLL_CQ 2048 /* considered as error above this */
+
 typedef struct RdmaRmResTbl {
     char name[MAX_RM_TBL_NAME];
     QemuMutex lock;
@@ -59,6 +61,8 @@ typedef struct RdmaRmCQ {
     RdmaBackendCQ backend_cq;
     void *opaque;
     CQNotificationType notify;
+    int missing_cqe;
+    int conseq_empty_poll;
 } RdmaRmCQ;
 
 /* MR (DMA region) */
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 16db726dac..5d650a4943 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -60,6 +60,8 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
         return -EINVAL;
     }
 
+    atomic_dec(&cq->missing_cqe);
+
     ring = (PvrdmaRing *)cq->opaque;
 
     /* Step #1: Put CQE on CQ ring */
@@ -141,12 +143,15 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
     PvrdmaRing *ring;
     int sgid_idx;
     union ibv_gid *sgid;
+    RdmaRmCQ *cq;
 
     qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
     if (unlikely(!qp)) {
         return;
     }
 
+    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->send_cq_handle);
+
     ring = (PvrdmaRing *)qp->opaque;
 
     wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring);
@@ -186,6 +191,7 @@ void pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
             continue;
         }
 
+        atomic_inc(&cq->missing_cqe);
         rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type,
                                (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
                                sgid_idx, sgid,
@@ -204,12 +210,15 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
     RdmaRmQP *qp;
     PvrdmaRqWqe *wqe;
     PvrdmaRing *ring;
+    RdmaRmCQ *cq;
 
     qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
     if (unlikely(!qp)) {
         return;
     }
 
+    cq = rdma_rm_get_cq(&dev->rdma_dev_res, qp->recv_cq_handle);
+
     ring = &((PvrdmaRing *)qp->opaque)[1];
 
     wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring);
@@ -231,6 +240,7 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
             continue;
         }
 
+        atomic_inc(&cq->missing_cqe);
         rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
                                &qp->backend_qp, qp->qp_type,
                                (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
@@ -245,11 +255,23 @@ void pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
 void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
 {
     RdmaRmCQ *cq;
+    int polled;
 
     cq = rdma_rm_get_cq(dev_res, cq_handle);
     if (!cq) {
         return;
     }
 
-    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
+    polled = rdma_backend_poll_cq(dev_res, &cq->backend_cq);
+    if (!polled) {
+        if (cq->conseq_empty_poll == MAX_CONSEQ_EMPTY_POLL_CQ) {
+            rdma_warn_report("%d consequtive empty polls from CQ %d, missing cqe %d",
+                             cq->conseq_empty_poll, cq_handle,
+                             atomic_read(&cq->missing_cqe));
+            cq->conseq_empty_poll = 0;
+        }
+        cq->conseq_empty_poll++;
+    } else {
+        cq->conseq_empty_poll = 0;
+    }
 }
To protect against CPU over utilization when guest performs unneeded
busy waiting loop on an empty CQ.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 hw/rdma/rdma_backend.c      | 11 +++++++----
 hw/rdma/rdma_backend.h      |  2 +-
 hw/rdma/rdma_rm.c           |  1 +
 hw/rdma/rdma_rm_defs.h      |  6 +++++-
 hw/rdma/vmw/pvrdma_qp_ops.c | 24 +++++++++++++++++++++++-
 5 files changed, 37 insertions(+), 7 deletions(-)
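For readers skimming the diff, the busy-poll detection it adds to pvrdma_cq_poll() reduces to the following pattern. This is a simplified sketch for a single CQ; `poll_cq_stub` and the fprintf call are stand-ins for rdma_backend_poll_cq() and rdma_warn_report().

```c
#include <stdio.h>

#define MAX_CONSEQ_EMPTY_POLL_CQ 2048   /* threshold from the patch */

static int conseq_empty_poll;

/* Placeholder: returns the number of CQEs drained from the CQ. */
static int poll_cq_stub(void)
{
    return 0;
}

static void cq_poll_sketch(void)
{
    int polled = poll_cq_stub();

    if (!polled) {
        if (conseq_empty_poll == MAX_CONSEQ_EMPTY_POLL_CQ) {
            /* Warn once per streak of 2048 empty polls, then restart
             * the count, so the log is rate-limited rather than flooded. */
            fprintf(stderr, "warning: %d consecutive empty CQ polls\n",
                    conseq_empty_poll);
            conseq_empty_poll = 0;
        }
        conseq_empty_poll++;
    } else {
        conseq_empty_poll = 0;  /* any real completion resets the streak */
    }
}
```

As the review notes, this version only warns about a guest busy-waiting on an empty CQ; it does not throttle the guest itself.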