
[PULL,v2,08/10] hw/rdma: PVRDMA commands and data-path ops

Message ID 20180219114332.70443-9-marcel@redhat.com (mailing list archive)
State New, archived

Commit Message

Marcel Apfelbaum Feb. 19, 2018, 11:43 a.m. UTC
From: Yuval Shaia <yuval.shaia@oracle.com>

First PVRDMA sub-module - implementation of the PVRDMA device.
- PVRDMA commands such as create CQ and create MR.
- Data path QP operations - post_send and post_recv.
- Completion handler.

Reviewed-by: Dotan Barak <dotanb@mellanox.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
---
 hw/rdma/Makefile.objs         |   2 +
 hw/rdma/vmw/pvrdma.h          | 122 ++++++++
 hw/rdma/vmw/pvrdma_cmd.c      | 673 ++++++++++++++++++++++++++++++++++++++++++
 hw/rdma/vmw/pvrdma_dev_ring.c | 155 ++++++++++
 hw/rdma/vmw/pvrdma_dev_ring.h |  42 +++
 hw/rdma/vmw/pvrdma_qp_ops.c   | 222 ++++++++++++++
 hw/rdma/vmw/pvrdma_qp_ops.h   |  27 ++
 7 files changed, 1243 insertions(+)
 create mode 100644 hw/rdma/vmw/pvrdma.h
 create mode 100644 hw/rdma/vmw/pvrdma_cmd.c
 create mode 100644 hw/rdma/vmw/pvrdma_dev_ring.c
 create mode 100644 hw/rdma/vmw/pvrdma_dev_ring.h
 create mode 100644 hw/rdma/vmw/pvrdma_qp_ops.c
 create mode 100644 hw/rdma/vmw/pvrdma_qp_ops.h

Comments

Peter Maydell April 27, 2018, 2:31 p.m. UTC | #1
On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
> From: Yuval Shaia <yuval.shaia@oracle.com>
>
> First PVRDMA sub-module - implementation of the PVRDMA device.
> - PVRDMA commands such as create CQ and create MR.
> - Data path QP operations - post_send and post_recv.
> - Completion handler.
>
> Reviewed-by: Dotan Barak <dotanb@mellanox.com>
> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>

Hi; Coverity points out an array bounds overrun in this code:


> +static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
> +                       union pvrdma_cmd_resp *rsp)
> +{
> +    struct pvrdma_cmd_create_bind *cmd = &req->create_bind;
> +#ifdef PVRDMA_DEBUG
> +    __be64 *subnet = (__be64 *)&cmd->new_gid[0];
> +    __be64 *if_id = (__be64 *)&cmd->new_gid[8];
> +#endif
> +
> +    pr_dbg("index=%d\n", cmd->index);
> +
> +    if (cmd->index > MAX_PORT_GIDS) {
> +        return -EINVAL;
> +    }

This bounds check allows cmd->index == MAX_PORT_GIDS...

> +
> +    pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
> +           (long long unsigned int)be64_to_cpu(*subnet),
> +           (long long unsigned int)be64_to_cpu(*if_id));
> +
> +    /* Driver forces to one port only */
> +    memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid,
> +           sizeof(cmd->new_gid));

...but the gid_tbl[] array we index into is declared with

    union ibv_gid gid_tbl[MAX_PORT_GIDS];

so using MAX_PORT_GIDS as an index is off the end of it.

Presumably the check should be ">=".

> +static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
> +                        union pvrdma_cmd_resp *rsp)
> +{
> +    struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;
> +
> +    pr_dbg("clear index %d\n", cmd->index);
> +
> +    memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
> +           sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));

I'm assuming this function can't be called unless create_bind()
has previously succeeded and so it doesn't need its own
bounds check.

> +
> +    return 0;
> +}

thanks
-- PMM
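
A minimal sketch of the bounds-check fix suggested above, assuming the
PVRDMA definitions shown in the patch (illustrative only, not necessarily
the exact change that was merged):

    /* gid_tbl[] has MAX_PORT_GIDS entries, valid indices 0..MAX_PORT_GIDS-1,
     * so index == MAX_PORT_GIDS must be rejected as well */
    if (cmd->index >= MAX_PORT_GIDS) {
        return -EINVAL;
    }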
Peter Maydell April 27, 2018, 2:43 p.m. UTC | #2
On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
> From: Yuval Shaia <yuval.shaia@oracle.com>
>
> First PVRDMA sub-module - implementation of the PVRDMA device.
> - PVRDMA commands such as create CQ and create MR.
> - Data path QP operations - post_send and post_recv.
> - Completion handler.

Coverity (CID1390589, CID1390608) points out more array
bounds overruns here:

> +
> +typedef struct PVRDMADev {
> +    PCIDevice parent_obj;
> +    MemoryRegion msix;
> +    MemoryRegion regs;
> +    uint32_t regs_data[RDMA_BAR1_REGS_SIZE];

regs_data is an array of size RDMA_BAR1_REGS_SIZE...

> +    MemoryRegion uar;
> +    uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
> +    DSRInfo dsr_info;
> +    int interrupt_mask;
> +    struct ibv_device_attr dev_attr;
> +    uint64_t node_guid;
> +    char *backend_device_name;
> +    uint8_t backend_gid_idx;
> +    uint8_t backend_port_num;
> +    RdmaBackendDev backend_dev;
> +    RdmaDeviceResources rdma_dev_res;
> +} PVRDMADev;
> +#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
> +
> +static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
> +{
> +    int idx = addr >> 2;
> +
> +    if (idx > RDMA_BAR1_REGS_SIZE) {
> +        return -EINVAL;
> +    }

...but the bounds check here is ">" rather than ">="
and allows idx == RDMA_BAR1_REGS_SIZE through...

> +
> +    *val = dev->regs_data[idx];

...and this will overrun the array.

> +
> +    return 0;
> +}
> +
> +static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
> +{
> +    int idx = addr >> 2;
> +
> +    if (idx > RDMA_BAR1_REGS_SIZE) {
> +        return -EINVAL;
> +    }
> +
> +    dev->regs_data[idx] = val;

Similarly here, where this is a write access.

Luckily this isn't an exploitable guest escape, because the only
call to set_reg_val() with a guest-controlled addr value is from
the write function of an MMIO MemoryRegion which is created with
a size of RDMA_BAR1_REGS_SIZE, so the guest can't get out of
range values into here.

Three times is a pattern -- you might like to check your
other bounds checks for off-by-one errors. Coverity doesn't
necessarily catch all of them.

thanks
-- PMM
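
A sketch of the register accessors with the ">=" comparison applied, under
the same assumptions (illustrative, not necessarily the exact upstream fix):

    static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
    {
        int idx = addr >> 2;

        /* the last valid index is RDMA_BAR1_REGS_SIZE - 1 */
        if (idx >= RDMA_BAR1_REGS_SIZE) {
            return -EINVAL;
        }

        *val = dev->regs_data[idx];

        return 0;
    }

    static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
    {
        int idx = addr >> 2;

        if (idx >= RDMA_BAR1_REGS_SIZE) {
            return -EINVAL;
        }

        dev->regs_data[idx] = val;

        return 0;
    }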
Peter Maydell April 27, 2018, 2:58 p.m. UTC | #3
On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
> From: Yuval Shaia <yuval.shaia@oracle.com>
>
> First PVRDMA sub-module - implementation of the PVRDMA device.
> - PVRDMA commands such as create CQ and create MR.
> - Data path QP operations - post_send and post_recv.
> - Completion handler.

> +void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
> +{
> +    RdmaRmCQ *cq;
> +
> +    cq = rdma_rm_get_cq(dev_res, cq_handle);
> +    if (!cq) {
> +        pr_dbg("Invalid CQ# %d\n", cq_handle);
> +    }
> +
> +    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
> +}

Coverity CID 1390586: we check for cq being NULL, but then
go ahead and use it anyway. If a NULL cq is a possible
situation we should handle it correctly (early return?
return an error value that the caller has to handle?
something else?); if it is not possible then we should
use assert rather than an if().

thanks
-- PMM
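
A minimal sketch of the early-return option mentioned above (the assert()
alternative would apply only if a NULL cq were truly impossible here):

    void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
    {
        RdmaRmCQ *cq;

        cq = rdma_rm_get_cq(dev_res, cq_handle);
        if (!cq) {
            pr_dbg("Invalid CQ# %d\n", cq_handle);
            return; /* do not dereference a NULL cq */
        }

        rdma_backend_poll_cq(dev_res, &cq->backend_cq);
    }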
Peter Maydell April 27, 2018, 3:01 p.m. UTC | #4
On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
> From: Yuval Shaia <yuval.shaia@oracle.com>
>
> First PVRDMA sub-module - implementation of the PVRDMA device.
> - PVRDMA commands such as create CQ and create MR.
> - Data path QP operations - post_send and post_recv.
> - Completion handler.

Coverity CID 1390620: we call munmap() on a NULL pointer.

> +static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
> +                     union pvrdma_cmd_resp *rsp)
> +{
> +    struct pvrdma_cmd_create_mr *cmd = &req->create_mr;
> +    struct pvrdma_cmd_create_mr_resp *resp = &rsp->create_mr_resp;
> +    PCIDevice *pci_dev = PCI_DEVICE(dev);
> +    void *host_virt = NULL;

Here we set host_virt to NULL...

> +
> +    memset(resp, 0, sizeof(*resp));
> +    resp->hdr.response = cmd->hdr.response;
> +    resp->hdr.ack = PVRDMA_CMD_CREATE_MR_RESP;
> +
> +    pr_dbg("pd_handle=%d\n", cmd->pd_handle);
> +    pr_dbg("access_flags=0x%x\n", cmd->access_flags);
> +    pr_dbg("flags=0x%x\n", cmd->flags);
> +
> +    if (!(cmd->flags & PVRDMA_MR_FLAG_DMA)) {

...and if we don't take this if() we won't set host_virt to anything...

> +        host_virt = pvrdma_map_to_pdir(pci_dev, cmd->pdir_dma, cmd->nchunks,
> +                                       cmd->length);
> +        if (!host_virt) {
> +            pr_dbg("Failed to map to pdir\n");
> +            resp->hdr.err = -EINVAL;
> +            goto out;
> +        }
> +    }
> +
> +    resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle,
> +                                     cmd->start, cmd->length, host_virt,
> +                                     cmd->access_flags, &resp->mr_handle,
> +                                     &resp->lkey, &resp->rkey);
> +    if (!resp->hdr.err) {
> +        munmap(host_virt, cmd->length);

...but here we call munmap() on it without checking if it is NULL.
Unlike g_free(), munmap() isn't specified to be "do nothing if
passed a NULL pointer".

> +    }
> +
> +out:
> +    pr_dbg("ret=%d\n", resp->hdr.err);
> +    return resp->hdr.err;
> +}

thanks
-- PMM
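
A sketch of a guard for the munmap() call (illustrative): host_virt is only
mapped on the non-PVRDMA_MR_FLAG_DMA path, so it should only be unmapped in
that case:

    resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle,
                                     cmd->start, cmd->length, host_virt,
                                     cmd->access_flags, &resp->mr_handle,
                                     &resp->lkey, &resp->rkey);
    /* host_virt stays NULL for DMA MRs, and munmap(NULL, ...) is not
     * specified to be a no-op, so check before unmapping */
    if (!resp->hdr.err && host_virt) {
        munmap(host_virt, cmd->length);
    }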
Marcel Apfelbaum April 27, 2018, 6:20 p.m. UTC | #5
Hi Peter,

On 27/04/2018 17:31, Peter Maydell wrote:
> On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
>> From: Yuval Shaia <yuval.shaia@oracle.com>
>>
>> First PVRDMA sub-module - implementation of the PVRDMA device.
>> - PVRDMA commands such as create CQ and create MR.
>> - Data path QP operations - post_send and post_recv.
>> - Completion handler.
>>
>> Reviewed-by: Dotan Barak <dotanb@mellanox.com>
>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
>> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
> 
> Hi; Coverity points out an array bounds overrun in this code:
> 
> 
>> +static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
>> +                       union pvrdma_cmd_resp *rsp)
>> +{
>> +    struct pvrdma_cmd_create_bind *cmd = &req->create_bind;
>> +#ifdef PVRDMA_DEBUG
>> +    __be64 *subnet = (__be64 *)&cmd->new_gid[0];
>> +    __be64 *if_id = (__be64 *)&cmd->new_gid[8];
>> +#endif
>> +
>> +    pr_dbg("index=%d\n", cmd->index);
>> +
>> +    if (cmd->index > MAX_PORT_GIDS) {
>> +        return -EINVAL;
>> +    }
> 
> This bounds check allows cmd->index == MAX_PORT_GIDS...
> 
>> +
>> +    pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
>> +           (long long unsigned int)be64_to_cpu(*subnet),
>> +           (long long unsigned int)be64_to_cpu(*if_id));
>> +
>> +    /* Driver forces to one port only */
>> +    memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid,
>> +           sizeof(cmd->new_gid));
> 
> ...but the gid_tbl[] array we index into is declared with
> 
>     union ibv_gid gid_tbl[MAX_PORT_GIDS];
> 
> so using MAX_PORT_GIDS as an index is off the end of it.
> 
> Presumably the check should be ">=".
> 

Right, thanks for finding it!

>> +static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
>> +                        union pvrdma_cmd_resp *rsp)
>> +{
>> +    struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;
>> +
>> +    pr_dbg("clear index %d\n", cmd->index);
>> +
>> +    memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
>> +           sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));
> 
> I'm assuming this function can't be called unless create_bind()
> has previously succeeded and so it doesn't need its own
> bounds check.
> 

The index is provided by the guest, so we should check it,
right Yuval?

I'll take care of it.
Thanks,
Marcel

>> +
>> +    return 0;
>> +}
> 
> thanks
> -- PMM
>
Marcel Apfelbaum April 27, 2018, 6:22 p.m. UTC | #6
On 27/04/2018 17:43, Peter Maydell wrote:
> On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
>> From: Yuval Shaia <yuval.shaia@oracle.com>
>>
>> First PVRDMA sub-module - implementation of the PVRDMA device.
>> - PVRDMA commands such as create CQ and create MR.
>> - Data path QP operations - post_send and post_recv.
>> - Completion handler.
> 
> Coverity (CID1390589, CID1390608) points out more array
> bounds overruns here:
> 
>> +
>> +typedef struct PVRDMADev {
>> +    PCIDevice parent_obj;
>> +    MemoryRegion msix;
>> +    MemoryRegion regs;
>> +    uint32_t regs_data[RDMA_BAR1_REGS_SIZE];
> 
> regs_data is an array of size RDMA_BAR1_REGS_SIZE...
> 
>> +    MemoryRegion uar;
>> +    uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
>> +    DSRInfo dsr_info;
>> +    int interrupt_mask;
>> +    struct ibv_device_attr dev_attr;
>> +    uint64_t node_guid;
>> +    char *backend_device_name;
>> +    uint8_t backend_gid_idx;
>> +    uint8_t backend_port_num;
>> +    RdmaBackendDev backend_dev;
>> +    RdmaDeviceResources rdma_dev_res;
>> +} PVRDMADev;
>> +#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
>> +
>> +static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
>> +{
>> +    int idx = addr >> 2;
>> +
>> +    if (idx > RDMA_BAR1_REGS_SIZE) {
>> +        return -EINVAL;
>> +    }
> 
> ...but the bounds check here is ">" rather than ">="
> and allows idx == RDMA_BAR1_REGS_SIZE through...
> 
>> +
>> +    *val = dev->regs_data[idx];
> 
> ...and this will overrun the array.
> 
>> +
>> +    return 0;
>> +}
>> +
>> +static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
>> +{
>> +    int idx = addr >> 2;
>> +
>> +    if (idx > RDMA_BAR1_REGS_SIZE) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    dev->regs_data[idx] = val;
> 
> Similarly here, where this is a write access.
> 
> Luckily this isn't an exploitable guest escape, because the only
> call to set_reg_val() with a guest-controlled addr value is from
> the read function of an MMIO MemoryRegion which is created with
> a size of RDMA_BAR1_REGS_SIZE, so the guest can't get out of
> range values into here.
> 
> Three times is a pattern -- you might like to check your
> other bounds checks for off-by-one errors. Coverity doesn't
> necessarily catch all of them.
> 

Agreed, I'll go over the code again.

Thanks,
Marcel


> thanks
> -- PMM
>
Marcel Apfelbaum April 27, 2018, 6:28 p.m. UTC | #7
On 27/04/2018 17:58, Peter Maydell wrote:
> On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
>> From: Yuval Shaia <yuval.shaia@oracle.com>
>>
>> First PVRDMA sub-module - implementation of the PVRDMA device.
>> - PVRDMA commands such as create CQ and create MR.
>> - Data path QP operations - post_send and post_recv.
>> - Completion handler.
> 
>> +void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
>> +{
>> +    RdmaRmCQ *cq;
>> +
>> +    cq = rdma_rm_get_cq(dev_res, cq_handle);
>> +    if (!cq) {
>> +        pr_dbg("Invalid CQ# %d\n", cq_handle);
>> +    }
>> +
>> +    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
>> +}
> 
> Coverity CID 1390586: we check for cq being NULL, but then
> go ahead and use it anyway. If a NULL cq is a possible
> situation we should handle it correctly (early return?
> return an error value that the caller has to handle?
> something else?); if it is not possible then we should
> use assert rather than an if().

The cq_handle is provided by the guest, the code assumes
a sane guest, which is not safe.

I will change the code to return early.

Thanks,
Marcel

> 
> thanks
> -- PMM
>
Marcel Apfelbaum April 27, 2018, 6:31 p.m. UTC | #8
On 27/04/2018 18:01, Peter Maydell wrote:
> On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
>> From: Yuval Shaia <yuval.shaia@oracle.com>
>>
>> First PVRDMA sub-module - implementation of the PVRDMA device.
>> - PVRDMA commands such as create CQ and create MR.
>> - Data path QP operations - post_send and post_recv.
>> - Completion handler.
> 
> Coverity CID 1390620: we call munmap() on a NULL pointer.
> 
>> +static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
>> +                     union pvrdma_cmd_resp *rsp)
>> +{
>> +    struct pvrdma_cmd_create_mr *cmd = &req->create_mr;
>> +    struct pvrdma_cmd_create_mr_resp *resp = &rsp->create_mr_resp;
>> +    PCIDevice *pci_dev = PCI_DEVICE(dev);
>> +    void *host_virt = NULL;
> 
> Here we set host_virt to NULL...
> 
>> +
>> +    memset(resp, 0, sizeof(*resp));
>> +    resp->hdr.response = cmd->hdr.response;
>> +    resp->hdr.ack = PVRDMA_CMD_CREATE_MR_RESP;
>> +
>> +    pr_dbg("pd_handle=%d\n", cmd->pd_handle);
>> +    pr_dbg("access_flags=0x%x\n", cmd->access_flags);
>> +    pr_dbg("flags=0x%x\n", cmd->flags);
>> +
>> +    if (!(cmd->flags & PVRDMA_MR_FLAG_DMA)) {
> 
> ...and if we don't take this if() we won't set host_virt to anything...
> 
>> +        host_virt = pvrdma_map_to_pdir(pci_dev, cmd->pdir_dma, cmd->nchunks,
>> +                                       cmd->length);
>> +        if (!host_virt) {
>> +            pr_dbg("Failed to map to pdir\n");
>> +            resp->hdr.err = -EINVAL;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle,
>> +                                     cmd->start, cmd->length, host_virt,
>> +                                     cmd->access_flags, &resp->mr_handle,
>> +                                     &resp->lkey, &resp->rkey);
>> +    if (!resp->hdr.err) {
>> +        munmap(host_virt, cmd->length);
> 
> ...but here we call munmap() on it without checking if it is NULL.
> Unlike g_free(), munmap() isn't specified to be "do nothing if
> passed a NULL pointer".

Will fix, thanks for finding it!
Marcel

> 
>> +    }
>> +
>> +out:
>> +    pr_dbg("ret=%d\n", resp->hdr.err);
>> +    return resp->hdr.err;
>> +}
> 
> thanks
> -- PMM
>
Yuval Shaia April 29, 2018, 7:42 a.m. UTC | #9
On Fri, Apr 27, 2018 at 09:20:44PM +0300, Marcel Apfelbaum wrote:
> Hi Peter,
> 
> On 27/04/2018 17:31, Peter Maydell wrote:
> > On 19 February 2018 at 11:43, Marcel Apfelbaum <marcel@redhat.com> wrote:
> >> From: Yuval Shaia <yuval.shaia@oracle.com>
> >>
> >> First PVRDMA sub-module - implementation of the PVRDMA device.
> >> - PVRDMA commands such as create CQ and create MR.
> >> - Data path QP operations - post_send and post_recv.
> >> - Completion handler.
> >>
> >> Reviewed-by: Dotan Barak <dotanb@mellanox.com>
> >> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> >> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> >> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
> > 
> > Hi; Coverity points out an array bounds overrun in this code:
> > 
> > 
> >> +static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
> >> +                       union pvrdma_cmd_resp *rsp)
> >> +{
> >> +    struct pvrdma_cmd_create_bind *cmd = &req->create_bind;
> >> +#ifdef PVRDMA_DEBUG
> >> +    __be64 *subnet = (__be64 *)&cmd->new_gid[0];
> >> +    __be64 *if_id = (__be64 *)&cmd->new_gid[8];
> >> +#endif
> >> +
> >> +    pr_dbg("index=%d\n", cmd->index);
> >> +
> >> +    if (cmd->index > MAX_PORT_GIDS) {
> >> +        return -EINVAL;
> >> +    }
> > 
> > This bounds check allows cmd->index == MAX_PORT_GIDS...
> > 
> >> +
> >> +    pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
> >> +           (long long unsigned int)be64_to_cpu(*subnet),
> >> +           (long long unsigned int)be64_to_cpu(*if_id));
> >> +
> >> +    /* Driver forces to one port only */
> >> +    memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid,
> >> +           sizeof(cmd->new_gid));
> > 
> > ...but the gid_tbl[] array we index into is declared with
> > 
> >     union ibv_gid gid_tbl[MAX_PORT_GIDS];
> > 
> > so using MAX_PORT_GIDS as an index is off the end of it.
> > 
> > Presumably the check should be ">=".
> > 
> 
> Right, thanks for finding it!
> 
> >> +static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
> >> +                        union pvrdma_cmd_resp *rsp)
> >> +{
> >> +    struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;
> >> +
> >> +    pr_dbg("clear index %d\n", cmd->index);
> >> +
> >> +    memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
> >> +           sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));
> > 
> > I'm assuming this function can't be called unless create_bind()
> > has previously succeeded and so it doesn't need its own
> > bounds check.
> > 
> 
> The index is provided by the guest, so we should check it,
> right Yuval?

Right.
The guest driver is considered trusted, but we don't want a faulty driver to
crash the entire VM.

> 
> I'll take care of it.
> Thanks,
> Marcel
> 
> >> +
> >> +    return 0;
> >> +}
> > 
> > thanks
> > -- PMM
> > 
>
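
Following that reasoning, destroy_bind() could validate the guest-supplied
index the same way create_bind() does -- a sketch, not necessarily the fix
that was eventually applied:

    static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
                            union pvrdma_cmd_resp *rsp)
    {
        struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;

        pr_dbg("clear index %d\n", cmd->index);

        /* cmd->index comes from the guest, so bound it before indexing */
        if (cmd->index >= MAX_PORT_GIDS) {
            return -EINVAL;
        }

        memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
               sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));

        return 0;
    }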
Peter Maydell June 20, 2023, 12:35 p.m. UTC | #10
On Mon, 19 Feb 2018 at 11:44, Marcel Apfelbaum <marcel@redhat.com> wrote:
>
> From: Yuval Shaia <yuval.shaia@oracle.com>
>
> First PVRDMA sub-module - implementation of the PVRDMA device.
> - PVRDMA commands such as create CQ and create MR.
> - Data path QP operations - post_send and post_recv.
> - Completion handler.
>
> Reviewed-by: Dotan Barak <dotanb@mellanox.com>
> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>

Hi; I know this is a 5 year old commit, but Coverity has just
noticed something odd in this code (CID 1507146):

> +static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req,
> +                      union pvrdma_cmd_resp *rsp)
> +{
> +    struct pvrdma_cmd_query_port *cmd = &req->query_port;
> +    struct pvrdma_cmd_query_port_resp *resp = &rsp->query_port_resp;
> +    struct pvrdma_port_attr attrs = {0};
> +
> +    pr_dbg("port=%d\n", cmd->port_num);
> +
> +    if (rdma_backend_query_port(&dev->backend_dev,
> +                                (struct ibv_port_attr *)&attrs)) {

rdma_backend_query_port() wants a pointer to a struct ibv_port_attr.
But instead of passing it one, we have a local struct
pvrdma_port_attr and then explicitly cast it to the
other type.

Unfortunately, ibv_port_attr is larger than pvrdma_port_attr
(50 bytes vs 48 bytes), so this could overrun the local
variable.

What's going on here, and what should the code be doing instead?

Given that we are just copying fields out of the structure,
and (other than the extra field at the end of ibv_port_attr)
the struct layout is identical, it looks to me that the simple
fix would be to make the local variable have the correct type
'struct ibv_port_attr' and drop the cast.

> +        return -ENOMEM;
> +    }
> +
> +    memset(resp, 0, sizeof(*resp));
> +    resp->hdr.response = cmd->hdr.response;
> +    resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
> +    resp->hdr.err = 0;
> +
> +    resp->attrs.state = attrs.state;
> +    resp->attrs.max_mtu = attrs.max_mtu;
> +    resp->attrs.active_mtu = attrs.active_mtu;
> +    resp->attrs.phys_state = attrs.phys_state;
> +    resp->attrs.gid_tbl_len = MIN(MAX_PORT_GIDS, attrs.gid_tbl_len);
> +    resp->attrs.max_msg_sz = 1024;
> +    resp->attrs.pkey_tbl_len = MIN(MAX_PORT_PKEYS, attrs.pkey_tbl_len);
> +    resp->attrs.active_width = 1;
> +    resp->attrs.active_speed = 1;
> +
> +    return 0;
> +}

thanks
-- PMM
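
A sketch of the simple fix outlined above -- make the local variable the
type the backend expects and drop the cast (the field copies into
resp->attrs stay as in the original):

    static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req,
                          union pvrdma_cmd_resp *rsp)
    {
        struct pvrdma_cmd_query_port *cmd = &req->query_port;
        struct pvrdma_cmd_query_port_resp *resp = &rsp->query_port_resp;
        struct ibv_port_attr attrs = {0}; /* backend's own type, no overrun */

        if (rdma_backend_query_port(&dev->backend_dev, &attrs)) {
            return -ENOMEM;
        }

        memset(resp, 0, sizeof(*resp));
        resp->hdr.response = cmd->hdr.response;
        resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
        resp->hdr.err = 0;

        /* ... copy attrs.state, attrs.max_mtu, etc. into resp->attrs as before */

        return 0;
    }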

Patch

diff --git a/hw/rdma/Makefile.objs b/hw/rdma/Makefile.objs
index 6a59bf0d5b..44a85f687d 100644
--- a/hw/rdma/Makefile.objs
+++ b/hw/rdma/Makefile.objs
@@ -1,3 +1,5 @@ 
 ifeq ($(CONFIG_RDMA),y)
 obj-$(CONFIG_PCI) += rdma_utils.o rdma_backend.o rdma_rm.o
+obj-$(CONFIG_PCI) += vmw/pvrdma_dev_ring.o vmw/pvrdma_cmd.o \
+                     vmw/pvrdma_qp_ops.o
 endif
diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
new file mode 100644
index 0000000000..b05f94a473
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma.h
@@ -0,0 +1,122 @@ 
+/*
+ * QEMU VMWARE paravirtual RDMA device definitions
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PVRDMA_PVRDMA_H
+#define PVRDMA_PVRDMA_H
+
+#include <hw/pci/pci.h>
+#include <hw/pci/msix.h>
+
+#include "../rdma_backend_defs.h"
+#include "../rdma_rm_defs.h"
+
+#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h>
+#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h>
+#include "pvrdma_dev_ring.h"
+
+/* BARs */
+#define RDMA_MSIX_BAR_IDX    0
+#define RDMA_REG_BAR_IDX     1
+#define RDMA_UAR_BAR_IDX     2
+#define RDMA_BAR0_MSIX_SIZE  (16 * 1024)
+#define RDMA_BAR1_REGS_SIZE  256
+#define RDMA_BAR2_UAR_SIZE   (0x1000 * MAX_UCS) /* each uc gets page */
+
+/* MSIX */
+#define RDMA_MAX_INTRS       3
+#define RDMA_MSIX_TABLE      0x0000
+#define RDMA_MSIX_PBA        0x2000
+
+/* Interrupts Vectors */
+#define INTR_VEC_CMD_RING            0
+#define INTR_VEC_CMD_ASYNC_EVENTS    1
+#define INTR_VEC_CMD_COMPLETION_Q    2
+
+/* HW attributes */
+#define PVRDMA_HW_NAME       "pvrdma"
+#define PVRDMA_HW_VERSION    17
+#define PVRDMA_FW_VERSION    14
+
+typedef struct DSRInfo {
+    dma_addr_t dma;
+    struct pvrdma_device_shared_region *dsr;
+
+    union pvrdma_cmd_req *req;
+    union pvrdma_cmd_resp *rsp;
+
+    struct pvrdma_ring *async_ring_state;
+    PvrdmaRing async;
+
+    struct pvrdma_ring *cq_ring_state;
+    PvrdmaRing cq;
+} DSRInfo;
+
+typedef struct PVRDMADev {
+    PCIDevice parent_obj;
+    MemoryRegion msix;
+    MemoryRegion regs;
+    uint32_t regs_data[RDMA_BAR1_REGS_SIZE];
+    MemoryRegion uar;
+    uint32_t uar_data[RDMA_BAR2_UAR_SIZE];
+    DSRInfo dsr_info;
+    int interrupt_mask;
+    struct ibv_device_attr dev_attr;
+    uint64_t node_guid;
+    char *backend_device_name;
+    uint8_t backend_gid_idx;
+    uint8_t backend_port_num;
+    RdmaBackendDev backend_dev;
+    RdmaDeviceResources rdma_dev_res;
+} PVRDMADev;
+#define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
+
+static inline int get_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t *val)
+{
+    int idx = addr >> 2;
+
+    if (idx > RDMA_BAR1_REGS_SIZE) {
+        return -EINVAL;
+    }
+
+    *val = dev->regs_data[idx];
+
+    return 0;
+}
+
+static inline int set_reg_val(PVRDMADev *dev, hwaddr addr, uint32_t val)
+{
+    int idx = addr >> 2;
+
+    if (idx > RDMA_BAR1_REGS_SIZE) {
+        return -EINVAL;
+    }
+
+    dev->regs_data[idx] = val;
+
+    return 0;
+}
+
+static inline void post_interrupt(PVRDMADev *dev, unsigned vector)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(dev);
+
+    if (likely(!dev->interrupt_mask)) {
+        msix_notify(pci_dev, vector);
+    }
+}
+
+int execute_command(PVRDMADev *dev);
+
+#endif
diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
new file mode 100644
index 0000000000..293dfed29f
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -0,0 +1,673 @@ 
+/*
+ * QEMU paravirtual RDMA - Command channel
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <qemu/osdep.h>
+#include <qemu/error-report.h>
+#include <cpu.h>
+#include <linux/types.h>
+#include "hw/hw.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+
+#include "../rdma_backend.h"
+#include "../rdma_rm.h"
+#include "../rdma_utils.h"
+
+#include "pvrdma.h"
+#include <standard-headers/rdma/vmw_pvrdma-abi.h>
+
+static void *pvrdma_map_to_pdir(PCIDevice *pdev, uint64_t pdir_dma,
+                                uint32_t nchunks, size_t length)
+{
+    uint64_t *dir, *tbl;
+    int tbl_idx, dir_idx, addr_idx;
+    void *host_virt = NULL, *curr_page;
+
+    if (!nchunks) {
+        pr_dbg("nchunks=0\n");
+        return NULL;
+    }
+
+    dir = rdma_pci_dma_map(pdev, pdir_dma, TARGET_PAGE_SIZE);
+    if (!dir) {
+        error_report("PVRDMA: Failed to map to page directory");
+        return NULL;
+    }
+
+    tbl = rdma_pci_dma_map(pdev, dir[0], TARGET_PAGE_SIZE);
+    if (!tbl) {
+        error_report("PVRDMA: Failed to map to page table 0");
+        goto out_unmap_dir;
+    }
+
+    curr_page = rdma_pci_dma_map(pdev, (dma_addr_t)tbl[0], TARGET_PAGE_SIZE);
+    if (!curr_page) {
+        error_report("PVRDMA: Failed to map the first page");
+        goto out_unmap_tbl;
+    }
+
+    host_virt = mremap(curr_page, 0, length, MREMAP_MAYMOVE);
+    if (host_virt == MAP_FAILED) {
+        host_virt = NULL;
+        error_report("PVRDMA: Failed to remap memory for host_virt");
+        goto out_unmap_tbl;
+    }
+
+    rdma_pci_dma_unmap(pdev, curr_page, TARGET_PAGE_SIZE);
+
+    pr_dbg("host_virt=%p\n", host_virt);
+
+    dir_idx = 0;
+    tbl_idx = 1;
+    addr_idx = 1;
+    while (addr_idx < nchunks) {
+        if ((tbl_idx == (TARGET_PAGE_SIZE / sizeof(uint64_t)))) {
+            tbl_idx = 0;
+            dir_idx++;
+            pr_dbg("Mapping to table %d\n", dir_idx);
+            rdma_pci_dma_unmap(pdev, tbl, TARGET_PAGE_SIZE);
+            tbl = rdma_pci_dma_map(pdev, dir[dir_idx], TARGET_PAGE_SIZE);
+            if (!tbl) {
+                error_report("PVRDMA: Failed to map to page table %d", dir_idx);
+                goto out_unmap_host_virt;
+            }
+        }
+
+        pr_dbg("guest_dma[%d]=0x%lx\n", addr_idx, tbl[tbl_idx]);
+
+        curr_page = rdma_pci_dma_map(pdev, (dma_addr_t)tbl[tbl_idx],
+                                     TARGET_PAGE_SIZE);
+        if (!curr_page) {
+            error_report("PVRDMA: Failed to map to page %d, dir %d", tbl_idx,
+                         dir_idx);
+            goto out_unmap_host_virt;
+        }
+
+        mremap(curr_page, 0, TARGET_PAGE_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED,
+               host_virt + TARGET_PAGE_SIZE * addr_idx);
+
+        rdma_pci_dma_unmap(pdev, curr_page, TARGET_PAGE_SIZE);
+
+        addr_idx++;
+
+        tbl_idx++;
+    }
+
+    goto out_unmap_tbl;
+
+out_unmap_host_virt:
+    munmap(host_virt, length);
+    host_virt = NULL;
+
+out_unmap_tbl:
+    rdma_pci_dma_unmap(pdev, tbl, TARGET_PAGE_SIZE);
+
+out_unmap_dir:
+    rdma_pci_dma_unmap(pdev, dir, TARGET_PAGE_SIZE);
+
+    return host_virt;
+}
+
+static int query_port(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_query_port *cmd = &req->query_port;
+    struct pvrdma_cmd_query_port_resp *resp = &rsp->query_port_resp;
+    struct pvrdma_port_attr attrs = {0};
+
+    pr_dbg("port=%d\n", cmd->port_num);
+
+    if (rdma_backend_query_port(&dev->backend_dev,
+                                (struct ibv_port_attr *)&attrs)) {
+        return -ENOMEM;
+    }
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
+    resp->hdr.err = 0;
+
+    resp->attrs.state = attrs.state;
+    resp->attrs.max_mtu = attrs.max_mtu;
+    resp->attrs.active_mtu = attrs.active_mtu;
+    resp->attrs.phys_state = attrs.phys_state;
+    resp->attrs.gid_tbl_len = MIN(MAX_PORT_GIDS, attrs.gid_tbl_len);
+    resp->attrs.max_msg_sz = 1024;
+    resp->attrs.pkey_tbl_len = MIN(MAX_PORT_PKEYS, attrs.pkey_tbl_len);
+    resp->attrs.active_width = 1;
+    resp->attrs.active_speed = 1;
+
+    return 0;
+}
+
+static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_query_pkey *cmd = &req->query_pkey;
+    struct pvrdma_cmd_query_pkey_resp *resp = &rsp->query_pkey_resp;
+
+    pr_dbg("port=%d\n", cmd->port_num);
+    pr_dbg("index=%d\n", cmd->index);
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_QUERY_PKEY_RESP;
+    resp->hdr.err = 0;
+
+    resp->pkey = 0x7FFF;
+    pr_dbg("pkey=0x%x\n", resp->pkey);
+
+    return 0;
+}
+
+static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_pd *cmd = &req->create_pd;
+    struct pvrdma_cmd_create_pd_resp *resp = &rsp->create_pd_resp;
+
+    pr_dbg("context=0x%x\n", cmd->ctx_handle ? cmd->ctx_handle : 0);
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_CREATE_PD_RESP;
+    resp->hdr.err = rdma_rm_alloc_pd(&dev->rdma_dev_res, &dev->backend_dev,
+                                     &resp->pd_handle, cmd->ctx_handle);
+
+    pr_dbg("ret=%d\n", resp->hdr.err);
+    return resp->hdr.err;
+}
+
+static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_pd *cmd = &req->destroy_pd;
+
+    pr_dbg("pd_handle=%d\n", cmd->pd_handle);
+
+    rdma_rm_dealloc_pd(&dev->rdma_dev_res, cmd->pd_handle);
+
+    return 0;
+}
+
+static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_mr *cmd = &req->create_mr;
+    struct pvrdma_cmd_create_mr_resp *resp = &rsp->create_mr_resp;
+    PCIDevice *pci_dev = PCI_DEVICE(dev);
+    void *host_virt = NULL;
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_CREATE_MR_RESP;
+
+    pr_dbg("pd_handle=%d\n", cmd->pd_handle);
+    pr_dbg("access_flags=0x%x\n", cmd->access_flags);
+    pr_dbg("flags=0x%x\n", cmd->flags);
+
+    if (!(cmd->flags & PVRDMA_MR_FLAG_DMA)) {
+        host_virt = pvrdma_map_to_pdir(pci_dev, cmd->pdir_dma, cmd->nchunks,
+                                       cmd->length);
+        if (!host_virt) {
+            pr_dbg("Failed to map to pdir\n");
+            resp->hdr.err = -EINVAL;
+            goto out;
+        }
+    }
+
+    resp->hdr.err = rdma_rm_alloc_mr(&dev->rdma_dev_res, cmd->pd_handle,
+                                     cmd->start, cmd->length, host_virt,
+                                     cmd->access_flags, &resp->mr_handle,
+                                     &resp->lkey, &resp->rkey);
+    if (!resp->hdr.err) {
+        munmap(host_virt, cmd->length);
+    }
+
+out:
+    pr_dbg("ret=%d\n", resp->hdr.err);
+    return resp->hdr.err;
+}
+
+static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_mr *cmd = &req->destroy_mr;
+
+    pr_dbg("mr_handle=%d\n", cmd->mr_handle);
+
+    rdma_rm_dealloc_mr(&dev->rdma_dev_res, cmd->mr_handle);
+
+    return 0;
+}
+
+static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
+                          uint64_t pdir_dma, uint32_t nchunks, uint32_t cqe)
+{
+    uint64_t *dir = NULL, *tbl = NULL;
+    PvrdmaRing *r;
+    int rc = -EINVAL;
+    char ring_name[MAX_RING_NAME_SZ];
+
+    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
+    dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
+    if (!dir) {
+        pr_dbg("Failed to map to CQ page directory\n");
+        goto out;
+    }
+
+    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
+    if (!tbl) {
+        pr_dbg("Failed to map to CQ page table\n");
+        goto out;
+    }
+
+    r = g_malloc(sizeof(*r));
+    *ring = r;
+
+    r->ring_state = (struct pvrdma_ring *)
+        rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
+
+    if (!r->ring_state) {
+        pr_dbg("Failed to map to CQ ring state\n");
+        goto out_free_ring;
+    }
+
+    sprintf(ring_name, "cq_ring_%lx", pdir_dma);
+    rc = pvrdma_ring_init(r, ring_name, pci_dev, &r->ring_state[1],
+                          cqe, sizeof(struct pvrdma_cqe),
+                          /* first page is ring state */
+                          (dma_addr_t *)&tbl[1], nchunks - 1);
+    if (rc) {
+        goto out_unmap_ring_state;
+    }
+
+    goto out;
+
+out_unmap_ring_state:
+    /* ring_state was in slot 1, not 0 so need to jump back */
+    rdma_pci_dma_unmap(pci_dev, --r->ring_state, TARGET_PAGE_SIZE);
+
+out_free_ring:
+    g_free(r);
+
+out:
+    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);
+    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);
+
+    return rc;
+}
+
+static int create_cq(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_cq *cmd = &req->create_cq;
+    struct pvrdma_cmd_create_cq_resp *resp = &rsp->create_cq_resp;
+    PvrdmaRing *ring = NULL;
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_CREATE_CQ_RESP;
+
+    resp->cqe = cmd->cqe;
+
+    resp->hdr.err = create_cq_ring(PCI_DEVICE(dev), &ring, cmd->pdir_dma,
+                                   cmd->nchunks, cmd->cqe);
+    if (resp->hdr.err) {
+        goto out;
+    }
+
+    pr_dbg("ring=%p\n", ring);
+
+    resp->hdr.err = rdma_rm_alloc_cq(&dev->rdma_dev_res, &dev->backend_dev,
+                                     cmd->cqe, &resp->cq_handle, ring);
+    resp->cqe = cmd->cqe;
+
+out:
+    pr_dbg("ret=%d\n", resp->hdr.err);
+    return resp->hdr.err;
+}
+
+static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_cq *cmd = &req->destroy_cq;
+    RdmaRmCQ *cq;
+    PvrdmaRing *ring;
+
+    pr_dbg("cq_handle=%d\n", cmd->cq_handle);
+
+    cq = rdma_rm_get_cq(&dev->rdma_dev_res, cmd->cq_handle);
+    if (!cq) {
+        pr_dbg("Invalid CQ handle\n");
+        return -EINVAL;
+    }
+
+    ring = (PvrdmaRing *)cq->opaque;
+    pvrdma_ring_free(ring);
+    /* ring_state was in slot 1, not 0 so need to jump back */
+    rdma_pci_dma_unmap(PCI_DEVICE(dev), --ring->ring_state, TARGET_PAGE_SIZE);
+    g_free(ring);
+
+    rdma_rm_dealloc_cq(&dev->rdma_dev_res, cmd->cq_handle);
+
+    return 0;
+}
+
+static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
+                           PvrdmaRing **rings, uint32_t scqe, uint32_t smax_sge,
+                           uint32_t spages, uint32_t rcqe, uint32_t rmax_sge,
+                           uint32_t rpages)
+{
+    uint64_t *dir = NULL, *tbl = NULL;
+    PvrdmaRing *sr, *rr;
+    int rc = -EINVAL;
+    char ring_name[MAX_RING_NAME_SZ];
+    uint32_t wqe_sz;
+
+    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)pdir_dma);
+    dir = rdma_pci_dma_map(pci_dev, pdir_dma, TARGET_PAGE_SIZE);
+    if (!dir) {
+        pr_dbg("Failed to map to CQ page directory\n");
+        goto out;
+    }
+
+    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
+    if (!tbl) {
+        pr_dbg("Failed to map to CQ page table\n");
+        goto out;
+    }
+
+    sr = g_malloc(2 * sizeof(*rr));
+    rr = &sr[1];
+    pr_dbg("sring=%p\n", sr);
+    pr_dbg("rring=%p\n", rr);
+
+    *rings = sr;
+
+    pr_dbg("scqe=%d\n", scqe);
+    pr_dbg("smax_sge=%d\n", smax_sge);
+    pr_dbg("spages=%d\n", spages);
+    pr_dbg("rcqe=%d\n", rcqe);
+    pr_dbg("rmax_sge=%d\n", rmax_sge);
+    pr_dbg("rpages=%d\n", rpages);
+
+    /* Create send ring */
+    sr->ring_state = (struct pvrdma_ring *)
+        rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
+    if (!sr->ring_state) {
+        pr_dbg("Failed to map to CQ ring state\n");
+        goto out_free_sr_mem;
+    }
+
+    wqe_sz = pow2ceil(sizeof(struct pvrdma_sq_wqe_hdr) +
+                      sizeof(struct pvrdma_sge) * smax_sge - 1);
+
+    sprintf(ring_name, "qp_sring_%lx", pdir_dma);
+    rc = pvrdma_ring_init(sr, ring_name, pci_dev, sr->ring_state,
+                          scqe, wqe_sz, (dma_addr_t *)&tbl[1], spages);
+    if (rc) {
+        goto out_unmap_ring_state;
+    }
+
+    /* Create recv ring */
+    rr->ring_state = &sr->ring_state[1];
+    wqe_sz = pow2ceil(sizeof(struct pvrdma_rq_wqe_hdr) +
+                      sizeof(struct pvrdma_sge) * rmax_sge - 1);
+    sprintf(ring_name, "qp_rring_%lx", pdir_dma);
+    rc = pvrdma_ring_init(rr, ring_name, pci_dev, rr->ring_state,
+                          rcqe, wqe_sz, (dma_addr_t *)&tbl[1 + spages], rpages);
+    if (rc) {
+        goto out_free_sr;
+    }
+
+    goto out;
+
+out_free_sr:
+    pvrdma_ring_free(sr);
+
+out_unmap_ring_state:
+    rdma_pci_dma_unmap(pci_dev, sr->ring_state, TARGET_PAGE_SIZE);
+
+out_free_sr_mem:
+    g_free(sr);
+
+out:
+    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);
+    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);
+
+    return rc;
+}
+
+static int create_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_qp *cmd = &req->create_qp;
+    struct pvrdma_cmd_create_qp_resp *resp = &rsp->create_qp_resp;
+    PvrdmaRing *rings = NULL;
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_CREATE_QP_RESP;
+
+    pr_dbg("total_chunks=%d\n", cmd->total_chunks);
+    pr_dbg("send_chunks=%d\n", cmd->send_chunks);
+
+    resp->hdr.err = create_qp_rings(PCI_DEVICE(dev), cmd->pdir_dma, &rings,
+                                    cmd->max_send_wr, cmd->max_send_sge,
+                                    cmd->send_chunks, cmd->max_recv_wr,
+                                    cmd->max_recv_sge, cmd->total_chunks -
+                                    cmd->send_chunks - 1);
+    if (resp->hdr.err) {
+        goto out;
+    }
+
+    pr_dbg("rings=%p\n", rings);
+
+    resp->hdr.err = rdma_rm_alloc_qp(&dev->rdma_dev_res, cmd->pd_handle,
+                                     cmd->qp_type, cmd->max_send_wr,
+                                     cmd->max_send_sge, cmd->send_cq_handle,
+                                     cmd->max_recv_wr, cmd->max_recv_sge,
+                                     cmd->recv_cq_handle, rings, &resp->qpn);
+
+    resp->max_send_wr = cmd->max_send_wr;
+    resp->max_recv_wr = cmd->max_recv_wr;
+    resp->max_send_sge = cmd->max_send_sge;
+    resp->max_recv_sge = cmd->max_recv_sge;
+    resp->max_inline_data = cmd->max_inline_data;
+
+out:
+    pr_dbg("ret=%d\n", resp->hdr.err);
+    return resp->hdr.err;
+}
+
+static int modify_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_modify_qp *cmd = &req->modify_qp;
+
+    pr_dbg("qp_handle=%d\n", cmd->qp_handle);
+
+    memset(rsp, 0, sizeof(*rsp));
+    rsp->hdr.response = cmd->hdr.response;
+    rsp->hdr.ack = PVRDMA_CMD_MODIFY_QP_RESP;
+
+    rsp->hdr.err = rdma_rm_modify_qp(&dev->rdma_dev_res, &dev->backend_dev,
+                                 cmd->qp_handle, cmd->attr_mask,
+                                 (union ibv_gid *)&cmd->attrs.ah_attr.grh.dgid,
+                                 cmd->attrs.dest_qp_num, cmd->attrs.qp_state,
+                                 cmd->attrs.qkey, cmd->attrs.rq_psn,
+                                 cmd->attrs.sq_psn);
+
+    pr_dbg("ret=%d\n", rsp->hdr.err);
+    return rsp->hdr.err;
+}
+
+static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_qp *cmd = &req->destroy_qp;
+    RdmaRmQP *qp;
+    PvrdmaRing *ring;
+
+    qp = rdma_rm_get_qp(&dev->rdma_dev_res, cmd->qp_handle);
+    if (!qp) {
+        pr_dbg("Invalid QP handle\n");
+        return -EINVAL;
+    }
+
+    rdma_rm_dealloc_qp(&dev->rdma_dev_res, cmd->qp_handle);
+
+    ring = (PvrdmaRing *)qp->opaque;
+    pr_dbg("sring=%p\n", &ring[0]);
+    pvrdma_ring_free(&ring[0]);
+    pr_dbg("rring=%p\n", &ring[1]);
+    pvrdma_ring_free(&ring[1]);
+
+    rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE);
+    g_free(ring);
+
+    return 0;
+}
+
+static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                       union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_bind *cmd = &req->create_bind;
+#ifdef PVRDMA_DEBUG
+    __be64 *subnet = (__be64 *)&cmd->new_gid[0];
+    __be64 *if_id = (__be64 *)&cmd->new_gid[8];
+#endif
+
+    pr_dbg("index=%d\n", cmd->index);
+
+    if (cmd->index > MAX_PORT_GIDS) {
+        return -EINVAL;
+    }
+
+    pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
+           (long long unsigned int)be64_to_cpu(*subnet),
+           (long long unsigned int)be64_to_cpu(*if_id));
+
+    /* Driver forces to one port only */
+    memcpy(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, &cmd->new_gid,
+           sizeof(cmd->new_gid));
+
+    /* TODO: Since drivers stores node_guid at load_dsr phase then this
+     * assignment is not relevant, i need to figure out a way how to
+     * retrieve MAC of our netdev */
+    dev->node_guid = dev->rdma_dev_res.ports[0].gid_tbl[0].global.interface_id;
+    pr_dbg("dev->node_guid=0x%llx\n",
+           (long long unsigned int)be64_to_cpu(dev->node_guid));
+
+    return 0;
+}
+
+static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                        union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_bind *cmd = &req->destroy_bind;
+
+    pr_dbg("clear index %d\n", cmd->index);
+
+    memset(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw, 0,
+           sizeof(dev->rdma_dev_res.ports[0].gid_tbl[cmd->index].raw));
+
+    return 0;
+}
+
+static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                     union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_create_uc *cmd = &req->create_uc;
+    struct pvrdma_cmd_create_uc_resp *resp = &rsp->create_uc_resp;
+
+    pr_dbg("pfn=%d\n", cmd->pfn);
+
+    memset(resp, 0, sizeof(*resp));
+    resp->hdr.response = cmd->hdr.response;
+    resp->hdr.ack = PVRDMA_CMD_CREATE_UC_RESP;
+    resp->hdr.err = rdma_rm_alloc_uc(&dev->rdma_dev_res, cmd->pfn,
+                                     &resp->ctx_handle);
+
+    pr_dbg("ret=%d\n", resp->hdr.err);
+
+    return 0;
+}
+
+static int destroy_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
+                      union pvrdma_cmd_resp *rsp)
+{
+    struct pvrdma_cmd_destroy_uc *cmd = &req->destroy_uc;
+
+    pr_dbg("ctx_handle=%d\n", cmd->ctx_handle);
+
+    rdma_rm_dealloc_uc(&dev->rdma_dev_res, cmd->ctx_handle);
+
+    return 0;
+}
+struct cmd_handler {
+    uint32_t cmd;
+    int (*exec)(PVRDMADev *dev, union pvrdma_cmd_req *req,
+            union pvrdma_cmd_resp *rsp);
+};
+
+static struct cmd_handler cmd_handlers[] = {
+    {PVRDMA_CMD_QUERY_PORT, query_port},
+    {PVRDMA_CMD_QUERY_PKEY, query_pkey},
+    {PVRDMA_CMD_CREATE_PD, create_pd},
+    {PVRDMA_CMD_DESTROY_PD, destroy_pd},
+    {PVRDMA_CMD_CREATE_MR, create_mr},
+    {PVRDMA_CMD_DESTROY_MR, destroy_mr},
+    {PVRDMA_CMD_CREATE_CQ, create_cq},
+    {PVRDMA_CMD_RESIZE_CQ, NULL},
+    {PVRDMA_CMD_DESTROY_CQ, destroy_cq},
+    {PVRDMA_CMD_CREATE_QP, create_qp},
+    {PVRDMA_CMD_MODIFY_QP, modify_qp},
+    {PVRDMA_CMD_QUERY_QP, NULL},
+    {PVRDMA_CMD_DESTROY_QP, destroy_qp},
+    {PVRDMA_CMD_CREATE_UC, create_uc},
+    {PVRDMA_CMD_DESTROY_UC, destroy_uc},
+    {PVRDMA_CMD_CREATE_BIND, create_bind},
+    {PVRDMA_CMD_DESTROY_BIND, destroy_bind},
+};
+
+int execute_command(PVRDMADev *dev)
+{
+    int err = 0xFFFF;
+    DSRInfo *dsr_info;
+
+    dsr_info = &dev->dsr_info;
+
+    pr_dbg("cmd=%d\n", dsr_info->req->hdr.cmd);
+    if (dsr_info->req->hdr.cmd >= sizeof(cmd_handlers) /
+                      sizeof(struct cmd_handler)) {
+        pr_dbg("Unsupported command\n");
+        goto out;
+    }
+
+    if (!cmd_handlers[dsr_info->req->hdr.cmd].exec) {
+        pr_dbg("Unsupported command (not implemented yet)\n");
+        goto out;
+    }
+
+    err = cmd_handlers[dsr_info->req->hdr.cmd].exec(dev, dsr_info->req,
+                            dsr_info->rsp);
+out:
+    set_reg_val(dev, PVRDMA_REG_ERR, err);
+    post_interrupt(dev, INTR_VEC_CMD_RING);
+
+    return (err == 0) ? 0 : -EINVAL;
+}
diff --git a/hw/rdma/vmw/pvrdma_dev_ring.c b/hw/rdma/vmw/pvrdma_dev_ring.c
new file mode 100644
index 0000000000..ec309dad55
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma_dev_ring.c
@@ -0,0 +1,155 @@ 
+/*
+ * QEMU paravirtual RDMA - Device rings
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <qemu/osdep.h>
+#include <hw/pci/pci.h>
+#include <cpu.h>
+
+#include "../rdma_utils.h"
+#include <standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h>
+#include "pvrdma_dev_ring.h"
+
+int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
+                     struct pvrdma_ring *ring_state, uint32_t max_elems,
+                     size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages)
+{
+    int i;
+    int rc = 0;
+
+    strncpy(ring->name, name, MAX_RING_NAME_SZ);
+    ring->name[MAX_RING_NAME_SZ - 1] = 0;
+    pr_dbg("Initializing %s ring\n", ring->name);
+    ring->dev = dev;
+    ring->ring_state = ring_state;
+    ring->max_elems = max_elems;
+    ring->elem_sz = elem_sz;
+    pr_dbg("ring->elem_sz=%ld\n", ring->elem_sz);
+    pr_dbg("npages=%ld\n", npages);
+    /* TODO: Give a moment to think if we want to redo driver settings
+    atomic_set(&ring->ring_state->prod_tail, 0);
+    atomic_set(&ring->ring_state->cons_head, 0);
+    */
+    ring->npages = npages;
+    ring->pages = g_malloc(npages * sizeof(void *));
+
+    for (i = 0; i < npages; i++) {
+        if (!tbl[i]) {
+            pr_err("npages=%ld but tbl[%d] is NULL\n", (long)npages, i);
+            continue;
+        }
+
+        ring->pages[i] = rdma_pci_dma_map(dev, tbl[i], TARGET_PAGE_SIZE);
+        if (!ring->pages[i]) {
+            rc = -ENOMEM;
+            pr_dbg("Failed to map to page %d\n", i);
+            goto out_free;
+        }
+        memset(ring->pages[i], 0, TARGET_PAGE_SIZE);
+    }
+
+    goto out;
+
+out_free:
+    while (i--) {
+        rdma_pci_dma_unmap(dev, ring->pages[i], TARGET_PAGE_SIZE);
+    }
+    g_free(ring->pages);
+
+out:
+    return rc;
+}
+
+void *pvrdma_ring_next_elem_read(PvrdmaRing *ring)
+{
+    unsigned int idx = 0, offset;
+
+    /*
+    pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
+           ring->ring_state->cons_head);
+    */
+
+    if (!pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx)) {
+        pr_dbg("No more data in ring\n");
+        return NULL;
+    }
+
+    offset = idx * ring->elem_sz;
+    /*
+    pr_dbg("idx=%d\n", idx);
+    pr_dbg("offset=%d\n", offset);
+    */
+    return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);
+}
+
+void pvrdma_ring_read_inc(PvrdmaRing *ring)
+{
+    pvrdma_idx_ring_inc(&ring->ring_state->cons_head, ring->max_elems);
+    /*
+    pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name,
+           ring->ring_state->prod_tail, ring->ring_state->cons_head,
+           ring->max_elems);
+    */
+}
+
+void *pvrdma_ring_next_elem_write(PvrdmaRing *ring)
+{
+    unsigned int idx, offset, tail;
+
+    /*
+    pr_dbg("%s: t=%d, h=%d\n", ring->name, ring->ring_state->prod_tail,
+           ring->ring_state->cons_head);
+    */
+
+    if (!pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail)) {
+        pr_dbg("CQ is full\n");
+        return NULL;
+    }
+
+    idx = pvrdma_idx(&ring->ring_state->prod_tail, ring->max_elems);
+    /* TODO: tail == idx */
+
+    offset = idx * ring->elem_sz;
+    return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE);
+}
+
+void pvrdma_ring_write_inc(PvrdmaRing *ring)
+{
+    pvrdma_idx_ring_inc(&ring->ring_state->prod_tail, ring->max_elems);
+    /*
+    pr_dbg("%s: t=%d, h=%d, m=%ld\n", ring->name,
+           ring->ring_state->prod_tail, ring->ring_state->cons_head,
+           ring->max_elems);
+    */
+}
+
+void pvrdma_ring_free(PvrdmaRing *ring)
+{
+    if (!ring) {
+        return;
+    }
+
+    if (!ring->pages) {
+        return;
+    }
+
+    pr_dbg("ring->npages=%d\n", ring->npages);
+    while (ring->npages--) {
+        rdma_pci_dma_unmap(ring->dev, ring->pages[ring->npages],
+                           TARGET_PAGE_SIZE);
+    }
+
+    g_free(ring->pages);
+    ring->pages = NULL;
+}
diff --git a/hw/rdma/vmw/pvrdma_dev_ring.h b/hw/rdma/vmw/pvrdma_dev_ring.h
new file mode 100644
index 0000000000..02a590b86d
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma_dev_ring.h
@@ -0,0 +1,42 @@ 
+/*
+ * QEMU VMWARE paravirtual RDMA ring utilities
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PVRDMA_DEV_RING_H
+#define PVRDMA_DEV_RING_H
+
+#include <qemu/typedefs.h>
+
+#define MAX_RING_NAME_SZ 32
+
+typedef struct PvrdmaRing {
+    char name[MAX_RING_NAME_SZ];
+    PCIDevice *dev;
+    uint32_t max_elems;
+    size_t elem_sz;
+    struct pvrdma_ring *ring_state; /* used only for unmap */
+    int npages;
+    void **pages;
+} PvrdmaRing;
+
+int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev,
+                     struct pvrdma_ring *ring_state, uint32_t max_elems,
+                     size_t elem_sz, dma_addr_t *tbl, dma_addr_t npages);
+void *pvrdma_ring_next_elem_read(PvrdmaRing *ring);
+void pvrdma_ring_read_inc(PvrdmaRing *ring);
+void *pvrdma_ring_next_elem_write(PvrdmaRing *ring);
+void pvrdma_ring_write_inc(PvrdmaRing *ring);
+void pvrdma_ring_free(PvrdmaRing *ring);
+
+#endif
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
new file mode 100644
index 0000000000..f0a1f9eb02
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -0,0 +1,222 @@ 
+/*
+ * QEMU paravirtual RDMA - QP implementation
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <qemu/osdep.h>
+
+#include "../rdma_utils.h"
+#include "../rdma_rm.h"
+#include "../rdma_backend.h"
+
+#include "pvrdma.h"
+#include <standard-headers/rdma/vmw_pvrdma-abi.h>
+#include "pvrdma_qp_ops.h"
+
+typedef struct CompHandlerCtx {
+    PVRDMADev *dev;
+    uint32_t cq_handle;
+    struct pvrdma_cqe cqe;
+} CompHandlerCtx;
+
+/* Send Queue WQE */
+typedef struct PvrdmaSqWqe {
+    struct pvrdma_sq_wqe_hdr hdr;
+    struct pvrdma_sge sge[0];
+} PvrdmaSqWqe;
+
+/* Recv Queue WQE */
+typedef struct PvrdmaRqWqe {
+    struct pvrdma_rq_wqe_hdr hdr;
+    struct pvrdma_sge sge[0];
+} PvrdmaRqWqe;
+
+/*
+ * 1. Put CQE on send CQ ring
+ * 2. Put CQ number on dsr completion ring
+ * 3. Interrupt host
+ */
+static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t cq_handle,
+                           struct pvrdma_cqe *cqe)
+{
+    struct pvrdma_cqe *cqe1;
+    struct pvrdma_cqne *cqne;
+    PvrdmaRing *ring;
+    RdmaRmCQ *cq = rdma_rm_get_cq(&dev->rdma_dev_res, cq_handle);
+
+    if (unlikely(!cq)) {
+        pr_dbg("Invalid cqn %d\n", cq_handle);
+        return -EINVAL;
+    }
+
+    ring = (PvrdmaRing *)cq->opaque;
+    pr_dbg("ring=%p\n", ring);
+
+    /* Step #1: Put CQE on CQ ring */
+    pr_dbg("Writing CQE\n");
+    cqe1 = pvrdma_ring_next_elem_write(ring);
+    if (unlikely(!cqe1)) {
+        return -EINVAL;
+    }
+
+    cqe1->wr_id = cqe->wr_id;
+    cqe1->qp = cqe->qp;
+    cqe1->opcode = cqe->opcode;
+    cqe1->status = cqe->status;
+    cqe1->vendor_err = cqe->vendor_err;
+
+    pvrdma_ring_write_inc(ring);
+
+    /* Step #2: Put CQ number on dsr completion ring */
+    pr_dbg("Writing CQNE\n");
+    cqne = pvrdma_ring_next_elem_write(&dev->dsr_info.cq);
+    if (unlikely(!cqne)) {
+        return -EINVAL;
+    }
+
+    cqne->info = cq_handle;
+    pvrdma_ring_write_inc(&dev->dsr_info.cq);
+
+    pr_dbg("cq->notify=%d\n", cq->notify);
+    if (cq->notify) {
+        cq->notify = false;
+        post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
+    }
+
+    return 0;
+}
+
+static void pvrdma_qp_ops_comp_handler(int status, unsigned int vendor_err,
+                                       void *ctx)
+{
+    CompHandlerCtx *comp_ctx = (CompHandlerCtx *)ctx;
+
+    pr_dbg("cq_handle=%d\n", comp_ctx->cq_handle);
+    pr_dbg("wr_id=%ld\n", comp_ctx->cqe.wr_id);
+    pr_dbg("status=%d\n", status);
+    pr_dbg("vendor_err=0x%x\n", vendor_err);
+    comp_ctx->cqe.status = status;
+    comp_ctx->cqe.vendor_err = vendor_err;
+    pvrdma_post_cqe(comp_ctx->dev, comp_ctx->cq_handle, &comp_ctx->cqe);
+    g_free(ctx);
+}
+
+void pvrdma_qp_ops_fini(void)
+{
+    rdma_backend_unregister_comp_handler();
+}
+
+int pvrdma_qp_ops_init(void)
+{
+    rdma_backend_register_comp_handler(pvrdma_qp_ops_comp_handler);
+
+    return 0;
+}
+
+int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
+{
+    RdmaRmQP *qp;
+    PvrdmaSqWqe *wqe;
+    PvrdmaRing *ring;
+
+    pr_dbg("qp_handle=%d\n", qp_handle);
+
+    qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
+    if (unlikely(!qp)) {
+        return -EINVAL;
+    }
+
+    ring = (PvrdmaRing *)qp->opaque;
+    pr_dbg("sring=%p\n", ring);
+
+    wqe = (struct PvrdmaSqWqe *)pvrdma_ring_next_elem_read(ring);
+    while (wqe) {
+        CompHandlerCtx *comp_ctx;
+
+        pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id);
+
+        /* Prepare CQE */
+        comp_ctx = g_malloc(sizeof(CompHandlerCtx));
+        comp_ctx->dev = dev;
+        comp_ctx->cq_handle = qp->send_cq_handle;
+        comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
+        comp_ctx->cqe.qp = qp_handle;
+        comp_ctx->cqe.opcode = wqe->hdr.opcode;
+
+        rdma_backend_post_send(&dev->backend_dev, &qp->backend_qp, qp->qp_type,
+                               (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
+                               (union ibv_gid *)wqe->hdr.wr.ud.av.dgid,
+                               wqe->hdr.wr.ud.remote_qpn,
+                               wqe->hdr.wr.ud.remote_qkey, comp_ctx);
+
+        pvrdma_ring_read_inc(ring);
+
+        wqe = pvrdma_ring_next_elem_read(ring);
+    }
+
+    return 0;
+}
+
+int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
+{
+    RdmaRmQP *qp;
+    PvrdmaRqWqe *wqe;
+    PvrdmaRing *ring;
+
+    pr_dbg("qp_handle=%d\n", qp_handle);
+
+    qp = rdma_rm_get_qp(&dev->rdma_dev_res, qp_handle);
+    if (unlikely(!qp)) {
+        return -EINVAL;
+    }
+
+    ring = &((PvrdmaRing *)qp->opaque)[1];
+    pr_dbg("rring=%p\n", ring);
+
+    wqe = (struct PvrdmaRqWqe *)pvrdma_ring_next_elem_read(ring);
+    while (wqe) {
+        CompHandlerCtx *comp_ctx;
+
+        pr_dbg("wr_id=%ld\n", wqe->hdr.wr_id);
+
+        /* Prepare CQE */
+        comp_ctx = g_malloc(sizeof(CompHandlerCtx));
+        comp_ctx->dev = dev;
+        comp_ctx->cq_handle = qp->recv_cq_handle;
+        comp_ctx->cqe.qp = qp_handle;
+        comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
+
+        rdma_backend_post_recv(&dev->backend_dev, &dev->rdma_dev_res,
+                               &qp->backend_qp, qp->qp_type,
+                               (struct ibv_sge *)&wqe->sge[0], wqe->hdr.num_sge,
+                               comp_ctx);
+
+        pvrdma_ring_read_inc(ring);
+
+        wqe = pvrdma_ring_next_elem_read(ring);
+    }
+
+    return 0;
+}
+
+void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle)
+{
+    RdmaRmCQ *cq;
+
+    cq = rdma_rm_get_cq(dev_res, cq_handle);
+    if (!cq) {
+        pr_dbg("Invalid CQ# %d\n", cq_handle);
+    }
+
+    rdma_backend_poll_cq(dev_res, &cq->backend_cq);
+}
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.h b/hw/rdma/vmw/pvrdma_qp_ops.h
new file mode 100644
index 0000000000..ac46bf7fdf
--- /dev/null
+++ b/hw/rdma/vmw/pvrdma_qp_ops.h
@@ -0,0 +1,27 @@ 
+/*
+ * QEMU VMWARE paravirtual RDMA QP Operations
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ *     Yuval Shaia <yuval.shaia@oracle.com>
+ *     Marcel Apfelbaum <marcel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PVRDMA_QP_H
+#define PVRDMA_QP_H
+
+#include "pvrdma.h"
+
+int pvrdma_qp_ops_init(void);
+void pvrdma_qp_ops_fini(void);
+int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle);
+int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle);
+void pvrdma_cq_poll(RdmaDeviceResources *dev_res, uint32_t cq_handle);
+
+#endif