[v5,for-next,1/1] RDMA/hns: Support direct wqe of userspace

Message ID 20211130135740.4559-2-liangwenpeng@huawei.com (mailing list archive)
State Superseded
Series RDMA/hns: Support direct WQE of userspace

Commit Message

Wenpeng Liang Nov. 30, 2021, 1:57 p.m. UTC
From: Yixing Liu <liuyixing1@huawei.com>

Add the direct WQE enable switch and address mapping.

Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  8 +--
 drivers/infiniband/hw/hns/hns_roce_main.c   | 42 +++++++++++++---
 drivers/infiniband/hw/hns/hns_roce_pd.c     |  3 ++
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 54 ++++++++++++++++++++-
 include/uapi/rdma/hns-abi.h                 |  2 +
 5 files changed, 98 insertions(+), 11 deletions(-)
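
For context, a userspace provider is expected to consume the new dwqe_mmap_key roughly as
follows (a hypothetical sketch, not part of this kernel patch; the matching rdma-core change
is separate and the names used here are illustrative):

	/* resp is the struct hns_roce_ib_create_qp_resp returned by create QP;
	 * cmd_fd is the uverbs command fd of the ucontext. dwqe_mmap_key is the
	 * mmap offset the kernel registered via rdma_user_mmap_entry_insert_range().
	 */
	if (resp.cap_flags & HNS_ROCE_QP_CAP_DIRECT_WQE) {
		qp->dwqe_page = mmap(NULL, dwqe_page_size, PROT_WRITE,
				     MAP_SHARED, cmd_fd, resp.dwqe_mmap_key);
		if (qp->dwqe_page == MAP_FAILED)
			return errno;	/* fall back to normal doorbell mode */
	}

where dwqe_page_size matches the HNS_ROCE_DWQE_SIZE length registered on the kernel side.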

Comments

Leon Romanovsky Dec. 1, 2021, 9:01 a.m. UTC | #1
On Tue, Nov 30, 2021 at 09:57:40PM +0800, Wenpeng Liang wrote:
> From: Yixing Liu <liuyixing1@huawei.com>
> 
> Add the direct WQE enable switch and address mapping.
> 
> Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
> Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_device.h |  8 +--
>  drivers/infiniband/hw/hns/hns_roce_main.c   | 42 +++++++++++++---
>  drivers/infiniband/hw/hns/hns_roce_pd.c     |  3 ++
>  drivers/infiniband/hw/hns/hns_roce_qp.c     | 54 ++++++++++++++++++++-
>  include/uapi/rdma/hns-abi.h                 |  2 +
>  5 files changed, 98 insertions(+), 11 deletions(-)
> 

The description of pgprot_device() is still unsatisfactory,
but everything else looks ok.

Thanks,
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Barry Song Dec. 3, 2021, 10:18 a.m. UTC | #2
> +	switch (entry->mmap_type) {
> +	case HNS_ROCE_MMAP_TYPE_DB:
> +		prot = pgprot_noncached(vma->vm_page_prot);
> +		break;
> +	case HNS_ROCE_MMAP_TYPE_TPTR:
> +		prot = vma->vm_page_prot;
> +		break;
> +	/*
> +	 * The BAR region of direct WQE supports Early Write Ack,
> +	 * so pgprot_device is used to improve performance.
> +	 */
> +	case HNS_ROCE_MMAP_TYPE_DWQE:
> +		prot = pgprot_device(vma->vm_page_prot);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}

I am still not convinced why HNS_ROCE_MMAP_TYPE_DB needs nocache while HNS_ROCE_MMAP_TYPE_DWQE
needs device. Generally people use ioremap() to map PCI BAR spaces in PCI device drivers, and
ioremap() is pretty much nGnRE:
#define ioremap(addr, size)             __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
#define ioremap_np(addr, size)          __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
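
For comparison, pgprot_noncached() and pgprot_device() on arm64 resolve to the following
(roughly, per arch/arm64/include/asm/pgtable.h; exact PTE flags may vary between kernel versions):
#define pgprot_noncached(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN)
#define pgprot_device(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN)
so pgprot_noncached() gives nGnRnE (no Early Write Ack) while pgprot_device() gives nGnRE, i.e.
the same attribute as ioremap().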

I am only seeing four places in the kernel that use the nE variant:
   #   line  filename / context / line
   1    866  drivers/of/address.c <<of_iomap>>
             return ioremap_np(res.start, resource_size(&res));
   2    901  drivers/of/address.c <<of_io_request_and_map>>
             mem = ioremap_np(res.start, resource_size(&res));
   3     89  include/linux/io.h <<pci_remap_cfgspace>>
             return ioremap_np(offset, size) ?: ioremap(offset, size);
   4     47  lib/devres.c <<__devm_ioremap>>
             addr = ioremap_np(offset, size);

So I guess nGnRE is quite safe for PCI device BAR spaces. Config space is a different story,
though, which is the 3rd entry in the list above:

#ifdef CONFIG_PCI
/*
 * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
 * Posting") mandate non-posted configuration transactions. This default
 * implementation attempts to use the ioremap_np() API to provide this
 * on arches that support it, and falls back to ioremap() on those that
 * don't. Overriding this function is deprecated; arches that properly
 * support non-posted accesses should implement ioremap_np() instead, which
 * this default implementation can then use to return mappings compliant with
 * the PCI specification.
 */
#ifndef pci_remap_cfgspace
#define pci_remap_cfgspace pci_remap_cfgspace
static inline void __iomem *pci_remap_cfgspace(phys_addr_t offset,
                                               size_t size)
{
        return ioremap_np(offset, size) ?: ioremap(offset, size);
}
#endif
#endif

Thanks
Barry
Wenpeng Liang Dec. 6, 2021, 1:34 p.m. UTC | #3
On 2021/12/3 18:18, Barry Song wrote:
>> +	switch (entry->mmap_type) {
>> +	case HNS_ROCE_MMAP_TYPE_DB:
>> +		prot = pgprot_noncached(vma->vm_page_prot);
>> +		break;
>> +	case HNS_ROCE_MMAP_TYPE_TPTR:
>> +		prot = vma->vm_page_prot;
>> +		break;
>> +	/*
>> +	 * The BAR region of direct WQE supports Early Write Ack,
>> +	 * so pgprot_device is used to improve performance.
>> +	 */
>> +	case HNS_ROCE_MMAP_TYPE_DWQE:
>> +		prot = pgprot_device(vma->vm_page_prot);
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
> 
> I am still not convinced why HNS_ROCE_MMAP_TYPE_DB needs nocache while HNS_ROCE_MMAP_TYPE_DWQE
> needs device. Generally people use ioremap() to map PCI BAR spaces in PCI device drivers, and
> ioremap() is pretty much nGnRE:
> #define ioremap(addr, size)             __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
> #define ioremap_np(addr, size)          __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
> 
> I am only seeing four places in the kernel that use the nE variant:
>    #   line  filename / context / line
>    1    866  drivers/of/address.c <<of_iomap>>
>              return ioremap_np(res.start, resource_size(&res));
>    2    901  drivers/of/address.c <<of_io_request_and_map>>
>              mem = ioremap_np(res.start, resource_size(&res));
>    3     89  include/linux/io.h <<pci_remap_cfgspace>>
>              return ioremap_np(offset, size) ?: ioremap(offset, size);
>    4     47  lib/devres.c <<__devm_ioremap>>
>              addr = ioremap_np(offset, size);
> 
> So I guess nGnRE is quite safe for PCI device BAR spaces. Config space is a different story,
> though, which is the 3rd entry in the list above:
> 
> #ifdef CONFIG_PCI
> /*
>  * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and
>  * Posting") mandate non-posted configuration transactions. This default
>  * implementation attempts to use the ioremap_np() API to provide this
>  * on arches that support it, and falls back to ioremap() on those that
>  * don't. Overriding this function is deprecated; arches that properly
>  * support non-posted accesses should implement ioremap_np() instead, which
>  * this default implementation can then use to return mappings compliant with
>  * the PCI specification.
>  */
> #ifndef pci_remap_cfgspace
> #define pci_remap_cfgspace pci_remap_cfgspace
> static inline void __iomem *pci_remap_cfgspace(phys_addr_t offset,
>                                                size_t size)
> {
>         return ioremap_np(offset, size) ?: ioremap(offset, size);
> }
> #endif
> #endif
> 
> Thanks
> Barry

Thank you for your comment. After reconsidering, HNS_ROCE_MMAP_TYPE_DB should also use
the device attribute. I will submit a separate patch to fix this first.
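
The change would be along these lines (an assumed sketch of the follow-up, not the actual patch):

	case HNS_ROCE_MMAP_TYPE_DB:
		prot = pgprot_device(vma->vm_page_prot);
		break;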

Thanks
Wenpeng
Jason Gunthorpe Dec. 6, 2021, 3:30 p.m. UTC | #4
On Fri, Dec 03, 2021 at 06:18:55PM +0800, Barry Song wrote:
> > +	switch (entry->mmap_type) {
> > +	case HNS_ROCE_MMAP_TYPE_DB:
> > +		prot = pgprot_noncached(vma->vm_page_prot);
> > +		break;
> > +	case HNS_ROCE_MMAP_TYPE_TPTR:
> > +		prot = vma->vm_page_prot;
> > +		break;
> > +	/*
> > +	 * The BAR region of direct WQE supports Early Write Ack,
> > +	 * so pgprot_device is used to improve performance.
> > +	 */
> > +	case HNS_ROCE_MMAP_TYPE_DWQE:
> > +		prot = pgprot_device(vma->vm_page_prot);
> > +		break;
> > +	default:
> > +		return -EINVAL;
> > +	}
> 
> I am still not convinced why HNS_ROCE_MMAP_TYPE_DB needs nocache
> while HNS_ROCE_MMAP_TYPE_DWQE needs device. Generally people use
> ioremap() to map PCI BAR spaces in PCI device drivers, and ioremap()
> is pretty much nGnRE:

Me too. Please confirm with your HW that the device really cannot
handle the device attribute for the DB, and it is better to put the
comment on the noncached case, as that is the abnormal thing here.

Jason

Patch

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index e35164ae7376..bc7112a205a7 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -182,6 +182,7 @@  enum {
 	HNS_ROCE_CAP_FLAG_FRMR                  = BIT(8),
 	HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL		= BIT(9),
 	HNS_ROCE_CAP_FLAG_ATOMIC		= BIT(10),
+	HNS_ROCE_CAP_FLAG_DIRECT_WQE		= BIT(12),
 	HNS_ROCE_CAP_FLAG_SDI_MODE		= BIT(14),
 	HNS_ROCE_CAP_FLAG_STASH			= BIT(17),
 };
@@ -228,6 +229,7 @@  struct hns_roce_uar {
 enum hns_roce_mmap_type {
 	HNS_ROCE_MMAP_TYPE_DB = 1,
 	HNS_ROCE_MMAP_TYPE_TPTR,
+	HNS_ROCE_MMAP_TYPE_DWQE,
 };
 
 struct hns_user_mmap_entry {
@@ -627,10 +629,6 @@  struct hns_roce_work {
 	u32 queue_num;
 };
 
-enum {
-	HNS_ROCE_QP_CAP_DIRECT_WQE = BIT(5),
-};
-
 struct hns_roce_qp {
 	struct ib_qp		ibqp;
 	struct hns_roce_wq	rq;
@@ -675,6 +673,7 @@  struct hns_roce_qp {
 	struct list_head	node; /* all qps are on a list */
 	struct list_head	rq_node; /* all recv qps are on a list */
 	struct list_head	sq_node; /* all send qps are on a list */
+	struct hns_user_mmap_entry *dwqe_mmap_entry;
 };
 
 struct hns_roce_ib_iboe {
@@ -1010,6 +1009,7 @@  struct hns_roce_dev {
 	u32 func_num;
 	u32 is_vf;
 	u32 cong_algo_tmpl_id;
+	u64 dwqe_page;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 8233bec053ee..700e32c584d8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -310,9 +310,25 @@  hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address,
 	entry->address = address;
 	entry->mmap_type = mmap_type;
 
-	ret = rdma_user_mmap_entry_insert_exact(
-		ucontext, &entry->rdma_entry, length,
-		mmap_type == HNS_ROCE_MMAP_TYPE_DB ? 0 : 1);
+	switch (mmap_type) {
+	case HNS_ROCE_MMAP_TYPE_DB:
+		ret = rdma_user_mmap_entry_insert_exact(
+				ucontext, &entry->rdma_entry, length, 0);
+		break;
+	case HNS_ROCE_MMAP_TYPE_TPTR:
+		ret = rdma_user_mmap_entry_insert_exact(
+				ucontext, &entry->rdma_entry, length, 1);
+		break;
+	case HNS_ROCE_MMAP_TYPE_DWQE:
+		ret = rdma_user_mmap_entry_insert_range(
+				ucontext, &entry->rdma_entry, length, 2,
+				U32_MAX);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
 	if (ret) {
 		kfree(entry);
 		return NULL;
@@ -439,10 +455,24 @@  static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
 
 	entry = to_hns_mmap(rdma_entry);
 	pfn = entry->address >> PAGE_SHIFT;
-	prot = vma->vm_page_prot;
 
-	if (entry->mmap_type != HNS_ROCE_MMAP_TYPE_TPTR)
-		prot = pgprot_noncached(prot);
+	switch (entry->mmap_type) {
+	case HNS_ROCE_MMAP_TYPE_DB:
+		prot = pgprot_noncached(vma->vm_page_prot);
+		break;
+	case HNS_ROCE_MMAP_TYPE_TPTR:
+		prot = vma->vm_page_prot;
+		break;
+	/*
+	 * The BAR region of direct WQE supports Early Write Ack,
+	 * so pgprot_device is used to improve performance.
+	 */
+	case HNS_ROCE_MMAP_TYPE_DWQE:
+		prot = pgprot_device(vma->vm_page_prot);
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	ret = rdma_user_mmap_io(uctx, vma, pfn, rdma_entry->npages * PAGE_SIZE,
 				prot, rdma_entry);
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index 81ffad77ae42..03c349f7ebbe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -115,6 +115,9 @@  int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar)
 	} else {
 		uar->pfn = ((pci_resource_start(hr_dev->pci_dev, 2))
 			   >> PAGE_SHIFT);
+		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE)
+			hr_dev->dwqe_page =
+				pci_resource_start(hr_dev->pci_dev, 4);
 	}
 
 	return 0;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 4fcab1611548..c84e1c23722c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -379,6 +379,11 @@  static int alloc_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 	return ret;
 }
 
+static void qp_user_mmap_entry_remove(struct hns_roce_qp *hr_qp)
+{
+	rdma_user_mmap_entry_remove(&hr_qp->dwqe_mmap_entry->rdma_entry);
+}
+
 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct xarray *xa = &hr_dev->qp_table_xa;
@@ -780,7 +785,11 @@  static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		goto err_inline;
 	}
 
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE)
+		hr_qp->en_flags |= HNS_ROCE_QP_CAP_DIRECT_WQE;
+
 	return 0;
+
 err_inline:
 	free_rq_inline_buf(hr_qp);
 
@@ -822,6 +831,35 @@  static inline bool kernel_qp_has_rdb(struct hns_roce_dev *hr_dev,
 		hns_roce_qp_has_rq(init_attr));
 }
 
+static int qp_mmap_entry(struct hns_roce_qp *hr_qp,
+			 struct hns_roce_dev *hr_dev,
+			 struct ib_udata *udata,
+			 struct hns_roce_ib_create_qp_resp *resp)
+{
+	struct hns_roce_ucontext *uctx =
+		rdma_udata_to_drv_context(udata,
+			struct hns_roce_ucontext, ibucontext);
+	struct rdma_user_mmap_entry *rdma_entry;
+	u64 address;
+
+	address = hr_dev->dwqe_page + hr_qp->qpn * HNS_ROCE_DWQE_SIZE;
+
+	hr_qp->dwqe_mmap_entry =
+		hns_roce_user_mmap_entry_insert(&uctx->ibucontext, address,
+						HNS_ROCE_DWQE_SIZE,
+						HNS_ROCE_MMAP_TYPE_DWQE);
+
+	if (!hr_qp->dwqe_mmap_entry) {
+		ibdev_err(&hr_dev->ib_dev, "failed to get dwqe mmap entry.\n");
+		return -ENOMEM;
+	}
+
+	rdma_entry = &hr_qp->dwqe_mmap_entry->rdma_entry;
+	resp->dwqe_mmap_key = rdma_user_mmap_get_offset(rdma_entry);
+
+	return 0;
+}
+
 static int alloc_user_qp_db(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp,
 			    struct ib_qp_init_attr *init_attr,
@@ -909,10 +947,16 @@  static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		hr_qp->en_flags |= HNS_ROCE_QP_CAP_OWNER_DB;
 
 	if (udata) {
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE) {
+			ret = qp_mmap_entry(hr_qp, hr_dev, udata, resp);
+			if (ret)
+				return ret;
+		}
+
 		ret = alloc_user_qp_db(hr_dev, hr_qp, init_attr, udata, ucmd,
 				       resp);
 		if (ret)
-			return ret;
+			goto err_remove_qp;
 	} else {
 		ret = alloc_kernel_qp_db(hr_dev, hr_qp, init_attr);
 		if (ret)
@@ -920,6 +964,12 @@  static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	}
 
 	return 0;
+
+err_remove_qp:
+	if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
+		qp_user_mmap_entry_remove(hr_qp);
+
+	return ret;
 }
 
 static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
@@ -933,6 +983,8 @@  static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
 		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
 			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
+			qp_user_mmap_entry_remove(hr_qp);
 	} else {
 		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
 			hns_roce_free_db(hr_dev, &hr_qp->rdb);
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 42b177655560..f6fde06db4b4 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -77,10 +77,12 @@  enum hns_roce_qp_cap_flags {
 	HNS_ROCE_QP_CAP_RQ_RECORD_DB = 1 << 0,
 	HNS_ROCE_QP_CAP_SQ_RECORD_DB = 1 << 1,
 	HNS_ROCE_QP_CAP_OWNER_DB = 1 << 2,
+	HNS_ROCE_QP_CAP_DIRECT_WQE = 1 << 5,
 };
 
 struct hns_roce_ib_create_qp_resp {
 	__aligned_u64 cap_flags;
+	__aligned_u64 dwqe_mmap_key;
 };
 
 struct hns_roce_ib_alloc_ucontext_resp {