diff mbox

[rdma-next,V1,08/10] IB/umem: Add support to huge ODP

Message ID 20170405062359.26623-9-leon@kernel.org (mailing list archive)
State Accepted
Headers show

Commit Message

Leon Romanovsky April 5, 2017, 6:23 a.m. UTC
From: Artemy Kovalyov <artemyko@mellanox.com>

Add IB_ACCESS_HUGETLB ib_reg_mr flag.
Hugetlb region registered with this flag
will use single translation entry per huge page.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/umem.c     |  2 +-
 drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++++--
 include/rdma/ib_umem_odp.h         |  6 ++++--
 include/rdma/ib_verbs.h            |  1 +
 4 files changed, 23 insertions(+), 5 deletions(-)

Comments

Shiraz Saleem April 5, 2017, 4:45 p.m. UTC | #1
On Wed, Apr 05, 2017 at 09:23:57AM +0300, Leon Romanovsky wrote:
> From: Artemy Kovalyov <artemyko@mellanox.com>
> 
> Add IB_ACCESS_HUGETLB ib_reg_mr flag.
> Hugetlb region registered with this flag
> will use single translation entry per huge page.
> 
> Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
> Signed-off-by: Leon Romanovsky <leon@kernel.org>
> ---
>  drivers/infiniband/core/umem.c     |  2 +-
>  drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++++--
>  include/rdma/ib_umem_odp.h         |  6 ++++--
>  include/rdma/ib_verbs.h            |  1 +
>  4 files changed, 23 insertions(+), 5 deletions(-)
> 
> @@ -315,6 +317,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
>  	if (!mm)
>  		return -EINVAL;
>  
> +	if (access & IB_ACCESS_HUGETLB) {
> +		struct vm_area_struct *vma;
> +		struct hstate *h;
> +
> +		vma = find_vma(mm, ib_umem_start(umem));
> +		if (!vma || !is_vm_hugetlb_page(vma))
> +			return -EINVAL;
> +		h = hstate_vma(vma);
> +		umem->page_shift = huge_page_shift(h);
> +		umem->hugetlb = 1;

User memory buffer could span multiple VMAs right? So shouldn’t we check all VMAs of the umem 
before setting the hugetlb flag?

> +	} else {
> +		umem->hugetlb = 0;
> +	}
> +
>  	/* Prevent creating ODP MRs in child processes */
>  	rcu_read_lock();
>  	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Leon Romanovsky April 5, 2017, 5:33 p.m. UTC | #2
On Wed, Apr 05, 2017 at 11:45:39AM -0500, Shiraz Saleem wrote:
> On Wed, Apr 05, 2017 at 09:23:57AM +0300, Leon Romanovsky wrote:
> > From: Artemy Kovalyov <artemyko@mellanox.com>
> >
> > Add IB_ACCESS_HUGETLB ib_reg_mr flag.
> > Hugetlb region registered with this flag
> > will use single translation entry per huge page.
> >
> > Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
> > Signed-off-by: Leon Romanovsky <leon@kernel.org>
> > ---
> >  drivers/infiniband/core/umem.c     |  2 +-
> >  drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++++--
> >  include/rdma/ib_umem_odp.h         |  6 ++++--
> >  include/rdma/ib_verbs.h            |  1 +
> >  4 files changed, 23 insertions(+), 5 deletions(-)
> >
> > @@ -315,6 +317,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
> >  	if (!mm)
> >  		return -EINVAL;
> >
> > +	if (access & IB_ACCESS_HUGETLB) {
> > +		struct vm_area_struct *vma;
> > +		struct hstate *h;
> > +
> > +		vma = find_vma(mm, ib_umem_start(umem));
> > +		if (!vma || !is_vm_hugetlb_page(vma))
> > +			return -EINVAL;
> > +		h = hstate_vma(vma);
> > +		umem->page_shift = huge_page_shift(h);
> > +		umem->hugetlb = 1;
>
> User memory buffer could span multiple VMAs right? So shouldn’t we check all VMAs of the umem
> before setting the hugetlb flag?

It depends on what you want to achieve. The current implementation uses
a best-effort strategy and assumes that the user called madvise
with MADV_HUGETLB beforehand. In that case, all VMAs will have the hugetlb
property set. If the user calls madvise for only part of the umem region,
the application and ODP will continue to work, but won't be efficient
in their memory usage.

Thanks

>
> > +	} else {
> > +		umem->hugetlb = 0;
> > +	}
> > +
> >  	/* Prevent creating ODP MRs in child processes */
> >  	rcu_read_lock();
> >  	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
diff mbox

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 6b87c051ffd4..3dbf811d3c51 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -133,7 +133,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	if (access & IB_ACCESS_ON_DEMAND) {
 		put_pid(umem->pid);
-		ret = ib_umem_odp_get(context, umem);
+		ret = ib_umem_odp_get(context, umem, access);
 		if (ret) {
 			kfree(umem);
 			return ERR_PTR(ret);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 73053c8a9e3b..0780b1afefa9 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -38,6 +38,7 @@ 
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
@@ -306,7 +307,8 @@  struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
 }
 EXPORT_SYMBOL(ib_alloc_odp_umem);
 
-int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
+		    int access)
 {
 	int ret_val;
 	struct pid *our_pid;
@@ -315,6 +317,20 @@  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 	if (!mm)
 		return -EINVAL;
 
+	if (access & IB_ACCESS_HUGETLB) {
+		struct vm_area_struct *vma;
+		struct hstate *h;
+
+		vma = find_vma(mm, ib_umem_start(umem));
+		if (!vma || !is_vm_hugetlb_page(vma))
+			return -EINVAL;
+		h = hstate_vma(vma);
+		umem->page_shift = huge_page_shift(h);
+		umem->hugetlb = 1;
+	} else {
+		umem->hugetlb = 0;
+	}
+
 	/* Prevent creating ODP MRs in child processes */
 	rcu_read_lock();
 	our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
@@ -325,7 +341,6 @@  int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
 		goto out_mm;
 	}
 
-	umem->hugetlb = 0;
 	umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
 	if (!umem->odp_data) {
 		ret_val = -ENOMEM;
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 542cd8b3414c..fb67554aabd6 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -84,7 +84,8 @@  struct ib_umem_odp {
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
-int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
+		    int access);
 struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
 				  unsigned long addr,
 				  size_t size);
@@ -154,7 +155,8 @@  static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 static inline int ib_umem_odp_get(struct ib_ucontext *context,
-				  struct ib_umem *umem)
+				  struct ib_umem *umem,
+				  int access)
 {
 	return -EINVAL;
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0f1813c13687..b60288beb3fe 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1336,6 +1336,7 @@  enum ib_access_flags {
 	IB_ACCESS_MW_BIND	= (1<<4),
 	IB_ZERO_BASED		= (1<<5),
 	IB_ACCESS_ON_DEMAND     = (1<<6),
+	IB_ACCESS_HUGETLB	= (1<<7),
 };
 
 /*