Message ID | 20170405062359.26623-9-leon@kernel.org (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
On Wed, Apr 05, 2017 at 09:23:57AM +0300, Leon Romanovsky wrote: > From: Artemy Kovalyov <artemyko@mellanox.com> > > Add IB_ACCESS_HUGETLB ib_reg_mr flag. > Hugetlb region registered with this flag > will use single translation entry per huge page. > > Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com> > Signed-off-by: Leon Romanovsky <leon@kernel.org> > --- > drivers/infiniband/core/umem.c | 2 +- > drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++++-- > include/rdma/ib_umem_odp.h | 6 ++++-- > include/rdma/ib_verbs.h | 1 + > 4 files changed, 23 insertions(+), 5 deletions(-) > > @@ -315,6 +317,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) > if (!mm) > return -EINVAL; > > + if (access & IB_ACCESS_HUGETLB) { > + struct vm_area_struct *vma; > + struct hstate *h; > + > + vma = find_vma(mm, ib_umem_start(umem)); > + if (!vma || !is_vm_hugetlb_page(vma)) > + return -EINVAL; > + h = hstate_vma(vma); > + umem->page_shift = huge_page_shift(h); > + umem->hugetlb = 1; User memory buffer could span multiple VMAs right? So shouldn’t we check all VMAs of the umem before setting the hugetlb flag? > + } else { > + umem->hugetlb = 0; > + } > + > /* Prevent creating ODP MRs in child processes */ > rcu_read_lock(); > our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Apr 05, 2017 at 11:45:39AM -0500, Shiraz Saleem wrote: > On Wed, Apr 05, 2017 at 09:23:57AM +0300, Leon Romanovsky wrote: > > From: Artemy Kovalyov <artemyko@mellanox.com> > > > > Add IB_ACCESS_HUGETLB ib_reg_mr flag. > > Hugetlb region registered with this flag > > will use single translation entry per huge page. > > > > Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com> > > Signed-off-by: Leon Romanovsky <leon@kernel.org> > > --- > > drivers/infiniband/core/umem.c | 2 +- > > drivers/infiniband/core/umem_odp.c | 19 +++++++++++++++++-- > > include/rdma/ib_umem_odp.h | 6 ++++-- > > include/rdma/ib_verbs.h | 1 + > > 4 files changed, 23 insertions(+), 5 deletions(-) > > > > @@ -315,6 +317,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) > > if (!mm) > > return -EINVAL; > > > > + if (access & IB_ACCESS_HUGETLB) { > > + struct vm_area_struct *vma; > > + struct hstate *h; > > + > > + vma = find_vma(mm, ib_umem_start(umem)); > > + if (!vma || !is_vm_hugetlb_page(vma)) > > + return -EINVAL; > > + h = hstate_vma(vma); > > + umem->page_shift = huge_page_shift(h); > > + umem->hugetlb = 1; > > User memory buffer could span multiple VMAs right? So shouldn’t we check all VMAs of the umem > before setting the hugetlb flag? It depends on what you want to achieve. The current implementation uses a best-effort strategy and assumes that the user called madvise with MADV_HUGETLB beforehand. In that case, all VMAs will have the hugetlb property. If the user calls madvise for only a partial umem region, the application and ODP will continue to work, but won't be efficient in its memory usage. Thanks > > > + } else { > > + umem->hugetlb = 0; > > + } > > + > > /* Prevent creating ODP MRs in child processes */ > > rcu_read_lock(); > > our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 6b87c051ffd4..3dbf811d3c51 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -133,7 +133,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (access & IB_ACCESS_ON_DEMAND) { put_pid(umem->pid); - ret = ib_umem_odp_get(context, umem); + ret = ib_umem_odp_get(context, umem, access); if (ret) { kfree(umem); return ERR_PTR(ret); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 73053c8a9e3b..0780b1afefa9 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> +#include <linux/hugetlb.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> @@ -306,7 +307,8 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access) { int ret_val; struct pid *our_pid; @@ -315,6 +317,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) if (!mm) return -EINVAL; + if (access & IB_ACCESS_HUGETLB) { + struct vm_area_struct *vma; + struct hstate *h; + + vma = find_vma(mm, ib_umem_start(umem)); + if (!vma || !is_vm_hugetlb_page(vma)) + return -EINVAL; + h = hstate_vma(vma); + umem->page_shift = huge_page_shift(h); + umem->hugetlb = 1; + } else { + umem->hugetlb = 0; + } + /* Prevent creating ODP MRs in child processes */ rcu_read_lock(); our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -325,7 +341,6 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) goto out_mm; } - umem->hugetlb = 0; umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); if (!umem->odp_data) { ret_val = -ENOMEM; diff --git 
a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 542cd8b3414c..fb67554aabd6 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -84,7 +84,8 @@ struct ib_umem_odp { #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access); struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size); @@ -154,7 +155,8 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem) + struct ib_umem *umem, + int access) { return -EINVAL; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0f1813c13687..b60288beb3fe 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1336,6 +1336,7 @@ enum ib_access_flags { IB_ACCESS_MW_BIND = (1<<4), IB_ZERO_BASED = (1<<5), IB_ACCESS_ON_DEMAND = (1<<6), + IB_ACCESS_HUGETLB = (1<<7), }; /*