
[rdma-next,v1,1/6] RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs

Message ID 20190219145745.13476-2-shiraz.saleem@intel.com (mailing list archive)
State Superseded
Series Add APIs to get contiguous memory blocks aligned to a HW supported page size

Commit Message

Saleem, Shiraz Feb. 19, 2019, 2:57 p.m. UTC
Combine contiguous regions of PAGE_SIZE pages into single scatter list
entries while adding to the scatter table. This minimizes the number of
entries in the scatter list and reduces the DMA mapping overhead,
particularly with the IOMMU.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 92 ++++++++++++++++++++++++++++++++++--------
 include/rdma/ib_umem.h         | 17 ++++----
 2 files changed, 84 insertions(+), 25 deletions(-)
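
For illustration only (a simplified, hypothetical sketch, not part of the patch):
two pinned pages fold into the same scatterlist entry exactly when their PFNs are
consecutive, so a run of N physically contiguous pages becomes one SGE of length
N * PAGE_SIZE instead of N entries of PAGE_SIZE each.

	/* Hypothetical sketch of the coalescing rule, not the patch code:
	 * count how many SGEs a pinned page array would need.
	 */
	static unsigned long count_sg_entries(const unsigned long *pfn,
					      unsigned long npages)
	{
		unsigned long entries = 0, i = 0;

		while (i < npages) {
			unsigned long len = 1;

			/* extend while the next page is physically adjacent */
			while (i + len < npages && pfn[i] + len == pfn[i + len])
				len++;

			entries++;	/* one SGE of len << PAGE_SHIFT bytes */
			i += len;
		}
		return entries;
	}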

Comments

Ira Weiny Feb. 19, 2019, 7:41 p.m. UTC | #1
On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/core/umem.c | 92 ++++++++++++++++++++++++++++++++++--------
>  include/rdma/ib_umem.h         | 17 ++++----
>  2 files changed, 84 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index b69d3ef..e8611b7 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,25 +39,22 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>  
>  #include "uverbs.h"
>  
> -
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
>  
>  	if (umem->nmap > 0)
> -		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
> -				umem->npages,
> +		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
>  				DMA_BIDIRECTIONAL);
>  
> -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
> -
> -		page = sg_page(sg);
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
> +		page = sg_page_iter_page(&sg_iter);
>  		if (!PageDirty(page) && umem->writable && dirty)
>  			set_page_dirty_lock(page);
>  		put_page(page);
> @@ -66,6 +63,67 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
>  	sg_free_table(&umem->sg_head);
>  }
>  
> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * sg: current scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + * nents: [out] number of entries in the scatterlist
> + *
> + * Return new end of scatterlist
> + */
> +struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> +					 struct page **page_list,
> +					 unsigned long npages,
> +					 int *nents)
> +{
> +	unsigned long first_pfn;
> +	unsigned long i = 0;
> +	bool update_cur_sg = false;
> +	bool first = !sg_page(sg) ? true : false;
> +
> +	/* Check if new page_list is contiguous with end of previous page_list
> +	 */
> +	if (!first &&
> +	    (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
> +	     page_to_pfn(page_list[0])))
> +		update_cur_sg = true;
> +
> +	while (i != npages) {
> +		unsigned long len;
> +		struct page *first_page = page_list[i];
> +
> +		first_pfn = page_to_pfn(first_page);
> +
> +		/* Compute the number of contiguous pages we have starting
> +		 * at i
> +		 */
> +		for (len = 0; i != npages &&
> +			      first_pfn + len == page_to_pfn(page_list[i]);
> +		     i++, len++)
> +		;
> +
> +		/* Squash N contiguous pages from page_list into current sge */
> +		if (update_cur_sg) {
> +			sg->length += len << PAGE_SHIFT;
> +			sg_set_page(sg, sg_page(sg), sg->length, 0);
> +			update_cur_sg = false;
> +			continue;
> +		}
> +
> +		/* Squash N contiguous pages into next sge or first sge */
> +		if (!first)
> +			sg = sg_next(sg);
> +
> +		(*nents)++;
> +		sg->length = len << PAGE_SHIFT;
> +		sg_set_page(sg, first_page, sg->length, 0);
> +		first = false;
> +	}
> +
> +	return sg;
> +}
> +
>  /**
>   * ib_umem_get - Pin and DMA map userspace memory.
>   *
> @@ -93,7 +151,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
>  	int ret;
>  	int i;
>  	unsigned long dma_attrs = 0;
> -	struct scatterlist *sg, *sg_list_start;
> +	struct scatterlist *sg;
>  	unsigned int gup_flags = FOLL_WRITE;
>  
>  	context = rdma_get_ucontext(udata);
> @@ -181,7 +239,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
>  	if (!umem->writable)
>  		gup_flags |= FOLL_FORCE;
>  
> -	sg_list_start = umem->sg_head.sgl;
> +	sg = umem->sg_head.sgl;
>  
>  	while (npages) {
>  		down_read(&mm->mmap_sem);
> @@ -198,24 +256,24 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
>  		cur_base += ret * PAGE_SIZE;
>  		npages   -= ret;
>  
> +		sg = ib_umem_add_sg_table(sg, page_list, ret, &umem->sg_nents);
> +
>  		/* Continue to hold the mmap_sem as vma_list access
>  		 * needs to be protected.
>  		 */
> -		for_each_sg(sg_list_start, sg, ret, i) {
> +		for (i = 0; i < ret && umem->hugetlb; i++) {
>  			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
>  				umem->hugetlb = 0;
> -
> -			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
>  		}
> -		up_read(&mm->mmap_sem);
>  
> -		/* preparing for next loop */
> -		sg_list_start = sg;
> +		up_read(&mm->mmap_sem);
>  	}
>  
> +	sg_mark_end(sg);
> +
>  	umem->nmap = ib_dma_map_sg_attrs(context->device,
>  				  umem->sg_head.sgl,
> -				  umem->npages,
> +				  umem->sg_nents,
>  				  DMA_BIDIRECTIONAL,
>  				  dma_attrs);
>  
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 73af05d..18407a4 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -42,18 +42,19 @@
>  struct ib_umem_odp;
>  
>  struct ib_umem {
> -	struct ib_ucontext     *context;
> -	struct mm_struct       *owning_mm;
> -	size_t			length;
> -	unsigned long		address;
> -	int			page_shift;
> +	struct ib_ucontext *context;
> +	struct mm_struct *owning_mm;
> +	size_t length;
> +	unsigned long address;
> +	int page_shift;
>

NIT: Did anything change here?

>  	u32 writable : 1;
>  	u32 hugetlb : 1;
>  	u32 is_odp : 1;
> -	struct work_struct	work;
> +	struct work_struct work;
>  	struct sg_table sg_head;
> -	int             nmap;
> -	int             npages;
> +	int sg_nents;


Why can't we use sg_head.nents instead of tracking this in umem?

FWIW I don't see where sg_table.nents is used anywhere...  That is odd to me.

Ira


> +	int nmap;
> +	int npages;
>  };
>  
>  /* Returns the offset of the umem start relative to the first page. */
> -- 
> 1.8.3.1
>
Leon Romanovsky Feb. 20, 2019, 2:37 p.m. UTC | #2
On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/core/umem.c | 92 ++++++++++++++++++++++++++++++++++--------
>  include/rdma/ib_umem.h         | 17 ++++----
>  2 files changed, 84 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index b69d3ef..e8611b7 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,25 +39,22 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>
>  #include "uverbs.h"
>
> -
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
>
>  	if (umem->nmap > 0)
> -		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
> -				umem->npages,
> +		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
>  				DMA_BIDIRECTIONAL);
>
> -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
> -
> -		page = sg_page(sg);
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
> +		page = sg_page_iter_page(&sg_iter);
>  		if (!PageDirty(page) && umem->writable && dirty)
>  			set_page_dirty_lock(page);
>  		put_page(page);
> @@ -66,6 +63,67 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
>  	sg_free_table(&umem->sg_head);
>  }
>
> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * sg: current scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + * nents: [out] number of entries in the scatterlist
> + *
> + * Return new end of scatterlist
> + */
> +struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> +					 struct page **page_list,
> +					 unsigned long npages,
> +					 int *nents)
> +{
> +	unsigned long first_pfn;
> +	unsigned long i = 0;
> +	bool update_cur_sg = false;
> +	bool first = !sg_page(sg) ? true : false;

For "bool" declarations you don't need to set explicitly.
"bool first = !sg_page(sg)" is the same as you wrote.

> +
> +	/* Check if new page_list is contiguous with end of previous page_list
> +	 */
> +	if (!first &&
> +	    (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
> +	     page_to_pfn(page_list[0])))
> +		update_cur_sg = true;
> +
> +	while (i != npages) {
> +		unsigned long len;
> +		struct page *first_page = page_list[i];
> +
> +		first_pfn = page_to_pfn(first_page);
> +
> +		/* Compute the number of contiguous pages we have starting
> +		 * at i
> +		 */
> +		for (len = 0; i != npages &&
> +			      first_pfn + len == page_to_pfn(page_list[i]);
> +		     i++, len++)
> +		;
> +
> +		/* Squash N contiguous pages from page_list into current sge */
> +		if (update_cur_sg) {
> +			sg->length += len << PAGE_SHIFT;
> +			sg_set_page(sg, sg_page(sg), sg->length, 0);
> +			update_cur_sg = false;
> +			continue;
> +		}
> +
> +		/* Squash N contiguous pages into next sge or first sge */
> +		if (!first)
> +			sg = sg_next(sg);

How will it work with the last element? In that case, sg will be NULL.

> +
> +		(*nents)++;
> +		sg->length = len << PAGE_SHIFT;

And you will dereference it here.

> +		sg_set_page(sg, first_page, sg->length, 0);
> +		first = false;
> +	}
> +
> +	return sg;
> +}
> +

Thanks
Leon Romanovsky Feb. 20, 2019, 5:20 p.m. UTC | #3
On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/core/umem.c | 92 ++++++++++++++++++++++++++++++++++--------
>  include/rdma/ib_umem.h         | 17 ++++----
>  2 files changed, 84 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index b69d3ef..e8611b7 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,25 +39,22 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>
>  #include "uverbs.h"
>
> -
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
>
>  	if (umem->nmap > 0)
> -		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
> -				umem->npages,
> +		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
>  				DMA_BIDIRECTIONAL);
>
> -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
> -
> -		page = sg_page(sg);
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
> +		page = sg_page_iter_page(&sg_iter);
>  		if (!PageDirty(page) && umem->writable && dirty)
>  			set_page_dirty_lock(page);
>  		put_page(page);
> @@ -66,6 +63,67 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
>  	sg_free_table(&umem->sg_head);
>  }
>
> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * sg: current scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + * nents: [out] number of entries in the scatterlist
> + *
> + * Return new end of scatterlist
> + */
> +struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> +					 struct page **page_list,
> +					 unsigned long npages,
> +					 int *nents)

It produces compilation warnings for me; you forgot to put static in
front of the function declaration.

500b1c079f73 (HEAD -> build) RDMA/umem: Remove hugetlb flag
drivers/infiniband/core/umem.c:75:21: warning: no previous prototype for
'ib_umem_add_sg_table' [-Wmissing-prototypes]
 struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
                      ^~~~~~~~~~~~~~~~~~~~
Thanks
Saleem, Shiraz Feb. 23, 2019, 6:17 p.m. UTC | #4
On Tue, Feb 19, 2019 at 11:41:56AM -0800, Ira Weiny wrote:
> On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> > Combine contiguous regions of PAGE_SIZE pages
> > into single scatter list entries while adding
> > to the scatter table. This minimizes the number
> > of the entries in the scatter list and reduces
> > the DMA mapping overhead, particularly with the
> > IOMMU.
> > 
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > ---
[...]

> >  
> > diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> > index 73af05d..18407a4 100644
> > --- a/include/rdma/ib_umem.h
> > +++ b/include/rdma/ib_umem.h
> > @@ -42,18 +42,19 @@
> >  struct ib_umem_odp;
> >  
> >  struct ib_umem {
> > -	struct ib_ucontext     *context;
> > -	struct mm_struct       *owning_mm;
> > -	size_t			length;
> > -	unsigned long		address;
> > -	int			page_shift;
> > +	struct ib_ucontext *context;
> > +	struct mm_struct *owning_mm;
> > +	size_t length;
> > +	unsigned long address;
> > +	int page_shift;
> >
> 
> NIT: Did anything change here?

I removed the horizontal alignment as it was not consistent
across all struct members.

> 
> >  	u32 writable : 1;
> >  	u32 hugetlb : 1;
> >  	u32 is_odp : 1;
> > -	struct work_struct	work;
> > +	struct work_struct work;
> >  	struct sg_table sg_head;
> > -	int             nmap;
> > -	int             npages;
> > +	int sg_nents;
> 
> 
> Why can't we use sg_head.nents instead of tracking this in umem?
>

This is because we start off by over-allocating the sg table, sizing it for the
total number of pages in the umem. But when we iterate the page list after pinning
and combine pages into the SGEs, the scatterlist can turn out to be much smaller,
i.e. umem->sg_nents < umem->sg_head.nents. We mark the new end of the scatterlist
with sg_mark_end() and DMA map with umem->sg_nents.

Ideally we should not be over-allocating the sg table; that is part of future work.
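
In other words, a condensed sketch of the resulting flow in ib_umem_get()
(simplified, error handling and locking omitted, not the literal patch):

	sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); /* worst case: 1 SGE per page */
	sg = umem->sg_head.sgl;

	while (npages) {
		ret = get_user_pages_longterm(cur_base, ..., page_list, vma_list);
		/* coalesce this batch; umem->sg_nents counts entries actually used */
		sg = ib_umem_add_sg_table(sg, page_list, ret, &umem->sg_nents);
		cur_base += ret * PAGE_SIZE;
		npages -= ret;
	}

	sg_mark_end(sg);	/* new, shorter end of the list */
	umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl,
					 umem->sg_nents, /* not sg_head.nents */
					 DMA_BIDIRECTIONAL, dma_attrs);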

Shiraz
 
> FWIW I don't see where sg_table.nents is used anywhere...  That is odd to me.
>
Saleem, Shiraz Feb. 23, 2019, 6:45 p.m. UTC | #5
On Wed, Feb 20, 2019 at 04:37:46PM +0200, Leon Romanovsky wrote:
> On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> > Combine contiguous regions of PAGE_SIZE pages
> > into single scatter list entries while adding
> > to the scatter table. This minimizes the number
> > of the entries in the scatter list and reduces
> > the DMA mapping overhead, particularly with the
> > IOMMU.
> >
[....]

> > +struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> > +					 struct page **page_list,
> > +					 unsigned long npages,
> > +					 int *nents)
> > +{
> > +	unsigned long first_pfn;
> > +	unsigned long i = 0;
> > +	bool update_cur_sg = false;
> > +	bool first = !sg_page(sg) ? true : false;
> 
> For "bool" declarations you don't need to set explicitly.
> "bool first = !sg_page(sg)" is the same as you wrote.

Yes. Thanks :)

> 
> > +
> > +	/* Check if new page_list is contiguous with end of previous page_list
> > +	 */
> > +	if (!first &&
> > +	    (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
> > +	     page_to_pfn(page_list[0])))
> > +		update_cur_sg = true;
> > +
> > +	while (i != npages) {
> > +		unsigned long len;
> > +		struct page *first_page = page_list[i];
> > +
> > +		first_pfn = page_to_pfn(first_page);
> > +
> > +		/* Compute the number of contiguous pages we have starting
> > +		 * at i
> > +		 */
> > +		for (len = 0; i != npages &&
> > +			      first_pfn + len == page_to_pfn(page_list[i]);
> > +		     i++, len++)
> > +		;
> > +
> > +		/* Squash N contiguous pages from page_list into current sge */
> > +		if (update_cur_sg) {
> > +			sg->length += len << PAGE_SHIFT;
> > +			sg_set_page(sg, sg_page(sg), sg->length, 0);
> > +			update_cur_sg = false;
> > +			continue;
> > +		}
> > +
> > +		/* Squash N contiguous pages into next sge or first sge */
> > +		if (!first)
> > +			sg = sg_next(sg);
> 
> How will it work with last element? For such case, sg will be NULL.

We have allocated a scatterlist array with one entry per pinned page, and each
time we advance to a new sge it consumes at least one page from page_list, so we
won't iterate past the end of the list. In the worst case, where the entire page
list is discontiguous, every page gets its own entry and the last page is assigned
to the last sg.

> > +
> > +		(*nents)++;
> > +		sg->length = len << PAGE_SHIFT;
> 
> And you will derefence it here.
> 
> > +		sg_set_page(sg, first_page, sg->length, 0);
> > +		first = false;
> > +	}
> > +
> > +	return sg;
> > +}
> > +
> 
> Thanks
Saleem, Shiraz Feb. 23, 2019, 6:46 p.m. UTC | #6
On Wed, Feb 20, 2019 at 07:20:26PM +0200, Leon Romanovsky wrote:
> On Tue, Feb 19, 2019 at 08:57:40AM -0600, Shiraz Saleem wrote:
> > Combine contiguous regions of PAGE_SIZE pages
> > into single scatter list entries while adding
> > to the scatter table. This minimizes the number
> > of the entries in the scatter list and reduces
> > the DMA mapping overhead, particularly with the
> > IOMMU.
> >
> > Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> > Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > ---
[...]
> >
> > +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> > + *
> > + * sg: current scatterlist entry
> > + * page_list: array of npage struct page pointers
> > + * npages: number of pages in page_list
> > + * nents: [out] number of entries in the scatterlist
> > + *
> > + * Return new end of scatterlist
> > + */
> > +struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
> > +					 struct page **page_list,
> > +					 unsigned long npages,
> > +					 int *nents)
> 
> It produces compilation warnings for me, you forgot to put static in
> front of function declaration.
> 
> 500b1c079f73 (HEAD -> build) RDMA/umem: Remove hugetlb flag
> drivers/infiniband/core/umem.c:75:21: warning: no previous prototype for
> _ib_umem_add_sg_table_ [-Wmissing-prototypes]
>  struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
>                       ^~~~~~~~~~~~~~~~~~~~

I'll fix it. Thanks!
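
Presumably the fix is just marking the helper static, since it has no callers
outside umem.c (a sketch of the likely change, not the posted follow-up):

	static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
							struct page **page_list,
							unsigned long npages,
							int *nents)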

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b69d3ef..e8611b7 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,25 +39,22 @@ 
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -66,6 +63,67 @@  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+					 struct page **page_list,
+					 unsigned long npages,
+					 int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg) ? true : false;
+
+	/* Check if new page_list is contiguous with end of previous page_list
+	 */
+	if (!first &&
+	    (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+	     page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]);
+		     i++, len++)
+		;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg) {
+			sg->length += len << PAGE_SHIFT;
+			sg_set_page(sg, sg_page(sg), sg->length, 0);
+			update_cur_sg = false;
+			continue;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg->length = len << PAGE_SHIFT;
+		sg_set_page(sg, first_page, sg->length, 0);
+		first = false;
+	}
+
+	return sg;
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -93,7 +151,7 @@  struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	context = rdma_get_ucontext(udata);
@@ -181,7 +239,7 @@  struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
@@ -198,24 +256,24 @@  struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
+		sg = ib_umem_add_sg_table(sg, page_list, ret, &umem->sg_nents);
+
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
 
+	sg_mark_end(sg);
+
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
-				  umem->npages,
+				  umem->sg_nents,
 				  DMA_BIDIRECTIONAL,
 				  dma_attrs);
 
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 73af05d..18407a4 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -42,18 +42,19 @@ 
 struct ib_umem_odp;
 
 struct ib_umem {
-	struct ib_ucontext     *context;
-	struct mm_struct       *owning_mm;
-	size_t			length;
-	unsigned long		address;
-	int			page_shift;
+	struct ib_ucontext *context;
+	struct mm_struct *owning_mm;
+	size_t length;
+	unsigned long address;
+	int page_shift;
 	u32 writable : 1;
 	u32 hugetlb : 1;
 	u32 is_odp : 1;
-	struct work_struct	work;
+	struct work_struct work;
 	struct sg_table sg_head;
-	int             nmap;
-	int             npages;
+	int sg_nents;
+	int nmap;
+	int npages;
 };
 
 /* Returns the offset of the umem start relative to the first page. */