
[rdma-next,1/6] RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs

Message ID: 20181224223227.18016-2-shiraz.saleem@intel.com
State: Changes Requested
Series: Introduce APIs to get DMA addresses aligned to a HW supported page size

Commit Message

Saleem, Shiraz Dec. 24, 2018, 10:32 p.m. UTC
Combine contiguous regions of PAGE_SIZE pages
into single scatter list entries while adding
to the scatter table. This minimizes the number
of entries in the scatter list and reduces the
DMA mapping overhead, particularly with the
IOMMU.

Additionally, update driver call sites to use the
for_each_sg_page variant to iterate through the
pages in the SGE. In certain call sites, there is
an assumption of a single SGE-to-page mapping and
hence the iterator variant is required. In others,
it is simpler to use the iterator variant.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c                 |  87 ++++++++---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c       |  21 +--
 drivers/infiniband/hw/bnxt_re/qplib_res.c      |   9 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c    |  27 ++--
 drivers/infiniband/hw/cxgb4/mem.c              |  31 ++--
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c     |   7 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c     |  25 ++-
 drivers/infiniband/hw/hns/hns_roce_mr.c        |  88 +++++------
 drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  35 ++---
 drivers/infiniband/hw/mthca/mthca_provider.c   |  33 ++--
 drivers/infiniband/hw/nes/nes_verbs.c          | 203 +++++++++++--------------
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c    |  54 +++----
 drivers/infiniband/hw/qedr/verbs.c             |  56 +++----
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c |  21 +--
 drivers/infiniband/sw/rdmavt/mr.c              |   8 +-
 drivers/infiniband/sw/rxe/rxe_mr.c             |   7 +-
 16 files changed, 342 insertions(+), 370 deletions(-)

Comments

Jason Gunthorpe Jan. 2, 2019, 6:29 p.m. UTC | #1
On Mon, Dec 24, 2018 at 04:32:22PM -0600, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
> 
> Additionally, update driver call sites to use the
> for_each_sg_page variant to iterate through the
> pages in the SGE. In certain call sites, there is
> the assumption of single SGE to page mapping and
> hence the iterator variant is required. In others,
> it is simpler to use the iterator variant.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/core/umem.c                 |  87 ++++++++---
>  drivers/infiniband/hw/bnxt_re/ib_verbs.c       |  21 +--
>  drivers/infiniband/hw/bnxt_re/qplib_res.c      |   9 +-
>  drivers/infiniband/hw/cxgb3/iwch_provider.c    |  27 ++--
>  drivers/infiniband/hw/cxgb4/mem.c              |  31 ++--
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.c     |   7 +-
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c     |  25 ++-
>  drivers/infiniband/hw/hns/hns_roce_mr.c        |  88 +++++------
>  drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  35 ++---
>  drivers/infiniband/hw/mthca/mthca_provider.c   |  33 ++--
>  drivers/infiniband/hw/nes/nes_verbs.c          | 203 +++++++++++--------------
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c    |  54 +++----
>  drivers/infiniband/hw/qedr/verbs.c             |  56 +++----
>  drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c |  21 +--
>  drivers/infiniband/sw/rdmavt/mr.c              |   8 +-
>  drivers/infiniband/sw/rxe/rxe_mr.c             |   7 +-
>  16 files changed, 342 insertions(+), 370 deletions(-)

I think the driver parts of this should be split and done first, and
they should probably be split into per-driver patches so we can get
proper driver maintainer review. You need to CC the driver maintainers
on postings like this.

Most are generally an improvement, so just split and post them. They
can just go right away, independent of this series when ack'd.

> diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
> index 59eeac5..2ec668f 100644
> --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
> +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
> @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
>  static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
>  		       struct scatterlist *sghead, u32 pages, u32 pg_size)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	bool is_umem = false;
>  	int i;
>  
> @@ -116,12 +116,13 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
>  	} else {
>  		i = 0;
>  		is_umem = true;
> -		for_each_sg(sghead, sg, pages, i) {
> -			pbl->pg_map_arr[i] = sg_dma_address(sg);
> -			pbl->pg_arr[i] = sg_virt(sg);
> +		for_each_sg_page(sghead, &sg_iter, pages, 0) {
> +			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
> +			pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter));
>  			if (!pbl->pg_arr[i])
>  				goto fail;

I wonder if pg_arr[] is big enough now? It wasn't easy to tell, and
there is no bounding on i..
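
If it is not, a bound in the new loop would be straightforward; a minimal
sketch, assuming pg_arr[] is still allocated with 'pages' entries:

		for_each_sg_page(sghead, &sg_iter, pages, 0) {
			if (unlikely(i >= pages))
				goto fail;
			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
			pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter));
			if (!pbl->pg_arr[i])
				goto fail;

			i++;
			pbl->pg_count++;
		}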

> @@ -563,19 +562,15 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  
>  	i = n = 0;
>  
> -	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
> -			len = sg_dma_len(sg) >> shift;
> -			for (k = 0; k < len; ++k) {
> -				pages[i++] = cpu_to_be64(sg_dma_address(sg) +
> -							 (k << shift));
> -				if (i == PAGE_SIZE / sizeof *pages) {
> -					err = iwch_write_pbl(mhp, pages, i, n);
> -					if (err)
> -						goto pbl_done;
> -					n += i;
> -					i = 0;
> -				}
> -			}
> +	for_each_sg_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
> +		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
> +		if (i == PAGE_SIZE / sizeof *pages) {
> +			err = iwch_write_pbl(mhp, pages, i, n);
> +			if (err)
> +				goto pbl_done;
> +			n += i;
> +			i = 0;
> +		}
>  	}

Stuff like this is really an improvement, getting rid of the double
loop, lots of examples of this in here..

> diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
> index 49c9541..8f02059 100644
> --- a/drivers/infiniband/sw/rdmavt/mr.c
> +++ b/drivers/infiniband/sw/rdmavt/mr.c
> @@ -381,8 +381,8 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  {
>  	struct rvt_mr *mr;
>  	struct ib_umem *umem;
> -	struct scatterlist *sg;
> -	int n, m, entry;
> +	struct sg_page_iter sg_iter;
> +	int n, m;
>  	struct ib_mr *ret;
>  
>  	if (length == 0)
> @@ -411,10 +411,10 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>  	mr->mr.page_shift = umem->page_shift;
>  	m = 0;
>  	n = 0;
> -	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
>  		void *vaddr;
>  
> -		vaddr = page_address(sg_page(sg));
> +		vaddr = page_address(sg_page_iter_page(&sg_iter));
>  		if (!vaddr) {
>  			ret = ERR_PTR(-EINVAL);
>  			goto bail_inval;

[..]
		mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);

When converting to use sg_page make sure to strip out all the
umem->page_shift usages too - the length of the current sg_iter is
just PAGE_SIZE. (basically any driver but mlx5 using page_shift is a
mistake)
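
i.e. the length assignment above would simply become (sketch):

	mr->mr.map[m]->segs[n].length = PAGE_SIZE;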

Should check with Dennis if rvt can handle variable page sizes here
and if so just correct it to use the length of the SGL entry not
page_shift.

> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index 9d3916b..a6937de 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -162,11 +162,10 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
>  		      u64 length, u64 iova, int access, struct ib_udata *udata,
>  		      struct rxe_mem *mem)
>  {
> -	int			entry;
>  	struct rxe_map		**map;
>  	struct rxe_phys_buf	*buf = NULL;
>  	struct ib_umem		*umem;
> -	struct scatterlist	*sg;
> +	struct sg_page_iter	sg_iter;
>  	int			num_buf;
>  	void			*vaddr;
>  	int err;
> @@ -199,8 +198,8 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
>  	if (length > 0) {
>  		buf = map[0]->buf;
>  
> -		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> -			vaddr = page_address(sg_page(sg));
> +		for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
> +			vaddr = page_address(sg_page_iter_page(&sg_iter));
>  			if (!vaddr) {
>  				pr_warn("null vaddr\n");
>  				err = -ENOMEM;

Same comment

Jason
Jason Gunthorpe Jan. 2, 2019, 11:51 p.m. UTC | #2
On Mon, Dec 24, 2018 at 04:32:22PM -0600, Shiraz Saleem wrote:
> Combine contiguous regions of PAGE_SIZE pages
> into single scatter list entries while adding
> to the scatter table. This minimizes the number
> of the entries in the scatter list and reduces
> the DMA mapping overhead, particularly with the
> IOMMU.
> 
> Additionally, update driver call sites to use the
> for_each_sg_page variant to iterate through the
> pages in the SGE. In certain call sites, there is
> the assumption of single SGE to page mapping and
> hence the iterator variant is required. In others,
> it is simpler to use the iterator variant.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c                 |  87 ++++++++---
>  drivers/infiniband/hw/bnxt_re/ib_verbs.c       |  21 +--
>  drivers/infiniband/hw/bnxt_re/qplib_res.c      |   9 +-
>  drivers/infiniband/hw/cxgb3/iwch_provider.c    |  27 ++--
>  drivers/infiniband/hw/cxgb4/mem.c              |  31 ++--
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.c     |   7 +-
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c     |  25 ++-
>  drivers/infiniband/hw/hns/hns_roce_mr.c        |  88 +++++------
>  drivers/infiniband/hw/i40iw/i40iw_verbs.c      |  35 ++---
>  drivers/infiniband/hw/mthca/mthca_provider.c   |  33 ++--
>  drivers/infiniband/hw/nes/nes_verbs.c          | 203 +++++++++++--------------
>  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c    |  54 +++----
>  drivers/infiniband/hw/qedr/verbs.c             |  56 +++----
>  drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c |  21 +--
>  drivers/infiniband/sw/rdmavt/mr.c              |   8 +-
>  drivers/infiniband/sw/rxe/rxe_mr.c             |   7 +-
>  16 files changed, 342 insertions(+), 370 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index c6144df..64bacc5 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -39,25 +39,23 @@
>  #include <linux/export.h>
>  #include <linux/hugetlb.h>
>  #include <linux/slab.h>
> +#include <linux/pagemap.h>
>  #include <rdma/ib_umem_odp.h>
>  
>  #include "uverbs.h"
>  
> -
>  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
>  {
> -	struct scatterlist *sg;
> +	struct sg_page_iter sg_iter;
>  	struct page *page;
> -	int i;
> +	int nents = sg_nents(umem->sg_head.sgl);

I wonder if we should cache sg_nents in the umem? We compute it twice
and since this now directly adds each sgl we know the nents
naturally..

> +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> + *
> + * cur: current scatterlist entry
> + * nxt: next scatterlist entry
> + * page_list: array of npage struct page pointers
> + * npages: number of pages in page_list
> + */
> +static void ib_umem_add_sg_table(struct scatterlist **cur,
> +				 struct scatterlist **nxt,
> +				 struct page **page_list,
> +				 unsigned long npages)

I think this is more understandable as:

struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sgl,
                                 struct page **page_list,
				 unsigned long npages,
                                 bool first)

Where first is set if sgl does not already point to a valid entry, and
the return result is the new end of the list.
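
For illustration, a rough sketch of that shape, following the logic of the
posted helper (untested, names as in the patch):

static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sgl,
						struct page **page_list,
						unsigned long npages,
						bool first)
{
	unsigned long i = 0;

	/* Try to extend the last populated entry if the new pages are
	 * physically contiguous with it.
	 */
	if (!first) {
		unsigned long last_pfn = page_to_pfn(sg_page(sgl)) +
					 (sgl->length >> PAGE_SHIFT);
		unsigned long len;

		if (last_pfn == page_to_pfn(page_list[0])) {
			for (len = 0; i != npages &&
			     last_pfn + len == page_to_pfn(page_list[i]);
			     i++, len++)
				;
			sgl->length += len << PAGE_SHIFT;
		}
	}

	/* Each remaining run of contiguous pages gets its own entry */
	while (i != npages) {
		struct page *first_page = page_list[i];
		unsigned long first_pfn = page_to_pfn(first_page);
		unsigned long len;

		for (len = 0; i != npages &&
		     first_pfn + len == page_to_pfn(page_list[i]);
		     i++, len++)
			;

		sgl = first ? sgl : sg_next(sgl);
		first = false;
		sg_set_page(sgl, first_page, len << PAGE_SHIFT, 0);
	}
	return sgl;
}

The caller would seed it with umem->sg_head.sgl and first = true, then pass
the returned pointer (and first = false) on subsequent iterations of the
pinning loop.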


> +	/* Check if new page_list is contiguous with end of previous page_list*/
> +	if (*cur != *nxt) {
> +		cur_sglen = (*cur)->length >> PAGE_SHIFT;
> +		first_pfn = page_to_pfn(sg_page(*cur));
> +		if (first_pfn + cur_sglen == page_to_pfn(page_list[0]))
> +			update_cur_sg = true;
> +	}
> +
> +	while (i != npages) {
> +		unsigned long len;
> +		struct page *first_page = page_list[i];
> +
> +		first_pfn = page_to_pfn(first_page);
> +
> +		/* Compute the number of contiguous pages we have starting
> +		 * at i
> +		 */
> +		for (len = 0; i != npages &&
> +			      first_pfn + len == page_to_pfn(page_list[i]);
> +		     i++, len++);

The semicolon should be on a blank line alone for this construction
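
i.e. (whitespace only):

		for (len = 0; i != npages &&
			      first_pfn + len == page_to_pfn(page_list[i]);
		     i++, len++)
			;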

> +
> +		/* Squash first N contiguous pages from page_list into current
> +		 * SGE
> +		 */
> +		if (update_cur_sg) {
> +			sg_set_page(*cur, sg_page(*cur),
> +				    (len + cur_sglen) * PAGE_SIZE, 0);

This would be nicer as (*cur)->length instead of cur_sglen

And if we are touching cur->length, it begs the question why not just
write cur->length += len << PAGE_SHIFT ?

> +	sg_mark_end(sg_cur);

This seems like a good approach for now. We can look at trimming the
unused memory later..

Jason
Jason Gunthorpe Jan. 3, 2019, 10:18 p.m. UTC | #3
On Mon, Dec 24, 2018 at 04:32:22PM -0600, Shiraz Saleem wrote:
> -	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> -		pages = sg_dma_len(sg) >> PAGE_SHIFT;
> -		for (i = 0; i < pages; i++) {
> -			paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
> -			if (pbl_tbl == pbl_tbl_orig)
> -				*pbl_tbl++ = paddr & ~page_mask;
> -			else if ((paddr & page_mask) == 0)
> -				*pbl_tbl++ = paddr;
> -		}
> +	struct sg_page_iter sg_iter;
> +
> +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
> +		paddr = sg_page_iter_dma_address(&sg_iter);

Actually, I'm looking at this again, and thinking this looks
problematic..

for_each_sg_page iterates over the original SGL, it does not iterate
over the DMA mapped SGL (ie it does not use sg_dma_len()) - so this is
not an entirely equivalent transformation.

I think you need to add a for_each_sg_dma_page() to make this work
correctly? The only difference would be to use sg_dma_len instead of
length?

And I wonder how many of these places in drivers work properly if
PAGE_SIZE != 4k :(

CH: Does sg_page_iter_dma_address() even make any sense??

Jason
Christoph Hellwig Jan. 4, 2019, 7:18 a.m. UTC | #4
On Thu, Jan 03, 2019 at 03:18:19PM -0700, Jason Gunthorpe wrote:
> On Mon, Dec 24, 2018 at 04:32:22PM -0600, Shiraz Saleem wrote:
> > -	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
> > -		pages = sg_dma_len(sg) >> PAGE_SHIFT;
> > -		for (i = 0; i < pages; i++) {
> > -			paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
> > -			if (pbl_tbl == pbl_tbl_orig)
> > -				*pbl_tbl++ = paddr & ~page_mask;
> > -			else if ((paddr & page_mask) == 0)
> > -				*pbl_tbl++ = paddr;
> > -		}
> > +	struct sg_page_iter sg_iter;
> > +
> > +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
> > +		paddr = sg_page_iter_dma_address(&sg_iter);
> 
> Actually, I'm looking at this again, and thinking this looks
> problematic..
> 
> for_each_sg_page iterates over the original SGL, it does not iterate
> over the DMA mapped SGL (ie it does not use sg_dma_len()) - so this is
> not an entirely equivalent transformation.
> 
> I think you need to add a for_each_sg_dma_page() to make this work
> correctly? The only difference would be to use sg_dma_len instead of
> length?
> 
> And I wonder how many of these places in drivers work properly if
> PAGE_SIZE != 4k :(
> 
> CH: Does sg_page_iter_dma_address() even make any sense??

No, it doesn't and should never have been added.  It's one of these
typical scatterlist abuses in the media code :(
Christoph Hellwig Jan. 4, 2019, 7:22 a.m. UTC | #5
On Fri, Jan 04, 2019 at 08:18:52AM +0100, Christoph Hellwig wrote:
> > CH: Does sg_page_iter_dma_address() even make any sense??
> 
> No, it doesn't and should never have been added.  It's one of these
> typical scatterlist abuses in the media code :(

Actually I take this back.  Due to the sg_page_iter structure iterating
over segments first and then pages inside the segment it actually
is ok.  It just looks very awkward.
Saleem, Shiraz Jan. 4, 2019, 3:43 p.m. UTC | #6
On Wed, Jan 02, 2019 at 04:51:24PM -0700, Jason Gunthorpe wrote:
> On Mon, Dec 24, 2018 at 04:32:22PM -0600, Shiraz Saleem wrote:
> > Combine contiguous regions of PAGE_SIZE pages
> > into single scatter list entries while adding
> > to the scatter table. This minimizes the number
> > of the entries in the scatter list and reduces
> > the DMA mapping overhead, particularly with the
> > IOMMU.
> > 
> > Additionally, update driver call sites to use the
> > for_each_sg_page variant to iterate through the
> > pages in the SGE. In certain call sites, there is
> > the assumption of single SGE to page mapping and
> > hence the iterator variant is required. In others,
> > it is simpler to use the iterator variant.
> >
[..]
 
> >  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
> >  {
> > -	struct scatterlist *sg;
> > +	struct sg_page_iter sg_iter;
> >  	struct page *page;
> > -	int i;
> > +	int nents = sg_nents(umem->sg_head.sgl);
> 
> I wonder if we should cache sg_nents in the umem? We compute it twice
> and since this now directly adds each sgl we know the nents
> naturally..

Ah! So you're saying: compute nents while we add to the scatter table
and cache it in umem, as opposed to using this helper in both places?
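
Something like this, perhaps (sketch only; the field name is made up):

	/* new field in struct ib_umem */
	unsigned int sg_nents;

	/* in ib_umem_get(), once the table has been terminated */
	sg_mark_end(sg_cur);
	umem->sg_nents = sg_nents(umem->sg_head.sgl);

	/* __ib_umem_release() and the dma map/unmap calls then use the
	 * cached umem->sg_nents instead of walking the list again.
	 */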
 
> > +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
> > + *
> > + * cur: current scatterlist entry
> > + * nxt: next scatterlist entry
> > + * page_list: array of npage struct page pointers
> > + * npages: number of pages in page_list
> > + */
> > +static void ib_umem_add_sg_table(struct scatterlist **cur,
> > +				 struct scatterlist **nxt,
> > +				 struct page **page_list,
> > +				 unsigned long npages)
> 
> I think this is more understandable as:
> 
> struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sgl,
>                                  struct page **page_list,
> 				 unsigned long npages,
>                                  bool first)
> 
> Where first is set if sgl does not already point to a valid entry, and
> the return result is new end of the list.
> 

OK. I'll try to tweak the function accordingly.

> > +	/* Check if new page_list is contiguous with end of previous page_list*/
> > +	if (*cur != *nxt) {
> > +		cur_sglen = (*cur)->length >> PAGE_SHIFT;
> > +		first_pfn = page_to_pfn(sg_page(*cur));
> > +		if (first_pfn + cur_sglen == page_to_pfn(page_list[0]))
> > +			update_cur_sg = true;
> > +	}
> > +
> > +	while (i != npages) {
> > +		unsigned long len;
> > +		struct page *first_page = page_list[i];
> > +
> > +		first_pfn = page_to_pfn(first_page);
> > +
> > +		/* Compute the number of contiguous pages we have starting
> > +		 * at i
> > +		 */
> > +		for (len = 0; i != npages &&
> > +			      first_pfn + len == page_to_pfn(page_list[i]);
> > +		     i++, len++);
> 
> The semicolon should be on a blank line alone for this construction

OK.

> > +
> > +		/* Squash first N contiguous pages from page_list into current
> > +		 * SGE
> > +		 */
> > +		if (update_cur_sg) {
> > +			sg_set_page(*cur, sg_page(*cur),
> > +				    (len + cur_sglen) * PAGE_SIZE, 0);
> 
> This would be nicer as (*cur)->length instead of cur_sglen
> 
> And if we are touching cur->length, it begs the question why not just
> write cur->length += len << PAGE_SHIFT ?
OK.

> > +	sg_mark_end(sg_cur);
> 
> This seems like a good approach for now. We can look at trimming the
> unused memory later..

Agreed.
Jason Gunthorpe Jan. 4, 2019, 4:16 p.m. UTC | #7
On Fri, Jan 04, 2019 at 08:22:13AM +0100, Christoph Hellwig wrote:
> On Fri, Jan 04, 2019 at 08:18:52AM +0100, Christoph Hellwig wrote:
> > > CH: Does sg_page_iter_dma_address() even make any sense??
> > 
> > No, it doesn't and should never have been added.  It's one of these
> > typical scatterlist abuses in the media code :(
> 
> Actually I take this back.  Due to the sg_page_iter structure iterating
> over segments first and then pages inside the segment it actually
> is ok.  It just looks very awkward.

I'm looking at __sg_page_iter_next which does this:

static int sg_page_count(struct scatterlist *sg)
{
	return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;
}

bool __sg_page_iter_next(struct sg_page_iter *piter)
{
[..]
	while (piter->sg_pgoffset >= sg_page_count(piter->sg)) {
		piter->sg_pgoffset -= sg_page_count(piter->sg);

Which isn't going to yield the right math if 
  sg->length != sg_dma_len(sg)

Surely we need a version of sg_page_iter_next() that uses sg_dma_len
to compute page_count if the caller intends to call
sg_page_iter_dma_address() ?

I assume it works for media because they have arranged things so that
sg_dma_len == sg->length, but in the general case with dma_map_sg that
can't be assumed.

Jason
Saleem, Shiraz Jan. 4, 2019, 6:22 p.m. UTC | #8
On Fri, Jan 04, 2019 at 09:16:13AM -0700, Jason Gunthorpe wrote:
> On Fri, Jan 04, 2019 at 08:22:13AM +0100, Christoph Hellwig wrote:
> > On Fri, Jan 04, 2019 at 08:18:52AM +0100, Christoph Hellwig wrote:
> > > > CH: Does sg_page_iter_dma_address() even make any sense??
> > > 
> > > No, it doesn't and should never have been added.  It's one of these
> > > typical scatterlist abuses in the media code :(
> > 
> > Actually I take this back.  Due to the sg_page_iter structure iterating
> > over segments first and then pages inside the segment it actually
> > is ok.  It just looks very awkward.
> 
> I'm looking at __sg_page_iter_next which does this:
> 
> static int sg_page_count(struct scatterlist *sg)
> {
> 	return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;
> }
> 
> bool __sg_page_iter_next(struct sg_page_iter *piter)
> {
> [..]
> 	while (piter->sg_pgoffset >= sg_page_count(piter->sg)) {
> 		piter->sg_pgoffset -= sg_page_count(piter->sg);
> 
> Which isn't going to yield the right math if 
>   sg->length != sg_dma_len(sg)
> 
> Surely we need a version of sg_page_iter_next() that uses sg_dma_len
> to compute page_count if the caller intents to call
> sg_page_iter_dma_address() ?

/*
 * sg page iterator
 *
 * Iterates over sg entries page-by-page.  On each successful iteration,
 * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
 * to get the current page and its dma address. @piter->sg will point to the
 * sg holding this page and @piter->sg_pgoffset to the page's page offset
 * within the sg. The iteration will stop either when a maximum number of sg
 * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
 */
struct sg_page_iter {
	struct scatterlist	*sg;		/* sg holding the page */
	unsigned int		sg_pgoffset;	/* page offset within the sg */
[..]

If the intent of the page iterator was to grab the cur page and dma address,
then shouldn't sg_page_count just be updated to use sg_dma_len(sg)?
 
> I assume it works for media because they have arranged things so that
> sg_dma_len == sg->length, but in the general case with dma_map_sg that
> can't be assumed.
>
Jason Gunthorpe Jan. 4, 2019, 6:49 p.m. UTC | #9
On Fri, Jan 04, 2019 at 12:22:10PM -0600, Shiraz Saleem wrote:
> On Fri, Jan 04, 2019 at 09:16:13AM -0700, Jason Gunthorpe wrote:
> > On Fri, Jan 04, 2019 at 08:22:13AM +0100, Christoph Hellwig wrote:
> > > On Fri, Jan 04, 2019 at 08:18:52AM +0100, Christoph Hellwig wrote:
> > > > > CH: Does sg_page_iter_dma_address() even make any sense??
> > > > 
> > > > No, it doesn't and should never have been added.  It's one of these
> > > > typical scatterlist abuses in the media code :(
> > > 
> > > Actually I take this back.  Due to the sg_page_iter structure iterating
> > > over segments first and then pages inside the segment it actually
> > > is ok.  It just looks very awkward.
> > 
> > I'm looking at __sg_page_iter_next which does this:
> > 
> > static int sg_page_count(struct scatterlist *sg)
> > {
> > 	return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;
> > }
> > 
> > bool __sg_page_iter_next(struct sg_page_iter *piter)
> > {
> > [..]
> > 	while (piter->sg_pgoffset >= sg_page_count(piter->sg)) {
> > 		piter->sg_pgoffset -= sg_page_count(piter->sg);
> > 
> > Which isn't going to yield the right math if 
> >   sg->length != sg_dma_len(sg)
> > 
> > Surely we need a version of sg_page_iter_next() that uses sg_dma_len
> > to compute page_count if the caller intents to call
> > sg_page_iter_dma_address() ?
> 
> /*
>  * sg page iterator
>  *
>  * Iterates over sg entries page-by-page.  On each successful iteration,
>  * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
>  * to get the current page and its dma address. @piter->sg will point to the
>  * sg holding this page and @piter->sg_pgoffset to the page's page offset
>  * within the sg. The iteration will stop either when a maximum number of sg
>  * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
>  */
> struct sg_page_iter {
> 	struct scatterlist	*sg;		/* sg holding the page */
> 	unsigned int		sg_pgoffset;	/* page offset within the sg */
> [..]
> 
> If the intent of the page iterator was to grab the cur page and dma address,
> then shouldnt sg_page_count just be updated to use sg_dma_len(sg)?

The page iterator is used widely in places that have nothing to do
with DMA; it needs to keep working as is: iterating over the CPU
struct pages, not the DMA physical addresses.

This is what I was thinking of doing..

From cb4a9f568e9bffe1f6baf3d70e6fb9cfff21525f Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 4 Jan 2019 11:40:21 -0700
Subject: [PATCH] lib/scatterlist: Provide a DMA page iterator

Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o
backing pages") introduced the sg_page_iter_dma_address() function without
providing a way to use it in the general case. If the sg_dma_len is not
equal to the dma_length callers cannot safely use the
for_each_sg_page/sg_page_iter_dma_address combination.

Resolve this API mistake by providing a for_each_sg_dma_page() iterator
that uses the right length so sg_page_iter_dma_address() works as expected
in all cases.

Users cannot mix DMA and non-DMA accessors within the same iteration.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/media/pci/intel/ipu3/ipu3-cio2.c |  2 +-
 include/linux/scatterlist.h              | 37 ++++++++++++++++++++----
 lib/scatterlist.c                        | 24 +++++++++++++++
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2.c b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
index 447baaebca4486..a6f9bdfe9e46ed 100644
--- a/drivers/media/pci/intel/ipu3/ipu3-cio2.c
+++ b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
@@ -873,7 +873,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb)
 		b->offset = sg->sgl->offset;
 
 	i = j = 0;
-	for_each_sg_page(sg->sgl, &sg_iter, sg->nents, 0) {
+	for_each_sg_dma_page(sg->sgl, &sg_iter, sg->nents, 0) {
 		if (!pages--)
 			break;
 		b->lop[i][j] = sg_page_iter_dma_address(&sg_iter) >> PAGE_SHIFT;
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 093aa57120b0cf..8b64e59752ce6b 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -339,12 +339,15 @@ int sg_alloc_table_chained(struct sg_table *table, int nents,
 /*
  * sg page iterator
  *
- * Iterates over sg entries page-by-page.  On each successful iteration,
- * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
- * to get the current page and its dma address. @piter->sg will point to the
- * sg holding this page and @piter->sg_pgoffset to the page's page offset
- * within the sg. The iteration will stop either when a maximum number of sg
- * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
+ * Iterates over sg entries page-by-page.  When for_each_sg_page() is used
+ * each successful iteration can call sg_page_iter_page(@piter) to get the
+ * current page, while when using for_each_sg_dma_page() iterations can call
+ * sg_page_iter_dma_address(@piter) to get the current dma address
+ *
+ * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset
+ * to the page's page offset within the sg. The iteration will stop either
+ * when a maximum number of sg entries was reached or a terminating sg
+ * (sg_last(sg) == true) was reached.
  */
 struct sg_page_iter {
 	struct scatterlist	*sg;		/* sg holding the page */
@@ -357,6 +360,7 @@ struct sg_page_iter {
 };
 
 bool __sg_page_iter_next(struct sg_page_iter *piter);
+bool __sg_page_iter_dma_next(struct sg_page_iter *piter);
 void __sg_page_iter_start(struct sg_page_iter *piter,
 			  struct scatterlist *sglist, unsigned int nents,
 			  unsigned long pgoffset);
@@ -385,11 +389,32 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter)
  * @piter:	page iterator to hold current page, sg, sg_pgoffset
  * @nents:	maximum number of sg entries to iterate over
  * @pgoffset:	starting page offset
+ *
+ * Callers may use sg_page_iter_page() to get each page pointer.
  */
 #define for_each_sg_page(sglist, piter, nents, pgoffset)		   \
 	for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
 	     __sg_page_iter_next(piter);)
 
+/**
+ * for_each_sg_dma_page - iterate over the pages of the given sg list
+ * @sglist:	sglist to iterate over
+ * @piter:	page iterator to hold current page, sg, sg_pgoffset
+ * @dma_nents:	maximum number of sg entries to iterate over, this is the value
+ *              returned from dma_map_sg
+ * @pgoffset:	starting page offset
+ *
+ * Callers may use sg_page_iter_dma_address() to get each page's DMA address.
+ */
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+#define for_each_sg_dma_page(sglist, piter, dma_nents, pgoffset)               \
+	for (__sg_page_iter_start(piter, sglist, dma_nents, pgoffset);         \
+	     __sg_page_iter_dma_next(piter);)
+#else
+#define for_each_sg_dma_page(sglist, piter, dma_nents, pgoffset)               \
+	for_each_sg_page(sglist, piter, dma_nents, pgoffset)
+#endif
+
 /*
  * Mapping sg iterator
  *
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 7c6096a7170486..fc6e224869ad65 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -625,6 +625,30 @@ bool __sg_page_iter_next(struct sg_page_iter *piter)
 }
 EXPORT_SYMBOL(__sg_page_iter_next);
 
+static int sg_dma_page_count(struct scatterlist *sg)
+{
+	return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT;
+}
+
+bool __sg_page_iter_dma_next(struct sg_page_iter *piter)
+{
+	if (!piter->__nents || !piter->sg)
+		return false;
+
+	piter->sg_pgoffset += piter->__pg_advance;
+	piter->__pg_advance = 1;
+
+	while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) {
+		piter->sg_pgoffset -= sg_dma_page_count(piter->sg);
+		piter->sg = sg_next(piter->sg);
+		if (!--piter->__nents || !piter->sg)
+			return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(__sg_page_iter_dma_next);
+
 /**
  * sg_miter_start - start mapping iteration over a sg list
  * @miter: sg mapping iter to be started
Christoph Hellwig Jan. 4, 2019, 8:22 p.m. UTC | #10
On Fri, Jan 04, 2019 at 11:49:09AM -0700, Jason Gunthorpe wrote:
> On Fri, Jan 04, 2019 at 12:22:10PM -0600, Shiraz Saleem wrote:
> > On Fri, Jan 04, 2019 at 09:16:13AM -0700, Jason Gunthorpe wrote:
> > > On Fri, Jan 04, 2019 at 08:22:13AM +0100, Christoph Hellwig wrote:
> > > > On Fri, Jan 04, 2019 at 08:18:52AM +0100, Christoph Hellwig wrote:
> > > > > > CH: Does sg_page_iter_dma_address() even make any sense??
> > > > > 
> > > > > No, it doesn't and should never have been added.  It's one of these
> > > > > typical scatterlist abuses in the media code :(
> > > > 
> > > > Actually I take this back.  Due to the sg_page_iter structure iterating
> > > > over segments first and then pages inside the segment it actually
> > > > is ok.  It just looks very awkward.
> > > 
> > > I'm looking at __sg_page_iter_next which does this:
> > > 
> > > static int sg_page_count(struct scatterlist *sg)
> > > {
> > > 	return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT;
> > > }
> > > 
> > > bool __sg_page_iter_next(struct sg_page_iter *piter)
> > > {
> > > [..]
> > > 	while (piter->sg_pgoffset >= sg_page_count(piter->sg)) {
> > > 		piter->sg_pgoffset -= sg_page_count(piter->sg);
> > > 
> > > Which isn't going to yield the right math if 
> > >   sg->length != sg_dma_len(sg)
> > > 
> > > Surely we need a version of sg_page_iter_next() that uses sg_dma_len
> > > to compute page_count if the caller intents to call
> > > sg_page_iter_dma_address() ?
> > 
> > /*
> >  * sg page iterator
> >  *
> >  * Iterates over sg entries page-by-page.  On each successful iteration,
> >  * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
> >  * to get the current page and its dma address. @piter->sg will point to the
> >  * sg holding this page and @piter->sg_pgoffset to the page's page offset
> >  * within the sg. The iteration will stop either when a maximum number of sg
> >  * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
> >  */
> > struct sg_page_iter {
> > 	struct scatterlist	*sg;		/* sg holding the page */
> > 	unsigned int		sg_pgoffset;	/* page offset within the sg */
> > [..]
> > 
> > If the intent of the page iterator was to grab the cur page and dma address,
> > then shouldnt sg_page_count just be updated to use sg_dma_len(sg)?
> 
> The page iterator is used widely in places that have nothing to do
> with DMA, it needs to keep working as is: iterating over the CPU
> struct pages, not the DMA physical addresses.
> 
> This is what I was thinking of doing..
> 
> From cb4a9f568e9bffe1f6baf3d70e6fb9cfff21525f Mon Sep 17 00:00:00 2001
> From: Jason Gunthorpe <jgg@mellanox.com>
> Date: Fri, 4 Jan 2019 11:40:21 -0700
> Subject: [PATCH] lib/scatterlist: Provide a DMA page iterator
> 
> Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o
> backing pages") introduced the sg_page_iter_dma_address() function without
> providing a way to use it in the general case. If the sg_dma_len is not
> equal to the dma_length callers cannot safely use the
> for_each_sg_page/sg_page_iter_dma_address combination.
> 
> Resolve this API mistake by providing a for_each_sg_dma_page() iterator
> that uses the right length so sg_page_iter_dma_address() works as expected
> in all cases.
> 
> Users cannot mix DMA and non-DMA accessors within the same iteration.

This looks reasonable.  I wonder if we should use a different
struct type for the dma iter so that we get better type checking?

Also this seems to miss vmwgfx, the other user of the interface.

> +#ifdef CONFIG_NEED_SG_DMA_LENGTH
> +#define for_each_sg_dma_page(sglist, piter, dma_nents, pgoffset)               \
> +	for (__sg_page_iter_start(piter, sglist, dma_nents, pgoffset);         \
> +	     __sg_page_iter_dma_next(piter);)
> +#else
> +#define for_each_sg_dma_page(sglist, piter, dma_nents, pgoffset)               \
> +	for_each_sg_page(sglist, piter, dma_nents, pgoffest)
> +#endif

I'd rather not introduce ifdefs here and always have the DMA iter
code.
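
For reference, a rough sketch of what a distinct iterator type could look
like (illustrative only, not part of the posted patch): the DMA variant
embeds the plain iterator, so the page accessors cannot be used on it by
accident, and sg_page_iter_dma_address() / __sg_page_iter_dma_next() would
take the wrapper type instead.

/* Sketch: wrapper type for DMA iteration */
struct sg_dma_page_iter {
	struct sg_page_iter base;
};

static inline dma_addr_t
sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
{
	return sg_dma_address(dma_iter->base.sg) +
	       (dma_iter->base.sg_pgoffset << PAGE_SHIFT);
}

#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset)	       \
	for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents,       \
				  pgoffset);				       \
	     __sg_page_iter_dma_next(dma_iter);)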

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index c6144df..64bacc5 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,25 +39,23 @@ 
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
+	int nents = sg_nents(umem->sg_head.sgl);
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
@@ -66,6 +64,60 @@  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * cur: current scatterlist entry
+ * nxt: next scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ */
+static void ib_umem_add_sg_table(struct scatterlist **cur,
+				 struct scatterlist **nxt,
+				 struct page **page_list,
+				 unsigned long npages)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	unsigned long cur_sglen = 0;
+	bool update_cur_sg = false;
+
+	/* Check if new page_list is contiguous with end of previous page_list*/
+	if (*cur != *nxt) {
+		cur_sglen = (*cur)->length >> PAGE_SHIFT;
+		first_pfn = page_to_pfn(sg_page(*cur));
+		if (first_pfn + cur_sglen == page_to_pfn(page_list[0]))
+			update_cur_sg = true;
+	}
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]);
+		     i++, len++);
+
+		/* Squash first N contiguous pages from page_list into current
+		 * SGE
+		 */
+		if (update_cur_sg) {
+			sg_set_page(*cur, sg_page(*cur),
+				    (len + cur_sglen) * PAGE_SIZE, 0);
+			update_cur_sg = false;
+		/* Squash N contiguous pages into next SGE */
+		} else {
+			sg_set_page(*nxt, first_page, len * PAGE_SIZE, 0);
+			*cur = *nxt;
+			*nxt = sg_next(*cur);
+		}
+	}
+}
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -92,7 +144,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg_cur, *sg_nxt;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (dmasync)
@@ -179,7 +231,8 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg_cur = umem->sg_head.sgl;
+	sg_nxt = sg_cur;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
@@ -196,24 +249,24 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		cur_base += ret * PAGE_SIZE;
 		npages   -= ret;
 
+		ib_umem_add_sg_table(&sg_cur, &sg_nxt, page_list, ret);
+
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
 
+	sg_mark_end(sg_cur);
+
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
-				  umem->npages,
+				  sg_nents(umem->sg_head.sgl),
 				  DMA_BIDIRECTIONAL,
 				  dma_attrs);
 
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 1e2515e..fa0cb7c 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -3537,19 +3537,14 @@  static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
 	u64 *pbl_tbl = pbl_tbl_orig;
 	u64 paddr;
 	u64 page_mask = (1ULL << page_shift) - 1;
-	int i, pages;
-	struct scatterlist *sg;
-	int entry;
-
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (i = 0; i < pages; i++) {
-			paddr = sg_dma_address(sg) + (i << PAGE_SHIFT);
-			if (pbl_tbl == pbl_tbl_orig)
-				*pbl_tbl++ = paddr & ~page_mask;
-			else if ((paddr & page_mask) == 0)
-				*pbl_tbl++ = paddr;
-		}
+	struct sg_page_iter sg_iter;
+
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		paddr = sg_page_iter_dma_address(&sg_iter);
+		if (pbl_tbl == pbl_tbl_orig)
+			*pbl_tbl++ = paddr & ~page_mask;
+		else if ((paddr & page_mask) == 0)
+			*pbl_tbl++ = paddr;
 	}
 	return pbl_tbl - pbl_tbl_orig;
 }
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 59eeac5..2ec668f 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -85,7 +85,7 @@  static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 		       struct scatterlist *sghead, u32 pages, u32 pg_size)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	bool is_umem = false;
 	int i;
 
@@ -116,12 +116,13 @@  static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 	} else {
 		i = 0;
 		is_umem = true;
-		for_each_sg(sghead, sg, pages, i) {
-			pbl->pg_map_arr[i] = sg_dma_address(sg);
-			pbl->pg_arr[i] = sg_virt(sg);
+		for_each_sg_page(sghead, &sg_iter, pages, 0) {
+			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
+			pbl->pg_arr[i] = page_address(sg_page_iter_page(&sg_iter));
 			if (!pbl->pg_arr[i])
 				goto fail;
 
+			i++;
 			pbl->pg_count++;
 		}
 	}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index b34b1a1..884a653 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -522,14 +522,13 @@  static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				      u64 virt, int acc, struct ib_udata *udata)
 {
 	__be64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = 0;
 	struct iwch_dev *rhp;
 	struct iwch_pd *php;
 	struct iwch_mr *mhp;
 	struct iwch_reg_user_mr_resp uresp;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	pr_debug("%s ib_pd %p\n", __func__, pd);
 
 	php = to_iwch_pd(pd);
@@ -563,19 +562,15 @@  static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	i = n = 0;
 
-	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
-			len = sg_dma_len(sg) >> shift;
-			for (k = 0; k < len; ++k) {
-				pages[i++] = cpu_to_be64(sg_dma_address(sg) +
-							 (k << shift));
-				if (i == PAGE_SIZE / sizeof *pages) {
-					err = iwch_write_pbl(mhp, pages, i, n);
-					if (err)
-						goto pbl_done;
-					n += i;
-					i = 0;
-				}
-			}
+	for_each_sg_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+		if (i == PAGE_SIZE / sizeof *pages) {
+			err = iwch_write_pbl(mhp, pages, i, n);
+			if (err)
+				goto pbl_done;
+			n += i;
+			i = 0;
+		}
 	}
 
 	if (i)
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 7b76e6f..5806ac6 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -502,10 +502,9 @@  struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			       u64 virt, int acc, struct ib_udata *udata)
 {
 	__be64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = -ENOMEM;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
 	struct c4iw_mr *mhp;
@@ -556,21 +555,17 @@  struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	i = n = 0;
 
-	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
-						 (k << shift));
-			if (i == PAGE_SIZE / sizeof *pages) {
-				err = write_pbl(&mhp->rhp->rdev,
-				      pages,
-				      mhp->attr.pbl_addr + (n << 3), i,
-				      mhp->wr_waitp);
-				if (err)
-					goto pbl_done;
-				n += i;
-				i = 0;
-			}
+	for_each_sg_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+		if (i == PAGE_SIZE / sizeof *pages) {
+			err = write_pbl(&mhp->rhp->rdev,
+					pages,
+					mhp->attr.pbl_addr + (n << 3), i,
+					mhp->wr_waitp);
+			if (err)
+				goto pbl_done;
+			n += i;
+			i = 0;
 		}
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index b74c742..0bbf68f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -1866,9 +1866,8 @@  static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
 				  unsigned long mtpt_idx)
 {
 	struct hns_roce_v1_mpt_entry *mpt_entry;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	u64 *pages;
-	int entry;
 	int i;
 
 	/* MPT filled into mailbox buf */
@@ -1923,8 +1922,8 @@  static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
 		return -ENOMEM;
 
 	i = 0;
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		pages[i] = ((u64)sg_dma_address(sg)) >> 12;
+	for_each_sg_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12;
 
 		/* Directly record to MTPT table firstly 7 entry */
 		if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 3a66945..1a98719 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1831,12 +1831,10 @@  static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port,
 static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
 			struct hns_roce_mr *mr)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	u64 page_addr;
 	u64 *pages;
-	int i, j;
-	int len;
-	int entry;
+	int i;
 
 	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
 	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
@@ -1849,17 +1847,14 @@  static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
 		return -ENOMEM;
 
 	i = 0;
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (j = 0; j < len; ++j) {
-			page_addr = sg_dma_address(sg) +
-				(j << mr->umem->page_shift);
-			pages[i] = page_addr >> 6;
-			/* Record the first 2 entry directly to MTPT table */
-			if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
-				goto found;
-			i++;
-		}
+	for_each_sg_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		pages[i] = page_addr >> 6;
+
+		/* Record the first 2 entry directly to MTPT table */
+		if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
+			goto found;
+		i++;
 	}
 found:
 	mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0]));
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index ee5991b..2cf5690 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -976,12 +976,11 @@  int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_mtt *mtt, struct ib_umem *umem)
 {
 	struct device *dev = hr_dev->dev;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	unsigned int order;
-	int i, k, entry;
 	int npage = 0;
 	int ret = 0;
-	int len;
+	int i;
 	u64 page_addr;
 	u64 *pages;
 	u32 bt_page_size;
@@ -1014,29 +1013,25 @@  int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
 
 	i = n = 0;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (k = 0; k < len; ++k) {
-			page_addr =
-				sg_dma_address(sg) + (k << umem->page_shift);
-			if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
-				if (page_addr & ((1 << mtt->page_shift) - 1)) {
-					dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n",
-						page_addr, mtt->page_shift);
-					ret = -EINVAL;
-					goto out;
-				}
-				pages[i++] = page_addr;
-			}
-			npage++;
-			if (i == bt_page_size / sizeof(u64)) {
-				ret = hns_roce_write_mtt(hr_dev, mtt, n, i,
-							 pages);
-				if (ret)
-					goto out;
-				n += i;
-				i = 0;
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
+			if (page_addr & ((1 << mtt->page_shift) - 1)) {
+				dev_err(dev, "page_addr 0x%llx is not page_shift %d alignment!\n",
+					page_addr, mtt->page_shift);
+				ret = -EINVAL;
+				goto out;
 			}
+			pages[i++] = page_addr;
+		}
+		npage++;
+		if (i == bt_page_size / sizeof(u64)) {
+			ret = hns_roce_write_mtt(hr_dev, mtt, n, i,
+						 pages);
+			if (ret)
+				goto out;
+			n += i;
+			i = 0;
 		}
 	}
 
@@ -1052,10 +1047,8 @@  static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
 				     struct hns_roce_mr *mr,
 				     struct ib_umem *umem)
 {
-	struct scatterlist *sg;
-	int i = 0, j = 0, k;
-	int entry;
-	int len;
+	struct sg_page_iter sg_iter;
+	int i = 0, j = 0;
 	u64 page_addr;
 	u32 pbl_bt_sz;
 
@@ -1063,27 +1056,22 @@  static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
 		return 0;
 
 	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (k = 0; k < len; ++k) {
-			page_addr = sg_dma_address(sg) +
-				    (k << umem->page_shift);
-
-			if (!hr_dev->caps.pbl_hop_num) {
-				mr->pbl_buf[i++] = page_addr >> 12;
-			} else if (hr_dev->caps.pbl_hop_num == 1) {
-				mr->pbl_buf[i++] = page_addr;
-			} else {
-				if (hr_dev->caps.pbl_hop_num == 2)
-					mr->pbl_bt_l1[i][j] = page_addr;
-				else if (hr_dev->caps.pbl_hop_num == 3)
-					mr->pbl_bt_l2[i][j] = page_addr;
-
-				j++;
-				if (j >= (pbl_bt_sz / 8)) {
-					i++;
-					j = 0;
-				}
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		page_addr = sg_page_iter_dma_address(&sg_iter);
+		if (!hr_dev->caps.pbl_hop_num) {
+			mr->pbl_buf[i++] = page_addr >> 12;
+		} else if (hr_dev->caps.pbl_hop_num == 1) {
+			mr->pbl_buf[i++] = page_addr;
+		} else {
+			if (hr_dev->caps.pbl_hop_num == 2)
+				mr->pbl_bt_l1[i][j] = page_addr;
+			else if (hr_dev->caps.pbl_hop_num == 3)
+				mr->pbl_bt_l2[i][j] = page_addr;
+
+			j++;
+			if (j >= (pbl_bt_sz / 8)) {
+				i++;
+				j = 0;
 			}
 		}
 	}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 0b675b0..3e7a299 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -1369,32 +1369,29 @@  static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr,
 {
 	struct ib_umem *region = iwmr->region;
 	struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
-	int chunk_pages, entry, i;
 	struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
 	struct i40iw_pble_info *pinfo;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	u64 pg_addr = 0;
 	u32 idx = 0;
+	bool first_pg = true;
 
 	pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
 
-	for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-		chunk_pages = sg_dma_len(sg) >> region->page_shift;
-		if ((iwmr->type == IW_MEMREG_TYPE_QP) &&
-		    !iwpbl->qp_mr.sq_page)
-			iwpbl->qp_mr.sq_page = sg_page(sg);
-		for (i = 0; i < chunk_pages; i++) {
-			pg_addr = sg_dma_address(sg) +
-				(i << region->page_shift);
-
-			if ((entry + i) == 0)
-				*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
-			else if (!(pg_addr & ~iwmr->page_msk))
-				*pbl = cpu_to_le64(pg_addr);
-			else
-				continue;
-			pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
-		}
+	if (iwmr->type == IW_MEMREG_TYPE_QP)
+		iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
+
+	for_each_sg_page(region->sg_head.sgl, &sg_iter, region->nmap, 0) {
+		pg_addr = sg_page_iter_dma_address(&sg_iter);
+		if (first_pg)
+			*pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
+		else if (!(pg_addr & ~iwmr->page_msk))
+			*pbl = cpu_to_le64(pg_addr);
+		else
+			continue;
+
+		first_pg = false;
+		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
 	}
 }
 
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 82cb6b7..9efc989 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -907,12 +907,11 @@  static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				       u64 virt, int acc, struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct mthca_mr *mr;
 	struct mthca_reg_mr ucmd;
 	u64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = 0;
 	int write_mtt_size;
 
@@ -958,21 +957,19 @@  static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
 
-	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = sg_dma_address(sg) + (k << shift);
-			/*
-			 * Be friendly to write_mtt and pass it chunks
-			 * of appropriate size.
-			 */
-			if (i == write_mtt_size) {
-				err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
-				if (err)
-					goto mtt_done;
-				n += i;
-				i = 0;
-			}
+	for_each_sg_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
+		pages[i++] = sg_page_iter_dma_address(&sg_iter);
+
+		/*
+		 * Be friendly to write_mtt and pass it chunks
+		 * of appropriate size.
+		 */
+		if (i == write_mtt_size) {
+			err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+			if (err)
+				goto mtt_done;
+			n += i;
+			i = 0;
 		}
 	}
 
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 4e7f08e..dc32b7d 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2109,7 +2109,7 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	struct nes_device *nesdev = nesvnic->nesdev;
 	struct nes_adapter *nesadapter = nesdev->nesadapter;
 	struct ib_mr *ibmr = ERR_PTR(-EINVAL);
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct nes_ucontext *nes_ucontext;
 	struct nes_pbl *nespbl;
 	struct nes_mr *nesmr;
@@ -2117,10 +2117,9 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	struct nes_mem_reg_req req;
 	struct nes_vpbl vpbl;
 	struct nes_root_vpbl root_vpbl;
-	int entry, page_index;
+	int page_index;
 	int page_count = 0;
 	int err, pbl_depth = 0;
-	int chunk_pages;
 	int ret;
 	u32 stag;
 	u32 stag_index = 0;
@@ -2132,7 +2131,6 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	u16 pbl_count;
 	u8 single_page = 1;
 	u8 stag_key;
-	int first_page = 1;
 
 	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
 	if (IS_ERR(region)) {
@@ -2183,18 +2181,18 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			}
 			nesmr->region = region;
 
-			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-				if (sg_dma_address(sg) & ~PAGE_MASK) {
+			for_each_sg_page(region->sg_head.sgl, &sg_iter, region->nmap, 0) {
+				if (sg_dma_address(sg_iter.sg) & ~PAGE_MASK) {
 					ib_umem_release(region);
 					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
 					nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
-						  (unsigned int) sg_dma_address(sg));
+						  (unsigned int)sg_dma_address(sg_iter.sg));
 					ibmr = ERR_PTR(-EINVAL);
 					kfree(nesmr);
 					goto reg_user_mr_err;
 				}
 
-				if (!sg_dma_len(sg)) {
+				if (!sg_dma_len(sg_iter.sg)) {
 					ib_umem_release(region);
 					nes_free_resource(nesadapter, nesadapter->allocated_mrs,
 							  stag_index);
@@ -2204,103 +2202,94 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 					goto reg_user_mr_err;
 				}
 
-				region_length += sg_dma_len(sg);
-				chunk_pages = sg_dma_len(sg) >> 12;
+				region_length += PAGE_SIZE;
 				region_length -= skip_pages << 12;
-				for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
-					skip_pages = 0;
-					if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
-						goto enough_pages;
-					if ((page_count&0x01FF) == 0) {
-						if (page_count >= 1024 * 512) {
+				skip_pages = 0;
+				if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
+					goto enough_pages;
+				if ((page_count & 0x01FF) == 0) {
+					if (page_count >= 1024 * 512) {
+						ib_umem_release(region);
+						nes_free_resource(nesadapter,
+								  nesadapter->allocated_mrs, stag_index);
+						kfree(nesmr);
+						ibmr = ERR_PTR(-E2BIG);
+						goto reg_user_mr_err;
+					}
+					if (root_pbl_index == 1) {
+						root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+								8192, &root_vpbl.pbl_pbase);
+						nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+							  root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+						if (!root_vpbl.pbl_vbase) {
 							ib_umem_release(region);
-							nes_free_resource(nesadapter,
-									  nesadapter->allocated_mrs, stag_index);
+							pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+									    vpbl.pbl_pbase);
+							nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+									  stag_index);
 							kfree(nesmr);
-							ibmr = ERR_PTR(-E2BIG);
+							ibmr = ERR_PTR(-ENOMEM);
 							goto reg_user_mr_err;
 						}
-						if (root_pbl_index == 1) {
-							root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
-									8192, &root_vpbl.pbl_pbase);
-							nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
-								  root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
-							if (!root_vpbl.pbl_vbase) {
-								ib_umem_release(region);
-								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-										    vpbl.pbl_pbase);
-								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-										  stag_index);
-								kfree(nesmr);
-								ibmr = ERR_PTR(-ENOMEM);
-								goto reg_user_mr_err;
-							}
-							root_vpbl.leaf_vpbl = kcalloc(1024,
-										      sizeof(*root_vpbl.leaf_vpbl),
-										      GFP_KERNEL);
-							if (!root_vpbl.leaf_vpbl) {
-								ib_umem_release(region);
-								pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
-										    root_vpbl.pbl_pbase);
-								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
-										    vpbl.pbl_pbase);
-								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
-										  stag_index);
-								kfree(nesmr);
-								ibmr = ERR_PTR(-ENOMEM);
-								goto reg_user_mr_err;
-							}
-							root_vpbl.pbl_vbase[0].pa_low =
-									cpu_to_le32((u32)vpbl.pbl_pbase);
-							root_vpbl.pbl_vbase[0].pa_high =
-									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
-							root_vpbl.leaf_vpbl[0] = vpbl;
-						}
-						vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
-								&vpbl.pbl_pbase);
-						nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
-							  vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
-						if (!vpbl.pbl_vbase) {
+						root_vpbl.leaf_vpbl = kcalloc(1024,
+									      sizeof(*root_vpbl.leaf_vpbl),
+									      GFP_KERNEL);
+						if (!root_vpbl.leaf_vpbl) {
 							ib_umem_release(region);
-							nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
-							ibmr = ERR_PTR(-ENOMEM);
+							pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+									    root_vpbl.pbl_pbase);
+							pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+									    vpbl.pbl_pbase);
+							nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+									  stag_index);
 							kfree(nesmr);
+							ibmr = ERR_PTR(-ENOMEM);
 							goto reg_user_mr_err;
 						}
-						if (1 <= root_pbl_index) {
-							root_vpbl.pbl_vbase[root_pbl_index].pa_low =
-									cpu_to_le32((u32)vpbl.pbl_pbase);
-							root_vpbl.pbl_vbase[root_pbl_index].pa_high =
-									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
-							root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
-						}
-						root_pbl_index++;
-						cur_pbl_index = 0;
+						root_vpbl.pbl_vbase[0].pa_low =
+								cpu_to_le32((u32)vpbl.pbl_pbase);
+						root_vpbl.pbl_vbase[0].pa_high =
+								cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+						root_vpbl.leaf_vpbl[0] = vpbl;
 					}
-					if (single_page) {
-						if (page_count != 0) {
-							if ((last_dma_addr+4096) !=
-									(sg_dma_address(sg)+
-									(page_index*4096)))
-								single_page = 0;
-							last_dma_addr = sg_dma_address(sg)+
-									(page_index*4096);
-						} else {
-							first_dma_addr = sg_dma_address(sg)+
-									(page_index*4096);
-							last_dma_addr = first_dma_addr;
-						}
+					vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+							&vpbl.pbl_pbase);
+					nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
+						  vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
+					if (!vpbl.pbl_vbase) {
+						ib_umem_release(region);
+						nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+						ibmr = ERR_PTR(-ENOMEM);
+						kfree(nesmr);
+						goto reg_user_mr_err;
+					}
+					if (1 <= root_pbl_index) {
+						root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+								cpu_to_le32((u32)vpbl.pbl_pbase);
+						root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+								cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+						root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+					}
+					root_pbl_index++;
+					cur_pbl_index = 0;
+				}
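+				/*
+				 * Contiguity is now checked per PAGE_SIZE page
+				 * from the iterator rather than per SGE chunk.
+				 */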
+				if (single_page) {
+					if (page_count != 0) {
+						if ((last_dma_addr + 4096) != sg_page_iter_dma_address(&sg_iter))
+							single_page = 0;
+						last_dma_addr = sg_page_iter_dma_address(&sg_iter);
+					} else {
+						first_dma_addr = sg_page_iter_dma_address(&sg_iter);
+						last_dma_addr = first_dma_addr;
 					}
-
-					vpbl.pbl_vbase[cur_pbl_index].pa_low =
-							cpu_to_le32((u32)(sg_dma_address(sg)+
-							(page_index*4096)));
-					vpbl.pbl_vbase[cur_pbl_index].pa_high =
-							cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+
-							(page_index*4096))) >> 32)));
-					cur_pbl_index++;
-					page_count++;
 				}
+
+				vpbl.pbl_vbase[cur_pbl_index].pa_low =
+						cpu_to_le32((u32)(sg_page_iter_dma_address(&sg_iter)));
+				vpbl.pbl_vbase[cur_pbl_index].pa_high =
+						cpu_to_le32((u32)((u64)(sg_page_iter_dma_address(&sg_iter)) >> 32));
+				cur_pbl_index++;
+				page_count++;
 			}
 
 			enough_pages:
@@ -2412,26 +2401,14 @@  static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
 				  (void *) nespbl->pbl_vbase, nespbl->user_base);
 
-			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
-				chunk_pages = sg_dma_len(sg) >> 12;
-				chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0;
-				if (first_page) {
-					nespbl->page = sg_page(sg);
-					first_page = 0;
-				}
-
-				for (page_index = 0; page_index < chunk_pages; page_index++) {
-					((__le32 *)pbl)[0] = cpu_to_le32((u32)
-							(sg_dma_address(sg)+
-							(page_index*4096)));
-					((__le32 *)pbl)[1] = cpu_to_le32(((u64)
-							(sg_dma_address(sg)+
-							(page_index*4096)))>>32);
-					nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
-						  (unsigned long long)*pbl,
-						  le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
-					pbl++;
-				}
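+			/*
+			 * Record the umem's first page; each PBL entry below
+			 * holds the DMA address of one PAGE_SIZE page.
+			 */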
+			nespbl->page = sg_page(region->sg_head.sgl);
+			for_each_sg_page(region->sg_head.sgl, &sg_iter, region->nmap, 0) {
+				((__le32 *)pbl)[0] = cpu_to_le32((u32)(sg_page_iter_dma_address(&sg_iter)));
+				((__le32 *)pbl)[1] = cpu_to_le32(((u64)(sg_page_iter_dma_address(&sg_iter)))>>32);
+				nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
+					  (unsigned long long)*pbl,
+					  le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
+				pbl++;
 			}
 
 			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index c46bed0..05224ef 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -854,10 +854,11 @@  static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
 			    u32 num_pbes)
 {
 	struct ocrdma_pbe *pbe;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
 	struct ib_umem *umem = mr->umem;
-	int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;
+	int pbe_cnt, total_num_pbes = 0;
+	u64 pg_addr;
 
 	if (!mr->hwmr.num_pbes)
 		return;
@@ -865,36 +866,27 @@  static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
 	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
 	pbe_cnt = 0;
 
-	shift = umem->page_shift;
-
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> shift;
-		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
-			/* store the page address in pbe */
-			pbe->pa_lo =
-			    cpu_to_le32(sg_dma_address(sg) +
-					(pg_cnt << shift));
-			pbe->pa_hi =
-			    cpu_to_le32(upper_32_bits(sg_dma_address(sg) +
-					 (pg_cnt << shift)));
-			pbe_cnt += 1;
-			total_num_pbes += 1;
-			pbe++;
-
-			/* if done building pbes, issue the mbx cmd. */
-			if (total_num_pbes == num_pbes)
-				return;
-
-			/* if the given pbl is full storing the pbes,
-			 * move to next pbl.
-			 */
-			if (pbe_cnt ==
-				(mr->hwmr.pbl_size / sizeof(u64))) {
-				pbl_tbl++;
-				pbe = (struct ocrdma_pbe *)pbl_tbl->va;
-				pbe_cnt = 0;
-			}
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		/* store the page address in pbe */
+		pg_addr = sg_page_iter_dma_address(&sg_iter);
+		pbe->pa_lo = cpu_to_le32(pg_addr);
+		pbe->pa_hi = cpu_to_le32(upper_32_bits(pg_addr));
+		pbe_cnt += 1;
+		total_num_pbes += 1;
+		pbe++;
+
+		/* if done building pbes, issue the mbx cmd. */
+		if (total_num_pbes == num_pbes)
+			return;
 
+		/* if the given pbl is full storing the pbes,
+		 * move to next pbl.
+		 */
+		if (pbe_cnt ==
+			(mr->hwmr.pbl_size / sizeof(u64))) {
+			pbl_tbl++;
+			pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+			pbe_cnt = 0;
 		}
 	}
 }
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index b342a70..db0f359 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -636,13 +636,12 @@  static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
 			       struct qedr_pbl *pbl,
 			       struct qedr_pbl_info *pbl_info, u32 pg_shift)
 {
-	int shift, pg_cnt, pages, pbe_cnt, total_num_pbes = 0;
+	int pbe_cnt, total_num_pbes = 0;
 	u32 fw_pg_cnt, fw_pg_per_umem_pg;
 	struct qedr_pbl *pbl_tbl;
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct regpair *pbe;
 	u64 pg_addr;
-	int entry;
 
 	if (!pbl_info->num_pbes)
 		return;
@@ -663,38 +662,33 @@  static void qedr_populate_pbls(struct qedr_dev *dev, struct ib_umem *umem,
 
 	pbe_cnt = 0;
 
-	shift = umem->page_shift;
-
 	fw_pg_per_umem_pg = BIT(umem->page_shift - pg_shift);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		pages = sg_dma_len(sg) >> shift;
-		pg_addr = sg_dma_address(sg);
-		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
-			for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
-				pbe->lo = cpu_to_le32(pg_addr);
-				pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
-
-				pg_addr += BIT(pg_shift);
-				pbe_cnt++;
-				total_num_pbes++;
-				pbe++;
-
-				if (total_num_pbes == pbl_info->num_pbes)
-					return;
-
-				/* If the given pbl is full storing the pbes,
-				 * move to next pbl.
-				 */
-				if (pbe_cnt ==
-				    (pbl_info->pbl_size / sizeof(u64))) {
-					pbl_tbl++;
-					pbe = (struct regpair *)pbl_tbl->va;
-					pbe_cnt = 0;
-				}
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		pg_addr = sg_page_iter_dma_address(&sg_iter);
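+		/*
+		 * A PAGE_SIZE umem page spans fw_pg_per_umem_pg firmware
+		 * pages of size BIT(pg_shift), so emit one PBE for each.
+		 */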
+		for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
+			pbe->lo = cpu_to_le32(pg_addr);
+			pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
+
+			pg_addr += BIT(pg_shift);
+			pbe_cnt++;
+			total_num_pbes++;
+			pbe++;
 
-				fw_pg_cnt++;
+			if (total_num_pbes == pbl_info->num_pbes)
+				return;
+
+			/* If the given pbl is full storing the pbes,
+			 * move to next pbl.
+			 */
+			if (pbe_cnt ==
+			    (pbl_info->pbl_size / sizeof(u64))) {
+				pbl_tbl++;
+				pbe = (struct regpair *)pbl_tbl->va;
+				pbe_cnt = 0;
 			}
+
+			fw_pg_cnt++;
 		}
 	}
 }
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
index fb0c5c0..d40099b 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
@@ -183,25 +183,20 @@  int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir,
 				struct ib_umem *umem, u64 offset)
 {
 	u64 i = offset;
-	int j, entry;
-	int ret = 0, len = 0;
-	struct scatterlist *sg;
+	int ret = 0;
+	struct sg_page_iter sg_iter;
 
 	if (offset >= pdir->npages)
 		return -EINVAL;
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		for (j = 0; j < len; j++) {
-			dma_addr_t addr = sg_dma_address(sg) +
-					  (j << umem->page_shift);
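+	/* One page-directory slot is filled per PAGE_SIZE page of the umem. */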
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
 
-			ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
-			if (ret)
-				goto exit;
+		ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
+		if (ret)
+			goto exit;
 
-			i++;
-		}
+		i++;
 	}
 
 exit:
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 49c9541..8f02059 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -381,8 +381,8 @@  struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 {
 	struct rvt_mr *mr;
 	struct ib_umem *umem;
-	struct scatterlist *sg;
-	int n, m, entry;
+	struct sg_page_iter sg_iter;
+	int n, m;
 	struct ib_mr *ret;
 
 	if (length == 0)
@@ -411,10 +411,10 @@  struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->mr.page_shift = umem->page_shift;
 	m = 0;
 	n = 0;
-	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
 		void *vaddr;
 
-		vaddr = page_address(sg_page(sg));
+		vaddr = page_address(sg_page_iter_page(&sg_iter));
 		if (!vaddr) {
 			ret = ERR_PTR(-EINVAL);
 			goto bail_inval;
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 9d3916b..a6937de 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -162,11 +162,10 @@  int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
 		      u64 length, u64 iova, int access, struct ib_udata *udata,
 		      struct rxe_mem *mem)
 {
-	int			entry;
 	struct rxe_map		**map;
 	struct rxe_phys_buf	*buf = NULL;
 	struct ib_umem		*umem;
-	struct scatterlist	*sg;
+	struct sg_page_iter	sg_iter;
 	int			num_buf;
 	void			*vaddr;
 	int err;
@@ -199,8 +198,8 @@  int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
 	if (length > 0) {
 		buf = map[0]->buf;
 
-		for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-			vaddr = page_address(sg_page(sg));
+		for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+			vaddr = page_address(sg_page_iter_page(&sg_iter));
 			if (!vaddr) {
 				pr_warn("null vaddr\n");
 				err = -ENOMEM;