
[RFC,05/12] RDMA/cxgb4: Use for_each_sg_dma_page iterator on umem SGL

Message ID 20190126165913.18272-6-shiraz.saleem@intel.com (mailing list archive)
State Superseded
Series Adapt drivers to handle page combining on umem SGEs

Commit Message

Saleem, Shiraz Jan. 26, 2019, 4:59 p.m. UTC
From: "Shiraz, Saleem" <shiraz.saleem@intel.com>

Use the for_each_sg_dma_page iterator variant to walk the umem
DMA-mapped SGL and get the page DMA address. This avoids the extra
inner loop needed to iterate over the pages within each SGE when the
for_each_sg iterator is used.

Additionally, purge umem->page_shift usage in the driver, as it is
only relevant for ODP MRs. Use the system page size and shift instead.

Cc: Steve Wise <swise@chelsio.com>
Signed-off-by: Shiraz, Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/hw/cxgb4/mem.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)
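
For readers unfamiliar with the new iterator, the sketch below contrasts the
two walk styles over a DMA-mapped umem SGL. It is a minimal editorial
illustration, not code from this patch; handle_dma_addr() is a placeholder for
whatever the driver does with each page's DMA address.

	struct scatterlist *sg;
	struct sg_dma_page_iter sg_iter;
	int entry, k;

	/* Old style: outer loop over SGEs, inner loop over the pages in each SGE. */
	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
		int len = sg_dma_len(sg) >> PAGE_SHIFT;

		for (k = 0; k < len; ++k)
			handle_dma_addr(sg_dma_address(sg) + (k << PAGE_SHIFT));
	}

	/* New style: a single loop; the iterator yields one PAGE_SIZE-sized
	 * DMA address per step, regardless of how large each SGE is.
	 */
	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
		handle_dma_addr(sg_page_iter_dma_address(&sg_iter));
	}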

Comments

Steve Wise Jan. 26, 2019, 5:09 p.m. UTC | #1
> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Shiraz Saleem
> Sent: Saturday, January 26, 2019 10:59 AM
> To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
> Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
> <swise@chelsio.com>
> Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> iterator on umem SGL
> 
> From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
> 
> Use the for_each_sg_dma_page iterator variant to walk the umem
> DMA-mapped SGL and get the page DMA address. This avoids the extra
> loop to iterate pages in the SGE when for_each_sg iterator is used.
> 
> Additionally, purge umem->page_shift usage in the driver
> as its only relevant for ODP MRs. Use system page size and
> shift instead.

Hey Shiraz, doesn't umem->page_shift allow registering huge pages
efficiently? I.e., is umem->page_shift set to the 2MB shift if the memory in
this umem region comes from the 2MB huge page pool?

> 
> Cc: Steve Wise <swise@chelsio.com>
> Signed-off-by: Shiraz, Saleem <shiraz.saleem@intel.com>
> ---
>  drivers/infiniband/hw/cxgb4/mem.c | 33 ++++++++++++++-------------------
>  1 file changed, 14 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/mem.c
> b/drivers/infiniband/hw/cxgb4/mem.c
> index 96760a3..a9cd6f1 100644
> --- a/drivers/infiniband/hw/cxgb4/mem.c
> +++ b/drivers/infiniband/hw/cxgb4/mem.c
> @@ -502,10 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd,
> u64 start, u64 length,
>  			       u64 virt, int acc, struct ib_udata *udata)
>  {
>  	__be64 *pages;
> -	int shift, n, len;
> -	int i, k, entry;
> +	int shift, n, i;
>  	int err = -ENOMEM;
> -	struct scatterlist *sg;
> +	struct sg_dma_page_iter sg_iter;
>  	struct c4iw_dev *rhp;
>  	struct c4iw_pd *php;
>  	struct c4iw_mr *mhp;
> @@ -541,7 +540,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd,
> u64 start, u64 length,
>  	if (IS_ERR(mhp->umem))
>  		goto err_free_skb;
> 
> -	shift = mhp->umem->page_shift;
> +	shift = PAGE_SHIFT;
> 
>  	n = mhp->umem->nmap;
>  	err = alloc_pbl(mhp, n);
> @@ -556,21 +555,17 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd,
> u64 start, u64 length,
> 
>  	i = n = 0;
> 
> -	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap,
> entry) {
> -		len = sg_dma_len(sg) >> shift;
> -		for (k = 0; k < len; ++k) {
> -			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
> -						 (k << shift));
> -			if (i == PAGE_SIZE / sizeof *pages) {
> -				err = write_pbl(&mhp->rhp->rdev,
> -				      pages,
> -				      mhp->attr.pbl_addr + (n << 3), i,
> -				      mhp->wr_waitp);
> -				if (err)
> -					goto pbl_done;
> -				n += i;
> -				i = 0;
> -			}
> +	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
> +		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
> +		if (i == PAGE_SIZE / sizeof *pages) {
> +			err = write_pbl(&mhp->rhp->rdev,
> +					pages,
> +					mhp->attr.pbl_addr + (n << 3), i,
> +					mhp->wr_waitp);
> +			if (err)
> +				goto pbl_done;
> +			n += i;
> +			i = 0;
>  		}
>  	}
> 
> --
> 1.8.3.1
Saleem, Shiraz Jan. 26, 2019, 6:43 p.m. UTC | #2
On Sat, Jan 26, 2019 at 11:09:45AM -0600, Steve Wise wrote:
> 
> 
> > -----Original Message-----
> > From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> > owner@vger.kernel.org> On Behalf Of Shiraz Saleem
> > Sent: Saturday, January 26, 2019 10:59 AM
> > To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
> > Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
> > <swise@chelsio.com>
> > Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> > iterator on umem SGL
> > 
> > From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
> > 
> > Use the for_each_sg_dma_page iterator variant to walk the umem
> > DMA-mapped SGL and get the page DMA address. This avoids the extra
> > loop to iterate pages in the SGE when for_each_sg iterator is used.
> > 
> > Additionally, purge umem->page_shift usage in the driver
> > as its only relevant for ODP MRs. Use system page size and
> > shift instead.
> 
> Hey Shiraz, Doesn't umem->page_shift allow registering huge pages
> efficiently?  IE is umem->page_shift set for the 2MB shift if the memory in
> this umem region is from the 2MB huge page pool? 
>

Not quite. For non-ODP MRs, it's just the shift w.r.t. the default arch page size
(4K on x86), i.e. PAGE_SHIFT, even if the umem is backed by huge pages.

https://elixir.bootlin.com/linux/v5.0-rc3/source/drivers/infiniband/core/umem.c#L126

Shiraz
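
For reference, a simplified sketch of what the linked ib_umem_get() does for
non-ODP mappings in v5.0-rc3; field names follow that version, and the pinning
and error-handling paths are omitted.

/* page_shift is always the system PAGE_SHIFT for non-ODP umems,
 * even when the pinned memory is backed by huge pages.
 */
umem->context    = context;
umem->length     = size;
umem->address    = addr;
umem->page_shift = PAGE_SHIFT;
umem->writable   = ib_access_writable(access);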
Steve Wise Jan. 26, 2019, 10:11 p.m. UTC | #3
> -----Original Message-----
> From: Shiraz Saleem <shiraz.saleem@intel.com>
> Sent: Saturday, January 26, 2019 12:44 PM
> To: Steve Wise <swise@opengridcomputing.com>
> Cc: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org; 'Steve
> Wise' <swise@chelsio.com>
> Subject: Re: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> iterator on umem SGL
> 
> On Sat, Jan 26, 2019 at 11:09:45AM -0600, Steve Wise wrote:
> >
> >
> > > -----Original Message-----
> > > From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> > > owner@vger.kernel.org> On Behalf Of Shiraz Saleem
> > > Sent: Saturday, January 26, 2019 10:59 AM
> > > To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
> > > Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
> > > <swise@chelsio.com>
> > > Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> > > iterator on umem SGL
> > >
> > > From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
> > >
> > > Use the for_each_sg_dma_page iterator variant to walk the umem
> > > DMA-mapped SGL and get the page DMA address. This avoids the extra
> > > loop to iterate pages in the SGE when for_each_sg iterator is used.
> > >
> > > Additionally, purge umem->page_shift usage in the driver
> > > as its only relevant for ODP MRs. Use system page size and
> > > shift instead.
> >
> > Hey Shiraz, Doesn't umem->page_shift allow registering huge pages
> > efficiently?  IE is umem->page_shift set for the 2MB shift if the memory in
> > this umem region is from the 2MB huge page pool?
> >
> 
> Not quite. For non-ODP MRs, its just the shift w.r.t the default arch page size
> (4K on x86), i.e. PAGE_SHIFT. Even if the umem is backed by huge pages.
> 
> https://elixir.bootlin.com/linux/v5.0-rc3/source/drivers/infiniband/core/umem.c#L126
> 
> Shiraz
> 

Ok, Thanks.

Acked-by: Steve Wise <swise@opengridcomputing.com>
Jason Gunthorpe Jan. 28, 2019, 6:29 p.m. UTC | #4
On Sat, Jan 26, 2019 at 11:09:45AM -0600, Steve Wise wrote:
> 
> 
> > From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> > owner@vger.kernel.org> On Behalf Of Shiraz Saleem
> > Sent: Saturday, January 26, 2019 10:59 AM
> > To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
> > Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
> > <swise@chelsio.com>
> > Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> > iterator on umem SGL
> > 
> > From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
> > 
> > Use the for_each_sg_dma_page iterator variant to walk the umem
> > DMA-mapped SGL and get the page DMA address. This avoids the extra
> > loop to iterate pages in the SGE when for_each_sg iterator is used.
> > 
> > Additionally, purge umem->page_shift usage in the driver
> > as its only relevant for ODP MRs. Use system page size and
> > shift instead.
> 
> Hey Shiraz, Doesn't umem->page_shift allow registering huge pages
> efficiently?  IE is umem->page_shift set for the 2MB shift if the memory in
> this umem region is from the 2MB huge page pool? 

I think long ago this might have been some fevered dream, but it was
never implemented and never made any sense. How would the core code
know that the driver supported the CPU's huge page size?

Shiraz's version, which injects huge pages into the driver, is much better.

Jason
Steve Wise Jan. 28, 2019, 6:45 p.m. UTC | #5
On 1/28/2019 12:29 PM, Jason Gunthorpe wrote:
> On Sat, Jan 26, 2019 at 11:09:45AM -0600, Steve Wise wrote:
>>
>>> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
>>> owner@vger.kernel.org> On Behalf Of Shiraz Saleem
>>> Sent: Saturday, January 26, 2019 10:59 AM
>>> To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
>>> Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
>>> <swise@chelsio.com>
>>> Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
>>> iterator on umem SGL
>>>
>>> From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
>>>
>>> Use the for_each_sg_dma_page iterator variant to walk the umem
>>> DMA-mapped SGL and get the page DMA address. This avoids the extra
>>> loop to iterate pages in the SGE when for_each_sg iterator is used.
>>>
>>> Additionally, purge umem->page_shift usage in the driver
>>> as its only relevant for ODP MRs. Use system page size and
>>> shift instead.
>> Hey Shiraz, Doesn't umem->page_shift allow registering huge pages
>> efficiently?  IE is umem->page_shift set for the 2MB shift if the memory in
>> this umem region is from the 2MB huge page pool? 
> I think long ago this might have been some feavered dream, but it was
> never implemented and never made any sense. How would the core code
> know it driver supported the CPU's huge page size?
>
> Shiraz's version to ineject huge pages into the driver is much better

The driver advertises the "page sizes" it supports for MR PBLs
(ib_device_attr.page_size_cap).  For example, cxgb4 hw supports 4K up to
128MB.  So if a umem were composed of only huge pages, the reg code could
pick a page size as large as the huge page size (capped at the device's
maximum supported page size), thus reducing the PBL depth for a given
MR.  There was code for this once upon a time, I thought.  Perhaps it
was never upstreamed, or it was rejected.

Steve.


> Jason
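
As a purely hypothetical illustration of the idea Steve describes above (this
helper exists neither in this series nor in the kernel), a reg path could scan
ib_device_attr.page_size_cap for the largest supported page size to which the
MR's start and length are aligned:

/*
 * Hypothetical helper, for illustration only. page_size_cap is assumed to be
 * a non-zero bitmap of supported page sizes; a real implementation would also
 * have to check the requested iova and every SGE's DMA address and length.
 */
static u64 pick_mr_page_size(u64 page_size_cap, u64 start, u64 length)
{
	u64 pgsz;

	for (pgsz = 1ULL << (fls64(page_size_cap) - 1); pgsz >= PAGE_SIZE;
	     pgsz >>= 1) {
		if (!(page_size_cap & pgsz))
			continue;
		if (!(start & (pgsz - 1)) && !(length & (pgsz - 1)))
			return pgsz;
	}

	return PAGE_SIZE;
}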
Jason Gunthorpe Jan. 28, 2019, 7:27 p.m. UTC | #6
On Mon, Jan 28, 2019 at 12:45:26PM -0600, Steve Wise wrote:
> 
> On 1/28/2019 12:29 PM, Jason Gunthorpe wrote:
> > On Sat, Jan 26, 2019 at 11:09:45AM -0600, Steve Wise wrote:
> >>
> >>> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> >>> owner@vger.kernel.org> On Behalf Of Shiraz Saleem
> >>> Sent: Saturday, January 26, 2019 10:59 AM
> >>> To: dledford@redhat.com; jgg@ziepe.ca; linux-rdma@vger.kernel.org
> >>> Cc: Shiraz, Saleem <shiraz.saleem@intel.com>; Steve Wise
> >>> <swise@chelsio.com>
> >>> Subject: [PATCH RFC 05/12] RDMA/cxgb4: Use for_each_sg_dma_page
> >>> iterator on umem SGL
> >>>
> >>> From: "Shiraz, Saleem" <shiraz.saleem@intel.com>
> >>>
> >>> Use the for_each_sg_dma_page iterator variant to walk the umem
> >>> DMA-mapped SGL and get the page DMA address. This avoids the extra
> >>> loop to iterate pages in the SGE when for_each_sg iterator is used.
> >>>
> >>> Additionally, purge umem->page_shift usage in the driver
> >>> as its only relevant for ODP MRs. Use system page size and
> >>> shift instead.
> >> Hey Shiraz, Doesn't umem->page_shift allow registering huge pages
> >> efficiently?  IE is umem->page_shift set for the 2MB shift if the memory in
> >> this umem region is from the 2MB huge page pool? 
> > I think long ago this might have been some feavered dream, but it was
> > never implemented and never made any sense. How would the core code
> > know it driver supported the CPU's huge page size?
> >
> > Shiraz's version to ineject huge pages into the driver is much better
> 
> The driver advertises the "page sizes" it supports for MR PBLs
> (ib_device_attr.page_size_cap).  For example, cxgb4 hw supports 4K up to
> 128MB.  So if a umem was composed of only huge pages, then the reg code
> could pick a page size that is as big as the huge page size or up to the
> device max supported page size, thus reducing the PBL depth for a given
> MR.  

> There was code for this once upon a time, I thought.  Perhaps it
> was never upstreamed or it was rejected.

I don't know; nothing in the kernel uses page_size_cap, it just flows
through to verbs.

I still think Shiraz's version is cleaner; having the driver break up
and iterate over maximally sized SGL entries makes sense to me. This
way we get the best efficiency on the IOMMU setup path, especially if
we have drivers with different requirements for start/end alignment -
which hasn't been entirely investigated yet...

Jason
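
A rough sketch of the driver-side pattern Jason describes, where the driver
walks the combined (maximally sized) DMA SGEs and slices them at the page size
it has chosen for the MR; mr_pgsz and emit_pbl_entry() are placeholders, not
real APIs.

/*
 * Walk page-combined SGEs and emit one PBL entry per mr_pgsz-sized block.
 * Assumes each SGE's DMA address and length are aligned to mr_pgsz.
 */
struct scatterlist *sg;
unsigned int entry, off;

for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
	dma_addr_t addr = sg_dma_address(sg);
	unsigned int len = sg_dma_len(sg);

	for (off = 0; off < len; off += mr_pgsz)
		emit_pbl_entry(addr + off);
}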

Patch

diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 96760a3..a9cd6f1 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -502,10 +502,9 @@  struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			       u64 virt, int acc, struct ib_udata *udata)
 {
 	__be64 *pages;
-	int shift, n, len;
-	int i, k, entry;
+	int shift, n, i;
 	int err = -ENOMEM;
-	struct scatterlist *sg;
+	struct sg_dma_page_iter sg_iter;
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
 	struct c4iw_mr *mhp;
@@ -541,7 +540,7 @@  struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (IS_ERR(mhp->umem))
 		goto err_free_skb;
 
-	shift = mhp->umem->page_shift;
+	shift = PAGE_SHIFT;
 
 	n = mhp->umem->nmap;
 	err = alloc_pbl(mhp, n);
@@ -556,21 +555,17 @@  struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	i = n = 0;
 
-	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
-		len = sg_dma_len(sg) >> shift;
-		for (k = 0; k < len; ++k) {
-			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
-						 (k << shift));
-			if (i == PAGE_SIZE / sizeof *pages) {
-				err = write_pbl(&mhp->rhp->rdev,
-				      pages,
-				      mhp->attr.pbl_addr + (n << 3), i,
-				      mhp->wr_waitp);
-				if (err)
-					goto pbl_done;
-				n += i;
-				i = 0;
-			}
+	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
+		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+		if (i == PAGE_SIZE / sizeof *pages) {
+			err = write_pbl(&mhp->rhp->rdev,
+					pages,
+					mhp->attr.pbl_addr + (n << 3), i,
+					mhp->wr_waitp);
+			if (err)
+				goto pbl_done;
+			n += i;
+			i = 0;
 		}
 	}