diff mbox series

[rdma-next,2/6] RDMA/umem: Add API to find best driver supported page size in an MR

Message ID 20181224223227.18016-3-shiraz.saleem@intel.com (mailing list archive)
State Changes Requested
Headers show
Series Introduce APIs to get DMA addresses aligned to a HW supported page size | expand

Commit Message

Saleem, Shiraz Dec. 24, 2018, 10:32 p.m. UTC
This helper iterates through the SG list to find the best page size to use
from a bitmap of HW supported page sizes. Drivers that support multiple
page sizes, but not mixed pages in an MR can use this API.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 86 ++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         |  9 +++++
 2 files changed, 95 insertions(+)

Comments

Jason Gunthorpe Jan. 3, 2019, 12:02 a.m. UTC | #1
On Mon, Dec 24, 2018 at 04:32:23PM -0600, Shiraz Saleem wrote:
> This helper iterates through the SG list to find the best page size to use
> from a bitmap of HW supported page sizes. Drivers that support multiple
> page sizes, but not mixed pages in an MR can use this API.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 86 ++++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         |  9 +++++
>  2 files changed, 95 insertions(+)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 64bacc5..b2f2d75 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -119,6 +119,92 @@ static void ib_umem_add_sg_table(struct scatterlist **cur,
>  }
>  
>  /**
> + * ib_umem_find_pg_bit - Find the page bit to use for phyaddr
> + *
> + * @phyaddr: Physical address after DMA translation
> + * @supported_pgsz: bitmask of HW supported page sizes
> + */
> +static unsigned int ib_umem_find_pg_bit(unsigned long phyaddr,
> +					unsigned long supported_pgsz)
> +{
> +	unsigned long num_zeroes;
> +	unsigned long pgsz;
> +
> +	/* Trailing zero bits in the address */
> +	num_zeroes = __ffs(phyaddr);
> +
> +	/* Find page bit such that phyaddr is aligned to the highest supported
> +	 * HW page size
> +	 */
> +	pgsz = supported_pgsz & (BIT_ULL(num_zeroes + 1) - 1);
> +	if (!pgsz)
> +		return __ffs(supported_pgsz);
> +
> +	return (fls64(pgsz) - 1);

extra brackets

> +}
> +
> +/**
> + * ib_umem_find_single_pg_size - Find best HW page size to use for this MR
> + * @umem: umem struct
> + * @supported_pgsz: bitmask of HW supported page sizes
> + * @uvirt_addr: user-space virtual MR base address
> + *
> + * This helper is intended for HW that support multiple page
> + * sizes but can do only a single page size in an MR.

Returns 0 if the umem requires page sizes not supported by the driver
to be mapped. Drivers always supporting PAGE_SIZE will never see a 0
result.

> + */
> +unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
> +					  unsigned long supported_pgsz,
> +					  unsigned long uvirt_addr)
> +{
> +	struct scatterlist *sg;
> +	unsigned int pg_bit_sg, min_pg_bit, best_pg_bit;
> +	int i;
> +
> +	if (!supported_pgsz)
> +		return 0;

Maybe like:

if (WARN_ON(!(BIT_ULL(PAGE_SHIFT) & supported_pgsz)))
     return 0;

?

Drivers are not allowed to not support the system page size.

And also I guess it would be reasonable to have something like

/* Driver only supports the system page size, so no further checks required */
if (supported_pgsz >> PAGE_SHIFT == 1)
        return PAGE_SIZE;

Otherwise I think this looks fine

Jason
Saleem, Shiraz Jan. 4, 2019, 2:18 p.m. UTC | #2
On Wed, Jan 02, 2019 at 05:02:27PM -0700, Jason Gunthorpe wrote:
> On Mon, Dec 24, 2018 at 04:32:23PM -0600, Shiraz Saleem wrote:
> > This helper iterates through the SG list to find the best page size to use
> > from a bitmap of HW supported page sizes. Drivers that support multiple
> > page sizes, but not mixed pages in an MR can use this API.
> >
[..] 

> > +static unsigned int ib_umem_find_pg_bit(unsigned long phyaddr,
> > +					unsigned long supported_pgsz)
> > +{
> > +	unsigned long num_zeroes;
> > +	unsigned long pgsz;
> > +
> > +	/* Trailing zero bits in the address */
> > +	num_zeroes = __ffs(phyaddr);
> > +
> > +	/* Find page bit such that phyaddr is aligned to the highest supported
> > +	 * HW page size
> > +	 */
> > +	pgsz = supported_pgsz & (BIT_ULL(num_zeroes + 1) - 1);
> > +	if (!pgsz)
> > +		return __ffs(supported_pgsz);
> > +
> > +	return (fls64(pgsz) - 1);
> 
> extra brackets

OK.
 
> > +/**
> > + * ib_umem_find_single_pg_size - Find best HW page size to use for this MR
> > + * @umem: umem struct
> > + * @supported_pgsz: bitmask of HW supported page sizes
> > + * @uvirt_addr: user-space virtual MR base address
> > + *
> > + * This helper is intended for HW that support multiple page
> > + * sizes but can do only a single page size in an MR.
> 
> Returns 0 if the umem requires page sizes not supported by the driver
> to be mapped. Drivers always supporting PAGE_SIZE will never see a 0
> result.

OK.
 
> > + */
> > +unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
> > +					  unsigned long supported_pgsz,
> > +					  unsigned long uvirt_addr)
> > +{
> > +	struct scatterlist *sg;
> > +	unsigned int pg_bit_sg, min_pg_bit, best_pg_bit;
> > +	int i;
> > +
> > +	if (!supported_pgsz)
> > +		return 0;
> 
> Maybe like:
> 
> if (WARN_ON(!(BIT_ULL(PAGE_SHIFT) & supported_pgsz)))
>      return 0;
> 
> ?
> 
> Drivers are not allowed to not support the system page size.
> 
> And also I guess it would be reasonable to have something like
> 
> /* Driver only supports the system page size, so no further checks required */
> if (supported_pgsz >> PAGE_SHIFT == 1)
>         return PAGE_SIZE;
> 

Yes, these 2 checks are warranted.

Shiraz
Jason Gunthorpe Jan. 4, 2019, 4:10 p.m. UTC | #3
On Fri, Jan 04, 2019 at 08:18:32AM -0600, Shiraz Saleem wrote:

> > > + */
> > > +unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
> > > +					  unsigned long supported_pgsz,
> > > +					  unsigned long uvirt_addr)
> > > +{
> > > +	struct scatterlist *sg;
> > > +	unsigned int pg_bit_sg, min_pg_bit, best_pg_bit;
> > > +	int i;
> > > +
> > > +	if (!supported_pgsz)
> > > +		return 0;
> > 
> > Maybe like:
> > 
> > if (WARN_ON(!(BIT_ULL(PAGE_SHIFT) & supported_pgsz)))
> >      return 0;

Actually this is a bit over-reaching, it should be 'driver must
support the system page size or smaller' so maybe

WARN_ON((supported_pgsz & GENMASK(PAGE_SHIFT, 0)) == 0)

because we can always take large pages and break them down.

Jason
diff mbox series

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 64bacc5..b2f2d75 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -119,6 +119,92 @@  static void ib_umem_add_sg_table(struct scatterlist **cur,
 }
 
 /**
+ * ib_umem_find_pg_bit - Find the page bit to use for phyaddr
+ *
+ * @phyaddr: Physical address after DMA translation
+ * @supported_pgsz: bitmask of HW supported page sizes
+ */
+static unsigned int ib_umem_find_pg_bit(unsigned long phyaddr,
+					unsigned long supported_pgsz)
+{
+	unsigned long num_zeroes;
+	unsigned long pgsz;
+
+	/* Trailing zero bits in the address */
+	num_zeroes = __ffs(phyaddr);
+
+	/* Find page bit such that phyaddr is aligned to the highest supported
+	 * HW page size
+	 */
+	pgsz = supported_pgsz & (BIT_ULL(num_zeroes + 1) - 1);
+	if (!pgsz)
+		return __ffs(supported_pgsz);
+
+	return (fls64(pgsz) - 1);
+}
+
+/**
+ * ib_umem_find_single_pg_size - Find best HW page size to use for this MR
+ * @umem: umem struct
+ * @supported_pgsz: bitmask of HW supported page sizes
+ * @uvirt_addr: user-space virtual MR base address
+ *
+ * This helper is intended for HW that supports multiple page
+ * sizes but can do only a single page size in an MR.
+ */
+unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
+					  unsigned long supported_pgsz,
+					  unsigned long uvirt_addr)
+{
+	struct scatterlist *sg;
+	unsigned int pg_bit_sg, min_pg_bit, best_pg_bit;
+	int i;
+
+	if (!supported_pgsz)
+		return 0;
+
+	min_pg_bit = __ffs(supported_pgsz);
+	best_pg_bit = fls64(supported_pgsz) - 1;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		unsigned long dma_addr_start, dma_addr_end;
+
+		dma_addr_start = sg_dma_address(sg);
+		dma_addr_end = sg_dma_address(sg) + sg_dma_len(sg);
+		if (!i) {
+			pg_bit_sg = ib_umem_find_pg_bit(dma_addr_end, supported_pgsz);
+
+			/* The start offset of the MR into a first _large_ page
+			 * should line up exactly for the user-space virtual buf
+			 * and physical buffer, in order to upgrade the page bit
+			 */
+			if (pg_bit_sg > PAGE_SHIFT) {
+				unsigned int uvirt_pg_bit;
+
+				uvirt_pg_bit = ib_umem_find_pg_bit(uvirt_addr + sg_dma_len(sg),
+								   supported_pgsz);
+				pg_bit_sg = min_t(unsigned int, uvirt_pg_bit, pg_bit_sg);
+			}
+		} else if (i == (umem->nmap - 1)) {
+			/* last SGE: Does not matter if MR ends at an
+			 * unaligned offset.
+			 */
+			pg_bit_sg = ib_umem_find_pg_bit(dma_addr_start, supported_pgsz);
+		} else {
+			pg_bit_sg = ib_umem_find_pg_bit(dma_addr_start,
+				supported_pgsz & (BIT_ULL(__ffs(sg_dma_len(sg)) + 1) - 1));
+		}
+
+		best_pg_bit = min_t(unsigned int, best_pg_bit, pg_bit_sg);
+		if (best_pg_bit == min_pg_bit)
+			break;
+	}
+
+	return BIT_ULL(best_pg_bit);
+}
+EXPORT_SYMBOL(ib_umem_find_single_pg_size);
+
+/**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
  * If access flags indicate ODP memory, avoid pinning. Instead, stores
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 5d3755e..3e8e1ed 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -86,6 +86,9 @@  void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
 int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		      size_t length);
+unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
+					  unsigned long supported_pgsz,
+					  unsigned long uvirt_addr);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -102,6 +105,12 @@  static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offs
 		      		    size_t length) {
 	return -EINVAL;
 }
+static inline int ib_umem_find_single_pg_size(struct ib_umem *umem,
+					      unsigned long supported_pgsz,
+					      unsigned long uvirt_addr) {
+	return -EINVAL;
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */