
[rdma-next,3/6] RDMA/umem: Add API to return optimal HW DMA addresses from SG list

Message ID: 20181224223227.18016-4-shiraz.saleem@intel.com (mailing list archive)
State: Changes Requested
Series: Introduce APIs to get DMA addresses aligned to a HW supported page size

Commit Message

Saleem, Shiraz Dec. 24, 2018, 10:32 p.m. UTC
This helper iterates the SG list and returns suitable
HW-aligned DMA addresses within a driver-supported
page size. The implementation is intended to work for
HW that supports either a single page size or mixed
page sizes in an MR. This avoids the need for
driver-specific algorithms to achieve the same thing
and redundant walks of the SG list.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/core/umem.c | 78 ++++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_umem.h         | 23 +++++++++++++
 2 files changed, 101 insertions(+)
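
For context, here is a minimal sketch of how a driver might consume the two new calls. This is hypothetical driver code, not part of this patch; hw_pgsz_bitmap and mr->pbl are illustrative names.

/* Hypothetical driver usage of the proposed API (sketch only) */
struct sg_phys_iter iter;
u64 *pbl = mr->pbl;	/* illustrative: driver's physical buffer list */
int npages = 0;

ib_umem_start_phys_iter(umem, &iter, hw_pgsz_bitmap);
while (ib_umem_next_phys_iter(umem, &iter)) {
	/* iter.phyaddr is aligned to 1 << iter.pg_bit */
	pbl[npages++] = iter.phyaddr;
}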

Comments

Jason Gunthorpe Jan. 3, 2019, 12:19 a.m. UTC | #1
On Mon, Dec 24, 2018 at 04:32:24PM -0600, Shiraz Saleem wrote:
> This helper iterates the SG list and returns suitable
> HW aligned DMA addresses within a driver supported
> page size. The implementation is intended to work for
> HW that support single page sizes or mixed page sizes
> in an MR. This avoids the need for having driver specific
> algorithms to achieve the same thing and redundant walks
> of the SG list.
> 
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>  drivers/infiniband/core/umem.c | 78 ++++++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_umem.h         | 23 +++++++++++++
>  2 files changed, 101 insertions(+)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index b2f2d75..8f8a1f6 100644
> +++ b/drivers/infiniband/core/umem.c
> @@ -204,6 +204,84 @@ unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
>  }
>  EXPORT_SYMBOL(ib_umem_find_single_pg_size);
>  
> +static unsigned int ib_umem_find_mixed_pg_bit(struct scatterlist *sgl_head,
> +					      struct sg_phys_iter *sg_phys_iter)
> +{
> +	unsigned long dma_addr_start, dma_addr_end;
> +
> +	dma_addr_start = sg_dma_address(sg_phys_iter->sg);
> +	dma_addr_end = sg_dma_address(sg_phys_iter->sg) +
> +		       sg_dma_len(sg_phys_iter->sg);
> +
> +	if (sg_phys_iter->sg == sgl_head)
> +		return ib_umem_find_pg_bit(dma_addr_end,
> +					   sg_phys_iter->supported_pgsz);
> +	else if (sg_is_last(sg_phys_iter->sg))
> +		return ib_umem_find_pg_bit(dma_addr_start,
> +					   sg_phys_iter->supported_pgsz);
> +	else
> +		return ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
> +					   sg_phys_iter->supported_pgsz);
> +}
> +void ib_umem_start_phys_iter(struct ib_umem *umem,
> +			     struct sg_phys_iter *sg_phys_iter,
> +			     unsigned long supported_pgsz)
> +{
> +	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
> +	sg_phys_iter->sg = umem->sg_head.sgl;
> +	sg_phys_iter->supported_pgsz = supported_pgsz;
> +
> +	/* Single page support in MR */
> +	if (hweight_long(supported_pgsz) == 1)
> +		sg_phys_iter->pg_bit = fls64(supported_pgsz) - 1;
> +	else
> +		sg_phys_iter->mixed_pg_support = true;
> +}
> +EXPORT_SYMBOL(ib_umem_start_phys_iter);
> +
> +/**
> + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> + * @umem: umem struct
> + * @sg_phys_iter: SG HW address iterator
> + * @supported_pgsz: bitmask of HW supported page sizes
> + *
> + * This helper iterates over the SG list and returns the HW
> + * address aligned to a supported HW page size.

I think more documentation is appropriate here..

Each true result returns a contiguous aligned block of memory such
that:

- pg_bit indicates the alignment of this block such that
    phyaddr & ((1 << pg_bit) - 1) == 0

- All blocks except the starting block have a zero offset.
  For the starting block, offset indicates the first valid byte in the
  MR; HW should not permit access to bytes earlier than offset.

  [And how is this being done in i40iw? A side effect of the user
   virtual base address?]

- For all blocks except the last, len + offset equals 1 << pg_bit

False is returned when iteration is completed and all blocks have been
seen.
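
As an illustration of that contract, here is a hedged sketch (not part of the patch) that walks the iterator and checks the three properties above; field names follow struct sg_phys_iter from this patch.

/* Illustrative only: verify the documented contract while iterating */
static void check_phys_iter_contract(struct ib_umem *umem,
				     unsigned long supported_pgsz)
{
	struct sg_phys_iter iter;
	bool first = true;

	ib_umem_start_phys_iter(umem, &iter, supported_pgsz);
	while (ib_umem_next_phys_iter(umem, &iter)) {
		/* pg_bit gives the alignment of this block */
		WARN_ON(iter.phyaddr & ((1UL << iter.pg_bit) - 1));

		/* only the starting block may carry a non-zero offset */
		WARN_ON(!first && iter.offset);

		/* all blocks except the last fill the page exactly;
		 * iter.sg is NULL once the final block has been returned
		 */
		WARN_ON(iter.sg &&
			iter.len + iter.offset != (1UL << iter.pg_bit));
		first = false;
	}
}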

> + */
> +bool ib_umem_next_phys_iter(struct ib_umem *umem,
> +			    struct sg_phys_iter *sg_phys_iter)
> +{
> +	unsigned long pg_mask;
> +
> +	if (!sg_phys_iter->sg || !sg_phys_iter->supported_pgsz)
> +		return false;
> +
> +	if (sg_phys_iter->remaining) {
> +		sg_phys_iter->phyaddr += (sg_phys_iter->len + sg_phys_iter->offset);
> +	} else {
> +		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
> +		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
> +	}
> +
> +	/* Mixed page support in MR*/
> +	if (sg_phys_iter->mixed_pg_support)
> +		sg_phys_iter->pg_bit = ib_umem_find_mixed_pg_bit(umem->sg_head.sgl,
> +								 sg_phys_iter);
> +
> +	pg_mask = ~(BIT_ULL(sg_phys_iter->pg_bit) - 1);
> +
> +	sg_phys_iter->offset = sg_phys_iter->phyaddr & ~pg_mask;
> +	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
> +	sg_phys_iter->len = min_t(unsigned long, sg_phys_iter->remaining,
> +			BIT_ULL(sg_phys_iter->pg_bit) - sg_phys_iter->offset);

Maybe drop len and offset? Nothing uses them in this series, right?

Seems fine otherwise

Jason
Saleem, Shiraz Jan. 4, 2019, 3:32 p.m. UTC | #2
On Wed, Jan 02, 2019 at 05:19:55PM -0700, Jason Gunthorpe wrote:
> On Mon, Dec 24, 2018 at 04:32:24PM -0600, Shiraz Saleem wrote:
> > This helper iterates the SG list and returns suitable
> > HW aligned DMA addresses within a driver supported
> > page size. The implementation is intended to work for
> > HW that support single page sizes or mixed page sizes
> > in an MR. This avoids the need for having driver specific
> > algorithms to achieve the same thing and redundant walks
> > of the SG list.
> >
[..]
 
> > + * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
> > + * @umem: umem struct
> > + * @sg_phys_iter: SG HW address iterator
> > + * @supported_pgsz: bitmask of HW supported page sizes
> > + *
> > + * This helper iterates over the SG list and returns the HW
> > + * address aligned to a supported HW page size.
> 
> I think more documentation is appropriate here..

OK.
 
> Each true result returns a contiguous aligned block of memory such
> that:
> 
> - pg_bit indicate the alignment of this block such that
>     phyaddr & ((1 << pg_bit) - 1) == 0
> 
> - All blocks except the starting block has a zero offset.
>   For the starting block offset indicates the first valid byte in the
>   MR, HW should not permit access to bytes earlier that offset.
> 
>   [And how is this being done in i40iw? A side effect of the user
>    virtual bad address?]

Yes, we have that restriction: the start offsets of the user-VA buffer
and the physical buffer must align, and we use the least significant
bits of the VA.
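
For example (illustrative only, not the actual i40iw code), with that alignment
restriction the first valid byte of the starting block can be recovered from the
low bits of the VA:

/* Sketch: derive the start offset from the user virtual address,
 * assuming the VA and the DMA address share alignment within a
 * 1 << pg_bit page.
 */
u64 first_byte_offset = uvirt_addr & ((1ULL << pg_bit) - 1);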

> - For all blocks except the last len + offset equals 1 << pg_bit
> 
> False is returned when iteration is completed and all blocks have been
> seen.
> 
> > + */
> > +bool ib_umem_next_phys_iter(struct ib_umem *umem,
> > +			    struct sg_phys_iter *sg_phys_iter)
> > +{
> > +	unsigned long pg_mask;
> > +
> > +	if (!sg_phys_iter->sg || !sg_phys_iter->supported_pgsz)
> > +		return false;
> > +
> > +	if (sg_phys_iter->remaining) {
> > +		sg_phys_iter->phyaddr += (sg_phys_iter->len + sg_phys_iter->offset);
> > +	} else {
> > +		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
> > +		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
> > +	}
> > +
> > +	/* Mixed page support in MR*/
> > +	if (sg_phys_iter->mixed_pg_support)
> > +		sg_phys_iter->pg_bit = ib_umem_find_mixed_pg_bit(umem->sg_head.sgl,
> > +								 sg_phys_iter);
> > +
> > +	pg_mask = ~(BIT_ULL(sg_phys_iter->pg_bit) - 1);
> > +
> > +	sg_phys_iter->offset = sg_phys_iter->phyaddr & ~pg_mask;
> > +	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
> > +	sg_phys_iter->len = min_t(unsigned long, sg_phys_iter->remaining,
> > +			BIT_ULL(sg_phys_iter->pg_bit) - sg_phys_iter->offset);
> 
> Maybe drop len and offset? Nothing uses them in this series, right?
> 

The individual drivers don't use them in this series, but could in the future to get
the len/offset of each aligned block. They are currently just internal tracking fields
that, in my view, improve readability in this function.
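
For instance, a hypothetical future use (not in this series; hw_set_pbl_entry() is an
illustrative helper, not a real API):

/* Hypothetical: a driver that needs the valid byte range of each
 * aligned block could consume len/offset directly.
 */
while (ib_umem_next_phys_iter(umem, &iter)) {
	u64 first_valid = iter.phyaddr + iter.offset;	/* first valid byte */
	u64 valid_bytes = iter.len;			/* valid bytes in this block */

	hw_set_pbl_entry(dev, iter.phyaddr, iter.pg_bit,
			 first_valid, valid_bytes);
}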

Shiraz

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b2f2d75..8f8a1f6 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -204,6 +204,84 @@  unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 }
 EXPORT_SYMBOL(ib_umem_find_single_pg_size);
 
+static unsigned int ib_umem_find_mixed_pg_bit(struct scatterlist *sgl_head,
+					      struct sg_phys_iter *sg_phys_iter)
+{
+	unsigned long dma_addr_start, dma_addr_end;
+
+	dma_addr_start = sg_dma_address(sg_phys_iter->sg);
+	dma_addr_end = sg_dma_address(sg_phys_iter->sg) +
+		       sg_dma_len(sg_phys_iter->sg);
+
+	if (sg_phys_iter->sg == sgl_head)
+		return ib_umem_find_pg_bit(dma_addr_end,
+					   sg_phys_iter->supported_pgsz);
+	else if (sg_is_last(sg_phys_iter->sg))
+		return ib_umem_find_pg_bit(dma_addr_start,
+					   sg_phys_iter->supported_pgsz);
+	else
+		return ib_umem_find_pg_bit(sg_phys_iter->phyaddr,
+					   sg_phys_iter->supported_pgsz);
+}
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter,
+			     unsigned long supported_pgsz)
+{
+	memset(sg_phys_iter, 0, sizeof(struct sg_phys_iter));
+	sg_phys_iter->sg = umem->sg_head.sgl;
+	sg_phys_iter->supported_pgsz = supported_pgsz;
+
+	/* Single page support in MR */
+	if (hweight_long(supported_pgsz) == 1)
+		sg_phys_iter->pg_bit = fls64(supported_pgsz) - 1;
+	else
+		sg_phys_iter->mixed_pg_support = true;
+}
+EXPORT_SYMBOL(ib_umem_start_phys_iter);
+
+/**
+ * ib_umem_next_phys_iter - SG list iterator that returns aligned HW address
+ * @umem: umem struct
+ * @sg_phys_iter: SG HW address iterator
+ * @supported_pgsz: bitmask of HW supported page sizes
+ *
+ * This helper iterates over the SG list and returns the HW
+ * address aligned to a supported HW page size.
+ */
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter)
+{
+	unsigned long pg_mask;
+
+	if (!sg_phys_iter->sg || !sg_phys_iter->supported_pgsz)
+		return false;
+
+	if (sg_phys_iter->remaining) {
+		sg_phys_iter->phyaddr += (sg_phys_iter->len + sg_phys_iter->offset);
+	} else {
+		sg_phys_iter->phyaddr = sg_dma_address(sg_phys_iter->sg);
+		sg_phys_iter->remaining = sg_dma_len(sg_phys_iter->sg);
+	}
+
+	/* Mixed page support in MR*/
+	if (sg_phys_iter->mixed_pg_support)
+		sg_phys_iter->pg_bit = ib_umem_find_mixed_pg_bit(umem->sg_head.sgl,
+								 sg_phys_iter);
+
+	pg_mask = ~(BIT_ULL(sg_phys_iter->pg_bit) - 1);
+
+	sg_phys_iter->offset = sg_phys_iter->phyaddr & ~pg_mask;
+	sg_phys_iter->phyaddr = sg_phys_iter->phyaddr & pg_mask;
+	sg_phys_iter->len = min_t(unsigned long, sg_phys_iter->remaining,
+			BIT_ULL(sg_phys_iter->pg_bit) - sg_phys_iter->offset);
+	sg_phys_iter->remaining -= sg_phys_iter->len;
+	if (!sg_phys_iter->remaining)
+		sg_phys_iter->sg = sg_next(sg_phys_iter->sg);
+
+	return true;
+}
+EXPORT_SYMBOL(ib_umem_next_phys_iter);
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 3e8e1ed..57cc621 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -55,6 +55,17 @@  struct ib_umem {
 	int             npages;
 };
 
+struct sg_phys_iter {
+	struct scatterlist *sg;
+	unsigned long phyaddr;
+	unsigned long len;
+	unsigned long offset;
+	unsigned long supported_pgsz;
+	unsigned long remaining;
+	unsigned int pg_bit;
+	u8 mixed_pg_support;
+};
+
 /* Returns the offset of the umem start relative to the first page. */
 static inline int ib_umem_offset(struct ib_umem *umem)
 {
@@ -89,6 +100,11 @@  int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 unsigned long ib_umem_find_single_pg_size(struct ib_umem *umem,
 					  unsigned long supported_pgsz,
 					  unsigned long uvirt_addr);
+void ib_umem_start_phys_iter(struct ib_umem *umem,
+			     struct sg_phys_iter *sg_phys_iter,
+			     unsigned long supported_pgsz);
+bool ib_umem_next_phys_iter(struct ib_umem *umem,
+			    struct sg_phys_iter *sg_phys_iter);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -110,6 +126,13 @@  static inline int ib_umem_find_single_pg_size(struct ib_umem *umem,
 					      unsigned long uvirt_addr) {
 	return -EINVAL;
 }
+static inline void ib_umem_start_phys_iter(struct ib_umem *umem,
+					   struct sg_phys_iter *sg_phys_iter,
+					   unsigned long supported_pgsz) { }
+static inline bool ib_umem_next_phys_iter(struct ib_umem *umem,
+					  struct sg_phys_iter *sg_phys_iter) {
+	return false;
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */