diff mbox series

[v5,net-next,02/36] iov_iter: DDP copy to iter/pages

Message ID 20210722110325.371-3-borisp@nvidia.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series nvme-tcp receive and tarnsmit offloads | expand

Checks

Context Check Description
netdev/apply fail Patch does not apply to net-next
netdev/tree_selection success Clearly marked for net-next

Commit Message

Boris Pismenny July 22, 2021, 11:02 a.m. UTC
From: Boris Pismenny <borisp@mellanox.com>

When using direct data placement (DDP) the NIC writes some of the payload
directly to the destination buffer, and constructs SKBs such that they
point to this data. To skip copies when SKB data already resides in the
destination we use the newly introduced routines in this commit, which
check if (src == dst), and skip the copy when that's true.

As the current user for these routines is in the block layer (nvme-tcp),
then we only apply the change for bio_vec. Other routines use the normal
methods for copying.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Yoray Zack <yorayz@mellanox.com>
---
 include/linux/uio.h | 17 ++++++++++++++
 lib/iov_iter.c      | 55 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

Comments

Christoph Hellwig July 22, 2021, 1:31 p.m. UTC | #1
> +#ifdef CONFIG_ULP_DDP
> +size_t _ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
> +#endif
>  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
>  bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
>  size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
> @@ -145,6 +148,16 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>  		return _copy_to_iter(addr, bytes, i);
>  }
>  
> +#ifdef CONFIG_ULP_DDP
> +static __always_inline __must_check
> +size_t ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
> +{
> +	if (unlikely(!check_copy_size(addr, bytes, true)))
> +		return 0;
> +	return _ddp_copy_to_iter(addr, bytes, i);
> +}
> +#endif

There is no need to ifdef out externs with conditional implementations,
or inlines using them.

> +#ifdef CONFIG_ULP_DDP
> +static void ddp_memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)

Overly long line.

> +	char *to = kmap_atomic(page);
> +
> +	if (to + offset != from)
> +		memcpy(to + offset, from, len);
> +
> +	kunmap_atomic(to);

This looks completely bogus to any casual read, so please document why
it makes sense.  And no, a magic, unexplained ddp in the name does not
count as explanation at all.  Please think about a more useful name.

Can this ever write to user page?  If yes it needs a flush_dcache_page.

Last but not least: kmap_atomic is deprecated except for the very
rate use case where it is actually called from atomic context.  Please
use kmap_local_page instead.

> +#ifdef CONFIG_CRYPTO_HASH
> +	struct ahash_request *hash = hashp;
> +	struct scatterlist sg;
> +	size_t copied;
> +
> +	copied = ddp_copy_to_iter(addr, bytes, i);
> +	sg_init_one(&sg, addr, copied);
> +	ahash_request_set_crypt(hash, &sg, NULL, copied);
> +	crypto_ahash_update(hash);
> +	return copied;
> +#else
> +	return 0;
> +#endif

What is the point of this stub?  To me it looks extremely dangerous.
Boris Pismenny July 22, 2021, 8:23 p.m. UTC | #2
On 22/07/2021 16:31, Christoph Hellwig wrote:
>> +#ifdef CONFIG_ULP_DDP
>> +size_t _ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
>> +#endif
>>  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
>>  bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
>>  size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
>> @@ -145,6 +148,16 @@ size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>>  		return _copy_to_iter(addr, bytes, i);
>>  }
>>  
>> +#ifdef CONFIG_ULP_DDP
>> +static __always_inline __must_check
>> +size_t ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
>> +{
>> +	if (unlikely(!check_copy_size(addr, bytes, true)))
>> +		return 0;
>> +	return _ddp_copy_to_iter(addr, bytes, i);
>> +}
>> +#endif
> 
> There is no need to ifdef out externs with conditional implementations,
> or inlines using them.
> 
>> +#ifdef CONFIG_ULP_DDP
>> +static void ddp_memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
> 
> Overly long line.
> 
>> +	char *to = kmap_atomic(page);
>> +
>> +	if (to + offset != from)
>> +		memcpy(to + offset, from, len);
>> +
>> +	kunmap_atomic(to);
> 
> This looks completely bogus to any casual read, so please document why
> it makes sense.  And no, a magic, unexplained ddp in the name does not
> count as explanation at all.  Please think about a more useful name.

This routine, like other changes in this file, replicates the logic in
memcpy_to_page. The only difference is that "ddp" avoids copies when the
copy source and destinations buffers are one and the same. These are
then used by nvme-tcp (see skb_ddp_copy_datagram_iter in nvme-tcp) which
receives SKBs from the NIC that already placed data in its destination,
and this is the source for the name Direct Data Placement. I'd gladly
take suggestions for better names, but this is the best we came up with
so far.

The reason we are doing it is to avoid modifying memcpy_to_page itself,
but rather allow users (e.g., nvme-tcp) to access this functionality
directly.

> 
> Can this ever write to user page?  If yes it needs a flush_dcache_page.

Yes, will add.

> 
> Last but not least: kmap_atomic is deprecated except for the very
> rate use case where it is actually called from atomic context.  Please
> use kmap_local_page instead.
> 

Will look into it, thanks!

>> +#ifdef CONFIG_CRYPTO_HASH
>> +	struct ahash_request *hash = hashp;
>> +	struct scatterlist sg;
>> +	size_t copied;
>> +
>> +	copied = ddp_copy_to_iter(addr, bytes, i);
>> +	sg_init_one(&sg, addr, copied);
>> +	ahash_request_set_crypt(hash, &sg, NULL, copied);
>> +	crypto_ahash_update(hash);
>> +	return copied;
>> +#else
>> +	return 0;
>> +#endif
> 
> What is the point of this stub?  To me it looks extremely dangerous.
> 

As above, we use the same logic as in hash_and_copy_to_iter. The purpose
is again to eventually avoid the copy in case the source and destination
buffers are one and the same.
Al Viro July 22, 2021, 8:55 p.m. UTC | #3
On Thu, Jul 22, 2021 at 02:02:51PM +0300, Boris Pismenny wrote:
> From: Boris Pismenny <borisp@mellanox.com>
> 
> When using direct data placement (DDP) the NIC writes some of the payload
> directly to the destination buffer, and constructs SKBs such that they
> point to this data. To skip copies when SKB data already resides in the
> destination we use the newly introduced routines in this commit, which
> check if (src == dst), and skip the copy when that's true.
> 
> As the current user for these routines is in the block layer (nvme-tcp),
> then we only apply the change for bio_vec. Other routines use the normal
> methods for copying.

Please, take a look at -rc1 and see the changes in lib/iov_iter.c in there.
Christoph Hellwig July 23, 2021, 5:03 a.m. UTC | #4
On Thu, Jul 22, 2021 at 11:23:38PM +0300, Boris Pismenny wrote:
> This routine, like other changes in this file, replicates the logic in
> memcpy_to_page. The only difference is that "ddp" avoids copies when the
> copy source and destinations buffers are one and the same.

Now why can't we just make that change to the generic routine?

If we can't, why do they not have a saner name documenting what they
actually do?
Al Viro July 23, 2021, 5:21 a.m. UTC | #5
On Fri, Jul 23, 2021 at 07:03:02AM +0200, Christoph Hellwig wrote:
> On Thu, Jul 22, 2021 at 11:23:38PM +0300, Boris Pismenny wrote:
> > This routine, like other changes in this file, replicates the logic in
> > memcpy_to_page. The only difference is that "ddp" avoids copies when the
> > copy source and destinations buffers are one and the same.
> 
> Now why can't we just make that change to the generic routine?

Doable... replace memcpy(base, addr + off, len) with
	base != addr + off && memcpy(base, addr + off, len)
in _copy_to_iter() and be done with that...
Or Gerlitz Aug. 4, 2021, 2:13 p.m. UTC | #6
On Fri, Jul 23, 2021 at 8:30 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, Jul 23, 2021 at 07:03:02AM +0200, Christoph Hellwig wrote:
> > On Thu, Jul 22, 2021 at 11:23:38PM +0300, Boris Pismenny wrote:

>>> This routine, like other changes in this file, replicates the logic in
>>> memcpy_to_page. The only difference is that "ddp" avoids copies when the
>>> copy source and destinations buffers are one and the same.

>> Now why can't we just make that change to the generic routine?

> Doable... replace memcpy(base, addr + off, len) with
>         base != addr + off && memcpy(base, addr + off, len)
> in _copy_to_iter() and be done with that...

Guys,

AFAIR we did the adding ddp_ prefix exercise to the copy functions call chain

ddp_hash_and_copy_to_iter
-> ddp_copy_to_iter
-> _ddp_copy_to_iter
-> ddp_memcpy_to_page

to address feedback given on earlier versions of the series. So let's
decide please.. are we all set to remove the ddp_ prefixed calls and just
plant the new check (plus a nice comment!) as Al suggested?

re the comments given on ddp_memcpy_to_page, upstream move
to just call memcpy, so we need not have it anyway, will be fixed in v6
if we remain with ddp_ call chain or becomes irrelevant if we drop it.
Or Gerlitz Aug. 10, 2021, 1:29 p.m. UTC | #7
On Wed, Aug 4, 2021 at 5:13 PM Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Fri, Jul 23, 2021 at 8:30 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > On Fri, Jul 23, 2021 at 07:03:02AM +0200, Christoph Hellwig wrote:
> > > On Thu, Jul 22, 2021 at 11:23:38PM +0300, Boris Pismenny wrote:
>
> >>> This routine, like other changes in this file, replicates the logic in
> >>> memcpy_to_page. The only difference is that "ddp" avoids copies when the
> >>> copy source and destinations buffers are one and the same.
>
> >> Now why can't we just make that change to the generic routine?
>
> > Doable... replace memcpy(base, addr + off, len) with
> >         base != addr + off && memcpy(base, addr + off, len)
> > in _copy_to_iter() and be done with that...
>
> Guys,
>
> AFAIR we did the adding ddp_ prefix exercise to the copy functions call chain
>
> ddp_hash_and_copy_to_iter
> -> ddp_copy_to_iter
> -> _ddp_copy_to_iter
> -> ddp_memcpy_to_page
>
> to address feedback given on earlier versions of the series. So let's
> decide please.. are we all set to remove the ddp_ prefixed calls and just
> plant the new check (plus a nice comment!) as Al suggested?

So we are okay going for the minimal approach / direction suggested by
Al of adding a (base != addr + offset) check before the memcpy call.

This will also simplify the changes to the nvme-tcp driver.  Please
speak if you want the ddp_ prefix approach to remain.

Or.

> re the comments given on ddp_memcpy_to_page, upstream move
> to just call memcpy, so we need not have it anyway, will be fixed in v6
> if we remain with ddp_ call chain or becomes irrelevant if we drop it.
diff mbox series

Patch

diff --git a/include/linux/uio.h b/include/linux/uio.h
index d3ec87706d75..a61fdb369e0e 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -131,6 +131,9 @@  size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i);
 
 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+#ifdef CONFIG_ULP_DDP
+size_t _ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+#endif
 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
 bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i);
 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
@@ -145,6 +148,16 @@  size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 		return _copy_to_iter(addr, bytes, i);
 }
 
+#ifdef CONFIG_ULP_DDP
+static __always_inline __must_check
+size_t ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
+{
+	if (unlikely(!check_copy_size(addr, bytes, true)))
+		return 0;
+	return _ddp_copy_to_iter(addr, bytes, i);
+}
+#endif
+
 static __always_inline __must_check
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -281,6 +294,10 @@  size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct io
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
+#ifdef CONFIG_ULP_DDP
+size_t ddp_hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+		struct iov_iter *i);
+#endif
 
 struct iovec *iovec_from_user(const struct iovec __user *uvector,
 		unsigned long nr_segs, unsigned long fast_segs,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c701b7a187f2..2e9be46a9b56 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -508,6 +508,18 @@  void iov_iter_init(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_init);
 
+#ifdef CONFIG_ULP_DDP
+static void ddp_memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
+{
+	char *to = kmap_atomic(page);
+
+	if (to + offset != from)
+		memcpy(to + offset, from, len);
+
+	kunmap_atomic(to);
+}
+#endif
+
 static inline bool allocated(struct pipe_buffer *buf)
 {
 	return buf->ops == &default_pipe_buf_ops;
@@ -648,6 +660,28 @@  static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 	return bytes;
 }
 
+#ifdef CONFIG_ULP_DDP
+size_t _ddp_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
+{
+	const char *from = addr;
+	if (unlikely(iov_iter_is_pipe(i)))
+		return copy_pipe_to_iter(addr, bytes, i);
+	if (iter_is_iovec(i))
+		might_fault();
+	iterate_and_advance(i, bytes, v,
+		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+		ddp_memcpy_to_page(v.bv_page, v.bv_offset,
+				   (from += v.bv_len) - v.bv_len, v.bv_len),
+		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
+		ddp_memcpy_to_page(v.bv_page, v.bv_offset,
+				   (from += v.bv_len) - v.bv_len, v.bv_len)
+		)
+
+	return bytes;
+}
+EXPORT_SYMBOL(_ddp_copy_to_iter);
+#endif
+
 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	const char *from = addr;
@@ -1818,6 +1852,27 @@  size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 }
 EXPORT_SYMBOL(csum_and_copy_to_iter);
 
+#ifdef CONFIG_ULP_DDP
+size_t ddp_hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+		struct iov_iter *i)
+{
+#ifdef CONFIG_CRYPTO_HASH
+	struct ahash_request *hash = hashp;
+	struct scatterlist sg;
+	size_t copied;
+
+	copied = ddp_copy_to_iter(addr, bytes, i);
+	sg_init_one(&sg, addr, copied);
+	ahash_request_set_crypt(hash, &sg, NULL, copied);
+	crypto_ahash_update(hash);
+	return copied;
+#else
+	return 0;
+#endif
+}
+EXPORT_SYMBOL(ddp_hash_and_copy_to_iter);
+#endif
+
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i)
 {