diff mbox series

[net-next,v5,04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()

Message ID 20230623225513.2732256-5-dhowells@redhat.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

David Howells June 23, 2023, 10:55 p.m. UTC
Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data.  For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---
 net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
 1 file changed, 19 insertions(+), 72 deletions(-)

Comments

Ilya Dryomov June 25, 2023, 12:34 p.m. UTC | #1
On Sat, Jun 24, 2023 at 12:55 AM David Howells <dhowells@redhat.com> wrote:
>
> Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
> transmitting data.  For the moment, this can only transmit one page at a
> time because of the architecture of net/ceph/, but if
> write_partial_message_data() can be given a bvec[] at a time by the

Hi David,

write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
doesn't apply here.  I would suggest squashing the two net/ceph patches
into one since even the titles are the same.

Also, we tend to use "libceph: " prefix for net/ceph changes.

> iteration code, this would allow pages to be sent in a batch.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: Xiubo Li <xiubli@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: ceph-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
>  net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
>  1 file changed, 19 insertions(+), 72 deletions(-)
>
> diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
> index 301a991dc6a6..87ac97073e75 100644
> --- a/net/ceph/messenger_v2.c
> +++ b/net/ceph/messenger_v2.c
> @@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
>         return ret;
>  }
>
> -static int do_sendmsg(struct socket *sock, struct iov_iter *it)
> -{
> -       struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -       int ret;
> -
> -       msg.msg_iter = *it;
> -       while (iov_iter_count(it)) {
> -               ret = sock_sendmsg(sock, &msg);
> -               if (ret <= 0) {
> -                       if (ret == -EAGAIN)
> -                               ret = 0;
> -                       return ret;
> -               }
> -
> -               iov_iter_advance(it, ret);
> -       }
> -
> -       WARN_ON(msg_data_left(&msg));
> -       return 1;
> -}
> -
> -static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
> -{
> -       struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -       struct bio_vec bv;
> -       int ret;
> -
> -       if (WARN_ON(!iov_iter_is_bvec(it)))
> -               return -EINVAL;
> -
> -       while (iov_iter_count(it)) {
> -               /* iov_iter_iovec() for ITER_BVEC */
> -               bvec_set_page(&bv, it->bvec->bv_page,
> -                             min(iov_iter_count(it),
> -                                 it->bvec->bv_len - it->iov_offset),
> -                             it->bvec->bv_offset + it->iov_offset);
> -
> -               /*
> -                * sendpage cannot properly handle pages with
> -                * page_count == 0, we need to fall back to sendmsg if
> -                * that's the case.
> -                *
> -                * Same goes for slab pages: skb_can_coalesce() allows
> -                * coalescing neighboring slab objects into a single frag
> -                * which triggers one of hardened usercopy checks.
> -                */
> -               if (sendpage_ok(bv.bv_page)) {
> -                       ret = sock->ops->sendpage(sock, bv.bv_page,
> -                                                 bv.bv_offset, bv.bv_len,
> -                                                 CEPH_MSG_FLAGS);
> -               } else {
> -                       iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
> -                       ret = sock_sendmsg(sock, &msg);
> -               }
> -               if (ret <= 0) {
> -                       if (ret == -EAGAIN)
> -                               ret = 0;
> -                       return ret;
> -               }
> -
> -               iov_iter_advance(it, ret);
> -       }
> -
> -       return 1;
> -}
> -
>  /*
>   * Write as much as possible.  The socket is expected to be corked,
> - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> + * so we don't bother with MSG_MORE here.
>   *
>   * Return:
> - *   1 - done, nothing (else) to write
> + *  >0 - done, nothing (else) to write

It would be nice to avoid making tweaks like this to the outer
interface as part of switching to a new internal API.

>   *   0 - socket is full, need to wait
>   *  <0 - error
>   */
>  static int ceph_tcp_send(struct ceph_connection *con)
>  {
> +       struct msghdr msg = {
> +               .msg_iter       = con->v2.out_iter,
> +               .msg_flags      = CEPH_MSG_FLAGS,
> +       };
>         int ret;
>
> +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> +               return -EINVAL;

Previously, this WARN_ON + error applied only to the "try sendpage"
path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
pretty sure that placing it here breaks everything.

> +
> +       if (con->v2.out_iter_sendpage)
> +               msg.msg_flags |= MSG_SPLICE_PAGES;
> +
>         dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
>              iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
> -       if (con->v2.out_iter_sendpage)
> -               ret = do_try_sendpage(con->sock, &con->v2.out_iter);
> -       else
> -               ret = do_sendmsg(con->sock, &con->v2.out_iter);
> +
> +       ret = sock_sendmsg(con->sock, &msg);
> +       if (ret > 0)
> +               iov_iter_advance(&con->v2.out_iter, ret);
> +       else if (ret == -EAGAIN)
> +               ret = 0;

Hrm, is sock_sendmsg() now guaranteed to exhaust the iterator (i.e.
a "short write" is no longer possible)?  Unless that is the case, this
is not an equivalent transformation.

This is actually the reason for

>   * Return:
>   *   1 - done, nothing (else) to write

specification which you also tweaked.  It doesn't make sense for
ceph_tcp_send() to return the number of bytes sent because the caller
expects everything to be sent when a positive number is returned.

Thanks,

                Ilya
David Howells June 26, 2023, 3:30 p.m. UTC | #2
Ilya Dryomov <idryomov@gmail.com> wrote:

> write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> doesn't apply here.  I would suggest squashing the two net/ceph patches
> into one since even the titles are the same.

I would, but they're now applied to net-next, so we need to patch that.

> >   * Write as much as possible.  The socket is expected to be corked,
> > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > + * so we don't bother with MSG_MORE here.
> >   *
> >   * Return:
> > - *   1 - done, nothing (else) to write
> > + *  >0 - done, nothing (else) to write
> 
> It would be nice to avoid making tweaks like this to the outer
> interface as part of switching to a new internal API.

Ok.  I'll change that and wrap the sendmsg in a loop.  Though, as I asked in
an earlier reply, why is MSG_DONTWAIT used here?

> > +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > +               return -EINVAL;
> 
> Previously, this WARN_ON + error applied only to the "try sendpage"
> path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> pretty sure that placing it here breaks everything.

This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
iterators also.

Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
to the iterator to be transmitted as a whole?  It seems to be set depending on
iterator type.

David
Ilya Dryomov June 26, 2023, 4:07 p.m. UTC | #3
On Mon, Jun 26, 2023 at 5:30 PM David Howells <dhowells@redhat.com> wrote:
>
> Ilya Dryomov <idryomov@gmail.com> wrote:
>
> > write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> > doesn't apply here.  I would suggest squashing the two net/ceph patches
> > into one since even the titles are the same.
>
> I would, but they're now applied to net-next, so we need to patch that.

I don't see a problem with that given that the patches themselves have
major issues (i.e. it's not just a commit message/title nit).

>
> > >   * Write as much as possible.  The socket is expected to be corked,
> > > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > > + * so we don't bother with MSG_MORE here.
> > >   *
> > >   * Return:
> > > - *   1 - done, nothing (else) to write
> > > + *  >0 - done, nothing (else) to write
> >
> > It would be nice to avoid making tweaks like this to the outer
> > interface as part of switching to a new internal API.
>
> Ok.  I'll change that and wrap the sendmsg in a loop.  Though, as I asked in
> an earlier reply, why is MSG_DONTWAIT used here?

See my reply there.

>
> > > +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > > +               return -EINVAL;
> >
> > Previously, this WARN_ON + error applied only to the "try sendpage"
> > path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> > pretty sure that placing it here breaks everything.
>
> This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
> iterators also.
>
> Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
> to the iterator to be transmitted as a whole?  It seems to be set depending on
> iterator type.

I'm not sure I understand what you mean by "transmitted as a whole".
con->v2.out_iter_sendpage is set only when zerocopy is desired.  If the
underlying data is not guaranteed to remain stable, zerocopy behavior
is not safe.

Thanks,

                Ilya
David Howells June 26, 2023, 5:01 p.m. UTC | #4
Ilya Dryomov <idryomov@gmail.com> wrote:

> > Btw, is it feasible to use con->v2.out_iter_sendpage to apply
> > MSG_SPLICE_PAGES to the iterator to be transmitted as a whole?  It seems
> > to be set depending on iterator type.
> 
> I'm not sure I understand what you mean by "transmitted as a whole".
> con->v2.out_iter_sendpage is set only when zerocopy is desired.  If the
> underlying data is not guaranteed to remain stable, zerocopy behavior
> is not safe.

I think I need to reinstate the per-page sendpage_ok() check here also -
though Al pointed out it isn't sufficiently exhaustive.  There are pages that
sendpage_ok() will return true on that you shouldn't be passing to sendpage().

I'll whip up a patch to partially revert this also.

David
diff mbox series

Patch

diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 301a991dc6a6..87ac97073e75 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -117,91 +117,38 @@  static int ceph_tcp_recv(struct ceph_connection *con)
 	return ret;
 }
 
-static int do_sendmsg(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	int ret;
-
-	msg.msg_iter = *it;
-	while (iov_iter_count(it)) {
-		ret = sock_sendmsg(sock, &msg);
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	WARN_ON(msg_data_left(&msg));
-	return 1;
-}
-
-static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	struct bio_vec bv;
-	int ret;
-
-	if (WARN_ON(!iov_iter_is_bvec(it)))
-		return -EINVAL;
-
-	while (iov_iter_count(it)) {
-		/* iov_iter_iovec() for ITER_BVEC */
-		bvec_set_page(&bv, it->bvec->bv_page,
-			      min(iov_iter_count(it),
-				  it->bvec->bv_len - it->iov_offset),
-			      it->bvec->bv_offset + it->iov_offset);
-
-		/*
-		 * sendpage cannot properly handle pages with
-		 * page_count == 0, we need to fall back to sendmsg if
-		 * that's the case.
-		 *
-		 * Same goes for slab pages: skb_can_coalesce() allows
-		 * coalescing neighboring slab objects into a single frag
-		 * which triggers one of hardened usercopy checks.
-		 */
-		if (sendpage_ok(bv.bv_page)) {
-			ret = sock->ops->sendpage(sock, bv.bv_page,
-						  bv.bv_offset, bv.bv_len,
-						  CEPH_MSG_FLAGS);
-		} else {
-			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-			ret = sock_sendmsg(sock, &msg);
-		}
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	return 1;
-}
-
 /*
  * Write as much as possible.  The socket is expected to be corked,
- * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ * so we don't bother with MSG_MORE here.
  *
  * Return:
- *   1 - done, nothing (else) to write
+ *  >0 - done, nothing (else) to write
  *   0 - socket is full, need to wait
  *  <0 - error
  */
 static int ceph_tcp_send(struct ceph_connection *con)
 {
+	struct msghdr msg = {
+		.msg_iter	= con->v2.out_iter,
+		.msg_flags	= CEPH_MSG_FLAGS,
+	};
 	int ret;
 
+	if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
+		return -EINVAL;
+
+	if (con->v2.out_iter_sendpage)
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
 	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
 	     iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
-	if (con->v2.out_iter_sendpage)
-		ret = do_try_sendpage(con->sock, &con->v2.out_iter);
-	else
-		ret = do_sendmsg(con->sock, &con->v2.out_iter);
+
+	ret = sock_sendmsg(con->sock, &msg);
+	if (ret > 0)
+		iov_iter_advance(&con->v2.out_iter, ret);
+	else if (ret == -EAGAIN)
+		ret = 0;
+
 	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
 	     iov_iter_count(&con->v2.out_iter));
 	return ret;