diff mbox series

[RFC,v4,05/17] vsock/virtio: MSG_ZEROCOPY flag support

Message ID 20230603204939.1598818-6-AVKrasnov@sberdevices.ru (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series vsock: MSG_ZEROCOPY flag support | expand

Checks

Context Check Description
netdev/series_format fail Series longer than 15 patches (and no cover letter)
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers success CCed 9 of 9 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Arseniy Krasnov June 3, 2023, 8:49 p.m. UTC
This adds handling of MSG_ZEROCOPY flag on transmission path: if this
flag is set and zerocopy transmission is possible, then non-linear skb
will be created and filled with the pages of user's buffer. Pages of
user's buffer are locked in memory by 'get_user_pages()'.

Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
---
 net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
 1 file changed, 208 insertions(+), 62 deletions(-)

Comments

Stefano Garzarella June 26, 2023, 4:03 p.m. UTC | #1
On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>flag is set and zerocopy transmission is possible, then non-linear skb
>will be created and filled with the pages of user's buffer. Pages of
>user's buffer are locked in memory by 'get_user_pages()'.
>
>Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>---
> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
> 1 file changed, 208 insertions(+), 62 deletions(-)
>
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 0de562c1dc4b..f1ec38c72db7 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
> 	return container_of(t, struct virtio_transport, transport);
> }
>
>-/* Returns a new packet on success, otherwise returns NULL.
>- *
>- * If NULL is returned, errp is set to a negative errno.
>- */
>-static struct sk_buff *
>-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>-			   size_t len,
>-			   u32 src_cid,
>-			   u32 src_port,
>-			   u32 dst_cid,
>-			   u32 dst_port)
>-{
>-	const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>-	struct virtio_vsock_hdr *hdr;
>-	struct sk_buff *skb;
>+static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>+				       size_t max_to_send)
>+{
>+	struct iov_iter *iov_iter;
>+
>+	if (!info->msg)
>+		return false;
>+
>+	iov_iter = &info->msg->msg_iter;
>+
>+	/* Data is simple buffer. */
>+	if (iter_is_ubuf(iov_iter))
>+		return true;
>+
>+	if (!iter_is_iovec(iov_iter))
>+		return false;
>+
>+	if (iov_iter->iov_offset)
>+		return false;
>+
>+	/* We can't send whole iov. */
>+	if (iov_iter->count > max_to_send)
>+		return false;
>+
>+	return true;
>+}
>+
>+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>+					   struct sk_buff *skb,
>+					   struct msghdr *msg,
>+					   bool zerocopy)
>+{
>+	struct ubuf_info *uarg;
>+
>+	if (msg->msg_ubuf) {
>+		uarg = msg->msg_ubuf;
>+		net_zcopy_get(uarg);
>+	} else {
>+		struct iov_iter *iter = &msg->msg_iter;
>+		struct ubuf_info_msgzc *uarg_zc;
>+		int len;
>+
>+		/* Only ITER_IOVEC or ITER_UBUF are allowed and
>+		 * checked before.
>+		 */
>+		if (iter_is_iovec(iter))
>+			len = iov_length(iter->__iov, iter->nr_segs);
>+		else
>+			len = iter->count;
>+
>+		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>+					    len,
>+					    NULL);
>+
>+		if (!uarg)
>+			return -1;
>+
>+		uarg_zc = uarg_to_msgzc(uarg);
>+		uarg_zc->zerocopy = zerocopy ? 1 : 0;
>+	}
>+
>+	skb_zcopy_init(skb, uarg);
>+
>+	return 0;
>+}
>+
>+static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>+					    struct vsock_sock *vsk,

`vsk` seems unused

>+					    struct virtio_vsock_pkt_info *info,
>+					    size_t len)
>+{
> 	void *payload;
> 	int err;
>
>-	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>-	if (!skb)
>-		return NULL;
>+	payload = skb_put(skb, len);
>+	err = memcpy_from_msg(payload, info->msg, len);
>+	if (err)
>+		return -1;
>+
>+	if (msg_data_left(info->msg))
>+		return 0;
>+
>+	return 0;
>+}
>+
>+static void virtio_transport_init_hdr(struct sk_buff *skb,
>+				      struct virtio_vsock_pkt_info *info,
>+				      u32 src_cid,
>+				      u32 src_port,
>+				      u32 dst_cid,
>+				      u32 dst_port,
>+				      size_t len)
>+{
>+	struct virtio_vsock_hdr *hdr;
>
> 	hdr = virtio_vsock_hdr(skb);
> 	hdr->type	= cpu_to_le16(info->type);
>@@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
> 	hdr->dst_port	= cpu_to_le32(dst_port);
> 	hdr->flags	= cpu_to_le32(info->flags);
> 	hdr->len	= cpu_to_le32(len);
>-
>-	if (info->msg && len > 0) {
>-		payload = skb_put(skb, len);
>-		err = memcpy_from_msg(payload, info->msg, len);
>-		if (err)
>-			goto out;
>-
>-		if (msg_data_left(info->msg) == 0 &&
>-		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>-			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>-
>-			if (info->msg->msg_flags & MSG_EOR)
>-				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>-		}
>-	}
>-
>-	if (info->reply)
>-		virtio_vsock_skb_set_reply(skb);
>-
>-	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>-					 dst_cid, dst_port,
>-					 len,
>-					 info->type,
>-					 info->op,
>-					 info->flags);
>-
>-	if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>-		WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>-		goto out;
>-	}
>-
>-	return skb;
>-
>-out:
>-	kfree_skb(skb);
>-	return NULL;
> }
>
> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>@@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
> 		return VIRTIO_VSOCK_TYPE_SEQPACKET;
> }
>
>+/* Returns a new packet on success, otherwise returns NULL.
>+ *
>+ * If NULL is returned, errp is set to a negative errno.

I had noticed this in Bobby's patches, I think it's an old comment we
left around.

>+ */
>+static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>+						  struct virtio_vsock_pkt_info *info,
>+						  size_t payload_len,
>+						  bool zcopy,
>+						  u32 dst_cid,
>+						  u32 dst_port,
>+						  u32 src_cid,
>+						  u32 src_port)
>+{
>+	struct sk_buff *skb;
>+	size_t skb_len;
>+
>+	skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>+
>+	if (!zcopy)
>+		skb_len += payload_len;
>+
>+	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>+	if (!skb)
>+		return NULL;
>+
>+	virtio_transport_init_hdr(skb, info, src_cid, src_port,
>+				  dst_cid, dst_port,
>+				  payload_len);
>+
>+	/* Set owner here, because '__zerocopy_sg_from_iter()' uses
>+	 * owner of skb without check to update 'sk_wmem_alloc'.
>+	 */
>+	if (vsk)
>+		skb_set_owner_w(skb, sk_vsock(vsk));

why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?

We should mention this in the commit description.

>+
>+	if (info->msg && payload_len > 0) {
>+		int err;
>+
>+		if (zcopy) {
>+			err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>+						      &info->msg->msg_iter,
>+						      payload_len);
>+		} else {
>+			err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>+		}
>+
>+		if (err)
>+			goto out;
>+
>+		VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>+
>+		if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>+			struct virtio_vsock_hdr *hdr;
>+
>+			hdr = virtio_vsock_hdr(skb);

Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
fine.

>+
>+			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>+
>+			if (info->msg->msg_flags & MSG_EOR)
>+				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>+		}
>+	}
>+
>+	if (info->reply)
>+		virtio_vsock_skb_set_reply(skb);
>+
>+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>+					 dst_cid, dst_port,
>+					 payload_len,
>+					 info->type,
>+					 info->op,
>+					 info->flags);
>+
>+	return skb;
>+out:
>+	kfree_skb(skb);
>+	return NULL;
>+}
>+
> /* This function can only be used on connecting/connected sockets,
>  * since a socket assigned to a transport is required.
>  *
>@@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> 	const struct virtio_transport *t_ops;
> 	struct virtio_vsock_sock *vvs;
> 	u32 pkt_len = info->pkt_len;
>+	bool can_zcopy = false;
>+	u32 max_skb_cap;
> 	u32 rest_len;
> 	int ret;
>
>@@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> 		return pkt_len;
>
>+	/* If zerocopy is not enabled by 'setsockopt()', we behave as
>+	 * there is no MSG_ZEROCOPY flag set.
>+	 */
>+	if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>+		info->flags &= ~MSG_ZEROCOPY;
>+
>+	if (info->flags & MSG_ZEROCOPY)
>+		can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>+
>+	if (can_zcopy)
>+		max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>+				    (MAX_SKB_FRAGS * PAGE_SIZE));
>+	else
>+		max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>+

We use `len` very often, what about `max_skb_len`?

> 	rest_len = pkt_len;
>
> 	do {
> 		struct sk_buff *skb;
> 		size_t skb_len;
>
>-		skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>+		skb_len = min(max_skb_cap, rest_len);
>
>-		skb = virtio_transport_alloc_skb(info, skb_len,
>-						 src_cid, src_port,
>-						 dst_cid, dst_port);
>+		skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>+						 dst_cid, dst_port,
>+						 src_cid, src_port);
> 		if (!skb) {
> 			ret = -ENOMEM;
> 			break;
> 		}
>
>+		/* This is last skb to send this portion of data. */
>+		if (skb_len == rest_len &&
>+		    info->flags & MSG_ZEROCOPY &&
>+		    info->op == VIRTIO_VSOCK_OP_RW) {
>+			if (virtio_transport_init_zcopy_skb(vsk, skb,
>+							    info->msg,
>+							    can_zcopy)) {
>+				ret = -ENOMEM;
>+				break;
>+			}
>+		}
>+
> 		virtio_transport_inc_tx_pkt(vvs, skb);
>
> 		ret = t_ops->send_pkt(skb);
>@@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
> 		.msg = msg,
> 		.pkt_len = len,
> 		.vsk = vsk,
>+		.flags = msg->msg_flags,

These flags then get copied into the virtio_vsock_hdr, which I don't
think is a good idea.

Why not using directly info->msg->msg_flags?

> 	};
>
> 	return virtio_transport_send_pkt_info(vsk, &info);
>@@ -935,11 +1081,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
> 	if (!t)
> 		return -ENOTCONN;
>
>-	reply = virtio_transport_alloc_skb(&info, 0,
>-					   le64_to_cpu(hdr->dst_cid),
>-					   le32_to_cpu(hdr->dst_port),
>+	reply = virtio_transport_alloc_skb(NULL, &info, 0, false,
> 					   le64_to_cpu(hdr->src_cid),
>-					   le32_to_cpu(hdr->src_port));
>+					   le32_to_cpu(hdr->src_port),
>+					   le64_to_cpu(hdr->dst_cid),
>+					   le32_to_cpu(hdr->dst_port));
> 	if (!reply)
> 		return -ENOMEM;
>
>-- 
>2.25.1
>
Arseniy Krasnov June 27, 2023, 4:41 a.m. UTC | #2
On 26.06.2023 19:03, Stefano Garzarella wrote:
> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>> flag is set and zerocopy transmission is possible, then non-linear skb
>> will be created and filled with the pages of user's buffer. Pages of
>> user's buffer are locked in memory by 'get_user_pages()'.
>>
>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>> ---
>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>
>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>> index 0de562c1dc4b..f1ec38c72db7 100644
>> --- a/net/vmw_vsock/virtio_transport_common.c
>> +++ b/net/vmw_vsock/virtio_transport_common.c
>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>     return container_of(t, struct virtio_transport, transport);
>> }
>>
>> -/* Returns a new packet on success, otherwise returns NULL.
>> - *
>> - * If NULL is returned, errp is set to a negative errno.
>> - */
>> -static struct sk_buff *
>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>> -               size_t len,
>> -               u32 src_cid,
>> -               u32 src_port,
>> -               u32 dst_cid,
>> -               u32 dst_port)
>> -{
>> -    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>> -    struct virtio_vsock_hdr *hdr;
>> -    struct sk_buff *skb;
>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>> +                       size_t max_to_send)
>> +{
>> +    struct iov_iter *iov_iter;
>> +
>> +    if (!info->msg)
>> +        return false;
>> +
>> +    iov_iter = &info->msg->msg_iter;
>> +
>> +    /* Data is simple buffer. */
>> +    if (iter_is_ubuf(iov_iter))
>> +        return true;
>> +
>> +    if (!iter_is_iovec(iov_iter))
>> +        return false;
>> +
>> +    if (iov_iter->iov_offset)
>> +        return false;
>> +
>> +    /* We can't send whole iov. */
>> +    if (iov_iter->count > max_to_send)
>> +        return false;
>> +
>> +    return true;
>> +}
>> +
>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>> +                       struct sk_buff *skb,
>> +                       struct msghdr *msg,
>> +                       bool zerocopy)
>> +{
>> +    struct ubuf_info *uarg;
>> +
>> +    if (msg->msg_ubuf) {
>> +        uarg = msg->msg_ubuf;
>> +        net_zcopy_get(uarg);
>> +    } else {
>> +        struct iov_iter *iter = &msg->msg_iter;
>> +        struct ubuf_info_msgzc *uarg_zc;
>> +        int len;
>> +
>> +        /* Only ITER_IOVEC or ITER_UBUF are allowed and
>> +         * checked before.
>> +         */
>> +        if (iter_is_iovec(iter))
>> +            len = iov_length(iter->__iov, iter->nr_segs);
>> +        else
>> +            len = iter->count;
>> +
>> +        uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>> +                        len,
>> +                        NULL);
>> +
>> +        if (!uarg)
>> +            return -1;
>> +
>> +        uarg_zc = uarg_to_msgzc(uarg);
>> +        uarg_zc->zerocopy = zerocopy ? 1 : 0;
>> +    }
>> +
>> +    skb_zcopy_init(skb, uarg);
>> +
>> +    return 0;
>> +}
>> +
>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>> +                        struct vsock_sock *vsk,
> 
> `vsk` seems unused
> 
>> +                        struct virtio_vsock_pkt_info *info,
>> +                        size_t len)
>> +{
>>     void *payload;
>>     int err;
>>
>> -    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>> -    if (!skb)
>> -        return NULL;
>> +    payload = skb_put(skb, len);
>> +    err = memcpy_from_msg(payload, info->msg, len);
>> +    if (err)
>> +        return -1;
>> +
>> +    if (msg_data_left(info->msg))
>> +        return 0;
>> +
>> +    return 0;
>> +}
>> +
>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>> +                      struct virtio_vsock_pkt_info *info,
>> +                      u32 src_cid,
>> +                      u32 src_port,
>> +                      u32 dst_cid,
>> +                      u32 dst_port,
>> +                      size_t len)
>> +{
>> +    struct virtio_vsock_hdr *hdr;
>>
>>     hdr = virtio_vsock_hdr(skb);
>>     hdr->type    = cpu_to_le16(info->type);
>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>     hdr->dst_port    = cpu_to_le32(dst_port);
>>     hdr->flags    = cpu_to_le32(info->flags);
>>     hdr->len    = cpu_to_le32(len);
>> -
>> -    if (info->msg && len > 0) {
>> -        payload = skb_put(skb, len);
>> -        err = memcpy_from_msg(payload, info->msg, len);
>> -        if (err)
>> -            goto out;
>> -
>> -        if (msg_data_left(info->msg) == 0 &&
>> -            info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>> -            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>> -
>> -            if (info->msg->msg_flags & MSG_EOR)
>> -                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>> -        }
>> -    }
>> -
>> -    if (info->reply)
>> -        virtio_vsock_skb_set_reply(skb);
>> -
>> -    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>> -                     dst_cid, dst_port,
>> -                     len,
>> -                     info->type,
>> -                     info->op,
>> -                     info->flags);
>> -
>> -    if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>> -        WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>> -        goto out;
>> -    }
>> -
>> -    return skb;
>> -
>> -out:
>> -    kfree_skb(skb);
>> -    return NULL;
>> }
>>
>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>         return VIRTIO_VSOCK_TYPE_SEQPACKET;
>> }
>>
>> +/* Returns a new packet on success, otherwise returns NULL.
>> + *
>> + * If NULL is returned, errp is set to a negative errno.
> 
> I had noticed this in Bobby's patches, I think it's an old comment we
> left around.
> 
>> + */
>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>> +                          struct virtio_vsock_pkt_info *info,
>> +                          size_t payload_len,
>> +                          bool zcopy,
>> +                          u32 dst_cid,
>> +                          u32 dst_port,
>> +                          u32 src_cid,
>> +                          u32 src_port)
>> +{
>> +    struct sk_buff *skb;
>> +    size_t skb_len;
>> +
>> +    skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>> +
>> +    if (!zcopy)
>> +        skb_len += payload_len;
>> +
>> +    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>> +    if (!skb)
>> +        return NULL;
>> +
>> +    virtio_transport_init_hdr(skb, info, src_cid, src_port,
>> +                  dst_cid, dst_port,
>> +                  payload_len);
>> +
>> +    /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>> +     * owner of skb without check to update 'sk_wmem_alloc'.
>> +     */
>> +    if (vsk)
>> +        skb_set_owner_w(skb, sk_vsock(vsk));
> 
> why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
> 
> We should mention this in the commit description.
> 
>> +
>> +    if (info->msg && payload_len > 0) {
>> +        int err;
>> +
>> +        if (zcopy) {
>> +            err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>> +                              &info->msg->msg_iter,
>> +                              payload_len);
>> +        } else {
>> +            err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>> +        }
>> +
>> +        if (err)
>> +            goto out;
>> +
>> +        VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>> +
>> +        if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>> +            struct virtio_vsock_hdr *hdr;
>> +
>> +            hdr = virtio_vsock_hdr(skb);
> 
> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
> fine.
> 
>> +
>> +            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>> +
>> +            if (info->msg->msg_flags & MSG_EOR)
>> +                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>> +        }
>> +    }
>> +
>> +    if (info->reply)
>> +        virtio_vsock_skb_set_reply(skb);
>> +
>> +    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>> +                     dst_cid, dst_port,
>> +                     payload_len,
>> +                     info->type,
>> +                     info->op,
>> +                     info->flags);
>> +
>> +    return skb;
>> +out:
>> +    kfree_skb(skb);
>> +    return NULL;
>> +}
>> +
>> /* This function can only be used on connecting/connected sockets,
>>  * since a socket assigned to a transport is required.
>>  *
>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>     const struct virtio_transport *t_ops;
>>     struct virtio_vsock_sock *vvs;
>>     u32 pkt_len = info->pkt_len;
>> +    bool can_zcopy = false;
>> +    u32 max_skb_cap;
>>     u32 rest_len;
>>     int ret;
>>
>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>     if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>         return pkt_len;
>>
>> +    /* If zerocopy is not enabled by 'setsockopt()', we behave as
>> +     * there is no MSG_ZEROCOPY flag set.
>> +     */
>> +    if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>> +        info->flags &= ~MSG_ZEROCOPY;
>> +
>> +    if (info->flags & MSG_ZEROCOPY)
>> +        can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>> +
>> +    if (can_zcopy)
>> +        max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>> +                    (MAX_SKB_FRAGS * PAGE_SIZE));
>> +    else
>> +        max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>> +
> 
> We use `len` very often, what about `max_skb_len`?
> 
>>     rest_len = pkt_len;
>>
>>     do {
>>         struct sk_buff *skb;
>>         size_t skb_len;
>>
>> -        skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>> +        skb_len = min(max_skb_cap, rest_len);
>>
>> -        skb = virtio_transport_alloc_skb(info, skb_len,
>> -                         src_cid, src_port,
>> -                         dst_cid, dst_port);
>> +        skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>> +                         dst_cid, dst_port,
>> +                         src_cid, src_port);
>>         if (!skb) {
>>             ret = -ENOMEM;
>>             break;
>>         }
>>
>> +        /* This is last skb to send this portion of data. */
>> +        if (skb_len == rest_len &&
>> +            info->flags & MSG_ZEROCOPY &&
>> +            info->op == VIRTIO_VSOCK_OP_RW) {
>> +            if (virtio_transport_init_zcopy_skb(vsk, skb,
>> +                                info->msg,
>> +                                can_zcopy)) {
>> +                ret = -ENOMEM;
>> +                break;
>> +            }
>> +        }
>> +
>>         virtio_transport_inc_tx_pkt(vvs, skb);
>>
>>         ret = t_ops->send_pkt(skb);
>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>         .msg = msg,
>>         .pkt_len = len,
>>         .vsk = vsk,
>> +        .flags = msg->msg_flags,
> 
> These flags then get copied into the virtio_vsock_hdr, which I don't
> think is a good idea.
> 
> Why not using directly info->msg->msg_flags?

Ops, yes, it's a bug, You're right, this is really wrong as there are two different
sets of flags - MSG_XXX passed to syscall and flags in the header of packet.

Thanks, Arseniy

> 
>>     };
>>
>>     return virtio_transport_send_pkt_info(vsk, &info);
>> @@ -935,11 +1081,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
>>     if (!t)
>>         return -ENOTCONN;
>>
>> -    reply = virtio_transport_alloc_skb(&info, 0,
>> -                       le64_to_cpu(hdr->dst_cid),
>> -                       le32_to_cpu(hdr->dst_port),
>> +    reply = virtio_transport_alloc_skb(NULL, &info, 0, false,
>>                        le64_to_cpu(hdr->src_cid),
>> -                       le32_to_cpu(hdr->src_port));
>> +                       le32_to_cpu(hdr->src_port),
>> +                       le64_to_cpu(hdr->dst_cid),
>> +                       le32_to_cpu(hdr->dst_port));
>>     if (!reply)
>>         return -ENOMEM;
>>
>> -- 
>> 2.25.1
>>
>
Stefano Garzarella June 27, 2023, 7:50 a.m. UTC | #3
On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:
>
>
>On 26.06.2023 19:03, Stefano Garzarella wrote:
>> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>>> flag is set and zerocopy transmission is possible, then non-linear skb
>>> will be created and filled with the pages of user's buffer. Pages of
>>> user's buffer are locked in memory by 'get_user_pages()'.
>>>
>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>> ---
>>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index 0de562c1dc4b..f1ec38c72db7 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>>     return container_of(t, struct virtio_transport, transport);
>>> }
>>>
>>> -/* Returns a new packet on success, otherwise returns NULL.
>>> - *
>>> - * If NULL is returned, errp is set to a negative errno.
>>> - */
>>> -static struct sk_buff *
>>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>> -               size_t len,
>>> -               u32 src_cid,
>>> -               u32 src_port,
>>> -               u32 dst_cid,
>>> -               u32 dst_port)
>>> -{
>>> -    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>>> -    struct virtio_vsock_hdr *hdr;
>>> -    struct sk_buff *skb;
>>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>>> +                       size_t max_to_send)
>>> +{
>>> +    struct iov_iter *iov_iter;
>>> +
>>> +    if (!info->msg)
>>> +        return false;
>>> +
>>> +    iov_iter = &info->msg->msg_iter;
>>> +
>>> +    /* Data is simple buffer. */
>>> +    if (iter_is_ubuf(iov_iter))
>>> +        return true;
>>> +
>>> +    if (!iter_is_iovec(iov_iter))
>>> +        return false;
>>> +
>>> +    if (iov_iter->iov_offset)
>>> +        return false;
>>> +
>>> +    /* We can't send whole iov. */
>>> +    if (iov_iter->count > max_to_send)
>>> +        return false;
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>>> +                       struct sk_buff *skb,
>>> +                       struct msghdr *msg,
>>> +                       bool zerocopy)
>>> +{
>>> +    struct ubuf_info *uarg;
>>> +
>>> +    if (msg->msg_ubuf) {
>>> +        uarg = msg->msg_ubuf;
>>> +        net_zcopy_get(uarg);
>>> +    } else {
>>> +        struct iov_iter *iter = &msg->msg_iter;
>>> +        struct ubuf_info_msgzc *uarg_zc;
>>> +        int len;
>>> +
>>> +        /* Only ITER_IOVEC or ITER_UBUF are allowed and
>>> +         * checked before.
>>> +         */
>>> +        if (iter_is_iovec(iter))
>>> +            len = iov_length(iter->__iov, iter->nr_segs);
>>> +        else
>>> +            len = iter->count;
>>> +
>>> +        uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>>> +                        len,
>>> +                        NULL);
>>> +
>>> +        if (!uarg)
>>> +            return -1;
>>> +
>>> +        uarg_zc = uarg_to_msgzc(uarg);
>>> +        uarg_zc->zerocopy = zerocopy ? 1 : 0;
>>> +    }
>>> +
>>> +    skb_zcopy_init(skb, uarg);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>>> +                        struct vsock_sock *vsk,
>>
>> `vsk` seems unused
>>
>>> +                        struct virtio_vsock_pkt_info *info,
>>> +                        size_t len)
>>> +{
>>>     void *payload;
>>>     int err;
>>>
>>> -    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> -    if (!skb)
>>> -        return NULL;
>>> +    payload = skb_put(skb, len);
>>> +    err = memcpy_from_msg(payload, info->msg, len);
>>> +    if (err)
>>> +        return -1;
>>> +
>>> +    if (msg_data_left(info->msg))
>>> +        return 0;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>>> +                      struct virtio_vsock_pkt_info *info,
>>> +                      u32 src_cid,
>>> +                      u32 src_port,
>>> +                      u32 dst_cid,
>>> +                      u32 dst_port,
>>> +                      size_t len)
>>> +{
>>> +    struct virtio_vsock_hdr *hdr;
>>>
>>>     hdr = virtio_vsock_hdr(skb);
>>>     hdr->type    = cpu_to_le16(info->type);
>>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>     hdr->dst_port    = cpu_to_le32(dst_port);
>>>     hdr->flags    = cpu_to_le32(info->flags);
>>>     hdr->len    = cpu_to_le32(len);
>>> -
>>> -    if (info->msg && len > 0) {
>>> -        payload = skb_put(skb, len);
>>> -        err = memcpy_from_msg(payload, info->msg, len);
>>> -        if (err)
>>> -            goto out;
>>> -
>>> -        if (msg_data_left(info->msg) == 0 &&
>>> -            info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> -            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> -
>>> -            if (info->msg->msg_flags & MSG_EOR)
>>> -                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> -        }
>>> -    }
>>> -
>>> -    if (info->reply)
>>> -        virtio_vsock_skb_set_reply(skb);
>>> -
>>> -    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> -                     dst_cid, dst_port,
>>> -                     len,
>>> -                     info->type,
>>> -                     info->op,
>>> -                     info->flags);
>>> -
>>> -    if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>>> -        WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>>> -        goto out;
>>> -    }
>>> -
>>> -    return skb;
>>> -
>>> -out:
>>> -    kfree_skb(skb);
>>> -    return NULL;
>>> }
>>>
>>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>>         return VIRTIO_VSOCK_TYPE_SEQPACKET;
>>> }
>>>
>>> +/* Returns a new packet on success, otherwise returns NULL.
>>> + *
>>> + * If NULL is returned, errp is set to a negative errno.
>>
>> I had noticed this in Bobby's patches, I think it's an old comment we
>> left around.
>>
>>> + */
>>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>>> +                          struct virtio_vsock_pkt_info *info,
>>> +                          size_t payload_len,
>>> +                          bool zcopy,
>>> +                          u32 dst_cid,
>>> +                          u32 dst_port,
>>> +                          u32 src_cid,
>>> +                          u32 src_port)
>>> +{
>>> +    struct sk_buff *skb;
>>> +    size_t skb_len;
>>> +
>>> +    skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>>> +
>>> +    if (!zcopy)
>>> +        skb_len += payload_len;
>>> +
>>> +    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> +    if (!skb)
>>> +        return NULL;
>>> +
>>> +    virtio_transport_init_hdr(skb, info, src_cid, src_port,
>>> +                  dst_cid, dst_port,
>>> +                  payload_len);
>>> +
>>> +    /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>>> +     * owner of skb without check to update 'sk_wmem_alloc'.
>>> +     */
>>> +    if (vsk)
>>> +        skb_set_owner_w(skb, sk_vsock(vsk));
>>
>> why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>>
>> We should mention this in the commit description.
>>
>>> +
>>> +    if (info->msg && payload_len > 0) {
>>> +        int err;
>>> +
>>> +        if (zcopy) {
>>> +            err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>>> +                              &info->msg->msg_iter,
>>> +                              payload_len);
>>> +        } else {
>>> +            err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>>> +        }
>>> +
>>> +        if (err)
>>> +            goto out;
>>> +
>>> +        VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>>> +
>>> +        if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> +            struct virtio_vsock_hdr *hdr;
>>> +
>>> +            hdr = virtio_vsock_hdr(skb);
>>
>> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
>> fine.
>>
>>> +
>>> +            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> +
>>> +            if (info->msg->msg_flags & MSG_EOR)
>>> +                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> +        }
>>> +    }
>>> +
>>> +    if (info->reply)
>>> +        virtio_vsock_skb_set_reply(skb);
>>> +
>>> +    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> +                     dst_cid, dst_port,
>>> +                     payload_len,
>>> +                     info->type,
>>> +                     info->op,
>>> +                     info->flags);
>>> +
>>> +    return skb;
>>> +out:
>>> +    kfree_skb(skb);
>>> +    return NULL;
>>> +}
>>> +
>>> /* This function can only be used on connecting/connected sockets,
>>>  * since a socket assigned to a transport is required.
>>>  *
>>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>     const struct virtio_transport *t_ops;
>>>     struct virtio_vsock_sock *vvs;
>>>     u32 pkt_len = info->pkt_len;
>>> +    bool can_zcopy = false;
>>> +    u32 max_skb_cap;
>>>     u32 rest_len;
>>>     int ret;
>>>
>>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>     if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>>         return pkt_len;
>>>
>>> +    /* If zerocopy is not enabled by 'setsockopt()', we behave as
>>> +     * there is no MSG_ZEROCOPY flag set.
>>> +     */
>>> +    if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>>> +        info->flags &= ~MSG_ZEROCOPY;
>>> +
>>> +    if (info->flags & MSG_ZEROCOPY)
>>> +        can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>>> +
>>> +    if (can_zcopy)
>>> +        max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>>> +                    (MAX_SKB_FRAGS * PAGE_SIZE));
>>> +    else
>>> +        max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>>> +
>>
>> We use `len` very often, what about `max_skb_len`?
>>
>>>     rest_len = pkt_len;
>>>
>>>     do {
>>>         struct sk_buff *skb;
>>>         size_t skb_len;
>>>
>>> -        skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>>> +        skb_len = min(max_skb_cap, rest_len);
>>>
>>> -        skb = virtio_transport_alloc_skb(info, skb_len,
>>> -                         src_cid, src_port,
>>> -                         dst_cid, dst_port);
>>> +        skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>>> +                         dst_cid, dst_port,
>>> +                         src_cid, src_port);
>>>         if (!skb) {
>>>             ret = -ENOMEM;
>>>             break;
>>>         }
>>>
>>> +        /* This is last skb to send this portion of data. */
>>> +        if (skb_len == rest_len &&
>>> +            info->flags & MSG_ZEROCOPY &&
>>> +            info->op == VIRTIO_VSOCK_OP_RW) {
>>> +            if (virtio_transport_init_zcopy_skb(vsk, skb,
>>> +                                info->msg,
>>> +                                can_zcopy)) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>> +        }
>>> +
>>>         virtio_transport_inc_tx_pkt(vvs, skb);
>>>
>>>         ret = t_ops->send_pkt(skb);
>>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>>         .msg = msg,
>>>         .pkt_len = len,
>>>         .vsk = vsk,
>>> +        .flags = msg->msg_flags,
>>
>> These flags then get copied into the virtio_vsock_hdr, which I don't
>> think is a good idea.
>>
>> Why not using directly info->msg->msg_flags?
>
>Ops, yes, it's a bug, You're right, this is really wrong as there are two different
>sets of flags - MSG_XXX passed to syscall and flags in the header of packet.

Yep.

What about the moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
Was it voluntary? If so, can you explain why?


Thanks,
Stefano
Arseniy Krasnov June 27, 2023, 8:22 a.m. UTC | #4
On 27.06.2023 10:50, Stefano Garzarella wrote:
> On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:
>>
>>
>> On 26.06.2023 19:03, Stefano Garzarella wrote:
>>> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>>>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>>>> flag is set and zerocopy transmission is possible, then non-linear skb
>>>> will be created and filled with the pages of user's buffer. Pages of
>>>> user's buffer are locked in memory by 'get_user_pages()'.
>>>>
>>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>>> ---
>>>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>>>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>>>
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>> index 0de562c1dc4b..f1ec38c72db7 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>>>     return container_of(t, struct virtio_transport, transport);
>>>> }
>>>>
>>>> -/* Returns a new packet on success, otherwise returns NULL.
>>>> - *
>>>> - * If NULL is returned, errp is set to a negative errno.
>>>> - */
>>>> -static struct sk_buff *
>>>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>> -               size_t len,
>>>> -               u32 src_cid,
>>>> -               u32 src_port,
>>>> -               u32 dst_cid,
>>>> -               u32 dst_port)
>>>> -{
>>>> -    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>>>> -    struct virtio_vsock_hdr *hdr;
>>>> -    struct sk_buff *skb;
>>>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>>>> +                       size_t max_to_send)
>>>> +{
>>>> +    struct iov_iter *iov_iter;
>>>> +
>>>> +    if (!info->msg)
>>>> +        return false;
>>>> +
>>>> +    iov_iter = &info->msg->msg_iter;
>>>> +
>>>> +    /* Data is simple buffer. */
>>>> +    if (iter_is_ubuf(iov_iter))
>>>> +        return true;
>>>> +
>>>> +    if (!iter_is_iovec(iov_iter))
>>>> +        return false;
>>>> +
>>>> +    if (iov_iter->iov_offset)
>>>> +        return false;
>>>> +
>>>> +    /* We can't send whole iov. */
>>>> +    if (iov_iter->count > max_to_send)
>>>> +        return false;
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>>>> +                       struct sk_buff *skb,
>>>> +                       struct msghdr *msg,
>>>> +                       bool zerocopy)
>>>> +{
>>>> +    struct ubuf_info *uarg;
>>>> +
>>>> +    if (msg->msg_ubuf) {
>>>> +        uarg = msg->msg_ubuf;
>>>> +        net_zcopy_get(uarg);
>>>> +    } else {
>>>> +        struct iov_iter *iter = &msg->msg_iter;
>>>> +        struct ubuf_info_msgzc *uarg_zc;
>>>> +        int len;
>>>> +
>>>> +        /* Only ITER_IOVEC or ITER_UBUF are allowed and
>>>> +         * checked before.
>>>> +         */
>>>> +        if (iter_is_iovec(iter))
>>>> +            len = iov_length(iter->__iov, iter->nr_segs);
>>>> +        else
>>>> +            len = iter->count;
>>>> +
>>>> +        uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>>>> +                        len,
>>>> +                        NULL);
>>>> +
>>>> +        if (!uarg)
>>>> +            return -1;
>>>> +
>>>> +        uarg_zc = uarg_to_msgzc(uarg);
>>>> +        uarg_zc->zerocopy = zerocopy ? 1 : 0;
>>>> +    }
>>>> +
>>>> +    skb_zcopy_init(skb, uarg);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>>>> +                        struct vsock_sock *vsk,
>>>
>>> `vsk` seems unused
>>>
>>>> +                        struct virtio_vsock_pkt_info *info,
>>>> +                        size_t len)
>>>> +{
>>>>     void *payload;
>>>>     int err;
>>>>
>>>> -    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>>> -    if (!skb)
>>>> -        return NULL;
>>>> +    payload = skb_put(skb, len);
>>>> +    err = memcpy_from_msg(payload, info->msg, len);
>>>> +    if (err)
>>>> +        return -1;
>>>> +
>>>> +    if (msg_data_left(info->msg))
>>>> +        return 0;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>>>> +                      struct virtio_vsock_pkt_info *info,
>>>> +                      u32 src_cid,
>>>> +                      u32 src_port,
>>>> +                      u32 dst_cid,
>>>> +                      u32 dst_port,
>>>> +                      size_t len)
>>>> +{
>>>> +    struct virtio_vsock_hdr *hdr;
>>>>
>>>>     hdr = virtio_vsock_hdr(skb);
>>>>     hdr->type    = cpu_to_le16(info->type);
>>>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>>     hdr->dst_port    = cpu_to_le32(dst_port);
>>>>     hdr->flags    = cpu_to_le32(info->flags);
>>>>     hdr->len    = cpu_to_le32(len);
>>>> -
>>>> -    if (info->msg && len > 0) {
>>>> -        payload = skb_put(skb, len);
>>>> -        err = memcpy_from_msg(payload, info->msg, len);
>>>> -        if (err)
>>>> -            goto out;
>>>> -
>>>> -        if (msg_data_left(info->msg) == 0 &&
>>>> -            info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>>> -            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>>> -
>>>> -            if (info->msg->msg_flags & MSG_EOR)
>>>> -                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>>> -        }
>>>> -    }
>>>> -
>>>> -    if (info->reply)
>>>> -        virtio_vsock_skb_set_reply(skb);
>>>> -
>>>> -    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>> -                     dst_cid, dst_port,
>>>> -                     len,
>>>> -                     info->type,
>>>> -                     info->op,
>>>> -                     info->flags);
>>>> -
>>>> -    if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>>>> -        WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>>>> -        goto out;
>>>> -    }
>>>> -
>>>> -    return skb;
>>>> -
>>>> -out:
>>>> -    kfree_skb(skb);
>>>> -    return NULL;
>>>> }
>>>>
>>>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>>>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>>>         return VIRTIO_VSOCK_TYPE_SEQPACKET;
>>>> }
>>>>
>>>> +/* Returns a new packet on success, otherwise returns NULL.
>>>> + *
>>>> + * If NULL is returned, errp is set to a negative errno.
>>>
>>> I had noticed this in Bobby's patches, I think it's an old comment we
>>> left around.
>>>
>>>> + */
>>>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>>>> +                          struct virtio_vsock_pkt_info *info,
>>>> +                          size_t payload_len,
>>>> +                          bool zcopy,
>>>> +                          u32 dst_cid,
>>>> +                          u32 dst_port,
>>>> +                          u32 src_cid,
>>>> +                          u32 src_port)
>>>> +{
>>>> +    struct sk_buff *skb;
>>>> +    size_t skb_len;
>>>> +
>>>> +    skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>>>> +
>>>> +    if (!zcopy)
>>>> +        skb_len += payload_len;
>>>> +
>>>> +    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>>> +    if (!skb)
>>>> +        return NULL;
>>>> +
>>>> +    virtio_transport_init_hdr(skb, info, src_cid, src_port,
>>>> +                  dst_cid, dst_port,
>>>> +                  payload_len);
>>>> +
>>>> +    /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>>>> +     * owner of skb without check to update 'sk_wmem_alloc'.
>>>> +     */
>>>> +    if (vsk)
>>>> +        skb_set_owner_w(skb, sk_vsock(vsk));
>>>
>>> why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>>>
>>> We should mention this in the commit description.
>>>
>>>> +
>>>> +    if (info->msg && payload_len > 0) {
>>>> +        int err;
>>>> +
>>>> +        if (zcopy) {
>>>> +            err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>>>> +                              &info->msg->msg_iter,
>>>> +                              payload_len);
>>>> +        } else {
>>>> +            err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>>>> +        }
>>>> +
>>>> +        if (err)
>>>> +            goto out;
>>>> +
>>>> +        VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>>>> +
>>>> +        if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>>> +            struct virtio_vsock_hdr *hdr;
>>>> +
>>>> +            hdr = virtio_vsock_hdr(skb);
>>>
>>> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
>>> fine.
>>>
>>>> +
>>>> +            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>>> +
>>>> +            if (info->msg->msg_flags & MSG_EOR)
>>>> +                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (info->reply)
>>>> +        virtio_vsock_skb_set_reply(skb);
>>>> +
>>>> +    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>> +                     dst_cid, dst_port,
>>>> +                     payload_len,
>>>> +                     info->type,
>>>> +                     info->op,
>>>> +                     info->flags);
>>>> +
>>>> +    return skb;
>>>> +out:
>>>> +    kfree_skb(skb);
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> /* This function can only be used on connecting/connected sockets,
>>>>  * since a socket assigned to a transport is required.
>>>>  *
>>>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>>     const struct virtio_transport *t_ops;
>>>>     struct virtio_vsock_sock *vvs;
>>>>     u32 pkt_len = info->pkt_len;
>>>> +    bool can_zcopy = false;
>>>> +    u32 max_skb_cap;
>>>>     u32 rest_len;
>>>>     int ret;
>>>>
>>>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>>     if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>>>         return pkt_len;
>>>>
>>>> +    /* If zerocopy is not enabled by 'setsockopt()', we behave as
>>>> +     * there is no MSG_ZEROCOPY flag set.
>>>> +     */
>>>> +    if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>>>> +        info->flags &= ~MSG_ZEROCOPY;
>>>> +
>>>> +    if (info->flags & MSG_ZEROCOPY)
>>>> +        can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>>>> +
>>>> +    if (can_zcopy)
>>>> +        max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>>>> +                    (MAX_SKB_FRAGS * PAGE_SIZE));
>>>> +    else
>>>> +        max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>>>> +
>>>
>>> We use `len` very often, what about `max_skb_len`?
>>>
>>>>     rest_len = pkt_len;
>>>>
>>>>     do {
>>>>         struct sk_buff *skb;
>>>>         size_t skb_len;
>>>>
>>>> -        skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>>>> +        skb_len = min(max_skb_cap, rest_len);
>>>>
>>>> -        skb = virtio_transport_alloc_skb(info, skb_len,
>>>> -                         src_cid, src_port,
>>>> -                         dst_cid, dst_port);
>>>> +        skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>>>> +                         dst_cid, dst_port,
>>>> +                         src_cid, src_port);
>>>>         if (!skb) {
>>>>             ret = -ENOMEM;
>>>>             break;
>>>>         }
>>>>
>>>> +        /* This is last skb to send this portion of data. */
>>>> +        if (skb_len == rest_len &&
>>>> +            info->flags & MSG_ZEROCOPY &&
>>>> +            info->op == VIRTIO_VSOCK_OP_RW) {
>>>> +            if (virtio_transport_init_zcopy_skb(vsk, skb,
>>>> +                                info->msg,
>>>> +                                can_zcopy)) {
>>>> +                ret = -ENOMEM;
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +
>>>>         virtio_transport_inc_tx_pkt(vvs, skb);
>>>>
>>>>         ret = t_ops->send_pkt(skb);
>>>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>>>         .msg = msg,
>>>>         .pkt_len = len,
>>>>         .vsk = vsk,
>>>> +        .flags = msg->msg_flags,
>>>
>>> These flags then get copied into the virtio_vsock_hdr, which I don't
>>> think is a good idea.
>>>
>>> Why not using directly info->msg->msg_flags?
>>
>> Ops, yes, it's a bug, You're right, this is really wrong as there are two different
>> sets of flags - MSG_XXX passed to syscall and flags in the header of packet.
> 
> Yep.
> 
> What about the moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
> Was it voluntary? If so, can you explain why?

Sure! here is what happens: difference between these functions is that 'skb_set_owner_w()'
sets another destructor - 'sock_wfree()':

https://elixir.bootlin.com/linux/v6.4/source/net/core/sock.c#L2408

This destructor also updates 'sk->sk_wmem_alloc' during skb freeing. Why this is needed?
Because '__zerocopy_sg_from_iter()' updates 'sk_wmem_alloc' if socket is NOT SOCK_STREAM
or socket was NOT passed as input argument (it is not obvious why, but it is so):

https://elixir.bootlin.com/linux/latest/source/net/core/datagram.c#L646

So to support both types of sockets (seqpacket and stream) I decided to try pass NULL always
as socket argument to '__zerocopy_sg_from_iter()', with this 'sk_wmem_alloc' will be always
updated in '__zerocopy_sg_from_iter()' and then we decrease 'sk_wmem_alloc' in 'sock_wfree()'.

May be this looks strange - I tried to understand logic of this code in '__zerocopy_sg_from_iter()',
but it requires more deep analysis. Anyway it works:)

Thanks, Arseniy

> 
> 
> Thanks,
> Stefano
>
Stefano Garzarella June 29, 2023, 12:32 p.m. UTC | #5
On Tue, Jun 27, 2023 at 11:22:52AM +0300, Arseniy Krasnov wrote:
>
>
>On 27.06.2023 10:50, Stefano Garzarella wrote:
>> On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:
>>>
>>>
>>> On 26.06.2023 19:03, Stefano Garzarella wrote:
>>>> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>>>>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>>>>> flag is set and zerocopy transmission is possible, then non-linear skb
>>>>> will be created and filled with the pages of user's buffer. Pages of
>>>>> user's buffer are locked in memory by 'get_user_pages()'.
>>>>>
>>>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>>>> ---
>>>>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>>>>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>>>>
>>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>>> index 0de562c1dc4b..f1ec38c72db7 100644
>>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>>>>     return container_of(t, struct virtio_transport, transport);
>>>>> }
>>>>>
>>>>> -/* Returns a new packet on success, otherwise returns NULL.
>>>>> - *
>>>>> - * If NULL is returned, errp is set to a negative errno.
>>>>> - */
>>>>> -static struct sk_buff *
>>>>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>>> -               size_t len,
>>>>> -               u32 src_cid,
>>>>> -               u32 src_port,
>>>>> -               u32 dst_cid,
>>>>> -               u32 dst_port)
>>>>> -{
>>>>> -    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>>>>> -    struct virtio_vsock_hdr *hdr;
>>>>> -    struct sk_buff *skb;
>>>>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>>>>> +                       size_t max_to_send)
>>>>> +{
>>>>> +    struct iov_iter *iov_iter;
>>>>> +
>>>>> +    if (!info->msg)
>>>>> +        return false;
>>>>> +
>>>>> +    iov_iter = &info->msg->msg_iter;
>>>>> +
>>>>> +    /* Data is simple buffer. */
>>>>> +    if (iter_is_ubuf(iov_iter))
>>>>> +        return true;
>>>>> +
>>>>> +    if (!iter_is_iovec(iov_iter))
>>>>> +        return false;
>>>>> +
>>>>> +    if (iov_iter->iov_offset)
>>>>> +        return false;
>>>>> +
>>>>> +    /* We can't send whole iov. */
>>>>> +    if (iov_iter->count > max_to_send)
>>>>> +        return false;
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>>>>> +                       struct sk_buff *skb,
>>>>> +                       struct msghdr *msg,
>>>>> +                       bool zerocopy)
>>>>> +{
>>>>> +    struct ubuf_info *uarg;
>>>>> +
>>>>> +    if (msg->msg_ubuf) {
>>>>> +        uarg = msg->msg_ubuf;
>>>>> +        net_zcopy_get(uarg);
>>>>> +    } else {
>>>>> +        struct iov_iter *iter = &msg->msg_iter;
>>>>> +        struct ubuf_info_msgzc *uarg_zc;
>>>>> +        int len;
>>>>> +
>>>>> +        /* Only ITER_IOVEC or ITER_UBUF are allowed and
>>>>> +         * checked before.
>>>>> +         */
>>>>> +        if (iter_is_iovec(iter))
>>>>> +            len = iov_length(iter->__iov, iter->nr_segs);
>>>>> +        else
>>>>> +            len = iter->count;
>>>>> +
>>>>> +        uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>>>>> +                        len,
>>>>> +                        NULL);
>>>>> +
>>>>> +        if (!uarg)
>>>>> +            return -1;
>>>>> +
>>>>> +        uarg_zc = uarg_to_msgzc(uarg);
>>>>> +        uarg_zc->zerocopy = zerocopy ? 1 : 0;
>>>>> +    }
>>>>> +
>>>>> +    skb_zcopy_init(skb, uarg);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>>>>> +                        struct vsock_sock *vsk,
>>>>
>>>> `vsk` seems unused
>>>>
>>>>> +                        struct virtio_vsock_pkt_info *info,
>>>>> +                        size_t len)
>>>>> +{
>>>>>     void *payload;
>>>>>     int err;
>>>>>
>>>>> -    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>>>> -    if (!skb)
>>>>> -        return NULL;
>>>>> +    payload = skb_put(skb, len);
>>>>> +    err = memcpy_from_msg(payload, info->msg, len);
>>>>> +    if (err)
>>>>> +        return -1;
>>>>> +
>>>>> +    if (msg_data_left(info->msg))
>>>>> +        return 0;
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>>>>> +                      struct virtio_vsock_pkt_info *info,
>>>>> +                      u32 src_cid,
>>>>> +                      u32 src_port,
>>>>> +                      u32 dst_cid,
>>>>> +                      u32 dst_port,
>>>>> +                      size_t len)
>>>>> +{
>>>>> +    struct virtio_vsock_hdr *hdr;
>>>>>
>>>>>     hdr = virtio_vsock_hdr(skb);
>>>>>     hdr->type    = cpu_to_le16(info->type);
>>>>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>>>     hdr->dst_port    = cpu_to_le32(dst_port);
>>>>>     hdr->flags    = cpu_to_le32(info->flags);
>>>>>     hdr->len    = cpu_to_le32(len);
>>>>> -
>>>>> -    if (info->msg && len > 0) {
>>>>> -        payload = skb_put(skb, len);
>>>>> -        err = memcpy_from_msg(payload, info->msg, len);
>>>>> -        if (err)
>>>>> -            goto out;
>>>>> -
>>>>> -        if (msg_data_left(info->msg) == 0 &&
>>>>> -            info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>>>> -            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>>>> -
>>>>> -            if (info->msg->msg_flags & MSG_EOR)
>>>>> -                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>>>> -        }
>>>>> -    }
>>>>> -
>>>>> -    if (info->reply)
>>>>> -        virtio_vsock_skb_set_reply(skb);
>>>>> -
>>>>> -    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>>> -                     dst_cid, dst_port,
>>>>> -                     len,
>>>>> -                     info->type,
>>>>> -                     info->op,
>>>>> -                     info->flags);
>>>>> -
>>>>> -    if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>>>>> -        WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>>>>> -        goto out;
>>>>> -    }
>>>>> -
>>>>> -    return skb;
>>>>> -
>>>>> -out:
>>>>> -    kfree_skb(skb);
>>>>> -    return NULL;
>>>>> }
>>>>>
>>>>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>>>>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>>>>         return VIRTIO_VSOCK_TYPE_SEQPACKET;
>>>>> }
>>>>>
>>>>> +/* Returns a new packet on success, otherwise returns NULL.
>>>>> + *
>>>>> + * If NULL is returned, errp is set to a negative errno.
>>>>
>>>> I had noticed this in Bobby's patches, I think it's an old comment we
>>>> left around.
>>>>
>>>>> + */
>>>>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>>>>> +                          struct virtio_vsock_pkt_info *info,
>>>>> +                          size_t payload_len,
>>>>> +                          bool zcopy,
>>>>> +                          u32 dst_cid,
>>>>> +                          u32 dst_port,
>>>>> +                          u32 src_cid,
>>>>> +                          u32 src_port)
>>>>> +{
>>>>> +    struct sk_buff *skb;
>>>>> +    size_t skb_len;
>>>>> +
>>>>> +    skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>>>>> +
>>>>> +    if (!zcopy)
>>>>> +        skb_len += payload_len;
>>>>> +
>>>>> +    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>>>> +    if (!skb)
>>>>> +        return NULL;
>>>>> +
>>>>> +    virtio_transport_init_hdr(skb, info, src_cid, src_port,
>>>>> +                  dst_cid, dst_port,
>>>>> +                  payload_len);
>>>>> +
>>>>> +    /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>>>>> +     * owner of skb without check to update 'sk_wmem_alloc'.
>>>>> +     */
>>>>> +    if (vsk)
>>>>> +        skb_set_owner_w(skb, sk_vsock(vsk));
>>>>
>>>> why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>>>>
>>>> We should mention this in the commit description.
>>>>
>>>>> +
>>>>> +    if (info->msg && payload_len > 0) {
>>>>> +        int err;
>>>>> +
>>>>> +        if (zcopy) {
>>>>> +            err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>>>>> +                              &info->msg->msg_iter,
>>>>> +                              payload_len);
>>>>> +        } else {
>>>>> +            err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>>>>> +        }
>>>>> +
>>>>> +        if (err)
>>>>> +            goto out;
>>>>> +
>>>>> +        VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>>>>> +
>>>>> +        if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>>>> +            struct virtio_vsock_hdr *hdr;
>>>>> +
>>>>> +            hdr = virtio_vsock_hdr(skb);
>>>>
>>>> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
>>>> fine.
>>>>
>>>>> +
>>>>> +            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>>>> +
>>>>> +            if (info->msg->msg_flags & MSG_EOR)
>>>>> +                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    if (info->reply)
>>>>> +        virtio_vsock_skb_set_reply(skb);
>>>>> +
>>>>> +    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>>> +                     dst_cid, dst_port,
>>>>> +                     payload_len,
>>>>> +                     info->type,
>>>>> +                     info->op,
>>>>> +                     info->flags);
>>>>> +
>>>>> +    return skb;
>>>>> +out:
>>>>> +    kfree_skb(skb);
>>>>> +    return NULL;
>>>>> +}
>>>>> +
>>>>> /* This function can only be used on connecting/connected sockets,
>>>>>  * since a socket assigned to a transport is required.
>>>>>  *
>>>>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>>>     const struct virtio_transport *t_ops;
>>>>>     struct virtio_vsock_sock *vvs;
>>>>>     u32 pkt_len = info->pkt_len;
>>>>> +    bool can_zcopy = false;
>>>>> +    u32 max_skb_cap;
>>>>>     u32 rest_len;
>>>>>     int ret;
>>>>>
>>>>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>>>     if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>>>>         return pkt_len;
>>>>>
>>>>> +    /* If zerocopy is not enabled by 'setsockopt()', we behave as
>>>>> +     * there is no MSG_ZEROCOPY flag set.
>>>>> +     */
>>>>> +    if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>>>>> +        info->flags &= ~MSG_ZEROCOPY;
>>>>> +
>>>>> +    if (info->flags & MSG_ZEROCOPY)
>>>>> +        can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>>>>> +
>>>>> +    if (can_zcopy)
>>>>> +        max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>>>>> +                    (MAX_SKB_FRAGS * PAGE_SIZE));
>>>>> +    else
>>>>> +        max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>>>>> +
>>>>
>>>> We use `len` very often, what about `max_skb_len`?
>>>>
>>>>>     rest_len = pkt_len;
>>>>>
>>>>>     do {
>>>>>         struct sk_buff *skb;
>>>>>         size_t skb_len;
>>>>>
>>>>> -        skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>>>>> +        skb_len = min(max_skb_cap, rest_len);
>>>>>
>>>>> -        skb = virtio_transport_alloc_skb(info, skb_len,
>>>>> -                         src_cid, src_port,
>>>>> -                         dst_cid, dst_port);
>>>>> +        skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>>>>> +                         dst_cid, dst_port,
>>>>> +                         src_cid, src_port);
>>>>>         if (!skb) {
>>>>>             ret = -ENOMEM;
>>>>>             break;
>>>>>         }
>>>>>
>>>>> +        /* This is last skb to send this portion of data. */
>>>>> +        if (skb_len == rest_len &&
>>>>> +            info->flags & MSG_ZEROCOPY &&
>>>>> +            info->op == VIRTIO_VSOCK_OP_RW) {
>>>>> +            if (virtio_transport_init_zcopy_skb(vsk, skb,
>>>>> +                                info->msg,
>>>>> +                                can_zcopy)) {
>>>>> +                ret = -ENOMEM;
>>>>> +                break;
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>>         virtio_transport_inc_tx_pkt(vvs, skb);
>>>>>
>>>>>         ret = t_ops->send_pkt(skb);
>>>>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>>>>         .msg = msg,
>>>>>         .pkt_len = len,
>>>>>         .vsk = vsk,
>>>>> +        .flags = msg->msg_flags,
>>>>
>>>> These flags then get copied into the virtio_vsock_hdr, which I don't
>>>> think is a good idea.
>>>>
>>>> Why not using directly info->msg->msg_flags?
>>>
>>> Ops, yes, it's a bug, You're right, this is really wrong as there are two different
>>> sets of flags - MSG_XXX passed to syscall and flags in the header of packet.
>>
>> Yep.
>>
>> What about the moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>> Was it voluntary? If so, can you explain why?
>
>Sure! here is what happens: difference between these functions is that 'skb_set_owner_w()'
>sets another destructor - 'sock_wfree()':
>
>https://elixir.bootlin.com/linux/v6.4/source/net/core/sock.c#L2408
>
>This destructor also updates 'sk->sk_wmem_alloc' during skb freeing. Why this is needed?
>Because '__zerocopy_sg_from_iter()' updates 'sk_wmem_alloc' if socket is NOT SOCK_STREAM
>or socket was NOT passed as input argument (it is not obvious why, but it is so):
>
>https://elixir.bootlin.com/linux/latest/source/net/core/datagram.c#L646
>
>So to support both types of sockets (seqpacket and stream) I decided to try pass NULL always
>as socket argument to '__zerocopy_sg_from_iter()', with this 'sk_wmem_alloc' will be always
>updated in '__zerocopy_sg_from_iter()' and then we decrease 'sk_wmem_alloc' in 'sock_wfree()'.
>
>May be this looks strange - I tried to understand logic of this code in '__zerocopy_sg_from_iter()',
>but it requires more deep analysis. Anyway it works:)

Yep, it should require also more analysis on my side :-)

Please document it in the commit description, so if we will find any
issue in the future, we know why we did this change.

Thanks,
Stefano
diff mbox series

Patch

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 0de562c1dc4b..f1ec38c72db7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -37,27 +37,100 @@  virtio_transport_get_ops(struct vsock_sock *vsk)
 	return container_of(t, struct virtio_transport, transport);
 }
 
-/* Returns a new packet on success, otherwise returns NULL.
- *
- * If NULL is returned, errp is set to a negative errno.
- */
-static struct sk_buff *
-virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
-			   size_t len,
-			   u32 src_cid,
-			   u32 src_port,
-			   u32 dst_cid,
-			   u32 dst_port)
-{
-	const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
-	struct virtio_vsock_hdr *hdr;
-	struct sk_buff *skb;
+static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
+				       size_t max_to_send)
+{
+	struct iov_iter *iov_iter;
+
+	if (!info->msg)
+		return false;
+
+	iov_iter = &info->msg->msg_iter;
+
+	/* Data is simple buffer. */
+	if (iter_is_ubuf(iov_iter))
+		return true;
+
+	if (!iter_is_iovec(iov_iter))
+		return false;
+
+	if (iov_iter->iov_offset)
+		return false;
+
+	/* We can't send whole iov. */
+	if (iov_iter->count > max_to_send)
+		return false;
+
+	return true;
+}
+
+static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
+					   struct sk_buff *skb,
+					   struct msghdr *msg,
+					   bool zerocopy)
+{
+	struct ubuf_info *uarg;
+
+	if (msg->msg_ubuf) {
+		uarg = msg->msg_ubuf;
+		net_zcopy_get(uarg);
+	} else {
+		struct iov_iter *iter = &msg->msg_iter;
+		struct ubuf_info_msgzc *uarg_zc;
+		int len;
+
+		/* Only ITER_IOVEC or ITER_UBUF are allowed and
+		 * checked before.
+		 */
+		if (iter_is_iovec(iter))
+			len = iov_length(iter->__iov, iter->nr_segs);
+		else
+			len = iter->count;
+
+		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
+					    len,
+					    NULL);
+
+		if (!uarg)
+			return -1;
+
+		uarg_zc = uarg_to_msgzc(uarg);
+		uarg_zc->zerocopy = zerocopy ? 1 : 0;
+	}
+
+	skb_zcopy_init(skb, uarg);
+
+	return 0;
+}
+
+static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
+					    struct vsock_sock *vsk,
+					    struct virtio_vsock_pkt_info *info,
+					    size_t len)
+{
 	void *payload;
 	int err;
 
-	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
-	if (!skb)
-		return NULL;
+	payload = skb_put(skb, len);
+	err = memcpy_from_msg(payload, info->msg, len);
+	if (err)
+		return -1;
+
+	if (msg_data_left(info->msg))
+		return 0;
+
+	return 0;
+}
+
+static void virtio_transport_init_hdr(struct sk_buff *skb,
+				      struct virtio_vsock_pkt_info *info,
+				      u32 src_cid,
+				      u32 src_port,
+				      u32 dst_cid,
+				      u32 dst_port,
+				      size_t len)
+{
+	struct virtio_vsock_hdr *hdr;
 
 	hdr = virtio_vsock_hdr(skb);
 	hdr->type	= cpu_to_le16(info->type);
@@ -68,42 +141,6 @@  virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
 	hdr->dst_port	= cpu_to_le32(dst_port);
 	hdr->flags	= cpu_to_le32(info->flags);
 	hdr->len	= cpu_to_le32(len);
-
-	if (info->msg && len > 0) {
-		payload = skb_put(skb, len);
-		err = memcpy_from_msg(payload, info->msg, len);
-		if (err)
-			goto out;
-
-		if (msg_data_left(info->msg) == 0 &&
-		    info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
-			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
-
-			if (info->msg->msg_flags & MSG_EOR)
-				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
-		}
-	}
-
-	if (info->reply)
-		virtio_vsock_skb_set_reply(skb);
-
-	trace_virtio_transport_alloc_pkt(src_cid, src_port,
-					 dst_cid, dst_port,
-					 len,
-					 info->type,
-					 info->op,
-					 info->flags);
-
-	if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
-		WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
-		goto out;
-	}
-
-	return skb;
-
-out:
-	kfree_skb(skb);
-	return NULL;
 }
 
 static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
@@ -214,6 +251,85 @@  static u16 virtio_transport_get_type(struct sock *sk)
 		return VIRTIO_VSOCK_TYPE_SEQPACKET;
 }
 
+/* Returns a new packet on success, otherwise returns NULL.
+ *
+ * If NULL is returned, errp is set to a negative errno.
+ */
+static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
+						  struct virtio_vsock_pkt_info *info,
+						  size_t payload_len,
+						  bool zcopy,
+						  u32 dst_cid,
+						  u32 dst_port,
+						  u32 src_cid,
+						  u32 src_port)
+{
+	struct sk_buff *skb;
+	size_t skb_len;
+
+	skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
+
+	if (!zcopy)
+		skb_len += payload_len;
+
+	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	virtio_transport_init_hdr(skb, info, src_cid, src_port,
+				  dst_cid, dst_port,
+				  payload_len);
+
+	/* Set owner here, because '__zerocopy_sg_from_iter()' uses
+	 * owner of skb without check to update 'sk_wmem_alloc'.
+	 */
+	if (vsk)
+		skb_set_owner_w(skb, sk_vsock(vsk));
+
+	if (info->msg && payload_len > 0) {
+		int err;
+
+		if (zcopy) {
+			err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
+						      &info->msg->msg_iter,
+						      payload_len);
+		} else {
+			err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
+		}
+
+		if (err)
+			goto out;
+
+		VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
+
+		if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+			struct virtio_vsock_hdr *hdr;
+
+			hdr = virtio_vsock_hdr(skb);
+
+			hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+
+			if (info->msg->msg_flags & MSG_EOR)
+				hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+		}
+	}
+
+	if (info->reply)
+		virtio_vsock_skb_set_reply(skb);
+
+	trace_virtio_transport_alloc_pkt(src_cid, src_port,
+					 dst_cid, dst_port,
+					 payload_len,
+					 info->type,
+					 info->op,
+					 info->flags);
+
+	return skb;
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+
 /* This function can only be used on connecting/connected sockets,
  * since a socket assigned to a transport is required.
  *
@@ -226,6 +342,8 @@  static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	const struct virtio_transport *t_ops;
 	struct virtio_vsock_sock *vvs;
 	u32 pkt_len = info->pkt_len;
+	bool can_zcopy = false;
+	u32 max_skb_cap;
 	u32 rest_len;
 	int ret;
 
@@ -254,22 +372,49 @@  static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
 		return pkt_len;
 
+	/* If zerocopy is not enabled by 'setsockopt()', we behave as
+	 * there is no MSG_ZEROCOPY flag set.
+	 */
+	if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
+		info->flags &= ~MSG_ZEROCOPY;
+
+	if (info->flags & MSG_ZEROCOPY)
+		can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
+
+	if (can_zcopy)
+		max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
+				    (MAX_SKB_FRAGS * PAGE_SIZE));
+	else
+		max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
+
 	rest_len = pkt_len;
 
 	do {
 		struct sk_buff *skb;
 		size_t skb_len;
 
-		skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
+		skb_len = min(max_skb_cap, rest_len);
 
-		skb = virtio_transport_alloc_skb(info, skb_len,
-						 src_cid, src_port,
-						 dst_cid, dst_port);
+		skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
+						 dst_cid, dst_port,
+						 src_cid, src_port);
 		if (!skb) {
 			ret = -ENOMEM;
 			break;
 		}
 
+		/* This is last skb to send this portion of data. */
+		if (skb_len == rest_len &&
+		    info->flags & MSG_ZEROCOPY &&
+		    info->op == VIRTIO_VSOCK_OP_RW) {
+			if (virtio_transport_init_zcopy_skb(vsk, skb,
+							    info->msg,
+							    can_zcopy)) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+
 		virtio_transport_inc_tx_pkt(vvs, skb);
 
 		ret = t_ops->send_pkt(skb);
@@ -884,6 +1029,7 @@  virtio_transport_stream_enqueue(struct vsock_sock *vsk,
 		.msg = msg,
 		.pkt_len = len,
 		.vsk = vsk,
+		.flags = msg->msg_flags,
 	};
 
 	return virtio_transport_send_pkt_info(vsk, &info);
@@ -935,11 +1081,11 @@  static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
 	if (!t)
 		return -ENOTCONN;
 
-	reply = virtio_transport_alloc_skb(&info, 0,
-					   le64_to_cpu(hdr->dst_cid),
-					   le32_to_cpu(hdr->dst_port),
+	reply = virtio_transport_alloc_skb(NULL, &info, 0, false,
 					   le64_to_cpu(hdr->src_cid),
-					   le32_to_cpu(hdr->src_port));
+					   le32_to_cpu(hdr->src_port),
+					   le64_to_cpu(hdr->dst_cid),
+					   le32_to_cpu(hdr->dst_port));
 	if (!reply)
 		return -ENOMEM;