diff mbox series

[net-next,v3] virtio-net: page_to_skb() use build_skb when there's sufficient tailroom

Message ID 20210416091615.25198-1-xuanzhuo@linux.alibaba.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [net-next,v3] virtio-net: page_to_skb() use build_skb when there's sufficient tailroom | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 128 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Xuan Zhuo April 16, 2021, 9:16 a.m. UTC
In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
can use build_skb to create skb directly. No need to alloc for
additional space. And it can save a 'frags slot', which is very friendly
to GRO.

Here, if the payload of the received package is too small (less than
GOOD_COPY_LEN), we still choose to copy it directly to the space got by
napi_alloc_skb. So we can reuse these pages.

Testing Machine:
    The four queues of the network card are bound to the cpu1.

Test command:
    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done

The size of the udp package is 1000, so in the case of this patch, there
will always be enough tailroom to use build_skb. The sent udp packet
will be discarded because there is no port to receive it. The irqsoftd
of the machine is 100%, we observe the received quantity displayed by
sar -n DEV 1:

no build_skb:  956864.00 rxpck/s
build_skb:    1158465.00 rxpck/s

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
---

v3: fix the truesize when headroom > 0

v2: conflict resolution

 drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 21 deletions(-)

--
2.31.0

Comments

Jason Wang April 19, 2021, 3:10 a.m. UTC | #1
在 2021/4/16 下午5:16, Xuan Zhuo 写道:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
> can use build_skb to create skb directly. No need to alloc for
> additional space. And it can save a 'frags slot', which is very friendly
> to GRO.
>
> Here, if the payload of the received package is too small (less than
> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
> napi_alloc_skb. So we can reuse these pages.
>
> Testing Machine:
>      The four queues of the network card are bound to the cpu1.
>
> Test command:
>      for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
>
> The size of the udp package is 1000, so in the case of this patch, there
> will always be enough tailroom to use build_skb. The sent udp packet
> will be discarded because there is no port to receive it. The irqsoftd
> of the machine is 100%, we observe the received quantity displayed by
> sar -n DEV 1:
>
> no build_skb:  956864.00 rxpck/s
> build_skb:    1158465.00 rxpck/s


I suggess to test the case of XDP_PASS in this case as well.


>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Suggested-by: Jason Wang <jasowang@redhat.com>
> ---
>
> v3: fix the truesize when headroom > 0
>
> v2: conflict resolution
>
>   drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
>   1 file changed, 48 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 101659cd4b87..8cd76037c724 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   				   struct receive_queue *rq,
>   				   struct page *page, unsigned int offset,
>   				   unsigned int len, unsigned int truesize,
> -				   bool hdr_valid, unsigned int metasize)
> +				   bool hdr_valid, unsigned int metasize,
> +				   unsigned int headroom)
>   {
>   	struct sk_buff *skb;
>   	struct virtio_net_hdr_mrg_rxbuf *hdr;
>   	unsigned int copy, hdr_len, hdr_padded_len;
> -	char *p;
> +	int tailroom, shinfo_size;
> +	char *p, *hdr_p;
>
>   	p = page_address(page) + offset;
> -
> -	/* copy small packet so we can reuse these pages for small data */
> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> -	if (unlikely(!skb))
> -		return NULL;
> -
> -	hdr = skb_vnet_hdr(skb);
> +	hdr_p = p;
>
>   	hdr_len = vi->hdr_len;
>   	if (vi->mergeable_rx_bufs)
> @@ -401,14 +397,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   	else
>   		hdr_padded_len = sizeof(struct padded_vnet_hdr);
>
> -	/* hdr_valid means no XDP, so we can copy the vnet header */
> -	if (hdr_valid)
> -		memcpy(hdr, p, hdr_len);
> +	/* If headroom is not 0, there is an offset between the beginning of the
> +	 * data and the allocated space, otherwise the data and the allocated
> +	 * space are aligned.
> +	 */
> +	if (headroom) {
> +		/* The actual allocated space size is PAGE_SIZE. */


I think the comment in receive_mergeable() is better:

                 /* Buffers with headroom use PAGE_SIZE as alloc size,
                  * see add_recvbuf_mergeable() + get_mergeable_buf_len()
                  */


> +		truesize = PAGE_SIZE;
> +		tailroom = truesize - len - offset;
> +	} else {
> +		tailroom = truesize - len;
> +	}
>
>   	len -= hdr_len;
>   	offset += hdr_padded_len;
>   	p += hdr_padded_len;
>
> +	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +
> +	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {


Any reason that you don't use build_skb() for len < GOOD_COPY_LEN?

Other looks good.

Thanks


> +		skb = build_skb(p, truesize);
> +		if (unlikely(!skb))
> +			return NULL;
> +
> +		skb_put(skb, len);
> +		goto ok;
> +	}
> +
> +	/* copy small packet so we can reuse these pages for small data */
> +	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> +	if (unlikely(!skb))
> +		return NULL;
> +
>   	/* Copy all frame if it fits skb->head, otherwise
>   	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
>   	 */
> @@ -418,11 +438,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   		copy = ETH_HLEN + metasize;
>   	skb_put_data(skb, p, copy);
>
> -	if (metasize) {
> -		__skb_pull(skb, metasize);
> -		skb_metadata_set(skb, metasize);
> -	}
> -
>   	len -= copy;
>   	offset += copy;
>
> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
>   		else
>   			put_page(page);
> -		return skb;
> +		goto ok;
>   	}
>
>   	/*
> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   	if (page)
>   		give_pages(rq, page);
>
> +ok:
> +	/* hdr_valid means no XDP, so we can copy the vnet header */
> +	if (hdr_valid) {
> +		hdr = skb_vnet_hdr(skb);
> +		memcpy(hdr, hdr_p, hdr_len);
> +	}
> +
> +	if (metasize) {
> +		__skb_pull(skb, metasize);
> +		skb_metadata_set(skb, metasize);
> +	}
> +
>   	return skb;
>   }
>
> @@ -808,7 +835,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
>   {
>   	struct page *page = buf;
>   	struct sk_buff *skb =
> -		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
> +		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
>
>   	stats->bytes += len - vi->hdr_len;
>   	if (unlikely(!skb))
> @@ -922,7 +949,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   				put_page(page);
>   				head_skb = page_to_skb(vi, rq, xdp_page, offset,
>   						       len, PAGE_SIZE, false,
> -						       metasize);
> +						       metasize, headroom);
>   				return head_skb;
>   			}
>   			break;
> @@ -980,7 +1007,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	}
>
>   	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
> -			       metasize);
> +			       metasize, headroom);
>   	curr_skb = head_skb;
>
>   	if (unlikely(!curr_skb))
> --
> 2.31.0
>
David Ahern April 19, 2021, 4:48 p.m. UTC | #2
On 4/16/21 2:16 AM, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
> can use build_skb to create skb directly. No need to alloc for
> additional space. And it can save a 'frags slot', which is very friendly
> to GRO.
> 
> Here, if the payload of the received package is too small (less than
> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
> napi_alloc_skb. So we can reuse these pages.
> 
> Testing Machine:
>     The four queues of the network card are bound to the cpu1.
> 
> Test command:
>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
> 
> The size of the udp package is 1000, so in the case of this patch, there
> will always be enough tailroom to use build_skb. The sent udp packet
> will be discarded because there is no port to receive it. The irqsoftd
> of the machine is 100%, we observe the received quantity displayed by
> sar -n DEV 1:
> 
> no build_skb:  956864.00 rxpck/s
> build_skb:    1158465.00 rxpck/s
> 

virtio_net is using napi_consume_skb, so napi_build_skb should show a
small increase from build_skb.
Mat Martineau April 19, 2021, 11:29 p.m. UTC | #3
On Fri, 16 Apr 2021, Xuan Zhuo wrote:

> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
> can use build_skb to create skb directly. No need to alloc for
> additional space. And it can save a 'frags slot', which is very friendly
> to GRO.
>
> Here, if the payload of the received package is too small (less than
> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
> napi_alloc_skb. So we can reuse these pages.
>
> Testing Machine:
>    The four queues of the network card are bound to the cpu1.
>
> Test command:
>    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
>
> The size of the udp package is 1000, so in the case of this patch, there
> will always be enough tailroom to use build_skb. The sent udp packet
> will be discarded because there is no port to receive it. The irqsoftd
> of the machine is 100%, we observe the received quantity displayed by
> sar -n DEV 1:
>
> no build_skb:  956864.00 rxpck/s
> build_skb:    1158465.00 rxpck/s
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Suggested-by: Jason Wang <jasowang@redhat.com>
> ---
>
> v3: fix the truesize when headroom > 0
>
> v2: conflict resolution
>
> drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
> 1 file changed, 48 insertions(+), 21 deletions(-)

Xuan,

I realize this has been merged to net-next already, but I'm getting a 
use-after-free with KASAN in page_to_skb() with this patch. Reverting this 
change fixes the UAF. I've included the KASAN dump below, and a couple of 
comments inline.


>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 101659cd4b87..8cd76037c724 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> 				   struct receive_queue *rq,
> 				   struct page *page, unsigned int offset,
> 				   unsigned int len, unsigned int truesize,
> -				   bool hdr_valid, unsigned int metasize)
> +				   bool hdr_valid, unsigned int metasize,
> +				   unsigned int headroom)
> {
> 	struct sk_buff *skb;
> 	struct virtio_net_hdr_mrg_rxbuf *hdr;
> 	unsigned int copy, hdr_len, hdr_padded_len;
> -	char *p;
> +	int tailroom, shinfo_size;
> +	char *p, *hdr_p;
>
> 	p = page_address(page) + offset;
> -
> -	/* copy small packet so we can reuse these pages for small data */
> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> -	if (unlikely(!skb))
> -		return NULL;
> -
> -	hdr = skb_vnet_hdr(skb);
> +	hdr_p = p;

hdr_p is assigned here, pointer to an address in the provided page...

>
> 	hdr_len = vi->hdr_len;
> 	if (vi->mergeable_rx_bufs)

(snip)

> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> 			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
> 		else
> 			put_page(page);

page is potentially released here...

> -		return skb;
> +		goto ok;
> 	}
>
> 	/*
> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> 	if (page)
> 		give_pages(rq, page);
>
> +ok:
> +	/* hdr_valid means no XDP, so we can copy the vnet header */
> +	if (hdr_valid) {
> +		hdr = skb_vnet_hdr(skb);
> +		memcpy(hdr, hdr_p, hdr_len);

and hdr_p is dereferenced here.

I'm seeing this KASAN UAF at boot time in a kvm VM (Fedora 33 host and 
guest, if that helps):

[   61.202483] ==================================================================
[   61.204005] BUG: KASAN: use-after-free in page_to_skb+0x32a/0x4b0
[   61.205387] Read of size 12 at addr ffff888105bdf800 by task NetworkManager/579
[   61.207035] 
[   61.207408] CPU: 0 PID: 579 Comm: NetworkManager Not tainted 5.12.0-rc7+ #2
[   61.208715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
[   61.210257] Call Trace:
[   61.210730]  <IRQ>
[   61.211209]  dump_stack+0x93/0xc2
[   61.211996]  print_address_description.constprop.0+0x18/0x130
[   61.213310]  ? page_to_skb+0x32a/0x4b0
[   61.214318]  ? page_to_skb+0x32a/0x4b0
[   61.215085]  kasan_report.cold+0x7f/0x111
[   61.215966]  ? trace_hardirqs_off+0x10/0xe0
[   61.216823]  ? page_to_skb+0x32a/0x4b0
[   61.217809]  kasan_check_range+0xf9/0x1e0
[   61.217834]  memcpy+0x20/0x60
[   61.217848]  page_to_skb+0x32a/0x4b0
[   61.217888]  receive_buf+0x1434/0x2690
[   61.217926]  ? page_to_skb+0x4b0/0x4b0
[   61.217947]  ? find_held_lock+0x85/0xa0
[   61.217964]  ? lock_release+0x1d0/0x400
[   61.217974]  ? virtnet_poll+0x1d8/0x6b0
[   61.217983]  ? detach_buf_split+0x254/0x290
[   61.218008]  ? virtqueue_get_buf_ctx_split+0x145/0x1f0
[   61.218032]  virtnet_poll+0x2a8/0x6b0
[   61.218057]  ? receive_buf+0x2690/0x2690
[   61.218067]  ? lock_release+0x400/0x400
[   61.218119]  __napi_poll+0x57/0x2f0
[   61.229624]  net_rx_action+0x4dd/0x590
[   61.230453]  ? napi_threaded_poll+0x2b0/0x2b0
[   61.231379]  ? rcu_implicit_dynticks_qs+0x430/0x430
[   61.232429]  ? lock_is_held_type+0x98/0x110
[   61.233342]  __do_softirq+0xfd/0x59d
[   61.234131]  do_softirq+0x8a/0xb0
[   61.234896]  </IRQ>
[   61.235397]  ? virtnet_open+0x10a/0x2e0
[   61.236273]  __local_bh_enable_ip+0xb1/0xc0
[   61.237199]  virtnet_open+0x11b/0x2e0
[   61.237981]  __dev_open+0x1b7/0x2c0
[   61.238689]  ? dev_set_rx_mode+0x60/0x60
[   61.239499]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.240523]  ? __local_bh_enable_ip+0x7b/0xc0
[   61.241399]  ? trace_hardirqs_on+0x1c/0x100
[   61.242248]  __dev_change_flags+0x2e6/0x370
[   61.243098]  ? dev_set_allmulti+0x10/0x10
[   61.243908]  ? lock_chain_count+0x20/0x20
[   61.244759]  dev_change_flags+0x55/0xb0
[   61.245595]  do_setlink+0xb52/0x1950
[   61.246385]  ? rtnl_getlink+0x560/0x560
[   61.247218]  ? mark_lock+0x101/0x19c0
[   61.248003]  ? lock_chain_count+0x20/0x20
[   61.248864]  ? lock_chain_count+0x20/0x20
[   61.249728]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.250821]  ? memset+0x20/0x40
[   61.251474]  ? __nla_validate_parse+0xac/0x12f0
[   61.252433]  ? nla_get_range_signed+0x1c0/0x1c0
[   61.253409]  ? __lock_acquire+0x85f/0x3090
[   61.254291]  __rtnl_newlink+0x85f/0xca0
[   61.255131]  ? rtnl_setlink+0x220/0x220
[   61.255988]  ? lock_is_held_type+0x98/0x110
[   61.256911]  ? find_held_lock+0x85/0xa0
[   61.257782]  ? __is_insn_slot_addr+0xa5/0x130
[   61.257794]  ? lock_downgrade+0x390/0x390
[   61.257802]  ? stack_access_ok+0x35/0x90
[   61.257818]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae
[   61.257840]  ? __is_insn_slot_addr+0xc4/0x130
[   61.257859]  ? kernel_text_address+0xc8/0xf0
[   61.257876]  ? __kernel_text_address+0x9/0x30
[   61.257885]  ? unwind_get_return_address+0x2a/0x40
[   61.257893]  ? create_prof_cpu_mask+0x20/0x20
[   61.257903]  ? arch_stack_walk+0x99/0xf0
[   61.258007]  ? lock_release+0x1d0/0x400
[   61.258016]  ? fs_reclaim_release+0x56/0x90
[   61.258027]  ? lock_downgrade+0x390/0x390
[   61.258036]  ? find_held_lock+0x80/0xa0
[   61.258049]  ? lock_release+0x1d0/0x400
[   61.258059]  ? lock_is_held_type+0x98/0x110
[   61.258087]  rtnl_newlink+0x4b/0x70
[   61.258099]  rtnetlink_rcv_msg+0x22c/0x5e0
[   61.258116]  ? rtnetlink_put_metrics+0x2c0/0x2c0
[   61.258131]  ? lock_acquire+0x157/0x4d0
[   61.258140]  ? netlink_deliver_tap+0xa6/0x570
[   61.258154]  ? lock_release+0x400/0x400
[   61.258172]  netlink_rcv_skb+0xc4/0x1f0
[   61.258180]  ? rtnetlink_put_metrics+0x2c0/0x2c0
[   61.258193]  ? netlink_ack+0x4f0/0x4f0
[   61.258199]  ? netlink_deliver_tap+0x129/0x570
[   61.258234]  netlink_unicast+0x2d3/0x410
[   61.258248]  ? netlink_attachskb+0x400/0x400
[   61.258257]  ? _copy_from_iter_full+0xd8/0x360
[   61.258280]  netlink_sendmsg+0x394/0x670
[   61.258299]  ? netlink_unicast+0x410/0x410
[   61.258305]  ? iovec_from_user+0xa1/0x1d0
[   61.258327]  ? netlink_unicast+0x410/0x410
[   61.258340]  sock_sendmsg+0x91/0xa0
[   61.258353]  ____sys_sendmsg+0x3b7/0x400
[   61.258366]  ? kernel_sendmsg+0x30/0x30
[   61.258376]  ? __ia32_sys_recvmmsg+0x150/0x150
[   61.258390]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.258398]  ? stack_trace_save+0x8c/0xc0
[   61.258408]  ? stack_trace_consume_entry+0x80/0x80
[   61.258416]  ? __fput+0x1a9/0x3d0
[   61.258435]  ___sys_sendmsg+0xd3/0x130
[   61.258446]  ? sendmsg_copy_msghdr+0x110/0x110
[   61.258458]  ? find_held_lock+0x85/0xa0
[   61.258471]  ? lock_release+0x1d0/0x400
[   61.258479]  ? __fget_files+0x133/0x210
[   61.258490]  ? lock_downgrade+0x390/0x390
[   61.258508]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
[   61.258529]  ? __fget_files+0x152/0x210
[   61.258547]  ? __fget_light+0x66/0xf0
[   61.258568]  __sys_sendmsg+0xae/0x120
[   61.258578]  ? __sys_sendmsg_sock+0x10/0x10
[   61.258589]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.258598]  ? call_rcu+0x414/0x670
[   61.258616]  ? mark_held_locks+0x25/0x90
[   61.258630]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
[   61.258639]  ? syscall_enter_from_user_mode+0x1d/0x50
[   61.258647]  ? trace_hardirqs_on+0x1c/0x100
[   61.258662]  do_syscall_64+0x33/0x40
[   61.258670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[   61.258679] RIP: 0033:0x7fb33db83ecd
[   61.258686] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 4a ee ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ee ff ff 48
[   61.258692] RSP: 002b:00007ffc85e90e30 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[   61.258702] RAX: ffffffffffffffda RBX: 000055f87a7aa030 RCX: 00007fb33db83ecd
[   61.258708] RDX: 0000000000000000 RSI: 00007ffc85e90e70 RDI: 000000000000000c
[   61.258713] RBP: 000000000000000c R08: 0000000000000000 R09: 0000000000000000
[   61.258717] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
[   61.258722] R13: 00007ffc85e90fd0 R14: 00007ffc85e90fcc R15: 0000000000000000


--
Mat Martineau
Intel
Jason Wang April 20, 2021, 2:38 a.m. UTC | #4
在 2021/4/20 上午7:29, Mat Martineau 写道:
>
> On Fri, 16 Apr 2021, Xuan Zhuo wrote:
>
>> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
>> can use build_skb to create skb directly. No need to alloc for
>> additional space. And it can save a 'frags slot', which is very friendly
>> to GRO.
>>
>> Here, if the payload of the received package is too small (less than
>> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
>> napi_alloc_skb. So we can reuse these pages.
>>
>> Testing Machine:
>>    The four queues of the network card are bound to the cpu1.
>>
>> Test command:
>>    for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 
>> 150& done
>>
>> The size of the udp package is 1000, so in the case of this patch, there
>> will always be enough tailroom to use build_skb. The sent udp packet
>> will be discarded because there is no port to receive it. The irqsoftd
>> of the machine is 100%, we observe the received quantity displayed by
>> sar -n DEV 1:
>>
>> no build_skb:  956864.00 rxpck/s
>> build_skb:    1158465.00 rxpck/s
>>
>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> Suggested-by: Jason Wang <jasowang@redhat.com>
>> ---
>>
>> v3: fix the truesize when headroom > 0
>>
>> v2: conflict resolution
>>
>> drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
>> 1 file changed, 48 insertions(+), 21 deletions(-)
>
> Xuan,
>
> I realize this has been merged to net-next already, but I'm getting a 
> use-after-free with KASAN in page_to_skb() with this patch. Reverting 
> this change fixes the UAF. I've included the KASAN dump below, and a 
> couple of comments inline.
>
>
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 101659cd4b87..8cd76037c724 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct 
>> virtnet_info *vi,
>>                    struct receive_queue *rq,
>>                    struct page *page, unsigned int offset,
>>                    unsigned int len, unsigned int truesize,
>> -                   bool hdr_valid, unsigned int metasize)
>> +                   bool hdr_valid, unsigned int metasize,
>> +                   unsigned int headroom)
>> {
>>     struct sk_buff *skb;
>>     struct virtio_net_hdr_mrg_rxbuf *hdr;
>>     unsigned int copy, hdr_len, hdr_padded_len;
>> -    char *p;
>> +    int tailroom, shinfo_size;
>> +    char *p, *hdr_p;
>>
>>     p = page_address(page) + offset;
>> -
>> -    /* copy small packet so we can reuse these pages for small data */
>> -    skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
>> -    if (unlikely(!skb))
>> -        return NULL;
>> -
>> -    hdr = skb_vnet_hdr(skb);
>> +    hdr_p = p;
>
> hdr_p is assigned here, pointer to an address in the provided page...
>
>>
>>     hdr_len = vi->hdr_len;
>>     if (vi->mergeable_rx_bufs)
>
> (snip)
>
>> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct 
>> virtnet_info *vi,
>>             skb_add_rx_frag(skb, 0, page, offset, len, truesize);
>>         else
>>             put_page(page);
>
> page is potentially released here...
>
>> -        return skb;
>> +        goto ok;
>>     }
>>
>>     /*
>> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct 
>> virtnet_info *vi,
>>     if (page)
>>         give_pages(rq, page);
>>
>> +ok:
>> +    /* hdr_valid means no XDP, so we can copy the vnet header */
>> +    if (hdr_valid) {
>> +        hdr = skb_vnet_hdr(skb);
>> +        memcpy(hdr, hdr_p, hdr_len);
>
> and hdr_p is dereferenced here.


Right, I tend to recover the way to copy hdr and set meta just after 
alloc/build_skb().

Thanks


>
> I'm seeing this KASAN UAF at boot time in a kvm VM (Fedora 33 host and 
> guest, if that helps):
>
> [   61.202483] 
> ==================================================================
> [   61.204005] BUG: KASAN: use-after-free in page_to_skb+0x32a/0x4b0
> [   61.205387] Read of size 12 at addr ffff888105bdf800 by task 
> NetworkManager/579
> [   61.207035] [   61.207408] CPU: 0 PID: 579 Comm: NetworkManager Not 
> tainted 5.12.0-rc7+ #2
> [   61.208715] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS 1.14.0-1.fc33 04/01/2014
> [   61.210257] Call Trace:
> [   61.210730]  <IRQ>
> [   61.211209]  dump_stack+0x93/0xc2
> [   61.211996]  print_address_description.constprop.0+0x18/0x130
> [   61.213310]  ? page_to_skb+0x32a/0x4b0
> [   61.214318]  ? page_to_skb+0x32a/0x4b0
> [   61.215085]  kasan_report.cold+0x7f/0x111
> [   61.215966]  ? trace_hardirqs_off+0x10/0xe0
> [   61.216823]  ? page_to_skb+0x32a/0x4b0
> [   61.217809]  kasan_check_range+0xf9/0x1e0
> [   61.217834]  memcpy+0x20/0x60
> [   61.217848]  page_to_skb+0x32a/0x4b0
> [   61.217888]  receive_buf+0x1434/0x2690
> [   61.217926]  ? page_to_skb+0x4b0/0x4b0
> [   61.217947]  ? find_held_lock+0x85/0xa0
> [   61.217964]  ? lock_release+0x1d0/0x400
> [   61.217974]  ? virtnet_poll+0x1d8/0x6b0
> [   61.217983]  ? detach_buf_split+0x254/0x290
> [   61.218008]  ? virtqueue_get_buf_ctx_split+0x145/0x1f0
> [   61.218032]  virtnet_poll+0x2a8/0x6b0
> [   61.218057]  ? receive_buf+0x2690/0x2690
> [   61.218067]  ? lock_release+0x400/0x400
> [   61.218119]  __napi_poll+0x57/0x2f0
> [   61.229624]  net_rx_action+0x4dd/0x590
> [   61.230453]  ? napi_threaded_poll+0x2b0/0x2b0
> [   61.231379]  ? rcu_implicit_dynticks_qs+0x430/0x430
> [   61.232429]  ? lock_is_held_type+0x98/0x110
> [   61.233342]  __do_softirq+0xfd/0x59d
> [   61.234131]  do_softirq+0x8a/0xb0
> [   61.234896]  </IRQ>
> [   61.235397]  ? virtnet_open+0x10a/0x2e0
> [   61.236273]  __local_bh_enable_ip+0xb1/0xc0
> [   61.237199]  virtnet_open+0x11b/0x2e0
> [   61.237981]  __dev_open+0x1b7/0x2c0
> [   61.238689]  ? dev_set_rx_mode+0x60/0x60
> [   61.239499]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
> [   61.240523]  ? __local_bh_enable_ip+0x7b/0xc0
> [   61.241399]  ? trace_hardirqs_on+0x1c/0x100
> [   61.242248]  __dev_change_flags+0x2e6/0x370
> [   61.243098]  ? dev_set_allmulti+0x10/0x10
> [   61.243908]  ? lock_chain_count+0x20/0x20
> [   61.244759]  dev_change_flags+0x55/0xb0
> [   61.245595]  do_setlink+0xb52/0x1950
> [   61.246385]  ? rtnl_getlink+0x560/0x560
> [   61.247218]  ? mark_lock+0x101/0x19c0
> [   61.248003]  ? lock_chain_count+0x20/0x20
> [   61.248864]  ? lock_chain_count+0x20/0x20
> [   61.249728]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
> [   61.250821]  ? memset+0x20/0x40
> [   61.251474]  ? __nla_validate_parse+0xac/0x12f0
> [   61.252433]  ? nla_get_range_signed+0x1c0/0x1c0
> [   61.253409]  ? __lock_acquire+0x85f/0x3090
> [   61.254291]  __rtnl_newlink+0x85f/0xca0
> [   61.255131]  ? rtnl_setlink+0x220/0x220
> [   61.255988]  ? lock_is_held_type+0x98/0x110
> [   61.256911]  ? find_held_lock+0x85/0xa0
> [   61.257782]  ? __is_insn_slot_addr+0xa5/0x130
> [   61.257794]  ? lock_downgrade+0x390/0x390
> [   61.257802]  ? stack_access_ok+0x35/0x90
> [   61.257818]  ? entry_SYSCALL_64_after_hwframe+0x44/0xae
> [   61.257840]  ? __is_insn_slot_addr+0xc4/0x130
> [   61.257859]  ? kernel_text_address+0xc8/0xf0
> [   61.257876]  ? __kernel_text_address+0x9/0x30
> [   61.257885]  ? unwind_get_return_address+0x2a/0x40
> [   61.257893]  ? create_prof_cpu_mask+0x20/0x20
> [   61.257903]  ? arch_stack_walk+0x99/0xf0
> [   61.258007]  ? lock_release+0x1d0/0x400
> [   61.258016]  ? fs_reclaim_release+0x56/0x90
> [   61.258027]  ? lock_downgrade+0x390/0x390
> [   61.258036]  ? find_held_lock+0x80/0xa0
> [   61.258049]  ? lock_release+0x1d0/0x400
> [   61.258059]  ? lock_is_held_type+0x98/0x110
> [   61.258087]  rtnl_newlink+0x4b/0x70
> [   61.258099]  rtnetlink_rcv_msg+0x22c/0x5e0
> [   61.258116]  ? rtnetlink_put_metrics+0x2c0/0x2c0
> [   61.258131]  ? lock_acquire+0x157/0x4d0
> [   61.258140]  ? netlink_deliver_tap+0xa6/0x570
> [   61.258154]  ? lock_release+0x400/0x400
> [   61.258172]  netlink_rcv_skb+0xc4/0x1f0
> [   61.258180]  ? rtnetlink_put_metrics+0x2c0/0x2c0
> [   61.258193]  ? netlink_ack+0x4f0/0x4f0
> [   61.258199]  ? netlink_deliver_tap+0x129/0x570
> [   61.258234]  netlink_unicast+0x2d3/0x410
> [   61.258248]  ? netlink_attachskb+0x400/0x400
> [   61.258257]  ? _copy_from_iter_full+0xd8/0x360
> [   61.258280]  netlink_sendmsg+0x394/0x670
> [   61.258299]  ? netlink_unicast+0x410/0x410
> [   61.258305]  ? iovec_from_user+0xa1/0x1d0
> [   61.258327]  ? netlink_unicast+0x410/0x410
> [   61.258340]  sock_sendmsg+0x91/0xa0
> [   61.258353]  ____sys_sendmsg+0x3b7/0x400
> [   61.258366]  ? kernel_sendmsg+0x30/0x30
> [   61.258376]  ? __ia32_sys_recvmmsg+0x150/0x150
> [   61.258390]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
> [   61.258398]  ? stack_trace_save+0x8c/0xc0
> [   61.258408]  ? stack_trace_consume_entry+0x80/0x80
> [   61.258416]  ? __fput+0x1a9/0x3d0
> [   61.258435]  ___sys_sendmsg+0xd3/0x130
> [   61.258446]  ? sendmsg_copy_msghdr+0x110/0x110
> [   61.258458]  ? find_held_lock+0x85/0xa0
> [   61.258471]  ? lock_release+0x1d0/0x400
> [   61.258479]  ? __fget_files+0x133/0x210
> [   61.258490]  ? lock_downgrade+0x390/0x390
> [   61.258508]  ? lockdep_hardirqs_on_prepare+0x1f0/0x1f0
> [   61.258529]  ? __fget_files+0x152/0x210
> [   61.258547]  ? __fget_light+0x66/0xf0
> [   61.258568]  __sys_sendmsg+0xae/0x120
> [   61.258578]  ? __sys_sendmsg_sock+0x10/0x10
> [   61.258589]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
> [   61.258598]  ? call_rcu+0x414/0x670
> [   61.258616]  ? mark_held_locks+0x25/0x90
> [   61.258630]  ? lockdep_hardirqs_on_prepare+0x12e/0x1f0
> [   61.258639]  ? syscall_enter_from_user_mode+0x1d/0x50
> [   61.258647]  ? trace_hardirqs_on+0x1c/0x100
> [   61.258662]  do_syscall_64+0x33/0x40
> [   61.258670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [   61.258679] RIP: 0033:0x7fb33db83ecd
> [   61.258686] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 4a 
> ee ff ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 
> 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 9e ee ff 
> ff 48
> [   61.258692] RSP: 002b:00007ffc85e90e30 EFLAGS: 00000293 ORIG_RAX: 
> 000000000000002e
> [   61.258702] RAX: ffffffffffffffda RBX: 000055f87a7aa030 RCX: 
> 00007fb33db83ecd
> [   61.258708] RDX: 0000000000000000 RSI: 00007ffc85e90e70 RDI: 
> 000000000000000c
> [   61.258713] RBP: 000000000000000c R08: 0000000000000000 R09: 
> 0000000000000000
> [   61.258717] R10: 0000000000000000 R11: 0000000000000293 R12: 
> 0000000000000000
> [   61.258722] R13: 00007ffc85e90fd0 R14: 00007ffc85e90fcc R15: 
> 0000000000000000
>
>
> -- 
> Mat Martineau
> Intel
>
Jason Wang April 20, 2021, 2:41 a.m. UTC | #5
在 2021/4/20 上午10:38, Jason Wang 写道:
>>> :
>>> +    /* hdr_valid means no XDP, so we can copy the vnet header */
>>> +    if (hdr_valid) {
>>> +        hdr = skb_vnet_hdr(skb);
>>> +        memcpy(hdr, hdr_p, hdr_len);
>>
>> and hdr_p is dereferenced here.
>
>
> Right, I tend to recover the way to copy hdr and set meta just after 
> alloc/build_skb().
>
> Thanks 


Btw, since the patch modifies a critical path of virtio-net I suggest to 
test the following cases:

1) netperf TCP stream
2) netperf UDP with packet size from 64 to PAGE_SIZE
3) XDP_PASS with 1)
4) XDP_PASS with 2)
5) XDP metadata with 1)
6) XDP metadata with 2)

Thanks
Jason Wang April 20, 2021, 2:49 a.m. UTC | #6
在 2021/4/20 上午12:48, David Ahern 写道:
> On 4/16/21 2:16 AM, Xuan Zhuo wrote:
>> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
>> can use build_skb to create skb directly. No need to alloc for
>> additional space. And it can save a 'frags slot', which is very friendly
>> to GRO.
>>
>> Here, if the payload of the received package is too small (less than
>> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
>> napi_alloc_skb. So we can reuse these pages.
>>
>> Testing Machine:
>>      The four queues of the network card are bound to the cpu1.
>>
>> Test command:
>>      for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
>>
>> The size of the udp package is 1000, so in the case of this patch, there
>> will always be enough tailroom to use build_skb. The sent udp packet
>> will be discarded because there is no port to receive it. The irqsoftd
>> of the machine is 100%, we observe the received quantity displayed by
>> sar -n DEV 1:
>>
>> no build_skb:  956864.00 rxpck/s
>> build_skb:    1158465.00 rxpck/s
>>
> virtio_net is using napi_consume_skb, so napi_build_skb should show a
> small increase from build_skb.
>

Yes and we probably need to do this in receive_small().

Thanks
Eric Dumazet April 20, 2021, 9:28 a.m. UTC | #7
On 4/16/21 11:16 AM, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
> can use build_skb to create skb directly. No need to alloc for
> additional space. And it can save a 'frags slot', which is very friendly
> to GRO.
> 
> Here, if the payload of the received package is too small (less than
> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
> napi_alloc_skb. So we can reuse these pages.
> 
> Testing Machine:
>     The four queues of the network card are bound to the cpu1.
> 
> Test command:
>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
> 
> The size of the udp package is 1000, so in the case of this patch, there
> will always be enough tailroom to use build_skb. The sent udp packet
> will be discarded because there is no port to receive it. The irqsoftd
> of the machine is 100%, we observe the received quantity displayed by
> sar -n DEV 1:
> 
> no build_skb:  956864.00 rxpck/s
> build_skb:    1158465.00 rxpck/s
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Suggested-by: Jason Wang <jasowang@redhat.com>
> ---
> 
> v3: fix the truesize when headroom > 0
> 
> v2: conflict resolution
> 
>  drivers/net/virtio_net.c | 69 ++++++++++++++++++++++++++++------------
>  1 file changed, 48 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 101659cd4b87..8cd76037c724 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -379,21 +379,17 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  				   struct receive_queue *rq,
>  				   struct page *page, unsigned int offset,
>  				   unsigned int len, unsigned int truesize,
> -				   bool hdr_valid, unsigned int metasize)
> +				   bool hdr_valid, unsigned int metasize,
> +				   unsigned int headroom)
>  {
>  	struct sk_buff *skb;
>  	struct virtio_net_hdr_mrg_rxbuf *hdr;
>  	unsigned int copy, hdr_len, hdr_padded_len;
> -	char *p;
> +	int tailroom, shinfo_size;
> +	char *p, *hdr_p;
> 
>  	p = page_address(page) + offset;
> -
> -	/* copy small packet so we can reuse these pages for small data */
> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> -	if (unlikely(!skb))
> -		return NULL;
> -
> -	hdr = skb_vnet_hdr(skb);
> +	hdr_p = p;
> 
>  	hdr_len = vi->hdr_len;
>  	if (vi->mergeable_rx_bufs)
> @@ -401,14 +397,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	else
>  		hdr_padded_len = sizeof(struct padded_vnet_hdr);
> 
> -	/* hdr_valid means no XDP, so we can copy the vnet header */
> -	if (hdr_valid)
> -		memcpy(hdr, p, hdr_len);
> +	/* If headroom is not 0, there is an offset between the beginning of the
> +	 * data and the allocated space, otherwise the data and the allocated
> +	 * space are aligned.
> +	 */
> +	if (headroom) {
> +		/* The actual allocated space size is PAGE_SIZE. */
> +		truesize = PAGE_SIZE;
> +		tailroom = truesize - len - offset;
> +	} else {
> +		tailroom = truesize - len;
> +	}
> 
>  	len -= hdr_len;
>  	offset += hdr_padded_len;
>  	p += hdr_padded_len;
> 
> +	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +
> +	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
> +		skb = build_skb(p, truesize);
> +		if (unlikely(!skb))
> +			return NULL;
> +
> +		skb_put(skb, len);
> +		goto ok;
> +	}
> +
> +	/* copy small packet so we can reuse these pages for small data */
> +	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> +	if (unlikely(!skb))
> +		return NULL;
> +
>  	/* Copy all frame if it fits skb->head, otherwise
>  	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
>  	 */
> @@ -418,11 +438,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  		copy = ETH_HLEN + metasize;
>  	skb_put_data(skb, p, copy);
> 
> -	if (metasize) {
> -		__skb_pull(skb, metasize);
> -		skb_metadata_set(skb, metasize);
> -	}
> -
>  	len -= copy;
>  	offset += copy;
> 
> @@ -431,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
>  		else
>  			put_page(page);

Here the struct page has been freed..

> -		return skb;
> +		goto ok;
>  	}
> 
>  	/*
> @@ -458,6 +473,18 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	if (page)
>  		give_pages(rq, page);
> 
> +ok:
> +	/* hdr_valid means no XDP, so we can copy the vnet header */
> +	if (hdr_valid) {
> +		hdr = skb_vnet_hdr(skb);
> +		memcpy(hdr, hdr_p, hdr_len);

But here you are reading its content.

This is a use-after-free.

> +	}
> +

I will test something like :


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8cd76037c72481200ea3e8429e9fdfec005dad85..2e28c04aa6351d2b4016f7d277ce104c4970069d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -385,6 +385,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
        struct sk_buff *skb;
        struct virtio_net_hdr_mrg_rxbuf *hdr;
        unsigned int copy, hdr_len, hdr_padded_len;
+       struct page *page_to_free = NULL;
        int tailroom, shinfo_size;
        char *p, *hdr_p;
 
@@ -445,7 +446,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                if (len)
                        skb_add_rx_frag(skb, 0, page, offset, len, truesize);
                else
-                       put_page(page);
+                       page_to_free = page;
                goto ok;
        }
 
@@ -479,6 +480,8 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
                hdr = skb_vnet_hdr(skb);
                memcpy(hdr, hdr_p, hdr_len);
        }
+       if (page_to_free)
+               put_page(page_to_free);
 
        if (metasize) {
                __skb_pull(skb, metasize);
Ido Schimmel April 22, 2021, 11:13 a.m. UTC | #8
On Fri, Apr 16, 2021 at 05:16:15PM +0800, Xuan Zhuo wrote:
> In page_to_skb(), if we have enough tailroom to save skb_shared_info, we
> can use build_skb to create skb directly. No need to alloc for
> additional space. And it can save a 'frags slot', which is very friendly
> to GRO.
> 
> Here, if the payload of the received package is too small (less than
> GOOD_COPY_LEN), we still choose to copy it directly to the space got by
> napi_alloc_skb. So we can reuse these pages.
> 
> Testing Machine:
>     The four queues of the network card are bound to the cpu1.
> 
> Test command:
>     for ((i=0;i<5;++i)); do sockperf tp --ip 192.168.122.64 -m 1000 -t 150& done
> 
> The size of the udp package is 1000, so in the case of this patch, there
> will always be enough tailroom to use build_skb. The sent udp packet
> will be discarded because there is no port to receive it. The irqsoftd
> of the machine is 100%, we observe the received quantity displayed by
> sar -n DEV 1:
> 
> no build_skb:  956864.00 rxpck/s
> build_skb:    1158465.00 rxpck/s
> 
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Suggested-by: Jason Wang <jasowang@redhat.com>

We have VMs that use virtio_net for their management interface. After
this patch was applied we started seeing crashes when these VMs access
an NFS file system. I thought Eric's patches will fix it, but problem
persists even with his two patches:

af39c8f72301 virtio-net: fix use-after-free in page_to_skb()
f5d7872a8b8a virtio-net: restrict build_skb() use to some arches

Reverting all three patches makes the problem go away. A KASAN enabled
kernel emits the following (decoded) stack trace.

[1]
BUG: KASAN: use-after-free in skb_gro_receive (net/core/skbuff.c:4260)
Write of size 16 at addr ffff88811619fffc by task kworker/u9:0/534
CPU: 2 PID: 534 Comm: kworker/u9:0 Not tainted 5.12.0-rc7-custom-16372-gb150be05b806 #3382
Hardware name: QEMU MSN2700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: xprtiod xs_stream_data_receive_workfn [sunrpc]
Call Trace:
 <IRQ>
dump_stack (lib/dump_stack.c:122)
print_address_description.constprop.0 (mm/kasan/report.c:233)
kasan_report.cold (mm/kasan/report.c:400 mm/kasan/report.c:416)
skb_gro_receive (net/core/skbuff.c:4260)
tcp_gro_receive (net/ipv4/tcp_offload.c:266 (discriminator 1))
tcp4_gro_receive (net/ipv4/tcp_offload.c:316)
inet_gro_receive (net/ipv4/af_inet.c:1545 (discriminator 2))
dev_gro_receive (net/core/dev.c:6075)
napi_gro_receive (net/core/dev.c:6168 net/core/dev.c:6198)
receive_buf (drivers/net/virtio_net.c:1151) virtio_net
virtnet_poll (drivers/net/virtio_net.c:1415 drivers/net/virtio_net.c:1519) virtio_net
__napi_poll (net/core/dev.c:6964)
net_rx_action (net/core/dev.c:7033 net/core/dev.c:7118)
__do_softirq (./arch/x86/include/asm/jump_label.h:25 ./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 kernel/softirq.c:346)
irq_exit_rcu (kernel/softirq.c:221 kernel/softirq.c:422 kernel/softirq.c:434)
common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 14))
</IRQ>
asm_common_interrupt (./arch/x86/include/asm/idtentry.h:623)
RIP: 0010:read_hpet (arch/x86/kernel/hpet.c:823 arch/x86/kernel/hpet.c:793)
Code: 90 48 8b 05 a5 ef 6f 02 48 89 44 24 68 48 c1 e8 20 89 c2 3b 44 24 4c 74 d1 89 d0 e9 c7 fe ff ff e8 38 90 35 00 fb 8b 44 24 6c <e9> b8 fe ff ff 8b 54 24 6
All code
========
   0:	90                   	nop
   1:	48 8b 05 a5 ef 6f 02 	mov    0x26fefa5(%rip),%rax        # 0x26fefad
   8:	48 89 44 24 68       	mov    %rax,0x68(%rsp)
   d:	48 c1 e8 20          	shr    $0x20,%rax
  11:	89 c2                	mov    %eax,%edx
  13:	3b 44 24 4c          	cmp    0x4c(%rsp),%eax
  17:	74 d1                	je     0xffffffffffffffea
  19:	89 d0                	mov    %edx,%eax
  1b:	e9 c7 fe ff ff       	jmpq   0xfffffffffffffee7
  20:	e8 38 90 35 00       	callq  0x35905d
  25:	fb                   	sti
  26:	8b 44 24 6c          	mov    0x6c(%rsp),%eax
  2a:*	e9 b8 fe ff ff       	jmpq   0xfffffffffffffee7		<-- trapping instruction
  2f:	8b 54 24 06          	mov    0x6(%rsp),%edx

Code starting with the faulting instruction
===========================================
   0:	e9 b8 fe ff ff       	jmpq   0xfffffffffffffebd
   5:	8b 54 24 06          	mov    0x6(%rsp),%edx
c 89 d0 e9 ad fe ff ff e8 fe 8c 35 00 e9
RSP: 0018:ffffc900015a7a68 EFLAGS: 00000206
RAX: 000000004ad84e9a RBX: 1ffff920002b4f4e RCX: ffffffff888897f7
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: 0000000000000200 R08: 0000000000000001 R09: ffffffff8c645737
R10: fffffbfff18c8ae6 R11: 0000000000000001 R12: dffffc0000000000
R13: 00000016f23c724e R14: ffff888154e24000 R15: ffff88810c2b2c00
ktime_get (kernel/time/timekeeping.c:290 (discriminator 4) kernel/time/timekeeping.c:386 (discriminator 4) kernel/time/timekeeping.c:829 (discriminator 4))
xprt_lookup_rqst (net/sunrpc/xprt.c:1049) sunrpc
xs_read_stream.constprop.0 (net/sunrpc/xprtsock.c:595 net/sunrpc/xprtsock.c:646) sunrpc
xs_stream_data_receive_workfn (net/sunrpc/xprtsock.c:712 net/sunrpc/xprtsock.c:732) sunrpc
process_one_work (./arch/x86/include/asm/jump_label.h:25 ./include/linux/jump_label.h:200 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2280)
worker_thread (./include/linux/list.h:282 kernel/workqueue.c:2422)
kthread (kernel/kthread.c:292)
ret_from_fork (arch/x86/entry/entry_64.S:300)

The buggy address belongs to the page:
page:000000000b3e5dba refcount:21 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x116198
head:000000000b3e5dba order:3 compound_mapcount:0 compound_pincount:0
flags: 0x200000000010000(head)
raw: 0200000000010000 dead000000000100 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000015ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff88811619ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff88811619ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff8881161a0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                   ^
 ffff8881161a0080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff8881161a0100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
Ido Schimmel April 22, 2021, 12:55 p.m. UTC | #9
On Thu, Apr 22, 2021 at 08:12:31PM +0800, Xuan Zhuo wrote:
> Thank you very much for reporting this problem. Can you try this patch? Of
> course, it also includes two patches from eric.
> 
>  af39c8f72301 virtio-net: fix use-after-free in page_to_skb()
>  f5d7872a8b8a virtio-net: restrict build_skb() use to some arches

Applied your patch on top of net-next, looks good. Feel free to add:

Tested-by: Ido Schimmel <idosch@nvidia.com>

Thanks for the quick fix
diff mbox series

Patch

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 101659cd4b87..8cd76037c724 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -379,21 +379,17 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 				   struct receive_queue *rq,
 				   struct page *page, unsigned int offset,
 				   unsigned int len, unsigned int truesize,
-				   bool hdr_valid, unsigned int metasize)
+				   bool hdr_valid, unsigned int metasize,
+				   unsigned int headroom)
 {
 	struct sk_buff *skb;
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	unsigned int copy, hdr_len, hdr_padded_len;
-	char *p;
+	int tailroom, shinfo_size;
+	char *p, *hdr_p;

 	p = page_address(page) + offset;
-
-	/* copy small packet so we can reuse these pages for small data */
-	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
-	if (unlikely(!skb))
-		return NULL;
-
-	hdr = skb_vnet_hdr(skb);
+	hdr_p = p;

 	hdr_len = vi->hdr_len;
 	if (vi->mergeable_rx_bufs)
@@ -401,14 +397,38 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	else
 		hdr_padded_len = sizeof(struct padded_vnet_hdr);

-	/* hdr_valid means no XDP, so we can copy the vnet header */
-	if (hdr_valid)
-		memcpy(hdr, p, hdr_len);
+	/* If headroom is not 0, there is an offset between the beginning of the
+	 * data and the allocated space, otherwise the data and the allocated
+	 * space are aligned.
+	 */
+	if (headroom) {
+		/* The actual allocated space size is PAGE_SIZE. */
+		truesize = PAGE_SIZE;
+		tailroom = truesize - len - offset;
+	} else {
+		tailroom = truesize - len;
+	}

 	len -= hdr_len;
 	offset += hdr_padded_len;
 	p += hdr_padded_len;

+	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	if (len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
+		skb = build_skb(p, truesize);
+		if (unlikely(!skb))
+			return NULL;
+
+		skb_put(skb, len);
+		goto ok;
+	}
+
+	/* copy small packet so we can reuse these pages for small data */
+	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
+	if (unlikely(!skb))
+		return NULL;
+
 	/* Copy all frame if it fits skb->head, otherwise
 	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
 	 */
@@ -418,11 +438,6 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 		copy = ETH_HLEN + metasize;
 	skb_put_data(skb, p, copy);

-	if (metasize) {
-		__skb_pull(skb, metasize);
-		skb_metadata_set(skb, metasize);
-	}
-
 	len -= copy;
 	offset += copy;

@@ -431,7 +446,7 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 		else
 			put_page(page);
-		return skb;
+		goto ok;
 	}

 	/*
@@ -458,6 +473,18 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	if (page)
 		give_pages(rq, page);

+ok:
+	/* hdr_valid means no XDP, so we can copy the vnet header */
+	if (hdr_valid) {
+		hdr = skb_vnet_hdr(skb);
+		memcpy(hdr, hdr_p, hdr_len);
+	}
+
+	if (metasize) {
+		__skb_pull(skb, metasize);
+		skb_metadata_set(skb, metasize);
+	}
+
 	return skb;
 }

@@ -808,7 +835,7 @@  static struct sk_buff *receive_big(struct net_device *dev,
 {
 	struct page *page = buf;
 	struct sk_buff *skb =
-		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
+		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);

 	stats->bytes += len - vi->hdr_len;
 	if (unlikely(!skb))
@@ -922,7 +949,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 				put_page(page);
 				head_skb = page_to_skb(vi, rq, xdp_page, offset,
 						       len, PAGE_SIZE, false,
-						       metasize);
+						       metasize, headroom);
 				return head_skb;
 			}
 			break;
@@ -980,7 +1007,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 	}

 	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
-			       metasize);
+			       metasize, headroom);
 	curr_skb = head_skb;

 	if (unlikely(!curr_skb))