diff mbox series

[NET-NEXT] ipv6: skb_expand_head() adjust skb->truesize incorrectly

Message ID ef4458d9-c4d7-f419-00f2-0f1cea5140ce@virtuozzo.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series [NET-NEXT] ipv6: skb_expand_head() adjust skb->truesize incorrectly | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers warning 6 maintainers not CCed: jonathan.lemon@gmail.com willemb@google.com cong.wang@bytedance.com gnault@redhat.com alobakin@pm.me pabeni@redhat.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 33 this patch: 33
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 40 lines checked
netdev/build_allmodconfig_warn fail Errors and warnings before: 45 this patch: 33
netdev/header_inline success Link

Commit Message

Vasily Averin Aug. 23, 2021, 7:56 a.m. UTC
Christoph Paasch reports [1] about incorrect skb->truesize
after skb_expand_head() call in ip6_xmit.
This happen because skb_set_owner_w() for newly clone skb is called
too early, before pskb_expand_head() where truesize is adjusted for
(!skb-sk) case.

[1] https://lkml.org/lkml/2021/8/20/1082

Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
---
 net/core/skbuff.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

Comments

Christoph Paasch Aug. 23, 2021, 5:25 p.m. UTC | #1
Hello,

On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>
> Christoph Paasch reports [1] about incorrect skb->truesize
> after skb_expand_head() call in ip6_xmit.
> This happen because skb_set_owner_w() for newly clone skb is called
> too early, before pskb_expand_head() where truesize is adjusted for
> (!skb-sk) case.
>
> [1] https://lkml.org/lkml/2021/8/20/1082
>
> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
> ---
>  net/core/skbuff.c | 24 +++++++++++++-----------
>  1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f931176..508d5c4 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>
>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>  {
> +       struct sk_buff *oskb = skb;
> +       struct sk_buff *nskb = NULL;
>         int delta = headroom - skb_headroom(skb);
>
>         if (WARN_ONCE(delta <= 0,
> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>
>         /* pskb_expand_head() might crash, if skb is shared */
>         if (skb_shared(skb)) {
> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> -
> -               if (likely(nskb)) {
> -                       if (skb->sk)
> -                               skb_set_owner_w(nskb, skb->sk);
> -                       consume_skb(skb);
> -               } else {
> -                       kfree_skb(skb);
> -               }
> +               nskb = skb_clone(skb, GFP_ATOMIC);
>                 skb = nskb;
>         }
>         if (skb &&
> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> -               kfree_skb(skb);
> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>                 skb = NULL;
> +
> +       if (!skb) {
> +               kfree_skb(oskb);
> +               if (nskb)
> +                       kfree_skb(nskb);
> +       } else if (nskb) {
> +               if (oskb->sk)
> +                       skb_set_owner_w(nskb, oskb->sk);
> +               consume_skb(oskb);

sorry, this does not fix the problem. The syzkaller repro still
triggers the WARN.

When it happens, the skb in ip6_xmit() is not shared as it comes from
__tcp_transmit_skb, where it is skb_clone()'d.


Christoph

>         }
>         return skb;
>  }
> --
> 1.8.3.1
>
Eric Dumazet Aug. 23, 2021, 9:45 p.m. UTC | #2
On 8/23/21 10:25 AM, Christoph Paasch wrote:
> Hello,
> 
> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>
>> Christoph Paasch reports [1] about incorrect skb->truesize
>> after skb_expand_head() call in ip6_xmit.
>> This happen because skb_set_owner_w() for newly clone skb is called
>> too early, before pskb_expand_head() where truesize is adjusted for
>> (!skb-sk) case.
>>
>> [1] https://lkml.org/lkml/2021/8/20/1082
>>
>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>> ---
>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f931176..508d5c4 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>
>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>  {
>> +       struct sk_buff *oskb = skb;
>> +       struct sk_buff *nskb = NULL;
>>         int delta = headroom - skb_headroom(skb);
>>
>>         if (WARN_ONCE(delta <= 0,
>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>
>>         /* pskb_expand_head() might crash, if skb is shared */
>>         if (skb_shared(skb)) {
>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>> -
>> -               if (likely(nskb)) {
>> -                       if (skb->sk)
>> -                               skb_set_owner_w(nskb, skb->sk);
>> -                       consume_skb(skb);
>> -               } else {
>> -                       kfree_skb(skb);
>> -               }
>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>                 skb = nskb;
>>         }
>>         if (skb &&
>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>> -               kfree_skb(skb);
>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>                 skb = NULL;
>> +
>> +       if (!skb) {
>> +               kfree_skb(oskb);
>> +               if (nskb)
>> +                       kfree_skb(nskb);
>> +       } else if (nskb) {
>> +               if (oskb->sk)
>> +                       skb_set_owner_w(nskb, oskb->sk);
>> +               consume_skb(oskb);
> 
> sorry, this does not fix the problem. The syzkaller repro still
> triggers the WARN.
> 
> When it happens, the skb in ip6_xmit() is not shared as it comes from
> __tcp_transmit_skb, where it is skb_clone()'d.
> 
> 

Old code (in skb_realloc_headroom())
was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 

At this point, skb2->sk was NULL
So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize

I would try :

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 {
        int delta = headroom - skb_headroom(skb);
+       struct sk_buff *oskb = NULL;
 
        if (WARN_ONCE(delta <= 0,
                      "%s is expecting an increase in the headroom", __func__))
@@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 
-               if (likely(nskb)) {
-                       if (skb->sk)
-                               skb_set_owner_w(nskb, skb->sk);
-                       consume_skb(skb);
-               } else {
+               if (unlikely(!nskb)) {
                        kfree_skb(skb);
+                       return NULL;
                }
+               oskb = skb;
                skb = nskb;
        }
-       if (skb &&
-           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
                kfree_skb(skb);
-               skb = NULL;
+               kfree_skb(oskb);
+               return NULL;
+       }
+       if (oskb) {
+               skb_set_owner_w(skb, oskb->sk);
+               consume_skb(oskb);
        }
        return skb;
 }
Eric Dumazet Aug. 23, 2021, 9:51 p.m. UTC | #3
On 8/23/21 2:45 PM, Eric Dumazet wrote:
> 
> 
> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>> Hello,
>>
>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>>
>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>> after skb_expand_head() call in ip6_xmit.
>>> This happen because skb_set_owner_w() for newly clone skb is called
>>> too early, before pskb_expand_head() where truesize is adjusted for
>>> (!skb-sk) case.
>>>
>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>
>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>>> ---
>>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f931176..508d5c4 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>
>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>  {
>>> +       struct sk_buff *oskb = skb;
>>> +       struct sk_buff *nskb = NULL;
>>>         int delta = headroom - skb_headroom(skb);
>>>
>>>         if (WARN_ONCE(delta <= 0,
>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>
>>>         /* pskb_expand_head() might crash, if skb is shared */
>>>         if (skb_shared(skb)) {
>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>> -
>>> -               if (likely(nskb)) {
>>> -                       if (skb->sk)
>>> -                               skb_set_owner_w(nskb, skb->sk);
>>> -                       consume_skb(skb);
>>> -               } else {
>>> -                       kfree_skb(skb);
>>> -               }
>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>>                 skb = nskb;
>>>         }
>>>         if (skb &&
>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> -               kfree_skb(skb);
>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>                 skb = NULL;
>>> +
>>> +       if (!skb) {
>>> +               kfree_skb(oskb);
>>> +               if (nskb)
>>> +                       kfree_skb(nskb);
>>> +       } else if (nskb) {
>>> +               if (oskb->sk)
>>> +                       skb_set_owner_w(nskb, oskb->sk);
>>> +               consume_skb(oskb);
>>
>> sorry, this does not fix the problem. The syzkaller repro still
>> triggers the WARN.
>>
>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>> __tcp_transmit_skb, where it is skb_clone()'d.
>>
>>
> 
> Old code (in skb_realloc_headroom())
> was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 
> 
> At this point, skb2->sk was NULL
> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
> 
> I would try :
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>  {
>         int delta = headroom - skb_headroom(skb);
> +       struct sk_buff *oskb = NULL;
>  
>         if (WARN_ONCE(delta <= 0,
>                       "%s is expecting an increase in the headroom", __func__))
> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>         if (skb_shared(skb)) {
>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>  
> -               if (likely(nskb)) {
> -                       if (skb->sk)
> -                               skb_set_owner_w(nskb, skb->sk);
> -                       consume_skb(skb);
> -               } else {
> +               if (unlikely(!nskb)) {
>                         kfree_skb(skb);
> +                       return NULL;
>                 }
> +               oskb = skb;
>                 skb = nskb;
>         }
> -       if (skb &&
> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>                 kfree_skb(skb);
> -               skb = NULL;
> +               kfree_skb(oskb);
> +               return NULL;
> +       }
> +       if (oskb) {
> +               skb_set_owner_w(skb, oskb->sk);
> +               consume_skb(oskb);
>         }
>         return skb;
>  }


Oh well, probably not going to work.

We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
Eric Dumazet Aug. 23, 2021, 10:23 p.m. UTC | #4
On 8/23/21 2:51 PM, Eric Dumazet wrote:
> 
> 
> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>
>>
>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>> Hello,
>>>
>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>>>
>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>> after skb_expand_head() call in ip6_xmit.
>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>> (!skb-sk) case.
>>>>
>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>
>>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>>>> ---
>>>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index f931176..508d5c4 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>
>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>  {
>>>> +       struct sk_buff *oskb = skb;
>>>> +       struct sk_buff *nskb = NULL;
>>>>         int delta = headroom - skb_headroom(skb);
>>>>
>>>>         if (WARN_ONCE(delta <= 0,
>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>
>>>>         /* pskb_expand_head() might crash, if skb is shared */
>>>>         if (skb_shared(skb)) {
>>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>> -
>>>> -               if (likely(nskb)) {
>>>> -                       if (skb->sk)
>>>> -                               skb_set_owner_w(nskb, skb->sk);
>>>> -                       consume_skb(skb);
>>>> -               } else {
>>>> -                       kfree_skb(skb);
>>>> -               }
>>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>>>                 skb = nskb;
>>>>         }
>>>>         if (skb &&
>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>> -               kfree_skb(skb);
>>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>                 skb = NULL;
>>>> +
>>>> +       if (!skb) {
>>>> +               kfree_skb(oskb);
>>>> +               if (nskb)
>>>> +                       kfree_skb(nskb);
>>>> +       } else if (nskb) {
>>>> +               if (oskb->sk)
>>>> +                       skb_set_owner_w(nskb, oskb->sk);
>>>> +               consume_skb(oskb);
>>>
>>> sorry, this does not fix the problem. The syzkaller repro still
>>> triggers the WARN.
>>>
>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>
>>>
>>
>> Old code (in skb_realloc_headroom())
>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 
>>
>> At this point, skb2->sk was NULL
>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>
>> I would try :
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>  {
>>         int delta = headroom - skb_headroom(skb);
>> +       struct sk_buff *oskb = NULL;
>>  
>>         if (WARN_ONCE(delta <= 0,
>>                       "%s is expecting an increase in the headroom", __func__))
>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>         if (skb_shared(skb)) {
>>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>  
>> -               if (likely(nskb)) {
>> -                       if (skb->sk)
>> -                               skb_set_owner_w(nskb, skb->sk);
>> -                       consume_skb(skb);
>> -               } else {
>> +               if (unlikely(!nskb)) {
>>                         kfree_skb(skb);
>> +                       return NULL;
>>                 }
>> +               oskb = skb;
>>                 skb = nskb;
>>         }
>> -       if (skb &&
>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>                 kfree_skb(skb);
>> -               skb = NULL;
>> +               kfree_skb(oskb);
>> +               return NULL;
>> +       }
>> +       if (oskb) {
>> +               skb_set_owner_w(skb, oskb->sk);
>> +               consume_skb(oskb);
>>         }
>>         return skb;
>>  }
> 
> 
> Oh well, probably not going to work.
> 
> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
> 

I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
Vasily Averin Aug. 24, 2021, 8:50 a.m. UTC | #5
On 8/24/21 1:23 AM, Eric Dumazet wrote:
> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>> Hello,
>>>>
>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>>>>
>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>> after skb_expand_head() call in ip6_xmit.
>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>> (!skb-sk) case.
>>>>>
>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>
>>>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>>>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>>>>> ---
>>>>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>>>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index f931176..508d5c4 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>  {
>>>>> +       struct sk_buff *oskb = skb;
>>>>> +       struct sk_buff *nskb = NULL;
>>>>>         int delta = headroom - skb_headroom(skb);
>>>>>
>>>>>         if (WARN_ONCE(delta <= 0,
>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>>         /* pskb_expand_head() might crash, if skb is shared */
>>>>>         if (skb_shared(skb)) {
>>>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> -
>>>>> -               if (likely(nskb)) {
>>>>> -                       if (skb->sk)
>>>>> -                               skb_set_owner_w(nskb, skb->sk);
>>>>> -                       consume_skb(skb);
>>>>> -               } else {
>>>>> -                       kfree_skb(skb);
>>>>> -               }
>>>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>                 skb = nskb;
>>>>>         }
>>>>>         if (skb &&
>>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>> -               kfree_skb(skb);
>>>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>>                 skb = NULL;
>>>>> +
>>>>> +       if (!skb) {
>>>>> +               kfree_skb(oskb);
>>>>> +               if (nskb)
>>>>> +                       kfree_skb(nskb);
>>>>> +       } else if (nskb) {
>>>>> +               if (oskb->sk)
>>>>> +                       skb_set_owner_w(nskb, oskb->sk);
>>>>> +               consume_skb(oskb);
>>>>
>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>> triggers the WARN.
>>>>
>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>
>>>>
>>>
>>> Old code (in skb_realloc_headroom())
>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 
>>>
>>> At this point, skb2->sk was NULL
>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>
>>> I would try :
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>  {
>>>         int delta = headroom - skb_headroom(skb);
>>> +       struct sk_buff *oskb = NULL;
>>>  
>>>         if (WARN_ONCE(delta <= 0,
>>>                       "%s is expecting an increase in the headroom", __func__))
>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>         if (skb_shared(skb)) {
>>>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>  
>>> -               if (likely(nskb)) {
>>> -                       if (skb->sk)
>>> -                               skb_set_owner_w(nskb, skb->sk);
>>> -                       consume_skb(skb);
>>> -               } else {
>>> +               if (unlikely(!nskb)) {
>>>                         kfree_skb(skb);
>>> +                       return NULL;
>>>                 }
>>> +               oskb = skb;
>>>                 skb = nskb;
>>>         }
>>> -       if (skb &&
>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>                 kfree_skb(skb);
>>> -               skb = NULL;
>>> +               kfree_skb(oskb);
>>> +               return NULL;
>>> +       }
>>> +       if (oskb) {
>>> +               skb_set_owner_w(skb, oskb->sk);
>>> +               consume_skb(oskb);
>>>         }
>>>         return skb;
>>>  }
>>
>>
>> Oh well, probably not going to work.
>>
>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.

Can we adjust truesize outside pskb_expand_head()?
Could you please explain why it can be not safe?

> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()

I agree, however as far as I understand it is separate and more global problem.

Thank you,
	Vasily Averin
Vasily Averin Aug. 24, 2021, 5:21 p.m. UTC | #6
On 8/24/21 11:50 AM, Vasily Averin wrote:
> On 8/24/21 1:23 AM, Eric Dumazet wrote:
>> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>>> Hello,
>>>>>
>>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>>>>>
>>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>>> after skb_expand_head() call in ip6_xmit.
>>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>>> (!skb-sk) case.
>>>>>>
>>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>>
>>>>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>>>>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>>>>>> ---
>>>>>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>>>>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>>> index f931176..508d5c4 100644
>>>>>> --- a/net/core/skbuff.c
>>>>>> +++ b/net/core/skbuff.c
>>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>>
>>>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>>  {
>>>>>> +       struct sk_buff *oskb = skb;
>>>>>> +       struct sk_buff *nskb = NULL;
>>>>>>         int delta = headroom - skb_headroom(skb);
>>>>>>
>>>>>>         if (WARN_ONCE(delta <= 0,
>>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>>
>>>>>>         /* pskb_expand_head() might crash, if skb is shared */
>>>>>>         if (skb_shared(skb)) {
>>>>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>> -
>>>>>> -               if (likely(nskb)) {
>>>>>> -                       if (skb->sk)
>>>>>> -                               skb_set_owner_w(nskb, skb->sk);
>>>>>> -                       consume_skb(skb);
>>>>>> -               } else {
>>>>>> -                       kfree_skb(skb);
>>>>>> -               }
>>>>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>>                 skb = nskb;
>>>>>>         }
>>>>>>         if (skb &&
>>>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>>> -               kfree_skb(skb);
>>>>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>>>                 skb = NULL;
>>>>>> +
>>>>>> +       if (!skb) {
>>>>>> +               kfree_skb(oskb);
>>>>>> +               if (nskb)
>>>>>> +                       kfree_skb(nskb);
>>>>>> +       } else if (nskb) {
>>>>>> +               if (oskb->sk)
>>>>>> +                       skb_set_owner_w(nskb, oskb->sk);
>>>>>> +               consume_skb(oskb);
>>>>>
>>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>>> triggers the WARN.
>>>>>
>>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>>
>>>>>
>>>>
>>>> Old code (in skb_realloc_headroom())
>>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 
>>>>
>>>> At this point, skb2->sk was NULL
>>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>>
>>>> I would try :
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>  {
>>>>         int delta = headroom - skb_headroom(skb);
>>>> +       struct sk_buff *oskb = NULL;
>>>>  
>>>>         if (WARN_ONCE(delta <= 0,
>>>>                       "%s is expecting an increase in the headroom", __func__))
>>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>         if (skb_shared(skb)) {
>>>>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>  
>>>> -               if (likely(nskb)) {
>>>> -                       if (skb->sk)
>>>> -                               skb_set_owner_w(nskb, skb->sk);
>>>> -                       consume_skb(skb);
>>>> -               } else {
>>>> +               if (unlikely(!nskb)) {
>>>>                         kfree_skb(skb);
>>>> +                       return NULL;
>>>>                 }
>>>> +               oskb = skb;
>>>>                 skb = nskb;
>>>>         }
>>>> -       if (skb &&
>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>                 kfree_skb(skb);
>>>> -               skb = NULL;
>>>> +               kfree_skb(oskb);
>>>> +               return NULL;
>>>> +       }
>>>> +       if (oskb) {
>>>> +               skb_set_owner_w(skb, oskb->sk);
>>>> +               consume_skb(oskb);
>>>>         }
>>>>         return skb;
>>>>  }
>>>
>>>
>>> Oh well, probably not going to work.
>>>
>>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
> 
> Can we adjust truesize outside pskb_expand_head()?
> Could you please explain why it can be not safe?

Do you mean truesize change should not break balance of sk->sk_wmem_alloc?

>> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
>> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()
> 
> I agree, however as far as I understand it is separate and more global problem.
> 
> Thank you,
> 	Vasily Averin
>
Christoph Paasch Aug. 25, 2021, 5:49 p.m. UTC | #7
On Tue, Aug 24, 2021 at 10:22 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>
> On 8/24/21 11:50 AM, Vasily Averin wrote:
> > On 8/24/21 1:23 AM, Eric Dumazet wrote:
> >> On 8/23/21 2:51 PM, Eric Dumazet wrote:
> >>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
> >>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
> >>>>> Hello,
> >>>>>
> >>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
> >>>>>>
> >>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
> >>>>>> after skb_expand_head() call in ip6_xmit.
> >>>>>> This happen because skb_set_owner_w() for newly clone skb is called
> >>>>>> too early, before pskb_expand_head() where truesize is adjusted for
> >>>>>> (!skb-sk) case.
> >>>>>>
> >>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
> >>>>>>
> >>>>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
> >>>>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
> >>>>>> ---
> >>>>>>  net/core/skbuff.c | 24 +++++++++++++-----------
> >>>>>>  1 file changed, 13 insertions(+), 11 deletions(-)
> >>>>>>
> >>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>>>> index f931176..508d5c4 100644
> >>>>>> --- a/net/core/skbuff.c
> >>>>>> +++ b/net/core/skbuff.c
> >>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
> >>>>>>
> >>>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>>>  {
> >>>>>> +       struct sk_buff *oskb = skb;
> >>>>>> +       struct sk_buff *nskb = NULL;
> >>>>>>         int delta = headroom - skb_headroom(skb);
> >>>>>>
> >>>>>>         if (WARN_ONCE(delta <= 0,
> >>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>>>
> >>>>>>         /* pskb_expand_head() might crash, if skb is shared */
> >>>>>>         if (skb_shared(skb)) {
> >>>>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>>> -
> >>>>>> -               if (likely(nskb)) {
> >>>>>> -                       if (skb->sk)
> >>>>>> -                               skb_set_owner_w(nskb, skb->sk);
> >>>>>> -                       consume_skb(skb);
> >>>>>> -               } else {
> >>>>>> -                       kfree_skb(skb);
> >>>>>> -               }
> >>>>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>>>                 skb = nskb;
> >>>>>>         }
> >>>>>>         if (skb &&
> >>>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>>>> -               kfree_skb(skb);
> >>>>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
> >>>>>>                 skb = NULL;
> >>>>>> +
> >>>>>> +       if (!skb) {
> >>>>>> +               kfree_skb(oskb);
> >>>>>> +               if (nskb)
> >>>>>> +                       kfree_skb(nskb);
> >>>>>> +       } else if (nskb) {
> >>>>>> +               if (oskb->sk)
> >>>>>> +                       skb_set_owner_w(nskb, oskb->sk);
> >>>>>> +               consume_skb(oskb);
> >>>>>
> >>>>> sorry, this does not fix the problem. The syzkaller repro still
> >>>>> triggers the WARN.
> >>>>>
> >>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
> >>>>> __tcp_transmit_skb, where it is skb_clone()'d.
> >>>>>
> >>>>>
> >>>>
> >>>> Old code (in skb_realloc_headroom())
> >>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC);
> >>>>
> >>>> At this point, skb2->sk was NULL
> >>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
> >>>>
> >>>> I would try :
> >>>>
> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> >>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
> >>>> --- a/net/core/skbuff.c
> >>>> +++ b/net/core/skbuff.c
> >>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
> >>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>  {
> >>>>         int delta = headroom - skb_headroom(skb);
> >>>> +       struct sk_buff *oskb = NULL;
> >>>>
> >>>>         if (WARN_ONCE(delta <= 0,
> >>>>                       "%s is expecting an increase in the headroom", __func__))
> >>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
> >>>>         if (skb_shared(skb)) {
> >>>>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
> >>>>
> >>>> -               if (likely(nskb)) {
> >>>> -                       if (skb->sk)
> >>>> -                               skb_set_owner_w(nskb, skb->sk);
> >>>> -                       consume_skb(skb);
> >>>> -               } else {
> >>>> +               if (unlikely(!nskb)) {
> >>>>                         kfree_skb(skb);
> >>>> +                       return NULL;
> >>>>                 }
> >>>> +               oskb = skb;
> >>>>                 skb = nskb;
> >>>>         }
> >>>> -       if (skb &&
> >>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
> >>>>                 kfree_skb(skb);
> >>>> -               skb = NULL;
> >>>> +               kfree_skb(oskb);
> >>>> +               return NULL;
> >>>> +       }
> >>>> +       if (oskb) {
> >>>> +               skb_set_owner_w(skb, oskb->sk);
> >>>> +               consume_skb(oskb);
> >>>>         }
> >>>>         return skb;
> >>>>  }
> >>>
> >>>
> >>> Oh well, probably not going to work.
> >>>
> >>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
> >
> > Can we adjust truesize outside pskb_expand_head()?
> > Could you please explain why it can be not safe?
>
> Do you mean truesize change should not break balance of sk->sk_wmem_alloc?

AFAICS, that's the problem around adjusting truesize. So, maybe "just"
refcount_add the increase of the truesize.

The below does fix the syzkaller bug for me and seems to do the right
thing overall. But I honestly think that this is becoming too hacky
and not worth it... and who knows what other corner-cases this now
exposes...

Maybe a revert is a better course of action?

---
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f9311762cc47..9cc18a0fdd1c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -71,6 +71,7 @@
 #include <net/mpls.h>
 #include <net/mptcp.h>
 #include <net/page_pool.h>
+#include <net/tcp.h>

 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -1756,9 +1757,14 @@ int pskb_expand_head(struct sk_buff *skb, int
nhead, int ntail,
  * For the moment, we really care of rx path, or
  * when skb is orphaned (not attached to a socket).
  */
- if (!skb->sk || skb->destructor == sock_edemux)
+ if (!skb->sk || skb->destructor == sock_edemux || skb->destructor ==
tcp_wfree) {
  skb->truesize += size - osize;

+ if (skb->sk && skb->destructor == tcp_wfree) {
+ refcount_add(size - osize, &skb->sk->sk_wmem_alloc);
+ }
+ }
+
  return 0;

 nofrags:
Vasily Averin Aug. 27, 2021, 3:23 p.m. UTC | #8
On 8/24/21 1:23 AM, Eric Dumazet wrote:
> On 8/23/21 2:51 PM, Eric Dumazet wrote:
>> On 8/23/21 2:45 PM, Eric Dumazet wrote:
>>> On 8/23/21 10:25 AM, Christoph Paasch wrote:
>>>> Hello,
>>>>
>>>> On Mon, Aug 23, 2021 at 12:56 AM Vasily Averin <vvs@virtuozzo.com> wrote:
>>>>>
>>>>> Christoph Paasch reports [1] about incorrect skb->truesize
>>>>> after skb_expand_head() call in ip6_xmit.
>>>>> This happen because skb_set_owner_w() for newly clone skb is called
>>>>> too early, before pskb_expand_head() where truesize is adjusted for
>>>>> (!skb-sk) case.
>>>>>
>>>>> [1] https://lkml.org/lkml/2021/8/20/1082
>>>>>
>>>>> Reported-by: Christoph Paasch <christoph.paasch@gmail.com>
>>>>> Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
>>>>> ---
>>>>>  net/core/skbuff.c | 24 +++++++++++++-----------
>>>>>  1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index f931176..508d5c4 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -1803,6 +1803,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>  {
>>>>> +       struct sk_buff *oskb = skb;
>>>>> +       struct sk_buff *nskb = NULL;
>>>>>         int delta = headroom - skb_headroom(skb);
>>>>>
>>>>>         if (WARN_ONCE(delta <= 0,
>>>>> @@ -1811,21 +1813,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>>>
>>>>>         /* pskb_expand_head() might crash, if skb is shared */
>>>>>         if (skb_shared(skb)) {
>>>>> -               struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>>> -
>>>>> -               if (likely(nskb)) {
>>>>> -                       if (skb->sk)
>>>>> -                               skb_set_owner_w(nskb, skb->sk);
>>>>> -                       consume_skb(skb);
>>>>> -               } else {
>>>>> -                       kfree_skb(skb);
>>>>> -               }
>>>>> +               nskb = skb_clone(skb, GFP_ATOMIC);
>>>>>                 skb = nskb;
>>>>>         }
>>>>>         if (skb &&
>>>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>>> -               kfree_skb(skb);
>>>>> +           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
>>>>>                 skb = NULL;
>>>>> +
>>>>> +       if (!skb) {
>>>>> +               kfree_skb(oskb);
>>>>> +               if (nskb)
>>>>> +                       kfree_skb(nskb);
>>>>> +       } else if (nskb) {
>>>>> +               if (oskb->sk)
>>>>> +                       skb_set_owner_w(nskb, oskb->sk);
>>>>> +               consume_skb(oskb);
>>>>
>>>> sorry, this does not fix the problem. The syzkaller repro still
>>>> triggers the WARN.
>>>>
>>>> When it happens, the skb in ip6_xmit() is not shared as it comes from
>>>> __tcp_transmit_skb, where it is skb_clone()'d.
>>>>
>>>>
>>>
>>> Old code (in skb_realloc_headroom())
>>> was first calling skb2 = skb_clone(skb, GFP_ATOMIC); 
>>>
>>> At this point, skb2->sk was NULL
>>> So pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, ...) was able to tweak skb2->truesize
>>>
>>> I would try :
>>>
>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f9311762cc475bd38d87c33e988d7c983b902e56..326749a8938637b044a616cc33b6a19ed191ac41 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -1804,6 +1804,7 @@ EXPORT_SYMBOL(skb_realloc_headroom);
>>>  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>  {
>>>         int delta = headroom - skb_headroom(skb);
>>> +       struct sk_buff *oskb = NULL;
>>>  
>>>         if (WARN_ONCE(delta <= 0,
>>>                       "%s is expecting an increase in the headroom", __func__))
>>> @@ -1813,19 +1814,21 @@ struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
>>>         if (skb_shared(skb)) {
>>>                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
>>>  
>>> -               if (likely(nskb)) {
>>> -                       if (skb->sk)
>>> -                               skb_set_owner_w(nskb, skb->sk);
>>> -                       consume_skb(skb);
>>> -               } else {
>>> +               if (unlikely(!nskb)) {
>>>                         kfree_skb(skb);
>>> +                       return NULL;
>>>                 }
>>> +               oskb = skb;
>>>                 skb = nskb;
>>>         }
>>> -       if (skb &&
>>> -           pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>> +       if (pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
>>>                 kfree_skb(skb);
>>> -               skb = NULL;
>>> +               kfree_skb(oskb);
>>> +               return NULL;
>>> +       }
>>> +       if (oskb) {
>>> +               skb_set_owner_w(skb, oskb->sk);
>>> +               consume_skb(oskb);
>>>         }
>>>         return skb;
>>>  }
>> Oh well, probably not going to work.
>>
>> We have to find a way to properly increase skb->truesize, even if skb_clone() is _not_ called.
> 
> I also note that current use of skb_set_owner_w(), forcing skb->destructor to sock_wfree()
> is probably breaking TCP Small queues, since original skb->destructor would be tcp_wfree() or __sock_wfree()

I asked Alexey Kuznetsov to look at this problem. Below is his answer:
"I think the current scheme is obsolete. It was created
when we had only two kinds of skb accounting (rmem & wmem)
and with more kinds of accounting it just does not work.
Even there we had ignored problems with adjusting accounting.

Logically the best solution would be replacing ->destructor,
set_owner* etc with skb_ops. Something like:

struct skb_ops
{
        void init(struct sk_buff * skb, struct skb_ops * ops, struct
sock * owner);
        void fini(struct sk_buff * skb);
        void update(struct sk_buff * skb, int adjust);
        void inherit(struct sk_buff * skb2, struct sk_buff * skb);
};

init - is replacement for skb_set_owner_r|w
fini - is replacement for skb_orphan
update - is new operation to be used in places where skb->truesize changes,
       instead of awful constructions like:

       if (!skb->sk || skb->destructor == sock_edemux)
            skb->truesize += size - osize;

       Now it will look like:

       if (skb->ops)
            skb->ops->update(skb, size - osize);

inherit - is replacement for also awful constructs like:

      if (skb->sk)
            skb_set_owner_w(skb2, skb->sk);

      Now it will be:

      if (skb->ops)
            skb->ops->inherit(skb2, skb);

The implementation looks mostly obvious.
Some troubles can be only with new functionality:
update of accounting was never done before.


More efficient, functionally equivalent, but uglier and less flexible
alternative would be removal of ->destructor, replaced with
a small numeric indicator of ownership:

enum
{
        SKB_OWNER_NONE,  /* aka destructor == NULL */
        SKB_OWNER_WMEM,  /* aka destructor == sk_wfree */
        SKB_OWNER_RMEM,  /* aka destructor == sk_rfree */
        SKB_OWNER_SK,    /* aka destructor == sk_edemux */
        SKB_OWNER_TCP,   /* aka destructor == tcp_wfree */
}

And the same init,fini,inherit,update become functions
w/o any inidirect calls. Not sure it is really more efficient though."

Thank you,
	Vasily Averin
Eric Dumazet Aug. 27, 2021, 4:47 p.m. UTC | #9
On 8/27/21 8:23 AM, Vasily Averin wrote:

> I asked Alexey Kuznetsov to look at this problem. Below is his answer:
> "I think the current scheme is obsolete. It was created
> when we had only two kinds of skb accounting (rmem & wmem)
> and with more kinds of accounting it just does not work.
> Even there we had ignored problems with adjusting accounting.
> 
> Logically the best solution would be replacing ->destructor,
> set_owner* etc with skb_ops. Something like:
> 
> struct skb_ops
> {
>         void init(struct sk_buff * skb, struct skb_ops * ops, struct
> sock * owner);
>         void fini(struct sk_buff * skb);
>         void update(struct sk_buff * skb, int adjust);
>         void inherit(struct sk_buff * skb2, struct sk_buff * skb);
> };
> 
> init - is replacement for skb_set_owner_r|w
> fini - is replacement for skb_orphan
> update - is new operation to be used in places where skb->truesize changes,
>        instead of awful constructions like:
> 
>        if (!skb->sk || skb->destructor == sock_edemux)
>             skb->truesize += size - osize;
> 
>        Now it will look like:
> 
>        if (skb->ops)
>             skb->ops->update(skb, size - osize);
> 
> inherit - is replacement for also awful constructs like:
> 
>       if (skb->sk)
>             skb_set_owner_w(skb2, skb->sk);
> 
>       Now it will be:
> 
>       if (skb->ops)
>             skb->ops->inherit(skb2, skb);
> 
> The implementation looks mostly obvious.
> Some troubles can be only with new functionality:
> update of accounting was never done before.
> 
> 
> More efficient, functionally equivalent, but uglier and less flexible
> alternative would be removal of ->destructor, replaced with
> a small numeric indicator of ownership:
> 
> enum
> {
>         SKB_OWNER_NONE,  /* aka destructor == NULL */
>         SKB_OWNER_WMEM,  /* aka destructor == sk_wfree */
>         SKB_OWNER_RMEM,  /* aka destructor == sk_rfree */
>         SKB_OWNER_SK,    /* aka destructor == sk_edemux */
>         SKB_OWNER_TCP,   /* aka destructor == tcp_wfree */
> }
> 
> And the same init,fini,inherit,update become functions
> w/o any inidirect calls. Not sure it is really more efficient though."
> 

Well, this does not look as stable material, and would add a bunch
of indirect calls which are quite expensive these days (CONFIG_RETPOLINE=y)

I suggest we work on a fix, using existing infra, then eventually later
try to refactor if this is really bringing improvements.

A fix could simply be a revert of 0c9f227bee119 ("ipv6: use skb_expand_head in ip6_xmit")
since only IPv6 has the problem (because of arbitrary headers size)
Vasily Averin Aug. 28, 2021, 8:01 a.m. UTC | #10
On 8/27/21 7:47 PM, Eric Dumazet wrote:
> 
> 
> On 8/27/21 8:23 AM, Vasily Averin wrote:
> 
>> I asked Alexey Kuznetsov to look at this problem. Below is his answer:
>> "I think the current scheme is obsolete. It was created
>> when we had only two kinds of skb accounting (rmem & wmem)
>> and with more kinds of accounting it just does not work.
>> Even there we had ignored problems with adjusting accounting.
>>
>> Logically the best solution would be replacing ->destructor,
>> set_owner* etc with skb_ops. Something like:
>>
>> struct skb_ops
>> {
>>         void init(struct sk_buff * skb, struct skb_ops * ops, struct
>> sock * owner);
>>         void fini(struct sk_buff * skb);
>>         void update(struct sk_buff * skb, int adjust);
>>         void inherit(struct sk_buff * skb2, struct sk_buff * skb);
>> };
>>
>> init - is replacement for skb_set_owner_r|w
>> fini - is replacement for skb_orphan
>> update - is new operation to be used in places where skb->truesize changes,
>>        instead of awful constructions like:
>>
>>        if (!skb->sk || skb->destructor == sock_edemux)
>>             skb->truesize += size - osize;
>>
>>        Now it will look like:
>>
>>        if (skb->ops)
>>             skb->ops->update(skb, size - osize);
>>
>> inherit - is replacement for also awful constructs like:
>>
>>       if (skb->sk)
>>             skb_set_owner_w(skb2, skb->sk);
>>
>>       Now it will be:
>>
>>       if (skb->ops)
>>             skb->ops->inherit(skb2, skb);
>>
>> The implementation looks mostly obvious.
>> Some troubles can be only with new functionality:
>> update of accounting was never done before.
>>
>>
>> More efficient, functionally equivalent, but uglier and less flexible
>> alternative would be removal of ->destructor, replaced with
>> a small numeric indicator of ownership:
>>
>> enum
>> {
>>         SKB_OWNER_NONE,  /* aka destructor == NULL */
>>         SKB_OWNER_WMEM,  /* aka destructor == sk_wfree */
>>         SKB_OWNER_RMEM,  /* aka destructor == sk_rfree */
>>         SKB_OWNER_SK,    /* aka destructor == sk_edemux */
>>         SKB_OWNER_TCP,   /* aka destructor == tcp_wfree */
>> }
>>
>> And the same init,fini,inherit,update become functions
>> w/o any inidirect calls. Not sure it is really more efficient though."
>>
> 
> Well, this does not look as stable material, and would add a bunch
> of indirect calls which are quite expensive these days (CONFIG_RETPOLINE=y)
> 
> I suggest we work on a fix, using existing infra, then eventually later
> try to refactor if this is really bringing improvements.
> 
> A fix could simply be a revert of 0c9f227bee119 ("ipv6: use skb_expand_head in ip6_xmit")
> since only IPv6 has the problem (because of arbitrary headers size)

I think it is not enough.

Root of the problem is that skb_expand_head() works incorrectly with non-shared skb.
In this case it do not call skb_clone before pskb_expand_head() execution,
and as result pskb_expand_head() and does not adjust skb->truesize.

I think non-shared skb is more frequent case,
so all skb_expand_head() are affected.

Therefore we need to revert all my patch set in net-next:
f1260ff skbuff: introduce skb_expand_head()
e415ed3 ipv6: use skb_expand_head in ip6_finish_output2
0c9f227 ipv6: use skb_expand_head in ip6_xmit
5678a59 ipv4: use skb_expand_head in ip_finish_output2
14ee70c vrf: use skb_expand_head in vrf_finish_output
53744a4 ax25: use skb_expand_head
a1e975e bpf: use skb_expand_head in bpf_out_neigh_v4/6
07e1d6b Merge branch 'skb_expand_head'
with fixup
06669e6 vrf: fix NULL dereference in vrf_finish_output()

And then rework ip6_finish_output2() in upstream, 
to call skb_realloc_headroom() like it was done in first patch version:
https://lkml.org/lkml/2021/7/7/469.

Thank you,
	Vasily Averin
diff mbox series

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f931176..508d5c4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1803,6 +1803,8 @@  struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 
 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 {
+	struct sk_buff *oskb = skb;
+	struct sk_buff *nskb = NULL;
 	int delta = headroom - skb_headroom(skb);
 
 	if (WARN_ONCE(delta <= 0,
@@ -1811,21 +1813,21 @@  struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
 
 	/* pskb_expand_head() might crash, if skb is shared */
 	if (skb_shared(skb)) {
-		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
-		if (likely(nskb)) {
-			if (skb->sk)
-				skb_set_owner_w(nskb, skb->sk);
-			consume_skb(skb);
-		} else {
-			kfree_skb(skb);
-		}
+		nskb = skb_clone(skb, GFP_ATOMIC);
 		skb = nskb;
 	}
 	if (skb &&
-	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
-		kfree_skb(skb);
+	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
 		skb = NULL;
+
+	if (!skb) {
+		kfree_skb(oskb);
+		if (nskb)
+			kfree_skb(nskb);
+	} else if (nskb) {
+		if (oskb->sk)
+			skb_set_owner_w(nskb, oskb->sk);
+		consume_skb(oskb);
 	}
 	return skb;
 }