diff mbox series

[for-4.19] x86/altcall: fix clang code-gen when using altcall in loop constructs

Message ID 20240723093117.99487-1-roger.pau@citrix.com (mailing list archive)
State New
Headers show
Series [for-4.19] x86/altcall: fix clang code-gen when using altcall in loop constructs | expand

Commit Message

Roger Pau Monné July 23, 2024, 9:31 a.m. UTC
Yet another clang code generation issue when using altcalls.

The issue this time is with using loop constructs around alternative_{,v}call
instances using parameter types smaller than the register size.

Given the following example code:

static void bar(bool b)
{
    unsigned int i;

    for ( i = 0; i < 10; i++ )
    {
        int ret_;
        register union {
            bool e;
            unsigned long r;
        } di asm("rdi") = { .e = b };
        register unsigned long si asm("rsi");
        register unsigned long dx asm("rdx");
        register unsigned long cx asm("rcx");
        register unsigned long r8 asm("r8");
        register unsigned long r9 asm("r9");
        register unsigned long r10 asm("r10");
        register unsigned long r11 asm("r11");

        asm volatile ( "call %c[addr]"
                       : "+r" (di), "=r" (si), "=r" (dx),
                         "=r" (cx), "=r" (r8), "=r" (r9),
                         "=r" (r10), "=r" (r11), "=a" (ret_)
                       : [addr] "i" (&(func)), "g" (func)
                       : "memory" );
    }
}

See: https://godbolt.org/z/qvxMGd84q

Clang will generate machine code that only resets the low 8 bits of %rdi
between loop calls, leaving the rest of the register possibly containing
garbage from the use of %rdi inside the called function.  Note also that clang
doesn't truncate the input parameters at the callee, thus breaking the psABI.

Fix this by turning the `e` element in the anonymous union into an array that
consumes the same space as an unsigned long, as this forces clang to reset the
whole %rdi register instead of just the low 8 bits.

Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
Suggested-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Adding Oleksii as to whether this could be considered for 4.19: it's strictly
limited to clang builds, plus will need to be backported anyway.
---
 xen/arch/x86/include/asm/alternative.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Comments

Oleksii Kurochko July 23, 2024, 9:56 a.m. UTC | #1
On Tue, 2024-07-23 at 11:31 +0200, Roger Pau Monne wrote:
> Yet another clang code generation issue when using altcalls.
> 
> The issue this time is with using loop constructs around
> alternative_{,v}call
> instances using parameter types smaller than the register size.
> 
> Given the following example code:
> 
> static void bar(bool b)
> {
>     unsigned int i;
> 
>     for ( i = 0; i < 10; i++ )
>     {
>         int ret_;
>         register union {
>             bool e;
>             unsigned long r;
>         } di asm("rdi") = { .e = b };
>         register unsigned long si asm("rsi");
>         register unsigned long dx asm("rdx");
>         register unsigned long cx asm("rcx");
>         register unsigned long r8 asm("r8");
>         register unsigned long r9 asm("r9");
>         register unsigned long r10 asm("r10");
>         register unsigned long r11 asm("r11");
> 
>         asm volatile ( "call %c[addr]"
>                        : "+r" (di), "=r" (si), "=r" (dx),
>                          "=r" (cx), "=r" (r8), "=r" (r9),
>                          "=r" (r10), "=r" (r11), "=a" (ret_)
>                        : [addr] "i" (&(func)), "g" (func)
>                        : "memory" );
>     }
> }
> 
> See: https://godbolt.org/z/qvxMGd84q
> 
> Clang will generate machine code that only resets the low 8 bits of
> %rdi
> between loop calls, leaving the rest of the register possibly
> containing
> garbage from the use of %rdi inside the called function.  Note also
> that clang
> doesn't truncate the input parameters at the callee, thus breaking
> the psABI.
> 
> Fix this by turning the `e` element in the anonymous union into an
> array that
> consumes the same space as an unsigned long, as this forces clang to
> reset the
> whole %rdi register instead of just the low 8 bits.
> 
> Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for
> function parameters on clang')
> Suggested-by: Jan Beulich <jbeulich@suse.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Adding Oleksii as to whether this could be considered for 4.19: it's
> strictly
> limited to clang builds, plus will need to be backported anyway.
> ---
I am okay to have this change in 4.19 but then it should be reviewed
and merged ASAP as the tree was tagged few minutes ago and I expected
that it will be the last one RC in this release cycle:

Release-Acked-By: Oleksii Kurochko <oleksii.kurochko@gmail.com>

~ Oleksii

>  xen/arch/x86/include/asm/alternative.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/x86/include/asm/alternative.h
> b/xen/arch/x86/include/asm/alternative.h
> index 0d3697f1de49..e63b45927643 100644
> --- a/xen/arch/x86/include/asm/alternative.h
> +++ b/xen/arch/x86/include/asm/alternative.h
> @@ -185,10 +185,10 @@ extern void alternative_branches(void);
>   */
>  #define ALT_CALL_ARG(arg,
> n)                                            \
>      register union
> {                                                    \
> -        typeof(arg)
> e;                                                  \
> +        typeof(arg) e[sizeof(long) /
> sizeof(arg)];                      \
>          unsigned long
> r;                                                \
>      } a ## n ## _ asm ( ALT_CALL_arg ## n ) =
> {                         \
> -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg);
> })   \
> +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *));
> (arg); })\
>      }
>  #else
>  #define ALT_CALL_ARG(arg, n) \
Jan Beulich July 23, 2024, 10:07 a.m. UTC | #2
On 23.07.2024 11:31, Roger Pau Monne wrote:
> Yet another clang code generation issue when using altcalls.
> 
> The issue this time is with using loop constructs around alternative_{,v}call
> instances using parameter types smaller than the register size.
> 
> Given the following example code:
> 
> static void bar(bool b)
> {
>     unsigned int i;
> 
>     for ( i = 0; i < 10; i++ )
>     {
>         int ret_;
>         register union {
>             bool e;
>             unsigned long r;
>         } di asm("rdi") = { .e = b };
>         register unsigned long si asm("rsi");
>         register unsigned long dx asm("rdx");
>         register unsigned long cx asm("rcx");
>         register unsigned long r8 asm("r8");
>         register unsigned long r9 asm("r9");
>         register unsigned long r10 asm("r10");
>         register unsigned long r11 asm("r11");
> 
>         asm volatile ( "call %c[addr]"
>                        : "+r" (di), "=r" (si), "=r" (dx),
>                          "=r" (cx), "=r" (r8), "=r" (r9),
>                          "=r" (r10), "=r" (r11), "=a" (ret_)
>                        : [addr] "i" (&(func)), "g" (func)
>                        : "memory" );
>     }
> }
> 
> See: https://godbolt.org/z/qvxMGd84q
> 
> Clang will generate machine code that only resets the low 8 bits of %rdi
> between loop calls, leaving the rest of the register possibly containing
> garbage from the use of %rdi inside the called function.  Note also that clang
> doesn't truncate the input parameters at the callee, thus breaking the psABI.
> 
> Fix this by turning the `e` element in the anonymous union into an array that
> consumes the same space as an unsigned long, as this forces clang to reset the
> whole %rdi register instead of just the low 8 bits.
> 
> Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
> Suggested-by: Jan Beulich <jbeulich@suse.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
Alejandro Vallejo July 23, 2024, 3:37 p.m. UTC | #3
On Tue Jul 23, 2024 at 10:31 AM BST, Roger Pau Monne wrote:
> Yet another clang code generation issue when using altcalls.
>
> The issue this time is with using loop constructs around alternative_{,v}call
> instances using parameter types smaller than the register size.
>
> Given the following example code:
>
> static void bar(bool b)
> {
>     unsigned int i;
>
>     for ( i = 0; i < 10; i++ )
>     {
>         int ret_;
>         register union {
>             bool e;
>             unsigned long r;
>         } di asm("rdi") = { .e = b };
>         register unsigned long si asm("rsi");
>         register unsigned long dx asm("rdx");
>         register unsigned long cx asm("rcx");
>         register unsigned long r8 asm("r8");
>         register unsigned long r9 asm("r9");
>         register unsigned long r10 asm("r10");
>         register unsigned long r11 asm("r11");
>
>         asm volatile ( "call %c[addr]"
>                        : "+r" (di), "=r" (si), "=r" (dx),
>                          "=r" (cx), "=r" (r8), "=r" (r9),
>                          "=r" (r10), "=r" (r11), "=a" (ret_)
>                        : [addr] "i" (&(func)), "g" (func)
>                        : "memory" );
>     }
> }
>
> See: https://godbolt.org/z/qvxMGd84q
>
> Clang will generate machine code that only resets the low 8 bits of %rdi
> between loop calls, leaving the rest of the register possibly containing
> garbage from the use of %rdi inside the called function.  Note also that clang
> doesn't truncate the input parameters at the callee, thus breaking the psABI.
>
> Fix this by turning the `e` element in the anonymous union into an array that
> consumes the same space as an unsigned long, as this forces clang to reset the
> whole %rdi register instead of just the low 8 bits.
>
> Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
> Suggested-by: Jan Beulich <jbeulich@suse.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Adding Oleksii as to whether this could be considered for 4.19: it's strictly
> limited to clang builds, plus will need to be backported anyway.
> ---
>  xen/arch/x86/include/asm/alternative.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
> index 0d3697f1de49..e63b45927643 100644
> --- a/xen/arch/x86/include/asm/alternative.h
> +++ b/xen/arch/x86/include/asm/alternative.h
> @@ -185,10 +185,10 @@ extern void alternative_branches(void);
>   */
>  #define ALT_CALL_ARG(arg, n)                                            \
>      register union {                                                    \
> -        typeof(arg) e;                                                  \
> +        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
>          unsigned long r;                                                \
>      } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
> +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
>      }
>  #else
>  #define ALT_CALL_ARG(arg, n) \

Don't we want BUILD_BUG_ON(sizeof(long) % sizeof(arg) == 0) instead? Otherwise
odd sizes will cause the wrong union size to prevail, and while I can't see
today how those might come to happen there's Murphy's law.

Cheers,
Alejandro
Roger Pau Monné July 23, 2024, 4:09 p.m. UTC | #4
On Tue, Jul 23, 2024 at 04:37:12PM +0100, Alejandro Vallejo wrote:
> On Tue Jul 23, 2024 at 10:31 AM BST, Roger Pau Monne wrote:
> > Yet another clang code generation issue when using altcalls.
> >
> > The issue this time is with using loop constructs around alternative_{,v}call
> > instances using parameter types smaller than the register size.
> >
> > Given the following example code:
> >
> > static void bar(bool b)
> > {
> >     unsigned int i;
> >
> >     for ( i = 0; i < 10; i++ )
> >     {
> >         int ret_;
> >         register union {
> >             bool e;
> >             unsigned long r;
> >         } di asm("rdi") = { .e = b };
> >         register unsigned long si asm("rsi");
> >         register unsigned long dx asm("rdx");
> >         register unsigned long cx asm("rcx");
> >         register unsigned long r8 asm("r8");
> >         register unsigned long r9 asm("r9");
> >         register unsigned long r10 asm("r10");
> >         register unsigned long r11 asm("r11");
> >
> >         asm volatile ( "call %c[addr]"
> >                        : "+r" (di), "=r" (si), "=r" (dx),
> >                          "=r" (cx), "=r" (r8), "=r" (r9),
> >                          "=r" (r10), "=r" (r11), "=a" (ret_)
> >                        : [addr] "i" (&(func)), "g" (func)
> >                        : "memory" );
> >     }
> > }
> >
> > See: https://godbolt.org/z/qvxMGd84q
> >
> > Clang will generate machine code that only resets the low 8 bits of %rdi
> > between loop calls, leaving the rest of the register possibly containing
> > garbage from the use of %rdi inside the called function.  Note also that clang
> > doesn't truncate the input parameters at the callee, thus breaking the psABI.
> >
> > Fix this by turning the `e` element in the anonymous union into an array that
> > consumes the same space as an unsigned long, as this forces clang to reset the
> > whole %rdi register instead of just the low 8 bits.
> >
> > Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
> > Suggested-by: Jan Beulich <jbeulich@suse.com>
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > Adding Oleksii as to whether this could be considered for 4.19: it's strictly
> > limited to clang builds, plus will need to be backported anyway.
> > ---
> >  xen/arch/x86/include/asm/alternative.h | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
> > index 0d3697f1de49..e63b45927643 100644
> > --- a/xen/arch/x86/include/asm/alternative.h
> > +++ b/xen/arch/x86/include/asm/alternative.h
> > @@ -185,10 +185,10 @@ extern void alternative_branches(void);
> >   */
> >  #define ALT_CALL_ARG(arg, n)                                            \
> >      register union {                                                    \
> > -        typeof(arg) e;                                                  \
> > +        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
> >          unsigned long r;                                                \
> >      } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> > -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
> > +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
> >      }
> >  #else
> >  #define ALT_CALL_ARG(arg, n) \
> 
> Don't we want BUILD_BUG_ON(sizeof(long) % sizeof(arg) == 0) instead?

I think you meant BUILD_BUG_ON(sizeof(long) % sizeof(arg) != 0)?

> Otherwise
> odd sizes will cause the wrong union size to prevail, and while I can't see
> today how those might come to happen there's Murphy's law.

The overall union size would still be fine, because it has the
unsigned long element, it's just that the array won't cover all the
space assigned to the long member?

IOW if sizeof(arg) == 7, then we would define an array with only 1
element, which won't make the size of the union change, but won't
cover the same space that's used by the long member.

However it's not possible for sizeof(arg) > 8 due to the existing
BUILD_BUG_ON(), so the union can never be bigger than a long.

Thanks, Roger.
Alejandro Vallejo July 23, 2024, 4:24 p.m. UTC | #5
On Tue Jul 23, 2024 at 5:09 PM BST, Roger Pau Monné wrote:
> On Tue, Jul 23, 2024 at 04:37:12PM +0100, Alejandro Vallejo wrote:
> > On Tue Jul 23, 2024 at 10:31 AM BST, Roger Pau Monne wrote:
> > > Clang will generate machine code that only resets the low 8 bits of %rdi
> > > between loop calls, leaving the rest of the register possibly containing
> > > garbage from the use of %rdi inside the called function.  Note also that clang
> > > doesn't truncate the input parameters at the callee, thus breaking the psABI.
> > >
> > > Fix this by turning the `e` element in the anonymous union into an array that
> > > consumes the same space as an unsigned long, as this forces clang to reset the
> > > whole %rdi register instead of just the low 8 bits.
> > >
> > > Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
> > > Suggested-by: Jan Beulich <jbeulich@suse.com>
> > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > > ---
> > > Adding Oleksii as to whether this could be considered for 4.19: it's strictly
> > > limited to clang builds, plus will need to be backported anyway.
> > > ---
> > >  xen/arch/x86/include/asm/alternative.h | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
> > > index 0d3697f1de49..e63b45927643 100644
> > > --- a/xen/arch/x86/include/asm/alternative.h
> > > +++ b/xen/arch/x86/include/asm/alternative.h
> > > @@ -185,10 +185,10 @@ extern void alternative_branches(void);
> > >   */
> > >  #define ALT_CALL_ARG(arg, n)                                            \
> > >      register union {                                                    \
> > > -        typeof(arg) e;                                                  \
> > > +        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
> > >          unsigned long r;                                                \
> > >      } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> > > -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
> > > +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
> > >      }
> > >  #else
> > >  #define ALT_CALL_ARG(arg, n) \
> > 
> > Don't we want BUILD_BUG_ON(sizeof(long) % sizeof(arg) == 0) instead?
>
> I think you meant BUILD_BUG_ON(sizeof(long) % sizeof(arg) != 0)?

Bah, yes. I wrote it as a COMPILE_ASSERT().

>
> > Otherwise
> > odd sizes will cause the wrong union size to prevail, and while I can't see
> > today how those might come to happen there's Murphy's law.
>
> The overall union size would still be fine, because it has the
> unsigned long element, it's just that the array won't cover all the
> space assigned to the long member?

I explained myself poorly. If the current BUILD_BUG_ON() stays as-is that's
right, but...

>
> IOW if sizeof(arg) == 7, then we would define an array with only 1
> element, which won't make the size of the union change, but won't
> cover the same space that's used by the long member.

... I thought the point of the patch was to cover the full union with the
array, and not just a subset. My proposed alternative merely tries to ensure
the argument is always a submultiple in size of a long so the array is always a
perfect match.

Though admittedly, it wouldn't be rare for this to be enough to work around the
bug.

>
> However it's not possible for sizeof(arg) > 8 due to the existing
> BUILD_BUG_ON(), so the union can never be bigger than a long.
>
> Thanks, Roger.

Cheers,
Alejandro
Jan Beulich July 24, 2024, 6:39 a.m. UTC | #6
On 23.07.2024 18:24, Alejandro Vallejo wrote:
> On Tue Jul 23, 2024 at 5:09 PM BST, Roger Pau Monné wrote:
>> On Tue, Jul 23, 2024 at 04:37:12PM +0100, Alejandro Vallejo wrote:
>>> On Tue Jul 23, 2024 at 10:31 AM BST, Roger Pau Monne wrote:
>>>> --- a/xen/arch/x86/include/asm/alternative.h
>>>> +++ b/xen/arch/x86/include/asm/alternative.h
>>>> @@ -185,10 +185,10 @@ extern void alternative_branches(void);
>>>>   */
>>>>  #define ALT_CALL_ARG(arg, n)                                            \
>>>>      register union {                                                    \
>>>> -        typeof(arg) e;                                                  \
>>>> +        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
>>>>          unsigned long r;                                                \
>>>>      } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
>>>> -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
>>>> +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
>>>>      }
>>>>  #else
>>>>  #define ALT_CALL_ARG(arg, n) \
>>>
>>> Don't we want BUILD_BUG_ON(sizeof(long) % sizeof(arg) == 0) instead?
>>
>> I think you meant BUILD_BUG_ON(sizeof(long) % sizeof(arg) != 0)?
> 
> Bah, yes. I wrote it as a COMPILE_ASSERT().
> 
>>
>>> Otherwise
>>> odd sizes will cause the wrong union size to prevail, and while I can't see
>>> today how those might come to happen there's Murphy's law.
>>
>> The overall union size would still be fine, because it has the
>> unsigned long element, it's just that the array won't cover all the
>> space assigned to the long member?
> 
> I explained myself poorly. If the current BUILD_BUG_ON() stays as-is that's
> right, but...
> 
>>
>> IOW if sizeof(arg) == 7, then we would define an array with only 1
>> element, which won't make the size of the union change, but won't
>> cover the same space that's used by the long member.
> 
> ... I thought the point of the patch was to cover the full union with the
> array, and not just a subset. My proposed alternative merely tries to ensure
> the argument is always a submultiple in size of a long so the array is always a
> perfect match.

Question is whether there's an issue with odd sized values in Clang. I
wouldn't want to exclude such (admittedly somewhat exotic) uses "just
in case". My understanding here is that the issue the patch addresses
is not merely the treatment of the union by Clang, but the combination
thereof with it violating the psABI when it comes to passing bool
around.

Jan
Roger Pau Monné July 24, 2024, 8:28 a.m. UTC | #7
On Wed, Jul 24, 2024 at 08:39:05AM +0200, Jan Beulich wrote:
> On 23.07.2024 18:24, Alejandro Vallejo wrote:
> > On Tue Jul 23, 2024 at 5:09 PM BST, Roger Pau Monné wrote:
> >> On Tue, Jul 23, 2024 at 04:37:12PM +0100, Alejandro Vallejo wrote:
> >>> On Tue Jul 23, 2024 at 10:31 AM BST, Roger Pau Monne wrote:
> >>>> --- a/xen/arch/x86/include/asm/alternative.h
> >>>> +++ b/xen/arch/x86/include/asm/alternative.h
> >>>> @@ -185,10 +185,10 @@ extern void alternative_branches(void);
> >>>>   */
> >>>>  #define ALT_CALL_ARG(arg, n)                                            \
> >>>>      register union {                                                    \
> >>>> -        typeof(arg) e;                                                  \
> >>>> +        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
> >>>>          unsigned long r;                                                \
> >>>>      } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> >>>> -        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
> >>>> +        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
> >>>>      }
> >>>>  #else
> >>>>  #define ALT_CALL_ARG(arg, n) \
> >>>
> >>> Don't we want BUILD_BUG_ON(sizeof(long) % sizeof(arg) == 0) instead?
> >>
> >> I think you meant BUILD_BUG_ON(sizeof(long) % sizeof(arg) != 0)?
> > 
> > Bah, yes. I wrote it as a COMPILE_ASSERT().
> > 
> >>
> >>> Otherwise
> >>> odd sizes will cause the wrong union size to prevail, and while I can't see
> >>> today how those might come to happen there's Murphy's law.
> >>
> >> The overall union size would still be fine, because it has the
> >> unsigned long element, it's just that the array won't cover all the
> >> space assigned to the long member?
> > 
> > I explained myself poorly. If the current BUILD_BUG_ON() stays as-is that's
> > right, but...
> > 
> >>
> >> IOW if sizeof(arg) == 7, then we would define an array with only 1
> >> element, which won't make the size of the union change, but won't
> >> cover the same space that's used by the long member.
> > 
> > ... I thought the point of the patch was to cover the full union with the
> > array, and not just a subset. My proposed alternative merely tries to ensure
> > the argument is always a submultiple in size of a long so the array is always a
> > perfect match.

I would be fine with the adjusted BUILD_BUG_ON(), but if we change the
instance for clang we should also update the BUILD_BUG_ON() used on
the gcc counterpart so they both match.  That might be undesirable
however since gcc doesn't exhibit any of those bugs, and hence
shouldn't have such constraints.

> Question is whether there's an issue with odd sized values in Clang. I
> wouldn't want to exclude such (admittedly somewhat exotic) uses "just
> in case". My understanding here is that the issue the patch addresses
> is not merely the treatment of the union by Clang, but the combination
> thereof with it violating the psABI when it comes to passing bool
> around.

So there are at least two issues, one is the lack of truncation of
register value done by the callee, which is a psABI violation.  The
other is clang not resetting the high bits of the register when the
altcall is inside a loop construct.  In the godbolt example provided
the high bits of %rdi are not cleared between loops.  This last issue
would also be 'fixed' if clang implemented the psABI properly and
truncated the register at the callee.

I've given a try in godbolt, and odd structure sizes seem to be
affected:

https://godbolt.org/z/778YsoWsn

We have no usages of such structures in Xen so far.

The only way I've found to cope with this is to use something like:

#define ALT_CALL_ARG(arg, n)                                            \
    union {                                                             \
        typeof(arg) e[(sizeof(long) + sizeof(arg) - 1) / sizeof(arg)];  \
        unsigned long r;                                                \
    } a ## n ## __  = {                                                 \
        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
    };                                                                  \
    register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) =      \
        a ## n ## __.r

An oversized array that ensures all the space of the long is covered
by the array, but then we need an extra variable, as we would
otherwise require modifying ALT_CALL{0..6}_OUT to use aX_.r instead of
aX_.

Thanks, Roger.
Jan Beulich July 24, 2024, 8:41 a.m. UTC | #8
On 24.07.2024 10:28, Roger Pau Monné wrote:
> The only way I've found to cope with this is to use something like:
> 
> #define ALT_CALL_ARG(arg, n)                                            \
>     union {                                                             \
>         typeof(arg) e[(sizeof(long) + sizeof(arg) - 1) / sizeof(arg)];  \
>         unsigned long r;                                                \
>     } a ## n ## __  = {                                                 \
>         .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
>     };                                                                  \
>     register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) =      \
>         a ## n ## __.r
> 
> An oversized array that ensures all the space of the long is covered
> by the array, but then we need an extra variable, as we would
> otherwise require modifying ALT_CALL{0..6}_OUT to use aX_.r instead of
> aX_.

I don't think we need to over-size the array. It just gets yet a little
more complex then:

#define ALT_CALL_ARG(arg, n)                                            \
    register union {                                                    \
        struct {                                                        \
            typeof(arg) e;                                              \
            char pad[sizeof(long) - sizeof(arg)];                       \
        } s;                                                            \
        unsigned long r;                                                \
    } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
        .s.e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \
    }

Jan
Roger Pau Monné July 24, 2024, 9:13 a.m. UTC | #9
On Wed, Jul 24, 2024 at 10:41:43AM +0200, Jan Beulich wrote:
> On 24.07.2024 10:28, Roger Pau Monné wrote:
> > The only way I've found to cope with this is to use something like:
> > 
> > #define ALT_CALL_ARG(arg, n)                                            \
> >     union {                                                             \
> >         typeof(arg) e[(sizeof(long) + sizeof(arg) - 1) / sizeof(arg)];  \
> >         unsigned long r;                                                \
> >     } a ## n ## __  = {                                                 \
> >         .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
> >     };                                                                  \
> >     register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) =      \
> >         a ## n ## __.r
> > 
> > An oversized array that ensures all the space of the long is covered
> > by the array, but then we need an extra variable, as we would
> > otherwise require modifying ALT_CALL{0..6}_OUT to use aX_.r instead of
> > aX_.
> 
> I don't think we need to over-size the array. It just gets yet a little
> more complex then:
> 
> #define ALT_CALL_ARG(arg, n)                                            \
>     register union {                                                    \
>         struct {                                                        \
>             typeof(arg) e;                                              \
>             char pad[sizeof(long) - sizeof(arg)];                       \
>         } s;                                                            \
>         unsigned long r;                                                \
>     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
>         .s.e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \
>     }

We could even simplify this, there's no need for the union anymore,
since struct size == sizeof(long) already:

#define ALT_CALL_ARG(arg, n)                                            \
    register struct {                                                   \
        typeof(arg) e;                                                  \
        char pad[sizeof(long) - sizeof(arg)];                           \
    } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
    }

The above seems to work for both the original issue and the new one.

Thanks, Roger.
Jan Beulich July 24, 2024, 9:16 a.m. UTC | #10
On 24.07.2024 11:13, Roger Pau Monné wrote:
> On Wed, Jul 24, 2024 at 10:41:43AM +0200, Jan Beulich wrote:
>> On 24.07.2024 10:28, Roger Pau Monné wrote:
>>> The only way I've found to cope with this is to use something like:
>>>
>>> #define ALT_CALL_ARG(arg, n)                                            \
>>>     union {                                                             \
>>>         typeof(arg) e[(sizeof(long) + sizeof(arg) - 1) / sizeof(arg)];  \
>>>         unsigned long r;                                                \
>>>     } a ## n ## __  = {                                                 \
>>>         .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
>>>     };                                                                  \
>>>     register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) =      \
>>>         a ## n ## __.r
>>>
>>> An oversized array that ensures all the space of the long is covered
>>> by the array, but then we need an extra variable, as we would
>>> otherwise require modifying ALT_CALL{0..6}_OUT to use aX_.r instead of
>>> aX_.
>>
>> I don't think we need to over-size the array. It just gets yet a little
>> more complex then:
>>
>> #define ALT_CALL_ARG(arg, n)                                            \
>>     register union {                                                    \
>>         struct {                                                        \
>>             typeof(arg) e;                                              \
>>             char pad[sizeof(long) - sizeof(arg)];                       \
>>         } s;                                                            \
>>         unsigned long r;                                                \
>>     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
>>         .s.e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \
>>     }
> 
> We could even simplify this, there's no need for the union anymore,
> since struct size == sizeof(long) already:
> 
> #define ALT_CALL_ARG(arg, n)                                            \
>     register struct {                                                   \
>         typeof(arg) e;                                                  \
>         char pad[sizeof(long) - sizeof(arg)];                           \
>     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
>         .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
>     }
> 
> The above seems to work for both the original issue and the new one.

Oh, good. If the compiler's happy with not being dealt an unsigned long,
even better. Maybe for consistency we want to use sizeof(void *) also in
pad's definition then.

Jan
Roger Pau Monné July 24, 2024, 9:23 a.m. UTC | #11
On Wed, Jul 24, 2024 at 11:16:52AM +0200, Jan Beulich wrote:
> On 24.07.2024 11:13, Roger Pau Monné wrote:
> > On Wed, Jul 24, 2024 at 10:41:43AM +0200, Jan Beulich wrote:
> >> On 24.07.2024 10:28, Roger Pau Monné wrote:
> >>> The only way I've found to cope with this is to use something like:
> >>>
> >>> #define ALT_CALL_ARG(arg, n)                                            \
> >>>     union {                                                             \
> >>>         typeof(arg) e[(sizeof(long) + sizeof(arg) - 1) / sizeof(arg)];  \
> >>>         unsigned long r;                                                \
> >>>     } a ## n ## __  = {                                                 \
> >>>         .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
> >>>     };                                                                  \
> >>>     register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) =      \
> >>>         a ## n ## __.r
> >>>
> >>> An oversized array that ensures all the space of the long is covered
> >>> by the array, but then we need an extra variable, as we would
> >>> otherwise require modifying ALT_CALL{0..6}_OUT to use aX_.r instead of
> >>> aX_.
> >>
> >> I don't think we need to over-size the array. It just gets yet a little
> >> more complex then:
> >>
> >> #define ALT_CALL_ARG(arg, n)                                            \
> >>     register union {                                                    \
> >>         struct {                                                        \
> >>             typeof(arg) e;                                              \
> >>             char pad[sizeof(long) - sizeof(arg)];                       \
> >>         } s;                                                            \
> >>         unsigned long r;                                                \
> >>     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> >>         .s.e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \
> >>     }
> > 
> > We could even simplify this, there's no need for the union anymore,
> > since struct size == sizeof(long) already:
> > 
> > #define ALT_CALL_ARG(arg, n)                                            \
> >     register struct {                                                   \
> >         typeof(arg) e;                                                  \
> >         char pad[sizeof(long) - sizeof(arg)];                           \
> >     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
> >         .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
> >     }
> > 
> > The above seems to work for both the original issue and the new one.
> 
> Oh, good. If the compiler's happy with not being dealt an unsigned long,
> even better. Maybe for consistency we want to use sizeof(void *) also in
> pad's definition then.

I have to test this on Gitlab, it seems to be fine with clang version
18.1.6, not sure about older releases.

I can switch to void *, that would be fine, will attempt to prepare a
followup patch.

Thanks, Roger.
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
index 0d3697f1de49..e63b45927643 100644
--- a/xen/arch/x86/include/asm/alternative.h
+++ b/xen/arch/x86/include/asm/alternative.h
@@ -185,10 +185,10 @@  extern void alternative_branches(void);
  */
 #define ALT_CALL_ARG(arg, n)                                            \
     register union {                                                    \
-        typeof(arg) e;                                                  \
+        typeof(arg) e[sizeof(long) / sizeof(arg)];                      \
         unsigned long r;                                                \
     } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
-        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
+        .e[0] = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })\
     }
 #else
 #define ALT_CALL_ARG(arg, n) \