diff mbox series

[v2] x86/PV: avoid indirect call for I/O emulation quirk hook

Message ID 28b46611-ff98-45cd-a2b0-ffe36b8f0ccf@suse.com (mailing list archive)
State New
Headers show
Series [v2] x86/PV: avoid indirect call for I/O emulation quirk hook | expand

Commit Message

Jan Beulich Jan. 17, 2024, 9:37 a.m. UTC
This way ioemul_handle_proliant_quirk() won't need ENDBR anymore.

While touching this code, also
- arrange for it to not be built at all when !PV,
- add "const" to the last function parameter and bring the definition
  in sync with the declaration (for Misra).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Obviously the file may want moving to pv/ then. I wasn't sure whether
to also fold doing so right into here.

Of course things could also be done the other way around: Have
ioemul_handle_quirk as function name and ioemul_handle_proliant_quirk
be the boolean.
---
v2: Don't use alternative_call(); drop indirect call altogether.

Comments

Andrew Cooper Jan. 17, 2024, 10:59 a.m. UTC | #1
On 17/01/2024 9:37 am, Jan Beulich wrote:
> This way ioemul_handle_proliant_quirk() won't need ENDBR anymore.
>
> While touching this code, also
> - arrange for it to not be built at all when !PV,
> - add "const" to the last function parameter and bring the definition
>   in sync with the declaration (for Misra).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> ---
> Obviously the file may want moving to pv/ then. I wasn't sure whether
> to also fold doing so right into here.

I'm sure we discussed this before, but I don't think we had a firm
conclusion.

IIRC there was a concern that we might need it for PVH dom0 too,
depending on the degree to which the CLI is actually necessary here.

The CLI is definitely not NMI safe, and whatever SMM logic we're
tickling here should be synchronous with the INB.  At a guess the CLI is
just to get the RPC all done without adverse delay, but there is a
distinct lack of useful information here.

~Andrew
Andrew Cooper Jan. 18, 2024, 11:04 a.m. UTC | #2
On 17/01/2024 9:37 am, Jan Beulich wrote:
> --- a/xen/arch/x86/ioport_emulate.c
> +++ b/xen/arch/x86/ioport_emulate.c
> @@ -8,11 +8,10 @@
>  #include <xen/sched.h>
>  #include <xen/dmi.h>
>  
> -unsigned int (*__read_mostly ioemul_handle_quirk)(
> -    uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs);
> +bool __ro_after_init ioemul_handle_quirk;
>  
> -static unsigned int cf_check ioemul_handle_proliant_quirk(
> -    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
> +unsigned int ioemul_handle_proliant_quirk(
> +    uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
>  {
>      static const char stub[] = {
>          0x9c,       /*    pushf           */

Something occurred to me.

diff --git a/xen/arch/x86/ioport_emulate.c b/xen/arch/x86/ioport_emulate.c
index 23cba842b22e..70f94febe255 100644
--- a/xen/arch/x86/ioport_emulate.c
+++ b/xen/arch/x86/ioport_emulate.c
@@ -13,7 +13,7 @@ bool __ro_after_init ioemul_handle_quirk;
 unsigned int ioemul_handle_proliant_quirk(
     uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
 {
-    static const char stub[] = {
+    const char stub[] = {
         0x9c,       /*    pushf           */
         0xfa,       /*    cli             */
         0xee,       /*    out %al, %dx    */

is an improvement, confirmed by bloat-o-meter:

add/remove: 0/1 grow/shrink: 1/0 up/down: 1/-9 (-8)
Function                                     old     new   delta
ioemul_handle_proliant_quirk                  58      59      +1
stub                                           9       -      -9

The reason is that we've got a 9 byte object that's decomposed into two
rip-relative accesses.  i.e. we've got more pointer than data in this case.

But this adjustment seems to tickle a GCC bug.  With that change in
place, GCC emits:

<ioemul_handle_proliant_quirk>:
       48 83 ec 10             sub    $0x10,%rsp
       ...
       48 83 c4 10             add    $0x10,%rsp
       c3                      retq

i.e. we get a stack frame (space at least, no initialisation) despite
the object having been converted entirely to instruction immediates.

Or in other words, there's a further 12 byte saving available when GCC
can be persuaded to not even emit the stack frame.

What is even more weird is that I see this GCC-10, and a build of gcc
master from last week, but not when I try to reproduce in
https://godbolt.org/z/PnachbznW so there's probably some other setting
used by Xen which tickles this bug.

~Andrew
Jan Beulich Jan. 18, 2024, 11:09 a.m. UTC | #3
On 18.01.2024 12:04, Andrew Cooper wrote:
> On 17/01/2024 9:37 am, Jan Beulich wrote:
>> --- a/xen/arch/x86/ioport_emulate.c
>> +++ b/xen/arch/x86/ioport_emulate.c
>> @@ -8,11 +8,10 @@
>>  #include <xen/sched.h>
>>  #include <xen/dmi.h>
>>  
>> -unsigned int (*__read_mostly ioemul_handle_quirk)(
>> -    uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs);
>> +bool __ro_after_init ioemul_handle_quirk;
>>  
>> -static unsigned int cf_check ioemul_handle_proliant_quirk(
>> -    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
>> +unsigned int ioemul_handle_proliant_quirk(
>> +    uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
>>  {
>>      static const char stub[] = {
>>          0x9c,       /*    pushf           */
> 
> Something occurred to me.
> 
> diff --git a/xen/arch/x86/ioport_emulate.c b/xen/arch/x86/ioport_emulate.c
> index 23cba842b22e..70f94febe255 100644
> --- a/xen/arch/x86/ioport_emulate.c
> +++ b/xen/arch/x86/ioport_emulate.c
> @@ -13,7 +13,7 @@ bool __ro_after_init ioemul_handle_quirk;
>  unsigned int ioemul_handle_proliant_quirk(
>      uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
>  {
> -    static const char stub[] = {
> +    const char stub[] = {
>          0x9c,       /*    pushf           */
>          0xfa,       /*    cli             */
>          0xee,       /*    out %al, %dx    */
> 
> is an improvement, confirmed by bloat-o-meter:
> 
> add/remove: 0/1 grow/shrink: 1/0 up/down: 1/-9 (-8)
> Function                                     old     new   delta
> ioemul_handle_proliant_quirk                  58      59      +1
> stub                                           9       -      -9
> 
> The reason is that we've got a 9 byte object that's decomposed into two
> rip-relative accesses.  i.e. we've got more pointer than data in this case.

I wouldn't mind this as a separate change, but I don't see how it would
fit right here.

> But this adjustment seems to tickle a GCC bug.  With that change in
> place, GCC emits:
> 
> <ioemul_handle_proliant_quirk>:
>        48 83 ec 10             sub    $0x10,%rsp
>        ...
>        48 83 c4 10             add    $0x10,%rsp
>        c3                      retq
> 
> i.e. we get a stack frame (space at least, no initialisation) despite
> the object having been converted entirely to instruction immediates.
> 
> Or in other words, there's a further 12 byte saving available when GCC
> can be persuaded to not even emit the stack frame.
> 
> What is even more weird is that I see this GCC-10, and a build of gcc
> master from last week, but not when I try to reproduce in
> https://godbolt.org/z/PnachbznW so there's probably some other setting
> used by Xen which tickles this bug.

Yeah, I've observed such pointless frame allocation elsewhere as well,
so far without being able what exactly triggers it.

Jan
Andrew Cooper Jan. 18, 2024, 11:11 a.m. UTC | #4
On 18/01/2024 11:09 am, Jan Beulich wrote:
> On 18.01.2024 12:04, Andrew Cooper wrote:
>> On 17/01/2024 9:37 am, Jan Beulich wrote:
>>> --- a/xen/arch/x86/ioport_emulate.c
>>> +++ b/xen/arch/x86/ioport_emulate.c
>>> @@ -8,11 +8,10 @@
>>>  #include <xen/sched.h>
>>>  #include <xen/dmi.h>
>>>  
>>> -unsigned int (*__read_mostly ioemul_handle_quirk)(
>>> -    uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs);
>>> +bool __ro_after_init ioemul_handle_quirk;
>>>  
>>> -static unsigned int cf_check ioemul_handle_proliant_quirk(
>>> -    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
>>> +unsigned int ioemul_handle_proliant_quirk(
>>> +    uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
>>>  {
>>>      static const char stub[] = {
>>>          0x9c,       /*    pushf           */
>> Something occurred to me.
>>
>> diff --git a/xen/arch/x86/ioport_emulate.c b/xen/arch/x86/ioport_emulate.c
>> index 23cba842b22e..70f94febe255 100644
>> --- a/xen/arch/x86/ioport_emulate.c
>> +++ b/xen/arch/x86/ioport_emulate.c
>> @@ -13,7 +13,7 @@ bool __ro_after_init ioemul_handle_quirk;
>>  unsigned int ioemul_handle_proliant_quirk(
>>      uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
>>  {
>> -    static const char stub[] = {
>> +    const char stub[] = {
>>          0x9c,       /*    pushf           */
>>          0xfa,       /*    cli             */
>>          0xee,       /*    out %al, %dx    */
>>
>> is an improvement, confirmed by bloat-o-meter:
>>
>> add/remove: 0/1 grow/shrink: 1/0 up/down: 1/-9 (-8)
>> Function                                     old     new   delta
>> ioemul_handle_proliant_quirk                  58      59      +1
>> stub                                           9       -      -9
>>
>> The reason is that we've got a 9 byte object that's decomposed into two
>> rip-relative accesses.  i.e. we've got more pointer than data in this case.
> I wouldn't mind this as a separate change, but I don't see how it would
> fit right here.

I'm not suggesting changing this patch.  I just linked here because I
noticed it because of this patch.

We've got similar patterns elsewhere, so I was intending to do a patch
covering all of them.

>
>> But this adjustment seems to tickle a GCC bug.  With that change in
>> place, GCC emits:
>>
>> <ioemul_handle_proliant_quirk>:
>>        48 83 ec 10             sub    $0x10,%rsp
>>        ...
>>        48 83 c4 10             add    $0x10,%rsp
>>        c3                      retq
>>
>> i.e. we get a stack frame (space at least, no initialisation) despite
>> the object having been converted entirely to instruction immediates.
>>
>> Or in other words, there's a further 12 byte saving available when GCC
>> can be persuaded to not even emit the stack frame.
>>
>> What is even more weird is that I see this GCC-10, and a build of gcc
>> master from last week, but not when I try to reproduce in
>> https://godbolt.org/z/PnachbznW so there's probably some other setting
>> used by Xen which tickles this bug.
> Yeah, I've observed such pointless frame allocation elsewhere as well,
> so far without being able what exactly triggers it.

Ok - more experimentation required, I guess.

~Andrew
diff mbox series

Patch

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -45,7 +45,7 @@  obj-$(CONFIG_LIVEPATCH) += alternative.o
 obj-y += msi.o
 obj-y += msr.o
 obj-$(CONFIG_INDIRECT_THUNK) += indirect-thunk.o
-obj-y += ioport_emulate.o
+obj-$(CONFIG_PV) += ioport_emulate.o
 obj-y += irq.o
 obj-$(CONFIG_KEXEC) += machine_kexec.o
 obj-y += mm.o x86_64/mm.o
--- a/xen/arch/x86/include/asm/io.h
+++ b/xen/arch/x86/include/asm/io.h
@@ -47,10 +47,14 @@  __OUT(b,"b",char)
 __OUT(w,"w",short)
 __OUT(l,,int)
 
-/* Function pointer used to handle platform specific I/O port emulation. */
+/*
+ * Boolean indicator and function used to handle platform specific I/O port
+ * emulation.
+ */
 #define IOEMUL_QUIRK_STUB_BYTES 9
+extern bool ioemul_handle_quirk;
 struct cpu_user_regs;
-extern unsigned int (*ioemul_handle_quirk)(
-    uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs);
+unsigned int ioemul_handle_proliant_quirk(
+    uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs);
 
 #endif
--- a/xen/arch/x86/ioport_emulate.c
+++ b/xen/arch/x86/ioport_emulate.c
@@ -8,11 +8,10 @@ 
 #include <xen/sched.h>
 #include <xen/dmi.h>
 
-unsigned int (*__read_mostly ioemul_handle_quirk)(
-    uint8_t opcode, char *io_emul_stub, struct cpu_user_regs *regs);
+bool __ro_after_init ioemul_handle_quirk;
 
-static unsigned int cf_check ioemul_handle_proliant_quirk(
-    u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
+unsigned int ioemul_handle_proliant_quirk(
+    uint8_t opcode, char *io_emul_stub, const struct cpu_user_regs *regs)
 {
     static const char stub[] = {
         0x9c,       /*    pushf           */
@@ -95,7 +94,7 @@  static const struct dmi_system_id __init
 static int __init cf_check ioport_quirks_init(void)
 {
     if ( dmi_check_system(ioport_quirks_tbl) )
-        ioemul_handle_quirk = ioemul_handle_proliant_quirk;
+        ioemul_handle_quirk = true;
 
     return 0;
 }
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -112,7 +112,7 @@  static io_emul_stub_t *io_emul_stub_setu
     /* Some platforms might need to quirk the stub for specific inputs. */
     if ( unlikely(ioemul_handle_quirk) )
     {
-        quirk_bytes = ioemul_handle_quirk(opcode, p, ctxt->ctxt.regs);
+        quirk_bytes = ioemul_handle_proliant_quirk(opcode, p, ctxt->ctxt.regs);
         p += quirk_bytes;
     }