diff mbox series

[1/4] target/i386: Fix sanity check on max APIC ID / X2APIC enablement

Message ID 20220314142544.150555-1-dwmw2@infradead.org (mailing list archive)
State New, archived
Headers show
Series [1/4] target/i386: Fix sanity check on max APIC ID / X2APIC enablement | expand

Commit Message

David Woodhouse March 14, 2022, 2:25 p.m. UTC
The check on x86ms->apic_id_limit in pc_machine_done() had two problems.

Firstly, we need KVM to support the X2APIC API in order to allow IRQ
delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(),
which was done elsewhere in *some* cases but not all.

Secondly, microvm needs the same check. So move it from pc_machine_done()
to x86_cpus_init() where it will work for both.

The check in kvm_cpu_instance_init() is now redundant and can be dropped.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Claudio Fontana <cfontana@suse.de>
---
 hw/i386/pc.c              |  8 --------
 hw/i386/x86.c             | 16 ++++++++++++++++
 target/i386/kvm/kvm-cpu.c |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

Comments

Igor Mammedov March 16, 2022, 9:04 a.m. UTC | #1
On Mon, 14 Mar 2022 14:25:41 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> The check on x86ms->apic_id_limit in pc_machine_done() had two problems.
> 
> Firstly, we need KVM to support the X2APIC API in order to allow IRQ
> delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(),
> which was done elsewhere in *some* cases but not all.
> 
> Secondly, microvm needs the same check. So move it from pc_machine_done()
> to x86_cpus_init() where it will work for both.
> 
> The check in kvm_cpu_instance_init() is now redundant and can be dropped.

Well, I retested with the latest upstream kernel (both guest and host),
and adding kvm_enable_x2apic() is not sufficient, as the guest (according
to your patches in the kernel) caps the max APIC ID at 255 unless kvm-msi-ext-dest-id
is enabled. And an attempt to enable kvm-msi-ext-dest-id with kernel-irqchip
fails.
So number of usable CPUs in guest stays at legacy level, leaving the rest
of CPUs in limbo.


> Signed-off-by: David Woodhouse <dwmw2@infradead.org>
> Acked-by: Claudio Fontana <cfontana@suse.de>
> ---
>  hw/i386/pc.c              |  8 --------
>  hw/i386/x86.c             | 16 ++++++++++++++++
>  target/i386/kvm/kvm-cpu.c |  2 +-
>  3 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index fd55fc725c..d3ab28fec5 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -740,14 +740,6 @@ void pc_machine_done(Notifier *notifier, void *data)
>          /* update FW_CFG_NB_CPUS to account for -device added CPUs */
>          fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
>      }
> -
> -
> -    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
> -        !kvm_irqchip_in_kernel()) {
> -        error_report("current -smp configuration requires kernel "
> -                     "irqchip support.");
> -        exit(EXIT_FAILURE);
> -    }
>  }
>  
>  void pc_guest_info_init(PCMachineState *pcms)
> diff --git a/hw/i386/x86.c b/hw/i386/x86.c
> index 4cf107baea..8da55d58ea 100644
> --- a/hw/i386/x86.c
> +++ b/hw/i386/x86.c
> @@ -39,6 +39,7 @@
>  #include "sysemu/replay.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/cpu-timers.h"
> +#include "sysemu/xen.h"
>  #include "trace.h"
>  
>  #include "hw/i386/x86.h"
> @@ -123,6 +124,21 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
>       */
>      x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
>                                                        ms->smp.max_cpus - 1) + 1;
> +
> +    /*
> +     * Can we support APIC ID 255 or higher?
> +     *
> +     * Under Xen: yes.
> +     * With userspace emulated lapic: no
> +     * With KVM's in-kernel lapic: only if X2APIC API is enabled.
> +     */
> +    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
> +        (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
> +        error_report("current -smp configuration requires kernel "
> +                     "irqchip and X2APIC API support.");
> +        exit(EXIT_FAILURE);
> +    }
> +
>      possible_cpus = mc->possible_cpu_arch_ids(ms);
>      for (i = 0; i < ms->smp.cpus; i++) {
>          x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
> diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
> index d95028018e..c60cb2dafb 100644
> --- a/target/i386/kvm/kvm-cpu.c
> +++ b/target/i386/kvm/kvm-cpu.c
> @@ -165,7 +165,7 @@ static void kvm_cpu_instance_init(CPUState *cs)
>          /* only applies to builtin_x86_defs cpus */
>          if (!kvm_irqchip_in_kernel()) {
>              x86_cpu_change_kvm_default("x2apic", "off");
> -        } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) {
> +        } else if (kvm_irqchip_is_split()) {
>              x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on");
>          }
>
David Woodhouse March 16, 2022, 9:37 a.m. UTC | #2
On Wed, 2022-03-16 at 10:04 +0100, Igor Mammedov wrote:
> Well, I retested with the latest upstream kernel (both guest and host),
> and adding kvm_enable_x2apic() is not sufficient as guest according
> to your patches in kernel caps max APICID at 255 unless kvm-msi-ext-dest-id
> is enabled. And attempt in enabling kvm-msi-ext-dest-id with kernel-irqchip
> fails.

Correctly so. We need the split irqchip to support kvm-msi-ext-dest-id
which is why there's an explicit check for it.

> So number of usable CPUs in guest stays at legacy level, leaving the rest
> of CPUs in limbo.

Yep, that's the guest operating system's choice. Not a qemu problem.

Even if you have the split IRQ chip, if you boot a guest without kvm-
msi-ext-dest-id support, it'll refuse to use higher CPUs.

Or if you boot a guest without X2APIC support, it'll refuse to use
higher CPUs. 

That doesn't mean a user should be *forbidden* from launching qemu in
that configuration.
Michael S. Tsirkin March 16, 2022, 9:56 a.m. UTC | #3
On Wed, Mar 16, 2022 at 09:37:07AM +0000, David Woodhouse wrote:
> On Wed, 2022-03-16 at 10:04 +0100, Igor Mammedov wrote:
> > Well, I retested with the latest upstream kernel (both guest and host),
> > and adding kvm_enable_x2apic() is not sufficient as guest according
> > to your patches in kernel caps max APICID at 255 unless kvm-msi-ext-dest-id
> > is enabled. And attempt in enabling kvm-msi-ext-dest-id with kernel-irqchip
> > fails.
> 
> Correctly so. We need the split irqchip to support kvm-msi-ext-dest-id
> which is why there's an explicity check for it.
> 
> > So number of usable CPUs in guest stays at legacy level, leaving the rest
> > of CPUs in limbo.
> 
> Yep, that's the guest operating system's choice. Not a qemu problem.
> 
> Even if you have the split IRQ chip, if you boot a guest without kvm-
> msi-ext-dest-id support, it'll refuse to use higher CPUs.
> 
> Or if you boot a guest without X2APIC support, it'll refuse to use
> higher CPUs. 
> 
> That doesn't mean a user should be *forbidden* from launching qemu in
> that configuration.

Well the issue with all these configs which kind of work but not
the way they were specified is that down the road someone
creates a VM with this config and then expects us to maintain it
indefinitely.

So yes, if we are not sure we can support something properly it is
better to validate and exit than create a VM guests don't know how
to treat.
David Woodhouse March 16, 2022, 10:37 a.m. UTC | #4
On Wed, 2022-03-16 at 05:56 -0400, Michael S. Tsirkin wrote:
> On Wed, Mar 16, 2022 at 09:37:07AM +0000, David Woodhouse wrote:
> > Yep, that's the guest operating system's choice. Not a qemu problem.
> > 
> > Even if you have the split IRQ chip, if you boot a guest without kvm-
> > msi-ext-dest-id support, it'll refuse to use higher CPUs.
> > 
> > Or if you boot a guest without X2APIC support, it'll refuse to use
> > higher CPUs. 
> > 
> > That doesn't mean a user should be *forbidden* from launching qemu in
> > that configuration.
> 
> Well the issue with all these configs which kind of work but not
> the way they were specified is that down the road someone
> creates a VM with this config and then expects us to maintain it
> indefinitely.
> 
> So yes, if we are not sure we can support something properly it is
> better to validate and exit than create a VM guests don't know how
> to treat.

Not entirely sure how to reconcile that with what Daniel said in
https://lore.kernel.org/qemu-devel/Yi9BTkZIM3iZsvdK@redhat.com/ which
was:

> We've generally said QEMU should not reject / block startup of valid
> hardware configurations, based on existance of bugs in certain guest
> OS, if the config would be valid for other guest.

That said, I cannot point at a *specific* example of a guest which can
use the higher CPUs even when it can't direct external interrupts at
them. I worked on making Linux capable of it, as I said, but didn't
pursue that in the end.

I *suspect* Windows might be able to do it, based on the way the
hyperv-iommu works (by cheating and returning -EINVAL when external
interrupts are directed at higher CPUs).
Michael S. Tsirkin March 16, 2022, 10:47 a.m. UTC | #5
On Wed, Mar 16, 2022 at 10:37:49AM +0000, David Woodhouse wrote:
> On Wed, 2022-03-16 at 05:56 -0400, Michael S. Tsirkin wrote:
> > On Wed, Mar 16, 2022 at 09:37:07AM +0000, David Woodhouse wrote:
> > > Yep, that's the guest operating system's choice. Not a qemu problem.
> > > 
> > > Even if you have the split IRQ chip, if you boot a guest without kvm-
> > > msi-ext-dest-id support, it'll refuse to use higher CPUs.
> > > 
> > > Or if you boot a guest without X2APIC support, it'll refuse to use
> > > higher CPUs. 
> > > 
> > > That doesn't mean a user should be *forbidden* from launching qemu in
> > > that configuration.
> > 
> > Well the issue with all these configs which kind of work but not
> > the way they were specified is that down the road someone
> > creates a VM with this config and then expects us to maintain it
> > indefinitely.
> > 
> > So yes, if we are not sure we can support something properly it is
> > better to validate and exit than create a VM guests don't know how
> > to treat.
> 
> Not entirely sure how to reconcile that with what Daniel said in
> https://lore.kernel.org/qemu-devel/Yi9BTkZIM3iZsvdK@redhat.com/ which
> was:
> 
> > We've generally said QEMU should not reject / block startup of valid
> > hardware configurations, based on existance of bugs in certain guest
> > OS, if the config would be valid for other guest.

For sure, but is this a valid hardware configuration? That's
really the question.

> That said, I cannot point at a *specific* example of a guest which can
> use the higher CPUs even when it can't direct external interrupts at
> them. I worked on making Linux capable of it, as I said, but didn't
> pursue that in the end.
> 
> I *suspect* Windows might be able to do it, based on the way the
> hyperv-iommu works (by cheating and returning -EINVAL when external
> interrupts are directed at higher CPUs).
> 
>
Igor Mammedov March 16, 2022, 11:28 a.m. UTC | #6
On Wed, 16 Mar 2022 06:47:48 -0400
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Wed, Mar 16, 2022 at 10:37:49AM +0000, David Woodhouse wrote:
> > On Wed, 2022-03-16 at 05:56 -0400, Michael S. Tsirkin wrote:  
> > > On Wed, Mar 16, 2022 at 09:37:07AM +0000, David Woodhouse wrote:  
> > > > Yep, that's the guest operating system's choice. Not a qemu problem.
> > > > 
> > > > Even if you have the split IRQ chip, if you boot a guest without kvm-
> > > > msi-ext-dest-id support, it'll refuse to use higher CPUs.
> > > > 
> > > > Or if you boot a guest without X2APIC support, it'll refuse to use
> > > > higher CPUs. 
> > > > 
> > > > That doesn't mean a user should be *forbidden* from launching qemu in
> > > > that configuration.  
> > > 
> > > Well the issue with all these configs which kind of work but not
> > > the way they were specified is that down the road someone
> > > creates a VM with this config and then expects us to maintain it
> > > indefinitely.
> > > 
> > > So yes, if we are not sure we can support something properly it is
> > > better to validate and exit than create a VM guests don't know how
> > > to treat.  
> > 
> > Not entirely sure how to reconcile that with what Daniel said in
> > https://lore.kernel.org/qemu-devel/Yi9BTkZIM3iZsvdK@redhat.com/ which
> > was:

Generally Daniel is right, as long as it's something that real hardware
supports. (Usually it's the job of upper layers, which know what guest OS is used
and can tweak the config based on that knowledge.)

But it's a virt-only extension, and none (tested with
 Windows (hangs on boot),
 Linux (brings up only first 255 cpus)
) of the mainline OSes ended up working as expected (i.e. user asked for this
many CPUs but can't really use them as expected),
which would just lead to users reporting (obscure) bugs.

> > > We've generally said QEMU should not reject / block startup of valid
> > > hardware configurations, based on existance of bugs in certain guest
> > > OS, if the config would be valid for other guest.  
> 
> For sure, but is this a valid hardware configuration? That's
> really the question.

To me it looks like an incomplete PV feature so far.
If it's a configuration that is interesting for some users (some special
build OS/appliance that can use CPUs which are able to handle only IPIs)
or for development purposes, then it should be an opt-in feature
instead of the default one.
 
> > That said, I cannot point at a *specific* example of a guest which can
> > use the higher CPUs even when it can't direct external interrupts at
> > them. I worked on making Linux capable of it, as I said, but didn't
> > pursue that in the end.
> > 
> > I *suspect* Windows might be able to do it, based on the way the
> > hyperv-iommu works (by cheating and returning -EINVAL when external
> > interrupts are directed at higher CPUs).
Testing shows, Windows (2019 and 2004 build) doesn't work (at least with
just kernel-irqchip=on in current state). (CCing Vitaly, he might know
if Windows might work and under what conditions)

Linux(recentish) was able to bring up all CPUs with APICID above 255
with 'split' irqchip and without iommu present (at least it boots to
command prompt).

What worked for both OSes (full boot), was split irqchip + iommu
(even without irq remapping, but I haven't tested with older guests
so irq remapping might be required anyways).
David Woodhouse March 16, 2022, 2:31 p.m. UTC | #7
On Wed, 2022-03-16 at 12:28 +0100, Igor Mammedov wrote:
> Generally Daniel is right, as long as it's something that what real hardware
> supports. (usually it's job if upper layers which know what guest OS is used,
> and can tweak config based on that knowledge).
> 
> But it's virt only extension and none (tested with
>  Windows (hangs on boot),
>  Linux (brings up only first 255 cpus)
> ) of mainline OSes ended up up working as expected (i.e. user asked for this
> many CPUs but can't really use them as expected).

As I said, that kind of failure mode will happen even with the split
irq chip and EXT_DEST_ID, with Windows and older (pre-5.10) Linux
kernels.

For older guests it would also happen on real hardware, and in VMs
where you expose an IOMMU with interrupt remapping. Some guests don't
support interrupt remapping, or don't support X2APIC at all.

> Which would just lead to users reporting (obscure) bugs.

It's not virt only. This could happen with real hardware.

> Testing shows, Windows (2019 and 2004 build) doesn't work (at least with
> just kernel-irqchip=on in current state). (CCing Vitaly, he might know
> if Windows might work and under what conditions)
> 
> Linux(recentish) was able to bring up all CPUs with APICID above 255
> with 'split' irqchip and without iommu present (at least it boots to
> command prompt).

That'll be using the EXT_DEST_ID support.

> What worked for both OSes (full boot), was split irqchip + iommu
> (even without irq remapping, but I haven't tested with older guests
> so irq remapping might be required anyways).

Hm, that's surprising for Windows unless it's learned to use the
EXT_DEST_ID support. Or maybe it *can* cope with only targeting
external interrupts at CPUs < 255 but has a gratuitous check that
prevents it bringing them up unless there's an IOMMU... *even* if that
IOMMU doesn't have irq remapping anyway?

Anyway, as far as I'm concerned it doesn't matter very much whether we
insist on the split irq chip or not. Feel free to repost your patch
rebased on top of my fixes, which are also in my tree at
https://git.infradead.org/users/dwmw2/qemu.git

The check you're modifying has moved to x86_cpus_init().
Igor Mammedov March 17, 2022, 9:05 a.m. UTC | #8
re-sending reply as something went wrong with headers (I suspect Daniel's name formatting)
and email got bounced back.

On Wed, 16 Mar 2022 14:31:33 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Wed, 2022-03-16 at 12:28 +0100, Igor Mammedov wrote:  
> > Generally Daniel is right, as long as it's something that what real hardware
> > supports. (usually it's job if upper layers which know what guest OS is used,
> > and can tweak config based on that knowledge).
> > 
> > But it's virt only extension and none (tested with
> >  Windows (hangs on boot),
> >  Linux (brings up only first 255 cpus)
> > ) of mainline OSes ended up up working as expected (i.e. user asked for this
> > many CPUs but can't really use them as expected).    
> 
> As I said, that kind of failure mode will happen even with the split
> irq chip and EXT_DEST_ID, with Windows and older (pre-5.10) Linux
> kernels.
> 
> For older guests it would also happen on real hardware, and in VMs
> where you expose an IOMMU with interrupt remapping. Some guests don't
> support interrupt remapping, or don't support X2APIC at all.
>   
> > Which would just lead to users reporting (obscure) bugs.    
> 
> It's not virt only. This could happen with real hardware.  

I was talking about EXT_DEST_ID kvm extension.
With current upstream guest kernel, user gets only "bad cpu" messages
and then go figure what's wrong with configuration or
simply hangs in case of Windows.

> > Testing shows, Windows (2019 and 2004 build) doesn't work (at least with
> > just kernel-irqchip=on in current state). (CCing Vitaly, he might know
> > if Windows might work and under what conditions)
> > 
> > Linux(recentish) was able to bring up all CPUs with APICID above 255
> > with 'split' irqchip and without iommu present (at least it boots to
> > command prompt).    
> 
> That'll be using the EXT_DEST_ID support.
>   
> > What worked for both OSes (full boot), was split irqchip + iommu
> > (even without irq remapping, but I haven't tested with older guests
> > so irq remapping might be required anyways).    
> 
> Hm, that's surprising for Windows unless it's learned to use the
> EXT_DEST_ID support. Or maybe it *can* cope with only targeting
> external interrupts at CPUs < 255 but has a gratuitous check that
> prevents it bringing them up unless there's an IOMMU... *even* if that
> IOMMU doesn't have irq remapping anyway?  

Or maybe we are enabling irq remapping by default now.
I'll try to check whether the guest actually brings all CPUs up.

> Anyway, as fas as I'm concerned it doesn't matter very much whether we
> insist on the split irq chip or not. Feel free to repost your patch
> rebased on top of my fixes, which are also in my tree at
> https://git.infradead.org/users/dwmw2/qemu.git
> 
> The check you're modifying has moved to x86_cpus_init().  

if we are to keep iommu dependency then moving to x86_cpus_init()
isn't an option, it should be done at pc_machine_done() time.

in practice partial revert of your c1bb5418e to restore
iommu check including irq remapping.
In which case, do we still need kvm_enable_x2apic() check
you are adding here?
David Woodhouse March 17, 2022, 11:13 a.m. UTC | #9
On Thu, 2022-03-17 at 10:05 +0100, Igor Mammedov wrote:
> re-sending reply as something went wrong with headers (I suspect Daniel's name formatting)
> and email got bounced back.
> 
> On Wed, 16 Mar 2022 14:31:33 +0000
> David Woodhouse <dwmw2@infradead.org> wrote:
> 
> > On Wed, 2022-03-16 at 12:28 +0100, Igor Mammedov wrote:  
> > > Generally Daniel is right, as long as it's something that what real hardware
> > > supports. (usually it's job if upper layers which know what guest OS is used,
> > > and can tweak config based on that knowledge).
> > > 
> > > But it's virt only extension and none (tested with
> > >  Windows (hangs on boot),
> > >  Linux (brings up only first 255 cpus)
> > > ) of mainline OSes ended up up working as expected (i.e. user asked for this
> > > many CPUs but can't really use them as expected).    
> > 
> > As I said, that kind of failure mode will happen even with the split
> > irq chip and EXT_DEST_ID, with Windows and older (pre-5.10) Linux
> > kernels.
> > 
> > For older guests it would also happen on real hardware, and in VMs
> > where you expose an IOMMU with interrupt remapping. Some guests don't
> > support interrupt remapping, or don't support X2APIC at all.
> >   
> > > Which would just lead to users reporting (obscure) bugs.    
> > 
> > It's not virt only. This could happen with real hardware.  
> 
> I was talking about EXT_DEST_ID kvm extension.

Then I'm confused, because that isn't the conversation we were having
before. And in that case what you say above about Linux only bringing
up the first 255 CPUs directly contradicts what you say below, my own
experience, and the whole *point* of the EXT_DEST_ID extension :)

Let's start again. You observed that Linux guests fail to bring up >254
vCPUs if qemu doesn't enable the EXT_DEST_ID support, and your fix was
to require EXT_DEST_ID (as a side-effect of requiring split irqchip).

This reminded me of the fixes I'd been posting since 2020 which still
haven't been merged, so I dusted those off and resent them.

I didn't incorporate your change, and objected to your patch because I
think it's pointless babysitting. Yes, in the general case if you want
your guest to use more than 254 vCPUs you need to take a moment to
think about precisely what your guest operating system requires in
order to support that.

At the very least it needs X2APIC support, and then you need *one* of:

 • EXT_DEST_ID,
 • Interrupt remapping, or
 • just using those vCPUs without external interrupts.

Both of the first two require the split irqchip, so your patch just
doesn't let users rely on that last option. I conceded (cited below)
because I don't know of any existing guest OS which does use that last
option. I'd attempted to make Linux do so, but eventually abandoned it:
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/irqaffinity

But now you seem to be making a different argument?

> > Anyway, as far as I'm concerned it doesn't matter very much whether we
> > insist on the split irq chip or not. Feel free to repost your patch
> > rebased on top of my fixes, which are also in my tree at
> > https://git.infradead.org/users/dwmw2/qemu.git
> > 
> > 
> > The check you're modifying has moved to x86_cpus_init().  
> 
> if we are to keep iommu dependency then moving to x86_cpus_init()
> isn't an option, it should be done at pc_machine_done() time.

Thus far, I didn't think anyone had been talking about a dependency on
IOMMU. That doesn't make any sense at all. EXT_DEST_ID is perfectly
sufficient for Linux kernels from 5.10 onwards and they don't need the
IOMMU.

So no. Post your patch to s/kvm_irqchip_in_kernel/kvm_irqchip_is_split/
in x86_cpus_init() by all means; I don't care much about that and I've
even incorporated that change into my tree at
https://git.infradead.org/users/dwmw2/qemu.git so that if I do have to
repost these fixes yet again, it'll be included.

But let's not re-add the IOMMU dependency. That would be wrong.
Igor Mammedov March 18, 2022, 2:17 p.m. UTC | #10
On Thu, 17 Mar 2022 11:13:44 +0000
David Woodhouse <dwmw2@infradead.org> wrote:

> On Thu, 2022-03-17 at 10:05 +0100, Igor Mammedov wrote:
> > re-sending reply as something went wrong with headers (I suspect Daniel's name formatting)
> > and email got bounced back.
> > 
> > On Wed, 16 Mar 2022 14:31:33 +0000
> > David Woodhouse <dwmw2@infradead.org> wrote:
> >   
> > > On Wed, 2022-03-16 at 12:28 +0100, Igor Mammedov wrote:    
> > > > Generally Daniel is right, as long as it's something that what real hardware
> > > > supports. (usually it's job if upper layers which know what guest OS is used,
> > > > and can tweak config based on that knowledge).
> > > > 
> > > > But it's virt only extension and none (tested with
> > > >  Windows (hangs on boot),
> > > >  Linux (brings up only first 255 cpus)
> > > > ) of mainline OSes ended up up working as expected (i.e. user asked for this
> > > > many CPUs but can't really use them as expected).      
> > > 
> > > As I said, that kind of failure mode will happen even with the split
> > > irq chip and EXT_DEST_ID, with Windows and older (pre-5.10) Linux
> > > kernels.
> > > 
> > > For older guests it would also happen on real hardware, and in VMs
> > > where you expose an IOMMU with interrupt remapping. Some guests don't
> > > support interrupt remapping, or don't support X2APIC at all.
> > >     
> > > > Which would just lead to users reporting (obscure) bugs.      
> > > 
> > > It's not virt only. This could happen with real hardware.    
> > 
> > I was talking about EXT_DEST_ID kvm extension.  
> 
> Then I'm confused, because that isn't the conversation we were having
> before. And in that case what you say above about Linux only bringing
> up the first 255 CPUs directly contradicts what you say below, my own
> experience, and the whole *point* of the EXT_DEST_ID extension :)

Now I'm lost in translation too :)

> Let's start again. You observed that Linux guests fail to bring up >254
> vcPUs if qemu doesn't enable the EXT_DEST_ID support, and your fix was
> to require EXT_DEST_ID (as a side-effect of requiring split irqchip).
> 
> This reminded me of the fixes I'd been posting since 2020 which still
> haven't been merged, so I dusted those off and resent them.
> 
> I didn't incorporate your change, and objected to your patch because I
> think it's pointless babysitting.

1)

> Yes, in the general case if you want
> your guest to use more than 254 vCPUs you need to take a moment to
> think about precisely what your guest operating system requires in
> order to support that.
> 
> At the very least it needs X2APIC support, and then you need *one* of:
> 
>  • EXT_DEST_ID,
>  • Interrupt remapping, or
>  • just using those vCPUs without external interrupts.
> 
> Both of the first two require the split irqchip, so your patch just
> doesn't let users rely on that last option. I conceded (cited below)
> because I don't know of any existing guest OS which does use that last
> option. I'd attempted to make Linux do so, but eventually abandoned it:
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/irqaffinity

Given that it was abandoned, it's unlikely that the last option was ever
used (and before EXT_DEST_ID, qemu was requiring irq remapping to start
guest with so many vcpus). If there will be a user for it in the future
we can relax restriction given that it will be properly documented/
user gets sane error/warning messages, so they could figure out what to do.

> But now you seem to be making a different argument?
> 
> > > Anyway, as far as I'm concerned it doesn't matter very much whether we
> > > insist on the split irq chip or not. Feel free to repost your patch
> > > rebased on top of my fixes, which are also in my tree at
> > > https://git.infradead.org/users/dwmw2/qemu.git
> > > 
> > > 
> > > The check you're modifying has moved to x86_cpus_init().    
> > 
> > if we are to keep iommu dependency then moving to x86_cpus_init()
> > isn't an option, it should be done at pc_machine_done() time.  
> 
> Thus far, I didn't think anyone had been talking about a dependency on
> IOMMU. That doesn't make any sense at all. EXT_DEST_ID is perfectly
> sufficient for Linux kernels from 5.10 onwards and they don't need the
> IOMMU.

IOMMU was required before EXT_DEST_ID due to irq-remapping dependency,
and that conservative config worked fine for both Linux and Windows
guests. That's why I've raised question if we should revert restriction
to the way it was back then.

With Linux pre-5.10 guests, dmesg output at least complains about
IRQ remapping, so user has a small chance to be able to figure out
that IOMMU should be configured to get all CPUs working.
For post-5.10, all one gets is "bad cpu" without any clue as to why,
if EXT_DEST_ID is not advertised by hypervisor.
It would be better if guest kernel printed some error/warning in that case.

If we start with IOMMU, (Win/Linux) guests boot fine (modulo ancient ones)
(irq-remapping is 'on' by default since qemu-4.0).

> So no. Post your patch to s/kvm_irqchip_in_kernel/kvm_irqchip_is_split/
> in x86_cpus_init() by all means; I don't care much about that and I've
> even incorporated that change into my tree at
> https://git.infradead.org/users/dwmw2/qemu.git so that if I do have to
> repost these fixes yet again, it'll be included.

Looks fine to me, thanks.

> But let's not re-add the IOMMU dependency. That would be wrong.

We should document the possible options somewhere in QEMU,
so that more than just a few people know which options to use and when.
Something along the lines of [1] above.
David Woodhouse March 18, 2022, 2:56 p.m. UTC | #11
On Fri, 2022-03-18 at 15:17 +0100, Igor Mammedov wrote:
> On Thu, 17 Mar 2022 11:13:44 +0000 David Woodhouse <dwmw2@infradead.org> wrote:
> > Thus far, I didn't think anyone had been talking about a dependency on
> > IOMMU. That doesn't make any sense at all. EXT_DEST_ID is perfectly
> > sufficient for Linux kernels from 5.10 onwards and they don't need the
> > IOMMU.
> 
> IOMMU was required before EXT_DEST_ID due to irq-remapping dependency,
> and that conservative config worked fine for both Linux and Windows
> guests. That's why I've raised question if we should revert restriction
> to the way it was back then.
> 
> With Linux pre-5.10 guests, dmesg output at least complains about
> IRQ remapping, so user has a small chance to be able to figure out
> that IOMMU should be configured to get all CPUs working.
> For post-5.10, all one gets is "bad cpu" without any clue as to why,
> if EXT_DEST_ID is not advertised by hypervisor.
>
> It would be better if guest kernel printed some error/warning in that case.

That case doesn't exist any more since your change. You *can't* launch
qemu with that many vCPUs and not support EXT_DEST_ID.
Michael S. Tsirkin May 13, 2022, 1:37 p.m. UTC | #12
On Mon, Mar 14, 2022 at 02:25:41PM +0000, David Woodhouse wrote:
> The check on x86ms->apic_id_limit in pc_machine_done() had two problems.
> 
> Firstly, we need KVM to support the X2APIC API in order to allow IRQ
> delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(),
> which was done elsewhere in *some* cases but not all.
> 
> Secondly, microvm needs the same check. So move it from pc_machine_done()
> to x86_cpus_init() where it will work for both.
> 
> The check in kvm_cpu_instance_init() is now redundant and can be dropped.
> 
> Signed-off-by: David Woodhouse <dwmw2@infradead.org>
> Acked-by: Claudio Fontana <cfontana@suse.de>


For now I applied this as-is, in particular because with the added split
irqchip test, make check started failing for me on Fedora.
Igor, please go ahead and make a change on top limiting things
to the split irqchip.

> ---
>  hw/i386/pc.c              |  8 --------
>  hw/i386/x86.c             | 16 ++++++++++++++++
>  target/i386/kvm/kvm-cpu.c |  2 +-
>  3 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index fd55fc725c..d3ab28fec5 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -740,14 +740,6 @@ void pc_machine_done(Notifier *notifier, void *data)
>          /* update FW_CFG_NB_CPUS to account for -device added CPUs */
>          fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
>      }
> -
> -
> -    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
> -        !kvm_irqchip_in_kernel()) {
> -        error_report("current -smp configuration requires kernel "
> -                     "irqchip support.");
> -        exit(EXIT_FAILURE);
> -    }
>  }
>  
>  void pc_guest_info_init(PCMachineState *pcms)
> diff --git a/hw/i386/x86.c b/hw/i386/x86.c
> index 4cf107baea..8da55d58ea 100644
> --- a/hw/i386/x86.c
> +++ b/hw/i386/x86.c
> @@ -39,6 +39,7 @@
>  #include "sysemu/replay.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/cpu-timers.h"
> +#include "sysemu/xen.h"
>  #include "trace.h"
>  
>  #include "hw/i386/x86.h"
> @@ -123,6 +124,21 @@ void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
>       */
>      x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
>                                                        ms->smp.max_cpus - 1) + 1;
> +
> +    /*
> +     * Can we support APIC ID 255 or higher?
> +     *
> +     * Under Xen: yes.
> +     * With userspace emulated lapic: no
> +     * With KVM's in-kernel lapic: only if X2APIC API is enabled.
> +     */
> +    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
> +        (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
> +        error_report("current -smp configuration requires kernel "
> +                     "irqchip and X2APIC API support.");
> +        exit(EXIT_FAILURE);
> +    }
> +
>      possible_cpus = mc->possible_cpu_arch_ids(ms);
>      for (i = 0; i < ms->smp.cpus; i++) {
>          x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
> diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
> index d95028018e..c60cb2dafb 100644
> --- a/target/i386/kvm/kvm-cpu.c
> +++ b/target/i386/kvm/kvm-cpu.c
> @@ -165,7 +165,7 @@ static void kvm_cpu_instance_init(CPUState *cs)
>          /* only applies to builtin_x86_defs cpus */
>          if (!kvm_irqchip_in_kernel()) {
>              x86_cpu_change_kvm_default("x2apic", "off");
> -        } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) {
> +        } else if (kvm_irqchip_is_split()) {
>              x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on");
>          }
>  
> -- 
> 2.33.1
diff mbox series

Patch

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index fd55fc725c..d3ab28fec5 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -740,14 +740,6 @@  void pc_machine_done(Notifier *notifier, void *data)
         /* update FW_CFG_NB_CPUS to account for -device added CPUs */
         fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
     }
-
-
-    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
-        !kvm_irqchip_in_kernel()) {
-        error_report("current -smp configuration requires kernel "
-                     "irqchip support.");
-        exit(EXIT_FAILURE);
-    }
 }
 
 void pc_guest_info_init(PCMachineState *pcms)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 4cf107baea..8da55d58ea 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -39,6 +39,7 @@ 
 #include "sysemu/replay.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/cpu-timers.h"
+#include "sysemu/xen.h"
 #include "trace.h"
 
 #include "hw/i386/x86.h"
@@ -123,6 +124,21 @@  void x86_cpus_init(X86MachineState *x86ms, int default_cpu_version)
      */
     x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
                                                       ms->smp.max_cpus - 1) + 1;
+
+    /*
+     * Can we support APIC ID 255 or higher?
+     *
+     * Under Xen: yes.
+     * With userspace emulated lapic: no.
+     * With KVM's in-kernel lapic: only if X2APIC API is enabled.
+     */
+    if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
+        (!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
+        error_report("current -smp configuration requires kernel "
+                     "irqchip and X2APIC API support.");
+        exit(EXIT_FAILURE);
+    }
+
     possible_cpus = mc->possible_cpu_arch_ids(ms);
     for (i = 0; i < ms->smp.cpus; i++) {
         x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index d95028018e..c60cb2dafb 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -165,7 +165,7 @@  static void kvm_cpu_instance_init(CPUState *cs)
         /* only applies to builtin_x86_defs cpus */
         if (!kvm_irqchip_in_kernel()) {
             x86_cpu_change_kvm_default("x2apic", "off");
-        } else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) {
+        } else if (kvm_irqchip_is_split()) {
             x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on");
         }