diff mbox series

[v5,02/22] cc_platform: Add new attribute to prevent ACPI CPU hotplug

Message ID f4bff93d83814ea1f54494f51ce3e5d954cf0f5b.1655894131.git.kai.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series TDX host kernel support | expand

Commit Message

Huang, Kai June 22, 2022, 11:15 a.m. UTC
Platforms with confidential computing technology may not support ACPI
CPU hotplug when such technology is enabled by the BIOS.  Examples
include Intel platforms which support Intel Trust Domain Extensions
(TDX).

If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
bug and reject the new CPU.  For hot-removal, for simplicity just assume
the kernel cannot continue to work normally, and BUG().

Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
platform doesn't support ACPI CPU hotplug, so that kernel can handle
ACPI CPU hotplug events for such platform.  The existing attribute
CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.

In acpi_processor_{add|remove}(), add early check against this attribute
and handle accordingly if it is set.

Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.

Signed-off-by: Kai Huang <kai.huang@intel.com>
---
 arch/x86/coco/core.c          |  2 +-
 drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
 include/linux/cc_platform.h   | 15 +++++++++++++--
 kernel/cpu.c                  |  2 +-
 4 files changed, 38 insertions(+), 4 deletions(-)

Comments

Rafael J. Wysocki June 22, 2022, 11:42 a.m. UTC | #1
On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:
>
> Platforms with confidential computing technology may not support ACPI
> CPU hotplug when such technology is enabled by the BIOS.  Examples
> include Intel platforms which support Intel Trust Domain Extensions
> (TDX).
>
> If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> bug and reject the new CPU.  For hot-removal, for simplicity just assume
> the kernel cannot continue to work normally, and BUG().
>
> Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> platform doesn't support ACPI CPU hotplug, so that kernel can handle
> ACPI CPU hotplug events for such platform.  The existing attribute
> CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
>
> In acpi_processor_{add|remove}(), add early check against this attribute
> and handle accordingly if it is set.
>
> Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
>
> Signed-off-by: Kai Huang <kai.huang@intel.com>
> ---
>  arch/x86/coco/core.c          |  2 +-
>  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
>  include/linux/cc_platform.h   | 15 +++++++++++++--
>  kernel/cpu.c                  |  2 +-
>  4 files changed, 38 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> index 4320fadae716..1bde1af75296 100644
> --- a/arch/x86/coco/core.c
> +++ b/arch/x86/coco/core.c
> @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
>  {
>         switch (attr) {
>         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> -       case CC_ATTR_HOTPLUG_DISABLED:
> +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
>         case CC_ATTR_GUEST_MEM_ENCRYPT:
>         case CC_ATTR_MEM_ENCRYPT:
>                 return true;
> diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> index 6737b1cbf6d6..b960db864cd4 100644
> --- a/drivers/acpi/acpi_processor.c
> +++ b/drivers/acpi/acpi_processor.c
> @@ -15,6 +15,7 @@
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <linux/cc_platform.h>
>
>  #include <acpi/processor.h>
>
> @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
>         struct device *dev;
>         int result = 0;
>
> +       /*
> +        * If the confidential computing platform doesn't support ACPI
> +        * memory hotplug, the BIOS should never deliver such event to
> +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> +        * the new CPU.
> +        */
> +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {

This will affect initialization, not just hotplug AFAICS.

You should reset the .hotplug.enabled flag in processor_handler to
false instead.

> +               dev_err(&device->dev, "[BIOS bug]: Platform doesn't support ACPI CPU hotplug.  New CPU ignored.\n");
> +               return -EINVAL;
> +       }
> +
>         pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
>         if (!pr)
>                 return -ENOMEM;
> @@ -434,6 +446,17 @@ static void acpi_processor_remove(struct acpi_device *device)
>         if (!device || !acpi_driver_data(device))
>                 return;
>
> +       /*
> +        * The confidential computing platform is broken if ACPI memory
> +        * hot-removal isn't supported but it happened anyway.  Assume
> +        * it's not guaranteed that the kernel can continue to work
> +        * normally.  Just BUG().
> +        */
> +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
> +               dev_err(&device->dev, "Platform doesn't support ACPI CPU hotplug. BUG().\n");
> +               BUG();
> +       }
> +
>         pr = acpi_driver_data(device);
>         if (pr->id >= nr_cpu_ids)
>                 goto out;
> diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
> index 691494bbaf5a..9ce9256facc8 100644
> --- a/include/linux/cc_platform.h
> +++ b/include/linux/cc_platform.h
> @@ -74,14 +74,25 @@ enum cc_attr {
>         CC_ATTR_GUEST_UNROLL_STRING_IO,
>
>         /**
> -        * @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled.
> +        * @CC_ATTR_CPU_HOTPLUG_DISABLED: CPU hotplug is not supported or
> +        *                                disabled.
>          *
>          * The platform/OS is running as a guest/virtual machine does not
>          * support CPU hotplug feature.
>          *
>          * Examples include TDX Guest.
>          */
> -       CC_ATTR_HOTPLUG_DISABLED,
> +       CC_ATTR_CPU_HOTPLUG_DISABLED,
> +
> +       /**
> +        * @CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED: ACPI CPU hotplug is not
> +        *                                     supported.
> +        *
> +        * The platform/OS does not support ACPI CPU hotplug.
> +        *
> +        * Examples include TDX platform.
> +        */
> +       CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED,
>  };
>
>  #ifdef CONFIG_ARCH_HAS_CC_PLATFORM
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index edb8c199f6a3..966772cce063 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -1191,7 +1191,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
>          * If the platform does not support hotplug, report it explicitly to
>          * differentiate it from a transient offlining failure.
>          */
> -       if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
> +       if (cc_platform_has(CC_ATTR_CPU_HOTPLUG_DISABLED))
>                 return -EOPNOTSUPP;
>         if (cpu_hotplug_disabled)
>                 return -EBUSY;
> --
> 2.36.1
>
Huang, Kai June 23, 2022, 12:01 a.m. UTC | #2
On Wed, 2022-06-22 at 13:42 +0200, Rafael J. Wysocki wrote:
> On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:
> > 
> > Platforms with confidential computing technology may not support ACPI
> > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > include Intel platforms which support Intel Trust Domain Extensions
> > (TDX).
> > 
> > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > the kernel cannot continue to work normally, and BUG().
> > 
> > Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> > platform doesn't support ACPI CPU hotplug, so that kernel can handle
> > ACPI CPU hotplug events for such platform.  The existing attribute
> > CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
> > 
> > In acpi_processor_{add|remove}(), add early check against this attribute
> > and handle accordingly if it is set.
> > 
> > Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> > CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
> > 
> > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > ---
> >  arch/x86/coco/core.c          |  2 +-
> >  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
> >  include/linux/cc_platform.h   | 15 +++++++++++++--
> >  kernel/cpu.c                  |  2 +-
> >  4 files changed, 38 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> > index 4320fadae716..1bde1af75296 100644
> > --- a/arch/x86/coco/core.c
> > +++ b/arch/x86/coco/core.c
> > @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
> >  {
> >         switch (attr) {
> >         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> > -       case CC_ATTR_HOTPLUG_DISABLED:
> > +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
> >         case CC_ATTR_GUEST_MEM_ENCRYPT:
> >         case CC_ATTR_MEM_ENCRYPT:
> >                 return true;
> > diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> > index 6737b1cbf6d6..b960db864cd4 100644
> > --- a/drivers/acpi/acpi_processor.c
> > +++ b/drivers/acpi/acpi_processor.c
> > @@ -15,6 +15,7 @@
> >  #include <linux/kernel.h>
> >  #include <linux/module.h>
> >  #include <linux/pci.h>
> > +#include <linux/cc_platform.h>
> > 
> >  #include <acpi/processor.h>
> > 
> > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> >         struct device *dev;
> >         int result = 0;
> > 
> > +       /*
> > +        * If the confidential computing platform doesn't support ACPI
> > +        * memory hotplug, the BIOS should never deliver such event to
> > +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> > +        * the new CPU.
> > +        */
> > +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
> 
> This will affect initialization, not just hotplug AFAICS.
> 
> You should reset the .hotplug.enabled flag in processor_handler to
> false instead.

Hi Rafael,

Thanks for the review.  By "affect initialization" did you mean this
acpi_processor_add() is also called during kernel boot when any logical cpu is
brought up?  Or do you mean ACPI CPU hotplug can also happen during kernel boot
(after acpi_processor_init())?

I see acpi_processor_init() calls acpi_processor_check_duplicates() which calls
acpi_evaluate_object() but I don't know details of ACPI so I don't know whether
this would trigger acpi_processor_add().

One thing is TDX doesn't support ACPI CPU hotplug is an architectural thing, so
it is illegal even if it happens during kernel boot.  Dave's idea is the kernel
should  speak out loudly if physical CPU hotplug indeed happened on (BIOS) TDX-
enabled platforms.  Otherwise perhaps we can just give up initializing the ACPI
CPU hotplug in acpi_processor_init(), something like below?

--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -707,6 +707,10 @@ bool acpi_duplicate_processor_id(int proc_id)
 void __init acpi_processor_init(void)
 {
        acpi_processor_check_duplicates();
+
+       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED))
+               return;
+
        acpi_scan_add_handler_with_hotplug(&processor_handler, "processor");
        acpi_scan_add_handler(&processor_container_handler);
 }


> 
> > +               dev_err(&device->dev, "[BIOS bug]: Platform doesn't support ACPI CPU hotplug.  New CPU ignored.\n");
> > +               return -EINVAL;
> > +       }
> > +
Dave Hansen June 24, 2022, 6:57 p.m. UTC | #3
On 6/22/22 04:15, Kai Huang wrote:
> Platforms with confidential computing technology may not support ACPI
> CPU hotplug when such technology is enabled by the BIOS.  Examples
> include Intel platforms which support Intel Trust Domain Extensions
> (TDX).
> 
> If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> bug and reject the new CPU.  For hot-removal, for simplicity just assume
> the kernel cannot continue to work normally, and BUG().

So, the kernel is now declaring ACPI CPU hotplug and TDX to be
incompatible and even BUG()'ing if we see them together.  Has anyone
told the firmware guys about this?  Is this in a spec somewhere?  When
the kernel goes boom, are the firmware folks going to cry "Kernel bug!!"?

This doesn't seem like something the kernel should be doing unilaterally.
Huang, Kai June 27, 2022, 5:05 a.m. UTC | #4
On Fri, 2022-06-24 at 11:57 -0700, Dave Hansen wrote:
> On 6/22/22 04:15, Kai Huang wrote:
> > Platforms with confidential computing technology may not support ACPI
> > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > include Intel platforms which support Intel Trust Domain Extensions
> > (TDX).
> > 
> > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > the kernel cannot continue to work normally, and BUG().
> 
> So, the kernel is now declaring ACPI CPU hotplug and TDX to be
> incompatible and even BUG()'ing if we see them together.  Has anyone
> told the firmware guys about this?  Is this in a spec somewhere?  When
> the kernel goes boom, are the firmware folks going to cry "Kernel bug!!"?
> 
> This doesn't seem like something the kernel should be doing unilaterally.

TDX doesn't support ACPI CPU hotplug (both hot-add and hot-removal) is an
architectural behaviour.  The public specs doesn't explicitly say  it, but it is
implied:

1) During platform boot MCHECK verifies all logical CPUs on all packages that
they are TDX compatible, and it keeps some information, such as total CPU
packages and total logical cpus at some location of SEAMRR so it can later be
used by P-SEAMLDR and TDX module.  Please see "3.4 SEAMLDR_SEAMINFO" in the P-
SEAMLDR spec:

https://cdrdv2.intel.com/v1/dl/getContent/733584

2) Also some SEAMCALLs must be called on all logical CPUs or CPU packages that
the platform has (such as such as TDH.SYS.INIT.LP and TDH.SYS.KEY.CONFIG),
otherwise the further step of TDX module initialization will fail.

Unfortunately there's no public spec mentioning what's the behaviour of ACPI CPU
hotplug on TDX enabled platform.  For instance, whether BIOS will ever get the
ACPI CPU hot-plug event, or if BIOS gets the event, will it suppress it.  What I
got from Intel internally is a non-buggy BIOS should never report such event to
the kernel, so if kernel receives such event, it should be fair enough to treat
it as BIOS bug.

But theoretically, the BIOS isn't in TDX's TCB, and can be from 3rd party..

Also, I was told "CPU hot-plug is a system feature, not a CPU feature or Intel
architecture feature", so Intel doesn't have an architectural specification for
CPU hot-plug. 

At the meantime, I am pushing Intel internally to add some statements regarding
to the TDX and CPU hotplug interaction to the BIOS write guide and make it
public.  I guess this is the best thing we can do.

Regarding to the code change, I agree the BUG() isn't good.  I used it because:
1) this basically on a theoretical problem and shouldn't happen in practice; 2)
because there's no architectural specification regarding to the behaviour of TDX
when CPU hot-removal, so I just used BUG() in assumption that TDX isn't safe to
use anymore.

But Rafael doesn't like current code change either. I think maybe we can just
disable CPU hotplug code when TDX is enabled by BIOS (something like below):

--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -707,6 +707,10 @@ bool acpi_duplicate_processor_id(int proc_id)
 void __init acpi_processor_init(void)
 {
        acpi_processor_check_duplicates();
+
+       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED))
+               return;
+
        acpi_scan_add_handler_with_hotplug(&processor_handler, "processor");
        acpi_scan_add_handler(&processor_container_handler);
 }

This approach is cleaner I think, but we won't be able to report "BIOS bug" when
ACPI CPU hotplug happens.  But to me it's OK as perhaps it's arguable to treat
it as BIOS bug (as theoretically BIOS can be from 3rd party). 

What's your opinion?
Igor Mammedov June 27, 2022, 8:01 a.m. UTC | #5
On Thu, 23 Jun 2022 12:01:48 +1200
Kai Huang <kai.huang@intel.com> wrote:

> On Wed, 2022-06-22 at 13:42 +0200, Rafael J. Wysocki wrote:
> > On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:  
> > > 
> > > Platforms with confidential computing technology may not support ACPI
> > > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > > include Intel platforms which support Intel Trust Domain Extensions
> > > (TDX).
> > > 
> > > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > > the kernel cannot continue to work normally, and BUG().
> > > 
> > > Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> > > platform doesn't support ACPI CPU hotplug, so that kernel can handle
> > > ACPI CPU hotplug events for such platform.  The existing attribute
> > > CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
> > > 
> > > In acpi_processor_{add|remove}(), add early check against this attribute
> > > and handle accordingly if it is set.
> > > 
> > > Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> > > CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
> > > 
> > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > > ---
> > >  arch/x86/coco/core.c          |  2 +-
> > >  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
> > >  include/linux/cc_platform.h   | 15 +++++++++++++--
> > >  kernel/cpu.c                  |  2 +-
> > >  4 files changed, 38 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> > > index 4320fadae716..1bde1af75296 100644
> > > --- a/arch/x86/coco/core.c
> > > +++ b/arch/x86/coco/core.c
> > > @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
> > >  {
> > >         switch (attr) {
> > >         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> > > -       case CC_ATTR_HOTPLUG_DISABLED:
> > > +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
> > >         case CC_ATTR_GUEST_MEM_ENCRYPT:
> > >         case CC_ATTR_MEM_ENCRYPT:
> > >                 return true;
> > > diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> > > index 6737b1cbf6d6..b960db864cd4 100644
> > > --- a/drivers/acpi/acpi_processor.c
> > > +++ b/drivers/acpi/acpi_processor.c
> > > @@ -15,6 +15,7 @@
> > >  #include <linux/kernel.h>
> > >  #include <linux/module.h>
> > >  #include <linux/pci.h>
> > > +#include <linux/cc_platform.h>
> > > 
> > >  #include <acpi/processor.h>
> > > 
> > > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> > >         struct device *dev;
> > >         int result = 0;
> > > 
> > > +       /*
> > > +        * If the confidential computing platform doesn't support ACPI
> > > +        * memory hotplug, the BIOS should never deliver such event to
> > > +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> > > +        * the new CPU.
> > > +        */
> > > +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {  
> > 
> > This will affect initialization, not just hotplug AFAICS.
> > 
> > You should reset the .hotplug.enabled flag in processor_handler to
> > false instead.  
> 
> Hi Rafael,
> 
> Thanks for the review.  By "affect initialization" did you mean this
> acpi_processor_add() is also called during kernel boot when any logical cpu is
> brought up?  Or do you mean ACPI CPU hotplug can also happen during kernel boot
> (after acpi_processor_init())?
> 
> I see acpi_processor_init() calls acpi_processor_check_duplicates() which calls
> acpi_evaluate_object() but I don't know details of ACPI so I don't know whether
> this would trigger acpi_processor_add().
> 
> One thing is TDX doesn't support ACPI CPU hotplug is an architectural thing, so
> it is illegal even if it happens during kernel boot.  Dave's idea is the kernel
> should  speak out loudly if physical CPU hotplug indeed happened on (BIOS) TDX-
> enabled platforms.  Otherwise perhaps we can just give up initializing the ACPI
> CPU hotplug in acpi_processor_init(), something like below?

The thing is that by the time ACPI machinery kicks in, physical hotplug
has already happened and in case of (kvm+qemu+ovmf hypervisor combo)
firmware has already handled it somehow and handed it over to ACPI.
If you say it's architectural thing then cpu hotplug is platform/firmware
bug and should be disabled there instead of working around it in the kernel.

Perhaps instead of 'preventing' hotplug, complain/panic and be done with it.
 
> --- a/drivers/acpi/acpi_processor.c
> +++ b/drivers/acpi/acpi_processor.c
> @@ -707,6 +707,10 @@ bool acpi_duplicate_processor_id(int proc_id)
>  void __init acpi_processor_init(void)
>  {
>         acpi_processor_check_duplicates();
> +
> +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED))
> +               return;
> +
>         acpi_scan_add_handler_with_hotplug(&processor_handler, "processor");
>         acpi_scan_add_handler(&processor_container_handler);
>  }
> 
> 
> >   
> > > +               dev_err(&device->dev, "[BIOS bug]: Platform doesn't support ACPI CPU hotplug.  New CPU ignored.\n");
> > > +               return -EINVAL;
> > > +       }
> > > +  
>
Huang, Kai June 28, 2022, 10:04 a.m. UTC | #6
On Mon, 2022-06-27 at 10:01 +0200, Igor Mammedov wrote:
> On Thu, 23 Jun 2022 12:01:48 +1200
> Kai Huang <kai.huang@intel.com> wrote:
> 
> > On Wed, 2022-06-22 at 13:42 +0200, Rafael J. Wysocki wrote:
> > > On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:  
> > > > 
> > > > Platforms with confidential computing technology may not support ACPI
> > > > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > > > include Intel platforms which support Intel Trust Domain Extensions
> > > > (TDX).
> > > > 
> > > > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > > > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > > > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > > > the kernel cannot continue to work normally, and BUG().
> > > > 
> > > > Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> > > > platform doesn't support ACPI CPU hotplug, so that kernel can handle
> > > > ACPI CPU hotplug events for such platform.  The existing attribute
> > > > CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
> > > > 
> > > > In acpi_processor_{add|remove}(), add early check against this attribute
> > > > and handle accordingly if it is set.
> > > > 
> > > > Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> > > > CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
> > > > 
> > > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > > > ---
> > > >  arch/x86/coco/core.c          |  2 +-
> > > >  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
> > > >  include/linux/cc_platform.h   | 15 +++++++++++++--
> > > >  kernel/cpu.c                  |  2 +-
> > > >  4 files changed, 38 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> > > > index 4320fadae716..1bde1af75296 100644
> > > > --- a/arch/x86/coco/core.c
> > > > +++ b/arch/x86/coco/core.c
> > > > @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
> > > >  {
> > > >         switch (attr) {
> > > >         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> > > > -       case CC_ATTR_HOTPLUG_DISABLED:
> > > > +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
> > > >         case CC_ATTR_GUEST_MEM_ENCRYPT:
> > > >         case CC_ATTR_MEM_ENCRYPT:
> > > >                 return true;
> > > > diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> > > > index 6737b1cbf6d6..b960db864cd4 100644
> > > > --- a/drivers/acpi/acpi_processor.c
> > > > +++ b/drivers/acpi/acpi_processor.c
> > > > @@ -15,6 +15,7 @@
> > > >  #include <linux/kernel.h>
> > > >  #include <linux/module.h>
> > > >  #include <linux/pci.h>
> > > > +#include <linux/cc_platform.h>
> > > > 
> > > >  #include <acpi/processor.h>
> > > > 
> > > > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> > > >         struct device *dev;
> > > >         int result = 0;
> > > > 
> > > > +       /*
> > > > +        * If the confidential computing platform doesn't support ACPI
> > > > +        * memory hotplug, the BIOS should never deliver such event to
> > > > +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> > > > +        * the new CPU.
> > > > +        */
> > > > +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {  
> > > 
> > > This will affect initialization, not just hotplug AFAICS.
> > > 
> > > You should reset the .hotplug.enabled flag in processor_handler to
> > > false instead.  
> > 
> > Hi Rafael,
> > 
> > Thanks for the review.  By "affect initialization" did you mean this
> > acpi_processor_add() is also called during kernel boot when any logical cpu is
> > brought up?  Or do you mean ACPI CPU hotplug can also happen during kernel boot
> > (after acpi_processor_init())?
> > 
> > I see acpi_processor_init() calls acpi_processor_check_duplicates() which calls
> > acpi_evaluate_object() but I don't know details of ACPI so I don't know whether
> > this would trigger acpi_processor_add().
> > 
> > One thing is TDX doesn't support ACPI CPU hotplug is an architectural thing, so
> > it is illegal even if it happens during kernel boot.  Dave's idea is the kernel
> > should  speak out loudly if physical CPU hotplug indeed happened on (BIOS) TDX-
> > enabled platforms.  Otherwise perhaps we can just give up initializing the ACPI
> > CPU hotplug in acpi_processor_init(), something like below?
> 
> The thing is that by the time ACPI machinery kicks in, physical hotplug
> has already happened and in case of (kvm+qemu+ovmf hypervisor combo)
> firmware has already handled it somehow and handed it over to ACPI.
> If you say it's architectural thing then cpu hotplug is platform/firmware
> bug and should be disabled there instead of working around it in the kernel.
> 
> Perhaps instead of 'preventing' hotplug, complain/panic and be done with it.

Hi Igor,

Thanks for feedback.  Yes the current implementation actually reports CPU hot-
add as BIOS bug.  I think I can report BIOS bug for hot-removal too.  And
currently I actually used BUG() for the hot-removal case.  For hot-add I didn't
use BUG() but rejected the new CPU as the latter is more conservative. 

Hi Rafael,

I am not sure I got what you mean by "This will affect initialization, not just
hotplug AFAICS", could you elaborate a little bit?  Thanks.
Igor Mammedov June 28, 2022, 11:52 a.m. UTC | #7
On Tue, 28 Jun 2022 22:04:43 +1200
Kai Huang <kai.huang@intel.com> wrote:

> On Mon, 2022-06-27 at 10:01 +0200, Igor Mammedov wrote:
> > On Thu, 23 Jun 2022 12:01:48 +1200
> > Kai Huang <kai.huang@intel.com> wrote:
> >   
> > > On Wed, 2022-06-22 at 13:42 +0200, Rafael J. Wysocki wrote:  
> > > > On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:    
> > > > > 
> > > > > Platforms with confidential computing technology may not support ACPI
> > > > > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > > > > include Intel platforms which support Intel Trust Domain Extensions
> > > > > (TDX).
> > > > > 
> > > > > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > > > > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > > > > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > > > > the kernel cannot continue to work normally, and BUG().
> > > > > 
> > > > > Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> > > > > platform doesn't support ACPI CPU hotplug, so that kernel can handle
> > > > > ACPI CPU hotplug events for such platform.  The existing attribute
> > > > > CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
> > > > > 
> > > > > In acpi_processor_{add|remove}(), add early check against this attribute
> > > > > and handle accordingly if it is set.
> > > > > 
> > > > > Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> > > > > CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
> > > > > 
> > > > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > > > > ---
> > > > >  arch/x86/coco/core.c          |  2 +-
> > > > >  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
> > > > >  include/linux/cc_platform.h   | 15 +++++++++++++--
> > > > >  kernel/cpu.c                  |  2 +-
> > > > >  4 files changed, 38 insertions(+), 4 deletions(-)
> > > > > 
> > > > > diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> > > > > index 4320fadae716..1bde1af75296 100644
> > > > > --- a/arch/x86/coco/core.c
> > > > > +++ b/arch/x86/coco/core.c
> > > > > @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
> > > > >  {
> > > > >         switch (attr) {
> > > > >         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> > > > > -       case CC_ATTR_HOTPLUG_DISABLED:
> > > > > +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
> > > > >         case CC_ATTR_GUEST_MEM_ENCRYPT:
> > > > >         case CC_ATTR_MEM_ENCRYPT:
> > > > >                 return true;
> > > > > diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> > > > > index 6737b1cbf6d6..b960db864cd4 100644
> > > > > --- a/drivers/acpi/acpi_processor.c
> > > > > +++ b/drivers/acpi/acpi_processor.c
> > > > > @@ -15,6 +15,7 @@
> > > > >  #include <linux/kernel.h>
> > > > >  #include <linux/module.h>
> > > > >  #include <linux/pci.h>
> > > > > +#include <linux/cc_platform.h>
> > > > > 
> > > > >  #include <acpi/processor.h>
> > > > > 
> > > > > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> > > > >         struct device *dev;
> > > > >         int result = 0;
> > > > > 
> > > > > +       /*
> > > > > +        * If the confidential computing platform doesn't support ACPI
> > > > > +        * memory hotplug, the BIOS should never deliver such event to
> > > > > +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> > > > > +        * the new CPU.
> > > > > +        */
> > > > > +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {    
> > > > 
> > > > This will affect initialization, not just hotplug AFAICS.
> > > > 
> > > > You should reset the .hotplug.enabled flag in processor_handler to
> > > > false instead.    
> > > 
> > > Hi Rafael,
> > > 
> > > Thanks for the review.  By "affect initialization" did you mean this
> > > acpi_processor_add() is also called during kernel boot when any logical cpu is
> > > brought up?  Or do you mean ACPI CPU hotplug can also happen during kernel boot
> > > (after acpi_processor_init())?
> > > 
> > > I see acpi_processor_init() calls acpi_processor_check_duplicates() which calls
> > > acpi_evaluate_object() but I don't know details of ACPI so I don't know whether
> > > this would trigger acpi_processor_add().
> > > 
> > > One thing is TDX doesn't support ACPI CPU hotplug is an architectural thing, so
> > > it is illegal even if it happens during kernel boot.  Dave's idea is the kernel
> > > should  speak out loudly if physical CPU hotplug indeed happened on (BIOS) TDX-
> > > enabled platforms.  Otherwise perhaps we can just give up initializing the ACPI
> > > CPU hotplug in acpi_processor_init(), something like below?  
> > 
> > The thing is that by the time ACPI machinery kicks in, physical hotplug
> > has already happened and in case of (kvm+qemu+ovmf hypervisor combo)
> > firmware has already handled it somehow and handed it over to ACPI.
> > If you say it's architectural thing then cpu hotplug is platform/firmware
> > bug and should be disabled there instead of working around it in the kernel.
> > 
> > Perhaps instead of 'preventing' hotplug, complain/panic and be done with it.  
> 
> Hi Igor,
> 
> Thanks for feedback.  Yes the current implementation actually reports CPU hot-
> add as BIOS bug.  I think I can report BIOS bug for hot-removal too.  And
> currently I actually used BUG() for the hot-removal case.  For hot-add I didn't
> use BUG() but rejected the new CPU as the latter is more conservative. 

Is it safe to ignore not properly initialized for TDX CPU,
sitting there (it may wake up to IRQs (as minimum SMI, but
maybe to IPIs as well (depending in what state FW left it))?

for hypervisors, one should disable cpu hotplug there
(ex: in QEMU, you can try to disable cpu hotplug completely
if TDX is enabled so it won't ever come to 'physical' cpu
being added to guest and no CPU hotplug related ACPI AML
code generated)

> Hi Rafael,
> 
> I am not sure I got what you mean by "This will affect initialization, not just
> hotplug AFAICS", could you elaborate a little bit?  Thanks.
> 
>
Rafael J. Wysocki June 28, 2022, 5:33 p.m. UTC | #8
On Tue, Jun 28, 2022 at 12:04 PM Kai Huang <kai.huang@intel.com> wrote:
>
> On Mon, 2022-06-27 at 10:01 +0200, Igor Mammedov wrote:
> > On Thu, 23 Jun 2022 12:01:48 +1200
> > Kai Huang <kai.huang@intel.com> wrote:
> >
> > > On Wed, 2022-06-22 at 13:42 +0200, Rafael J. Wysocki wrote:
> > > > On Wed, Jun 22, 2022 at 1:16 PM Kai Huang <kai.huang@intel.com> wrote:
> > > > >
> > > > > Platforms with confidential computing technology may not support ACPI
> > > > > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > > > > include Intel platforms which support Intel Trust Domain Extensions
> > > > > (TDX).
> > > > >
> > > > > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > > > > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > > > > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > > > > the kernel cannot continue to work normally, and BUG().
> > > > >
> > > > > Add a new attribute CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED to indicate the
> > > > > platform doesn't support ACPI CPU hotplug, so that kernel can handle
> > > > > ACPI CPU hotplug events for such platform.  The existing attribute
> > > > > CC_ATTR_HOTPLUG_DISABLED is for software CPU hotplug thus doesn't fit.
> > > > >
> > > > > In acpi_processor_{add|remove}(), add early check against this attribute
> > > > > and handle accordingly if it is set.
> > > > >
> > > > > Also take this chance to rename existing CC_ATTR_HOTPLUG_DISABLED to
> > > > > CC_ATTR_CPU_HOTPLUG_DISABLED as it is for software CPU hotplug.
> > > > >
> > > > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > > > > ---
> > > > >  arch/x86/coco/core.c          |  2 +-
> > > > >  drivers/acpi/acpi_processor.c | 23 +++++++++++++++++++++++
> > > > >  include/linux/cc_platform.h   | 15 +++++++++++++--
> > > > >  kernel/cpu.c                  |  2 +-
> > > > >  4 files changed, 38 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
> > > > > index 4320fadae716..1bde1af75296 100644
> > > > > --- a/arch/x86/coco/core.c
> > > > > +++ b/arch/x86/coco/core.c
> > > > > @@ -20,7 +20,7 @@ static bool intel_cc_platform_has(enum cc_attr attr)
> > > > >  {
> > > > >         switch (attr) {
> > > > >         case CC_ATTR_GUEST_UNROLL_STRING_IO:
> > > > > -       case CC_ATTR_HOTPLUG_DISABLED:
> > > > > +       case CC_ATTR_CPU_HOTPLUG_DISABLED:
> > > > >         case CC_ATTR_GUEST_MEM_ENCRYPT:
> > > > >         case CC_ATTR_MEM_ENCRYPT:
> > > > >                 return true;
> > > > > diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
> > > > > index 6737b1cbf6d6..b960db864cd4 100644
> > > > > --- a/drivers/acpi/acpi_processor.c
> > > > > +++ b/drivers/acpi/acpi_processor.c
> > > > > @@ -15,6 +15,7 @@
> > > > >  #include <linux/kernel.h>
> > > > >  #include <linux/module.h>
> > > > >  #include <linux/pci.h>
> > > > > +#include <linux/cc_platform.h>
> > > > >
> > > > >  #include <acpi/processor.h>
> > > > >
> > > > > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> > > > >         struct device *dev;
> > > > >         int result = 0;
> > > > >
> > > > > +       /*
> > > > > +        * If the confidential computing platform doesn't support ACPI
> > > > > +        * memory hotplug, the BIOS should never deliver such event to
> > > > > +        * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> > > > > +        * the new CPU.
> > > > > +        */
> > > > > +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
> > > >
> > > > This will affect initialization, not just hotplug AFAICS.
> > > >
> > > > You should reset the .hotplug.enabled flag in processor_handler to
> > > > false instead.
> > >
> > > Hi Rafael,
> > >
> > > Thanks for the review.  By "affect initialization" did you mean this
> > > acpi_processor_add() is also called during kernel boot when any logical cpu is
> > > brought up?  Or do you mean ACPI CPU hotplug can also happen during kernel boot
> > > (after acpi_processor_init())?
> > >
> > > I see acpi_processor_init() calls acpi_processor_check_duplicates() which calls
> > > acpi_evaluate_object() but I don't know details of ACPI so I don't know whether
> > > this would trigger acpi_processor_add().
> > >
> > > One thing is TDX doesn't support ACPI CPU hotplug is an architectural thing, so
> > > it is illegal even if it happens during kernel boot.  Dave's idea is the kernel
> > > should  speak out loudly if physical CPU hotplug indeed happened on (BIOS) TDX-
> > > enabled platforms.  Otherwise perhaps we can just give up initializing the ACPI
> > > CPU hotplug in acpi_processor_init(), something like below?
> >
> > The thing is that by the time ACPI machinery kicks in, physical hotplug
> > has already happened and in case of (kvm+qemu+ovmf hypervisor combo)
> > firmware has already handled it somehow and handed it over to ACPI.
> > If you say it's architectural thing then cpu hotplug is platform/firmware
> > bug and should be disabled there instead of working around it in the kernel.
> >
> > Perhaps instead of 'preventing' hotplug, complain/panic and be done with it.
>
> Hi Igor,
>
> Thanks for feedback.  Yes the current implementation actually reports CPU hot-
> add as BIOS bug.  I think I can report BIOS bug for hot-removal too.  And
> currently I actually used BUG() for the hot-removal case.  For hot-add I didn't
> use BUG() but rejected the new CPU as the latter is more conservative.
>
> Hi Rafael,
>
> I am not sure I got what you mean by "This will affect initialization, not just
> hotplug AFAICS", could you elaborate a little bit?  Thanks.

So acpi_processor_add() is called for CPUs that are already present at
init time, not just for the hot-added ones.

One of the things it does is to associate an ACPI companion with the given CPU.

Don't you need that to happen?
Huang, Kai June 28, 2022, 11:41 p.m. UTC | #9
On Tue, 2022-06-28 at 19:33 +0200, Rafael J. Wysocki wrote:
> > Hi Rafael,
> > 
> > I am not sure I got what you mean by "This will affect initialization, not
> > just
> > hotplug AFAICS", could you elaborate a little bit?  Thanks.
> 
> So acpi_processor_add() is called for CPUs that are already present at
> init time, not just for the hot-added ones.
> 
> One of the things it does is to associate an ACPI companion with the given
> CPU.
> 
> Don't you need that to happen?

You are right.  I did test again and yes it was also called after boot-time
present cpus are up (after smp_init()).  I didn't check this carefully at my
previous test.  Thanks for catching.
Christoph Hellwig June 29, 2022, 5:33 a.m. UTC | #10
On Wed, Jun 22, 2022 at 11:15:43PM +1200, Kai Huang wrote:
> Platforms with confidential computing technology may not support ACPI
> CPU hotplug when such technology is enabled by the BIOS.  Examples
> include Intel platforms which support Intel Trust Domain Extensions
> (TDX).

What does this have to to wit hthe cc_platform abstraction?  This is
just an intel implementation bug because they hastended so much into
implementing this.  So the quirks should not overload the cc_platform
abstraction.
Huang, Kai June 29, 2022, 9:09 a.m. UTC | #11
On Tue, 2022-06-28 at 22:33 -0700, Christoph Hellwig wrote:
> On Wed, Jun 22, 2022 at 11:15:43PM +1200, Kai Huang wrote:
> > Platforms with confidential computing technology may not support ACPI
> > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > include Intel platforms which support Intel Trust Domain Extensions
> > (TDX).
> 
> What does this have to to wit hthe cc_platform abstraction?  This is
> just an intel implementation bug because they hastended so much into
> implementing this.  So the quirks should not overload the cc_platform
> abstraction.
> 

Thanks for feedback.  I thought there might be similar technologies and it would
be better to have a common attribute.  I'll give up this approach and change to
use arch-specific check.
Huang, Kai July 13, 2022, 11:09 a.m. UTC | #12
On Mon, 2022-06-27 at 17:05 +1200, Kai Huang wrote:
> On Fri, 2022-06-24 at 11:57 -0700, Dave Hansen wrote:
> > On 6/22/22 04:15, Kai Huang wrote:
> > > Platforms with confidential computing technology may not support ACPI
> > > CPU hotplug when such technology is enabled by the BIOS.  Examples
> > > include Intel platforms which support Intel Trust Domain Extensions
> > > (TDX).
> > > 
> > > If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
> > > bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
> > > bug and reject the new CPU.  For hot-removal, for simplicity just assume
> > > the kernel cannot continue to work normally, and BUG().
> > 
> > So, the kernel is now declaring ACPI CPU hotplug and TDX to be
> > incompatible and even BUG()'ing if we see them together.  Has anyone
> > told the firmware guys about this?  Is this in a spec somewhere?  When
> > the kernel goes boom, are the firmware folks going to cry "Kernel bug!!"?
> > 
> > This doesn't seem like something the kernel should be doing unilaterally.
> 
> TDX doesn't support ACPI CPU hotplug (both hot-add and hot-removal) is an
> architectural behaviour.  The public specs doesn't explicitly say  it, but it is
> implied:
> 
> 1) During platform boot MCHECK verifies all logical CPUs on all packages that
> they are TDX compatible, and it keeps some information, such as total CPU
> packages and total logical cpus at some location of SEAMRR so it can later be
> used by P-SEAMLDR and TDX module.  Please see "3.4 SEAMLDR_SEAMINFO" in the P-
> SEAMLDR spec:
> 
> https://cdrdv2.intel.com/v1/dl/getContent/733584
> 
> 2) Also some SEAMCALLs must be called on all logical CPUs or CPU packages that
> the platform has (such as such as TDH.SYS.INIT.LP and TDH.SYS.KEY.CONFIG),
> otherwise the further step of TDX module initialization will fail.
> 
> Unfortunately there's no public spec mentioning what's the behaviour of ACPI CPU
> hotplug on TDX enabled platform.  For instance, whether BIOS will ever get the
> ACPI CPU hot-plug event, or if BIOS gets the event, will it suppress it.  What I
> got from Intel internally is a non-buggy BIOS should never report such event to
> the kernel, so if kernel receives such event, it should be fair enough to treat
> it as BIOS bug.
> 
> But theoretically, the BIOS isn't in TDX's TCB, and can be from 3rd party..
> 
> Also, I was told "CPU hot-plug is a system feature, not a CPU feature or Intel
> architecture feature", so Intel doesn't have an architectural specification for
> CPU hot-plug. 
> 
> At the meantime, I am pushing Intel internally to add some statements regarding
> to the TDX and CPU hotplug interaction to the BIOS write guide and make it
> public.  I guess this is the best thing we can do.
> 
> Regarding to the code change, I agree the BUG() isn't good.  I used it because:
> 1) this basically on a theoretical problem and shouldn't happen in practice; 2)
> because there's no architectural specification regarding to the behaviour of TDX
> when CPU hot-removal, so I just used BUG() in assumption that TDX isn't safe to
> use anymore.

Hi Dave,

Trying to close how to handle ACPI CPU hotplug for TDX.  Could you give some
suggestion?

After discussion with TDX guys, they have agreed they will add below to either
the TDX module spec or TDX whitepaper:

"TDX doesn’t support adding or removing CPUs from TDX security perimeter. The
BIOS should prevent CPUs from being hot-added or hot-removed after platform
boots."

This means if TDX is enabled in BIOS, a non-buggy BIOS should never deliver ACPI
CPU hotplug event to kernel, otherwise it is a BIOS bug.  And this is only
related to whether TDX is enabled in BIOS, no matter whether the TDX module has
been loaded/initialized or not.

So I think the proper way to handle is: we still have code to detect whether TDX
is enabled by BIOS (patch 01 in this series), and when ACPI CPU hotplug happens
on TDX enabled platform, we print out error message saying it is a BIOS bug.

Specifically, for CPU hot-add, we can print error message and reject the new
CPU.  For CPU hot-removal, when the kernel receives this event, the CPU hot-
removal has already handled by BIOS so the kernel cannot reject it.  So I think
we can either BUG(), or say "TDX is broken and please reboot the machine".

I guess BUG() would catch a lot of eyeball, so how about choose the latter, like
below?

--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -799,6 +799,7 @@ static void __init acpi_set_irq_model_ioapic(void)
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
+#include <asm/tdx.h>
 
 static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
@@ -819,6 +820,12 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid,
u32 acpi_id,
 {
        int cpu;
 
+       if (platform_tdx_enabled()) {
+               pr_err("BIOS bug: CPU (physid %u) hot-added on TDX enabled
platform. Reject it.\n",
+                               physid);
+               return -EINVAL;
+       }
+
        cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
        if (cpu < 0) {
                pr_info("Unable to map lapic to logical cpu number\n");
@@ -835,6 +842,10 @@ EXPORT_SYMBOL(acpi_map_cpu);
 
 int acpi_unmap_cpu(int cpu)
 {
+       if (platform_tdx_enabled())
+               pr_err("BIOS bug: CPU %d hot-removed on TDX enabled platform.
TDX is broken. Please reboot the machine.\n",
+                               cpu);
+
 #ifdef CONFIG_ACPI_NUMA
        set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
 #endif
Dave Hansen July 19, 2022, 5:46 p.m. UTC | #13
On 7/13/22 04:09, Kai Huang wrote:
...
> "TDX doesn’t support adding or removing CPUs from TDX security perimeter. The
> BIOS should prevent CPUs from being hot-added or hot-removed after platform
> boots."

That's a start.  It also probably needs to say that the security
perimeter includes all logical CPUs, though.

>  static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
>  {
> @@ -819,6 +820,12 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid,
> u32 acpi_id,
>  {
>         int cpu;
>  
> +       if (platform_tdx_enabled()) {
> +               pr_err("BIOS bug: CPU (physid %u) hot-added on TDX enabled
> platform. Reject it.\n",
> +                               physid);
> +               return -EINVAL;
> +       }

Is this the right place?  There are other sanity checks in
acpi_processor_hotadd_init() and it seems like a better spot.

>         cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
>         if (cpu < 0) {
>                 pr_info("Unable to map lapic to logical cpu number\n");
> @@ -835,6 +842,10 @@ EXPORT_SYMBOL(acpi_map_cpu);
>  
>  int acpi_unmap_cpu(int cpu)
>  {
> +       if (platform_tdx_enabled())
> +               pr_err("BIOS bug: CPU %d hot-removed on TDX enabled platform.
> TDX is broken. Please reboot the machine.\n",
> +                               cpu);
> +
>  #ifdef CONFIG_ACPI_NUMA
>         set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
>  #endif
Huang, Kai July 19, 2022, 11:54 p.m. UTC | #14
On Tue, 2022-07-19 at 10:46 -0700, Dave Hansen wrote:
> On 7/13/22 04:09, Kai Huang wrote:
> ...
> > "TDX doesn’t support adding or removing CPUs from TDX security perimeter. The
> > BIOS should prevent CPUs from being hot-added or hot-removed after platform
> > boots."
> 
> That's a start.  It also probably needs to say that the security
> perimeter includes all logical CPUs, though.

To me it is kinda implied.  But I have sent email to TDX spec owner to see
whether we can say it more explicitly.

> 
> >  static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> >  {
> > @@ -819,6 +820,12 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid,
> > u32 acpi_id,
> >  {
> >         int cpu;
> >  
> > +       if (platform_tdx_enabled()) {
> > +               pr_err("BIOS bug: CPU (physid %u) hot-added on TDX enabled
> > platform. Reject it.\n",
> > +                               physid);
> > +               return -EINVAL;
> > +       }
> 
> Is this the right place?  There are other sanity checks in
> acpi_processor_hotadd_init() and it seems like a better spot.

It has below additional check:

        if (invalid_phys_cpuid(pr->phys_id))
                return -ENODEV;
        
        status = acpi_evaluate_integer(pr->handle, "_STA", NULL, &sta);
        if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_PRESENT))
                return -ENODEV;


I don't know exactly when will the first "invalid_phys_cpuid()" case happen, but
the CPU is enumerated as "present" only after the second check.  I.e. if BIOS is
buggy and somehow sends a ACPI CPU hot-add event to kernel w/o having the CPU
being actually hot-added, the kernel just returns -ENODEV here.

So to me, adding to acpi_map_cpu() is more reasonable, because by reaching here,
it is sure that a real CPU is being hot-added.


> 
> >         cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
> >         if (cpu < 0) {
> >                 pr_info("Unable to map lapic to logical cpu number\n");
> > @@ -835,6 +842,10 @@ EXPORT_SYMBOL(acpi_map_cpu);
> >  
> >  int acpi_unmap_cpu(int cpu)
> >  {
> > +       if (platform_tdx_enabled())
> > +               pr_err("BIOS bug: CPU %d hot-removed on TDX enabled platform.
> > TDX is broken. Please reboot the machine.\n",
> > +                               cpu);
> > +
> >  #ifdef CONFIG_ACPI_NUMA
> >         set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
> >  #endif
>
Binbin Wu Aug. 3, 2022, 3:40 a.m. UTC | #15
On 2022/6/27 13:05, Kai Huang wrote:
> On Fri, 2022-06-24 at 11:57 -0700, Dave Hansen wrote:
>> On 6/22/22 04:15, Kai Huang wrote:
>>> Platforms with confidential computing technology may not support ACPI
>>> CPU hotplug when such technology is enabled by the BIOS.  Examples
>>> include Intel platforms which support Intel Trust Domain Extensions
>>> (TDX).
>>>
>>> If the kernel ever receives ACPI CPU hotplug event, it is likely a BIOS
>>> bug.  For ACPI CPU hot-add, the kernel should speak out this is a BIOS
>>> bug and reject the new CPU.  For hot-removal, for simplicity just assume
>>> the kernel cannot continue to work normally, and BUG().
>> So, the kernel is now declaring ACPI CPU hotplug and TDX to be
>> incompatible and even BUG()'ing if we see them together.  Has anyone
>> told the firmware guys about this?  Is this in a spec somewhere?  When
>> the kernel goes boom, are the firmware folks going to cry "Kernel bug!!"?
>>
>> This doesn't seem like something the kernel should be doing unilaterally.
> TDX doesn't support ACPI CPU hotplug (both hot-add and hot-removal) is an
> architectural behaviour.  The public specs doesn't explicitly say  it, but it is
> implied:
>
> 1) During platform boot MCHECK verifies all logical CPUs on all packages that
> they are TDX compatible, and it keeps some information, such as total CPU
> packages and total logical cpus at some location of SEAMRR so it can later be
> used by P-SEAMLDR and TDX module.  Please see "3.4 SEAMLDR_SEAMINFO" in the P-
> SEAMLDR spec:
>
> https://cdrdv2.intel.com/v1/dl/getContent/733584
>
> 2) Also some SEAMCALLs must be called on all logical CPUs or CPU packages that
> the platform has (such as such as TDH.SYS.INIT.LP and TDH.SYS.KEY.CONFIG),
> otherwise the further step of TDX module initialization will fail.
>
> Unfortunately there's no public spec mentioning what's the behaviour of ACPI CPU
> hotplug on TDX enabled platform.  For instance, whether BIOS will ever get the
> ACPI CPU hot-plug event, or if BIOS gets the event, will it suppress it.  What I
> got from Intel internally is a non-buggy BIOS should never report such event to
> the kernel, so if kernel receives such event, it should be fair enough to treat
> it as BIOS bug.
>
> But theoretically, the BIOS isn't in TDX's TCB, and can be from 3rd party..
>
> Also, I was told "CPU hot-plug is a system feature, not a CPU feature or Intel
> architecture feature", so Intel doesn't have an architectural specification for
> CPU hot-plug.
>
> At the meantime, I am pushing Intel internally to add some statements regarding
> to the TDX and CPU hotplug interaction to the BIOS write guide and make it
> public.  I guess this is the best thing we can do.
>
> Regarding to the code change, I agree the BUG() isn't good.  I used it because:
> 1) this basically on a theoretical problem and shouldn't happen in practice; 2)
> because there's no architectural specification regarding to the behaviour of TDX
> when CPU hot-removal, so I just used BUG() in assumption that TDX isn't safe to
> use anymore.

host kernel is also not in TDX's TCB either, what would happen if kernel 
doesn't
do anything in case of buggy BIOS? How does TDX handle the case to 
enforce the
secure of TDs?


>
> But Rafael doesn't like current code change either. I think maybe we can just
> disable CPU hotplug code when TDX is enabled by BIOS (something like below):
>
> --- a/drivers/acpi/acpi_processor.c
> +++ b/drivers/acpi/acpi_processor.c
> @@ -707,6 +707,10 @@ bool acpi_duplicate_processor_id(int proc_id)
>   void __init acpi_processor_init(void)
>   {
>          acpi_processor_check_duplicates();
> +
> +       if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED))
> +               return;
> +
>          acpi_scan_add_handler_with_hotplug(&processor_handler, "processor");
>          acpi_scan_add_handler(&processor_container_handler);
>   }
>
> This approach is cleaner I think, but we won't be able to report "BIOS bug" when
> ACPI CPU hotplug happens.  But to me it's OK as perhaps it's arguable to treat
> it as BIOS bug (as theoretically BIOS can be from 3rd party).
>
> What's your opinion?
>
Binbin Wu Aug. 3, 2022, 3:55 a.m. UTC | #16
On 2022/6/22 19:15, Kai Huang wrote:
>   
> @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
>   	struct device *dev;
>   	int result = 0;
>   
> +	/*
> +	 * If the confidential computing platform doesn't support ACPI
> +	 * memory hotplug, the BIOS should never deliver such event to
memory or cpu hotplug?


> +	 * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
> +	 * the new CPU.
> +	 */
> +	if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
> +		dev_err(&device->dev, "[BIOS bug]: Platform doesn't support ACPI CPU hotplug.  New CPU ignored.\n");
> +		return -EINVAL;
> +	}
> +
>   	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
>   	if (!pr)
>   		return -ENOMEM;
> @@ -434,6 +446,17 @@ static void acpi_processor_remove(struct acpi_device *device)
>   	if (!device || !acpi_driver_data(device))
>   		return;
>   
> +	/*
> +	 * The confidential computing platform is broken if ACPI memory
ditto


> +	 * hot-removal isn't supported but it happened anyway.  Assume
> +	 * it's not guaranteed that the kernel can continue to work
> +	 * normally.  Just BUG().
> +	 */
> +	if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
> +		dev_err(&device->dev, "Platform doesn't support ACPI CPU hotplug. BUG().\n");
> +		BUG();
> +	}
>
Huang, Kai Aug. 3, 2022, 9:20 a.m. UTC | #17
On Wed, 2022-08-03 at 11:40 +0800, Binbin Wu wrote:
> host kernel is also not in TDX's TCB either, what would happen if kernel 
> doesn't
> do anything in case of buggy BIOS? How does TDX handle the case to 
> enforce the
> secure of TDs?

TDX doesn't support hot-add or hot-removal CPU from TDX' security perimeter at
runtime.  Even BIOS/kernel can ever bring up new CPUs at runtime, the new CPUs
cannot run within TDX's security domain, in which case TDX's security isn't
compromised.  If kernel schedules a TD to a new added CPU, then AFAICT the
behaviour is TDX module implementation specific but not architectural.  A
reasonable behaviour would be the TDENTER should refuse to run when the CPU
isn't verified by TDX during boot.

If any CPU is hot-removed, then the security's TDX isn't compromised, but TDX is
not guaranteed to functionally work anymore.
Huang, Kai Aug. 3, 2022, 9:21 a.m. UTC | #18
On Wed, 2022-08-03 at 11:55 +0800, Binbin Wu wrote:
> On 2022/6/22 19:15, Kai Huang wrote:
> >   
> > @@ -357,6 +358,17 @@ static int acpi_processor_add(struct acpi_device *device,
> >   	struct device *dev;
> >   	int result = 0;
> >   
> > +	/*
> > +	 * If the confidential computing platform doesn't support ACPI
> > +	 * memory hotplug, the BIOS should never deliver such event to
> memory or cpu hotplug?

Sorry typo.  Should be CPU.

Anyway this patch will be dropped in next version.
diff mbox series

Patch

diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 4320fadae716..1bde1af75296 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -20,7 +20,7 @@  static bool intel_cc_platform_has(enum cc_attr attr)
 {
 	switch (attr) {
 	case CC_ATTR_GUEST_UNROLL_STRING_IO:
-	case CC_ATTR_HOTPLUG_DISABLED:
+	case CC_ATTR_CPU_HOTPLUG_DISABLED:
 	case CC_ATTR_GUEST_MEM_ENCRYPT:
 	case CC_ATTR_MEM_ENCRYPT:
 		return true;
diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 6737b1cbf6d6..b960db864cd4 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -15,6 +15,7 @@ 
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/cc_platform.h>
 
 #include <acpi/processor.h>
 
@@ -357,6 +358,17 @@  static int acpi_processor_add(struct acpi_device *device,
 	struct device *dev;
 	int result = 0;
 
+	/*
+	 * If the confidential computing platform doesn't support ACPI
+	 * memory hotplug, the BIOS should never deliver such event to
+	 * the kernel.  Report ACPI CPU hot-add as a BIOS bug and ignore
+	 * the new CPU.
+	 */
+	if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
+		dev_err(&device->dev, "[BIOS bug]: Platform doesn't support ACPI CPU hotplug.  New CPU ignored.\n");
+		return -EINVAL;
+	}
+
 	pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
 	if (!pr)
 		return -ENOMEM;
@@ -434,6 +446,17 @@  static void acpi_processor_remove(struct acpi_device *device)
 	if (!device || !acpi_driver_data(device))
 		return;
 
+	/*
+	 * The confidential computing platform is broken if ACPI memory
+	 * hot-removal isn't supported but it happened anyway.  Assume
+	 * it's not guaranteed that the kernel can continue to work
+	 * normally.  Just BUG().
+	 */
+	if (cc_platform_has(CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED)) {
+		dev_err(&device->dev, "Platform doesn't support ACPI CPU hotplug. BUG().\n");
+		BUG();
+	}
+
 	pr = acpi_driver_data(device);
 	if (pr->id >= nr_cpu_ids)
 		goto out;
diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
index 691494bbaf5a..9ce9256facc8 100644
--- a/include/linux/cc_platform.h
+++ b/include/linux/cc_platform.h
@@ -74,14 +74,25 @@  enum cc_attr {
 	CC_ATTR_GUEST_UNROLL_STRING_IO,
 
 	/**
-	 * @CC_ATTR_HOTPLUG_DISABLED: Hotplug is not supported or disabled.
+	 * @CC_ATTR_CPU_HOTPLUG_DISABLED: CPU hotplug is not supported or
+	 *				  disabled.
 	 *
 	 * The platform/OS is running as a guest/virtual machine does not
 	 * support CPU hotplug feature.
 	 *
 	 * Examples include TDX Guest.
 	 */
-	CC_ATTR_HOTPLUG_DISABLED,
+	CC_ATTR_CPU_HOTPLUG_DISABLED,
+
+	/**
+	 * @CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED: ACPI CPU hotplug is not
+	 *				       supported.
+	 *
+	 * The platform/OS does not support ACPI CPU hotplug.
+	 *
+	 * Examples include TDX platform.
+	 */
+	CC_ATTR_ACPI_CPU_HOTPLUG_DISABLED,
 };
 
 #ifdef CONFIG_ARCH_HAS_CC_PLATFORM
diff --git a/kernel/cpu.c b/kernel/cpu.c
index edb8c199f6a3..966772cce063 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1191,7 +1191,7 @@  static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 	 * If the platform does not support hotplug, report it explicitly to
 	 * differentiate it from a transient offlining failure.
 	 */
-	if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+	if (cc_platform_has(CC_ATTR_CPU_HOTPLUG_DISABLED))
 		return -EOPNOTSUPP;
 	if (cpu_hotplug_disabled)
 		return -EBUSY;