diff mbox

[v4,14/14] xen/x86: setup PVHv2 Dom0 ACPI tables

Message ID 20161130164950.43543-15-roger.pau@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Roger Pau Monné Nov. 30, 2016, 4:49 p.m. UTC
Create a new MADT table that contains the topology exposed to the guest. A
new XSDT table is also created, in order to filter the tables that we want
to expose to the guest, plus the Xen crafted MADT. This in turn requires Xen
to also create a new RSDP in order to make it point to the custom XSDT.

Also, regions marked as E820_ACPI or E820_NVS are identity mapped into Dom0
p2m, plus any top-level ACPI tables that should be accessible to Dom0 and
reside in reserved regions. This is needed because some memory maps don't
properly account for all the memory used by ACPI, so it's common to find ACPI
tables in reserved regions.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
---
Changes since v3:
 - Use hvm_copy_to_phys in order to copy the tables to Dom0 memory.
 - Return EEXIST for overlapping ranges in hvm_add_mem_range.
 - s/ov/ovr/ for interrupt override parsing functions.
 - Constify intr local variable in acpi_set_intr_ovr.
 - Use structure assignment for type safety.
 - Perform sizeof using local variables in hvm_setup_acpi_madt.
 - Manually set revision of crafted/modified tables.
 - Only map tables to guest that reside in reserved or ACPI memory regions.
 - Copy the RSDP OEM signature to the crafted RSDP.
 - Pair calls to acpi_os_map_memory/acpi_os_unmap_memory.
 - Add memory regions for allowed ACPI tables to the memory map and then
   perform the identity mappings. This avoids having to call modify_identity_mmio
   multiple times.
 - Add a FIXME comment regarding the lack of multiple vIO-APICs.

Changes since v2:
 - Completely reworked.
---
 xen/arch/x86/domain_build.c | 429 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 428 insertions(+), 1 deletion(-)

Comments

Jan Beulich Dec. 12, 2016, 1:56 p.m. UTC | #1
>>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
> @@ -567,7 +573,7 @@ static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
>      /*
>       * Craft the e820 memory map for Dom0 based on the hardware e820 map.
>       */
> -    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
> +    d->arch.e820 = xzalloc_array(struct e820entry, E820MAX);

I have to admit that I consider this both wasteful and inflexible.
While commonly you'll need far less entries, what if in fact you
need more?

> +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
> +{
> +    struct acpi_table_madt *madt;
> +    struct acpi_table_header *table;
> +    struct acpi_madt_io_apic *io_apic;
> +    struct acpi_madt_local_apic *local_apic;

I think I had asked about the absence of struct acpi_madt_local_x2apic
here before, but now that I look again I also wonder how you get away
without struct acpi_madt_nmi_source and struct acpi_madt_local_apic_nmi;
I can kind of see struct acpi_madt_local_apic_override not being
needed here (and I'd really hope anyway that no BIOS actually makes
use of it). The question mainly is what possible damage there may be
if what Dom0 gets to see disagrees with what the firmware acts on
(remember that AML code may alter MADT contents).

> +    acpi_status status;
> +    unsigned long size;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Count number of interrupt overrides in the MADT. */
> +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
> +                          acpi_count_intr_ovr, MAX_IRQ_SOURCES);

What's the reason for using MAX_IRQ_SOURCES here? There's no
static array involved here limiting the supported count.

> +    /* Calculate the size of the crafted MADT. */
> +    size = sizeof(*madt);
> +    size += sizeof(*io_apic);
> +    size += sizeof(*local_apic) * dom0_max_vcpus();
> +    size += sizeof(struct acpi_madt_interrupt_override) * acpi_intr_overrrides;

sizeof(*intsrcovr)

> +    madt = xzalloc_bytes(size);
> +    if ( !madt )
> +    {
> +        printk("Unable to allocate memory for MADT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native MADT table header. */
> +    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
> +    if ( !ACPI_SUCCESS(status) )
> +    {
> +        printk("Failed to get MADT ACPI table, aborting.\n");
> +        return -EINVAL;
> +    }
> +    *((struct acpi_table_header *)madt) = *table;

madt.header = *table;

> +    madt->address = APIC_DEFAULT_PHYS_BASE;
> +    /*
> +     * NB: this is currently set to 4, which is the revision in the ACPI
> +     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
> +     * tables described in the headers.
> +     */
> +    madt->header.revision = 4;

I can see you wanting to cap the revision here, but is it safe to
possibly bump it from what firmware has?

> +    /*
> +     * Setup the IO APIC entry.
> +     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
> +     * per domain. This must be fixed in order to provide the same amount of
> +     * IO APICs as available on bare metal.
> +     */
> +    if ( nr_ioapics > 1 )
> +        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n",
> +               nr_ioapics);
> +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> +    io_apic->header.length = sizeof(*io_apic);
> +    io_apic->id = 1;

Is it safe to use an ID other than what firmware did assign? Where
is this 1 coming from? And how does this get sync-ed up with the
respective struct hvm_hw_vioapic field?

> +    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
> +
> +    local_apic = (struct acpi_madt_local_apic *)(io_apic + 1);
> +    for ( i = 0; i < dom0_max_vcpus(); i++ )
> +    {
> +        local_apic->header.type = ACPI_MADT_TYPE_LOCAL_APIC;
> +        local_apic->header.length = sizeof(*local_apic);
> +        local_apic->processor_id = i;
> +        local_apic->id = i * 2;
> +        local_apic->lapic_flags = ACPI_MADT_ENABLED;
> +        local_apic++;
> +    }
> +
> +    /* Setup interrupt overrides. */
> +    intsrcovr = (struct acpi_madt_interrupt_override *)local_apic;
> +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr,
> +                          MAX_IRQ_SOURCES);
> +    ASSERT(((unsigned char *)intsrcovr - (unsigned char *)madt) == size);

Likely easier to read if you used void * or long for the casts here.

> +static bool __init hvm_acpi_table_allowed(const char *sig)
> +{
> +    static const char __init banned_tables[][ACPI_NAME_SIZE] = {
> +        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
> +        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
> +    unsigned long pfn, nr_pages;
> +    int i;
> +
> +    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
> +        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
> +            return false;
> +
> +    /* Make sure table doesn't reside in a RAM region. */
> +    pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
> +    nr_pages = DIV_ROUND_UP(
> +                    (acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
> +                    acpi_gbl_root_table_list.tables[i].length, PAGE_SIZE);

How did you choose indentation here? I think there are a number
of ways compatible with how we commonly do it, but this is not one
of them. Preferably

    nr_pages =
        DIV_ROUND_UP((acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
                     acpi_gbl_root_table_list.tables[i].length,
                     PAGE_SIZE);

or even

    nr_pages =
        PFN_UP((acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
               acpi_gbl_root_table_list.tables[i].length);

> +    if ( acpi_memory_banned(pfn, nr_pages) )
> +    {
> +        printk("Skipping table %.4s because resides in a non ACPI or reserved region\n",

I think this wording can be mis-read. How about "... because resides
in a non-ACPI, non-reserved region"?

> +static int __init hvm_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
> +                                      paddr_t *addr)
> +{
> +    struct acpi_table_xsdt *xsdt;
> +    struct acpi_table_header *table;
> +    struct acpi_table_rsdp *rsdp;
> +    unsigned long size;
> +    unsigned int i, num_tables;
> +    paddr_t xsdt_paddr;
> +    int j, rc;

How come i is properly unsigned, but j is plain int?

> +    /*
> +     * Restore original DMAR table signature, we are going to filter it
> +     * from the new XSDT that is presented to the guest, so it no longer
> +     * makes sense to have it's signature zapped.
> +     */
> +    acpi_dmar_reinstate();
> +
> +    /* Account for the space needed by the XSDT. */
> +    size = sizeof(*xsdt);
> +    num_tables = 0;

Mind making these the initializers of the variables?

> +    /* Count the number of tables that will be added to the XSDT. */
> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( hvm_acpi_table_allowed(sig) )
> +            num_tables++;
> +    }
> +
> +    /*
> +     * No need to subtract one because we will be adding a custom MADT (and
> +     * the native one is not accounted for).
> +     */
> +    size += num_tables * sizeof(xsdt->table_offset_entry[0]);

The comment has been confusing me each time I've come here so
far. The reason you don't need to add or subtract anything is
because struct acpi_table_xsdt includes one array slot already
(which isn't visible here, i.e. one would need to look at the header),
_and_ you've skipped native MADT _and_ you're going to add a
made up MADT, so I think it would be better to explain it in those
terms.

> +    xsdt = xzalloc_bytes(size);
> +    if ( !xsdt )
> +    {
> +        printk("Unable to allocate memory for XSDT table\n");
> +        return -ENOMEM;
> +    }
> +
> +    /* Copy the native XSDT table header. */
> +    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
> +    if ( !rsdp )
> +    {
> +        printk("Unable to map RSDP\n");
> +        return -EINVAL;
> +    }
> +    xsdt_paddr = rsdp->xsdt_physical_address;
> +    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));
> +    table = acpi_os_map_memory(xsdt_paddr, sizeof(*table));
> +    if ( !table )
> +    {
> +        printk("Unable to map XSDT\n");
> +        return -EINVAL;
> +    }
> +    *((struct acpi_table_header *)xsdt) = *table;

xsdt.header

> +    acpi_os_unmap_memory(table, sizeof(*table));
> +
> +    /* Add the custom MADT. */
> +    j = 0;
> +    xsdt->table_offset_entry[j++] = madt_addr;

Please re-use num_tables for the counting here. It also would
likely be a little more clear if you used plain 0 as array index
and assigned 1 right away.

> +static int __init hvm_setup_acpi(struct domain *d, paddr_t start_info)
> +{
> +    struct acpi_table_rsdp rsdp, *native_rsdp;
> +    unsigned long pfn, nr_pages;
> +    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
> +    unsigned int i;
> +    int rc;
> +
> +    /* Scan top-level tables and add their regions to the guest memory map. 
> */
> +    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
> +    {
> +        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
> +
> +        if ( hvm_acpi_table_allowed(sig) )
> +            hvm_add_mem_range(d, acpi_gbl_root_table_list.tables[i].address,
> +                              acpi_gbl_root_table_list.tables[i].address +
> +                              acpi_gbl_root_table_list.tables[i].length,
> +                              E820_ACPI);
> +    }
> +
> +    /* Identity map ACPI e820 regions. */
> +    for ( i = 0; i < d->arch.nr_e820; i++ )
> +    {
> +        if ( d->arch.e820[i].type != E820_ACPI &&
> +             d->arch.e820[i].type != E820_NVS )
> +            continue;
> +
> +        pfn = PFN_DOWN(d->arch.e820[i].addr);
> +        nr_pages = DIV_ROUND_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
> +                                d->arch.e820[i].size, PAGE_SIZE);

Perhaps PFN_UP() again?

> +        rc = modify_identity_mmio(d, pfn, nr_pages, true);
> +        if ( rc )
> +        {
> +            printk(
> +                "Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",

Please don't split lines like this.

> +                   pfn, pfn + nr_pages);
> +            return rc;
> +        }
> +    }
> +
> +    rc = hvm_setup_acpi_madt(d, &madt_paddr);
> +    if ( rc )
> +        return rc;
> +
> +    rc = hvm_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
> +    if ( rc )
> +        return rc;
> +
> +    /* Craft a custom RSDP. */
> +    memset(&rsdp, 0, sizeof(rsdp));
> +    memcpy(rsdp.signature, ACPI_SIG_RSDP, sizeof(rsdp.signature));
> +    native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp));

While this one is fine to call from here, ...

> +    memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id));
> +    acpi_os_unmap_memory(native_rsdp, sizeof(rsdp));
> +    rsdp.revision = 2;
> +    rsdp.xsdt_physical_address = xsdt_paddr;
> +    rsdp.rsdt_physical_address = 0;
> +    rsdp.length = sizeof(rsdp);
> +    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
> +                                      ACPI_RSDP_REV0_SIZE);
> +    rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
> +                                               sizeof(rsdp));

... these two represent a layering violation: Code outside of
xen/drivers/acpi/*.c is not supposed to call anything in
xen/drivers/acpi/{tables,utilities}/. Since introducing a wrapper
is likely overkill for the purpose here I think we can tolerate it if
suitably annotated in a comment. (Granted, ARM has a similar
issue.)

Jan
Roger Pau Monné Dec. 21, 2016, 4:32 p.m. UTC | #2
On Mon, Dec 12, 2016 at 06:56:36AM -0700, Jan Beulich wrote:
> >>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
> > @@ -567,7 +573,7 @@ static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
> >      /*
> >       * Craft the e820 memory map for Dom0 based on the hardware e820 map.
> >       */
> > -    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
> > +    d->arch.e820 = xzalloc_array(struct e820entry, E820MAX);
> 
> I have to admit that I consider this both wasteful and inflexible.
> While commonly you'll need far less entries, what if in fact you
> need more?

I've updated pvh_add_mem_range so that now it expands the memory map when
needed, in order to add new regions. This solves both the waste and the
inflexibility:

map = xzalloc_array(struct e820entry, d->arch.nr_e820 + 1);
if ( !map )
{
    printk(XENLOG_WARNING "E820: out of memory to add region\n");
    return -ENOMEM;
}

memcpy(map, d->arch.e820, i * sizeof(*d->arch.e820));
memcpy(map + i + 1, d->arch.e820 + i,
       (d->arch.nr_e820 - i) * sizeof(*d->arch.e820));
map[i].addr = s;
map[i].size = e - s;
map[i].type = type;
xfree(d->arch.e820);
d->arch.e820 = map;
d->arch.nr_e820++;

> > +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
> > +{
> > +    struct acpi_table_madt *madt;
> > +    struct acpi_table_header *table;
> > +    struct acpi_madt_io_apic *io_apic;
> > +    struct acpi_madt_local_apic *local_apic;
> 
> I think I had asked about the absence of struct acpi_madt_local_x2apic
> here before, but now that I look again I also wonder how you get away
> without struct acpi_madt_nmi_source and struct acpi_madt_local_apic_nmi;

Since we are currently limited to 128 vCPUs (max APIC ID 255), I don't think
acpi_madt_local_x2apic is required in this scenario:

https://lists.xen.org/archives/html/xen-devel/2016-11/msg02815.html

I will work on adding those entries once that limit is lifted; would you be
fine with me adding a comment here noting that acpi_madt_local_x2apic is not
needed until support for >128 vCPUs is added?

Regarding the local/IO APIC NMI structures, won't those NMI's be delivered to
Xen, or are those then re-injected to the guest?

> I can kind of see struct acpi_madt_local_apic_override not being

Since Xen is the one that sets the local APIC address in the MADT, and it
always matches the position of the emulated local APIC I don't see why we would
want to use acpi_madt_local_apic_override. I see that your worries are related
to what would happen if AML tries to modify the MADT, but wouldn't in that case
modify the native MADT, instead of the crafted one?

> needed here (and I'd really hope anyway that no BIOS actually makes
> use of it). The question mainly is what possible damage there may be
> if what Dom0 gets to see disagrees with what the firmware acts on
> (remember that AML code may alter MADT contents).
> 
> > +    acpi_status status;
> > +    unsigned long size;
> > +    unsigned int i;
> > +    int rc;
> > +
> > +    /* Count number of interrupt overrides in the MADT. */
> > +    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
> > +                          acpi_count_intr_ovr, MAX_IRQ_SOURCES);
> 
> What's the reason for using MAX_IRQ_SOURCES here? There's no
> static array involved here limiting the supported count.

Right, this can be INT_MAX, or UINT_MAX if I change the type of
acpi_intr_overrrides.

> > +    madt->address = APIC_DEFAULT_PHYS_BASE;
> > +    /*
> > +     * NB: this is currently set to 4, which is the revision in the ACPI
> > +     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
> > +     * tables described in the headers.
> > +     */
> > +    madt->header.revision = 4;
> 
> I can see you wanting to cap the revision here, but is it safe to
> possibly bump it from what firmware has?

Hm, we only use the headers and the interrupt overrides as-is from the original
table, the rest comes from the acpica structs. So I assume it's better to be
safe than sorry, and a min should be used here together with the original table
version.

> > +    /*
> > +     * Setup the IO APIC entry.
> > +     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
> > +     * per domain. This must be fixed in order to provide the same amount of
> > +     * IO APICs as available on bare metal.
> > +     */
> > +    if ( nr_ioapics > 1 )
> > +        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n",
> > +               nr_ioapics);
> > +    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
> > +    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
> > +    io_apic->header.length = sizeof(*io_apic);
> > +    io_apic->id = 1;
> 
> Is it safe to use an ID other than what firmware did assign? Where
> is this 1 coming from? And how does this get sync-ed up with the
> respective struct hvm_hw_vioapic field?

Not sure why I've set this to 1, AFAICT the vioapic code sets this to 0 on
reset, so it should be 0 here (until we have proper support in order to
completely reproduce the native IO APIC topology).

[ ... fixed all the comments below ..]

Thanks for the detailed review!

Roger.
Jan Beulich Dec. 21, 2016, 5:04 p.m. UTC | #3
>>> On 21.12.16 at 17:32, <roger.pau@citrix.com> wrote:
> On Mon, Dec 12, 2016 at 06:56:36AM -0700, Jan Beulich wrote:
>> >>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
>> > +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
>> > +{
>> > +    struct acpi_table_madt *madt;
>> > +    struct acpi_table_header *table;
>> > +    struct acpi_madt_io_apic *io_apic;
>> > +    struct acpi_madt_local_apic *local_apic;
>> 
>> I think I had asked about the absence of struct acpi_madt_local_x2apic
>> here before, but now that I look again I also wonder how you get away
>> without struct acpi_madt_nmi_source and struct acpi_madt_local_apic_nmi;
> 
> Since we are currently limited to 128 vCPUs (max APIC ID 255), I don't think
> acpi_madt_local_x2apic is required in this scenario:
> 
> https://lists.xen.org/archives/html/xen-devel/2016-11/msg02815.html 
> 
> I will work on adding those entries once that limit is lifted, would you be
> fine with me adding a comment here regarding the no-need of
> acpi_madt_local_x2apic until support for > 128vCPUs is added?

I'm not convinced these table entries are tied to >255 CPUs - I'm
seeing them on systems with far less. Hence I simply wonder what
functionality we may miss to offer to OSes with these tables
absent.

> Regarding the local/IO APIC NMI structures, won't those NMI's be delivered to
> Xen, or are those then re-injected to the guest?

PV Dom0 may get to see NMIs, so I'd expect PVH to behave
similarly.

>> I can kind of see struct acpi_madt_local_apic_override not being
> 
> Since Xen is the one that sets the local APIC address in the MADT, and it
> always matches the position of the emulated local APIC I don't see why we would
> want to use acpi_madt_local_apic_override. I see that your worries are related
> to what would happen if AML tries to modify the MADT, but wouldn't in that case
> modify the native MADT, instead of the crafted one?

Exactly, so how would you see the modification to get
propagated?

Jan
Roger Pau Monné Dec. 22, 2016, 10:43 a.m. UTC | #4
On Wed, Dec 21, 2016 at 10:04:20AM -0700, Jan Beulich wrote:
> >>> On 21.12.16 at 17:32, <roger.pau@citrix.com> wrote:
> > On Mon, Dec 12, 2016 at 06:56:36AM -0700, Jan Beulich wrote:
> >> >>> On 30.11.16 at 17:49, <roger.pau@citrix.com> wrote:
> >> > +static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
> >> > +{
> >> > +    struct acpi_table_madt *madt;
> >> > +    struct acpi_table_header *table;
> >> > +    struct acpi_madt_io_apic *io_apic;
> >> > +    struct acpi_madt_local_apic *local_apic;
> >> 
> >> I think I had asked about the absence of struct acpi_madt_local_x2apic
> >> here before, but now that I look again I also wonder how you get away
> >> without struct acpi_madt_nmi_source and struct acpi_madt_local_apic_nmi;
> > 
> > Since we are currently limited to 128 vCPUs (max APIC ID 255), I don't think
> > acpi_madt_local_x2apic is required in this scenario:
> > 
> > https://lists.xen.org/archives/html/xen-devel/2016-11/msg02815.html 
> > 
> > I will work on adding those entries once that limit is lifted, would you be
> > fine with me adding a comment here regarding the no-need of
> > acpi_madt_local_x2apic until support for > 128vCPUs is added?
> 
> I'm not convinced these table entries are tied to >255 CPUs - I'm
> seeing them on systems with far less. Hence I simply wonder what
> functionality we may miss to offer to OSes with these tables
> absent.

From the ACPI spec, both entries look very similar, with the exception that
the x2APIC entry has a bigger size (from 1 byte to 4 bytes) for both the
processor and the APIC ID.

I don't have a system with x2APIC entries at hand, but I guess the easiest way
to solve this would be to add x2APIC and x2APIC NMI entries if they are also
present on the native MADT?

Could it be that your systems provide those entries because the firmware is
shared with other high-end boxes or prepared to handle configurations with >255
APIC IDs?

> >> I can kind of see struct acpi_madt_local_apic_override not being
> > 
> > Since Xen is the one that sets the local APIC address in the MADT, and it
> > always matches the position of the emulated local APIC I don't see why we would
> > want to use acpi_madt_local_apic_override. I see that your worries are related
> > to what would happen if AML tries to modify the MADT, but wouldn't in that case
> > modify the native MADT, instead of the crafted one?
> 
> Exactly, so how would you see the modification to get
> propagated?

Please bear with me, by modifications here do you mean what the _MAT method
returns from processor objects?

The ACPI 6.1 spec has something that worries me quite a lot (page 145):

"
a) When _MAT (see Section 6.2.10) appears under a Processor Device object (see
Section 8.4), OSPM processes the Interrupt Controller Structures returned by
_MAT with the types labeled "yes" and ignores other types.

b) When _MAT appears under an I/O APIC Device (see Section 9.17), OSPM
processes the Interrupt Controller Structures returned by _MAT with the types
labeled "yes" and ignores other types.
"

Which from my understanding means that almost everything from the original MADT
can be overwritten by the returns of _MAT methods. The same seems to be true
for ARM, and I don't see them dealing with this in any way. Is this something
that has yet to be implemented by any vendor?

Roger.
Jan Beulich Dec. 22, 2016, 11:03 a.m. UTC | #5
>>> On 22.12.16 at 11:43, <roger.pau@citrix.com> wrote:
> On Wed, Dec 21, 2016 at 10:04:20AM -0700, Jan Beulich wrote:
>> I'm not convinced these table entries are tied to >255 CPUs - I'm
>> seeing them on systems with far less. Hence I simply wonder what
>> functionality we may miss to offer to OSes with these tables
>> absent.
> 
> From the ACPI spec, both entries looks very similar, with the exception that
> the x2APIC entry has a bigger size (from 1byte to 4bytes) for both the
> processor and the APIC ID.
> 
> I don't have a system with x2APIC entries at hand, but I guess the easiest way
> to solve this would be to add x2APIC and x2APIC NMI entries if they are also
> present on the native MADT?

Yes, that might be a good basis for determining, at least for now.
Otoh we always emulate an x2APIC, so it may not be unreasonable
to always provide these entries in parallel with the legacy LAPIC
ones.

> Could it be that your systems provide those entries because the firmware is
> shared with other high-end boxes or prepared to handle configurations with >255
> APIC IDs?

I obviously don't know (and hence can't exclude) that.

>> >> I can kind of see struct acpi_madt_local_apic_override not being
>> > 
>> > Since Xen is the one that sets the local APIC address in the MADT, and it
>> > always matches the position of the emulated local APIC I don't see why we would
>> > want to use acpi_madt_local_apic_override. I see that your worries are related
>> > to what would happen if AML tries to modify the MADT, but wouldn't in that case
>> > modify the native MADT, instead of the crafted one?
>> 
>> Exactly, so how would you see the modification to get
>> propagated?
> 
> Please bear with me, by modifications here do you mean what the _MAT method
> returns from processor objects?
> 
> The ACPI 6.1 spec has something that worries me quite a lot (page 145):
> 
> "
> a) When _MAT (see Section 6.2.10) appears under a Processor Device object (see
> Section 8.4), OSPM processes the Interrupt Controller Structures returned by
> _MAT with the types labeled "yes" and ignores other types.
> 
> b) When _MAT appears under an I/O APIC Device (see Section 9.17), OSPM
> processes the Interrupt Controller Structures returned by _MAT with the types
> labeled "yes" and ignores other types.
> "
> 
> Which from my understanding means that almost everything from the original MADT
> can be overwritten by the returns of _MAT methods. The same seems to be true
> for ARM, and I don't see them dealing with this in any way. Is this something
> that has yet to be implemented by any vendor?

I don't understand. For one, I can't see the spec requiring or excluding
the in place modification of MADT entries for the purpose of implementing
_MAT. Which route is taken is imo an implementation choice. In hvmloader
we use the in place modification approach at present. And then looking
at the original MADT is a boot time thing, so subsequent modifications
should be of no interest to the OS (they'd get those as _MAT return
values, not by looking at the static table).

The thing that I'm worried about is a physical machine's firmware using
the same approach as we do in hvmloader, thus returning from _MAT
values which are out of sync with the MADT we present to Dom0.
Although - thinking about it a second time now, we'd have the same
inconsistency issue if the firmware constructed the _MAT return
values from scratch, so this will need dealing with in any case for
CPU hotplug to work.

Jan
Roger Pau Monné Dec. 22, 2016, 12:17 p.m. UTC | #6
On Thu, Dec 22, 2016 at 04:03:12AM -0700, Jan Beulich wrote:
> >>> On 22.12.16 at 11:43, <roger.pau@citrix.com> wrote:
> > On Wed, Dec 21, 2016 at 10:04:20AM -0700, Jan Beulich wrote:
> >> > Since Xen is the one that sets the local APIC address in the MADT, and it
> >> > always matches the position of the emulated local APIC I don't see why we would
> >> > want to use acpi_madt_local_apic_override. I see that your worries are related
> >> > to what would happen if AML tries to modify the MADT, but wouldn't in that case
> >> > modify the native MADT, instead of the crafted one?
> >> 
> >> Exactly, so how would you see the modification to get
> >> propagated?
> > 
> > Please bear with me, by modifications here do you mean what the _MAT method
> > returns from processor objects?
> > 
> > The ACPI 6.1 spec has something that worries me quite a lot (page 145):
> > 
> > "
> > a) When _MAT (see Section 6.2.10) appears under a Processor Device object (see
> > Section 8.4), OSPM processes the Interrupt Controller Structures returned by
> > _MAT with the types labeled "yes" and ignores other types.
> > 
> > b) When _MAT appears under an I/O APIC Device (see Section 9.17), OSPM
> > processes the Interrupt Controller Structures returned by _MAT with the types
> > labeled "yes" and ignores other types.
> > "
> > 
> > Which from my understanding means that almost everything from the original MADT
> > can be overwritten by the returns of _MAT methods. The same seems to be true
> > for ARM, and I don't see them dealing with this in any way. Is this something
> > that has yet to be implemented by any vendor?
> 
> I don't understand. For one, I can't see the spec requiring or excluding
> the in place modification of MADT entries for the purpose of implementing
> _MAT. Which route is taken is imo an implementation choice. In hvmloader
> we use the in place modification approach at present. And then looking
> at the original MADT is a boot time thing, so subsequent modifications
> should be of no interest to the OS (they'd get those as _MAT return
> values, not by looking at the static table).
> 
> The thing that I'm worried about is a physical machine's firmware using
> the same approach as we do in hvmloader, thus returning from _MAT
> values which are out of sync with the MADT we present to Dom0.
> Although - thinking about it a second time now, we'd have the same
> inconsistency issue if the firmware constructed the _MAT return
> values from scratch, so this will need dealing with in any case for
> CPU hotplug to work.

I don't see an obvious solution to this: those _MAT methods reside in AML, and
ATM Xen has no way to parse, much less modify, this code. Xen could add the
Processor device path to the STAO so that Dom0 ignores them, but that would
also prevent ACPI CPU hotplug from working in Dom0.

Maybe Boris has some ideas about how to do CPU hotplug for Dom0?

Roger.
Boris Ostrovsky Dec. 22, 2016, 4:17 p.m. UTC | #7
On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
> On Thu, Dec 22, 2016 at 04:03:12AM -0700, Jan Beulich wrote:
>>>>> On 22.12.16 at 11:43, <roger.pau@citrix.com> wrote:
>>> On Wed, Dec 21, 2016 at 10:04:20AM -0700, Jan Beulich wrote:
>>>>> Since Xen is the one that sets the local APIC address in the MADT, and it
>>>>> always matches the position of the emulated local APIC I don't see why we would
>>>>> want to use acpi_madt_local_apic_override. I see that your worries are related
>>>>> to what would happen if AML tries to modify the MADT, but wouldn't in that case
>>>>> modify the native MADT, instead of the crafted one?
>>>> Exactly, so how would you see the modification to get
>>>> propagated?
>>> Please bear with me, by modifications here do you mean what the _MAT method
>>> returns from processor objects?
>>>
>>> The ACPI 6.1 spec has something that worries me quite a lot (page 145):
>>>
>>> "
>>> a) When _MAT (see Section 6.2.10) appears under a Processor Device object (see
>>> Section 8.4), OSPM processes the Interrupt Controller Structures returned by
>>> _MAT with the types labeled "yes" and ignores other types.
>>>
>>> b) When _MAT appears under an I/O APIC Device (see Section 9.17), OSPM
>>> processes the Interrupt Controller Structures returned by _MAT with the types
>>> labeled "yes" and ignores other types.
>>> "
>>>
>>> Which from my understanding means that almost everything from the original MADT
>>> can be overwritten by the returns of _MAT methods. The same seems to be true
>>> for ARM, and I don't see them dealing with this in any way. Is this something
>>> that has yet to be implemented by any vendor?
>> I don't understand. For one, I can't see the spec requiring or excluding
>> the in place modification of MADT entries for the purpose of implementing
>> _MAT. Which route is taken is imo an implementation choice. In hvmloader
>> we use the in place modification approach at present. And then looking
>> at the original MADT is a boot time thing, so subsequent modifications
>> should be of no interest to the OS (they'd get those as _MAT return
>> values, not by looking at the static table).
>>
>> The thing that I'm worried about is a physical machine's firmware using
>> the same approach as we do in hvmloader, thus returning from _MAT
>> values which are out of sync with the MADT we present to Dom0.
>> Although - thinking about it a second time now, we'd have the same
>> inconsistency issue if the firmware constructed the _MAT return
>> values from scratch, so this will need dealing with in any case for
>> CPU hotplug to work.
> I don't see an obvious solution to this, those _MAT methods reside in AML, and
> ATM Xen has no way to parse, much even less modify this code. Xen could add the
> Processor device path to the STAO so that Dom0 ignores them, but that would
> also prevent ACPI CPU hotplug from working in Dom0.
>
> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?


Why would Xen need to be able to parse the AML that is intended to be
executed by dom0? I'd think that all the hypervisor would need to do is
to load it into memory, not any different from how it's done for regular
guests.

-boris
Jan Beulich Dec. 22, 2016, 4:24 p.m. UTC | #8
>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
> 
> Why would Xen need to be able to parse the AML that is intended to be
> executed by dom0? I'd think that all the hypervisor would need to do is
> to load it into memory, not any different from how it's done for regular
> guests.

Well, if Dom0 executed the unmodified _MAT, it would get back
data relating to the physical CPU. Roger is overriding MADT to
present virtual CPU information to Dom0, and this would likewise
need to happen for the _MAT return value.

Jan
Roger Pau Monné Dec. 22, 2016, 4:44 p.m. UTC | #9
On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
> >>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
> > On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
> >> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
> > 
> > Why would Xen need to be able to parse the AML that is intended to be
> > executed by dom0? I'd think that all the hypervisor would need to do is
> > to load it into memory, not any different from how it's done for regular
> > guests.
> 
> Well, if Dom0 executed the unmodified _MAT, it would get back
> data relating to the physical CPU. Roger is overriding MADT to
> present virtual CPU information to Dom0, and this would likewise
> need to happen for the _MAT return value.

This is one of the problems with this Dom/Xen0 split brain problem that we
have, and cannot get away from.

To clarify a little bit, I'm not overwriting the original MADT in memory, so
Dom0 should still be able to access it if the implementation of _MAT returns
data from that area. AFAICT when plugging in a physical CPU (pCPU) into the
hardware, Dom0 should see "correct" data returned by the _MAT method. However
(as represented by the " I've used), this data will not match Dom0 vCPU
topology, and should not be used by Dom0 (only reported to Xen in order to
bring up the new pCPU).

Then the problem arises because we have no way to perform vCPU hotplug for
Dom0, not at least as it is done for DomU (using ACPI), so we would have to use
an out-of-band method in order to do vCPU hotplug for Dom0, which is a PITA.

Roger.
Boris Ostrovsky Dec. 22, 2016, 6:19 p.m. UTC | #10
On 12/22/2016 11:44 AM, Roger Pau Monne wrote:
> On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
>>>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
>>> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
>>>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
>>> Why would Xen need to be able to parse the AML that is intended to be
>>> executed by dom0? I'd think that all the hypervisor would need to do is
>>> to load it into memory, not any different from how it's done for regular
>>> guests.
>> Well, if Dom0 executed the unmodified _MAT, it would get back
>> data relating to the physical CPU. Roger is overriding MADT to
>> present virtual CPU information to Dom0, and this would likewise
>> need to happen for the _MAT return value.

By "unmodified _MAT" you mean system's _MAT?  Why can't we provide our
own that will return _MAT object properly adjusted for dom0? We are
going to provide our own (i.e. not system's) DSDT, aren't we?

> This is one of the problems with this Dom/Xen0 split brain problem that we
> have, and cannot get away from.
>
> To clarify a little bit, I'm not overwriting the original MADT in memory, so
> Dom0 should still be able to access it if the implementation of _MAT returns
> data from that area. AFAICT when plugging in a physical CPU (pCPU) into the
> hardware, Dom0 should see "correct" data returned by the _MAT method. However
> (as represented by the " I've used), this data will not match Dom0 vCPU
> topology, and should not be used by Dom0 (only reported to Xen in order to
> bring up the new pCPU).

So the problem seems to be that we need to run both _MATs --- system's
and dom0's.


> Then the problem arises because we have no way to perform vCPU hotplug for
> Dom0, not at least as it is done for DomU (using ACPI), so we would have to use
> an out-of-band method in order to do vCPU hotplug for Dom0, which is a PITA.


I would very much like to avoid this.

-boris
Roger Pau Monné Dec. 23, 2016, 10:27 a.m. UTC | #11
On Thu, Dec 22, 2016 at 01:19:36PM -0500, Boris Ostrovsky wrote:
> On 12/22/2016 11:44 AM, Roger Pau Monne wrote:
> > On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
> >>>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
> >>> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
> >>>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
> >>> Why would Xen need to be able to parse the AML that is intended to be
> >>> executed by dom0? I'd think that all the hypervisor would need to do is
> >>> to load it into memory, not any different from how it's done for regular
> >>> guests.
> >> Well, if Dom0 executed the unmodified _MAT, it would get back
> >> data relating to the physical CPU. Roger is overriding MADT to
> >> present virtual CPU information to Dom0, and this would likewise
> >> need to happen for the _MAT return value.
> 
> By "unmodified _MAT" you mean system's _MAT?  Why can't we provide our
> own that will return _MAT object properly adjusted for dom0? We are
> going to provide our own (i.e. not system's) DSDT, aren't we?

Providing Dom0 with a different DSDT is almost impossible from a Xen PoV, for
once Xen cannot parse the original DSDT (because it's a dynamic table), and
then if we would be to provide a modified DSDT, we would also need an asl
assembler, so that we could parse the DSDT, modify it, and then compile it
again in order to provide it to Dom0.

Although all this code could be put under an init section that would be freed
after Dom0 creation it seems overkill and very far from trivial, not to mention
that I'm not even sure what side-effects there would be if Xen parsed the DSDT
itself without having any drivers.

> > This is one of the problems with this Dom/Xen0 split brain problem that we
> > have, and cannot get away from.
> >
> > To clarify a little bit, I'm not overwriting the original MADT in memory, so
> > Dom0 should still be able to access it if the implementation of _MAT returns
> > data from that area. AFAICT when plugging in a physical CPU (pCPU) into the
> > hardware, Dom0 should see "correct" data returned by the _MAT method. However
> > (as represented by the " I've used), this data will not match Dom0 vCPU
> > topology, and should not be used by Dom0 (only reported to Xen in order to
> > bring up the new pCPU).
> 
> So the problem seems to be that we need to run both _MATs --- system's
> and dom0's.

Exactly, we need _MAT for pCPUs and _MAT for vCPUs.

> > Then the problem arises because we have no way to perform vCPU hotplug for
> > Dom0, not at least as it is done for DomU (using ACPI), so we would have to use
> > an out-of-band method in order to do vCPU hotplug for Dom0, which is a PITA.
> 
> 
> I would very much like to avoid this.

Maybe we can provide an extra SSDT for Dom0 that basically overwrites the CPU
objects (_PR.CPUX), but I'm not sure if ACPI allows this kind of objects
overwrites?

After reading the spec, I came across the following note in the SSDT section:

"Additional tables can only add data; they cannot overwrite data from previous
tables."

So I guess this is a no-go.

I only see the following options:

 - Prevent Dom0 from using the original _MAT methods (or even the full _PR.CPU
   objects) using the STAO, and then provide Dom0 with an out-of-band method
   (ie: not ACPI) in order to do CPU hotplug.

 - Expand the STAO so that it can be used to override ACPI namespace objects,
   possibly by adding a payload field that contains aml code. It seems that
   Linux already supports overwriting part of the ACPI namespace from
   user-space[0], so part of the needed machinery seem to be already in place
   (hopefully in acpica code?).

 - Disable the native CPU objects in the DSDT/SSDT using the STAO. Then pick up
   unused ACPI CPU IDs and use those for vCPUs. Provide an additional SSDT that
   contains ACPI objects for those vCPUs (as is done for DomU). This means we
   would probably have to start using x2APIC entries in the MADT, since the
   CPUs IDs might easily expand past 255 (AFAICT we could still keep the APIC
   IDs low however, since those two are disjoint).

I don't really fancy any of these options, probably the last one seems like
the best IMHO, but I would like to hear some feedback about them, and of course
I'm open to suggestions :).

Roger.

[0] https://www.kernel.org/doc/Documentation/acpi/method-customizing.txt
Boris Ostrovsky Dec. 23, 2016, 3:13 p.m. UTC | #12
On 12/23/2016 05:27 AM, Roger Pau Monne wrote:
> On Thu, Dec 22, 2016 at 01:19:36PM -0500, Boris Ostrovsky wrote:
>> On 12/22/2016 11:44 AM, Roger Pau Monne wrote:
>>> On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
>>>>>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
>>>>> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
>>>>>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
>>>>> Why would Xen need to be able to parse the AML that is intended to be
>>>>> executed by dom0? I'd think that all the hypervisor would need to do is
>>>>> to load it into memory, not any different from how it's done for regular
>>>>> guests.
>>>> Well, if Dom0 executed the unmodified _MAT, it would get back
>>>> data relating to the physical CPU. Roger is overriding MADT to
>>>> present virtual CPU information to Dom0, and this would likewise
>>>> need to happen for the _MAT return value.
>> By "unmodified _MAT" you mean system's _MAT?  Why can't we provide our
>> own that will return _MAT object properly adjusted for dom0? We are
>> going to provide our own (i.e. not system's) DSDT, aren't we?
> Providing Dom0 with a different DSDT is almost impossible from a Xen PoV, for
> once Xen cannot parse the original DSDT (because it's a dynamic table), and
> then if we would be to provide a modified DSDT, we would also need an asl
> assembler, so that we could parse the DSDT, modify it, and then compile it
> again in order to provide it to Dom0.
>
> Although all this code could be put under an init section that would be freed
> after Dom0 creation it seems overkill and very far from trivial, not to mention
> that I'm not even sure what side-effects there would be if Xen parsed the DSDT
> itself without having any drivers.
>
>>> This is one of the problems with this Dom/Xen0 split brain problem that we
>>> have, and cannot get away from.
>>>
>>> To clarify a little bit, I'm not overwriting the original MADT in memory, so
>>> Dom0 should still be able to access it if the implementation of _MAT returns
>>> data from that area. AFAICT when plugging in a physical CPU (pCPU) into the
>>> hardware, Dom0 should see "correct" data returned by the _MAT method. However
>>> (as represented by the " I've used), this data will not match Dom0 vCPU
>>> topology, and should not be used by Dom0 (only reported to Xen in order to
>>> bring up the new pCPU).
>> So the problem seems to be that we need to run both _MATs --- system's
>> and dom0's.
> Exactly, we need _MAT for pCPUs and _MAT for _vCPUs.
>
>>> Then the problem arises because we have no way to perform vCPU hotplug for
>>> Dom0, not at least as it is done for DomU (using ACPI), so we would have to use
>>> an out-of-band method in order to do vCPU hotplug for Dom0, which is a PITA.
>>
>> I would very much like to avoid this.
> Maybe we can provide an extra SSDT for Dom0 that basically overwrites the CPU
> objects (_PR.CPUX), but I'm not sure if ACPI allows this kind of objects
> overwrites?
>
> After reading the spec, I came across the following note in the SSDT section:
>
> "Additional tables can only add data; they cannot overwrite data from previous
> tables."
>
> So I guess this is a no-go.
>
> I only see the following options:
>
>  - Prevent Dom0 from using the original _MAT methods (or even the full _PR.CPU
>    objects) using the STAO, and then provide Dom0 with an out-of-band method
>    (ie: not ACPI) in order to do CPU hotplug.

I don't see how we can have dom0 execute system's _MAT (or, indeed,
anything in _PR_CPU) and not try to add the processor to its own (vCPU)
list.

>
>  - Expand the STAO so that it can be used to override ACPI namespace objects,
>    possibly by adding a payload field that contains aml code. It seems that
>    Linux already supports overwriting part of the ACPI namespace from
>    user-space[0], so part of the needed machinery seem to be already in place
>    (hopefully in acpica code?).

At least judging by the pathname in sysfs this may be a debugging
feature. Which doesn't mean it can't be used, but we'll need to confirm
this with (Linux) APCI people.

>
>  - Disable the native CPU objects in the DSDT/SSDT using the STAO. Then pick up
>    unused ACPI CPU IDs and use those for vCPUs. Provide an additional SSDT that
>    contains ACPI objects for those vCPUs (as is done for DomU). This means we
>    would probably have to start using x2APIC entries in the MADT, since the
>    CPUs IDs might easily expand past 255 (AFAICT we could still keep the APIC
>    IDs low however, since those two are disjoint).

But this still assumes that dom0 handles ACPI event for a pCPUs as well,
right? And I am not sure this can work.

Actually, how do we hotplug pCPUs now?

-boris

>
> I don't really fancy any of these two options, probably the last one seems like
> the best IMHO, but I would like to hear some feedback about them, and of course
> I'm open to suggestions :).
>
> Roger.
>
> [0] https://www.kernel.org/doc/Documentation/acpi/method-customizing.txt
>
Konrad Rzeszutek Wilk Dec. 23, 2016, 3:21 p.m. UTC | #13
On Fri, Dec 23, 2016 at 10:27:46AM +0000, Roger Pau Monne wrote:
> On Thu, Dec 22, 2016 at 01:19:36PM -0500, Boris Ostrovsky wrote:
> > On 12/22/2016 11:44 AM, Roger Pau Monne wrote:
> > > On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
> > >>>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
> > >>> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
> > >>>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
> > >>> Why would Xen need to be able to parse the AML that is intended to be
> > >>> executed by dom0? I'd think that all the hypervisor would need to do is
> > >>> to load it into memory, not any different from how it's done for regular
> > >>> guests.
> > >> Well, if Dom0 executed the unmodified _MAT, it would get back
> > >> data relating to the physical CPU. Roger is overriding MADT to
> > >> present virtual CPU information to Dom0, and this would likewise
> > >> need to happen for the _MAT return value.
> > 
> > By "unmodified _MAT" you mean system's _MAT?  Why can't we provide our
> > own that will return _MAT object properly adjusted for dom0? We are
> > going to provide our own (i.e. not system's) DSDT, aren't we?
> 
> Providing Dom0 with a different DSDT is almost impossible from a Xen PoV, for
> once Xen cannot parse the original DSDT (because it's a dynamic table), and
> then if we would be to provide a modified DSDT, we would also need an asl
> assembler, so that we could parse the DSDT, modify it, and then compile it
> again in order to provide it to Dom0.
> 
> Although all this code could be put under an init section that would be freed
> after Dom0 creation it seems overkill and very far from trivial, not to mention
> that I'm not even sure what side-effects there would be if Xen parsed the DSDT
> itself without having any drivers.
> 
> > > This is one of the problems with this Dom/Xen0 split brain problem that we
> > > have, and cannot get away from.
> > >
> > > To clarify a little bit, I'm not overwriting the original MADT in memory, so
> > > Dom0 should still be able to access it if the implementation of _MAT returns
> > > data from that area. AFAICT when plugging in a physical CPU (pCPU) into the
> > > hardware, Dom0 should see "correct" data returned by the _MAT method. However
> > > (as represented by the " I've used), this data will not match Dom0 vCPU
> > > topology, and should not be used by Dom0 (only reported to Xen in order to
> > > bring up the new pCPU).
> > 
> > So the problem seems to be that we need to run both _MATs --- system's
> > and dom0's.
> 
> Exactly, we need _MAT for pCPUs and _MAT for _vCPUs.
> 
> > > Then the problem arises because we have no way to perform vCPU hotplug for
> > > Dom0, not at least as it is done for DomU (using ACPI), so we would have to use
> > > an out-of-band method in order to do vCPU hotplug for Dom0, which is a PITA.
> > 
> > 
> > I would very much like to avoid this.
> 
> Maybe we can provide an extra SSDT for Dom0 that basically overwrites the CPU
> objects (_PR.CPUX), but I'm not sure if ACPI allows this kind of objects
> overwrites?
> 
> After reading the spec, I came across the following note in the SSDT section:
> 
> "Additional tables can only add data; they cannot overwrite data from previous
> tables."
> 
> So I guess this is a no-go.
> 
> I only see the following options:
> 
>  - Prevent Dom0 from using the original _MAT methods (or even the full _PR.CPU
>    objects) using the STAO, and then provide Dom0 with an out-of-band method
>    (ie: not ACPI) in order to do CPU hotplug.
> 
>  - Expand the STAO so that it can be used to override ACPI namespace objects,
>    possibly by adding a payload field that contains aml code. It seems that
>    Linux already supports overwriting part of the ACPI namespace from
>    user-space[0], so part of the needed machinery seem to be already in place
>    (hopefully in acpica code?).
> 
>  - Disable the native CPU objects in the DSDT/SSDT using the STAO. Then pick up
>    unused ACPI CPU IDs and use those for vCPUs. Provide an additional SSDT that

You need those so that the C and P state parser can actually parse the other
CPU states which it will upload to Xen.

>    contains ACPI objects for those vCPUs (as is done for DomU). This means we
>    would probably have to start using x2APIC entries in the MADT, since the
>    CPUs IDs might easily expand past 255 (AFAICT we could still keep the APIC
>    IDs low however, since those two are disjoint).
>

- Import ACPI AML parser in Xen. <mad laughter>

 
> I don't really fancy any of these two options, probably the last one seems like
> the best IMHO, but I would like to hear some feedback about them, and of course
> I'm open to suggestions :).

- Do the same thing we had been doing with Xen and Linux PV. Expose the full
  machine MADT and then massage it. Yuck.

I am full of bad ideas today.
Konrad Rzeszutek Wilk Dec. 23, 2016, 3:31 p.m. UTC | #14
> But this still assumes that dom0 handles ACPI event for a pCPUs as well,
> right? And I am not sure this can work.
> 
> Actually, how do we hotplug pCPUs now?

xen-hptool
Boris Ostrovsky Dec. 23, 2016, 3:35 p.m. UTC | #15
On 12/23/2016 10:31 AM, Konrad Rzeszutek Wilk wrote:
>> But this still assumes that dom0 handles ACPI event for a pCPUs as well,
>> right? And I am not sure this can work.
>>
>> Actually, how do we hotplug pCPUs now?
> xen-hptool

Yes, but this has nothing to do with an actual pCPU being hot-plugged
into the system. It's similar to doing 'echo 1 > /sys/.../cpuX/online in
Linux.

My question is --- how do we (hypervisor/dom0/toolstack) become aware of
appearance of a new processor.

-boris
Konrad Rzeszutek Wilk Dec. 23, 2016, 4:02 p.m. UTC | #16
On Fri, Dec 23, 2016 at 10:35:10AM -0500, Boris Ostrovsky wrote:
> On 12/23/2016 10:31 AM, Konrad Rzeszutek Wilk wrote:
> >> But this still assumes that dom0 handles ACPI event for a pCPUs as well,
> >> right? And I am not sure this can work.
> >>
> >> Actually, how do we hotplug pCPUs now?
> > xen-hptool
> 
> Yes, but this has nothing to do with an actual pCPU being hot-plugged
> into the system. It's similar to doing 'echo 1 > /sys/.../cpuX/online in
> Lnux.
> 
> My question is --- how do we (hypervisor/dom0/toolstack) become aware of
> appearance of a new processor.

Ooooh. drivers/xen/xen-acpi-cpuhotplug.c


But
213 config XEN_STUB                                                                 
214     bool "Xen stub drivers"                                                     
215     depends on XEN && X86_64 && BROKEN                                          
216     default n                                          


235 config XEN_ACPI_HOTPLUG_CPU                                                     
236     tristate "Xen ACPI cpu hotplug"                                             
237     depends on XEN_DOM0 && XEN_STUB && ACPI       

So it is probably very very badly not working.

> 
> -boris
Boris Ostrovsky Dec. 23, 2016, 5:13 p.m. UTC | #17
On 12/23/2016 11:02 AM, Konrad Rzeszutek Wilk wrote:
> On Fri, Dec 23, 2016 at 10:35:10AM -0500, Boris Ostrovsky wrote:
>> On 12/23/2016 10:31 AM, Konrad Rzeszutek Wilk wrote:
>>>> But this still assumes that dom0 handles ACPI event for a pCPUs as well,
>>>> right? And I am not sure this can work.
>>>>
>>>> Actually, how do we hotplug pCPUs now?
>>> xen-hptool
>> Yes, but this has nothing to do with an actual pCPU being hot-plugged
>> into the system. It's similar to doing 'echo 1 > /sys/.../cpuX/online in
>> Lnux.
>>
>> My question is --- how do we (hypervisor/dom0/toolstack) become aware of
>> appearance of a new processor.
> Ooooh. drivers/xen/xen-acpi-cpuhotplug.c

Looking at this code I think we should be able to at least differentiate
between pCPU and vCPU and not add pCPU to dom0.

-boris

>
>
> But
> 213 config XEN_STUB                                                                 
> 214     bool "Xen stub drivers"                                                     
> 215     depends on XEN && X86_64 && BROKEN                                          
> 216     default n                                          
>
>
> 235 config XEN_ACPI_HOTPLUG_CPU                                                     
> 236     tristate "Xen ACPI cpu hotplug"                                             
> 237     depends on XEN_DOM0 && XEN_STUB && ACPI       
>
> So it is probably very very badly not working.
>
>> -boris
Jan Beulich Jan. 3, 2017, 7:53 a.m. UTC | #18
>>> On 22.12.16 at 19:19, <boris.ostrovsky@oracle.com> wrote:
> On 12/22/2016 11:44 AM, Roger Pau Monne wrote:
>> On Thu, Dec 22, 2016 at 09:24:02AM -0700, Jan Beulich wrote:
>>>>>> On 22.12.16 at 17:17, <boris.ostrovsky@oracle.com> wrote:
>>>> On 12/22/2016 07:17 AM, Roger Pau Monne wrote:
>>>>> Maybe Boris has some ideas about how to do CPU hotplug for Dom0?
>>>> Why would Xen need to be able to parse the AML that is intended to be
>>>> executed by dom0? I'd think that all the hypervisor would need to do is
>>>> to load it into memory, not any different from how it's done for regular
>>>> guests.
>>> Well, if Dom0 executed the unmodified _MAT, it would get back
>>> data relating to the physical CPU. Roger is overriding MADT to
>>> present virtual CPU information to Dom0, and this would likewise
>>> need to happen for the _MAT return value.
> 
> By "unmodified _MAT" you mean system's _MAT?  Why can't we provide our
> own that will return _MAT object properly adjusted for dom0? We are
> going to provide our own (i.e. not system's) DSDT, aren't we?

For Dom0? No, I didn't think so. We can't reasonably edit the host's,
and we also can't craft our own (that works only for DomU).

Jan
Roger Pau Monné Jan. 3, 2017, 3:55 p.m. UTC | #19
On Fri, Dec 23, 2016 at 12:13:47PM -0500, Boris Ostrovsky wrote:
> On 12/23/2016 11:02 AM, Konrad Rzeszutek Wilk wrote:
> > On Fri, Dec 23, 2016 at 10:35:10AM -0500, Boris Ostrovsky wrote:
> >> On 12/23/2016 10:31 AM, Konrad Rzeszutek Wilk wrote:
> >>>> But this still assumes that dom0 handles ACPI event for a pCPUs as well,
> >>>> right? And I am not sure this can work.
> >>>>
> >>>> Actually, how do we hotplug pCPUs now?
> >>> xen-hptool
> >> Yes, but this has nothing to do with an actual pCPU being hot-plugged
> >> into the system. It's similar to doing 'echo 1 > /sys/.../cpuX/online in
> >> Lnux.
> >>
> >> My question is --- how do we (hypervisor/dom0/toolstack) become aware of
> >> appearance of a new processor.
> > Ooooh. drivers/xen/xen-acpi-cpuhotplug.c
> 
> Looking at this code I think we should be able to at least differentiate
> between pCPU and vCPU and not add pCPU to dom0.

Right, but I don't really see any way to do this with the current ACPI
interface, and still be able to use ACPI CPU hotplug for both pCPUs and
vCPUs.

We can provide a crafted MADT that reflects the number of vCPUs available to
Dom0 at boot time, let Dom0 find the Processor objects in the ACPI namespace
and perform pCPU hotplug as it's been done until now. Then for Dom0 vCPU
hotplug we would need to use the hypercall interface as it's used by classic PV
guests, because AFAICT there's no way we can provide Dom0 with a PRST/PRSC
that's valid for both vCPUs and pCPUs.

Roger.
Boris Ostrovsky Jan. 3, 2017, 7:13 p.m. UTC | #20
> We can provide a crafted MADT that reflects the number of vCPUs available to
> Dom0 at boot time, let Dom0 find the Processor objects in the ACPI namespace
> and perform pCPU hotplug as it's been done until now. Then for Dom0 vCPU
> hotplug we would need to use the hypercall interface as it's used by classic PV
> guests, because AFAICT there's no way we can provide Dom0 with a PRST/PRSC
> that's valid for both vCPUs and pCPUs.

If we have a PV-like interface for bringing up vCPUs for dom0 then
what's the point of doing it differently for domUs?

-boris
diff mbox

Patch

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 5c7592b..fc778b2 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -23,6 +23,7 @@ 
 #include <xen/libelf.h>
 #include <xen/pfn.h>
 #include <xen/guest_access.h>
+#include <xen/acpi.h>
 #include <asm/regs.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -38,6 +39,8 @@ 
 #include <asm/io_apic.h>
 #include <asm/hpet.h>
 
+#include <acpi/actables.h>
+
 #include <public/version.h>
 #include <public/arch-x86/hvm/start_info.h>
 #include <public/hvm/hvm_info_table.h>
@@ -50,6 +53,9 @@  static long __initdata dom0_max_nrpages = LONG_MAX;
 /* Size of the VM86 TSS for virtual 8086 mode to use. */
 #define HVM_VM86_TSS_SIZE   128
 
+static unsigned int __initdata acpi_intr_overrrides;
+static struct acpi_madt_interrupt_override __initdata *intsrcovr;
+
 /*
  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
  * 
@@ -567,7 +573,7 @@  static __init void hvm_setup_e820(struct domain *d, unsigned long nr_pages)
     /*
      * Craft the e820 memory map for Dom0 based on the hardware e820 map.
      */
-    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
+    d->arch.e820 = xzalloc_array(struct e820entry, E820MAX);
     if ( !d->arch.e820 )
         panic("Unable to allocate memory for Dom0 e820 map");
     entry_guest = d->arch.e820;
@@ -1779,6 +1785,55 @@  static int __init hvm_steal_ram(struct domain *d, unsigned long size,
     return -ENOMEM;
 }
 
+/* NB: memory map must be sorted at all times for this to work correctly. */
+static int __init hvm_add_mem_range(struct domain *d, uint64_t s, uint64_t e,
+                                    unsigned int type)
+{
+    unsigned int i;
+
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        uint64_t rs = d->arch.e820[i].addr;
+        uint64_t re = rs + d->arch.e820[i].size;
+
+        if ( rs == e && d->arch.e820[i].type == type )
+        {
+            d->arch.e820[i].addr = s;
+            return 0;
+        }
+
+        if ( re == s && d->arch.e820[i].type == type &&
+             (i + 1 == d->arch.nr_e820 || d->arch.e820[i + 1].addr >= e) )
+        {
+            d->arch.e820[i].size += e - s;
+            return 0;
+        }
+
+        if ( rs >= e )
+            break;
+
+        if ( re > s )
+            return -EEXIST;
+    }
+
+    if ( d->arch.nr_e820 >= E820MAX )
+    {
+        printk(XENLOG_WARNING "E820: overflow while adding region"
+               "[%"PRIx64", %"PRIx64")\n", s, e);
+        return -ENOMEM;
+    }
+
+    memmove(d->arch.e820 + i + 1, d->arch.e820 + i,
+            (d->arch.nr_e820 - i) * sizeof(*d->arch.e820));
+
+    d->arch.nr_e820++;
+    d->arch.e820[i].addr = s;
+    d->arch.e820[i].size = e - s;
+    d->arch.e820[i].type = type;
+
+    return 0;
+}
+
 static int __init hvm_setup_vmx_realmode_helpers(struct domain *d)
 {
     p2m_type_t p2mt;
@@ -2157,6 +2212,371 @@  static int __init hvm_setup_cpus(struct domain *d, paddr_t entry,
     return 0;
 }
 
+static int __init acpi_count_intr_ovr(struct acpi_subtable_header *header,
+                                     const unsigned long end)
+{
+
+    acpi_intr_overrrides++;
+    return 0;
+}
+
+static int __init acpi_set_intr_ovr(struct acpi_subtable_header *header,
+                                    const unsigned long end)
+{
+    const struct acpi_madt_interrupt_override *intr =
+        container_of(header, struct acpi_madt_interrupt_override, header);
+
+    *intsrcovr = *intr;
+    intsrcovr++;
+
+    return 0;
+}
+
+static int __init hvm_setup_acpi_madt(struct domain *d, paddr_t *addr)
+{
+    struct acpi_table_madt *madt;
+    struct acpi_table_header *table;
+    struct acpi_madt_io_apic *io_apic;
+    struct acpi_madt_local_apic *local_apic;
+    acpi_status status;
+    unsigned long size;
+    unsigned int i;
+    int rc;
+
+    /* Count number of interrupt overrides in the MADT. */
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+                          acpi_count_intr_ovr, MAX_IRQ_SOURCES);
+
+    /* Calculate the size of the crafted MADT. */
+    size = sizeof(*madt);
+    size += sizeof(*io_apic);
+    size += sizeof(*local_apic) * dom0_max_vcpus();
+    size += sizeof(struct acpi_madt_interrupt_override) * acpi_intr_overrrides;
+
+    madt = xzalloc_bytes(size);
+    if ( !madt )
+    {
+        printk("Unable to allocate memory for MADT table\n");
+        return -ENOMEM;
+    }
+
+    /* Copy the native MADT table header. */
+    status = acpi_get_table(ACPI_SIG_MADT, 0, &table);
+    if ( !ACPI_SUCCESS(status) )
+    {
+        printk("Failed to get MADT ACPI table, aborting.\n");
+        return -EINVAL;
+    }
+    *((struct acpi_table_header *)madt) = *table;
+    madt->address = APIC_DEFAULT_PHYS_BASE;
+    /*
+     * NB: this is currently set to 4, which is the revision in the ACPI
+     * spec 6.1. Sadly ACPICA doesn't provide revision numbers for the
+     * tables described in the headers.
+     */
+    madt->header.revision = 4;
+
+    /*
+     * Setup the IO APIC entry.
+     * FIXME: the current vIO-APIC code just supports one IO-APIC instance
+     * per domain. This must be fixed in order to provide the same amount of
+     * IO APICs as available on bare metal.
+     */
+    if ( nr_ioapics > 1 )
+        printk("WARNING: found %d IO APICs, Dom0 will only have access to 1 emulated IO APIC\n",
+               nr_ioapics);
+    io_apic = (struct acpi_madt_io_apic *)(madt + 1);
+    io_apic->header.type = ACPI_MADT_TYPE_IO_APIC;
+    io_apic->header.length = sizeof(*io_apic);
+    io_apic->id = 1;
+    io_apic->address = VIOAPIC_DEFAULT_BASE_ADDRESS;
+
+    local_apic = (struct acpi_madt_local_apic *)(io_apic + 1);
+    for ( i = 0; i < dom0_max_vcpus(); i++ )
+    {
+        local_apic->header.type = ACPI_MADT_TYPE_LOCAL_APIC;
+        local_apic->header.length = sizeof(*local_apic);
+        local_apic->processor_id = i;
+        local_apic->id = i * 2;
+        local_apic->lapic_flags = ACPI_MADT_ENABLED;
+        local_apic++;
+    }
+
+    /* Setup interrupt overrides. */
+    intsrcovr = (struct acpi_madt_interrupt_override *)local_apic;
+    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_set_intr_ovr,
+                          MAX_IRQ_SOURCES);
+    ASSERT(((unsigned char *)intsrcovr - (unsigned char *)madt) == size);
+    madt->header.length = size;
+    madt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, madt), size);
+
+    /* Place the new MADT in guest memory space. */
+    if ( hvm_steal_ram(d, size, GB(4), addr) )
+    {
+        printk("Unable to find allocate guest RAM for MADT\n");
+        return -ENOMEM;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( hvm_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
+        printk("Unable to add MADT region to memory map\n");
+
+    rc = hvm_copy_to_phys(d, *addr, madt, size);
+    if ( rc )
+    {
+        printk("Unable to copy MADT into guest memory\n");
+        return rc;
+    }
+    xfree(madt);
+
+    return 0;
+}
+
+static bool __init acpi_memory_banned(unsigned long mfn, unsigned long nr_pages)
+{
+    unsigned long i;
+
+    for ( i = 0 ; i < nr_pages; i++ )
+        if ( !page_is_ram_type(mfn + i, RAM_TYPE_RESERVED) &&
+             !page_is_ram_type(mfn + i, RAM_TYPE_ACPI) )
+            return true;
+
+    return false;
+}
+
+static bool __init hvm_acpi_table_allowed(const char *sig)
+{
+    static const char __init banned_tables[][ACPI_NAME_SIZE] = {
+        ACPI_SIG_HPET, ACPI_SIG_SLIT, ACPI_SIG_SRAT, ACPI_SIG_MPST,
+        ACPI_SIG_PMTT, ACPI_SIG_MADT, ACPI_SIG_DMAR};
+    unsigned long pfn, nr_pages;
+    int i;
+
+    for ( i = 0 ; i < ARRAY_SIZE(banned_tables); i++ )
+        if ( strncmp(sig, banned_tables[i], ACPI_NAME_SIZE) == 0 )
+            return false;
+
+    /* Make sure table doesn't reside in a RAM region. */
+    pfn = PFN_DOWN(acpi_gbl_root_table_list.tables[i].address);
+    nr_pages = DIV_ROUND_UP(
+                    (acpi_gbl_root_table_list.tables[i].address & ~PAGE_MASK) +
+                    acpi_gbl_root_table_list.tables[i].length, PAGE_SIZE);
+    if ( acpi_memory_banned(pfn, nr_pages) )
+    {
+        printk("Skipping table %.4s because resides in a non ACPI or reserved region\n",
+               sig);
+        return false;
+    }
+
+    return true;
+}
+
+static int __init hvm_setup_acpi_xsdt(struct domain *d, paddr_t madt_addr,
+                                      paddr_t *addr)
+{
+    struct acpi_table_xsdt *xsdt;
+    struct acpi_table_header *table;
+    struct acpi_table_rsdp *rsdp;
+    unsigned long size;
+    unsigned int i, num_tables;
+    paddr_t xsdt_paddr;
+    int j, rc;
+
+    /*
+     * Restore original DMAR table signature, we are going to filter it
+     * from the new XSDT that is presented to the guest, so it no longer
+     * makes sense to have it's signature zapped.
+     */
+    acpi_dmar_reinstate();
+
+    /* Account for the space needed by the XSDT. */
+    size = sizeof(*xsdt);
+    num_tables = 0;
+
+    /* Count the number of tables that will be added to the XSDT. */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+
+        if ( hvm_acpi_table_allowed(sig) )
+            num_tables++;
+    }
+
+    /*
+     * No need to subtract one because we will be adding a custom MADT (and
+     * the native one is not accounted for).
+     */
+    size += num_tables * sizeof(xsdt->table_offset_entry[0]);
+
+    xsdt = xzalloc_bytes(size);
+    if ( !xsdt )
+    {
+        printk("Unable to allocate memory for XSDT table\n");
+        return -ENOMEM;
+    }
+
+    /* Copy the native XSDT table header. */
+    rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(*rsdp));
+    if ( !rsdp )
+    {
+        printk("Unable to map RSDP\n");
+        return -EINVAL;
+    }
+    xsdt_paddr = rsdp->xsdt_physical_address;
+    acpi_os_unmap_memory(rsdp, sizeof(*rsdp));
+    table = acpi_os_map_memory(xsdt_paddr, sizeof(*table));
+    if ( !table )
+    {
+        printk("Unable to map XSDT\n");
+        return -EINVAL;
+    }
+    *((struct acpi_table_header *)xsdt) = *table;
+    acpi_os_unmap_memory(table, sizeof(*table));
+
+    /* Add the custom MADT. */
+    j = 0;
+    xsdt->table_offset_entry[j++] = madt_addr;
+
+    /* Copy the addresses of the rest of the allowed tables. */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+
+        if ( hvm_acpi_table_allowed(sig) )
+            xsdt->table_offset_entry[j++] =
+                                acpi_gbl_root_table_list.tables[i].address;
+    }
+
+    xsdt->header.revision = 1;
+    xsdt->header.length = size;
+    xsdt->header.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, xsdt), size);
+
+    /* Place the new XSDT in guest memory space. */
+    if ( hvm_steal_ram(d, size, GB(4), addr) )
+    {
+        printk("Unable to find guest RAM for XSDT\n");
+        return -ENOMEM;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( hvm_add_mem_range(d, *addr, *addr + size, E820_ACPI) )
+        printk("Unable to add XSDT region to memory map\n");
+
+    rc = hvm_copy_to_phys(d, *addr, xsdt, size);
+    if ( rc )
+    {
+        printk("Unable to copy XSDT into guest memory\n");
+        return rc;
+    }
+    xfree(xsdt);
+
+    return 0;
+}
+
+static int __init hvm_setup_acpi(struct domain *d, paddr_t start_info)
+{
+    struct acpi_table_rsdp rsdp, *native_rsdp;
+    unsigned long pfn, nr_pages;
+    paddr_t madt_paddr, xsdt_paddr, rsdp_paddr;
+    unsigned int i;
+    int rc;
+
+    /* Scan top-level tables and add their regions to the guest memory map. */
+    for( i = 0; i < acpi_gbl_root_table_list.count; i++ )
+    {
+        const char *sig = acpi_gbl_root_table_list.tables[i].signature.ascii;
+
+        if ( hvm_acpi_table_allowed(sig) )
+            hvm_add_mem_range(d, acpi_gbl_root_table_list.tables[i].address,
+                              acpi_gbl_root_table_list.tables[i].address +
+                              acpi_gbl_root_table_list.tables[i].length,
+                              E820_ACPI);
+    }
+
+    /* Identity map ACPI e820 regions. */
+    for ( i = 0; i < d->arch.nr_e820; i++ )
+    {
+        if ( d->arch.e820[i].type != E820_ACPI &&
+             d->arch.e820[i].type != E820_NVS )
+            continue;
+
+        pfn = PFN_DOWN(d->arch.e820[i].addr);
+        nr_pages = DIV_ROUND_UP((d->arch.e820[i].addr & ~PAGE_MASK) +
+                                d->arch.e820[i].size, PAGE_SIZE);
+
+        rc = modify_identity_mmio(d, pfn, nr_pages, true);
+        if ( rc )
+        {
+            printk(
+                "Failed to map ACPI region [%#lx, %#lx) into Dom0 memory map\n",
+                   pfn, pfn + nr_pages);
+            return rc;
+        }
+    }
+
+    rc = hvm_setup_acpi_madt(d, &madt_paddr);
+    if ( rc )
+        return rc;
+
+    rc = hvm_setup_acpi_xsdt(d, madt_paddr, &xsdt_paddr);
+    if ( rc )
+        return rc;
+
+    /* Craft a custom RSDP. */
+    memset(&rsdp, 0, sizeof(rsdp));
+    memcpy(rsdp.signature, ACPI_SIG_RSDP, sizeof(rsdp.signature));
+    native_rsdp = acpi_os_map_memory(acpi_os_get_root_pointer(), sizeof(rsdp));
+    memcpy(rsdp.oem_id, native_rsdp->oem_id, sizeof(rsdp.oem_id));
+    acpi_os_unmap_memory(native_rsdp, sizeof(rsdp));
+    rsdp.revision = 2;
+    rsdp.xsdt_physical_address = xsdt_paddr;
+    rsdp.rsdt_physical_address = 0;
+    rsdp.length = sizeof(rsdp);
+    rsdp.checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
+                                      ACPI_RSDP_REV0_SIZE);
+    rsdp.extended_checksum -= acpi_tb_checksum(ACPI_CAST_PTR(u8, &rsdp),
+                                               sizeof(rsdp));
+
+    /*
+     * Place the new RSDP in guest memory space.
+     *
+     * NB: this RSDP is not going to replace the original RSDP, which
+     * should still be accessible to the guest. However that RSDP is
+     * going to point to the native RSDT, and should not be used.
+     */
+    if ( hvm_steal_ram(d, sizeof(rsdp), GB(4), &rsdp_paddr) )
+    {
+        printk("Unable to allocate guest RAM for RSDP\n");
+        return -ENOMEM;
+    }
+
+    /* Mark this region as E820_ACPI. */
+    if ( hvm_add_mem_range(d, rsdp_paddr, rsdp_paddr + sizeof(rsdp),
+                           E820_ACPI) )
+        printk("Unable to add RSDP region to memory map\n");
+
+    /* Copy RSDP into guest memory. */
+    rc = hvm_copy_to_phys(d, rsdp_paddr, &rsdp, sizeof(rsdp));
+    if ( rc )
+    {
+        printk("Unable to copy RSDP into guest memory\n");
+        return rc;
+    }
+
+    /* Copy RSDP address to start_info. */
+    rc = hvm_copy_to_phys(d, start_info +
+                          offsetof(struct hvm_start_info, rsdp_paddr),
+                          &rsdp_paddr,
+                          sizeof(((struct hvm_start_info *)0)->rsdp_paddr));
+    if ( rc )
+    {
+        printk("Unable to copy RSDP into guest memory\n");
+        return rc;
+    }
+
+    return 0;
+}
+
 static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
                                      unsigned long image_headroom,
                                      module_t *initrd,
@@ -2196,6 +2616,13 @@  static int __init construct_dom0_hvm(struct domain *d, const module_t *image,
         return rc;
     }
 
+    rc = hvm_setup_acpi(d, start_info);
+    if ( rc )
+    {
+        printk("Failed to setup Dom0 ACPI tables: %d\n", rc);
+        return rc;
+    }
+
     clear_bit(_VPF_down, &d->vcpu[0]->pause_flags);
 
     return 0;