[v2,16/17] libxc/xc_dom_arm: Copy ACPI tables to guest space

Message ID 1466651824-6964-17-git-send-email-zhaoshenglong@huawei.com (mailing list archive)
State New, archived

Commit Message

Shannon Zhao June 23, 2016, 3:17 a.m. UTC
From: Shannon Zhao <shannon.zhao@linaro.org>

Copy all the ACPI tables to guest space so that UEFI or guest could
access them.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
---
 tools/libxc/xc_dom_arm.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

Comments

Julien Grall June 23, 2016, 6:46 p.m. UTC | #1
Hi Shannon,

On 23/06/2016 04:17, Shannon Zhao wrote:
> From: Shannon Zhao <shannon.zhao@linaro.org>
>
> Copy all the ACPI tables to guest space so that UEFI or guest could
> access them.
>
> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> ---
>  tools/libxc/xc_dom_arm.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 51 insertions(+)
>
> diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
> index 64a8b67..6a0a5b7 100644
> --- a/tools/libxc/xc_dom_arm.c
> +++ b/tools/libxc/xc_dom_arm.c
> @@ -63,6 +63,47 @@ static int setup_pgtables_arm(struct xc_dom_image *dom)
>
>  /* ------------------------------------------------------------------------ */
>
> +static int xc_dom_copy_acpi(struct xc_dom_image *dom)
> +{
> +    int rc, i;
> +    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
> +                         XC_PAGE_SHIFT;
> +    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
> +    xen_pfn_t *p2m;
> +    void *acpi_pages;
> +
> +    p2m = malloc(pages_num * sizeof(*p2m));
> +    for (i = 0; i < pages_num; i++)
> +        p2m[i] = base + i;
> +
> +    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> +                                          pages_num, 0, 0, p2m);

Hmmmm... it looks like this is working because libxl is setting the 
maximum size of the domain with some slack (1MB). However, I guess the 
slack was for something else. Wei, Stefano, Ian, can you confirm?

> +    if ( rc )
> +    {
> +        DOMPRINTF("%s: xc_domain_populate_physmap_exact failed with %d",
> +                  __FUNCTION__, rc);
> +        goto out;
> +    }
> +
> +    acpi_pages = xc_map_foreign_range(dom->xch, dom->guest_domid,
> +                                      PAGE_SIZE * pages_num,
> +                                      PROT_READ | PROT_WRITE, base);
> +    if ( !acpi_pages )
> +    {
> +        DOMPRINTF("%s Can't map acpi_pages", __FUNCTION__);
> +        rc = -1;
> +        goto out;
> +    }
> +
> +    memcpy(acpi_pages, dom->acpitable_blob, dom->acpitable_size);
> +
> +out:
> +    munmap(acpi_pages, pages_num * PAGE_SIZE);
> +    free(p2m);
> +
> +    return rc;
> +}
> +
>  static int alloc_magic_pages(struct xc_dom_image *dom)
>  {
>      int rc, i;
> @@ -100,6 +141,16 @@ static int alloc_magic_pages(struct xc_dom_image *dom)
>      xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
>              dom->xenstore_evtchn);
>
> +    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
> +    {
> +        rc = xc_dom_copy_acpi(dom);
> +        if ( rc != 0 )
> +        {
> +            DOMPRINTF("Unable to copy ACPI tables");
> +            return rc;
> +        }
> +    }

alloc_magic_pages looks like the wrong place for this function. Any reason 
not to have generic ACPI blob loading in xc_dom_core.c, as we do for the 
devicetree?
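
Something along these lines, mirroring the existing devicetree handling 
(a rough, untested sketch; xc_dom_acpitable_mem() is a hypothetical name, 
not code from this series):

    /* xc_dom_core.c -- sketch, modelled on the devicetree setters */
    int xc_dom_acpitable_mem(struct xc_dom_image *dom,
                             const void *mem, size_t memsize)
    {
        /* stash a copy of the blob; the image-building code would
         * later place it in guest space */
        dom->acpitable_blob = malloc(memsize);
        if ( dom->acpitable_blob == NULL )
            return -1;
        memcpy(dom->acpitable_blob, mem, memsize);
        dom->acpitable_size = memsize;
        return 0;
    }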

Regards,
Shannon Zhao June 27, 2016, 6:25 a.m. UTC | #2
On 2016/6/24 2:46, Julien Grall wrote:
>> +
>>  static int alloc_magic_pages(struct xc_dom_image *dom)
>>  {
>>      int rc, i;
>> @@ -100,6 +141,16 @@ static int alloc_magic_pages(struct xc_dom_image
>> *dom)
>>      xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
>>              dom->xenstore_evtchn);
>>
>> +    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
>> +    {
>> +        rc = xc_dom_copy_acpi(dom);
>> +        if ( rc != 0 )
>> +        {
>> +            DOMPRINTF("Unable to copy ACPI tables");
>> +            return rc;
>> +        }
>> +    }
> 
> alloc_magic_pages looks the wrong place with this function. Any reason
> to not have a generic ACPI blob loading in xc_dom_core.c as we do for
> devicetree?
It looks like xc_dom_build_image is used for allocating pages in guest RAM,
while the ACPI blob is not put in guest RAM.

Thanks,
Julien Grall June 27, 2016, 10:49 a.m. UTC | #3
(CC Boris)

On 27/06/16 07:25, Shannon Zhao wrote:
>
>
> On 2016/6/24 2:46, Julien Grall wrote:
>>> +
>>>   static int alloc_magic_pages(struct xc_dom_image *dom)
>>>   {
>>>      int rc, i;
>>> @@ -100,6 +141,16 @@ static int alloc_magic_pages(struct xc_dom_image
>>> *dom)
>>>       xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
>>>               dom->xenstore_evtchn);
>>>
>>> +    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
>>> +    {
>>> +        rc = xc_dom_copy_acpi(dom);
>>> +        if ( rc != 0 )
>>> +        {
>>> +            DOMPRINTF("Unable to copy ACPI tables");
>>> +            return rc;
>>> +        }
>>> +    }
>>
>> alloc_magic_pages looks the wrong place with this function. Any reason
>> to not have a generic ACPI blob loading in xc_dom_core.c as we do for
>> devicetree?
> Looks like xc_dom_build_image is used for allocating pages in guest RAM
> while ACPI blob is not put in guest RAM.

The function xc_dom_build_image is used to load blobs into the guest 
memory and to allocate other pages (such as the magic pages) which may not 
be part of the actual guest RAM.

The callback alloc_magic_pages is used to allocate architecture-specific 
pages. However, the ACPI blob is pretty much generic, so I am not sure why 
we would want to do it for ARM only.
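
For illustration, how a generic load could hook into xc_dom_build_image() 
next to the devicetree path (xc_dom_load_acpi() is a made-up name, so 
treat this as a sketch only):

    /* xc_dom_core.c, xc_dom_build_image() -- hypothetical sketch */
    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
    {
        rc = xc_dom_load_acpi(dom);   /* generic loader, not ARM-only */
        if ( rc != 0 )
            goto err;
    }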

Regards,
Boris Ostrovsky June 27, 2016, 12:11 p.m. UTC | #4
On 06/27/2016 06:49 AM, Julien Grall wrote:
> (CC Boris)
>
> On 27/06/16 07:25, Shannon Zhao wrote:
>>
>>
>> On 2016/6/24 2:46, Julien Grall wrote:
>>>> +
>>>>   static int alloc_magic_pages(struct xc_dom_image *dom)
>>>>   {
>>>>      int rc, i;
>>>> @@ -100,6 +141,16 @@ static int alloc_magic_pages(struct xc_dom_image
>>>> *dom)
>>>>       xc_hvm_param_set(dom->xch, dom->guest_domid, 
>>>> HVM_PARAM_STORE_EVTCHN,
>>>>               dom->xenstore_evtchn);
>>>>
>>>> +    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
>>>> +    {
>>>> +        rc = xc_dom_copy_acpi(dom);
>>>> +        if ( rc != 0 )
>>>> +        {
>>>> +            DOMPRINTF("Unable to copy ACPI tables");
>>>> +            return rc;
>>>> +        }
>>>> +    }
>>>
>>> alloc_magic_pages looks the wrong place with this function. Any reason
>>> to not have a generic ACPI blob loading in xc_dom_core.c as we do for
>>> devicetree?
>> Looks like xc_dom_build_image is used for allocating pages in guest RAM
>> while ACPI blob is not put in guest RAM.
>
> The function xc_dom_build_image is used to load blob into the guest 
> memory and allocate others pages (such as magic pages) which may not 
> be part of the actual guest RAM.
>
> The callback alloc_magic_pages is used to allocate per-architecture 
> specific pages. However, the ACPI blob is pretty much generic. So I am 
> not sure why we would want to do it for ARM only.


FWIW, for PVH I don't plan, at least for now, to keep a pointer to ACPI 
stuff in xc_dom_image. I am building the tables and loading them into 
the guest right away.

(As a separate point, I noticed that this series adds a 4th type of blob 
(in addition to kernel, ramdisk and devicetree), so I wonder whether 
introducing a struct blob might be useful.)
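
(Purely illustrative, a rough sketch of what such a struct could look 
like; the names are made up:)

    /* hypothetical consolidation of the per-blob fields in xc_dom_image */
    struct xc_dom_blob {
        void *data;      /* blob contents */
        size_t size;     /* size in bytes */
    };

    struct xc_dom_image {
        /* ... */
        struct xc_dom_blob kernel, ramdisk, devicetree, acpitable;
        /* ... */
    };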

-boris
Stefano Stabellini July 5, 2016, 5:13 p.m. UTC | #5
On Thu, 23 Jun 2016, Julien Grall wrote:
> Hi Shannon,
> 
> On 23/06/2016 04:17, Shannon Zhao wrote:
> > From: Shannon Zhao <shannon.zhao@linaro.org>
> > 
> > Copy all the ACPI tables to guest space so that UEFI or guest could
> > access them.
> > 
> > Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> > ---
> > [...]
> > +static int xc_dom_copy_acpi(struct xc_dom_image *dom)
> > +{
> > +    int rc, i;
> > +    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
> > +                         XC_PAGE_SHIFT;
> > +    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
> > +    xen_pfn_t *p2m;
> > +    void *acpi_pages;
> > +
> > +    p2m = malloc(pages_num * sizeof(*p2m));
> > +    for (i = 0; i < pages_num; i++)
> > +        p2m[i] = base + i;
> > +
> > +    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> > +                                          pages_num, 0, 0, p2m);
> 
> Hmmmm... it looks like this is working because libxl is setting the maximum
> size of the domain with some slack (1MB). However, I guess the slack was for
> something else. Wei, Stefano, Ian, can you confirm?

If I recall correctly, the slack is a magic value coming from the
ancient history of toolstacks.
Julien Grall July 6, 2016, 9:46 a.m. UTC | #6
Hi Stefano,

On 05/07/16 18:13, Stefano Stabellini wrote:
> On Thu, 23 Jun 2016, Julien Grall wrote:
>> On 23/06/2016 04:17, Shannon Zhao wrote:
>>> From: Shannon Zhao <shannon.zhao@linaro.org>
>>>
>>> Copy all the ACPI tables to guest space so that UEFI or guest could
>>> access them.
>>>
>>> Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
>>> ---
>>> [...]
>>> +static int xc_dom_copy_acpi(struct xc_dom_image *dom)
>>> +{
>>> +    int rc, i;
>>> +    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
>>> +                         XC_PAGE_SHIFT;
>>> +    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
>>> +    xen_pfn_t *p2m;
>>> +    void *acpi_pages;
>>> +
>>> +    p2m = malloc(pages_num * sizeof(*p2m));
>>> +    for (i = 0; i < pages_num; i++)
>>> +        p2m[i] = base + i;
>>> +
>>> +    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
>>> +                                          pages_num, 0, 0, p2m);
>>
>> Hmmmm... it looks like this is working because libxl is setting the maximum
>> size of the domain with some slack (1MB). However, I guess the slack was for
>> something else. Wei, Stefano, Ian, can you confirm?
>
> If I recall correctly, the slack is a magic value coming from the
> ancient history of toolstacks.

Does it mean we would need to update the slack to take into account the 
ACPI blob?

Cheers,
Stefano Stabellini July 6, 2016, 10:12 a.m. UTC | #7
On Wed, 6 Jul 2016, Julien Grall wrote:
> Hi Stefano,
> 
> On 05/07/16 18:13, Stefano Stabellini wrote:
> > On Thu, 23 Jun 2016, Julien Grall wrote:
> > > On 23/06/2016 04:17, Shannon Zhao wrote:
> > > > From: Shannon Zhao <shannon.zhao@linaro.org>
> > > > 
> > > > Copy all the ACPI tables to guest space so that UEFI or guest could
> > > > access them.
> > > > 
> > > > Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> > > > ---
> > > > [...]
> > > > +static int xc_dom_copy_acpi(struct xc_dom_image *dom)
> > > > +{
> > > > +    int rc, i;
> > > > +    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
> > > > +                         XC_PAGE_SHIFT;
> > > > +    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
> > > > +    xen_pfn_t *p2m;
> > > > +    void *acpi_pages;
> > > > +
> > > > +    p2m = malloc(pages_num * sizeof(*p2m));
> > > > +    for (i = 0; i < pages_num; i++)
> > > > +        p2m[i] = base + i;
> > > > +
> > > > +    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> > > > +                                          pages_num, 0, 0, p2m);
> > > 
> > > Hmmmm... it looks like this is working because libxl is setting the
> > > maximum
> > > size of the domain with some slack (1MB). However, I guess the slack was
> > > for
> > > something else. Wei, Stefano, Ian, can you confirm?
> > 
> > If I recall correctly, the slack is a magic value coming from the
> > ancient history of toolstacks.
> 
> Does it mean we would need to update the slack to take into account the ACPI
> blob?

Yes, we need to take into account the ACPI blob. Probably not in the
slack but directly in max_memkb.
Wei Liu July 7, 2016, 3:35 p.m. UTC | #8
On Thu, Jun 23, 2016 at 07:46:42PM +0100, Julien Grall wrote:
> Hi Shannon,
> 
> On 23/06/2016 04:17, Shannon Zhao wrote:
> >From: Shannon Zhao <shannon.zhao@linaro.org>
> >
> >Copy all the ACPI tables to guest space so that UEFI or guest could
> >access them.
> >
> >Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
> >---
> >[...]
> >+static int xc_dom_copy_acpi(struct xc_dom_image *dom)
> >+{
> >+    int rc, i;
> >+    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
> >+                         XC_PAGE_SHIFT;
> >+    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
> >+    xen_pfn_t *p2m;
> >+    void *acpi_pages;
> >+
> >+    p2m = malloc(pages_num * sizeof(*p2m));
> >+    for (i = 0; i < pages_num; i++)
> >+        p2m[i] = base + i;
> >+
> >+    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
> >+                                          pages_num, 0, 0, p2m);
> 
> Hmmmm... it looks like this is working because libxl is setting the maximum
> size of the domain with some slack (1MB). However, I guess the slack was for
> something else. Wei, Stefano, Ian, can you confirm?

Please don't rely on this slack. Account memory properly please.

Wei.
Shannon Zhao July 12, 2016, 3:47 a.m. UTC | #9
On 2016/7/6 18:12, Stefano Stabellini wrote:
> On Wed, 6 Jul 2016, Julien Grall wrote:
>> On 05/07/16 18:13, Stefano Stabellini wrote:
>>> On Thu, 23 Jun 2016, Julien Grall wrote:
>>>> On 23/06/2016 04:17, Shannon Zhao wrote:
>>>>> [...]
>>>>> +    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
>>>>> +                                          pages_num, 0, 0, p2m);
>>>> Hmmmm... it looks like this is working because libxl is setting the
>>>> maximum size of the domain with some slack (1MB). However, I guess the
>>>> slack was for something else. Wei, Stefano, Ian, can you confirm?
>>> If I recall correctly, the slack is a magic value coming from the
>>> ancient history of toolstacks.
>> Does it mean we would need to update the slack to take into account the
>> ACPI blob?
> Yes, we need to take into account the ACPI blob. Probably not in the
> slack but directly in max_memkb.
Sorry, I'm not sure I understand this. I found b_info->max_memkb but
didn't find the slack you mentioned. And how should this be fixed: by
updating b_info->max_memkb or the slack?

Thanks,
Julien Grall July 12, 2016, 9:25 a.m. UTC | #10
Hi Shannon,

On 12/07/16 04:47, Shannon Zhao wrote:
> On 2016/7/6 18:12, Stefano Stabellini wrote:
>> On Wed, 6 Jul 2016, Julien Grall wrote:
>>> Does it mean we would need to update the slack to take into account the
>>> ACPI blob?
>> Yes, we need to take into account the ACPI blob. Probably not in the
>> slack but directly in max_memkb.
> Sorry, I'm not sure I understand this. I found b_info->max_memkb but
> didn't find the slack you mentioned. And how should this be fixed: by
> updating b_info->max_memkb or the slack?

You can take a look at LIBXL_MAXMEM_CONSTANT.

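(From memory, and possibly out of date: the constant lives in 
libxl_internal.h and is applied when libxl sets the domain's maximum 
memory, roughly:)

    /* libxl_internal.h */
    #define LIBXL_MAXMEM_CONSTANT 1024   /* 1MB of slack, in KiB */

    /* libxl_dom.c, libxl__build_pre() -- approximate shape */
    if (xc_domain_setmaxmem(ctx->xch, domid,
                            info->target_memkb + LIBXL_MAXMEM_CONSTANT) < 0)
        return ERROR_FAIL;
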
I am not very familiar with libxl, so I will let Wei and Stefano give 
you advice on how to fix this.

Regards,
Wei Liu July 12, 2016, 11:35 a.m. UTC | #11
On Tue, Jul 12, 2016 at 11:47:04AM +0800, Shannon Zhao wrote:
> On 2016/7/6 18:12, Stefano Stabellini wrote:
> > Yes, we need to take into account the ACPI blob. Probably not in the
> > slack but directly in max_memkb.
> Sorry, I'm not sure I understand this. I found b_info->max_memkb but
> didn't find the slack you mentioned. And how should this be fixed: by
> updating b_info->max_memkb or the slack?

Can you calculate the size of your payload and add that to max_memkb?

Don't bump the slack.
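
Something like the following, purely as a sketch (where exactly this 
would land in libxl and all the variable names here are made up):

    /* after generating the ACPI blob, before setting maxmem */
    uint64_t acpi_kb = (acpitable_size + 1023) / 1024;  /* bytes -> KiB */
    rc = xc_domain_setmaxmem(ctx->xch, domid,
                             info->target_memkb + acpi_kb
                                 + LIBXL_MAXMEM_CONSTANT);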

Wei.

Shannon Zhao July 12, 2016, 2:42 p.m. UTC | #12
On 2016年07月12日 19:35, Wei Liu wrote:
> On Tue, Jul 12, 2016 at 11:47:04AM +0800, Shannon Zhao wrote:
>> Sorry, I'm not sure I understand this. I found b_info->max_memkb but
>> didn't find the slack you mentioned. And how should this be fixed: by
>> updating b_info->max_memkb or the slack?
> Can you calculate the size of your payload and add that to max_memkb?
> 
Yeah, but the size will change if we change the tables in the future,
and this should also consider x86, right?

Thanks,
Wei Liu July 12, 2016, 2:50 p.m. UTC | #13
On Tue, Jul 12, 2016 at 10:42:07PM +0800, Shannon Zhao wrote:
> > Can you calculate the size of your payload and add that to max_memkb?
> Yeah, but the size will change if we change the tables in the future,
> and this should also consider x86, right?

That could easily be solved by introducing a function to calculate the
size, right?
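
For instance, a small helper along these lines (hypothetical, purely to 
illustrate the idea):

    /* hypothetical: extra KiB needed for the generated ACPI tables */
    static uint64_t acpi_blob_size_kb(size_t acpitable_size)
    {
        /* round up to whole pages, then convert bytes to KiB */
        size_t bytes = (acpitable_size + XC_PAGE_SIZE - 1) &
                       ~((size_t)XC_PAGE_SIZE - 1);
        return bytes / 1024;
    }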

Wei.

Shannon Zhao July 12, 2016, 2:57 p.m. UTC | #14
On 2016年07月12日 22:50, Wei Liu wrote:
> On Tue, Jul 12, 2016 at 10:42:07PM +0800, Shannon Zhao wrote:
>> Yeah, but the size will change if we change the tables in the future,
>> and this should also consider x86, right?
> That could easily be solved by introducing a function to calculate the
> size, right?
Oh, I'm not familiar with this. Let's clarify: the size can be added to
max_memkb after generating the ACPI tables and before loading the tables
into guest space, so it doesn't have to be added in
libxl__domain_build_info_setdefault(), right?

Thanks,
Boris Ostrovsky July 12, 2016, 3:08 p.m. UTC | #15
On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> On 2016年07月12日 22:50, Wei Liu wrote:
>> That could easily be solved by introducing a function to calculate the
>> size, right?
> Oh, I'm not familiar with this. Let's clarify: the size can be added to
> max_memkb after generating the ACPI tables and before loading the tables
> into guest space, so it doesn't have to be added in
> libxl__domain_build_info_setdefault(), right?

This was discussed before: ACPI tables are part of RAM whose size is
specified by the config file (and is reflected in max_memkb I believe).
It may not be presented to the guest as RAM (i.e. on x86 it is labeled
by BIOS (or whoever) as a dedicated type in e820) but it still resides
in DIMMs.

I believe we should not increase memory resources for ACPI tables.

-boris
Wei Liu July 12, 2016, 3:13 p.m. UTC | #16
On Tue, Jul 12, 2016 at 11:08:47AM -0400, Boris Ostrovsky wrote:
> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> > Oh, I'm not familiar with this. Let's clarify: the size can be added to
> > max_memkb after generating the ACPI tables and before loading the tables
> > into guest space, so it doesn't have to be added in
> > libxl__domain_build_info_setdefault(), right?
> 

Hmm... I would like to think a bit more about this. But before dwelling
on this too much...

> This was discussed before: ACPI tables are part of RAM whose size is
> specified by the config file (and is reflected in max_memkb I believe).
> It may not be presented to the guest as RAM (i.e. on x86 it is labeled
> by BIOS (or whoever) as a dedicated type in e820) but it still resides
> in DIMMs.
> 
> I believe we should not increase memory resources for ACPI tables.
> 

This is an interesting point. If there is already such a resolution I will
be happy to follow it.

Any reference?

Wei.

> -boris
Boris Ostrovsky July 12, 2016, 3:21 p.m. UTC | #17
On 07/12/2016 11:13 AM, Wei Liu wrote:
> On Tue, Jul 12, 2016 at 11:08:47AM -0400, Boris Ostrovsky wrote:
>> This was discussed before: ACPI tables are part of RAM whose size is
>> specified by the config file (and is reflected in max_memkb I believe).
>> It may not be presented to the guest as RAM (i.e. on x86 it is labeled
>> by BIOS (or whoever) as a dedicated type in e820) but it still resides
>> in DIMMs.
>>
>> I believe we should not increase memory resources for ACPI tables.
>>
> This is an interesting point. If there is already such a resolution I will
> be happy to follow it.
>
> Any reference?

The last one (that I can find) is
https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg00821.html

(conveniently in the same thread is the one we have been using to talk
about licensing)

-boris
Wei Liu July 12, 2016, 4:05 p.m. UTC | #18
On Tue, Jul 12, 2016 at 11:21:12AM -0400, Boris Ostrovsky wrote:
> On 07/12/2016 11:13 AM, Wei Liu wrote:
> > This is an interesting point. If there is already such a resolution I
> > will be happy to follow it.
> >
> > Any reference?
> 
> The last one (that I can find) is
> https://lists.xenproject.org/archives/html/xen-devel/2016-07/msg00821.html
> 
> (conveniently in the same thread is the one we have been using to talk
> about licensing)
> 

Ah, OK.

Shannon, you don't need to modify max_memkb then. 

Wei.

Julien Grall July 12, 2016, 4:10 p.m. UTC | #19
Hello,

On 12/07/2016 16:08, Boris Ostrovsky wrote:
> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>> Oh, I'm not familiar with this. Let's clarify: the size can be added to
>> max_memkb after generating the ACPI tables and before loading the tables
>> into guest space, so it doesn't have to be added in
>> libxl__domain_build_info_setdefault(), right?
>
> This was discussed before: ACPI tables are part of RAM whose size is
> specified by the config file (and is reflected in max_memkb I believe).
> It may not be presented to the guest as RAM (i.e. on x86 it is labeled
> by BIOS (or whoever) as a dedicated type in e820) but it still resides
> in DIMMs.

I don't think this was the conclusion of the thread. IMHO, "maxmem" is 
the amount of RAM a guest could effectively use.

Whilst the ACPI tables will be in the DIMMs from the host point of view, 
from a guest point of view they will be a ROM.

It will affect some other parts of the guest if we don't increment the 
"maxmem" requested by the user. For ARM the ACPI blob will be exposed at 
a specific address that is outside of the guest RAM (see the guest 
memory layout in public/arch-arm.h).

We chose this solution over putting the tables in RAM because the ACPI 
tables are not easily relocatable (compared to the device tree, initrd 
and kernel), so we could not take advantage of superpages in both the 
stage-2 (hypervisor) and stage-1 (kernel) page tables.

To give you a concrete example: when the user requests 1 gigabyte of 
memory, Xen will try to allocate a 1-gigabyte superpage.

If a few kilobytes are removed from this 1 gigabyte, then we would have 
to use 2MB superpages, which will impact both performance and memory usage.
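
(Rough numbers to illustrate the point: a 1GB region covered by a single 
1GB stage-2 mapping costs one TLB entry; the same region shattered into 
2MB mappings costs up to 512 entries, and any 4KB tail pushes parts of it 
down to 4KB mappings, i.e. up to 262144 entries.)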

So I think we should take into account the size of the ACPI blob in 
libxl and not reduce the memory requested by the user.

Regards,
Boris Ostrovsky July 12, 2016, 4:58 p.m. UTC | #20
On 07/12/2016 12:10 PM, Julien Grall wrote:
> Hello,
>
> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>> This was discussed before: ACPI tables are part of RAM whose size is
>> specified by the config file (and is reflected in max_memkb I believe).
>> It may not be presented to the guest as RAM (i.e. on x86 it is labeled
>> by BIOS (or whoever) as a dedicated type in e820) but it still resides
>> in DIMMs.
>
> I don't think this was the conclusion of the thread. IMHO, "maxmem" is
> the amount of RAM a guest could effectively use.
>
> Whilst the ACPI tables will be in the DIMMs from the host point of
> view, from a guest point of view they will be a ROM.

The config file specifies resources provided by the host. How the guest
views those resources is not important, I think.

>
> It will affect some other parts of the guest if we don't increment the
> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
> at a specific address that is outside of the guest RAM (see the guest
> memory layout in public/arch-arm.h).
>
> We chose this solution over putting the tables in RAM because the ACPI
> tables are not easily relocatable (compared to the device tree, initrd
> and kernel), so we could not take advantage of superpages in both the
> stage-2 (hypervisor) and stage-1 (kernel) page tables.

Maybe this is something ARM-specific then. For x86 we will want to keep
maxmem unchanged.

-boris


Julien Grall July 13, 2016, 3:22 p.m. UTC | #21
Hello,

On 12/07/2016 17:58, Boris Ostrovsky wrote:
> On 07/12/2016 12:10 PM, Julien Grall wrote:
>> I don't think this was the conclusion of the thread. IMHO, "maxmem" is
>> the amount of RAM a guest could effectively use.
>>
>> Whilst the ACPI tables will be in the DIMMs from the host point of
>> view, from a guest point of view they will be a ROM.
>
> The config file specifies resources provided by the host. How the guest
> views those resources is not important, I think.

This would need to be clarified. For instance special pages (Xenstore, 
Console...) are RAM from the host point of view but not taken into 
account in the "maxmem" provided by the user. To my understanding, some 
kB of the slack are used for that.

>
>>
>> It will affect some other parts of the guest if we don't increment the
>> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
>> at a specific address that is outside of the guest RAM (see the guest
>> memory layout in public/arch-arm.h).
>>
>> We chose this solution over putting the tables in RAM because the ACPI
>> tables are not easily relocatable (compared to the device tree, initrd
>> and kernel), so we could not take advantage of superpages in both the
>> stage-2 (hypervisor) and stage-1 (kernel) page tables.
>
> Maybe this is something ARM-specific then. For x86 we will want to keep
> maxmem unchanged.

I don't think what I described in my previous mail is ARM-specific. The 
pressure on the TLBs will be higher if Xen does not use superpages in the 
stage-2 page tables (i.e. EPT for x86), no matter the architecture.

IMHO, this seems to be a bigger drawback than adding a few more 
kilobytes to maxmem in the toolstack for the ACPI blob. You will lose 
them when creating the intermediate page tables in any case.

May I ask why you want to keep maxmem unchanged on x86?

Regards,
Boris Ostrovsky July 13, 2016, 5:08 p.m. UTC | #22
On 07/13/2016 11:22 AM, Julien Grall wrote:
> Hello,
>
> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>> The config file specifies resources provided by the host. How the guest
>> views those resources is not important, I think.
>
> This would need to be clarified. For instance special pages (Xenstore,
> Console...) are RAM from the host point of view but not taken into
> account in the "maxmem" provided by the user. To my understanding,
> some kB of the slack are used for that.


Are these pages part of the guest's address space?


>
>> Maybe this is something ARM-specific then. For x86 we will want to keep
>> maxmem unchanged.
>
> I don't think what I described in my previous mail is ARM-specific.
> The pressure on the TLBs will be higher if Xen does not use
> superpages in the stage-2 page tables (i.e. EPT for x86), no matter
> the architecture.
>
> IMHO, this seems to be a bigger drawback than adding a few more
> kilobytes to maxmem in the toolstack for the ACPI blob. You will lose
> them when creating the intermediate page tables in any case.


Why not have the guest ask for more memory in the config file then?

(OK, I can see that they can't ask for a few KB since we have MB
resolution but they can ask for an extra 1MB)


>
> May I ask why you want to keep maxmem unchanged on x86?

Only to keep resource accounting fair. The guest may decide to use
memory reserved for ACPI as regular memory.



-boris
Wei Liu July 14, 2016, 11:15 a.m. UTC | #23
On Wed, Jul 13, 2016 at 01:08:57PM -0400, Boris Ostrovsky wrote:
> On 07/13/2016 11:22 AM, Julien Grall wrote:
> > IMHO, this seems to be a bigger drawback than adding a few more
> > kilobytes to maxmem in the toolstack for the ACPI blob. You will lose
> > them when creating the intermediate page tables in any case.
> 
> Why not have the guest ask for more memory in the config file then?
> 
> (OK, I can see that they can't ask for a few KB since we have MB
> resolution but they can ask for an extra 1MB)

It would be trivial to have another option in xl.cfg to allow MB
granularity. But I don't think that's a good idea. Asking for more
memory when you don't really know how much is enough is not very useful.
If an admin can know how much is needed, surely the library can be
taught to obtain that knowledge, too.

We need to decide which model we should go with. And, if we decide to
diverge, document the difference between the x86 and ARM models.

Wei.
Julien Grall July 14, 2016, 11:29 a.m. UTC | #24
Hello,

On 13/07/16 18:08, Boris Ostrovsky wrote:
> On 07/13/2016 11:22 AM, Julien Grall wrote:
>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>> The config file specifies resources provided by the host. How the guest
>>> views those resources is not important, I think.
>>
>> This would need to be clarified. For instance special pages (Xenstore,
>> Console...) are RAM from the host point of view but not taken into
>> account in the "maxmem" provided by the user. To my understanding,
>> some kB of the slack are used for that.
>
> Are these pages part of the guest's address space?

Yes, they are used to communicate between the guest and the backends 
(xenconsole, xenstore, ...).

Regards,
Stefano Stabellini July 14, 2016, 1:37 p.m. UTC | #25
On Wed, 13 Jul 2016, Julien Grall wrote:
> Hello,
> 
> On 12/07/2016 17:58, Boris Ostrovsky wrote:
> > On 07/12/2016 12:10 PM, Julien Grall wrote:
> > > On 12/07/2016 16:08, Boris Ostrovsky wrote:
> > > > On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> > > > > On 2016年07月12日 22:50, Wei Liu wrote:
> > > > > > On Tue, Jul 12, 2016 at 10:42:07PM +0800, Shannon Zhao wrote:
> > > > > > > > > > > > > > > > > > > > > > > > > > Does it mean we would need
> > > > > > > > > > > > > > > > > > > > > > > > > > to update the slack
> > > > > > > > > > > > > > > > > > > > > > > > > > to take into account the
> > > > > > > > > > > > > > > > > > > > > > > > > > ACPI
> > > > > > > > > > > > > > > > > > > > > > > > > > blob?
> > > > > > > > > > > > > > > > > > Yes, we need to take into account the ACPI
> > > > > > > > > > > > > > > > > > blob.
> > > > > > > > > > > > > > > > > > Probably not in the
> > > > > > > > > > > > > > > > > > slack but directly in mam_memkb.
> > > > > > > > > > > > > > Sorry, I'm not sure understand this. I found the
> > > > > > > > > > > > > > b_info->max_memkb but
> > > > > > > > > > > > > > didn't find the slack you said. And how to fix this?
> > > > > > > > > > > > > > Update
> > > > > > > > > > > > > > b_info->max_memkb or the slack?
> > > > > > > > > > Can you calculate the size of your payload and add that to
> > > > > > > > > > max_memkb?
> > > > > > > > > > 
> > > > > > > > Yeah, but the size will be changed if we change the tables in
> > > > > > > > the
> > > > > > > > future
> > > > > > > > and this also should consider x86, right?
> > > > > > That could easily be solved by introducing a function to calculate
> > > > > > the
> > > > > > size, right?
> > > > > Oh, I'm not familiar with this. Let's clarify on this. It can add the
> > > > > size to max_memkb after generating the ACPI tables and before loading
> > > > > the tables to guest space and it doesn't have to add the size at
> > > > > libxl__domain_build_info_setdefault(), right?
> > > > 
> > > > This was discussed before: ACPI tables are part of RAM whose size is
> > > > specified by the config file (and is reflected in max_memkb I believe).
> > > > It may not be presented to the guest as RAM (i.e. on x86 it is labeled
> > > > by BIOS (or whoever) as a dedicated type in e820) but it still resides
> > > > in DIMMs.
> > > 
> > > I don't think this was the conclusion of the thread. IMHO, "maxmem" is
> > > the amount of RAM a guest could effectively use.
> > > 
> > > Whilst the ACPI tables will be in the DIMM from the host point of
> > > view. From a guest point of view it will be a ROM.
> > 
> > The config file specifies resources provided by the host. How the guest
> > views those resources is not important, I think.
> 
> This would need to be clarified. For instance special pages (Xenstore,
> Console...) are RAM from the host point of view but not taken into account in
> the "maxmem" provided by the user. For my understanding, some kB of the slack
> is used for that.
> 
> > 
> > > 
> > > It will affect some others part of the guest if we don't increment the
> > > "maxmem" requested by the user. For ARM the ACPI blob will be exposed
> > > at a specific address that is outside of the guest RAM (see the guest
> > > memory layout in public/arch-arm.h).
> > > 
> > > We chose this solution over putting in the RAM because the ACPI tables
> > > are not easily relocatable (compare to the device tree, initrd and
> > > kernel) so we could not take advantage of superpage in both stage-2
> > > (hypervisor) and stage-1 (kernel) page table.
> > 
> > Maybe this is something ARM-specific then. For x86 we will want to keep
> > maxmem unchanged.
> 
> I don't think what I described in my previous mail is ARM-specific. The
> pressure will be more important on the TLBs, if Xen does not use superpage in
> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
> 
> IMHO, this seems to be a bigger drawback compared to adding a few more kilobytes to
> maxmem in the toolstack for the ACPI blob. You will lose them when creating
> the intermediate page table in any case.

I agree with Julien. On ARM we have to increase maxmem because I don't
think that shattering a superpage is acceptable for just a few KBs. In
fact, it's not much about increasing maxmem, but it's about keeping the
allocation of guest memory to the value passed by the user in "memory",
so that it can be done in the most efficient way possible. (I am
assuming users are going to allocate VMs of 2048MB, rather than 2049MB.)

I wouldn't want to end up adding to the performance tuning page on the
wiki "Make sure to add 1 more MB to the memory of your VM to get the
most out of the system."

I know that the location of the ACPI blob on x86 is different in guest
memory space, but it seems to me that the problem would be the same. Do
you have 1 gigabyte pages in stage-2 on x86? If so, I would think twice
about this. Otherwise, if you only have 4K and 2MB allocations, then it
might not make that much of a difference.
Shannon Zhao July 15, 2016, 9:39 a.m. UTC | #26
On 2016/7/14 19:15, Wei Liu wrote:
> On Wed, Jul 13, 2016 at 01:08:57PM -0400, Boris Ostrovsky wrote:
>> On 07/13/2016 11:22 AM, Julien Grall wrote:
>>> Hello,
>>>
>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>>> On 2016-07-12 22:50, Wei Liu wrote:
>>>>>>>> On Tue, Jul 12, 2016 at 10:42:07PM +0800, Shannon Zhao wrote:
>>>>>>>>>>>>>>>>>>>>>>>>>>>> Does it mean we would need to update the slack
>>>>>>>>>>>>>>>>>>>>>>>>>>>> to take into account the ACPI
>>>>>>>>>>>>>>>>>>>>>>>>>>>> blob?
>>>>>>>>>>>>>>>>>>>> Yes, we need to take into account the ACPI blob.
>>>>>>>>>>>>>>>>>>>> Probably not in the
>>>>>>>>>>>>>>>>>>>> slack but directly in max_memkb.
>>>>>>>>>>>>>>>> Sorry, I'm not sure understand this. I found the
>>>>>>>>>>>>>>>> b_info->max_memkb but
>>>>>>>>>>>>>>>> didn't find the slack you said. And how to fix this? Update
>>>>>>>>>>>>>>>> b_info->max_memkb or the slack?
>>>>>>>>>>>> Can you calculate the size of your payload and add that to
>>>>>>>>>>>> max_memkb?
>>>>>>>>>>>>
>>>>>>>>>> Yeah, but the size will be changed if we change the tables in the
>>>>>>>>>> future
>>>>>>>>>> and this also should consider x86, right?
>>>>>>>> That could easily be solved by introducing a function to
>>>>>>>> calculate the
>>>>>>>> size, right?
>>>>>>> Oh, I'm not familiar with this. Let's clarify on this. It can add the
>>>>>>> size to max_memkb after generating the ACPI tables and before loading
>>>>>>> the tables to guest space and it doesn't have to add the size at
>>>>>>> libxl__domain_build_info_setdefault(), right?
>>>>>>
>>>>>> This was discussed before: ACPI tables are part of RAM whose size is
>>>>>> specified by the config file (and is reflected in max_memkb I
>>>>>> believe).
>>>>>> It may not be presented to the guest as RAM (i.e. on x86 it is labeled
>>>>>> by BIOS (or whoever) as a dedicated type in e820) but it still resides
>>>>>> in DIMMs.
>>>>>
>>>>> I don't think this was the conclusion of the thread. IMHO, "maxmem" is
>>>>> the amount of RAM a guest could effectively use.
>>>>>
>>>>> Whilst the ACPI tables will be in the DIMM from the host point of
>>>>> view. From a guest point of view it will be a ROM.
>>>>
>>>> The config file specifies resources provided by the host. How the guest
>>>> views those resources is not important, I think.
>>>
>>> This would need to be clarified. For instance special pages (Xenstore,
>>> Console...) are RAM from the host point of view but not taken into
>>> account in the "maxmem" provided by the user. For my understanding,
>>> some kB of the slack is used for that.
>>
>>
>> Are these pages part of guest's address space?
>>
>>
>>>
>>>>
>>>>>
>>>>> It will affect some others part of the guest if we don't increment the
>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
>>>>> at a specific address that is outside of the guest RAM (see the guest
>>>>> memory layout in public/arch-arm.h).
>>>>>
>>>>> We chose this solution over putting in the RAM because the ACPI tables
>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>
>>>> Maybe this is something ARM-specific then. For x86 we will want to keep
>>>> maxmem unchanged.
>>>
>>> I don't think what I described in my previous mail is ARM-specific.
>>> The pressure will be more important on the TLBs, if Xen does not use
>>> superpage in the stage 2 page tables (i.e EPT for x86) no matter the
>>> architecture.
>>>
>>> IMHO, this seems to be a bigger drawback compared to adding a few more
>>> kilobytes to maxmem in the toolstack for the ACPI blob. You will lose
>>> them when creating the intermediate page table in any case.
>>
>>
>> Why not have the guest ask for more memory in the config file then?
>>
>> (OK, I can see that they can't ask for a few KB since we have MB
>> resolution but they can ask for an extra 1MB)
>>
> 
> It would be trivial to have another option in xl.cfg to allow MB
> granularity. But I don't think that's a good idea. Asking for more
> memory when you don't really know how much is enough is not very useful.
> If an admin can know how much is needed, surely the library can be
> taught to obtain that knowledge, too.
> 
> We need to decide which model we should go with. And, if we decide to
> diverge, document the difference between x86 and ARM model.
> 
Hi Wei,

Have you decided how to add the size of the ACPI blob to max_memkb?

Thanks,
Wei Liu July 19, 2016, 10:38 a.m. UTC | #27
On Fri, Jul 15, 2016 at 05:39:32PM +0800, Shannon Zhao wrote:
[...]
> > 
> > It would be trivial to have another option in xl.cfg to allow MB
> > granularity. But I don't think that's a good idea. Asking for more
> > memory when you don't really know how much is enough is not very useful.
> > If an admin can know how much is needed, surely the library can be
> > taught to obtain that knowledge, too.
> > 
> > We need to decide which model we should go with. And, if we decide to
> > diverge, document the difference between x86 and ARM model.
> > 
> Hi Wei,
> 
> Do you decide how to add the size of ACPI blob to max_memkb?
> 

AFAICT ARM and x86 maintainers hold different opinions on how memory
should be accounted.

I would like to have a unified memory accounting model. But if we can't
have that at the moment, I'm fine with divergence, but please document
it somewhere (a comment near the code snippet, in a header, or a file
under docs etc.). And the amount added to max_memkb needs to be properly
calculated, not some magic number, so that we have a chance in the
future to confidently change how we do things.
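
For instance, a helper along these lines (purely illustrative, nothing
like it exists in libxl today; ROUNDUP is the libxc rounding macro that
takes a shift) could replace a magic number once the blob has been
generated:

/* Hypothetical helper: how much to add to max_memkb for an
 * already-generated ACPI blob.  max_memkb is accounted in KiB, so
 * round the blob up to whole pages first, then convert to KiB. */
static uint64_t acpi_blob_size_kb(uint64_t acpitable_size)
{
    return ROUNDUP(acpitable_size, XC_PAGE_SHIFT) / 1024;
}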


Wei.

> Thanks,
> -- 
> Shannon
>
Shannon Zhao July 20, 2016, 6:52 a.m. UTC | #28
On 2016/7/19 18:38, Wei Liu wrote:
> On Fri, Jul 15, 2016 at 05:39:32PM +0800, Shannon Zhao wrote:
> [...]
>>> > > 
>>> > > It would be trivial to have another option in xl.cfg to allow MB
>>> > > granularity. But I don't think that's a good idea. Asking for more
>>> > > memory when you don't really know how much is enough is not very useful.
>>> > > If an admin can know how much is needed, surely the library can be
>>> > > taught to obtain that knowledge, too.
>>> > > 
>>> > > We need to decide which model we should go with. And, if we decide to
>>> > > diverge, document the difference between x86 and ARM model.
>>> > > 
>> > Hi Wei,
>> > 
>> > Do you decide how to add the size of ACPI blob to max_memkb?
>> > 
> AFAICT ARM and x86 maintainers hold different opinions on how memory
> should be accounted.
> 
> I would like to have a unified memory accounting model. But if we can't
> have that at the moment, I'm fine with divergence, but please document
> it somewhere (comment near code snippet, in header, or a file under docs
> etc). And the amount added to max_memkb needs to be properly calculated,
> not some magic number, so that we have a chance in the future to
> confidently change how we do thing.
If it's only allowed to add the size to max_memkb in
libxl__domain_build_info_setdefault(), it can only use a const number,
since the tables have not been generated yet and we don't know the size.
But the const number could be chosen properly, since we know the maximum
ACPI table size of the current ARM approach.

But maybe in the future, if we add some new ACPI tables, the size should
increase as well. So I think this should be documented.

As I asked before, is it ok to add the size to max_memkb after
generating the ACPI tables and before loading the tables to guest space?

Thanks,
Wei Liu July 20, 2016, 9:32 a.m. UTC | #29
On Wed, Jul 20, 2016 at 02:52:05PM +0800, Shannon Zhao wrote:
> 
> 
> On 2016/7/19 18:38, Wei Liu wrote:
> > On Fri, Jul 15, 2016 at 05:39:32PM +0800, Shannon Zhao wrote:
> > [...]
> >>> > > 
> >>> > > It would be trivial to have another option in xl.cfg to allow MB
> >>> > > granularity. But I don't think that's a good idea. Asking for more
> >>> > > memory when you don't really know how much is enough is not very useful.
> >>> > > If an admin can know how much is needed, surely the library can be
> >>> > > taught to obtain that knowledge, too.
> >>> > > 
> >>> > > We need to decide which model we should go with. And, if we decide to
> >>> > > diverge, document the difference between x86 and ARM model.
> >>> > > 
> >> > Hi Wei,
> >> > 
> >> > Do you decide how to add the size of ACPI blob to max_memkb?
> >> > 
> > AFAICT ARM and x86 maintainers hold different opinions on how memory
> > should be accounted.
> > 
> > I would like to have a unified memory accounting model. But if we can't
> > have that at the moment, I'm fine with divergence, but please document
> > it somewhere (comment near code snippet, in header, or a file under docs
> > etc). And the amount added to max_memkb needs to be properly calculated,
> > not some magic number, so that we have a chance in the future to
> > confidently change how we do thing.
> If it's only allowed to add the size to max_memkb in
> libxl__domain_build_info_setdefault(), it only can use a const number
> since the tables are not generated and we don't know the size. But the
> const number could be chosen properly since we could know the maximum
> ACPI tables size of current ARM approach.
> 
> But maybe in the future, if we add some new ACPI tables, it should
> increase the size as well. So I think this should be documented.
> 
> As I asked before, is it ok to add the size to max_memkb after
> generating the ACPI tables and before loading the tables to guest space?
> 

Yes. I don't think shoehorning everything into setdefault is a good
idea.

I think libxl_arm.c:libxl__arch_domain_create would be the right place
to do it.

I am thinking about calling xc_domain_setmaxmem there, but not adding a
number to d_config->b_info.max_memkb. Adding it to ->max_memkb would
be wrong because the bigger ->max_memkb would be recorded and the same
algorithm applied every time you migrate your guest, so max_memkb would
grow bigger and bigger.
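
A minimal sketch of that idea (illustrative only, not actual libxl
code; acpi_blob_size_kb() would be a hypothetical helper computing the
page-aligned blob size in KiB):

#include <xenctrl.h>

/* Raise only the hypervisor-side limit to cover the ACPI blob,
 * leaving d_config->b_info.max_memkb untouched so the recorded
 * configuration does not grow on every migration. */
static int bump_maxmem_for_acpi(xc_interface *xch, uint32_t domid,
                                uint64_t max_memkb, uint64_t acpi_kb)
{
    return xc_domain_setmaxmem(xch, domid, max_memkb + acpi_kb);
}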

Given the different approach taken by ARM and x86, maybe we need to also
record the size of acpi blobs somewhere in xenstore (also needs to be
documented) so that subsequent libxl_domain_setmaxmem can extract that
number again.
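
Something like this, perhaps (sketch only; the xenstore path below is
made up and would be part of what needs documenting):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <xenstore.h>

/* Record the ACPI blob size so a later libxl_domain_setmaxmem can
 * re-add it to the configured maximum. */
static bool record_acpi_size_kb(struct xs_handle *xs, uint32_t domid,
                                uint64_t acpi_kb)
{
    char path[64], val[32];

    /* Hypothetical, undocumented-as-yet path. */
    snprintf(path, sizeof(path), "/local/domain/%u/memory/acpi-kb", domid);
    snprintf(val, sizeof(val), "%llu", (unsigned long long)acpi_kb);
    return xs_write(xs, XBT_NULL, path, val, strlen(val));
}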

Please wait a bit for Stefano and Julien to comment before you do work.

I know, memory accounting is a bit messy. I don't claim to have figured
every last detail out. If I'm not clear enough or ignore some basic
facts, do let me know.

Wei.

> Thanks,
> -- 
> Shannon
>
Julien Grall July 20, 2016, 12:33 p.m. UTC | #30
Hi,

On 14/07/16 14:37, Stefano Stabellini wrote:
> On Wed, 13 Jul 2016, Julien Grall wrote:
>> Hello,
>>
>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>> It will affect some others part of the guest if we don't increment the
>>>> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
>>>> at a specific address that is outside of the guest RAM (see the guest
>>>> memory layout in public/arch-arm.h).
>>>>
>>>> We chose this solution over putting in the RAM because the ACPI tables
>>>> are not easily relocatable (compare to the device tree, initrd and
>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>> (hypervisor) and stage-1 (kernel) page table.
>>>
>>> Maybe this is something ARM-specific then. For x86 we will want to keep
>>> maxmem unchanged.
>>
>> I don't think what I described in my previous mail is ARM-specific. The
>> pressure will be more important on the TLBs, if Xen does not use superpage in
>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>
>> IMHO, this seems to be a bigger drawback compared to adding a few more kilobytes to
>> maxmem in the toolstack for the ACPI blob. You will lose them when creating
>> the intermediate page table in any case.
>
> I agree with Julien. On ARM we have to increase maxmem because I don't
> think that shattering a superpage is acceptable for just a few KBs. In
> fact, it's not much about increasing maxmem, but it's about keeping the
> allocation of guest memory to the value passed by the user in "memory",
> so that it can be done in the most efficient way possible. (I am
> assuming users are going to allocate VMs of 2048MB, rather than 2049MB.)
>
> I wouldn't want to end up adding to the performance tuning page on the
> wiki "Make sure to add 1 more MB to the memory of your VM to get the
> most out of the system."
>
> I know that the location of the ACPI blob on x86 is different in guest
> memory space, but it seems to me that the problem would be the same. Do
> you have 1 gigabyte pages in stage-2 on x86? If so, I would think twice
> about this. Otherwise, if you only have 4K and 2MB allocations, then it
> might not make that much of a difference.

Looking at the x86 code, 1 gigabyte pages seem to be supported.

Boris, do you have any opinions on this?

Regards,
Boris Ostrovsky July 20, 2016, 1:33 p.m. UTC | #31
On 07/20/2016 08:33 AM, Julien Grall wrote:
> Hi,
>
> On 14/07/16 14:37, Stefano Stabellini wrote:
>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>> Hello,
>>>
>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>> It will affect some others part of the guest if we don't increment
>>>>> the
>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
>>>>> at a specific address that is outside of the guest RAM (see the guest
>>>>> memory layout in public/arch-arm.h).
>>>>>
>>>>> We chose this solution over putting in the RAM because the ACPI
>>>>> tables
>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>
>>>> Maybe this is something ARM-specific then. For x86 we will want to
>>>> keep
>>>> maxmem unchanged.
>>>
>>> I don't think what I described in my previous mail is ARM-specific. The
>>> pressure will be more important on the TLBs, if Xen does not use
>>> superpage in
>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>>
>>> IMHO, this seems to be a bigger drawback compared to adding a few more
>>> kilobytes to
>>> maxmem in the toolstack for the ACPI blob. You will lose them when
>>> creating
>>> the intermediate page table in any case.
>>
>> I agree with Julien. On ARM we have to increase maxmem because I don't
>> think that shattering a superpage is acceptable for just a few KBs. In
>> fact, it's not much about increasing maxmem, but it's about keeping the
>> allocation of guest memory to the value passed by the user in "memory",
>> so that it can be done in the most efficient way possible. (I am
>> assuming users are going to allocate VMs of 2048MB, rather than 2049MB.)
>>
>> I wouldn't want to end up adding to the performance tuning page on the
>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
>> most out of the system."
>>
>> I know that the location of the ACPI blob on x86 is different in guest
>> memory space, but it seems to me that the problem would be the same. Do
>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think twice
>> about this. Otherwise, if you only have 4K and 2MB allocations, then it
>> might not make that much of a difference.
>
> Looking at the x86 code, 1 gigabyte pages seems to be supported.
>
> Boris, do you have any opinions on this?


I don't think I understand the superpage shattering argument.  In x86
the tables live in regular RAM and a guest is free to use those
addresses as regular memory.

This apparently is different from how ARM manages the tables (you said
in an earlier message that they are not part of RAM) so I can see that
taking memory from RAM allocation to store the tables may affect how
mapping is done, potentially causing GB pages to be broken.

In fact (and I am totally speculating here) padding memory for x86 may
actually *cause* shattering because we will have (for example) 2049MB of
RAM to deal with.

If we were talking about a large code difference (and if I am wrong about
what I said above) I'd agree that adding a few KB for x86 is OK. But
given that we are talking about a couple of lines of code ifdef'd by
'ARM' (and maybe a comment explaining why), I'd prefer not to add
anything for x86.

-boris
Julien Grall July 20, 2016, 1:41 p.m. UTC | #32
On 20/07/2016 14:33, Boris Ostrovsky wrote:
> On 07/20/2016 08:33 AM, Julien Grall wrote:
>> Hi,
>>
>> On 14/07/16 14:37, Stefano Stabellini wrote:
>>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>>> Hello,
>>>>
>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>> It will affect some others part of the guest if we don't increment
>>>>>> the
>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be exposed
>>>>>> at a specific address that is outside of the guest RAM (see the guest
>>>>>> memory layout in public/arch-arm.h).
>>>>>>
>>>>>> We chose this solution over putting in the RAM because the ACPI
>>>>>> tables
>>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>>
>>>>> Maybe this is something ARM-specific then. For x86 we will want to
>>>>> keep
>>>>> maxmem unchanged.
>>>>
>>>> I don't think what I described in my previous mail is ARM-specific. The
>>>> pressure will be more important on the TLBs, if Xen does not use
>>>> superpage in
>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>>>
>>>> IMHO, this seems to be a bigger drawback compared to adding a few more
>>>> kilobytes to
>>>> maxmem in the toolstack for the ACPI blob. You will lose them when
>>>> creating
>>>> the intermediate page table in any case.
>>>
>>> I agree with Julien. On ARM we have to increase maxmem because I don't
>>> think that shattering a superpage is acceptable for just a few KBs. In
>>> fact, it's not much about increasing maxmem, but it's about keeping the
>>> allocation of guest memory to the value passed by the user in "memory",
>>> so that it can be done in the most efficient way possible. (I am
>>> assuming users are going to allocate VMs of 2048MB, rather than 2049MB.)
>>>
>>> I wouldn't want to end up adding to the performance tuning page on the
>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
>>> most out of the system."
>>>
>>> I know that the location of the ACPI blob on x86 is different in guest
>>> memory space, but it seems to me that the problem would be the same. Do
>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think twice
>>> about this. Otherwise, if you only have 4K and 2MB allocations, then it
>>> might not make that much of a difference.
>>
>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
>>
>> Boris, do you have any opinions on this?
>
>
> I don't think I understand the superpage shattering argument.  In x86
> the tables live in regular RAM and a guest is free to use those
> addresses as regular memory.
>
> This apparently is different from how ARM manages the tables (you said
> in an earlier message that they are not part of RAM) so I can see that
> taking memory from RAM allocation to store the tables may affect how
> mapping is done, potentially causing GB pages to be broken.
>
> In fact (and I am totally speculating here) padding memory for x86 may
> actually *cause* shattering because we will have (for example) 2049MB of
> RAM to deal with.

Correct me if I am wrong. In your series you are populating the pages
for the ACPI tables at a specific address, separately from the RAM
allocation. So you will shatter GB pages if the user provides 2048MB,
because the ACPI tables are accounted in the 2048MB.

The plan is to call setmaxmem with the size of the RAM + the size of the
ACPI tables. The RAM will still be 2048MB and therefore GB pages may be
used.
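
To make the arithmetic concrete (illustrative numbers, assuming 4KB
pages with 2MB and 1GB stage-2 superpages and, say, a 32KB blob): if
the RAM allocation stays at 2048MB it can be mapped with two 1GB
entries, whereas carving the blob out of those 2048MB leaves a
2048MB - 32KB region, which needs on the order of one 1GB entry plus
511 2MB entries plus 504 4KB entries.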
Boris Ostrovsky July 20, 2016, 2:09 p.m. UTC | #33
On 07/20/2016 09:41 AM, Julien Grall wrote:
>
>
> On 20/07/2016 14:33, Boris Ostrovsky wrote:
>> On 07/20/2016 08:33 AM, Julien Grall wrote:
>>> Hi,
>>>
>>> On 14/07/16 14:37, Stefano Stabellini wrote:
>>>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>>>> Hello,
>>>>>
>>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>>> It will affect some others part of the guest if we don't increment
>>>>>>> the
>>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be
>>>>>>> exposed
>>>>>>> at a specific address that is outside of the guest RAM (see the
>>>>>>> guest
>>>>>>> memory layout in public/arch-arm.h).
>>>>>>>
>>>>>>> We chose this solution over putting in the RAM because the ACPI
>>>>>>> tables
>>>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>>>
>>>>>> Maybe this is something ARM-specific then. For x86 we will want to
>>>>>> keep
>>>>>> maxmem unchanged.
>>>>>
>>>>> I don't think what I described in my previous mail is
>>>>> ARM-specific. The
>>>>> pressure will be more important on the TLBs, if Xen does not use
>>>>> superpage in
>>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>>>>
>>>>> IMHO, this seems to be a bigger drawback compared to adding a few more
>>>>> kilobytes to
>>>>> maxmem in the toolstack for the ACPI blob. You will lose them when
>>>>> creating
>>>>> the intermediate page table in any case.
>>>>
>>>> I agree with Julien. On ARM we have to increase maxmem because I don't
>>>> think that shattering a superpage is acceptable for just a few KBs. In
>>>> fact, it's not much about increasing maxmem, but it's about keeping
>>>> the
>>>> allocation of guest memory to the value passed by the user in
>>>> "memory",
>>>> so that it can be done in the most efficient way possible. (I am
>>>> assuming users are going to allocate VMs of 2048MB, rather than
>>>> 2049MB.)
>>>>
>>>> I wouldn't want to end up adding to the performance tuning page on the
>>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
>>>> most out of the system."
>>>>
>>>> I know that the location of the ACPI blob on x86 is different in guest
>>>> memory space, but it seems to me that the problem would be the
>>>> same. Do
>>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think
>>>> twice
>>>> about this. Otherwise, if you only have 4K and 2MB allocations,
>>>> then it
>>>> might not make that much of a difference.
>>>
>>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
>>>
>>> Boris, do you have any opinions on this?
>>
>>
>> I don't think I understand the superpage shattering argument.  In x86
>> the tables live in regular RAM and a guest is free to use those
>> addresses as regular memory.
>>
>> This apparently is different from how ARM manages the tables (you said
>> in an earlier message that they are not part of RAM) so I can see that
>> taking memory from RAM allocation to store the tables may affect how
>> mapping is done, potentially causing GB pages to be broken.
>>
>> In fact (and I am totally speculating here) padding memory for x86 may
>> actually *cause* shattering because we will have (for example) 2049MB of
>> RAM to deal with.
>
> Correct me if I am wrong. On your series you are populating the page
> at a specific address for the ACPI tables separately to the RAM
> allocation. So you will shatter GB pages if the user provides 2048MB
> because the ACPI tables is accounted in the 2048MB.

And to be honest I am not convinced this was a well-selected address
(0xfc000000). I am actually thinking about moving it down (this may
require changing dsdt.asl). I don't know whether I will actually do it
in this version but it is certainly a possibility.

-boris

>
> The plan is to call setmaxmem with the size of the RAM + size of the
> ACPI tables. The RAM will still be 2048MB and therefore GB pages may be
> used.
>
Stefano Stabellini July 20, 2016, 5:28 p.m. UTC | #34
On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> On 07/20/2016 09:41 AM, Julien Grall wrote:
> >
> >
> > On 20/07/2016 14:33, Boris Ostrovsky wrote:
> >> On 07/20/2016 08:33 AM, Julien Grall wrote:
> >>> Hi,
> >>>
> >>> On 14/07/16 14:37, Stefano Stabellini wrote:
> >>>> On Wed, 13 Jul 2016, Julien Grall wrote:
> >>>>> Hello,
> >>>>>
> >>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
> >>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
> >>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
> >>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> >>>>>>> It will affect some others part of the guest if we don't increment
> >>>>>>> the
> >>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be
> >>>>>>> exposed
> >>>>>>> at a specific address that is outside of the guest RAM (see the
> >>>>>>> guest
> >>>>>>> memory layout in public/arch-arm.h).
> >>>>>>>
> >>>>>>> We chose this solution over putting in the RAM because the ACPI
> >>>>>>> tables
> >>>>>>> are not easily relocatable (compare to the device tree, initrd and
> >>>>>>> kernel) so we could not take advantage of superpage in both stage-2
> >>>>>>> (hypervisor) and stage-1 (kernel) page table.
> >>>>>>
> >>>>>> Maybe this is something ARM-specific then. For x86 we will want to
> >>>>>> keep
> >>>>>> maxmem unchanged.
> >>>>>
> >>>>> I don't think what I described in my previous mail is
> >>>>> ARM-specific. The
> >>>>> pressure will be more important on the TLBs, if Xen does not use
> >>>>> superpage in
> >>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
> >>>>>
> >>>>> IMHO, this seems to be a bigger drawback compared to adding a few more
> >>>>> kilobytes to
> >>>>> maxmem in the toolstack for the ACPI blob. You will lose them when
> >>>>> creating
> >>>>> the intermediate page table in any case.
> >>>>
> >>>> I agree with Julien. On ARM we have to increase maxmem because I don't
> >>>> think that shattering a superpage is acceptable for just a few KBs. In
> >>>> fact, it's not much about increasing maxmem, but it's about keeping
> >>>> the
> >>>> allocation of guest memory to the value passed by the user in
> >>>> "memory",
> >>>> so that it can be done in the most efficient way possible. (I am
> >>>> assuming users are going to allocate VMs of 2048MB, rather than
> >>>> 2049MB.)
> >>>>
> >>>> I wouldn't want to end up adding to the performance tuning page on the
> >>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
> >>>> most out of the system."
> >>>>
> >>>> I know that the location of the ACPI blob on x86 is different in guest
> >>>> memory space, but it seems to me that the problem would be the
> >>>> same. Do
> >>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think
> >>>> twice
> >>>> about this. Otherwise, if you only have 4K and 2MB allocations,
> >>>> then it
> >>>> might not make that much of a difference.
> >>>
> >>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
> >>>
> >>> Boris, do you have any opinions on this?
> >>
> >>
> >> I don't think I understand the superpage shattering argument.  In x86
> >> the tables live in regular RAM and a guest is free to use those
> >> addresses as regular memory.
> >>
> >> This apparently is different from how ARM manages the tables (you said
> >> in an earlier message that they are not part of RAM) so I can see that
> >> taking memory from RAM allocation to store the tables may affect how
> >> mapping is done, potentially causing GB pages to be broken.
> >>
> >> In fact (and I am totally speculating here) padding memory for x86 may
> >> actually *cause* shattering because we will have (for example) 2049MB of
> >> RAM to deal with.
> >
> > Correct me if I am wrong. On your series you are populating the page
> > at a specific address for the ACPI tables separately to the RAM
> > allocation. So you will shatter GB pages if the user provides 2048MB
> > because the ACPI tables is accounted in the 2048MB.
> 
> And to be honest I am not convinced this was a well selected address
> (0xfc000000). I am actually thinking about moving it down (this may
> require changing dsdt.asl). I don't know whether I will actually do it
> in this version but it is certainly a possibility.

I don't understand how this statement fits in the discussion.

The memory allocation for the ACPI blob is done by the toolstack
separately from the rest of guest memory, leading to two separate
stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
correct?
Boris Ostrovsky July 20, 2016, 7:51 p.m. UTC | #35
On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
> On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
>> On 07/20/2016 09:41 AM, Julien Grall wrote:
>>>
>>> On 20/07/2016 14:33, Boris Ostrovsky wrote:
>>>> On 07/20/2016 08:33 AM, Julien Grall wrote:
>>>>> Hi,
>>>>>
>>>>> On 14/07/16 14:37, Stefano Stabellini wrote:
>>>>>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>>>>>> Hello,
>>>>>>>
>>>>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>>>>> It will affect some others part of the guest if we don't increment
>>>>>>>>> the
>>>>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be
>>>>>>>>> exposed
>>>>>>>>> at a specific address that is outside of the guest RAM (see the
>>>>>>>>> guest
>>>>>>>>> memory layout in public/arch-arm.h).
>>>>>>>>>
>>>>>>>>> We chose this solution over putting in the RAM because the ACPI
>>>>>>>>> tables
>>>>>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>>>>> Maybe this is something ARM-specific then. For x86 we will want to
>>>>>>>> keep
>>>>>>>> maxmem unchanged.
>>>>>>> I don't think what I described in my previous mail is
>>>>>>> ARM-specific. The
>>>>>>> pressure will be more important on the TLBs, if Xen does not use
>>>>>>> superpage in
>>>>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>>>>>>
>>>>>>> IMHO, this seems to be a bigger drawback compared to adding a few more
>>>>>>> kilobytes to
>>>>>>> maxmem in the toolstack for the ACPI blob. You will lose them when
>>>>>>> creating
>>>>>>> the intermediate page table in any case.
>>>>>> I agree with Julien. On ARM we have to increase maxmem because I don't
>>>>>> think that shattering a superpage is acceptable for just a few KBs. In
>>>>>> fact, it's not much about increasing maxmem, but it's about keeping
>>>>>> the
>>>>>> allocation of guest memory to the value passed by the user in
>>>>>> "memory",
>>>>>> so that it can be done in the most efficient way possible. (I am
>>>>>> assuming users are going to allocate VMs of 2048MB, rather than
>>>>>> 2049MB.)
>>>>>>
>>>>>> I wouldn't want to end up adding to the performance tuning page on the
>>>>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
>>>>>> most out of the system."
>>>>>>
>>>>>> I know that the location of the ACPI blob on x86 is different in guest
>>>>>> memory space, but it seems to me that the problem would be the
>>>>>> same. Do
>>>>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think
>>>>>> twice
>>>>>> about this. Otherwise, if you only have 4K and 2MB allocations,
>>>>>> then it
>>>>>> might not make that much of a difference.
>>>>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
>>>>>
>>>>> Boris, do you have any opinions on this?
>>>>
>>>> I don't think I understand the superpage shattering argument.  In x86
>>>> the tables live in regular RAM and a guest is free to use those
>>>> addresses as regular memory.
>>>>
>>>> This apparently is different from how ARM manages the tables (you said
>>>> in an earlier message that they are not part of RAM) so I can see that
>>>> taking memory from RAM allocation to store the tables may affect how
>>>> mapping is done, potentially causing GB pages to be broken.
>>>>
>>>> In fact (and I am totally speculating here) padding memory for x86 may
>>>> actually *cause* shattering because we will have (for example) 2049MB of
>>>> RAM to deal with.
>>> Correct me if I am wrong. On your series you are populating the page
>>> at a specific address for the ACPI tables separately to the RAM
>>> allocation. So you will shatter GB pages if the user provides 2048MB
>>> because the ACPI tables is accounted in the 2048MB.
>> And to be honest I am not convinced this was a well selected address
>> (0xfc000000). I am actually thinking about moving it down (this may
>> require changing dsdt.asl). I don't know whether I will actually do it
>> in this version but it is certainly a possibility.
> I don't understand how this statement fits in the discussion.
>
> The memory allocation for the ACPI blob is done by the toolstack
> separately from the rest of guest memory, leading to two separate
> stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
> correct?


If I move the table lower into memory we won't have to do any extra
allocation. The memory will have been already allocated for the guest,
we just map it and copy the tables.
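
A rough sketch of that flow (illustrative only; it assumes the pages at
acpi_gfn were already populated as part of the normal RAM allocation,
which is exactly what makes an extra populate call and a maxmem bump
unnecessary):

#include <string.h>
#include <sys/mman.h>
#include <xenctrl.h>

static int copy_acpi_into_ram(xc_interface *xch, uint32_t domid,
                              unsigned long acpi_gfn,
                              const void *blob, size_t size)
{
    /* Map the already-populated guest frames and copy the blob in. */
    size_t map_size = (size + XC_PAGE_SIZE - 1) & ~(size_t)(XC_PAGE_SIZE - 1);
    void *va = xc_map_foreign_range(xch, domid, map_size,
                                    PROT_READ | PROT_WRITE, acpi_gfn);

    if ( va == NULL )
        return -1;

    memcpy(va, blob, size);
    munmap(va, map_size);
    return 0;
}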


-boris
Stefano Stabellini July 21, 2016, 5:53 p.m. UTC | #36
On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
> > On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> >> On 07/20/2016 09:41 AM, Julien Grall wrote:
> >>>
> >>> On 20/07/2016 14:33, Boris Ostrovsky wrote:
> >>>> On 07/20/2016 08:33 AM, Julien Grall wrote:
> >>>>> Hi,
> >>>>>
> >>>>> On 14/07/16 14:37, Stefano Stabellini wrote:
> >>>>>> On Wed, 13 Jul 2016, Julien Grall wrote:
> >>>>>>> Hello,
> >>>>>>>
> >>>>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
> >>>>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
> >>>>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
> >>>>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> >>>>>>>>> It will affect some others part of the guest if we don't increment
> >>>>>>>>> the
> >>>>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be
> >>>>>>>>> exposed
> >>>>>>>>> at a specific address that is outside of the guest RAM (see the
> >>>>>>>>> guest
> >>>>>>>>> memory layout in public/arch-arm.h).
> >>>>>>>>>
> >>>>>>>>> We chose this solution over putting in the RAM because the ACPI
> >>>>>>>>> tables
> >>>>>>>>> are not easily relocatable (compare to the device tree, initrd and
> >>>>>>>>> kernel) so we could not take advantage of superpage in both stage-2
> >>>>>>>>> (hypervisor) and stage-1 (kernel) page table.
> >>>>>>>> Maybe this is something ARM-specific then. For x86 we will want to
> >>>>>>>> keep
> >>>>>>>> maxmem unchanged.
> >>>>>>> I don't think what I described in my previous mail is
> >>>>>>> ARM-specific. The
> >>>>>>> pressure will be more important on the TLBs, if Xen does not use
> >>>>>>> superpage in
> >>>>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
> >>>>>>>
> >>>>>>> IMHO, this seems to be a bigger drawback compared to adding a few more
> >>>>>>> kilobytes to
> >>>>>>> maxmem in the toolstack for the ACPI blob. You will lose them when
> >>>>>>> creating
> >>>>>>> the intermediate page table in any case.
> >>>>>> I agree with Julien. On ARM we have to increase maxmem because I don't
> >>>>>> think that shattering a superpage is acceptable for just a few KBs. In
> >>>>>> fact, it's not much about increasing maxmem, but it's about keeping
> >>>>>> the
> >>>>>> allocation of guest memory to the value passed by the user in
> >>>>>> "memory",
> >>>>>> so that it can be done in the most efficient way possible. (I am
> >>>>>> assuming users are going to allocate VMs of 2048MB, rather than
> >>>>>> 2049MB.)
> >>>>>>
> >>>>>> I wouldn't want to end up adding to the performance tuning page on the
> >>>>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
> >>>>>> most out of the system."
> >>>>>>
> >>>>>> I know that the location of the ACPI blob on x86 is different in guest
> >>>>>> memory space, but it seems to me that the problem would be the
> >>>>>> same. Do
> >>>>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think
> >>>>>> twice
> >>>>>> about this. Otherwise, if you only have 4K and 2MB allocations,
> >>>>>> then it
> >>>>>> might not make that much of a difference.
> >>>>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
> >>>>>
> >>>>> Boris, do you have any opinions on this?
> >>>>
> >>>> I don't think I understand the superpage shattering argument.  In x86
> >>>> the tables live in regular RAM and a guest is free to use those
> >>>> addresses as regular memory.
> >>>>
> >>>> This apparently is different from how ARM manages the tables (you said
> >>>> in an earlier message that they are not part of RAM) so I can see that
> >>>> taking memory from RAM allocation to store the tables may affect how
> >>>> mapping is done, potentially causing GB pages to be broken.
> >>>>
> >>>> In fact (and I am totally speculating here) padding memory for x86 may
> >>>> actually *cause* shattering because we will have (for example) 2049MB of
> >>>> RAM to deal with.
> >>> Correct me if I am wrong. On your series you are populating the page
> >>> at a specific address for the ACPI tables separately to the RAM
> >>> allocation. So you will shatter GB pages if the user provides 2048MB
> >>> because the ACPI tables is accounted in the 2048MB.
> >> And to be honest I am not convinced this was a well selected address
> >> (0xfc000000). I am actually thinking about moving it down (this may
> >> require changing dsdt.asl). I don't know whether I will actually do it
> >> in this version but it is certainly a possibility.
> > I don't understand how this statement fits in the discussion.
> >
> > The memory allocation for the ACPI blob is done by the toolstack
> > separately from the rest of guest memory, leading to two separate
> > stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
> > correct?
> 
> 
> If I move the table lower into memory we won't have to do any extra
> allocation. The memory will have been already allocated for the guest,
> we just map it and copy the tables.

I see, thanks for the explanation. I think this could work for ARM too
and should avoid the stage-2 shattering issue described above.

Julien, what do you think? I agree that having the ACPI blob separate
would be cleaner, but using the same allocation scheme as x86 is
important.
Julien Grall July 21, 2016, 6:23 p.m. UTC | #37
Hi Stefano,

On 21/07/16 18:53, Stefano Stabellini wrote:
> On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
>> On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
>>> On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
>>>> On 07/20/2016 09:41 AM, Julien Grall wrote:
>>>>>
>>>>> On 20/07/2016 14:33, Boris Ostrovsky wrote:
>>>>>> On 07/20/2016 08:33 AM, Julien Grall wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 14/07/16 14:37, Stefano Stabellini wrote:
>>>>>>>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>>>>>>>> Hello,
>>>>>>>>>
>>>>>>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>>>>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>>>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>>>>>>> It will affect some others part of the guest if we don't increment
>>>>>>>>>>> the
>>>>>>>>>>> "maxmem" requested by the user. For ARM the ACPI blob will be
>>>>>>>>>>> exposed
>>>>>>>>>>> at a specific address that is outside of the guest RAM (see the
>>>>>>>>>>> guest
>>>>>>>>>>> memory layout in public/arch-arm.h).
>>>>>>>>>>>
>>>>>>>>>>> We chose this solution over putting in the RAM because the ACPI
>>>>>>>>>>> tables
>>>>>>>>>>> are not easily relocatable (compare to the device tree, initrd and
>>>>>>>>>>> kernel) so we could not take advantage of superpage in both stage-2
>>>>>>>>>>> (hypervisor) and stage-1 (kernel) page table.
>>>>>>>>>> Maybe this is something ARM-specific then. For x86 we will want to
>>>>>>>>>> keep
>>>>>>>>>> maxmem unchanged.
>>>>>>>>> I don't think what I described in my previous mail is
>>>>>>>>> ARM-specific. The
>>>>>>>>> pressure will be more important on the TLBs, if Xen does not use
>>>>>>>>> superpage in
>>>>>>>>> the stage 2 page tables (i.e EPT for x86) no matter the architecture.
>>>>>>>>>
>>>>>>>>> IMHO, this seems to be a bigger drawback compared to adding a few
>>>>>>>>> more
>>>>>>>>> kilobytes to
>>>>>>>>> maxmem in the toolstack for the ACPI blob. You will lose
>>>>>>>>> the intermediate page table in any case.
>>>>>>>> I agree with Julien. On ARM we have to increase maxmem because I don't
>>>>>>>> think that shattering a superpage is acceptable for just a few KBs. In
>>>>>>>> fact, it's not much about increasing maxmem, but it's about keeping
>>>>>>>> the
>>>>>>>> allocation of guest memory to the value passed by the user in
>>>>>>>> "memory",
>>>>>>>> so that it can be done in the most efficient way possible. (I am
>>>>>>>> assuming users are going to allocate VMs of 2048MB, rather than
>>>>>>>> 2049MB.)
>>>>>>>>
>>>>>>>> I wouldn't want to end up adding to the performance tuning page on the
>>>>>>>> wiki "Make sure to add 1 more MB to the memory of your VM to get the
>>>>>>>> most out of the system."
>>>>>>>>
>>>>>>>> I know that the location of the ACPI blob on x86 is different in guest
>>>>>>>> memory space, but it seems to me that the problem would be the
>>>>>>>> same. Do
>>>>>>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would think
>>>>>>>> twice
>>>>>>>> about this. Otherwise, if you only have 4K and 2MB allocations,
>>>>>>>> then it
>>>>>>>> might not make that much of a difference.
>>>>>>> Looking at the x86 code, 1 gigabyte pages seems to be supported.
>>>>>>>
>>>>>>> Boris, do you have any opinions on this?
>>>>>>
>>>>>> I don't think I understand the superpage shattering argument.  In x86
>>>>>> the tables live in regular RAM and a guest is free to use those
>>>>>> addresses as regular memory.
>>>>>>
>>>>>> This apparently is different from how ARM manages the tables (you said
>>>>>> in an earlier message that they are not part of RAM) so I can see that
>>>>>> taking memory from RAM allocation to store the tables may affect how
>>>>>> mapping is done, potentially causing GB pages to be broken.
>>>>>>
>>>>>> In fact (and I am totally speculating here) padding memory for x86 may
>>>>>> actually *cause* shattering because we will have (for example) 2049MB of
>>>>>> RAM to deal with.
>>>>> Correct me if I am wrong. On your series you are populating the page
>>>>> at a specific address for the ACPI tables separately to the RAM
>>>>> allocation. So you will shatter GB pages if the user provides 2048MB
>>>>> because the ACPI tables is accounted in the 2048MB.
>>>> And to be honest I am not convinced this was a well selected address
>>>> (0xfc000000). I am actually thinking about moving it down (this may
>>>> require changing dsdt.asl). I don't know whether I will actually do it
>>>> in this version but it is certainly a possibility.
>>> I don't understand how this statement fits in the discussion.
>>>
>>> The memory allocation for the ACPI blob is done by the toolstack
>>> separately from the rest of guest memory, leading to two separate
>>> stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
>>> correct?
>>
>>
>> If I move the table lower into memory we won't have to do any extra
>> allocation. The memory will have been already allocated for the guest,
>> we just map it and copy the tables.
>
> I see, thanks for the explanation. I think this could work for ARM too
> and should avoid the stage-2 shattering issue described above.

But you will end up with a stage-1 shattering issue if you put the ACPI 
tables lower in the guest RAM, reducing the overall performance. That is 
why I first asked Shannon to put the ACPI tables outside of the guest RAM.

> Julien, what do you think? I agree that having the ACPI blob separate
> would be cleaner, but using the same allocation scheme as x86 is
> important.

While I agree that having the same scheme is important, I care a lot 
more about performance.

So far, I have performance concerns about all of Boris's suggestions. I 
believe that the impact is the same on x86. However, it seems that it is 
not important to the x86 folks.

I think the scheme I suggested is the best one, because it will maximize 
the theoretical performance of the guests (a guest is free not to use 
superpages).

I could be convinced to put the ACPI tables at the end of the guest RAM, 
although this would require more code in the toolstack because the ACPI 
base address will not be static anymore.
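
For example (sketch only; 'ram0_size' is an assumed parameter, and
GUEST_RAM0_BASE comes from public/arch-arm.h):

/* Illustrative only: derive a non-static ACPI base right after the
 * first RAM bank instead of using the fixed GUEST_ACPI_BASE.  The
 * computed address would then have to be advertised to whatever
 * consumes it (UEFI, the device tree, ...) rather than hard-coded. */
static uint64_t acpi_base_after_ram(uint64_t ram0_size)
{
    return GUEST_RAM0_BASE + ram0_size;
}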

Regards,
Stefano Stabellini July 21, 2016, 6:54 p.m. UTC | #38
On Thu, 21 Jul 2016, Julien Grall wrote:
> Hi Stefano,
> 
> On 21/07/16 18:53, Stefano Stabellini wrote:
> > On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> > > On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
> > > > On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> > > > > On 07/20/2016 09:41 AM, Julien Grall wrote:
> > > > > > 
> > > > > > On 20/07/2016 14:33, Boris Ostrovsky wrote:
> > > > > > > On 07/20/2016 08:33 AM, Julien Grall wrote:
> > > > > > > > Hi,
> > > > > > > > 
> > > > > > > > On 14/07/16 14:37, Stefano Stabellini wrote:
> > > > > > > > > On Wed, 13 Jul 2016, Julien Grall wrote:
> > > > > > > > > > Hello,
> > > > > > > > > > 
> > > > > > > > > > On 12/07/2016 17:58, Boris Ostrovsky wrote:
> > > > > > > > > > > On 07/12/2016 12:10 PM, Julien Grall wrote:
> > > > > > > > > > > > On 12/07/2016 16:08, Boris Ostrovsky wrote:
> > > > > > > > > > > > > On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> > > > > > > > > > > > It will affect some others part of the guest if we don't
> > > > > > > > > > > > increment
> > > > > > > > > > > > the
> > > > > > > > > > > > "maxmem" requested by the user. For ARM the ACPI blob
> > > > > > > > > > > > will be
> > > > > > > > > > > > exposed
> > > > > > > > > > > > at a specific address that is outside of the guest RAM
> > > > > > > > > > > > (see the
> > > > > > > > > > > > guest
> > > > > > > > > > > > memory layout in public/arch-arm.h).
> > > > > > > > > > > > 
> > > > > > > > > > > > We chose this solution over putting in the RAM because
> > > > > > > > > > > > the ACPI
> > > > > > > > > > > > tables
> > > > > > > > > > > > are not easily relocatable (compare to the device tree,
> > > > > > > > > > > > initrd and
> > > > > > > > > > > > kernel) so we could not take advantage of superpage in
> > > > > > > > > > > > both stage-2
> > > > > > > > > > > > (hypervisor) and stage-1 (kernel) page table.
> > > > > > > > > > > Maybe this is something ARM-specific then. For x86 we will
> > > > > > > > > > > want to
> > > > > > > > > > > keep
> > > > > > > > > > > maxmem unchanged.
> > > > > > > > > > I don't think what I described in my previous mail is
> > > > > > > > > > ARM-specific. The
> > > > > > > > > > pressure will be more important on the TLBs, if Xen does not
> > > > > > > > > > use
> > > > > > > > > > superpage in
> > > > > > > > > > the stage 2 page tables (i.e EPT for x86) no matter the
> > > > > > > > > > architecture.
> > > > > > > > > > 
> > > > > > > > > > IMHO, this seems to be a bigger drawback compared to adding a few
> > > > > > > > > > more
> > > > > > > > > > kilobytes to
> > > > > > > > > > maxmem in the toolstack for the ACPI blob. You will lose
> > > > > > > > > > them when
> > > > > > > > > > creating
> > > > > > > > > > the intermediate page table in any case.
> > > > > > > > > I agree with Julien. On ARM we have to increase maxmem because
> > > > > > > > > I don't
> > > > > > > > > think that shattering a superpage is acceptable for just a few
> > > > > > > > > KBs. In
> > > > > > > > > fact, it's not much about increasing maxmem, but it's about
> > > > > > > > > keeping
> > > > > > > > > the
> > > > > > > > > allocation of guest memory to the value passed by the user in
> > > > > > > > > "memory",
> > > > > > > > > so that it can be done in the most efficient way possible. (I
> > > > > > > > > am
> > > > > > > > > assuming users are going to allocate VMs of 2048MB, rather
> > > > > > > > > than
> > > > > > > > > 2049MB.)
> > > > > > > > > 
> > > > > > > > > I wouldn't want to end up adding to the performance tuning
> > > > > > > > > page on the
> > > > > > > > > wiki "Make sure to add 1 more MB to the memory of your VM to
> > > > > > > > > get the
> > > > > > > > > most out of the system."
> > > > > > > > > 
> > > > > > > > > I know that the location of the ACPI blob on x86 is different
> > > > > > > > > in guest
> > > > > > > > > memory space, but it seems to me that the problem would be the
> > > > > > > > > same. Do
> > > > > > > > > you have 1 gigabyte pages in stage-2 on x86? If so, I would
> > > > > > > > > think
> > > > > > > > > twice
> > > > > > > > > about this. Otherwise, if you only have 4K and 2MB
> > > > > > > > > allocations,
> > > > > > > > > then it
> > > > > > > > > might not make that much of a difference.
> > > > > > > > Looking at the x86 code, 1 gigabyte pages seems to be supported.
> > > > > > > > 
> > > > > > > > Boris, do you have any opinions on this?
> > > > > > > 
> > > > > > > I don't think I understand the superpage shattering argument.  In
> > > > > > > x86
> > > > > > > the tables live in regular RAM and a guest is free to use those
> > > > > > > addresses as regular memory.
> > > > > > > 
> > > > > > > This apparently is different from how ARM manages the tables (you
> > > > > > > said
> > > > > > > in an earlier message that they are not part of RAM) so I can see
> > > > > > > that
> > > > > > > taking memory from RAM allocation to store the tables may affect
> > > > > > > how
> > > > > > > mapping is done, potentially causing GB pages to be broken.
> > > > > > > 
> > > > > > > In fact (and I am totally speculating here) padding memory for x86
> > > > > > > may
> > > > > > > actually *cause* shattering because we will have (for example)
> > > > > > > 2049MB of
> > > > > > > RAM to deal with.
> > > > > > Correct me if I am wrong. On your series you are populating the page
> > > > > > at a specific address for the ACPI tables separately to the RAM
> > > > > > allocation. So you will shatter GB pages if the user provides 2048MB
> > > > > > because the ACPI tables is accounted in the 2048MB.
> > > > > And to be honest I am not convinced this was a well selected address
> > > > > (0xfc000000). I am actually thinking about moving it down (this may
> > > > > require changing dsdt.asl). I don't know whether I will actually do it
> > > > > in this version but it is certainly a possibility.
> > > > I don't understand how this statement fits in the discussion.
> > > > 
> > > > The memory allocation for the ACPI blob is done by the toolstack
> > > > separately from the rest of guest memory, leading to two separate
> > > > stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
> > > > correct?
> > > 
> > > 
> > > If I move the table lower into memory we won't have to do any extra
> > > allocation. The memory will have been already allocated for the guest,
> > > we just map it and copy the tables.
> > 
> > I see, thanks for the explanation. I think this could work for ARM too
> > and should avoid the stage-2 shattering issue described above.
> 
> But you will end up to have stage-1 shattering issue if you put the ACPI
> tables lower into the guest RAM, reducing the overall performance. It is why I
> first asked Shannon to put the ACPI outside of the guest RAM.

I am not sure about this actually: even with the ACPI blob in the middle
of guest memory, the guest OS could still use a single superpage for its
own stage-1 memory mappings. I don't know if Linux does it that way, but
it should be possible.
Julien Grall July 21, 2016, 7:14 p.m. UTC | #39
On 21/07/2016 19:54, Stefano Stabellini wrote:
> On Thu, 21 Jul 2016, Julien Grall wrote:
>> Hi Stefano,
>>
>> On 21/07/16 18:53, Stefano Stabellini wrote:
>>> On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
>>>> On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
>>>>> On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
>>>>>> On 07/20/2016 09:41 AM, Julien Grall wrote:
>>>>>>>
>>>>>>> On 20/07/2016 14:33, Boris Ostrovsky wrote:
>>>>>>>> On 07/20/2016 08:33 AM, Julien Grall wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> On 14/07/16 14:37, Stefano Stabellini wrote:
>>>>>>>>>> On Wed, 13 Jul 2016, Julien Grall wrote:
>>>>>>>>>>> Hello,
>>>>>>>>>>>
>>>>>>>>>>> On 12/07/2016 17:58, Boris Ostrovsky wrote:
>>>>>>>>>>>> On 07/12/2016 12:10 PM, Julien Grall wrote:
>>>>>>>>>>>>> On 12/07/2016 16:08, Boris Ostrovsky wrote:
>>>>>>>>>>>>>> On 07/12/2016 10:57 AM, Shannon Zhao wrote:
>>>>>>>>>>>>> It will affect some other parts of the guest if we don't
>>>>>>>>>>>>> increment
>>>>>>>>>>>>> the
>>>>>>>>>>>>> "maxmem" requested by the user. For ARM the ACPI blob
>>>>>>>>>>>>> will be
>>>>>>>>>>>>> exposed
>>>>>>>>>>>>> at a specific address that is outside of the guest RAM
>>>>>>>>>>>>> (see the
>>>>>>>>>>>>> guest
>>>>>>>>>>>>> memory layout in public/arch-arm.h).
>>>>>>>>>>>>>
>>>>>>>>>>>>> We chose this solution over putting it in the RAM because
>>>>>>>>>>>>> the ACPI tables
>>>>>>>>>>>>> are not easily relocatable (compared to the device tree,
>>>>>>>>>>>>> initrd and kernel) so we could not take advantage of
>>>>>>>>>>>>> superpages in both stage-2 (hypervisor) and stage-1
>>>>>>>>>>>>> (kernel) page tables.
>>>>>>>>>>>> Maybe this is something ARM-specific then. For x86 we will
>>>>>>>>>>>> want to
>>>>>>>>>>>> keep
>>>>>>>>>>>> maxmem unchanged.
>>>>>>>>>>> I don't think what I described in my previous mail is
>>>>>>>>>>> ARM-specific. The pressure on the TLBs will be higher if Xen
>>>>>>>>>>> does not use superpages in the stage-2 page tables (i.e. EPT
>>>>>>>>>>> for x86), no matter the architecture.
>>>>>>>>>>>
>>>>>>>>>>> IMHO, this seems to be a bigger drawback compared to adding a
>>>>>>>>>>> few more kilobytes to maxmem in the toolstack for the ACPI
>>>>>>>>>>> blob. You will lose them when creating the intermediate page
>>>>>>>>>>> tables in any case.
>>>>>>>>>> I agree with Julien. On ARM we have to increase maxmem because
>>>>>>>>>> I don't
>>>>>>>>>> think that shattering a superpage is acceptable for just a few
>>>>>>>>>> KBs. In
>>>>>>>>>> fact, it's not much about increasing maxmem, but it's about
>>>>>>>>>> keeping
>>>>>>>>>> the
>>>>>>>>>> allocation of guest memory to the value passed by the user in
>>>>>>>>>> "memory",
>>>>>>>>>> so that it can be done in the most efficient way possible. (I
>>>>>>>>>> am
>>>>>>>>>> assuming users are going to allocate VMs of 2048MB, rather
>>>>>>>>>> than
>>>>>>>>>> 2049MB.)
>>>>>>>>>>
>>>>>>>>>> I wouldn't want to end up adding to the performance tuning
>>>>>>>>>> page on the
>>>>>>>>>> wiki "Make sure to add 1 more MB to the memory of your VM to
>>>>>>>>>> get the
>>>>>>>>>> most out of the system."
>>>>>>>>>>
>>>>>>>>>> I know that the location of the ACPI blob on x86 is different
>>>>>>>>>> in guest
>>>>>>>>>> memory space, but it seems to me that the problem would be the
>>>>>>>>>> same. Do
>>>>>>>>>> you have 1 gigabyte pages in stage-2 on x86? If so, I would
>>>>>>>>>> think
>>>>>>>>>> twice
>>>>>>>>>> about this. Otherwise, if you only have 4K and 2MB
>>>>>>>>>> allocations,
>>>>>>>>>> then it
>>>>>>>>>> might not make that much of a difference.
>>>>>>>>> Looking at the x86 code, 1 gigabyte pages seem to be supported.
>>>>>>>>>
>>>>>>>>> Boris, do you have any opinions on this?
>>>>>>>>
>>>>>>>> I don't think I understand the superpage shattering argument.  In
>>>>>>>> x86
>>>>>>>> the tables live in regular RAM and a guest is free to use those
>>>>>>>> addresses as regular memory.
>>>>>>>>
>>>>>>>> This apparently is different from how ARM manages the tables (you
>>>>>>>> said
>>>>>>>> in an earlier message that they are not part of RAM) so I can see
>>>>>>>> that
>>>>>>>> taking memory from RAM allocation to store the tables may affect
>>>>>>>> how
>>>>>>>> mapping is done, potentially causing GB pages to be broken.
>>>>>>>>
>>>>>>>> In fact (and I am totally speculating here) padding memory for x86
>>>>>>>> may
>>>>>>>> actually *cause* shattering because we will have (for example)
>>>>>>>> 2049MB of
>>>>>>>> RAM to deal with.
>>>>>>> Correct me if I am wrong. On your series you are populating the page
>>>>>>> at a specific address for the ACPI tables separately from the RAM
>>>>>>> allocation. So you will shatter GB pages if the user provides 2048MB
>>>>>>> because the ACPI tables are accounted in the 2048MB.
>>>>>> And to be honest I am not convinced this was a well selected address
>>>>>> (0xfc000000). I am actually thinking about moving it down (this may
>>>>>> require changing dsdt.asl). I don't know whether I will actually do it
>>>>>> in this version but it is certainly a possibility.
>>>>> I don't understand how this statement fits in the discussion.
>>>>>
>>>>> The memory allocation for the ACPI blob is done by the toolstack
>>>>> separately from the rest of guest memory, leading to two separate
>>>>> stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
>>>>> correct?
>>>>
>>>>
>>>> If I move the table lower into memory we won't have to do any extra
>>>> allocation. The memory will have been already allocated for the guest,
>>>> we just map it and copy the tables.
>>>
>>> I see, thanks for the explanation. I think this could work for ARM too
>>> and should avoid the stage-2 shattering issue described above.
>>
>> But you will end up with a stage-1 shattering issue if you put the ACPI
>> tables lower into the guest RAM, reducing the overall performance. That is
>> why I first asked Shannon to put the ACPI tables outside of the guest RAM.
>
> I am not sure about this actually: even with the ACPI blob in the middle
> of guest memory, the guest OS could still use a single superpage for its
> own stage-1 memory mappings. I don't know if Linux does it that way, but
> it should be possible.

You are assuming that the guest will map the ACPI blob with the same 
attributes as the rest of the superpage.

IMHO, a sane operating system will want to map the ACPI blob read-only.

Regards,
Stefano Stabellini July 21, 2016, 9:15 p.m. UTC | #40
On Thu, 21 Jul 2016, Julien Grall wrote:
> On 21/07/2016 19:54, Stefano Stabellini wrote:
> > On Thu, 21 Jul 2016, Julien Grall wrote:
> > > Hi Stefano,
> > > 
> > > On 21/07/16 18:53, Stefano Stabellini wrote:
> > > > On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> > > > > On 07/20/2016 01:28 PM, Stefano Stabellini wrote:
> > > > > > On Wed, 20 Jul 2016, Boris Ostrovsky wrote:
> > > > > > > On 07/20/2016 09:41 AM, Julien Grall wrote:
> > > > > > > > 
> > > > > > > > On 20/07/2016 14:33, Boris Ostrovsky wrote:
> > > > > > > > > On 07/20/2016 08:33 AM, Julien Grall wrote:
> > > > > > > > > > Hi,
> > > > > > > > > > 
> > > > > > > > > > On 14/07/16 14:37, Stefano Stabellini wrote:
> > > > > > > > > > > On Wed, 13 Jul 2016, Julien Grall wrote:
> > > > > > > > > > > > Hello,
> > > > > > > > > > > > 
> > > > > > > > > > > > On 12/07/2016 17:58, Boris Ostrovsky wrote:
> > > > > > > > > > > > > On 07/12/2016 12:10 PM, Julien Grall wrote:
> > > > > > > > > > > > > > On 12/07/2016 16:08, Boris Ostrovsky wrote:
> > > > > > > > > > > > > > > On 07/12/2016 10:57 AM, Shannon Zhao wrote:
> > > > > > > > > > > > > > It will affect some other parts of the guest if we
> > > > > > > > > > > > > > don't
> > > > > > > > > > > > > > increment
> > > > > > > > > > > > > > the
> > > > > > > > > > > > > > "maxmem" requested by the user. For ARM the ACPI
> > > > > > > > > > > > > > blob
> > > > > > > > > > > > > > will be
> > > > > > > > > > > > > > exposed
> > > > > > > > > > > > > > at a specific address that is outside of the guest
> > > > > > > > > > > > > > RAM
> > > > > > > > > > > > > > (see the
> > > > > > > > > > > > > > guest
> > > > > > > > > > > > > > memory layout in public/arch-arm.h).
> > > > > > > > > > > > > > 
> > > > > > > > > > > > > > We chose this solution over putting it in the
> > > > > > > > > > > > > > RAM because the ACPI tables
> > > > > > > > > > > > > > are not easily relocatable (compared to the
> > > > > > > > > > > > > > device tree, initrd and kernel) so we could not
> > > > > > > > > > > > > > take advantage of superpages in both stage-2
> > > > > > > > > > > > > > (hypervisor) and stage-1 (kernel) page tables.
> > > > > > > > > > > > > Maybe this is something ARM-specific then. For x86 we
> > > > > > > > > > > > > will
> > > > > > > > > > > > > want to
> > > > > > > > > > > > > keep
> > > > > > > > > > > > > maxmem unchanged.
> > > > > > > > > > > > I don't think what I described in my previous mail is
> > > > > > > > > > > > ARM-specific. The pressure on the TLBs will be higher
> > > > > > > > > > > > if Xen does not use superpages in the stage-2 page
> > > > > > > > > > > > tables (i.e. EPT for x86), no matter the architecture.
> > > > > > > > > > > >
> > > > > > > > > > > > IMHO, this seems to be a bigger drawback compared to
> > > > > > > > > > > > adding a few more kilobytes to maxmem in the toolstack
> > > > > > > > > > > > for the ACPI blob. You will lose them when creating
> > > > > > > > > > > > the intermediate page tables in any case.
> > > > > > > > > > > I agree with Julien. On ARM we have to increase maxmem
> > > > > > > > > > > because
> > > > > > > > > > > I don't
> > > > > > > > > > > think that shattering a superpage is acceptable for just a
> > > > > > > > > > > few
> > > > > > > > > > > KBs. In
> > > > > > > > > > > fact, it's not much about increasing maxmem, but it's
> > > > > > > > > > > about
> > > > > > > > > > > keeping
> > > > > > > > > > > the
> > > > > > > > > > > allocation of guest memory to the value passed by the user
> > > > > > > > > > > in
> > > > > > > > > > > "memory",
> > > > > > > > > > > so that it can be done in the most efficient way possible.
> > > > > > > > > > > (I
> > > > > > > > > > > am
> > > > > > > > > > > assuming users are going to allocate VMs of 2048MB, rather
> > > > > > > > > > > than
> > > > > > > > > > > 2049MB.)
> > > > > > > > > > > 
> > > > > > > > > > > I wouldn't want to end up adding to the performance tuning
> > > > > > > > > > > page on the
> > > > > > > > > > > wiki "Make sure to add 1 more MB to the memory of your VM
> > > > > > > > > > > to
> > > > > > > > > > > get the
> > > > > > > > > > > most out of the system."
> > > > > > > > > > > 
> > > > > > > > > > > I know that the location of the ACPI blob on x86 is
> > > > > > > > > > > different
> > > > > > > > > > > in guest
> > > > > > > > > > > memory space, but it seems to me that the problem would be
> > > > > > > > > > > the
> > > > > > > > > > > same. Do
> > > > > > > > > > > you have 1 gigabyte pages in stage-2 on x86? If so, I
> > > > > > > > > > > would
> > > > > > > > > > > think
> > > > > > > > > > > twice
> > > > > > > > > > > about this. Otherwise, if you only have 4K and 2MB
> > > > > > > > > > > allocations,
> > > > > > > > > > > then it
> > > > > > > > > > > might not make that much of a difference.
> > > > > > > > > > Looking at the x86 code, 1 gigabyte pages seem to be
> > > > > > > > > > supported.
> > > > > > > > > > 
> > > > > > > > > > Boris, do you have any opinions on this?
> > > > > > > > > 
> > > > > > > > > I don't think I understand the superpage shattering argument.
> > > > > > > > > In
> > > > > > > > > x86
> > > > > > > > > the tables live in regular RAM and a guest is free to use
> > > > > > > > > those
> > > > > > > > > addresses as regular memory.
> > > > > > > > > 
> > > > > > > > > This apparently is different from how ARM manages the tables
> > > > > > > > > (you
> > > > > > > > > said
> > > > > > > > > in an earlier message that they are not part of RAM) so I can
> > > > > > > > > see
> > > > > > > > > that
> > > > > > > > > taking memory from RAM allocation to store the tables may
> > > > > > > > > affect
> > > > > > > > > how
> > > > > > > > > mapping is done, potentially causing GB pages to be broken.
> > > > > > > > > 
> > > > > > > > > In fact (and I am totally speculating here) padding memory for
> > > > > > > > > x86
> > > > > > > > > may
> > > > > > > > > actually *cause* shattering because we will have (for example)
> > > > > > > > > 2049MB of
> > > > > > > > > RAM to deal with.
> > > > > > > > Correct me if I am wrong. On your series you are populating the
> > > > > > > > page
> > > > > > > > at a specific address for the ACPI tables separately from the
> > > > > > > > RAM allocation. So you will shatter GB pages if the user
> > > > > > > > provides 2048MB because the ACPI tables are accounted in the
> > > > > > > > 2048MB.
> > > > > > > And to be honest I am not convinced this was a well selected
> > > > > > > address
> > > > > > > (0xfc000000). I am actually thinking about moving it down (this
> > > > > > > may
> > > > > > > require changing dsdt.asl). I don't know whether I will actually
> > > > > > > do it
> > > > > > > in this version but it is certainly a possibility.
> > > > > > I don't understand how this statement fits in the discussion.
> > > > > > 
> > > > > > The memory allocation for the ACPI blob is done by the toolstack
> > > > > > separately from the rest of guest memory, leading to two separate
> > > > > > stage-2 pagetable allocations of less than 1-gigabyte pages. Is that
> > > > > > correct?
> > > > > 
> > > > > 
> > > > > If I move the table lower into memory we won't have to do any extra
> > > > > allocation. The memory will have been already allocated for the guest,
> > > > > we just map it and copy the tables.
> > > > 
> > > > I see, thanks for the explanation. I think this could work for ARM too
> > > > and should avoid the stage-2 shattering issue described above.
> > > 
> > > But you will end up with a stage-1 shattering issue if you put the ACPI
> > > tables lower into the guest RAM, reducing the overall performance. That
> > > is why I first asked Shannon to put the ACPI tables outside of the guest
> > > RAM.
> > 
> > I am not sure about this actually: even with the ACPI blob in the middle
> > of guest memory, the guest OS could still use a single superpage for its
> > own stage-1 memory mappings. I don't know if Linux does it that way, but
> > it should be possible.
> 
> You are assuming that the guest will map the ACPI blob with the same
> attributes as the rest of the superpage.
> 
> IMHO, a sane operating system will want to map the ACPI blob read-only.

That's true. But there are other things which might be mapped
differently and could shatter a stage-1 superpage mapping (especially on
x86 that has a much more complex memory map than ARM). Obviously adding
one more is not doing it any good, but it might not make a difference in
practice.

Anyway, I agree with Julien that his suggestion is the best for ARM. If
the libxl maintainers are willing to accept two different code paths for
this on ARM and x86, then I am fine with it too.
Shannon Zhao July 25, 2016, 7:56 a.m. UTC | #41
On 2016/7/20 17:32, Wei Liu wrote:
> On Wed, Jul 20, 2016 at 02:52:05PM +0800, Shannon Zhao wrote:
>> > 
>> > 
>> > On 2016/7/19 18:38, Wei Liu wrote:
>>> > > On Fri, Jul 15, 2016 at 05:39:32PM +0800, Shannon Zhao wrote:
>>> > > [...]
>>>>>>> > >>> > > 
>>>>>>> > >>> > > It would be trivial to have another option in xl.cfg to allow MB
>>>>>>> > >>> > > granularity. But I don't think that's a good idea. Asking for more
>>>>>>> > >>> > > memory when you don't really know how much is enough is not very useful.
>>>>>>> > >>> > > If an admin can know how much is needed, surely the library can be
>>>>>>> > >>> > > taught to obtain that knowledge, too.
>>>>>>> > >>> > > 
>>>>>>> > >>> > > We need to decide which model we should go with. And, if we decide to
>>>>>>> > >>> > > diverge, document the difference between x86 and ARM model.
>>>>>>> > >>> > > 
>>>>> > >> > Hi Wei,
>>>>> > >> > 
>>>>> > >> > Do you decide how to add the size of ACPI blob to max_memkb?
>>>>> > >> > 
>>> > > AFAICT ARM and x86 maintainers hold different opinions on how memory
>>> > > should be accounted.
>>> > > 
>>> > > I would like to have a unified memory accounting model. But if we can't
>>> > > have that at the moment, I'm fine with divergence, but please document
>>> > > it somewhere (comment near code snippet, in header, or a file under docs
>>> > > etc). And the amount added to max_memkb needs to be properly calculated,
>>> > > not some magic number, so that we have a chance in the future to
>>> > > confidently change how we do thing.
>> > If it's only allowed to add the size to max_memkb in
>> > libxl__domain_build_info_setdefault(), it can only use a constant
>> > number since the tables are not generated yet and we don't know the
>> > size. But the constant could be chosen properly since we know the
>> > maximum ACPI table size of the current ARM approach.
>> > 
>> > But maybe in the future, if we add some new ACPI tables, it should
>> > increase the size as well. So I think this should be documented.
>> > 
>> > As I asked before, is it ok to add the size to max_memkb after
>> > generating the ACPI tables and before loading the tables to guest space?
>> > 
> Yes. I don't think shoehorning everything into setdefault is a good
> idea.
> 
> I think libxl_arm.c:libxl__arch_domain_create would be the right place
> to do it.
> 
> I am thinking about calling xc_domain_setmaxmem there, but not adding a
> number to d_config->b_info.max_memkb. Adding that to ->max_memkb would
> be wrong because the bigger ->max_memkb will be recorded and the same
> algorithm will be applied every time you migrate your guest, so the
> max_memkb will grow bigger and bigger.
> 
> Given the different approach taken by ARM and x86, maybe we need to also
> record the size of acpi blobs somewhere in xenstore (also needs to be
> documented) so that subsequent libxl_domain_setmaxmem can extract that
> number again.
> 
> Please wait a bit for Stefano and Julien to comment before you do work.
> 
Stefano, Julien,
Any comments regarding how to add the ACPI size to max_memkb?

Thanks,
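
A minimal sketch of what Wei outlines above (assumptions: the acpi_size
value and the omitted xenstore write are invented for illustration;
xc_domain_setmaxmem(), libxl__gc_owner() and LIBXL_MAXMEM_CONSTANT --
libxl's existing slack -- are the pre-existing interfaces):

    /*
     * Sketch only, not the actual patch: raise the hypervisor-enforced
     * limit from the ARM arch hook without touching
     * d_config->b_info.max_memkb, so the recorded config value does not
     * grow again on every migration.
     */
    #include "libxl_internal.h"

    int libxl__arch_domain_create(libxl__gc *gc,
                                  libxl_domain_config *d_config,
                                  uint32_t domid)
    {
        libxl_ctx *ctx = libxl__gc_owner(gc);
        uint64_t acpi_size = 0; /* hypothetical: size of the generated blob */
        uint64_t acpi_kb = (acpi_size + 1023) / 1024;

        /*
         * A real implementation would also record acpi_kb in xenstore
         * here so that a later libxl_domain_setmaxmem() can re-derive
         * the same extra amount.
         */
        return xc_domain_setmaxmem(ctx->xch, domid,
                                   d_config->b_info.max_memkb +
                                   LIBXL_MAXMEM_CONSTANT + acpi_kb);
    }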
George Dunlap July 25, 2016, 8:38 a.m. UTC | #42
On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
<sstabellini@kernel.org> wrote:
>> You are assuming that the guest will map the ACPI blob with the same
>> attributes as the rest of the superpage.
>>
>> IMHO, a sane operating system will want to map the ACPI blob read-only.
>
> That's true. But there are other things which might be mapped
> differently and could shatter a stage-1 superpage mapping (especially on
> x86 that has a much more complex memory map than ARM). Obviously adding
> one more is not doing it any good, but it might not make a difference in
> practice.
>
> Anyway, I agree with Julien that his suggestion is the best for ARM. If
> the libxl maintainers are willing to accept two different code paths for
> this on ARM and x86, then I am fine with it too.

Sorry to be a bit late to this thread -- there's an interface principle
that I think we should at some point have a larger discussion about:
whether "maxmem" means the amount of RAM which the guest sees as RAM,
or whether "maxmem" means the amount of RAM that the administrator
sees as used by the guest.  At the moment there's no consistent
answer actually; but I am strongly of the opinion that for usability
the best answer is for "memory" to be the *total* amount of *host*
memory used by the guest.  In an ideal world, the admin should be able
to do "xl info", see that there is 3000MiB free, and then start a
guest with 3000MiB and expect it to succeed.  At the moment he has to
guess.

 -George
Julien Grall July 25, 2016, 9:46 a.m. UTC | #43
Hi George,

On 25/07/16 09:38, George Dunlap wrote:
> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
> <sstabellini@kernel.org> wrote:
>>> You are assuming that the guest will map the ACPI blob with the same
>>> attributes as the rest of the superpage.
>>>
>>> IMHO, a sane operating system will want to map the ACPI blob read-only.
>>
>> That's true. But there are other things which might be mapped
>> differently and could shatter a stage-1 superpage mapping (especially on
>> x86 that has a much more complex memory map than ARM). Obviously adding
>> one more is not doing it any good, but it might not make a difference in
>> practice.
>>
>> Anyway, I agree with Julien that his suggestion is the best for ARM. If
>> the libxl maintainers are willing to accept two different code paths for
>> this on ARM and x86, then I am fine with it too.
>
> Sorry to be a bit late to this thread -- there's an interface principle
> that I think we should at some point have a larger discussion about:
> whether "maxmem" means the amount of RAM which the guest sees as RAM,
> or whether "maxmem" means the amount of RAM that the administrator
> sees as used by the guest.  At the moment there's no consistent
> answer actually; but I am strongly of the opinion that for usability
> the best answer is for "memory" to be the *total* amount of *host*
> memory used by the guest.  In an ideal world, the admin should be able
> to do "xl info", see that there is 3000MiB free, and then start a
> guest with 3000MiB and expect it to succeed.  At the moment he has to
> guess.

To confirm, do you include memory allocated by the hypervisor to keep 
track of the guest (i.e. struct domain, struct vcpu...)?

If not, the problem stays the same because the admin will have to know 
how much memory Xen will allocate to keep track of the guest. So if "xl 
info" tells you that 3000MiB is free, you will only be able to use 
3000MiB - a few kilobytes.

If yes, this would be a problem for migration because a newer version of 
Xen may allocate less/more memory.

Furthermore, with this suggestion, we will likely not be able to take
advantage of 1GB superpages unless the admin knows how much memory will
be used by Xen outside of the guest RAM (such as the console page,
xenstore page, ....).

IMHO, it would be easier if we define "maxmem" as the usable guest
RAM and then let the toolstack add the extra memory (ACPI blob,
special pages...).

Regards,
Stefano Stabellini July 25, 2016, 10:06 p.m. UTC | #44
On Mon, 25 Jul 2016, George Dunlap wrote:
> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
> <sstabellini@kernel.org> wrote:
> >> You are assuming that the guest will map the ACPI blob with the same
> >> attributes as the rest of the superpage.
> >>
> >> IMHO, a sane operating system will want to map the ACPI blob read-only.
> >
> > That's true. But there are other things which might be mapped
> > differently and could shatter a stage-1 superpage mapping (especially on
> > x86 that has a much more complex memory map than ARM). Obviously adding
> > one more is not doing it any good, but it might not make a difference in
> > practice.
> >
> > Anyway, I agree with Julien that his suggestion is the best for ARM. If
> > the libxl maintainers are willing to accept two different code paths for
> > this on ARM and x86, then I am fine with it too.
> 
> Sorry to be a bit late to this thread -- there's an interface principle
> that I think we should at some point have a larger discussion about:
> whether "maxmem" means the amount of RAM which the guest sees as RAM,
> or whether "maxmem" means the amount of RAM that the administrator
> sees as used by the guest.  At the moment there's no consistent
> answer actually; but I am strongly of the opinion that for usability
> the best answer is for "memory" to be the *total* amount of *host*
> memory used by the guest.  In an ideal world, the admin should be able
> to do "xl info", see that there is 3000MiB free, and then start a
> guest with 3000MiB and expect it to succeed.  At the moment he has to
> guess.

I don't want to add to the confusion, but maxmem is often higher than
the actual memory allocated for the guest at any given moment, given
that it's the upper limit enforced by the hypervisor (maxmem and mem are
often different, think about ballooning). So how can it be "the amount
of RAM that the administrator sees as used by the guest"? At best it
could be "the amount of RAM that the administrator sees could be at most
used by the guest" or "the amount of RAM that the administrator sees as
allocated on behalf of the guest at boot".


> To confirm, do you include memory allocated by the hypervisor to keep
> track of the guest (i.e. struct domain, struct vcpu...)?
> 
> If not, the problem stays the same because the admin will have to know
> how much memory Xen will allocate to keep track of the guest. So if "xl
> info" tells you that 3000MiB is free, you will only be able to use
> 3000MiB - a few kilobytes.

That's right, unfortunately all those structs allocated by the
hypervisor are completely unknown to the toolstack. However they should
be an order of magnitude or two smaller than things like the videoram,
the ethernet blob (on x86) or the ACPI blob. So taking the memory for
ACPI and videoram from the existing maxmem pool without increasing it,
would significantly improve, but not completely solve, the problem
described by George.


Going back to the discussion about how to account for the ACPI blob in
maxmem, let's make this simple, if we increase maxmem by the size of the
ACPI blob:

- the toolstack allocates more RAM than expected (bad)
- when the admin specifies 1GB of RAM, the guest actually gets 1GB of
  usable RAM (good)
- things are faster as Xen and the guest can exploit superpage mappings
  more easily at stage-1 and stage-2 (good)

Let's call this option A.

If we do not increase maxmem:

- the toolstack allocates less RAM, closer to the size specified in the
  VM config file (good)
- the guest gets less usable memory than expected, less than what was
  specified in the VM config file (bad)
- things get slower as one or two 1GB superpage mappings are going to be
  shattered, almost certainly the stage-1 mapping, probably the stage-2
  mapping too, depending on the guest memory layout which is arch
  specific (bad)

Let's call this option B.

Both have pros and cons. Julien feels strongly for option A. I vote for
option A, but I find option B also acceptable. Let's make a decision so
that Shannon can move forward.
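
To put numbers on the two options, a tiny worked example (the 1GB guest and
the 96KB page-aligned blob are made-up figures):

    /* Worked example of options A and B for a guest configured with
     * "memory=1024" and a 96KB ACPI blob; all sizes are illustrative. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long mem_kb  = 1024 * 1024; /* configured amount */
        unsigned long acpi_kb = 96;          /* page-aligned blob  */

        /* Option A: allocation grows, usable RAM matches the config. */
        printf("A: allocated=%lukB usable=%lukB\n", mem_kb + acpi_kb, mem_kb);

        /* Option B: allocation matches the config, usable RAM shrinks. */
        printf("B: allocated=%lukB usable=%lukB\n", mem_kb, mem_kb - acpi_kb);

        return 0;
    }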
Boris Ostrovsky July 25, 2016, 10:46 p.m. UTC | #45
On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
> On Mon, 25 Jul 2016, George Dunlap wrote:
>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
>> <sstabellini@kernel.org> wrote:
>>>> You are assuming that the guest will map the ACPI blob with the same
>>>> attributes as the rest of the superpage.
>>>>
>>>> IMHO, a sane operating system will want to map the ACPI blob read-only.
>>> That's true. But there are other things which might be mapped
>>> differently and could shatter a stage-1 superpage mapping (especially on
>>> x86 that has a much more complex memory map than ARM). Obviously adding
>>> one more is not doing it any good, but it might not make a difference in
>>> practice.
>>>
>>> Anyway, I agree with Julien that his suggestion is the best for ARM. If
>>> the libxl maintainers are willing to accept two different code paths for
>>> this on ARM and x86, then I am fine with it too.
>> Sorry to be a bit late to this thread -- there's an interface principle
>> that I think we should at some point have a larger discussion about:
>> whether "maxmem" means the amount of RAM which the guest sees as RAM,
>> or whether "maxmem" means the amount of RAM that the administrator
>> sees as used by the guest.  At the moment there's no consistent
>> answer actually; but I am strongly of the opinion that for usability
>> the best answer is for "memory" to be the *total* amount of *host*
>> memory used by the guest.  In an ideal world, the admin should be able
>> to do "xl info", see that there is 3000MiB free, and then start a
>> guest with 3000MiB and expect it to succeed.  At the moment he has to
>> guess.
> I don't want to add to the confusion, but maxmem is often higher than
> the actual memory allocated for the guest at any given moment, given
> that it's the upper limit enforced by the hypervisor (maxmem and mem are
> often different, think about ballooning). So how can it be "the amount
> of RAM that the administrator sees as used by the guest"? At best it
> could be "the amount of RAM that the administrator sees could be at most
> used by the guest" or "the amount of RAM that the administrator sees as
> allocated on behalf of the guest at boot".
>
>
>> To confirm, do you include memory allocated by the hypervisor to keep
>> track of the guest (i.e. struct domain, struct vcpu...)?
>>
>> If not, the problem stays the same because the admin will have to know
>> how much memory Xen will allocate to keep track of the guest. So if "xl
>> info" tells you that 3000MiB is free, you will only be able to use
>> 3000MiB - a few kilobytes.
> That's right, unfortunately all those structs allocated by the
> hypervisor are completely unknown to the toolstack. However they should
> be an order of magnitude or two smaller than things like the videoram,
> the ethernet blob (on x86) or the ACPI blob. So taking the memory for
> ACPI and videoram from the existing maxmem pool without increasing it,
> would significantly improve, but not completely solve, the problem
> described by George.
>
>
> Going back to the discussion about how to account for the ACPI blob in
> maxmem, let's make this simple, if we increase maxmem by the size of the
> ACPI blob:
>
> - the toolstack allocates more RAM than expected (bad)
> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
>   usable RAM (good)
> - things are faster as Xen and the guest can exploit superpage mappings
>   more easily at stage-1 and stage-2 (good)
>
> Let's call this option A.
>
> If we do not increase maxmem:
>
> - the toolstack allocates less RAM, closer to the size specified in the
>   VM config file (good)
> - the guest gets less usable memory than expected, less than what was
>   specified in the VM config file (bad)


Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
usable RAM and part of that RAM stores ACPI stuff. Guest is free to
stash ACPI tables somewhere else or ignore them altogether and use that
memory for whatever it wants.


-boris


> - things get slower as one or two 1GB superpage mappings are going to be
>   shattered, almost certainly the stage-1 mapping, probably the stage-2
>   mapping too, depending on the guest memory layout which is arch
>   specific (bad)
>
> Let's call this option B.
>
> Both have pros and cons. Julien feels strongly for option A. I vote for
> option A, but I find option B also acceptable. Let's make a decision so
> that Shannon can move forward.
Stefano Stabellini July 25, 2016, 11:40 p.m. UTC | #46
On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
> > On Mon, 25 Jul 2016, George Dunlap wrote:
> >> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
> >> <sstabellini@kernel.org> wrote:
> >>>> You are assuming that the guest will map the ACPI blob with the same
> >>>> attributes as the rest of the superpage.
> >>>>
> >>>> IMHO, a sane operating system will want to map the ACPI blob read-only.
> >>> That's true. But there are other things which might be mapped
> >>> differently and could shatter a stage-1 superpage mapping (especially on
> >>> x86 that has a much more complex memory map than ARM). Obviously adding
> >>> one more is not doing it any good, but it might not make a difference in
> >>> practice.
> >>>
> >>> Anyway, I agree with Julien that his suggestion is the best for ARM. If
> >>> the libxl maintainers are willing to accept two different code paths for
> >>> this on ARM and x86, then I am fine with it too.
> >> Sorry to be a bit late to this thread -- there's an interface principle
> >> that I think we should at some point have a larger discussion about:
> >> whether "maxmem" means the amount of RAM which the guest sees as RAM,
> >> or whether "maxmem" means the amount of RAM that the administrator
> >> sees as used by the guest.  At the moment there's no consistent
> >> answer actually; but I am strongly of the opinion that for usability
> >> the best answer is for "memory" to be the *total* amount of *host*
> >> memory used by the guest.  In an ideal world, the admin should be able
> >> to do "xl info", see that there is 3000MiB free, and then start a
> >> guest with 3000MiB and expect it to succeed.  At the moment he has to
> >> guess.
> > I don't want to add to the confusion, but maxmem is often higher than
> > the actual memory allocated for the guest at any given moment, given
> > that it's the upper limit enforced by the hypervisor (maxmem and mem are
> > often different, think about ballooning). So how can it be "the amount
> > of RAM that the administrator sees as used by the guest"? At best it
> > could be "the amount of RAM that the administrator sees could be at most
> > used by the guest" or "the amount of RAM that the administrator sees as
> > allocated on behalf of the guest at boot".
> >
> >
> >> To confirm, do you include memory allocated by the hypervisor to keep
> >> track of the guest (i.e. struct domain, struct vcpu...)?
> >>
> >> If not, the problem stays the same because the admin will have to know
> >> how much memory Xen will allocate to keep track of the guest. So if "xl
> >> info" tells you that 3000MiB is free, you will only be able to use
> >> 3000MiB - a few kilobytes.
> > That's right, unfortunately all those structs allocated by the
> > hypervisor are completely unknown to the toolstack. However they should
> > be an order of magnitude or two smaller than things like the videoram,
> > the ethernet blob (on x86) or the ACPI blob. So taking the memory for
> > ACPI and videoram from the existing maxmem pool without increasing it,
> > would significantly improve, but not completely solve, the problem
> > described by George.
> >
> >
> > Going back to the discussion about how to account for the ACPI blob in
> > maxmem, let's make this simple, if we increase maxmem by the size of the
> > ACPI blob:
> >
> > - the toolstack allocates more RAM than expected (bad)
> > - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
> >   usable RAM (good)
> > - things are faster as Xen and the guest can exploit superpage mappings
> >   more easily at stage-1 and stage-2 (good)
> >
> > Let's call this option A.
> >
> > If we do not increase maxmem:
> >
> > - the toolstack allocates less RAM, closer to the size specified in the
> >   VM config file (good)
> > - the guest gets less usable memory than expected, less than what was
> >   specified in the VM config file (bad)
> 
> 
> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
> stash ACPI tables somewhere else or ignore them altogether and use that
> memory for whatever it wants.

On ARM it will be a ROM (from guest POV)
Boris Ostrovsky July 26, 2016, 1:17 a.m. UTC | #47
On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
> On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
>> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
>>> On Mon, 25 Jul 2016, George Dunlap wrote:
>>>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
>>>> <sstabellini@kernel.org> wrote:
>>>>>> You are assuming that the guest will map the ACPI blob with the same
>>>>>> attributes as the rest of the superpage.
>>>>>>
>>>>>> IMHO, a sane operating system will want to map the ACPI blob read-only.
>>>>> That's true. But there are other things which might be mapped
>>>>> differently and could shatter a stage-1 superpage mapping (especially on
>>>>> x86 that has a much more complex memory map than ARM). Obviously adding
>>>>> one more is not doing it any good, but it might not make a difference in
>>>>> practice.
>>>>>
>>>>> Anyway, I agree with Julien that his suggestion is the best for ARM. If
>>>>> the libxl maintainers are willing to accept two different code paths for
>>>>> this on ARM and x86, then I am fine with it too.
>>>> Sorry to be a bit late to this thread -- there's an interface principle
>>>> that I think we should at some point have a larger discussion about:
>>>> whether "maxmem" means the amount of RAM which the guest sees as RAM,
>>>> or whether "maxmem" means the amount of RAM that the administrator
>>>> sees as used by the guest.  At the moment there's no consistent
>>>> answer actually; but I am strongly of the opinion that for usability
>>>> the best answer is for "memory" to be the *total* amount of *host*
>>>> memory used by the guest.  In an ideal world, the admin should be able
>>>> to do "xl info", see that there is 3000MiB free, and then start a
>>>> guest with 3000MiB and expect it to succeed.  At the moment he has to
>>>> guess.
>>> I don't want to add to the confusion, but maxmem is often higher than
>>> the actual memory allocated for the guest at any given moment, given
>>> that it's the upper limit enforced by the hypervisor (maxmem and mem are
>>> often different, think about ballooning). So how can it be "the amount
>>> of RAM that the administrator sees as used by the guest"? At best it
>>> could be "the amount of RAM that the administrator sees could be at most
>>> used by the guest" or "the amount of RAM that the administrator sees as
>>> allocated on behalf of the guest at boot".
>>>
>>>
>>>> To confirm, do you include memory allocated by the hypervisor to keep
>>>> track of the guest (i.e. struct domain, struct vcpu...)?
>>>>
>>>> If not, the problem stays the same because the admin will have to know
>>>> how much memory Xen will allocate to keep track of the guest. So if "xl
>>>> info" tells you that 3000MiB is free, you will only be able to use
>>>> 3000MiB - a few kilobytes.
>>> That's right, unfortunately all those structs allocated by the
>>> hypervisor are completely unknown to the toolstack. However they should
>>> be an order of magnitude or two smaller than things like the videoram,
>>> the ethernet blob (on x86) or the ACPI blob. So taking the memory for
>>> ACPI and videoram from the existing maxmem pool without increasing it,
>>> would significantly improve, but not completely solve, the problem
>>> described by George.
>>>
>>>
>>> Going back to the discussion about how to account for the ACPI blob in
>>> maxmem, let's make this simple, if we increase maxmem by the size of the
>>> ACPI blob:
>>>
>>> - the toolstack allocates more RAM than expected (bad)
>>> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
>>>    usable RAM (good)
>>> - things are faster as Xen and the guest can exploit superpage mappings
>>>    more easily at stage-1 and stage-2 (good)
>>>
>>> Let's call this option A.
>>>
>>> If we do not increase maxmem:
>>>
>>> - the toolstack allocates less RAM, closer to the size specified in the
>>>    VM config file (good)
>>> - the guest gets less usable memory than expected, less than what was
>>>    specified in the VM config file (bad)
>>
>> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
>> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
>> stash ACPI tables somewhere else or ignore them altogether and use that
>> memory for whatever it wants.
> On ARM it will be a ROM (from guest POV)


In which case I don't see why we should take it from maxmem allocation. 
I somehow thought that there was a choice of whether to put it in ROM or 
RAM on ARM but if it's ROM only then I don't think there is an option.

IIUIC the toolstack pretends that the blob goes to memory because that's 
how its interfaces work but that space is not really what we think about 
when we set memory/maxmem in the configuration file. Unlike x86.

-boris
Julien Grall July 28, 2016, 11:06 a.m. UTC | #48
Hi,

On 26/07/16 02:17, Boris Ostrovsky wrote:
> On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
>> On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
>>> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
>>>> On Mon, 25 Jul 2016, George Dunlap wrote:
>>>>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
>>>>> <sstabellini@kernel.org> wrote:
>>>> Going back to the discussion about how to account for the ACPI blob in
>>>> maxmem, let's make this simple, if we increase maxmem by the size of
>>>> the
>>>> ACPI blob:
>>>>
>>>> - the toolstack allocates more RAM than expected (bad)
>>>> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
>>>>    usable RAM (good)
>>>> - things are faster as Xen and the guest can exploit superpage mappings
>>>>    more easily at stage-1 and stage-2 (good)
>>>>
>>>> Let's call this option A.
>>>>
>>>> If we do not increase maxmem:
>>>>
>>>> - the toolstack allocates less RAM, closer to the size specified in the
>>>>    VM config file (good)
>>>> - the guest gets less usable memory than expected, less than what was
>>>>    specified in the VM config file (bad)
>>>
>>> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
>>> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
>>> stash ACPI tables somewhere else or ignore them altogether and use that
>>> memory for whatever it wants.
>> On ARM it will be a ROM (from guest POV)
>
>
> In which case I don't see why we should take it from maxmem allocation.
> I somehow thought that there was a choice of whether to put it in ROM or
> RAM on ARM but if it's ROM only then I don't think there is an option.

We have the option to do both on ARM. I just feel that the ROM option is
a cleaner interface because the ACPI tables are not supposed to be
modified by the guest, so we can prevent them from being overridden
(+ all the advantages mentioned by Stefano with option A).

> IIUIC the toolstack pretends that the blob goes to memory because that's
> how its interfaces work but that space is not really what we think about
> when we set memory/maxmem in the configuration file. Unlike x86.

I think we need to draw a conclusion for Shannon to continue to do the 
work and I would like to see this series in Xen 4.8. From my 
understanding you are for option B, and so is George.

Stefano votes for option A, but finds B acceptable. Any other opinions?

Regards,
Julien Grall July 28, 2016, 11:10 a.m. UTC | #49
Hi Shannon,

Sorry for the late answer.

On 25/07/16 08:56, Shannon Zhao wrote:
>
>
> On 2016/7/20 17:32, Wei Liu wrote:
>> On Wed, Jul 20, 2016 at 02:52:05PM +0800, Shannon Zhao wrote:
>>>>
>>>>
>>>> On 2016/7/19 18:38, Wei Liu wrote:
>>>>>> On Fri, Jul 15, 2016 at 05:39:32PM +0800, Shannon Zhao wrote:
>>>>>> [...]
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> It would be trivial to have another option in xl.cfg to allow MB
>>>>>>>>>>>>>> granularity. But I don't think that's a good idea. Asking for more
>>>>>>>>>>>>>> memory when you don't really know how much is enough is not very useful.
>>>>>>>>>>>>>> If an admin can know how much is needed, surely the library can be
>>>>>>>>>>>>>> taught to obtain that knowledge, too.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> We need to decide which model we should go with. And, if we decide to
>>>>>>>>>>>>>> diverge, document the difference between x86 and ARM model.
>>>>>>>>>>>>>>
>>>>>>>>>> Hi Wei,
>>>>>>>>>>
>>>>>>>>>> Do you decide how to add the size of ACPI blob to max_memkb?
>>>>>>>>>>
>>>>>> AFAICT ARM and x86 maintainers hold different opinions on how memory
>>>>>> should be accounted.
>>>>>>
>>>>>> I would like to have a unified memory accounting model. But if we can't
>>>>>> have that at the moment, I'm fine with divergence, but please document
>>>>>> it somewhere (comment near code snippet, in header, or a file under docs
>>>>>> etc). And the amount added to max_memkb needs to be properly calculated,
>>>>>> not some magic number, so that we have a chance in the future to
>>>>>> confidently change how we do thing.
>>>> If it's only allowed to add the size to max_memkb in
>>>> libxl__domain_build_info_setdefault(), it can only use a constant
>>>> number since the tables are not generated yet and we don't know the
>>>> size. But the constant could be chosen properly since we know the
>>>> maximum ACPI table size of the current ARM approach.
>>>>
>>>> But maybe in the future, if we add some new ACPI tables, it should
>>>> increase the size as well. So I think this should be documented.
>>>>
>>>> As I asked before, is it ok to add the size to max_memkb after
>>>> generating the ACPI tables and before loading the tables to guest space?
>>>>
>> Yes. I don't think shoehorning everything into setdefault is a good
>> idea.
>>
>> I think libxl_arm.c:libxl__arch_domain_create would be the right place
>> to do it.
>>
>> I am thinking about calling xc_domain_setmaxmem there, but not adding a
>> number to d_config->b_info.max_memkb. Adding that to ->max_memkb would
>> be wrong because the bigger ->max_memkb will be recorded and the same
>> algorithm will be applied every time you migrate your guest, so the
>> max_memkb will grow bigger and bigger.
>>
>> Given the different approach taken by ARM and x86, maybe we need to also
>> record the size of acpi blobs somewhere in xenstore (also needs to be
>> documented) so that subsequent libxl_domain_setmaxmem can extract that
>> number again.
>>
>> Please wait a bit for Stefano and Julien to comment before you do work.
>>
> Stefano, Julien,
> Any comments regarding how to add the ACPI size to max_memkb?

I think Wei's suggestion is a good one.

Cheers,
Shannon Zhao July 28, 2016, 12:42 p.m. UTC | #50
On 2016/07/28 19:06, Julien Grall wrote:
> On 26/07/16 02:17, Boris Ostrovsky wrote:
>> On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
>>> On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
>>>> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
>>>>> On Mon, 25 Jul 2016, George Dunlap wrote:
>>>>>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
>>>>>> <sstabellini@kernel.org> wrote:
>>>>> Going back to the discussion about how to account for the ACPI blob in
>>>>> maxmem, let's make this simple, if we increase maxmem by the size of
>>>>> the
>>>>> ACPI blob:
>>>>>
>>>>> - the toolstack allocates more RAM than expected (bad)
>>>>> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
>>>>>    usable RAM (good)
>>>>> - things are faster as Xen and the guest can exploit superpage
>>>>> mappings
>>>>>    more easily at stage-1 and stage-2 (good)
>>>>>
>>>>> Let's call this option A.
>>>>>
>>>>> If we do not increase maxmem:
>>>>>
>>>>> - the toolstack allocates less RAM, closer to the size specified in
>>>>> the
>>>>>    VM config file (good)
>>>>> - the guest gets less usable memory than expected, less than what was
>>>>>    specified in the VM config file (bad)
>>>>
>>>> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
>>>> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
>>>> stash ACPI tables somewhere else or ignore them altogether and use that
>>>> memory for whatever it wants.
>>> On ARM it will be a ROM (from guest POV)
>>
>>
>> In which case I don't see why we should take it from maxmem allocation.
>> I somehow thought that there was a choice of whether to put it in ROM or
>> RAM on ARM but if it's ROM only then I don't think there is an option.
> 
> We have the option to do both on ARM. I just feel that the ROM option is
> a cleaner interface because the ACPI tables are not supposed to be
> modified by the guest, so we can prevent them from being overridden
> (+ all the advantages mentioned by Stefano with option A).
> 
>> IIUIC the toolstack pretends that the blob goes to memory because that's
>> how its interfaces work but that space is not really what we think about
>> when we set memory/maxmem in the configuration file. Unlike x86.
> 
> I think we need to draw a conclusion for Shannon to continue to do the
> work and I would like to see this series in Xen 4.8. From my
> understanding you are for option B, and so is George.
> 
> Stefano votes for option A, but finds B acceptable. Any other opinions?
I agree with Stefano, both are fine.

Thanks,
Wei Liu Aug. 2, 2016, 11:01 a.m. UTC | #51
On Thu, Jul 28, 2016 at 12:06:11PM +0100, Julien Grall wrote:
> Hi,
> 
> On 26/07/16 02:17, Boris Ostrovsky wrote:
> >On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
> >>On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
> >>>On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
> >>>>On Mon, 25 Jul 2016, George Dunlap wrote:
> >>>>>On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
> >>>>><sstabellini@kernel.org> wrote:
> >>>>Going back to the discussion about how to account for the ACPI blob in
> >>>>maxmem, let's make this simple, if we increase maxmem by the size of
> >>>>the
> >>>>ACPI blob:
> >>>>
> >>>>- the toolstack allocates more RAM than expected (bad)
> >>>>- when the admin specifies 1GB of RAM, the guest actually gets 1GB of
> >>>>   usable RAM (good)
> >>>>- things are faster as Xen and the guest can exploit superpage mappings
> >>>>   more easily at stage-1 and stage-2 (good)
> >>>>
> >>>>Let's call this option A.
> >>>>
> >>>>If we do not increase maxmem:
> >>>>
> >>>>- the toolstack allocates less RAM, closer to the size specified in the
> >>>>   VM config file (good)
> >>>>- the guest gets less usable memory than expected, less than what was
> >>>>   specified in the VM config file (bad)
> >>>
> >>>Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
> >>>usable RAM and part of that RAM stores ACPI stuff. Guest is free to
> >>>stash ACPI tables somewhere else or ignore them altogether and use that
> >>>memory for whatever it wants.
> >>On ARM it will be a ROM (from guest POV)
> >
> >
> >In which case I don't see why we should take it from maxmem allocation.
> >I somehow thought that there was a choice of whether to put it in ROM or
> >RAM on ARM but if it's ROM only then I don't think there is an option.
> 
> We have the option to do both on ARM. I just feel that the ROM option is a
> cleaner interface because the ACPI tables are not supposed to be modified by
> the guest, so we can prevent them from being overridden (+ all the
> advantages mentioned by Stefano with option A).
> 
> >IIUIC the toolstack pretends that the blob goes to memory because that's
> >how its interfaces work but that space is not really what we think about
> >when we set memory/maxmem in the configuration file. Unlike x86.
> 
> I think we need to draw a conclusion for Shannon to continue to do the work
> and I would like to see this series in Xen 4.8. From my understanding you
> are for option B, and so is George.
> 
> Stefano votes for option A, but finds B acceptable. Any other opinions?
> 

From my PoV I just need things to be clearly documented.

Wei.

> Regards,
> 
> -- 
> Julien Grall
Wei Liu Aug. 2, 2016, 11:01 a.m. UTC | #52
On Thu, Jul 28, 2016 at 08:42:05PM +0800, Shannon Zhao wrote:
> On 2016/07/28 19:06, Julien Grall wrote:
> > On 26/07/16 02:17, Boris Ostrovsky wrote:
> >> On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
> >>> On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
> >>>> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
> >>>>> On Mon, 25 Jul 2016, George Dunlap wrote:
> >>>>>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
> >>>>>> <sstabellini@kernel.org> wrote:
> >>>>> Going back to the discussion about how to account for the ACPI blob in
> >>>>> maxmem, let's make this simple, if we increase maxmem by the size of
> >>>>> the
> >>>>> ACPI blob:
> >>>>>
> >>>>> - the toolstack allocates more RAM than expected (bad)
> >>>>> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
> >>>>>    usable RAM (good)
> >>>>> - things are faster as Xen and the guest can exploit superpage
> >>>>> mappings
> >>>>>    more easily at stage-1 and stage-2 (good)
> >>>>>
> >>>>> Let's call this option A.
> >>>>>
> >>>>> If we do not increase maxmem:
> >>>>>
> >>>>> - the toolstack allocates less RAM, closer to the size specified in
> >>>>> the
> >>>>>    VM config file (good)
> >>>>> - the guest gets less usable memory than expected, less than what was
> >>>>>    specified in the VM config file (bad)
> >>>>
> >>>> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
> >>>> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
> >>>> stash ACPI tables somewhere else or ignore them altogether and use that
> >>>> memory for whatever it wants.
> >>> On ARM it will be a ROM (from guest POV)
> >>
> >>
> >> In which case I don't see why we should take it from maxmem allocation.
> >> I somehow thought that there was a choice of whether to put it in ROM or
> >> RAM on ARM but if it's ROM only then I don't think there is an option.
> > 
> > We have the option to do both on ARM. I just feel that the ROM option is
> > a cleaner interface because the ACPI tables are not supposed to be
> > modified by the guest, so we can prevent them from being overridden
> > (+ all the advantages mentioned by Stefano with option A).
> > 
> >> IIUIC the toolstack pretends that the blob goes to memory because that's
> >> how its interfaces work but that space is not really what we think about
> >> when we set memory/maxmem in the configuration file. Unlike x86.
> > 
> > I think we need to draw a conclusion for Shannon to continue to do the
> > work and I would like to see this series in Xen 4.8. From my
> > understanding you are for option B, and so is George.
> > 
> > Stefano votes for option A, but finds B acceptable. Any other opinions?
> I agree with Stefano, both are fine.
> 

Sorry for the late reply.

Are you now unblocked? If not, what is not yet decided or needs
clarification?

Wei.

> Thanks,
> -- 
> Shannon
Julien Grall Aug. 3, 2016, 7:20 p.m. UTC | #53
Hi Wei,

On 02/08/16 12:01, Wei Liu wrote:
> On Thu, Jul 28, 2016 at 08:42:05PM +0800, Shannon Zhao wrote:
>> On 2016/07/28 19:06, Julien Grall wrote:
>>> On 26/07/16 02:17, Boris Ostrovsky wrote:
>>>> On 07/25/2016 07:40 PM, Stefano Stabellini wrote:
>>>>> On Mon, 25 Jul 2016, Boris Ostrovsky wrote:
>>>>>> On 07/25/2016 06:06 PM, Stefano Stabellini wrote:
>>>>>>> On Mon, 25 Jul 2016, George Dunlap wrote:
>>>>>>>> On Thu, Jul 21, 2016 at 10:15 PM, Stefano Stabellini
>>>>>>>> <sstabellini@kernel.org> wrote:
>>>>>>> Going back to the discussion about how to account for the ACPI blob in
>>>>>>> maxmem, let's make this simple, if we increase maxmem by the size of
>>>>>>> the
>>>>>>> ACPI blob:
>>>>>>>
>>>>>>> - the toolstack allocates more RAM than expected (bad)
>>>>>>> - when the admin specifies 1GB of RAM, the guest actually gets 1GB of
>>>>>>>    usable RAM (good)
>>>>>>> - things are faster as Xen and the guest can exploit superpage
>>>>>>> mappings
>>>>>>>    more easily at stage-1 and stage-2 (good)
>>>>>>>
>>>>>>> Let's call this option A.
>>>>>>>
>>>>>>> If we do not increase maxmem:
>>>>>>>
>>>>>>> - the toolstack allocates less RAM, closer to the size specified in
>>>>>>> the
>>>>>>>    VM config file (good)
>>>>>>> - the guest gets less usable memory than expected, less than what was
>>>>>>>    specified in the VM config file (bad)
>>>>>>
>>>>>> Not sure I agree with this, at least for x86/Linux: guest gets 1GB of
>>>>>> usable RAM and part of that RAM stores ACPI stuff. Guest is free to
>>>>>> stash ACPI tables somewhere else or ignore them altogether and use that
>>>>>> memory for whatever it wants.
>>>>> On ARM it will be a ROM (from guest POV)
>>>>
>>>>
>>>> In which case I don't see why we should take it from maxmem allocation.
>>>> I somehow thought that there was a choice of whether to put it in ROM or
>>>> RAM on ARM but if it's ROM only then I don't think there is an option.
>>>
>>> We have the option to do both on ARM. I just feel that the ROM option is
>>> a cleaner interface because the ACPI tables are not supposed to be
>>> modified by the guest, so we can prevent them from being overridden
>>> (+ all the advantages mentioned by Stefano with option A).
>>>
>>>> IIUIC the toolstack pretends that the blob goes to memory because that's
>>>> how its interfaces work but that space is not really what we think about
>>>> when we set memory/maxmem in the configuration file. Unlike x86.
>>>
>>> I think we need to draw a conclusion for Shannon to continue to do the
>>> work and I would like to see this series in Xen 4.8. From my
>>> understanding you are for option B, and so is George.
>>>
>>> Stefano votes for option A, but finds B acceptable. Any other opinions?
>> I agree with Stefano, both are fine.
>>
>
> Sorry for the late reply.
>
> Are you now unblocked? If not, what is not yet decided or needs
> clarification?

I don't think there was a strict consensus. I think this is something we
can revisit later if necessary, as the guest interface is not tied to
a specific physical address (the UEFI firmware should retrieve the
information from the device tree).

So, Shannon could continue towards solution A, i.e. the ACPI blob is
loaded outside of the guest RAM?

If someone disagree please speak up. But we should unblock Shannon to 
get this series in Xen 4.8.

Regards,
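
To put numbers on the option A trade-off discussed above: the blob lives at
GUEST_ACPI_BASE, outside the RAM banks, but its pages are still allocated to
the domain, so the toolstack would have to raise maxmem by the page-aligned
blob size. Below is a minimal sketch of that accounting, assuming 4KiB pages;
the helper name is hypothetical, while GUEST_ACPI_BASE and the XC_PAGE_*
constants are the real ones from the series and libxc (redefined locally here
so the snippet is self-contained).

#include <stdint.h>

#define XC_PAGE_SHIFT 12                      /* 4KiB pages, as in libxc */
#define XC_PAGE_SIZE  (1UL << XC_PAGE_SHIFT)

/* Hypothetical helper: the extra KiB of maxmem that option A would
 * reserve for an ACPI blob of 'size' bytes.  The pages sit at
 * GUEST_ACPI_BASE, outside guest RAM, but they still count against
 * the domain's memory limit, hence the maxmem bump. */
static uint64_t acpi_maxmem_slack_kb(uint64_t size)
{
    /* Round the blob size up to a whole number of pages. */
    uint64_t pages = (size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT;

    return pages * (XC_PAGE_SIZE >> 10);
}

For a 64KiB set of tables this comes to 16 pages, i.e. 64KiB on top of the
1GB the admin asked for: the guest keeps its full 1GB of usable RAM (option
A's upside) at the cost of the toolstack allocating slightly more than the
config file says (its downside).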
Wei Liu Aug. 4, 2016, 10:17 a.m. UTC | #54
On Wed, Aug 03, 2016 at 08:20:18PM +0100, Julien Grall wrote:
> Hi Wei,
> 
> On 02/08/16 12:01, Wei Liu wrote:
> >[...]
> >
> >Sorry for the late reply.
> >
> >Are you now unblocked? If not, what is not yet decided or needs
> >clarification?
> 
> I don't think there was a strict consensus. I think this is something we can
> revisit later if necessary, as the guest interface is not tied to a specific
> physical address (the UEFI firmware should retrieve the information from the
> device tree).
> 
> So, Shannon could continue towards solution A, i.e. the ACPI blob is loaded
> outside of the guest RAM?
> 

I'm fine with that; the bottom line is that everything should be documented
so that we can confidently make changes later (or confidently refuse to
make changes, heh).

(Given the chance I would still prefer a unified model)

> If someone disagrees, please speak up. But we should unblock Shannon to
> get this series into Xen 4.8.

Yes, I agree.

Wei.

diff mbox

Patch

diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 64a8b67..6a0a5b7 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -63,6 +63,54 @@  static int setup_pgtables_arm(struct xc_dom_image *dom)
 
 /* ------------------------------------------------------------------------ */
 
+/* Copy the ACPI tables blob into a dedicated guest-physical region
+ * (GUEST_ACPI_BASE) that lives outside of the guest's RAM banks. */
+static int xc_dom_copy_acpi(struct xc_dom_image *dom)
+{
+    int rc, i;
+    uint32_t pages_num = ROUNDUP(dom->acpitable_size, XC_PAGE_SHIFT) >>
+                         XC_PAGE_SHIFT;
+    const xen_pfn_t base = GUEST_ACPI_BASE >> XC_PAGE_SHIFT;
+    xen_pfn_t *p2m;
+    void *acpi_pages = NULL;
+
+    p2m = malloc(pages_num * sizeof(*p2m));
+    if ( p2m == NULL )
+        return -1;
+
+    for ( i = 0; i < pages_num; i++ )
+        p2m[i] = base + i;
+
+    rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
+                                          pages_num, 0, 0, p2m);
+    if ( rc )
+    {
+        DOMPRINTF("%s: xc_domain_populate_physmap_exact failed with %d",
+                  __FUNCTION__, rc);
+        goto out;
+    }
+
+    acpi_pages = xc_map_foreign_range(dom->xch, dom->guest_domid,
+                                      XC_PAGE_SIZE * pages_num,
+                                      PROT_READ | PROT_WRITE, base);
+    if ( !acpi_pages )
+    {
+        DOMPRINTF("%s: Can't map acpi_pages", __FUNCTION__);
+        rc = -1;
+        goto out;
+    }
+
+    memcpy(acpi_pages, dom->acpitable_blob, dom->acpitable_size);
+
+out:
+    /* acpi_pages is still NULL if we failed before (or while) mapping. */
+    if ( acpi_pages )
+        munmap(acpi_pages, pages_num * XC_PAGE_SIZE);
+    free(p2m);
+
+    return rc;
+}
+
 static int alloc_magic_pages(struct xc_dom_image *dom)
 {
     int rc, i;
@@ -100,6 +148,16 @@  static int alloc_magic_pages(struct xc_dom_image *dom)
     xc_hvm_param_set(dom->xch, dom->guest_domid, HVM_PARAM_STORE_EVTCHN,
             dom->xenstore_evtchn);
 
+    if ( dom->acpitable_blob && dom->acpitable_size > 0 )
+    {
+        rc = xc_dom_copy_acpi(dom);
+        if ( rc != 0 )
+        {
+            DOMPRINTF("Unable to copy ACPI tables");
+            return rc;
+        }
+    }
+
     return 0;
 }
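
As a usage note, here is a hedged sketch of the caller side. The
acpitable_blob/acpitable_size fields are the ones this patch consumes, but
the function below and its error convention are illustrative assumptions,
not the series' actual toolstack code.

#include <stddef.h>

#include "xc_dom.h"   /* struct xc_dom_image */

/* Illustrative only: hand an already-built ACPI blob to the domain
 * builder, so that alloc_magic_pages() -> xc_dom_copy_acpi() copies it
 * to GUEST_ACPI_BASE during domain construction.  The blob must stay
 * alive until the build has finished. */
static int attach_acpi_blob(struct xc_dom_image *dom,
                            void *blob, size_t size)
{
    if ( blob == NULL || size == 0 )
        return -1;    /* alloc_magic_pages() would skip the copy anyway */

    dom->acpitable_blob = blob;
    dom->acpitable_size = size;

    return 0;
}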