diff mbox series

[1/2] hw/arm/virt: Improve address assignment for highmem IO regions

Message ID 20220802064529.547361-2-gshan@redhat.com (mailing list archive)
State New, archived
Headers show
Series hw/arm/virt: Improve address assignment for highmem IO regions | expand

Commit Message

Gavin Shan Aug. 2, 2022, 6:45 a.m. UTC
There are 3 highmem IO regions as below. They can be disabled in
two situations: (a) The specific region is disabled by user. (b)
The specific region doesn't fit in the PA space. However, the base
address and highest_gpa are still updated no matter if the region
is enabled or disabled. It's incorrectly incurring waste in the PA
space.

Improve address assignment for highmem IO regions to avoid the waste
in the PA space by putting the logic into virt_memmap_fits().

Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

Comments

Eric Auger Aug. 2, 2022, 9:41 a.m. UTC | #1
Hi Gavin,

On 8/2/22 08:45, Gavin Shan wrote:
> There are 3 highmem IO regions as below. They can be disabled in
> two situations: (a) The specific region is disabled by user. (b)
> The specific region doesn't fit in the PA space. However, the base
> address and highest_gpa are still updated no matter if the region
> is enabled or disabled. It's incorrectly incurring waste in the PA
> space.
If I am not wrong highmem_redists and highmem_mmio are not user selectable

Only highmem ecam depends on machine type & ACPI setup. But I would say
that in server use case it is always set. So is that optimization really
needed?
>
> Improve address assignment for highmem IO regions to avoid the waste
> in the PA space by putting the logic into virt_memmap_fits().
>
> Signed-off-by: Gavin Shan <gshan@redhat.com>
> ---
>  hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
>  1 file changed, 31 insertions(+), 23 deletions(-)
>
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 9633f822f3..bc0cd218f9 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -1688,6 +1688,34 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>      return arm_cpu_mp_affinity(idx, clustersz);
>  }
>  
> +static void virt_memmap_fits(VirtMachineState *vms, int index,
> +                             bool *enabled, hwaddr *base, int pa_bits)
> +{
> +    hwaddr size = extended_memmap[index].size;
> +
> +    /* The region will be disabled if its size isn't given */
> +    if (!*enabled || !size) {
In which case do you have !size?
> +        *enabled = false;
> +        vms->memmap[index].base = 0;
> +        vms->memmap[index].size = 0;
It looks dangerous to me to reset the region's base and size like that.
for instance fdt_add_gic_node() will add dummy data in the dt.
> +        return;
> +    }
> +
> +    /*
> +     * Check if the memory region fits in the PA space. The memory map
> +     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
> +     */
> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
using a 'fits' local variable would make the code more obvious I think
> +    if (*enabled) {
> +        *base = ROUND_UP(*base, size);
> +        vms->memmap[index].base = *base;
> +        vms->memmap[index].size = size;
> +        vms->highest_gpa = *base + size - 1;
> +
> +	*base = *base + size;
> +    }
> +}
> +
>  static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>  {
>      MachineState *ms = MACHINE(vms);
> @@ -1744,37 +1772,17 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>      vms->highest_gpa = memtop - 1;
>  
>      for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
> -        hwaddr size = extended_memmap[i].size;
> -        bool fits;
> -
> -        base = ROUND_UP(base, size);
> -        vms->memmap[i].base = base;
> -        vms->memmap[i].size = size;
> -
> -        /*
> -         * Check each device to see if they fit in the PA space,
> -         * moving highest_gpa as we go.
> -         *
> -         * For each device that doesn't fit, disable it.
> -         */
> -        fits = (base + size) <= BIT_ULL(pa_bits);
> -        if (fits) {
> -            vms->highest_gpa = base + size - 1;
> -        }
> -

we could avoid running the code below in case highmem is not set. We would need to reset that flags though.

>          switch (i) {
>          case VIRT_HIGH_GIC_REDIST2:
> -            vms->highmem_redists &= fits;
> +            virt_memmap_fits(vms, i, &vms->highmem_redists, &base, pa_bits);
>              break;
>          case VIRT_HIGH_PCIE_ECAM:
> -            vms->highmem_ecam &= fits;
> +            virt_memmap_fits(vms, i, &vms->highmem_ecam, &base, pa_bits);
>              break;
>          case VIRT_HIGH_PCIE_MMIO:
> -            vms->highmem_mmio &= fits;
> +            virt_memmap_fits(vms, i, &vms->highmem_mmio, &base, pa_bits);
>              break;
>          }
> -
> -        base += size;
>      }
>  
>      if (device_memory_size > 0) {
Thanks

Eric
Gavin Shan Aug. 3, 2022, 3:01 a.m. UTC | #2
Hi Eric,

On 8/2/22 7:41 PM, Eric Auger wrote:
> On 8/2/22 08:45, Gavin Shan wrote:
>> There are 3 highmem IO regions as below. They can be disabled in
>> two situations: (a) The specific region is disabled by user. (b)
>> The specific region doesn't fit in the PA space. However, the base
>> address and highest_gpa are still updated no matter if the region
>> is enabled or disabled. It's incorrectly incurring waste in the PA
>> space.
> If I am not wrong highmem_redists and highmem_mmio are not user selectable
> 
> Only highmem ecam depends on machine type & ACPI setup. But I would say
> that in server use case it is always set. So is that optimization really
> needed?

There are two other cases you missed.

- highmem_ecam is enabled after virt-2.12, meaning it stays disabled
   before that.

- The high memory region can be disabled if user is asking large
   (normal) memory space through 'maxmem=' option. When the requested
   memory by 'maxmem=' is large enough, the high memory regions are
   disabled. It means the normal memory has higher priority than those
   high memory regions. This is the case I provided in (b) of the
   commit log.

In the commit log, I was supposed to say something like below for
(a):

- The specific high memory region can be disabled through changing
   the code by user or developer. For example, 'vms->highmem_mmio'
   is changed from true to false in virt_instance_init().

>>
>> Improve address assignment for highmem IO regions to avoid the waste
>> in the PA space by putting the logic into virt_memmap_fits().
>>
>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>> ---
>>   hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
>>   1 file changed, 31 insertions(+), 23 deletions(-)
>>
>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>> index 9633f822f3..bc0cd218f9 100644
>> --- a/hw/arm/virt.c
>> +++ b/hw/arm/virt.c
>> @@ -1688,6 +1688,34 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>       return arm_cpu_mp_affinity(idx, clustersz);
>>   }
>>   
>> +static void virt_memmap_fits(VirtMachineState *vms, int index,
>> +                             bool *enabled, hwaddr *base, int pa_bits)
>> +{
>> +    hwaddr size = extended_memmap[index].size;
>> +
>> +    /* The region will be disabled if its size isn't given */
>> +    if (!*enabled || !size) {
> In which case do you have !size?

Yeah, we don't have !size and the condition should be removed.

>> +        *enabled = false;
>> +        vms->memmap[index].base = 0;
>> +        vms->memmap[index].size = 0;
> It looks dangerous to me to reset the region's base and size like that.
> for instance fdt_add_gic_node() will add dummy data in the dt.

I would guess you missed that the high memory regions won't be exported
through device-tree if they have been disabled. We have a check there,
which is "if (nb_redist_regions == 1)"

>> +        return;
>> +    }
>> +
>> +    /*
>> +     * Check if the memory region fits in the PA space. The memory map
>> +     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
>> +     */
>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
> using a 'fits' local variable would make the code more obvious I think

Lets confirm if you're suggesting something like below?

         bool fits;

         fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));

         if (fits) {
            /* update *base, memory mapping, highest_gpa */
         } else {
            *enabled = false;
         }

I guess we can simply do

         if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
            /* update *base, memory mapping, highest_gpa */
         } else {
            *enabled = false;
         }

Please let me know which one looks best to you.

>> +    if (*enabled) {
>> +        *base = ROUND_UP(*base, size);
>> +        vms->memmap[index].base = *base;
>> +        vms->memmap[index].size = size;
>> +        vms->highest_gpa = *base + size - 1;
>> +
>> +	*base = *base + size;
>> +    }
>> +}
>> +
>>   static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>>   {
>>       MachineState *ms = MACHINE(vms);
>> @@ -1744,37 +1772,17 @@ static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>>       vms->highest_gpa = memtop - 1;
>>   
>>       for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
>> -        hwaddr size = extended_memmap[i].size;
>> -        bool fits;
>> -
>> -        base = ROUND_UP(base, size);
>> -        vms->memmap[i].base = base;
>> -        vms->memmap[i].size = size;
>> -
>> -        /*
>> -         * Check each device to see if they fit in the PA space,
>> -         * moving highest_gpa as we go.
>> -         *
>> -         * For each device that doesn't fit, disable it.
>> -         */
>> -        fits = (base + size) <= BIT_ULL(pa_bits);
>> -        if (fits) {
>> -            vms->highest_gpa = base + size - 1;
>> -        }
>> -
> 
> we could avoid running the code below in case highmem is not set. We would need to reset that flags though.
> 

Yeah, I think it's not a big deal. My though is to update the flag in virt_memmap_fits().

>>           switch (i) {
>>           case VIRT_HIGH_GIC_REDIST2:
>> -            vms->highmem_redists &= fits;
>> +            virt_memmap_fits(vms, i, &vms->highmem_redists, &base, pa_bits);
>>               break;
>>           case VIRT_HIGH_PCIE_ECAM:
>> -            vms->highmem_ecam &= fits;
>> +            virt_memmap_fits(vms, i, &vms->highmem_ecam, &base, pa_bits);
>>               break;
>>           case VIRT_HIGH_PCIE_MMIO:
>> -            vms->highmem_mmio &= fits;
>> +            virt_memmap_fits(vms, i, &vms->highmem_mmio, &base, pa_bits);
>>               break;
>>           }
>> -
>> -        base += size;
>>       }
>>   
>>       if (device_memory_size > 0) {

Thanks,
Gavin
Marc Zyngier Aug. 3, 2022, 7:01 a.m. UTC | #3
On Wed, 03 Aug 2022 04:01:04 +0100,
Gavin Shan <gshan@redhat.com> wrote:
> 
> Hi Eric,
> 
> On 8/2/22 7:41 PM, Eric Auger wrote:
> > On 8/2/22 08:45, Gavin Shan wrote:
> >> There are 3 highmem IO regions as below. They can be disabled in
> >> two situations: (a) The specific region is disabled by user. (b)
> >> The specific region doesn't fit in the PA space. However, the base
> >> address and highest_gpa are still updated no matter if the region
> >> is enabled or disabled. It's incorrectly incurring waste in the PA
> >> space.
> > If I am not wrong highmem_redists and highmem_mmio are not user selectable
> > 
> > Only highmem ecam depends on machine type & ACPI setup. But I would say
> > that in server use case it is always set. So is that optimization really
> > needed?
> 
> There are two other cases you missed.
> 
> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>   before that.

I don't get this. The current behaviour is to disable highmem_ecam if
it doesn't fit in the PA space. I can't see anything that enables it
if it was disabled the first place.

> 
> - The high memory region can be disabled if user is asking large
>   (normal) memory space through 'maxmem=' option. When the requested
>   memory by 'maxmem=' is large enough, the high memory regions are
>   disabled. It means the normal memory has higher priority than those
>   high memory regions. This is the case I provided in (b) of the
>   commit log.

Why is that a problem? It matches the expected behaviour, as the
highmem IO region is floating and is pushed up by the memory region.

> 
> In the commit log, I was supposed to say something like below for
> (a):
> 
> - The specific high memory region can be disabled through changing
>   the code by user or developer. For example, 'vms->highmem_mmio'
>   is changed from true to false in virt_instance_init().

Huh. By this principle, the user can change anything. Why is it
important?

> 
> >> 
> >> Improve address assignment for highmem IO regions to avoid the waste
> >> in the PA space by putting the logic into virt_memmap_fits().

I guess that this is what I understand the least. What do you mean by
"wasted PA space"? Either the regions fit in the PA space, and
computing their addresses in relevant, or they fall outside of it and
what we stick in memap[index].base is completely irrelevant.

> >> 
> >> Signed-off-by: Gavin Shan <gshan@redhat.com>
> >> ---
> >>   hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
> >>   1 file changed, 31 insertions(+), 23 deletions(-)
> >> 
> >> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> >> index 9633f822f3..bc0cd218f9 100644
> >> --- a/hw/arm/virt.c
> >> +++ b/hw/arm/virt.c
> >> @@ -1688,6 +1688,34 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
> >>       return arm_cpu_mp_affinity(idx, clustersz);
> >>   }
> >>   +static void virt_memmap_fits(VirtMachineState *vms, int index,
> >> +                             bool *enabled, hwaddr *base, int pa_bits)
> >> +{
> >> +    hwaddr size = extended_memmap[index].size;
> >> +
> >> +    /* The region will be disabled if its size isn't given */
> >> +    if (!*enabled || !size) {
> > In which case do you have !size?
> 
> Yeah, we don't have !size and the condition should be removed.
> 
> >> +        *enabled = false;
> >> +        vms->memmap[index].base = 0;
> >> +        vms->memmap[index].size = 0;
> > It looks dangerous to me to reset the region's base and size like that.
> > for instance fdt_add_gic_node() will add dummy data in the dt.
> 
> I would guess you missed that the high memory regions won't be exported
> through device-tree if they have been disabled. We have a check there,
> which is "if (nb_redist_regions == 1)"
> 
> >> +        return;
> >> +    }
> >> +
> >> +    /*
> >> +     * Check if the memory region fits in the PA space. The memory map
> >> +     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
> >> +     */
> >> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
> > using a 'fits' local variable would make the code more obvious I think
> 
> Lets confirm if you're suggesting something like below?
> 
>         bool fits;
> 
>         fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
> 
>         if (fits) {
>            /* update *base, memory mapping, highest_gpa */
>         } else {
>            *enabled = false;
>         }
> 
> I guess we can simply do
> 
>         if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>            /* update *base, memory mapping, highest_gpa */
>         } else {
>            *enabled = false;
>         }
> 
> Please let me know which one looks best to you.

Why should the 'enabled' flag be updated by this function, instead of
returning the value and keeping it as an assignment in the caller
function? It is purely stylistic though.

Thanks,

	M.
Eric Auger Aug. 3, 2022, 8:10 a.m. UTC | #4
Hi,
On 8/3/22 09:01, Marc Zyngier wrote:
> On Wed, 03 Aug 2022 04:01:04 +0100,
> Gavin Shan <gshan@redhat.com> wrote:
>> Hi Eric,
>>
>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>> two situations: (a) The specific region is disabled by user. (b)
>>>> The specific region doesn't fit in the PA space. However, the base
>>>> address and highest_gpa are still updated no matter if the region
>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>> space.
>>> If I am not wrong highmem_redists and highmem_mmio are not user selectable
>>>
>>> Only highmem ecam depends on machine type & ACPI setup. But I would say
>>> that in server use case it is always set. So is that optimization really
>>> needed?
>> There are two other cases you missed.
>>
>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>   before that.
> I don't get this. The current behaviour is to disable highmem_ecam if
> it doesn't fit in the PA space. I can't see anything that enables it
> if it was disabled the first place.
>
>> - The high memory region can be disabled if user is asking large
>>   (normal) memory space through 'maxmem=' option. When the requested
>>   memory by 'maxmem=' is large enough, the high memory regions are
>>   disabled. It means the normal memory has higher priority than those
>>   high memory regions. This is the case I provided in (b) of the
>>   commit log.
> Why is that a problem? It matches the expected behaviour, as the
> highmem IO region is floating and is pushed up by the memory region.

the only concern I have is we changed the user experience with
d9afe24c29 ("hw/arm/virt: Disable highmem devices that don't fit in the
PA range")

before we returned an error if the highmem devices could not fit within
the IPA range and exited:

-m and ,maxmem option values require an IPA range (41 bits) larger than the one supported by the host (40 bits)


Now we skip them silently.
Most probably we should have gated that change with an option for
compatibility.

>
>> In the commit log, I was supposed to say something like below for
>> (a):
>>
>> - The specific high memory region can be disabled through changing
>>   the code by user or developer. For example, 'vms->highmem_mmio'
>>   is changed from true to false in virt_instance_init().
> Huh. By this principle, the user can change anything. Why is it
> important?
>
>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>> in the PA space by putting the logic into virt_memmap_fits().
> I guess that this is what I understand the least. What do you mean by
> "wasted PA space"? Either the regions fit in the PA space, and
> computing their addresses in relevant, or they fall outside of it and
> what we stick in memap[index].base is completely irrelevant.

agreed. To me the only 'waste' we can have is when highmem ECAM is
disabled through a combination of user options and we provision for the
unused space when computing the highest_gpa. But it is 256 MB and the
case where it is disabled is marginal.

Eric
>
>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>> ---
>>>>   hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
>>>>   1 file changed, 31 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>> index 9633f822f3..bc0cd218f9 100644
>>>> --- a/hw/arm/virt.c
>>>> +++ b/hw/arm/virt.c
>>>> @@ -1688,6 +1688,34 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>       return arm_cpu_mp_affinity(idx, clustersz);
>>>>   }
>>>>   +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>> +                             bool *enabled, hwaddr *base, int pa_bits)
>>>> +{
>>>> +    hwaddr size = extended_memmap[index].size;
>>>> +
>>>> +    /* The region will be disabled if its size isn't given */
>>>> +    if (!*enabled || !size) {
>>> In which case do you have !size?
>> Yeah, we don't have !size and the condition should be removed.
>>
>>>> +        *enabled = false;
>>>> +        vms->memmap[index].base = 0;
>>>> +        vms->memmap[index].size = 0;
>>> It looks dangerous to me to reset the region's base and size like that.
>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>> I would guess you missed that the high memory regions won't be exported
>> through device-tree if they have been disabled. We have a check there,
>> which is "if (nb_redist_regions == 1)"
>>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Check if the memory region fits in the PA space. The memory map
>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
>>>> +     */
>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>> using a 'fits' local variable would make the code more obvious I think
>> Lets confirm if you're suggesting something like below?
>>
>>         bool fits;
>>
>>         fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>
>>         if (fits) {
>>            /* update *base, memory mapping, highest_gpa */
>>         } else {
>>            *enabled = false;
>>         }
>>
>> I guess we can simply do
>>
>>         if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>            /* update *base, memory mapping, highest_gpa */
>>         } else {
>>            *enabled = false;
>>         }
>>
>> Please let me know which one looks best to you.
> Why should the 'enabled' flag be updated by this function, instead of
> returning the value and keeping it as an assignment in the caller
> function? It is purely stylistic though.
>
> Thanks,
>
> 	M.
>
Eric Auger Aug. 3, 2022, 8:44 a.m. UTC | #5
Hi Gavin,

On 8/3/22 05:01, Gavin Shan wrote:
> Hi Eric,
>
> On 8/2/22 7:41 PM, Eric Auger wrote:
>> On 8/2/22 08:45, Gavin Shan wrote:
>>> There are 3 highmem IO regions as below. They can be disabled in
>>> two situations: (a) The specific region is disabled by user. (b)
>>> The specific region doesn't fit in the PA space. However, the base
>>> address and highest_gpa are still updated no matter if the region
>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>> space.
>> If I am not wrong highmem_redists and highmem_mmio are not user
>> selectable
>>
>> Only highmem ecam depends on machine type & ACPI setup. But I would say
>> that in server use case it is always set. So is that optimization really
>> needed?
>
> There are two other cases you missed.
>
> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>   before that.
Yes that's what I meant above by 'depends on machine type'
>
> - The high memory region can be disabled if user is asking large
>   (normal) memory space through 'maxmem=' option. When the requested
>   memory by 'maxmem=' is large enough, the high memory regions are
>   disabled. It means the normal memory has higher priority than those
>   high memory regions. This is the case I provided in (b) of the
>   commit log.
yes but in such a case you don't "waste" IPA as you mention in the
commit log because you only ask for a VM dimensionned for the highest_gpa.
The only case where you would "waste" IPA is for high ecam which can
disabled by option combination but it is marginal.

>
> In the commit log, I was supposed to say something like below for
> (a):
>
> - The specific high memory region can be disabled through changing
>   the code by user or developer. For example, 'vms->highmem_mmio'
>   is changed from true to false in virt_instance_init().
>
>>>
>>> Improve address assignment for highmem IO regions to avoid the waste
>>> in the PA space by putting the logic into virt_memmap_fits().
>>>
>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>> ---
>>>   hw/arm/virt.c | 54
>>> +++++++++++++++++++++++++++++----------------------
>>>   1 file changed, 31 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>> index 9633f822f3..bc0cd218f9 100644
>>> --- a/hw/arm/virt.c
>>> +++ b/hw/arm/virt.c
>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>       return arm_cpu_mp_affinity(idx, clustersz);
>>>   }
>>>   +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>> +                             bool *enabled, hwaddr *base, int pa_bits)
>>> +{
>>> +    hwaddr size = extended_memmap[index].size;
>>> +
>>> +    /* The region will be disabled if its size isn't given */
>>> +    if (!*enabled || !size) {
>> In which case do you have !size?
>
> Yeah, we don't have !size and the condition should be removed.
>
>>> +        *enabled = false;
>>> +        vms->memmap[index].base = 0;
>>> +        vms->memmap[index].size = 0;
>> It looks dangerous to me to reset the region's base and size like that.
>> for instance fdt_add_gic_node() will add dummy data in the dt.
>
> I would guess you missed that the high memory regions won't be exported
> through device-tree if they have been disabled. We have a check there,
> which is "if (nb_redist_regions == 1)"
OK I missed a check was added in virt_gicv3_redist_region_count.
Nevertheless, your comment "The region will be disabled if its size
isn't given */ is not obvious to me. To me the region is disabled if the
corresponding flag is not set. From your comment I have the impression
the size is checked to see if the region is exposed, it does not look
obvious.
>
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * Check if the memory region fits in the PA space. The memory map
>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>> disabled.
>>> +     */
>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>> using a 'fits' local variable would make the code more obvious I think
>
> Lets confirm if you're suggesting something like below?
>
>         bool fits;
>
>         fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>
>         if (fits) {
>            /* update *base, memory mapping, highest_gpa */
>         } else {
>            *enabled = false;
>         }
yes that's what I suggested.
>
> I guess we can simply do
>
>         if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>            /* update *base, memory mapping, highest_gpa */
>         } else {
>            *enabled = false;
>         }
>
> Please let me know which one looks best to you.
>
>>> +    if (*enabled) {
>>> +        *base = ROUND_UP(*base, size);
>>> +        vms->memmap[index].base = *base;
>>> +        vms->memmap[index].size = size;
>>> +        vms->highest_gpa = *base + size - 1;
>>> +
>>> +    *base = *base + size;
>>> +    }
>>> +}
>>> +
>>>   static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>>>   {
>>>       MachineState *ms = MACHINE(vms);
>>> @@ -1744,37 +1772,17 @@ static void virt_set_memmap(VirtMachineState
>>> *vms, int pa_bits)
>>>       vms->highest_gpa = memtop - 1;
>>>         for (i = VIRT_LOWMEMMAP_LAST; i <
>>> ARRAY_SIZE(extended_memmap); i++) {
>>> -        hwaddr size = extended_memmap[i].size;
>>> -        bool fits;
>>> -
>>> -        base = ROUND_UP(base, size);
>>> -        vms->memmap[i].base = base;
>>> -        vms->memmap[i].size = size;
>>> -
>>> -        /*
>>> -         * Check each device to see if they fit in the PA space,
>>> -         * moving highest_gpa as we go.
>>> -         *
>>> -         * For each device that doesn't fit, disable it.
>>> -         */
>>> -        fits = (base + size) <= BIT_ULL(pa_bits);
>>> -        if (fits) {
>>> -            vms->highest_gpa = base + size - 1;
>>> -        }
>>> -
>>
>> we could avoid running the code below in case highmem is not set. We
>> would need to reset that flags though.
>>
>
> Yeah, I think it's not a big deal. My though is to update the flag in
> virt_memmap_fits().

Eric
>
>>>           switch (i) {
>>>           case VIRT_HIGH_GIC_REDIST2:
>>> -            vms->highmem_redists &= fits;
>>> +            virt_memmap_fits(vms, i, &vms->highmem_redists, &base,
>>> pa_bits);
>>>               break;
>>>           case VIRT_HIGH_PCIE_ECAM:
>>> -            vms->highmem_ecam &= fits;
>>> +            virt_memmap_fits(vms, i, &vms->highmem_ecam, &base,
>>> pa_bits);
>>>               break;
>>>           case VIRT_HIGH_PCIE_MMIO:
>>> -            vms->highmem_mmio &= fits;
>>> +            virt_memmap_fits(vms, i, &vms->highmem_mmio, &base,
>>> pa_bits);
>>>               break;
>>>           }
>>> -
>>> -        base += size;
>>>       }
>>>         if (device_memory_size > 0) {
>
> Thanks,
> Gavin
>
Eric Auger Aug. 3, 2022, 12:52 p.m. UTC | #6
Hi,
On 8/3/22 15:02, Gavin Shan wrote:
> Hi Marc,
>
> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>> On Wed, 03 Aug 2022 04:01:04 +0100,
>> Gavin Shan <gshan@redhat.com> wrote:
>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>> The specific region doesn't fit in the PA space. However, the base
>>>>> address and highest_gpa are still updated no matter if the region
>>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>>> space.
>>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>>> selectable
>>>>
>>>> Only highmem ecam depends on machine type & ACPI setup. But I would
>>>> say
>>>> that in server use case it is always set. So is that optimization
>>>> really
>>>> needed?
>>>
>>> There are two other cases you missed.
>>>
>>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>>    before that.
>>
>> I don't get this. The current behaviour is to disable highmem_ecam if
>> it doesn't fit in the PA space. I can't see anything that enables it
>> if it was disabled the first place.
>>
>
> There are several places or conditions where vms->highmem_ecam can be
> disabled:
>
> - virt_instance_init() where vms->highmem_ecam is inherited from
>   !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>   in virt_machine_2_12_options().
>
> - machvirt_init() where vms->highmem_ecam can be disable if we have
>   32-bits vCPUs and failure on loading firmware.
>
> - Another place is where we're talking about. It's address assignment
>   to fit the PA space.
>
>>>
>>> - The high memory region can be disabled if user is asking large
>>>    (normal) memory space through 'maxmem=' option. When the requested
>>>    memory by 'maxmem=' is large enough, the high memory regions are
>>>    disabled. It means the normal memory has higher priority than those
>>>    high memory regions. This is the case I provided in (b) of the
>>>    commit log.
>>
>> Why is that a problem? It matches the expected behaviour, as the
>> highmem IO region is floating and is pushed up by the memory region.
>>
>
> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
> aren't user selectable. I tended to explain why it's not true. 'maxmem='
> can affect the outcome. When 'maxmem=' value is big enough, there will be
> no free area in the PA space to hold those two regions.
>
>>>
>>> In the commit log, I was supposed to say something like below for
>>> (a):
>>>
>>> - The specific high memory region can be disabled through changing
>>>    the code by user or developer. For example, 'vms->highmem_mmio'
>>>    is changed from true to false in virt_instance_init().
>>
>> Huh. By this principle, the user can change anything. Why is it
>> important?
>>
>
> Still like above. I was explaining the possible cases where those
> 3 switches can be turned on/off by users or developers. Our code
> needs to be consistent and comprehensive.
>
>   vms->highmem_redists
>   vms->highmem_ecam
>   vms->mmio
>
>>>
>>>>>
>>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>
>> I guess that this is what I understand the least. What do you mean by
>> "wasted PA space"? Either the regions fit in the PA space, and
>> computing their addresses in relevant, or they fall outside of it and
>> what we stick in memap[index].base is completely irrelevant.
>>
>
> It's possible that we run into the following combination. we should
> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
> the region is disabled in the original implementation because
> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
> unecessary and waste in the PA space.
each region's base is aligned on its size.
>
>     static MemMapEntry extended_memmap[] = {
>         [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>         [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>         [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
so anyway MMIO is at least at 512GB. Having a 1TB IPA space does not
imply any amount of RAM. This depends on the address space.
I    };
>
>     IPA_LIMIT           = (1UL << 40)
>     '-maxmem'           = 511GB              /* Memory starts from 1GB */
>     '-slots'            = 0
>     vms->highmem_rdist2 = false
How can this happen? the only reason for highmem_redists to be reset is
if it does not fit into map_ipa. So if mmio fits, highmem_redists does
too. What do I miss?
>     vms->highmem_ecam   = false
    vms->highmem_mmio   = true
I am still skeptic about the relevance of such optim. Isn't it sensible
to expect the host to support large IPA if you want to use such amount
of memory.
I think we should rather better document the memory map from a user
point of view.
Besides if you change the memory map, you need to introduce yet another
option to avoid breaking migration, no?

Eric
>
>>>>>
>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>> ---
>>>>>    hw/arm/virt.c | 54
>>>>> +++++++++++++++++++++++++++++----------------------
>>>>>    1 file changed, 31 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>> index 9633f822f3..bc0cd218f9 100644
>>>>> --- a/hw/arm/virt.c
>>>>> +++ b/hw/arm/virt.c
>>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>>        return arm_cpu_mp_affinity(idx, clustersz);
>>>>>    }
>>>>>    +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>>> +                             bool *enabled, hwaddr *base, int
>>>>> pa_bits)
>>>>> +{
>>>>> +    hwaddr size = extended_memmap[index].size;
>>>>> +
>>>>> +    /* The region will be disabled if its size isn't given */
>>>>> +    if (!*enabled || !size) {
>>>> In which case do you have !size?
>>>
>>> Yeah, we don't have !size and the condition should be removed.
>>>
>>>>> +        *enabled = false;
>>>>> +        vms->memmap[index].base = 0;
>>>>> +        vms->memmap[index].size = 0;
>>>> It looks dangerous to me to reset the region's base and size like
>>>> that.
>>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>>
>>> I would guess you missed that the high memory regions won't be exported
>>> through device-tree if they have been disabled. We have a check there,
>>> which is "if (nb_redist_regions == 1)"
>>>
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * Check if the memory region fits in the PA space. The
>>>>> memory map
>>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>>> disabled.
>>>>> +     */
>>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>> using a 'fits' local variable would make the code more obvious I think
>>>
>>> Lets confirm if you're suggesting something like below?
>>>
>>>          bool fits;
>>>
>>>          fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>
>>>          if (fits) {
>>>             /* update *base, memory mapping, highest_gpa */
>>>          } else {
>>>             *enabled = false;
>>>          }
>>>
>>> I guess we can simply do
>>>
>>>          if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>>             /* update *base, memory mapping, highest_gpa */
>>>          } else {
>>>             *enabled = false;
>>>          }
>>>
>>> Please let me know which one looks best to you.
>>
>> Why should the 'enabled' flag be updated by this function, instead of
>> returning the value and keeping it as an assignment in the caller
>> function? It is purely stylistic though.
>>
>
> The idea to put the logic, address assignment for those 3 high memory
> regions or updating the flags (vms->high_redist2/ecam/mmio), to the
> newly introduced function, to make virt_set_memmap() a bit simplified.
> Eric tends to agree the changes with minor adjustments. So I'm going
> to keep it as of being, which doesn't mean the stylistic thought is
> a bad one :)
>
> Lastly, I need to rewrite the comit log completely because it seems
> causing confusions to Eric and you.
>
> Thanks,
> Gavin
>
Gavin Shan Aug. 3, 2022, 1:02 p.m. UTC | #7
Hi Marc,

On 8/3/22 5:01 PM, Marc Zyngier wrote:
> On Wed, 03 Aug 2022 04:01:04 +0100,
> Gavin Shan <gshan@redhat.com> wrote:
>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>> two situations: (a) The specific region is disabled by user. (b)
>>>> The specific region doesn't fit in the PA space. However, the base
>>>> address and highest_gpa are still updated no matter if the region
>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>> space.
>>> If I am not wrong highmem_redists and highmem_mmio are not user selectable
>>>
>>> Only highmem ecam depends on machine type & ACPI setup. But I would say
>>> that in server use case it is always set. So is that optimization really
>>> needed?
>>
>> There are two other cases you missed.
>>
>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>    before that.
> 
> I don't get this. The current behaviour is to disable highmem_ecam if
> it doesn't fit in the PA space. I can't see anything that enables it
> if it was disabled the first place.
> 

There are several places or conditions where vms->highmem_ecam can be
disabled:

- virt_instance_init() where vms->highmem_ecam is inherited from
   !vmc->no_highmem_ecam. The option is set to true after virt-2.12
   in virt_machine_2_12_options().

- machvirt_init() where vms->highmem_ecam can be disable if we have
   32-bits vCPUs and failure on loading firmware.

- Another place is where we're talking about. It's address assignment
   to fit the PA space.

>>
>> - The high memory region can be disabled if user is asking large
>>    (normal) memory space through 'maxmem=' option. When the requested
>>    memory by 'maxmem=' is large enough, the high memory regions are
>>    disabled. It means the normal memory has higher priority than those
>>    high memory regions. This is the case I provided in (b) of the
>>    commit log.
> 
> Why is that a problem? It matches the expected behaviour, as the
> highmem IO region is floating and is pushed up by the memory region.
> 

Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
aren't user selectable. I tended to explain why it's not true. 'maxmem='
can affect the outcome. When 'maxmem=' value is big enough, there will be
no free area in the PA space to hold those two regions.

>>
>> In the commit log, I was supposed to say something like below for
>> (a):
>>
>> - The specific high memory region can be disabled through changing
>>    the code by user or developer. For example, 'vms->highmem_mmio'
>>    is changed from true to false in virt_instance_init().
> 
> Huh. By this principle, the user can change anything. Why is it
> important?
> 

Still like above. I was explaining the possible cases where those
3 switches can be turned on/off by users or developers. Our code
needs to be consistent and comprehensive.

   vms->highmem_redists
   vms->highmem_ecam
   vms->mmio

>>
>>>>
>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>> in the PA space by putting the logic into virt_memmap_fits().
> 
> I guess that this is what I understand the least. What do you mean by
> "wasted PA space"? Either the regions fit in the PA space, and
> computing their addresses in relevant, or they fall outside of it and
> what we stick in memap[index].base is completely irrelevant.
> 

It's possible that we run into the following combination. we should
have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
the region is disabled in the original implementation because
VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
unecessary and waste in the PA space.

     static MemMapEntry extended_memmap[] = {
         [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
         [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
         [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
     };

     IPA_LIMIT           = (1UL << 40)
     '-maxmem'           = 511GB              /* Memory starts from 1GB */
     '-slots'            = 0
     vms->highmem_rdist2 = false
     vms->highmem_ecam   = false
     vms->highmem_mmio   = true

>>>>
>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>> ---
>>>>    hw/arm/virt.c | 54 +++++++++++++++++++++++++++++----------------------
>>>>    1 file changed, 31 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>> index 9633f822f3..bc0cd218f9 100644
>>>> --- a/hw/arm/virt.c
>>>> +++ b/hw/arm/virt.c
>>>> @@ -1688,6 +1688,34 @@ static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>        return arm_cpu_mp_affinity(idx, clustersz);
>>>>    }
>>>>    +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>> +                             bool *enabled, hwaddr *base, int pa_bits)
>>>> +{
>>>> +    hwaddr size = extended_memmap[index].size;
>>>> +
>>>> +    /* The region will be disabled if its size isn't given */
>>>> +    if (!*enabled || !size) {
>>> In which case do you have !size?
>>
>> Yeah, we don't have !size and the condition should be removed.
>>
>>>> +        *enabled = false;
>>>> +        vms->memmap[index].base = 0;
>>>> +        vms->memmap[index].size = 0;
>>> It looks dangerous to me to reset the region's base and size like that.
>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>
>> I would guess you missed that the high memory regions won't be exported
>> through device-tree if they have been disabled. We have a check there,
>> which is "if (nb_redist_regions == 1)"
>>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Check if the memory region fits in the PA space. The memory map
>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
>>>> +     */
>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>> using a 'fits' local variable would make the code more obvious I think
>>
>> Lets confirm if you're suggesting something like below?
>>
>>          bool fits;
>>
>>          fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>
>>          if (fits) {
>>             /* update *base, memory mapping, highest_gpa */
>>          } else {
>>             *enabled = false;
>>          }
>>
>> I guess we can simply do
>>
>>          if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>             /* update *base, memory mapping, highest_gpa */
>>          } else {
>>             *enabled = false;
>>          }
>>
>> Please let me know which one looks best to you.
> 
> Why should the 'enabled' flag be updated by this function, instead of
> returning the value and keeping it as an assignment in the caller
> function? It is purely stylistic though.
> 

The idea to put the logic, address assignment for those 3 high memory
regions or updating the flags (vms->high_redist2/ecam/mmio), to the
newly introduced function, to make virt_set_memmap() a bit simplified.
Eric tends to agree the changes with minor adjustments. So I'm going
to keep it as of being, which doesn't mean the stylistic thought is
a bad one :)

Lastly, I need to rewrite the comit log completely because it seems
causing confusions to Eric and you.

Thanks,
Gavin
Gavin Shan Aug. 3, 2022, 1:11 p.m. UTC | #8
Hi Eric,

On 8/3/22 6:44 PM, Eric Auger wrote:
> On 8/3/22 05:01, Gavin Shan wrote:
>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>> two situations: (a) The specific region is disabled by user. (b)
>>>> The specific region doesn't fit in the PA space. However, the base
>>>> address and highest_gpa are still updated no matter if the region
>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>> space.
>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>> selectable
>>>
>>> Only highmem ecam depends on machine type & ACPI setup. But I would say
>>> that in server use case it is always set. So is that optimization really
>>> needed?
>>
>> There are two other cases you missed.
>>
>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>    before that.
> Yes that's what I meant above by 'depends on machine type'

Ok.

>>
>> - The high memory region can be disabled if user is asking large
>>    (normal) memory space through 'maxmem=' option. When the requested
>>    memory by 'maxmem=' is large enough, the high memory regions are
>>    disabled. It means the normal memory has higher priority than those
>>    high memory regions. This is the case I provided in (b) of the
>>    commit log.
> yes but in such a case you don't "waste" IPA as you mention in the
> commit log because you only ask for a VM dimensionned for the highest_gpa.
> The only case where you would "waste" IPA is for high ecam which can
> disabled by option combination but it is marginal.
> 

Ok, I've explain this to Marc in another reply. In short, we possibly
have below combination. the 'highmem_mmio' region isn't enabled as
we expect. The reason is 'highmem_rdist2' and 'highmem_ecam' consumes
1GB, which is unnecessary because both regions are disabled in advance.

Note: system memory starts from 1GB.

    qemu -m 4096M -maxmem=511G

    IPA_LIMIT  = (1UL << 40)
    vms->highmem_rdist2 = false              /* 64MB  */
    vms->highmem_ecam   = false              /* 256MB */
    vms->highmem_mmio   = true               /* 512GB */

>>
>> In the commit log, I was supposed to say something like below for
>> (a):
>>
>> - The specific high memory region can be disabled through changing
>>    the code by user or developer. For example, 'vms->highmem_mmio'
>>    is changed from true to false in virt_instance_init().
>>
>>>>
>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>>
>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>> ---
>>>>    hw/arm/virt.c | 54
>>>> +++++++++++++++++++++++++++++----------------------
>>>>    1 file changed, 31 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>> index 9633f822f3..bc0cd218f9 100644
>>>> --- a/hw/arm/virt.c
>>>> +++ b/hw/arm/virt.c
>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>        return arm_cpu_mp_affinity(idx, clustersz);
>>>>    }
>>>>    +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>> +                             bool *enabled, hwaddr *base, int pa_bits)
>>>> +{
>>>> +    hwaddr size = extended_memmap[index].size;
>>>> +
>>>> +    /* The region will be disabled if its size isn't given */
>>>> +    if (!*enabled || !size) {
>>> In which case do you have !size?
>>
>> Yeah, we don't have !size and the condition should be removed.
>>
>>>> +        *enabled = false;
>>>> +        vms->memmap[index].base = 0;
>>>> +        vms->memmap[index].size = 0;
>>> It looks dangerous to me to reset the region's base and size like that.
>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>
>> I would guess you missed that the high memory regions won't be exported
>> through device-tree if they have been disabled. We have a check there,
>> which is "if (nb_redist_regions == 1)"
> OK I missed a check was added in virt_gicv3_redist_region_count.
> Nevertheless, your comment "The region will be disabled if its size
> isn't given */ is not obvious to me. To me the region is disabled if the
> corresponding flag is not set. From your comment I have the impression
> the size is checked to see if the region is exposed, it does not look
> obvious.

Ok :)

>>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Check if the memory region fits in the PA space. The memory map
>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>> disabled.
>>>> +     */
>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>> using a 'fits' local variable would make the code more obvious I think
>>
>> Lets confirm if you're suggesting something like below?
>>
>>          bool fits;
>>
>>          fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>
>>          if (fits) {
>>             /* update *base, memory mapping, highest_gpa */
>>          } else {
>>             *enabled = false;
>>          }
> yes that's what I suggested.

Yeah, it's more obvious. I would hold to post v2 to see if Marc will
have more comments.

>>
>> I guess we can simply do
>>
>>          if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>             /* update *base, memory mapping, highest_gpa */
>>          } else {
>>             *enabled = false;
>>          }
>>
>> Please let me know which one looks best to you.
>>
>>>> +    if (*enabled) {
>>>> +        *base = ROUND_UP(*base, size);
>>>> +        vms->memmap[index].base = *base;
>>>> +        vms->memmap[index].size = size;
>>>> +        vms->highest_gpa = *base + size - 1;
>>>> +
>>>> +    *base = *base + size;
>>>> +    }
>>>> +}
>>>> +
>>>>    static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
>>>>    {
>>>>        MachineState *ms = MACHINE(vms);
>>>> @@ -1744,37 +1772,17 @@ static void virt_set_memmap(VirtMachineState
>>>> *vms, int pa_bits)
>>>>        vms->highest_gpa = memtop - 1;
>>>>          for (i = VIRT_LOWMEMMAP_LAST; i <
>>>> ARRAY_SIZE(extended_memmap); i++) {
>>>> -        hwaddr size = extended_memmap[i].size;
>>>> -        bool fits;
>>>> -
>>>> -        base = ROUND_UP(base, size);
>>>> -        vms->memmap[i].base = base;
>>>> -        vms->memmap[i].size = size;
>>>> -
>>>> -        /*
>>>> -         * Check each device to see if they fit in the PA space,
>>>> -         * moving highest_gpa as we go.
>>>> -         *
>>>> -         * For each device that doesn't fit, disable it.
>>>> -         */
>>>> -        fits = (base + size) <= BIT_ULL(pa_bits);
>>>> -        if (fits) {
>>>> -            vms->highest_gpa = base + size - 1;
>>>> -        }
>>>> -
>>>
>>> we could avoid running the code below in case highmem is not set. We
>>> would need to reset that flags though.
>>>
>>
>> Yeah, I think it's not a big deal. My though is to update the flag in
>> virt_memmap_fits().
>>
>>>>            switch (i) {
>>>>            case VIRT_HIGH_GIC_REDIST2:
>>>> -            vms->highmem_redists &= fits;
>>>> +            virt_memmap_fits(vms, i, &vms->highmem_redists, &base,
>>>> pa_bits);
>>>>                break;
>>>>            case VIRT_HIGH_PCIE_ECAM:
>>>> -            vms->highmem_ecam &= fits;
>>>> +            virt_memmap_fits(vms, i, &vms->highmem_ecam, &base,
>>>> pa_bits);
>>>>                break;
>>>>            case VIRT_HIGH_PCIE_MMIO:
>>>> -            vms->highmem_mmio &= fits;
>>>> +            virt_memmap_fits(vms, i, &vms->highmem_mmio, &base,
>>>> pa_bits);
>>>>                break;
>>>>            }
>>>> -
>>>> -        base += size;
>>>>        }
>>>>          if (device_memory_size > 0) {

Thanks,
Gavin
Gavin Shan Aug. 4, 2022, 2:47 a.m. UTC | #9
Hi Eric,

On 8/3/22 10:52 PM, Eric Auger wrote:
> On 8/3/22 15:02, Gavin Shan wrote:
>> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>>> On Wed, 03 Aug 2022 04:01:04 +0100,
>>> Gavin Shan <gshan@redhat.com> wrote:
>>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>>> The specific region doesn't fit in the PA space. However, the base
>>>>>> address and highest_gpa are still updated no matter if the region
>>>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>>>> space.
>>>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>>>> selectable
>>>>>
>>>>> Only highmem ecam depends on machine type & ACPI setup. But I would
>>>>> say
>>>>> that in server use case it is always set. So is that optimization
>>>>> really
>>>>> needed?
>>>>
>>>> There are two other cases you missed.
>>>>
>>>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>>>     before that.
>>>
>>> I don't get this. The current behaviour is to disable highmem_ecam if
>>> it doesn't fit in the PA space. I can't see anything that enables it
>>> if it was disabled the first place.
>>>
>>
>> There are several places or conditions where vms->highmem_ecam can be
>> disabled:
>>
>> - virt_instance_init() where vms->highmem_ecam is inherited from
>>    !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>>    in virt_machine_2_12_options().
>>
>> - machvirt_init() where vms->highmem_ecam can be disable if we have
>>    32-bits vCPUs and failure on loading firmware.
>>
>> - Another place is where we're talking about. It's address assignment
>>    to fit the PA space.
>>
>>>>
>>>> - The high memory region can be disabled if user is asking large
>>>>     (normal) memory space through 'maxmem=' option. When the requested
>>>>     memory by 'maxmem=' is large enough, the high memory regions are
>>>>     disabled. It means the normal memory has higher priority than those
>>>>     high memory regions. This is the case I provided in (b) of the
>>>>     commit log.
>>>
>>> Why is that a problem? It matches the expected behaviour, as the
>>> highmem IO region is floating and is pushed up by the memory region.
>>>
>>
>> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
>> aren't user selectable. I tended to explain why it's not true. 'maxmem='
>> can affect the outcome. When 'maxmem=' value is big enough, there will be
>> no free area in the PA space to hold those two regions.
>>
>>>>
>>>> In the commit log, I was supposed to say something like below for
>>>> (a):
>>>>
>>>> - The specific high memory region can be disabled through changing
>>>>     the code by user or developer. For example, 'vms->highmem_mmio'
>>>>     is changed from true to false in virt_instance_init().
>>>
>>> Huh. By this principle, the user can change anything. Why is it
>>> important?
>>>
>>
>> Still like above. I was explaining the possible cases where those
>> 3 switches can be turned on/off by users or developers. Our code
>> needs to be consistent and comprehensive.
>>
>>    vms->highmem_redists
>>    vms->highmem_ecam
>>    vms->mmio
>>
>>>>
>>>>>>
>>>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>
>>> I guess that this is what I understand the least. What do you mean by
>>> "wasted PA space"? Either the regions fit in the PA space, and
>>> computing their addresses in relevant, or they fall outside of it and
>>> what we stick in memap[index].base is completely irrelevant.
>>>
>>
>> It's possible that we run into the following combination. we should
>> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
>> the region is disabled in the original implementation because
>> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
>> unecessary and waste in the PA space.
> each region's base is aligned on its size.

Yes.

>>
>>      static MemMapEntry extended_memmap[] = {
>>          [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>>          [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>>          [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
> so anyway MMIO is at least at 512GB. Having a 1TB IPA space does not
> imply any amount of RAM. This depends on the address space.
> I    };

Yes. Prior to the start of system memory, there is 1GB used by
various regions either.

>>
>>      IPA_LIMIT           = (1UL << 40)
>>      '-maxmem'           = 511GB              /* Memory starts from 1GB */
>>      '-slots'            = 0
>>      vms->highmem_rdist2 = false
> How can this happen? the only reason for highmem_redists to be reset is
> if it does not fit into map_ipa. So if mmio fits, highmem_redists does
> too. What do I miss?

The example is having "vms->highmem_rdist2 = flase" BEFORE the address
assignment, it's possible that developer changes the code to disable
it intentionally. The point is the original implementation isn't comprehensive
because it has the wrong assumption that vms->highmem_{rdist2, ecam, mmio} all
true before the address assignment. With the wrong assumption, the base address
is always increased, even the previous region is disabled, during the
address assignment in virt_set_memmap().


>>      vms->highmem_ecam   = false
>      vms->highmem_mmio   = true
> I am still skeptic about the relevance of such optim. Isn't it sensible
> to expect the host to support large IPA if you want to use such amount
> of memory.
> I think we should rather better document the memory map from a user
> point of view.
> Besides if you change the memory map, you need to introduce yet another
> option to avoid breaking migration, no?
> 

These 3 high memory regions consumes 513GB with alignment considered when
all of them are enabled. The point is those 3 high memory regions, or part
of them can be squeezed or smashed to accommodate '-maxmem' by design. I
think there are two options I can think of. I personally prefer to keep
the capability. With this, users gain broader range for their '-maxmem'.
Please let me know your preference.

- Revert the capability of squeezing or smashing those high memory regions
   to accommodate '-maxmem'. It means vms->high_{redist2, ecam, mmio} can't
   be disable on conflicts to the PA space.

- Keep the capability, with this optimization applied to make the implementation
   comprehensive.

I think it's worthy to add something about this limitation in doc/system/arm/virt.rst.

I don't think I need introduce another option to avoid migration breakage.
Could you explain why I need the extra option?

>>
>>>>>>
>>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>>> ---
>>>>>>     hw/arm/virt.c | 54
>>>>>> +++++++++++++++++++++++++++++----------------------
>>>>>>     1 file changed, 31 insertions(+), 23 deletions(-)
>>>>>>
>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>> index 9633f822f3..bc0cd218f9 100644
>>>>>> --- a/hw/arm/virt.c
>>>>>> +++ b/hw/arm/virt.c
>>>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>>>         return arm_cpu_mp_affinity(idx, clustersz);
>>>>>>     }
>>>>>>     +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>>>> +                             bool *enabled, hwaddr *base, int
>>>>>> pa_bits)
>>>>>> +{
>>>>>> +    hwaddr size = extended_memmap[index].size;
>>>>>> +
>>>>>> +    /* The region will be disabled if its size isn't given */
>>>>>> +    if (!*enabled || !size) {
>>>>> In which case do you have !size?
>>>>
>>>> Yeah, we don't have !size and the condition should be removed.
>>>>
>>>>>> +        *enabled = false;
>>>>>> +        vms->memmap[index].base = 0;
>>>>>> +        vms->memmap[index].size = 0;
>>>>> It looks dangerous to me to reset the region's base and size like
>>>>> that.
>>>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>>>
>>>> I would guess you missed that the high memory regions won't be exported
>>>> through device-tree if they have been disabled. We have a check there,
>>>> which is "if (nb_redist_regions == 1)"
>>>>
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    /*
>>>>>> +     * Check if the memory region fits in the PA space. The
>>>>>> memory map
>>>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>>>> disabled.
>>>>>> +     */
>>>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>> using a 'fits' local variable would make the code more obvious I think
>>>>
>>>> Lets confirm if you're suggesting something like below?
>>>>
>>>>           bool fits;
>>>>
>>>>           fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>
>>>>           if (fits) {
>>>>              /* update *base, memory mapping, highest_gpa */
>>>>           } else {
>>>>              *enabled = false;
>>>>           }
>>>>
>>>> I guess we can simply do
>>>>
>>>>           if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>>>              /* update *base, memory mapping, highest_gpa */
>>>>           } else {
>>>>              *enabled = false;
>>>>           }
>>>>
>>>> Please let me know which one looks best to you.
>>>
>>> Why should the 'enabled' flag be updated by this function, instead of
>>> returning the value and keeping it as an assignment in the caller
>>> function? It is purely stylistic though.
>>>
>>
>> The idea to put the logic, address assignment for those 3 high memory
>> regions or updating the flags (vms->high_redist2/ecam/mmio), to the
>> newly introduced function, to make virt_set_memmap() a bit simplified.
>> Eric tends to agree the changes with minor adjustments. So I'm going
>> to keep it as of being, which doesn't mean the stylistic thought is
>> a bad one :)
>>
>> Lastly, I need to rewrite the comit log completely because it seems
>> causing confusions to Eric and you.
>>

Thanks,
Gavin
Eric Auger Aug. 4, 2022, 7:19 a.m. UTC | #10
On 8/4/22 04:47, Gavin Shan wrote:
> Hi Eric,
>
> On 8/3/22 10:52 PM, Eric Auger wrote:
>> On 8/3/22 15:02, Gavin Shan wrote:
>>> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>>>> On Wed, 03 Aug 2022 04:01:04 +0100,
>>>> Gavin Shan <gshan@redhat.com> wrote:
>>>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>>>> The specific region doesn't fit in the PA space. However, the base
>>>>>>> address and highest_gpa are still updated no matter if the region
>>>>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>>>>> space.
>>>>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>>>>> selectable
>>>>>>
>>>>>> Only highmem ecam depends on machine type & ACPI setup. But I would
>>>>>> say
>>>>>> that in server use case it is always set. So is that optimization
>>>>>> really
>>>>>> needed?
>>>>>
>>>>> There are two other cases you missed.
>>>>>
>>>>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>>>>     before that.
>>>>
>>>> I don't get this. The current behaviour is to disable highmem_ecam if
>>>> it doesn't fit in the PA space. I can't see anything that enables it
>>>> if it was disabled the first place.
>>>>
>>>
>>> There are several places or conditions where vms->highmem_ecam can be
>>> disabled:
>>>
>>> - virt_instance_init() where vms->highmem_ecam is inherited from
>>>    !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>>>    in virt_machine_2_12_options().
>>>
>>> - machvirt_init() where vms->highmem_ecam can be disable if we have
>>>    32-bits vCPUs and failure on loading firmware.
>>>
>>> - Another place is where we're talking about. It's address assignment
>>>    to fit the PA space.
>>>
>>>>>
>>>>> - The high memory region can be disabled if user is asking large
>>>>>     (normal) memory space through 'maxmem=' option. When the
>>>>> requested
>>>>>     memory by 'maxmem=' is large enough, the high memory regions are
>>>>>     disabled. It means the normal memory has higher priority than
>>>>> those
>>>>>     high memory regions. This is the case I provided in (b) of the
>>>>>     commit log.
>>>>
>>>> Why is that a problem? It matches the expected behaviour, as the
>>>> highmem IO region is floating and is pushed up by the memory region.
>>>>
>>>
>>> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
>>> aren't user selectable. I tended to explain why it's not true.
>>> 'maxmem='
>>> can affect the outcome. When 'maxmem=' value is big enough, there
>>> will be
>>> no free area in the PA space to hold those two regions.
>>>
>>>>>
>>>>> In the commit log, I was supposed to say something like below for
>>>>> (a):
>>>>>
>>>>> - The specific high memory region can be disabled through changing
>>>>>     the code by user or developer. For example, 'vms->highmem_mmio'
>>>>>     is changed from true to false in virt_instance_init().
>>>>
>>>> Huh. By this principle, the user can change anything. Why is it
>>>> important?
>>>>
>>>
>>> Still like above. I was explaining the possible cases where those
>>> 3 switches can be turned on/off by users or developers. Our code
>>> needs to be consistent and comprehensive.
>>>
>>>    vms->highmem_redists
>>>    vms->highmem_ecam
>>>    vms->mmio
>>>
>>>>>
>>>>>>>
>>>>>>> Improve address assignment for highmem IO regions to avoid the
>>>>>>> waste
>>>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>>
>>>> I guess that this is what I understand the least. What do you mean by
>>>> "wasted PA space"? Either the regions fit in the PA space, and
>>>> computing their addresses in relevant, or they fall outside of it and
>>>> what we stick in memap[index].base is completely irrelevant.
>>>>
>>>
>>> It's possible that we run into the following combination. we should
>>> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
>>> the region is disabled in the original implementation because
>>> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
>>> unecessary and waste in the PA space.
>> each region's base is aligned on its size.
>
> Yes.
>
>>>
>>>      static MemMapEntry extended_memmap[] = {
>>>          [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>>>          [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>>>          [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
>> so anyway MMIO is at least at 512GB. Having a 1TB IPA space does not
>> imply any amount of RAM. This depends on the address space.
>> I    };
>
> Yes. Prior to the start of system memory, there is 1GB used by
> various regions either.
yes
>
>>>
>>>      IPA_LIMIT           = (1UL << 40)
>>>      '-maxmem'           = 511GB              /* Memory starts from
>>> 1GB */
>>>      '-slots'            = 0
>>>      vms->highmem_rdist2 = false
>> How can this happen? the only reason for highmem_redists to be reset is
>> if it does not fit into map_ipa. So if mmio fits, highmem_redists does
>> too. What do I miss?
>
> The example is having "vms->highmem_rdist2 = flase" BEFORE the address
> assignment, it's possible that developer changes the code to disable
> it intentionally. The point is the original implementation isn't
> comprehensive
> because it has the wrong assumption that vms->highmem_{rdist2, ecam,
> mmio} all
> true before the address assignment. With the wrong assumption, the
> base address
> is always increased, even the previous region is disabled, during the
> address assignment in virt_set_memmap().
Yes we currently always provision space for those functionalities,
independently on whether they are used. But that's true for many other
regions in the address map (although smaller) ;-)
>
>
>>>      vms->highmem_ecam   = false
>>      vms->highmem_mmio   = true
>> I am still skeptic about the relevance of such optim. Isn't it sensible
>> to expect the host to support large IPA if you want to use such amount
>> of memory.
>> I think we should rather better document the memory map from a user
>> point of view.
>> Besides if you change the memory map, you need to introduce yet another
>> option to avoid breaking migration, no?
>>
>
> These 3 high memory regions consumes 513GB with alignment considered when
> all of them are enabled. The point is those 3 high memory regions, or
> part
> of them can be squeezed or smashed to accommodate '-maxmem' by design. I
> think there are two options I can think of. I personally prefer to keep
> the capability. With this, users gain broader range for their '-maxmem'.
> Please let me know your preference.
>
> - Revert the capability of squeezing or smashing those high memory
> regions
>   to accommodate '-maxmem'. It means vms->high_{redist2, ecam, mmio}
> can't
>   be disable on conflicts to the PA space.
>
> - Keep the capability, with this optimization applied to make the
> implementation
>   comprehensive.
>
> I think it's worthy to add something about this limitation in
> doc/system/arm/virt.rst.
indeed that's worth in any case.
>
> I don't think I need introduce another option to avoid migration
> breakage.
> Could you explain why I need the extra option?
I think you do. Before and after this patch the QEMU memory regions
associated to those devices won't be a the same location in the memory
so if you migrate from an old version to a newer one, the guest won't be
able to access them

OK I have given my own opinion about those potential changes. Let's wait
for others' ;-)

Eric
>
>>>
>>>>>>>
>>>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>>>> ---
>>>>>>>     hw/arm/virt.c | 54
>>>>>>> +++++++++++++++++++++++++++++----------------------
>>>>>>>     1 file changed, 31 insertions(+), 23 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>> index 9633f822f3..bc0cd218f9 100644
>>>>>>> --- a/hw/arm/virt.c
>>>>>>> +++ b/hw/arm/virt.c
>>>>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>>>>         return arm_cpu_mp_affinity(idx, clustersz);
>>>>>>>     }
>>>>>>>     +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>>>>> +                             bool *enabled, hwaddr *base, int
>>>>>>> pa_bits)
>>>>>>> +{
>>>>>>> +    hwaddr size = extended_memmap[index].size;
>>>>>>> +
>>>>>>> +    /* The region will be disabled if its size isn't given */
>>>>>>> +    if (!*enabled || !size) {
>>>>>> In which case do you have !size?
>>>>>
>>>>> Yeah, we don't have !size and the condition should be removed.
>>>>>
>>>>>>> +        *enabled = false;
>>>>>>> +        vms->memmap[index].base = 0;
>>>>>>> +        vms->memmap[index].size = 0;
>>>>>> It looks dangerous to me to reset the region's base and size like
>>>>>> that.
>>>>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>>>>
>>>>> I would guess you missed that the high memory regions won't be
>>>>> exported
>>>>> through device-tree if they have been disabled. We have a check
>>>>> there,
>>>>> which is "if (nb_redist_regions == 1)"
>>>>>
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * Check if the memory region fits in the PA space. The
>>>>>>> memory map
>>>>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>>>>> disabled.
>>>>>>> +     */
>>>>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>>> using a 'fits' local variable would make the code more obvious I
>>>>>> think
>>>>>
>>>>> Lets confirm if you're suggesting something like below?
>>>>>
>>>>>           bool fits;
>>>>>
>>>>>           fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>>
>>>>>           if (fits) {
>>>>>              /* update *base, memory mapping, highest_gpa */
>>>>>           } else {
>>>>>              *enabled = false;
>>>>>           }
>>>>>
>>>>> I guess we can simply do
>>>>>
>>>>>           if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>>>>              /* update *base, memory mapping, highest_gpa */
>>>>>           } else {
>>>>>              *enabled = false;
>>>>>           }
>>>>>
>>>>> Please let me know which one looks best to you.
>>>>
>>>> Why should the 'enabled' flag be updated by this function, instead of
>>>> returning the value and keeping it as an assignment in the caller
>>>> function? It is purely stylistic though.
>>>>
>>>
>>> The idea to put the logic, address assignment for those 3 high memory
>>> regions or updating the flags (vms->high_redist2/ecam/mmio), to the
>>> newly introduced function, to make virt_set_memmap() a bit simplified.
>>> Eric tends to agree the changes with minor adjustments. So I'm going
>>> to keep it as of being, which doesn't mean the stylistic thought is
>>> a bad one :)
>>>
>>> Lastly, I need to rewrite the comit log completely because it seems
>>> causing confusions to Eric and you.
>>>
>
> Thanks,
> Gavin
>
Eric Auger Aug. 5, 2022, 8 a.m. UTC | #11
Hi Gavin,

On 8/5/22 10:36, Gavin Shan wrote:
> Hi Eric,
>
> On 8/4/22 5:19 PM, Eric Auger wrote:
>> On 8/4/22 04:47, Gavin Shan wrote:
>>> On 8/3/22 10:52 PM, Eric Auger wrote:
>>>> On 8/3/22 15:02, Gavin Shan wrote:
>>>>> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>>>>>> On Wed, 03 Aug 2022 04:01:04 +0100,
>>>>>> Gavin Shan <gshan@redhat.com> wrote:
>>>>>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>>>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>>>>>> The specific region doesn't fit in the PA space. However, the
>>>>>>>>> base
>>>>>>>>> address and highest_gpa are still updated no matter if the region
>>>>>>>>> is enabled or disabled. It's incorrectly incurring waste in
>>>>>>>>> the PA
>>>>>>>>> space.
>>>>>>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>>>>>>> selectable
>>>>>>>>
>>>>>>>> Only highmem ecam depends on machine type & ACPI setup. But I
>>>>>>>> would
>>>>>>>> say
>>>>>>>> that in server use case it is always set. So is that optimization
>>>>>>>> really
>>>>>>>> needed?
>>>>>>>
>>>>>>> There are two other cases you missed.
>>>>>>>
>>>>>>> - highmem_ecam is enabled after virt-2.12, meaning it stays
>>>>>>> disabled
>>>>>>>      before that.
>>>>>>
>>>>>> I don't get this. The current behaviour is to disable
>>>>>> highmem_ecam if
>>>>>> it doesn't fit in the PA space. I can't see anything that enables it
>>>>>> if it was disabled the first place.
>>>>>>
>>>>>
>>>>> There are several places or conditions where vms->highmem_ecam can be
>>>>> disabled:
>>>>>
>>>>> - virt_instance_init() where vms->highmem_ecam is inherited from
>>>>>     !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>>>>>     in virt_machine_2_12_options().
>>>>>
>>>>> - machvirt_init() where vms->highmem_ecam can be disable if we have
>>>>>     32-bits vCPUs and failure on loading firmware.
>>>>>
>>>>> - Another place is where we're talking about. It's address assignment
>>>>>     to fit the PA space.
>>>>>
>>>>>>>
>>>>>>> - The high memory region can be disabled if user is asking large
>>>>>>>      (normal) memory space through 'maxmem=' option. When the
>>>>>>> requested
>>>>>>>      memory by 'maxmem=' is large enough, the high memory
>>>>>>> regions are
>>>>>>>      disabled. It means the normal memory has higher priority than
>>>>>>> those
>>>>>>>      high memory regions. This is the case I provided in (b) of the
>>>>>>>      commit log.
>>>>>>
>>>>>> Why is that a problem? It matches the expected behaviour, as the
>>>>>> highmem IO region is floating and is pushed up by the memory region.
>>>>>>
>>>>>
>>>>> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO
>>>>> regions
>>>>> aren't user selectable. I tended to explain why it's not true.
>>>>> 'maxmem='
>>>>> can affect the outcome. When 'maxmem=' value is big enough, there
>>>>> will be
>>>>> no free area in the PA space to hold those two regions.
>>>>>
>>>>>>>
>>>>>>> In the commit log, I was supposed to say something like below for
>>>>>>> (a):
>>>>>>>
>>>>>>> - The specific high memory region can be disabled through changing
>>>>>>>      the code by user or developer. For example,
>>>>>>> 'vms->highmem_mmio'
>>>>>>>      is changed from true to false in virt_instance_init().
>>>>>>
>>>>>> Huh. By this principle, the user can change anything. Why is it
>>>>>> important?
>>>>>>
>>>>>
>>>>> Still like above. I was explaining the possible cases where those
>>>>> 3 switches can be turned on/off by users or developers. Our code
>>>>> needs to be consistent and comprehensive.
>>>>>
>>>>>     vms->highmem_redists
>>>>>     vms->highmem_ecam
>>>>>     vms->mmio
>>>>>
>>>>>>>
>>>>>>>>>
>>>>>>>>> Improve address assignment for highmem IO regions to avoid the
>>>>>>>>> waste
>>>>>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>>>>
>>>>>> I guess that this is what I understand the least. What do you
>>>>>> mean by
>>>>>> "wasted PA space"? Either the regions fit in the PA space, and
>>>>>> computing their addresses in relevant, or they fall outside of it
>>>>>> and
>>>>>> what we stick in memap[index].base is completely irrelevant.
>>>>>>
>>>>>
>>>>> It's possible that we run into the following combination. we should
>>>>> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
>>>>> the region is disabled in the original implementation because
>>>>> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
>>>>> unecessary and waste in the PA space.
>>>> each region's base is aligned on its size.
>>>
>>> Yes.
>>>
>>>>>
>>>>>       static MemMapEntry extended_memmap[] = {
>>>>>           [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>>>>>           [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>>>>>           [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
>>>> so anyway MMIO is at least at 512GB. Having a 1TB IPA space does not
>>>> imply any amount of RAM. This depends on the address space.
>>>> I    };
>>>
>>> Yes. Prior to the start of system memory, there is 1GB used by
>>> various regions either.
>> yes
>>>
>>>>>
>>>>>       IPA_LIMIT           = (1UL << 40)
>>>>>       '-maxmem'           = 511GB              /* Memory starts from
>>>>> 1GB */
>>>>>       '-slots'            = 0
>>>>>       vms->highmem_rdist2 = false
>>>> How can this happen? the only reason for highmem_redists to be
>>>> reset is
>>>> if it does not fit into map_ipa. So if mmio fits, highmem_redists does
>>>> too. What do I miss?
>>>
>>> The example is having "vms->highmem_rdist2 = flase" BEFORE the address
>>> assignment, it's possible that developer changes the code to disable
>>> it intentionally. The point is the original implementation isn't
>>> comprehensive
>>> because it has the wrong assumption that vms->highmem_{rdist2, ecam,
>>> mmio} all
>>> true before the address assignment. With the wrong assumption, the
>>> base address
>>> is always increased, even the previous region is disabled, during the
>>> address assignment in virt_set_memmap().
>> Yes we currently always provision space for those functionalities,
>> independently on whether they are used. But that's true for many other
>> regions in the address map (although smaller) ;-)
>
> Yep :)
>
>>>
>>>
>>>>>       vms->highmem_ecam   = false
>>>>       vms->highmem_mmio   = true
>>>> I am still skeptic about the relevance of such optim. Isn't it
>>>> sensible
>>>> to expect the host to support large IPA if you want to use such amount
>>>> of memory.
>>>> I think we should rather better document the memory map from a user
>>>> point of view.
>>>> Besides if you change the memory map, you need to introduce yet
>>>> another
>>>> option to avoid breaking migration, no?
>>>>
>>>
>>> These 3 high memory regions consumes 513GB with alignment considered
>>> when
>>> all of them are enabled. The point is those 3 high memory regions, or
>>> part
>>> of them can be squeezed or smashed to accommodate '-maxmem' by
>>> design. I
>>> think there are two options I can think of. I personally prefer to keep
>>> the capability. With this, users gain broader range for their
>>> '-maxmem'.
>>> Please let me know your preference.
>>>
>>> - Revert the capability of squeezing or smashing those high memory
>>> regions
>>>    to accommodate '-maxmem'. It means vms->high_{redist2, ecam, mmio}
>>> can't
>>>    be disable on conflicts to the PA space.
>>>
>>> - Keep the capability, with this optimization applied to make the
>>> implementation
>>>    comprehensive.
>>>
>>> I think it's worthy to add something about this limitation in
>>> doc/system/arm/virt.rst.
>> indeed that's worth in any case.
>>>
>>> I don't think I need introduce another option to avoid migration
>>> breakage.
>>> Could you explain why I need the extra option?
>> I think you do. Before and after this patch the QEMU memory regions
>> associated to those devices won't be a the same location in the memory
>> so if you migrate from an old version to a newer one, the guest won't be
>> able to access them
>>
>> OK I have given my own opinion about those potential changes. Let's wait
>> for others' ;-)
>>
>
> Thank you for your comments. Yeah, I would hold to post v2 to collect
> more comments :)
>
> I'm still don't understand how it's affecting migration. If I understand
> correct, the changed address based doesn't affect migration, as explained
> like below. It took me while to looking the source code to figure out
> how VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} is linked to GIC and PCIe host and
> migration.
>
> For VIRTH_HIGH_GIC_REDIST2 region, it's used by TYPE_ARM_GICV3 or
> TYPE_KVM_ARM_GICV3.
> TYPE_KVM_ARM_GICV3. For both cases, we do NOT migrate the region
> directly. Instead,
> the registers are saved to GICv3CPUState. GICv3CPUState is migrate and
> the registers
> are restored from the instance.
>
> For VIRT_HIGH_PCIE_ECAM, the registers are saved to PCIDevice::config.
> The
> buffer is migrated and PCI's config space is restored from it. In
> hw/net/e1000e.c,
> e1000e_vmstate has something like below embedded:
>
>     VMSTATE_PCI_DEVICE(parent_obj, E1000EState),

Actually I was more thinking about PCI MMIO region. Effectively for
regions that are saved/restored from regs it sounds OK (RDIST).
For ECAM I do not know how migration is handled but better to double
check with PCI/migration experts.

Thanks

Eric
>
> ---
>
> I also did one experiment by having different address bases on the
> source and
> destination. Migration succeeds.
>
> address regions from source
> ---------------------------
> virt_memmap_fits: HIGH_GIC_REDIST2: [0000004000000000  0000000004000000]
> virt_memmap_fits: HIGH_GIC_ECAM:    [0000004010000000  0000000010000000]
> virt_memmap_fits: HIGH_GIC_MMIO:    [0000008000000000  0000008000000000]
>
> address regions from destination
> --------------------------------
> virt_memmap_fits: HIGH_GIC_REDIST2: [0000004040000000  0000000004000000]
> virt_memmap_fits: HIGH_GIC_ECAM:    [0000004050000000  0000000010000000]
> virt_memmap_fits: HIGH_GIC_MMIO:    [0000008000000000  0000008000000000]
>
>
>>>
>>>>>
>>>>>>>>>
>>>>>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>>>>>> ---
>>>>>>>>>      hw/arm/virt.c | 54
>>>>>>>>> +++++++++++++++++++++++++++++----------------------
>>>>>>>>>      1 file changed, 31 insertions(+), 23 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>>> index 9633f822f3..bc0cd218f9 100644
>>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>>>>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>>>>>>          return arm_cpu_mp_affinity(idx, clustersz);
>>>>>>>>>      }
>>>>>>>>>      +static void virt_memmap_fits(VirtMachineState *vms, int
>>>>>>>>> index,
>>>>>>>>> +                             bool *enabled, hwaddr *base, int
>>>>>>>>> pa_bits)
>>>>>>>>> +{
>>>>>>>>> +    hwaddr size = extended_memmap[index].size;
>>>>>>>>> +
>>>>>>>>> +    /* The region will be disabled if its size isn't given */
>>>>>>>>> +    if (!*enabled || !size) {
>>>>>>>> In which case do you have !size?
>>>>>>>
>>>>>>> Yeah, we don't have !size and the condition should be removed.
>>>>>>>
>>>>>>>>> +        *enabled = false;
>>>>>>>>> +        vms->memmap[index].base = 0;
>>>>>>>>> +        vms->memmap[index].size = 0;
>>>>>>>> It looks dangerous to me to reset the region's base and size like
>>>>>>>> that.
>>>>>>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>>>>>>
>>>>>>> I would guess you missed that the high memory regions won't be
>>>>>>> exported
>>>>>>> through device-tree if they have been disabled. We have a check
>>>>>>> there,
>>>>>>> which is "if (nb_redist_regions == 1)"
>>>>>>>
>>>>>>>>> +        return;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    /*
>>>>>>>>> +     * Check if the memory region fits in the PA space. The
>>>>>>>>> memory map
>>>>>>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>>>>>>> disabled.
>>>>>>>>> +     */
>>>>>>>>> +    *enabled = (ROUND_UP(*base, size) + size <=
>>>>>>>>> BIT_ULL(pa_bits));
>>>>>>>> using a 'fits' local variable would make the code more obvious I
>>>>>>>> think
>>>>>>>
>>>>>>> Lets confirm if you're suggesting something like below?
>>>>>>>
>>>>>>>            bool fits;
>>>>>>>
>>>>>>>            fits = (ROUND_UP(*base, size) + size <=
>>>>>>> BIT_ULL(pa_bits));
>>>>>>>
>>>>>>>            if (fits) {
>>>>>>>               /* update *base, memory mapping, highest_gpa */
>>>>>>>            } else {
>>>>>>>               *enabled = false;
>>>>>>>            }
>>>>>>>
>>>>>>> I guess we can simply do
>>>>>>>
>>>>>>>            if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>>>>>>               /* update *base, memory mapping, highest_gpa */
>>>>>>>            } else {
>>>>>>>               *enabled = false;
>>>>>>>            }
>>>>>>>
>>>>>>> Please let me know which one looks best to you.
>>>>>>
>>>>>> Why should the 'enabled' flag be updated by this function,
>>>>>> instead of
>>>>>> returning the value and keeping it as an assignment in the caller
>>>>>> function? It is purely stylistic though.
>>>>>>
>>>>>
>>>>> The idea to put the logic, address assignment for those 3 high memory
>>>>> regions or updating the flags (vms->high_redist2/ecam/mmio), to the
>>>>> newly introduced function, to make virt_set_memmap() a bit
>>>>> simplified.
>>>>> Eric tends to agree the changes with minor adjustments. So I'm going
>>>>> to keep it as of being, which doesn't mean the stylistic thought is
>>>>> a bad one :)
>>>>>
>>>>> Lastly, I need to rewrite the comit log completely because it seems
>>>>> causing confusions to Eric and you.
>>>>>
>>>
>
> Thanks,
> Gavin
>
Gavin Shan Aug. 5, 2022, 8:36 a.m. UTC | #12
Hi Eric,

On 8/4/22 5:19 PM, Eric Auger wrote:
> On 8/4/22 04:47, Gavin Shan wrote:
>> On 8/3/22 10:52 PM, Eric Auger wrote:
>>> On 8/3/22 15:02, Gavin Shan wrote:
>>>> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>>>>> On Wed, 03 Aug 2022 04:01:04 +0100,
>>>>> Gavin Shan <gshan@redhat.com> wrote:
>>>>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>>>>> The specific region doesn't fit in the PA space. However, the base
>>>>>>>> address and highest_gpa are still updated no matter if the region
>>>>>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>>>>>> space.
>>>>>>> If I am not wrong highmem_redists and highmem_mmio are not user
>>>>>>> selectable
>>>>>>>
>>>>>>> Only highmem ecam depends on machine type & ACPI setup. But I would
>>>>>>> say
>>>>>>> that in server use case it is always set. So is that optimization
>>>>>>> really
>>>>>>> needed?
>>>>>>
>>>>>> There are two other cases you missed.
>>>>>>
>>>>>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>>>>>      before that.
>>>>>
>>>>> I don't get this. The current behaviour is to disable highmem_ecam if
>>>>> it doesn't fit in the PA space. I can't see anything that enables it
>>>>> if it was disabled the first place.
>>>>>
>>>>
>>>> There are several places or conditions where vms->highmem_ecam can be
>>>> disabled:
>>>>
>>>> - virt_instance_init() where vms->highmem_ecam is inherited from
>>>>     !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>>>>     in virt_machine_2_12_options().
>>>>
>>>> - machvirt_init() where vms->highmem_ecam can be disable if we have
>>>>     32-bits vCPUs and failure on loading firmware.
>>>>
>>>> - Another place is where we're talking about. It's address assignment
>>>>     to fit the PA space.
>>>>
>>>>>>
>>>>>> - The high memory region can be disabled if user is asking large
>>>>>>      (normal) memory space through 'maxmem=' option. When the
>>>>>> requested
>>>>>>      memory by 'maxmem=' is large enough, the high memory regions are
>>>>>>      disabled. It means the normal memory has higher priority than
>>>>>> those
>>>>>>      high memory regions. This is the case I provided in (b) of the
>>>>>>      commit log.
>>>>>
>>>>> Why is that a problem? It matches the expected behaviour, as the
>>>>> highmem IO region is floating and is pushed up by the memory region.
>>>>>
>>>>
>>>> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
>>>> aren't user selectable. I tended to explain why it's not true.
>>>> 'maxmem='
>>>> can affect the outcome. When 'maxmem=' value is big enough, there
>>>> will be
>>>> no free area in the PA space to hold those two regions.
>>>>
>>>>>>
>>>>>> In the commit log, I was supposed to say something like below for
>>>>>> (a):
>>>>>>
>>>>>> - The specific high memory region can be disabled through changing
>>>>>>      the code by user or developer. For example, 'vms->highmem_mmio'
>>>>>>      is changed from true to false in virt_instance_init().
>>>>>
>>>>> Huh. By this principle, the user can change anything. Why is it
>>>>> important?
>>>>>
>>>>
>>>> Still like above. I was explaining the possible cases where those
>>>> 3 switches can be turned on/off by users or developers. Our code
>>>> needs to be consistent and comprehensive.
>>>>
>>>>     vms->highmem_redists
>>>>     vms->highmem_ecam
>>>>     vms->mmio
>>>>
>>>>>>
>>>>>>>>
>>>>>>>> Improve address assignment for highmem IO regions to avoid the
>>>>>>>> waste
>>>>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>>>
>>>>> I guess that this is what I understand the least. What do you mean by
>>>>> "wasted PA space"? Either the regions fit in the PA space, and
>>>>> computing their addresses in relevant, or they fall outside of it and
>>>>> what we stick in memap[index].base is completely irrelevant.
>>>>>
>>>>
>>>> It's possible that we run into the following combination. we should
>>>> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
>>>> the region is disabled in the original implementation because
>>>> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
>>>> unecessary and waste in the PA space.
>>> each region's base is aligned on its size.
>>
>> Yes.
>>
>>>>
>>>>       static MemMapEntry extended_memmap[] = {
>>>>           [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>>>>           [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>>>>           [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
>>> so anyway MMIO is at least at 512GB. Having a 1TB IPA space does not
>>> imply any amount of RAM. This depends on the address space.
>>> I    };
>>
>> Yes. Prior to the start of system memory, there is 1GB used by
>> various regions either.
> yes
>>
>>>>
>>>>       IPA_LIMIT           = (1UL << 40)
>>>>       '-maxmem'           = 511GB              /* Memory starts from
>>>> 1GB */
>>>>       '-slots'            = 0
>>>>       vms->highmem_rdist2 = false
>>> How can this happen? the only reason for highmem_redists to be reset is
>>> if it does not fit into map_ipa. So if mmio fits, highmem_redists does
>>> too. What do I miss?
>>
>> The example is having "vms->highmem_rdist2 = flase" BEFORE the address
>> assignment, it's possible that developer changes the code to disable
>> it intentionally. The point is the original implementation isn't
>> comprehensive
>> because it has the wrong assumption that vms->highmem_{rdist2, ecam,
>> mmio} all
>> true before the address assignment. With the wrong assumption, the
>> base address
>> is always increased, even the previous region is disabled, during the
>> address assignment in virt_set_memmap().
> Yes we currently always provision space for those functionalities,
> independently on whether they are used. But that's true for many other
> regions in the address map (although smaller) ;-)

Yep :)

>>
>>
>>>>       vms->highmem_ecam   = false
>>>       vms->highmem_mmio   = true
>>> I am still skeptic about the relevance of such optim. Isn't it sensible
>>> to expect the host to support large IPA if you want to use such amount
>>> of memory.
>>> I think we should rather better document the memory map from a user
>>> point of view.
>>> Besides if you change the memory map, you need to introduce yet another
>>> option to avoid breaking migration, no?
>>>
>>
>> These 3 high memory regions consumes 513GB with alignment considered when
>> all of them are enabled. The point is those 3 high memory regions, or
>> part
>> of them can be squeezed or smashed to accommodate '-maxmem' by design. I
>> think there are two options I can think of. I personally prefer to keep
>> the capability. With this, users gain broader range for their '-maxmem'.
>> Please let me know your preference.
>>
>> - Revert the capability of squeezing or smashing those high memory
>> regions
>>    to accommodate '-maxmem'. It means vms->high_{redist2, ecam, mmio}
>> can't
>>    be disable on conflicts to the PA space.
>>
>> - Keep the capability, with this optimization applied to make the
>> implementation
>>    comprehensive.
>>
>> I think it's worthy to add something about this limitation in
>> doc/system/arm/virt.rst.
> indeed that's worth in any case.
>>
>> I don't think I need introduce another option to avoid migration
>> breakage.
>> Could you explain why I need the extra option?
> I think you do. Before and after this patch the QEMU memory regions
> associated to those devices won't be a the same location in the memory
> so if you migrate from an old version to a newer one, the guest won't be
> able to access them
> 
> OK I have given my own opinion about those potential changes. Let's wait
> for others' ;-)
> 

Thank you for your comments. Yeah, I would hold to post v2 to collect
more comments :)

I'm still don't understand how it's affecting migration. If I understand
correct, the changed address based doesn't affect migration, as explained
like below. It took me while to looking the source code to figure out
how VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} is linked to GIC and PCIe host and
migration.

For VIRTH_HIGH_GIC_REDIST2 region, it's used by TYPE_ARM_GICV3 or TYPE_KVM_ARM_GICV3.
TYPE_KVM_ARM_GICV3. For both cases, we do NOT migrate the region directly. Instead,
the registers are saved to GICv3CPUState. GICv3CPUState is migrate and the registers
are restored from the instance.

For VIRT_HIGH_PCIE_ECAM, the registers are saved to PCIDevice::config. The
buffer is migrated and PCI's config space is restored from it. In hw/net/e1000e.c,
e1000e_vmstate has something like below embedded:

     VMSTATE_PCI_DEVICE(parent_obj, E1000EState),

---

I also did one experiment by having different address bases on the source and
destination. Migration succeeds.

address regions from source
---------------------------
virt_memmap_fits: HIGH_GIC_REDIST2: [0000004000000000  0000000004000000]
virt_memmap_fits: HIGH_GIC_ECAM:    [0000004010000000  0000000010000000]
virt_memmap_fits: HIGH_GIC_MMIO:    [0000008000000000  0000008000000000]

address regions from destination
--------------------------------
virt_memmap_fits: HIGH_GIC_REDIST2: [0000004040000000  0000000004000000]
virt_memmap_fits: HIGH_GIC_ECAM:    [0000004050000000  0000000010000000]
virt_memmap_fits: HIGH_GIC_MMIO:    [0000008000000000  0000008000000000]


>>
>>>>
>>>>>>>>
>>>>>>>> Signed-off-by: Gavin Shan <gshan@redhat.com>
>>>>>>>> ---
>>>>>>>>      hw/arm/virt.c | 54
>>>>>>>> +++++++++++++++++++++++++++++----------------------
>>>>>>>>      1 file changed, 31 insertions(+), 23 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
>>>>>>>> index 9633f822f3..bc0cd218f9 100644
>>>>>>>> --- a/hw/arm/virt.c
>>>>>>>> +++ b/hw/arm/virt.c
>>>>>>>> @@ -1688,6 +1688,34 @@ static uint64_t
>>>>>>>> virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
>>>>>>>>          return arm_cpu_mp_affinity(idx, clustersz);
>>>>>>>>      }
>>>>>>>>      +static void virt_memmap_fits(VirtMachineState *vms, int index,
>>>>>>>> +                             bool *enabled, hwaddr *base, int
>>>>>>>> pa_bits)
>>>>>>>> +{
>>>>>>>> +    hwaddr size = extended_memmap[index].size;
>>>>>>>> +
>>>>>>>> +    /* The region will be disabled if its size isn't given */
>>>>>>>> +    if (!*enabled || !size) {
>>>>>>> In which case do you have !size?
>>>>>>
>>>>>> Yeah, we don't have !size and the condition should be removed.
>>>>>>
>>>>>>>> +        *enabled = false;
>>>>>>>> +        vms->memmap[index].base = 0;
>>>>>>>> +        vms->memmap[index].size = 0;
>>>>>>> It looks dangerous to me to reset the region's base and size like
>>>>>>> that.
>>>>>>> for instance fdt_add_gic_node() will add dummy data in the dt.
>>>>>>
>>>>>> I would guess you missed that the high memory regions won't be
>>>>>> exported
>>>>>> through device-tree if they have been disabled. We have a check
>>>>>> there,
>>>>>> which is "if (nb_redist_regions == 1)"
>>>>>>
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    /*
>>>>>>>> +     * Check if the memory region fits in the PA space. The
>>>>>>>> memory map
>>>>>>>> +     * and highest_gpa are updated if it fits. Otherwise, it's
>>>>>>>> disabled.
>>>>>>>> +     */
>>>>>>>> +    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>>>> using a 'fits' local variable would make the code more obvious I
>>>>>>> think
>>>>>>
>>>>>> Lets confirm if you're suggesting something like below?
>>>>>>
>>>>>>            bool fits;
>>>>>>
>>>>>>            fits = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
>>>>>>
>>>>>>            if (fits) {
>>>>>>               /* update *base, memory mapping, highest_gpa */
>>>>>>            } else {
>>>>>>               *enabled = false;
>>>>>>            }
>>>>>>
>>>>>> I guess we can simply do
>>>>>>
>>>>>>            if (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits)) {
>>>>>>               /* update *base, memory mapping, highest_gpa */
>>>>>>            } else {
>>>>>>               *enabled = false;
>>>>>>            }
>>>>>>
>>>>>> Please let me know which one looks best to you.
>>>>>
>>>>> Why should the 'enabled' flag be updated by this function, instead of
>>>>> returning the value and keeping it as an assignment in the caller
>>>>> function? It is purely stylistic though.
>>>>>
>>>>
>>>> The idea to put the logic, address assignment for those 3 high memory
>>>> regions or updating the flags (vms->high_redist2/ecam/mmio), to the
>>>> newly introduced function, to make virt_set_memmap() a bit simplified.
>>>> Eric tends to agree the changes with minor adjustments. So I'm going
>>>> to keep it as of being, which doesn't mean the stylistic thought is
>>>> a bad one :)
>>>>
>>>> Lastly, I need to rewrite the comit log completely because it seems
>>>> causing confusions to Eric and you.
>>>>
>>

Thanks,
Gavin
Marc Zyngier Aug. 8, 2022, 9:17 a.m. UTC | #13
On Wed, 03 Aug 2022 14:02:04 +0100,
Gavin Shan <gshan@redhat.com> wrote:
> 
> Hi Marc,
> 
> On 8/3/22 5:01 PM, Marc Zyngier wrote:
> > On Wed, 03 Aug 2022 04:01:04 +0100,
> > Gavin Shan <gshan@redhat.com> wrote:
> >> On 8/2/22 7:41 PM, Eric Auger wrote:
> >>> On 8/2/22 08:45, Gavin Shan wrote:
> >>>> There are 3 highmem IO regions as below. They can be disabled in
> >>>> two situations: (a) The specific region is disabled by user. (b)
> >>>> The specific region doesn't fit in the PA space. However, the base
> >>>> address and highest_gpa are still updated no matter if the region
> >>>> is enabled or disabled. It's incorrectly incurring waste in the PA
> >>>> space.
> >>> If I am not wrong highmem_redists and highmem_mmio are not user selectable
> >>> 
> >>> Only highmem ecam depends on machine type & ACPI setup. But I would say
> >>> that in server use case it is always set. So is that optimization really
> >>> needed?
> >> 
> >> There are two other cases you missed.
> >> 
> >> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
> >>    before that.
> > 
> > I don't get this. The current behaviour is to disable highmem_ecam if
> > it doesn't fit in the PA space. I can't see anything that enables it
> > if it was disabled the first place.
> > 
> 
> There are several places or conditions where vms->highmem_ecam can be
> disabled:
> 
> - virt_instance_init() where vms->highmem_ecam is inherited from
>   !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>   in virt_machine_2_12_options().
> 
> - machvirt_init() where vms->highmem_ecam can be disable if we have
>   32-bits vCPUs and failure on loading firmware.

Right. But at no point do we *enable* something that was disabled
beforehand, which is how I understood your previous comment.

> 
> - Another place is where we're talking about. It's address assignment
>   to fit the PA space.

Alignment? No, the alignment is cast into stone: it is set to the
smallest power-of-two containing the region (natural alignment).

> 
> >> 
> >> - The high memory region can be disabled if user is asking large
> >>    (normal) memory space through 'maxmem=' option. When the requested
> >>    memory by 'maxmem=' is large enough, the high memory regions are
> >>    disabled. It means the normal memory has higher priority than those
> >>    high memory regions. This is the case I provided in (b) of the
> >>    commit log.
> > 
> > Why is that a problem? It matches the expected behaviour, as the
> > highmem IO region is floating and is pushed up by the memory region.
> > 
> 
> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
> aren't user selectable. I tended to explain why it's not true. 'maxmem='
> can affect the outcome. When 'maxmem=' value is big enough, there will be
> no free area in the PA space to hold those two regions.

Right, that's an interesting point. This is a consequence of these
upper regions floating above RAM.

> 
> >> 
> >> In the commit log, I was supposed to say something like below for
> >> (a):
> >> 
> >> - The specific high memory region can be disabled through changing
> >>    the code by user or developer. For example, 'vms->highmem_mmio'
> >>    is changed from true to false in virt_instance_init().
> > 
> > Huh. By this principle, the user can change anything. Why is it
> > important?
> > 
> 
> Still like above. I was explaining the possible cases where those
> 3 switches can be turned on/off by users or developers. Our code
> needs to be consistent and comprehensive.
> 
>   vms->highmem_redists
>   vms->highmem_ecam
>   vms->mmio
> 
> >> 
> >>>> 
> >>>> Improve address assignment for highmem IO regions to avoid the waste
> >>>> in the PA space by putting the logic into virt_memmap_fits().
> > 
> > I guess that this is what I understand the least. What do you mean by
> > "wasted PA space"? Either the regions fit in the PA space, and
> > computing their addresses in relevant, or they fall outside of it and
> > what we stick in memap[index].base is completely irrelevant.
> > 
> 
> It's possible that we run into the following combination. we should
> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
> the region is disabled in the original implementation because
> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
> unecessary and waste in the PA space.
> 
>     static MemMapEntry extended_memmap[] = {
>         [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>         [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>         [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
>     };
> 
>     IPA_LIMIT           = (1UL << 40)
>     '-maxmem'           = 511GB              /* Memory starts from 1GB */
>     '-slots'            = 0
>     vms->highmem_rdist2 = false
>     vms->highmem_ecam   = false
>     vms->highmem_mmio   = true
>

Sure. But that's not how QEMU works today, and these regions are
enabled in order (though it could well be that my recent changes have
made the situation more complicated).

What you're suggesting is a pretty radical change in the way the
memory map is set. My hunch is that allowing the highmem IO regions to
be selectively enabled and allowed to float in the high IO space
should come as a new virt machine revision, with user-visible options.

Thanks,

	M.
Gavin Shan Aug. 11, 2022, 5:32 a.m. UTC | #14
Hi Marc,

On 8/8/22 7:17 PM, Marc Zyngier wrote:
> On Wed, 03 Aug 2022 14:02:04 +0100,
> Gavin Shan <gshan@redhat.com> wrote:
>> On 8/3/22 5:01 PM, Marc Zyngier wrote:
>>> On Wed, 03 Aug 2022 04:01:04 +0100,
>>> Gavin Shan <gshan@redhat.com> wrote:
>>>> On 8/2/22 7:41 PM, Eric Auger wrote:
>>>>> On 8/2/22 08:45, Gavin Shan wrote:
>>>>>> There are 3 highmem IO regions as below. They can be disabled in
>>>>>> two situations: (a) The specific region is disabled by user. (b)
>>>>>> The specific region doesn't fit in the PA space. However, the base
>>>>>> address and highest_gpa are still updated no matter if the region
>>>>>> is enabled or disabled. It's incorrectly incurring waste in the PA
>>>>>> space.
>>>>> If I am not wrong highmem_redists and highmem_mmio are not user selectable
>>>>>
>>>>> Only highmem ecam depends on machine type & ACPI setup. But I would say
>>>>> that in server use case it is always set. So is that optimization really
>>>>> needed?
>>>>
>>>> There are two other cases you missed.
>>>>
>>>> - highmem_ecam is enabled after virt-2.12, meaning it stays disabled
>>>>     before that.
>>>
>>> I don't get this. The current behaviour is to disable highmem_ecam if
>>> it doesn't fit in the PA space. I can't see anything that enables it
>>> if it was disabled the first place.
>>>
>>
>> There are several places or conditions where vms->highmem_ecam can be
>> disabled:
>>
>> - virt_instance_init() where vms->highmem_ecam is inherited from
>>    !vmc->no_highmem_ecam. The option is set to true after virt-2.12
>>    in virt_machine_2_12_options().
>>
>> - machvirt_init() where vms->highmem_ecam can be disable if we have
>>    32-bits vCPUs and failure on loading firmware.
> 
> Right. But at no point do we *enable* something that was disabled
> beforehand, which is how I understood your previous comment.
> 

Sorry for the delay. I think the original changelog is confusing
enough and sorry about it. I will improve it if v2 is needed :)

Yes, we shouldn't assign address to VIRT_HIGH_PCIE_ECAM region
and enable it when vms->highmem_ecam is false in virt_set_memmap().

In the original implementation of virt_set_memmap(), the memory
regioin is disabled when when vms->highmem_ecam is false. However,
the address is still assigned to the memory region, even when
vms->highmem_ecam is false. This leads to waste in the PA space.

In hw/arm/virt.c::virt_set_memmap(), @base is always added with
the memory region size, even the memory region has been disabled.

     for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
         hwaddr size = extended_memmap[i].size;
         bool fits;

         base = ROUND_UP(base, size);               /* The roundup isn't necessary for disabled region */
         vms->memmap[i].base = base;
         vms->memmap[i].size = size;

          :
          :

         base += size;                              /* The increment isn't necessary for disabled region */
     }

>>
>> - Another place is where we're talking about. It's address assignment
>>    to fit the PA space.
> 
> Alignment? No, the alignment is cast into stone: it is set to the
> smallest power-of-two containing the region (natural alignment).
> 

Nope, I was talking about address assignment, instead of address
alignment. Lets have an example here to explain. For example,
we have following capability and user's command lines. In this
specific example, the memory layout is something like below:

     PA space limit:            40 bits (1TB)
     user's command line:       -m 1GB,maxmem=1019G,slots=4

     VIRT_MEM region start:     1GB
     VIRT_MEM region end:       2GB
     device_memory_base:        2GB                               // in virt_set_memmap()
     device_memory_size:     1022GB    (end at 1024GB)            // in virt_set_memmap()
     
All the high memory regions won't be enabled because we don't
have more free areas in the PA space. In virt_set_memmap(),
@base is still increased by the region's size, as said above.


>>
>>>>
>>>> - The high memory region can be disabled if user is asking large
>>>>     (normal) memory space through 'maxmem=' option. When the requested
>>>>     memory by 'maxmem=' is large enough, the high memory regions are
>>>>     disabled. It means the normal memory has higher priority than those
>>>>     high memory regions. This is the case I provided in (b) of the
>>>>     commit log.
>>>
>>> Why is that a problem? It matches the expected behaviour, as the
>>> highmem IO region is floating and is pushed up by the memory region.
>>>
>>
>> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
>> aren't user selectable. I tended to explain why it's not true. 'maxmem='
>> can affect the outcome. When 'maxmem=' value is big enough, there will be
>> no free area in the PA space to hold those two regions.
> 
> Right, that's an interesting point. This is a consequence of these
> upper regions floating above RAM.
> 

Yep, it's fine for those high memory region floating above RAM, and to
disable them if we run out of PA space. Something may be irrelevant
to this topic: VIRT_HIGH_PCIE_MMIO region has 512GB, which is huge one.
It may be nice to fall back smaller sizes when we're having tight PA
space. For example, we can fall back to try 256GB if 512GB doesn't fit
into the PA space.

However, I'm not sure how we had the size (512GB) for the region. Are
there practical factors why we need a 512GB PCIe 64-bits MMIO space?

>>
>>>>
>>>> In the commit log, I was supposed to say something like below for
>>>> (a):
>>>>
>>>> - The specific high memory region can be disabled through changing
>>>>     the code by user or developer. For example, 'vms->highmem_mmio'
>>>>     is changed from true to false in virt_instance_init().
>>>
>>> Huh. By this principle, the user can change anything. Why is it
>>> important?
>>>
>>
>> Still like above. I was explaining the possible cases where those
>> 3 switches can be turned on/off by users or developers. Our code
>> needs to be consistent and comprehensive.
>>
>>    vms->highmem_redists
>>    vms->highmem_ecam
>>    vms->mmio
>>
>>>>
>>>>>>
>>>>>> Improve address assignment for highmem IO regions to avoid the waste
>>>>>> in the PA space by putting the logic into virt_memmap_fits().
>>>
>>> I guess that this is what I understand the least. What do you mean by
>>> "wasted PA space"? Either the regions fit in the PA space, and
>>> computing their addresses in relevant, or they fall outside of it and
>>> what we stick in memap[index].base is completely irrelevant.
>>>
>>
>> It's possible that we run into the following combination. we should
>> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
>> the region is disabled in the original implementation because
>> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
>> unecessary and waste in the PA space.
>>
>>      static MemMapEntry extended_memmap[] = {
>>          [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
>>          [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
>>          [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
>>      };
>>
>>      IPA_LIMIT           = (1UL << 40)
>>      '-maxmem'           = 511GB              /* Memory starts from 1GB */
>>      '-slots'            = 0
>>      vms->highmem_rdist2 = false
>>      vms->highmem_ecam   = false
>>      vms->highmem_mmio   = true
>>
> 
> Sure. But that's not how QEMU works today, and these regions are
> enabled in order (though it could well be that my recent changes have
> made the situation more complicated).
> 
> What you're suggesting is a pretty radical change in the way the
> memory map is set. My hunch is that allowing the highmem IO regions to
> be selectively enabled and allowed to float in the high IO space
> should come as a new virt machine revision, with user-visible options.
> 

Yeah, These regions are enabled in order. It also means they have ascending
priorities. In other words, '-maxmem' has higher priority than those 3 high
memory regions.

My suggested code changes just improve the address assignment for these 3
high memory regions, without changing the mechanism fundamentally. The
intention of the proposed changes are like below.

- In virt_set_memmap(), don't assign address for one specific high memory
   region if it has been disabled.

- Put the logic into standalone helper, to simplify the code.

I'm not sure about the user-visible options. I would say it's going to
increase user's load. I means the user experience will be degraded.
Those user-visible options needs to be worried by users :)

Marc, lets improve the changelog and the code changes in v2, to seek
your comments if you agree? :)

Thanks,
Gavin
Marc Zyngier Aug. 11, 2022, 7:37 a.m. UTC | #15
Hi Gavin,

On Thu, 11 Aug 2022 06:32:36 +0100,
Gavin Shan <gshan@redhat.com> wrote:
> 
> Hi Marc,
> 
> On 8/8/22 7:17 PM, Marc Zyngier wrote:
> > On Wed, 03 Aug 2022 14:02:04 +0100,
> > Gavin Shan <gshan@redhat.com> wrote:

[...]

> Sorry for the delay. I think the original changelog is confusing
> enough and sorry about it. I will improve it if v2 is needed :)
> 
> Yes, we shouldn't assign address to VIRT_HIGH_PCIE_ECAM region
> and enable it when vms->highmem_ecam is false in virt_set_memmap().
> 
> In the original implementation of virt_set_memmap(), the memory
> regioin is disabled when when vms->highmem_ecam is false. However,
> the address is still assigned to the memory region, even when
> vms->highmem_ecam is false. This leads to waste in the PA space.

Yes, I got this point now. However, I think the approach you've chosen
leads to subtle issues, see below.

[...]

> >> Eric thought that VIRT_HIGH_GIC_REDIST2 and VIRT_HIGH_PCIE_MMIO regions
> >> aren't user selectable. I tended to explain why it's not true. 'maxmem='
> >> can affect the outcome. When 'maxmem=' value is big enough, there will be
> >> no free area in the PA space to hold those two regions.
> > 
> > Right, that's an interesting point. This is a consequence of these
> > upper regions floating above RAM.
> > 
> 
> Yep, it's fine for those high memory region floating above RAM, and to
> disable them if we run out of PA space. Something may be irrelevant
> to this topic: VIRT_HIGH_PCIE_MMIO region has 512GB, which is huge one.
> It may be nice to fall back smaller sizes when we're having tight PA
> space. For example, we can fall back to try 256GB if 512GB doesn't fit
> into the PA space.
> 
> However, I'm not sure how we had the size (512GB) for the region. Are
> there practical factors why we need a 512GB PCIe 64-bits MMIO space?

I have no idea. But this is something that is now relied upon by
existing VMs, and I'm not sure we can break this.

> >>>>>> Improve address assignment for highmem IO regions to avoid the waste
> >>>>>> in the PA space by putting the logic into virt_memmap_fits().
> >>> 
> >>> I guess that this is what I understand the least. What do you mean by
> >>> "wasted PA space"? Either the regions fit in the PA space, and
> >>> computing their addresses in relevant, or they fall outside of it and
> >>> what we stick in memap[index].base is completely irrelevant.
> >>> 
> >> 
> >> It's possible that we run into the following combination. we should
> >> have enough PA space to enable VIRT_HIGH_PCIE_MMIO region. However,
> >> the region is disabled in the original implementation because
> >> VIRT_HIGH_{GIC_REDIST2, PCIE_ECAM} regions consumed 1GB, which is
> >> unecessary and waste in the PA space.
> >> 
> >>      static MemMapEntry extended_memmap[] = {
> >>          [VIRT_HIGH_GIC_REDIST2] =   { 0x0, 64 * MiB },
> >>          [VIRT_HIGH_PCIE_ECAM] =     { 0x0, 256 * MiB },
> >>          [VIRT_HIGH_PCIE_MMIO] =     { 0x0, 512 * GiB },
> >>      };
> >> 
> >>      IPA_LIMIT           = (1UL << 40)
> >>      '-maxmem'           = 511GB              /* Memory starts from 1GB */
> >>      '-slots'            = 0
> >>      vms->highmem_rdist2 = false
> >>      vms->highmem_ecam   = false
> >>      vms->highmem_mmio   = true
> >> 
> > 
> > Sure. But that's not how QEMU works today, and these regions are
> > enabled in order (though it could well be that my recent changes have
> > made the situation more complicated).
> > 
> > What you're suggesting is a pretty radical change in the way the
> > memory map is set. My hunch is that allowing the highmem IO regions to
> > be selectively enabled and allowed to float in the high IO space
> > should come as a new virt machine revision, with user-visible options.
> > 
> 
> Yeah, These regions are enabled in order. It also means they have ascending
> priorities. In other words, '-maxmem' has higher priority than those 3 high
> memory regions.
> 
> My suggested code changes just improve the address assignment for these 3
> high memory regions, without changing the mechanism fundamentally. The
> intention of the proposed changes are like below.
> 
> - In virt_set_memmap(), don't assign address for one specific high memory
>   region if it has been disabled.
> 
> - Put the logic into standalone helper, to simplify the code.
> 
> I'm not sure about the user-visible options. I would say it's going to
> increase user's load. I means the user experience will be degraded.
> Those user-visible options needs to be worried by users :)

Not necessarily. You could have a default that picks whatever fits.
But this is hard to get right, and I'm not convinced you will always
be able to satisfy everyone.

I'm also worried of seeing different behaviours if I dump a VM from
QEMU 7.x and reload it with 7.y, only to realise that I don't have the
same memory layout. In my opinion, these things need to be either
constant, or user-specified.

> Marc, lets improve the changelog and the code changes in v2, to seek
> your comments if you agree? :)

Of course!

	M.
diff mbox series

Patch

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 9633f822f3..bc0cd218f9 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1688,6 +1688,34 @@  static uint64_t virt_cpu_mp_affinity(VirtMachineState *vms, int idx)
     return arm_cpu_mp_affinity(idx, clustersz);
 }
 
+static void virt_memmap_fits(VirtMachineState *vms, int index,
+                             bool *enabled, hwaddr *base, int pa_bits)
+{
+    hwaddr size = extended_memmap[index].size;
+
+    /* The region will be disabled if its size isn't given */
+    if (!*enabled || !size) {
+        *enabled = false;
+        vms->memmap[index].base = 0;
+        vms->memmap[index].size = 0;
+        return;
+    }
+
+    /*
+     * Check if the memory region fits in the PA space. The memory map
+     * and highest_gpa are updated if it fits. Otherwise, it's disabled.
+     */
+    *enabled = (ROUND_UP(*base, size) + size <= BIT_ULL(pa_bits));
+    if (*enabled) {
+        *base = ROUND_UP(*base, size);
+        vms->memmap[index].base = *base;
+        vms->memmap[index].size = size;
+        vms->highest_gpa = *base + size - 1;
+
+	*base = *base + size;
+    }
+}
+
 static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
 {
     MachineState *ms = MACHINE(vms);
@@ -1744,37 +1772,17 @@  static void virt_set_memmap(VirtMachineState *vms, int pa_bits)
     vms->highest_gpa = memtop - 1;
 
     for (i = VIRT_LOWMEMMAP_LAST; i < ARRAY_SIZE(extended_memmap); i++) {
-        hwaddr size = extended_memmap[i].size;
-        bool fits;
-
-        base = ROUND_UP(base, size);
-        vms->memmap[i].base = base;
-        vms->memmap[i].size = size;
-
-        /*
-         * Check each device to see if they fit in the PA space,
-         * moving highest_gpa as we go.
-         *
-         * For each device that doesn't fit, disable it.
-         */
-        fits = (base + size) <= BIT_ULL(pa_bits);
-        if (fits) {
-            vms->highest_gpa = base + size - 1;
-        }
-
         switch (i) {
         case VIRT_HIGH_GIC_REDIST2:
-            vms->highmem_redists &= fits;
+            virt_memmap_fits(vms, i, &vms->highmem_redists, &base, pa_bits);
             break;
         case VIRT_HIGH_PCIE_ECAM:
-            vms->highmem_ecam &= fits;
+            virt_memmap_fits(vms, i, &vms->highmem_ecam, &base, pa_bits);
             break;
         case VIRT_HIGH_PCIE_MMIO:
-            vms->highmem_mmio &= fits;
+            virt_memmap_fits(vms, i, &vms->highmem_mmio, &base, pa_bits);
             break;
         }
-
-        base += size;
     }
 
     if (device_memory_size > 0) {