diff mbox series

[v2,2/2] kvm: add support for guest physical bits

Message ID 20240305105233.617131-3-kraxel@redhat.com (mailing list archive)
State New, archived
Headers show
Series kvm: add support for guest physical bits | expand

Commit Message

Gerd Hoffmann March 5, 2024, 10:52 a.m. UTC
Query kvm for supported guest physical address bits, in cpuid
function 80000008, eax[23:16].  Usually this is identical to host
physical address bits.  With NPT or EPT being used this might be
restricted to 48 (max 4-level paging address space size) even if
the host cpu supports more physical address bits.

When set pass this to the guest, using cpuid too.  Guest firmware
can use this to figure how big the usable guest physical address
space is, so PCI bar mapping are actually reachable.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 target/i386/cpu.h     |  1 +
 target/i386/cpu.c     |  1 +
 target/i386/kvm/kvm.c | 17 +++++++++++++++++
 3 files changed, 19 insertions(+)

Comments

Xiaoyao Li March 8, 2024, 5:47 a.m. UTC | #1
On 3/5/2024 6:52 PM, Gerd Hoffmann wrote:
> Query kvm for supported guest physical address bits, in cpuid
> function 80000008, eax[23:16].  Usually this is identical to host
> physical address bits.  With NPT or EPT being used this might be
> restricted to 48 (max 4-level paging address space size) even if
> the host cpu supports more physical address bits.
> 
> When set pass this to the guest, using cpuid too.  Guest firmware
> can use this to figure how big the usable guest physical address
> space is, so PCI bar mapping are actually reachable.
> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>   target/i386/cpu.h     |  1 +
>   target/i386/cpu.c     |  1 +
>   target/i386/kvm/kvm.c | 17 +++++++++++++++++
>   3 files changed, 19 insertions(+)
> 
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 952174bb6f52..d427218827f6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -2026,6 +2026,7 @@ struct ArchCPU {
>   
>       /* Number of physical address bits supported */
>       uint32_t phys_bits;
> +    uint32_t guest_phys_bits;
>   
>       /* in order to simplify APIC support, we leave this pointer to the
>          user */
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 2666ef380891..1a6cfc75951e 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -6570,6 +6570,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
>           if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
>               /* 64 bit processor */
>                *eax |= (cpu_x86_virtual_addr_width(env) << 8);
> +             *eax |= (cpu->guest_phys_bits << 16);
>           }
>           *ebx = env->features[FEAT_8000_0008_EBX];
>           if (cs->nr_cores * cs->nr_threads > 1) {
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 7298822cb511..ce22dfcaa661 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -238,6 +238,15 @@ static int kvm_get_tsc(CPUState *cs)
>       return 0;
>   }
>   
> +/* return cpuid fn 8000_0008 eax[23:16] aka GuestPhysBits */
> +static int kvm_get_guest_phys_bits(KVMState *s)
> +{
> +    uint32_t eax;
> +
> +    eax = kvm_arch_get_supported_cpuid(s, 0x80000008, 0, R_EAX);
> +    return (eax >> 16) & 0xff;
> +}
> +
>   static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
>   {
>       kvm_get_tsc(cpu);
> @@ -1730,6 +1739,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>       X86CPU *cpu = X86_CPU(cs);
>       CPUX86State *env = &cpu->env;
>       uint32_t limit, i, j, cpuid_i;
> +    uint32_t guest_phys_bits;
>       uint32_t unused;
>       struct kvm_cpuid_entry2 *c;
>       uint32_t signature[3];
> @@ -1765,6 +1775,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
>   
>       env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
>   
> +    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
> +    if (guest_phys_bits &&
> +        (cpu->guest_phys_bits == 0 ||
> +         cpu->guest_phys_bits > guest_phys_bits)) {
> +        cpu->guest_phys_bits = guest_phys_bits;
> +    }

suggest to move this added code block to kvm_cpu_realizefn().

Main code in kvm_arch_init_vcpu() is to consume the data in cpu or 
env->features to configure KVM specific configuration via KVM ioctls.

The preparation work of initializing the required data usually not 
happen in kvm_arch_init_vcpu();

>       /*
>        * kvm_hyperv_expand_features() is called here for the second time in case
>        * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
Paolo Bonzini March 11, 2024, 10:55 a.m. UTC | #2
On Tue, Mar 5, 2024 at 11:52 AM Gerd Hoffmann <kraxel@redhat.com> wrote:
>
> Query kvm for supported guest physical address bits, in cpuid
> function 80000008, eax[23:16].  Usually this is identical to host
> physical address bits.  With NPT or EPT being used this might be
> restricted to 48 (max 4-level paging address space size) even if
> the host cpu supports more physical address bits.
>
> When set pass this to the guest, using cpuid too.  Guest firmware
> can use this to figure how big the usable guest physical address
> space is, so PCI bar mapping are actually reachable.
>
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  target/i386/cpu.h     |  1 +
>  target/i386/cpu.c     |  1 +
>  target/i386/kvm/kvm.c | 17 +++++++++++++++++
>  3 files changed, 19 insertions(+)
>
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 952174bb6f52..d427218827f6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> +    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
> +    if (guest_phys_bits &&
> +        (cpu->guest_phys_bits == 0 ||
> +         cpu->guest_phys_bits > guest_phys_bits)) {
> +        cpu->guest_phys_bits = guest_phys_bits;
> +    }

Like Xiaoyao mentioned, the right place for this is kvm_cpu_realizefn,
after host_cpu_realizefn returns. It should also be conditional on
cpu->host_phys_bits. It also makes sense to:

- make kvm_get_guest_phys_bits() return bits 7:0 if bits 23:16 are zero

- here, set cpu->guest_phys_bits only if it is not equal to
cpu->phys_bits (this undoes the previous suggestion, but I think it's
cleaner)

- add a property in x86_cpu_properties[] to allow configuration with TCG.

Paolo

> +
>      /*
>       * kvm_hyperv_expand_features() is called here for the second time in case
>       * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
> --
> 2.44.0
>
Gerd Hoffmann March 11, 2024, 11:59 a.m. UTC | #3
Hi,

> > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > index 952174bb6f52..d427218827f6 100644
> > --- a/target/i386/cpu.h
> > +++ b/target/i386/cpu.h
> > +    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
> > +    if (guest_phys_bits &&
> > +        (cpu->guest_phys_bits == 0 ||
> > +         cpu->guest_phys_bits > guest_phys_bits)) {
> > +        cpu->guest_phys_bits = guest_phys_bits;
> > +    }
> 
> Like Xiaoyao mentioned, the right place for this is kvm_cpu_realizefn,
> after host_cpu_realizefn returns. It should also be conditional on
> cpu->host_phys_bits.

Ok.

> It also makes sense to:
> 
> - make kvm_get_guest_phys_bits() return bits 7:0 if bits 23:16 are zero
> 
> - here, set cpu->guest_phys_bits only if it is not equal to
> cpu->phys_bits (this undoes the previous suggestion, but I think it's
> cleaner)

Not sure about that.

I think it would be good to have a backward compatibility story.
Currently neither the kernel nor qemu set guest_phys_bits.  So if the
firmware finds guest_phys_bits == 0 it does not know whenever ...

  (a) kernel or qemu being too old, or
  (b) no restrictions apply, it is safe to go with phys_bits.

One easy option would be to always let qemu pass through guest_phys_bits
from the kernel, even in case it is equal to phys_bits.

> - add a property in x86_cpu_properties[] to allow configuration with TCG.

Was thinking about configuration too.  Not sure it is a good idea to
add yet another phys-bits config option to the mix of options we already
have ...

In case host_phys_bits=true qemu could simply use
min(kernel guest-phys-bits,host-phys-bits-limit)

For the host_phys_bits=false case it would probably be best to just
not set guest_phys_bits.

take care,
  Gerd
Paolo Bonzini March 11, 2024, 1:28 p.m. UTC | #4
On Mon, Mar 11, 2024 at 12:59 PM Gerd Hoffmann <kraxel@redhat.com> wrote:
>
>   Hi,
>
> > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > > index 952174bb6f52..d427218827f6 100644
> > > --- a/target/i386/cpu.h
> > > +++ b/target/i386/cpu.h
> > > +    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
> > > +    if (guest_phys_bits &&
> > > +        (cpu->guest_phys_bits == 0 ||
> > > +         cpu->guest_phys_bits > guest_phys_bits)) {
> > > +        cpu->guest_phys_bits = guest_phys_bits;
> > > +    }
> >
> > Like Xiaoyao mentioned, the right place for this is kvm_cpu_realizefn,
> > after host_cpu_realizefn returns. It should also be conditional on
> > cpu->host_phys_bits.
>
> Ok.
>
> > It also makes sense to:
> >
> > - make kvm_get_guest_phys_bits() return bits 7:0 if bits 23:16 are zero
> >
> > - here, set cpu->guest_phys_bits only if it is not equal to
> > cpu->phys_bits (this undoes the previous suggestion, but I think it's
> > cleaner)
>
> Not sure about that.
>
> I think it would be good to have a backward compatibility story.
> Currently neither the kernel nor qemu set guest_phys_bits.  So if the
> firmware finds guest_phys_bits == 0 it does not know whenever ...
>
>   (a) kernel or qemu being too old, or
>   (b) no restrictions apply, it is safe to go with phys_bits.
>
> One easy option would be to always let qemu pass through guest_phys_bits
> from the kernel, even in case it is equal to phys_bits.

Ah, I see - you would like to be able to use all 52 bits (instead of
going for a safer 46 or 48) and therefore you need to have nonzero
guest_phys_bits even if it's equal to phys_bits. While on an old
kernel, you would pass forward 0.

> > - add a property in x86_cpu_properties[] to allow configuration with TCG.
>
> Was thinking about configuration too.  Not sure it is a good idea to
> add yet another phys-bits config option to the mix of options we already
> have ...

I think it's nice that you can use TCG to test various cases, which
requires a new property.

> In case host_phys_bits=true qemu could simply use
> min(kernel guest-phys-bits,host-phys-bits-limit)

Yes, that works.

Paolo

> For the host_phys_bits=false case it would probably be best to just
> not set guest_phys_bits.
Tao Su March 13, 2024, 6:39 a.m. UTC | #5
On Tue, Mar 05, 2024 at 11:52:33AM +0100, Gerd Hoffmann wrote:
> Query kvm for supported guest physical address bits, in cpuid
> function 80000008, eax[23:16].  Usually this is identical to host
> physical address bits.  With NPT or EPT being used this might be
> restricted to 48 (max 4-level paging address space size) even if
> the host cpu supports more physical address bits.
> 
> When set pass this to the guest, using cpuid too.  Guest firmware
> can use this to figure how big the usable guest physical address
> space is, so PCI bar mapping are actually reachable.

If this patch is applied, do you have plans to implement it in
OVMF/Seabios?

Thanks,
Tao

> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> ---
>  target/i386/cpu.h     |  1 +
>  target/i386/cpu.c     |  1 +
>  target/i386/kvm/kvm.c | 17 +++++++++++++++++
>  3 files changed, 19 insertions(+)
> 
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 952174bb6f52..d427218827f6 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -2026,6 +2026,7 @@ struct ArchCPU {
>  
>      /* Number of physical address bits supported */
>      uint32_t phys_bits;
> +    uint32_t guest_phys_bits;
>  
>      /* in order to simplify APIC support, we leave this pointer to the
>         user */
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 2666ef380891..1a6cfc75951e 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -6570,6 +6570,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
>          if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
>              /* 64 bit processor */
>               *eax |= (cpu_x86_virtual_addr_width(env) << 8);
> +             *eax |= (cpu->guest_phys_bits << 16);
>          }
>          *ebx = env->features[FEAT_8000_0008_EBX];
>          if (cs->nr_cores * cs->nr_threads > 1) {
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 7298822cb511..ce22dfcaa661 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -238,6 +238,15 @@ static int kvm_get_tsc(CPUState *cs)
>      return 0;
>  }
>  
> +/* return cpuid fn 8000_0008 eax[23:16] aka GuestPhysBits */
> +static int kvm_get_guest_phys_bits(KVMState *s)
> +{
> +    uint32_t eax;
> +
> +    eax = kvm_arch_get_supported_cpuid(s, 0x80000008, 0, R_EAX);
> +    return (eax >> 16) & 0xff;
> +}
> +
>  static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
>  {
>      kvm_get_tsc(cpu);
> @@ -1730,6 +1739,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>      X86CPU *cpu = X86_CPU(cs);
>      CPUX86State *env = &cpu->env;
>      uint32_t limit, i, j, cpuid_i;
> +    uint32_t guest_phys_bits;
>      uint32_t unused;
>      struct kvm_cpuid_entry2 *c;
>      uint32_t signature[3];
> @@ -1765,6 +1775,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  
>      env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
>  
> +    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
> +    if (guest_phys_bits &&
> +        (cpu->guest_phys_bits == 0 ||
> +         cpu->guest_phys_bits > guest_phys_bits)) {
> +        cpu->guest_phys_bits = guest_phys_bits;
> +    }
> +
>      /*
>       * kvm_hyperv_expand_features() is called here for the second time in case
>       * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
> -- 
> 2.44.0
> 
>
Gerd Hoffmann March 13, 2024, 8:15 a.m. UTC | #6
On Wed, Mar 13, 2024 at 02:39:32PM +0800, Tao Su wrote:
> On Tue, Mar 05, 2024 at 11:52:33AM +0100, Gerd Hoffmann wrote:
> > Query kvm for supported guest physical address bits, in cpuid
> > function 80000008, eax[23:16].  Usually this is identical to host
> > physical address bits.  With NPT or EPT being used this might be
> > restricted to 48 (max 4-level paging address space size) even if
> > the host cpu supports more physical address bits.
> > 
> > When set pass this to the guest, using cpuid too.  Guest firmware
> > can use this to figure how big the usable guest physical address
> > space is, so PCI bar mapping are actually reachable.
> 
> If this patch is applied, do you have plans to implement it in
> OVMF/Seabios?

Yes.  ovmf test patches:

https://github.com/kraxel/edk2/commits/devel/guest-phys-bits/

take care,
  Gerd
diff mbox series

Patch

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 952174bb6f52..d427218827f6 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2026,6 +2026,7 @@  struct ArchCPU {
 
     /* Number of physical address bits supported */
     uint32_t phys_bits;
+    uint32_t guest_phys_bits;
 
     /* in order to simplify APIC support, we leave this pointer to the
        user */
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 2666ef380891..1a6cfc75951e 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -6570,6 +6570,7 @@  void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
         if (env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_LM) {
             /* 64 bit processor */
              *eax |= (cpu_x86_virtual_addr_width(env) << 8);
+             *eax |= (cpu->guest_phys_bits << 16);
         }
         *ebx = env->features[FEAT_8000_0008_EBX];
         if (cs->nr_cores * cs->nr_threads > 1) {
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 7298822cb511..ce22dfcaa661 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -238,6 +238,15 @@  static int kvm_get_tsc(CPUState *cs)
     return 0;
 }
 
+/* return cpuid fn 8000_0008 eax[23:16] aka GuestPhysBits */
+static int kvm_get_guest_phys_bits(KVMState *s)
+{
+    uint32_t eax;
+
+    eax = kvm_arch_get_supported_cpuid(s, 0x80000008, 0, R_EAX);
+    return (eax >> 16) & 0xff;
+}
+
 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
 {
     kvm_get_tsc(cpu);
@@ -1730,6 +1739,7 @@  int kvm_arch_init_vcpu(CPUState *cs)
     X86CPU *cpu = X86_CPU(cs);
     CPUX86State *env = &cpu->env;
     uint32_t limit, i, j, cpuid_i;
+    uint32_t guest_phys_bits;
     uint32_t unused;
     struct kvm_cpuid_entry2 *c;
     uint32_t signature[3];
@@ -1765,6 +1775,13 @@  int kvm_arch_init_vcpu(CPUState *cs)
 
     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
 
+    guest_phys_bits = kvm_get_guest_phys_bits(cs->kvm_state);
+    if (guest_phys_bits &&
+        (cpu->guest_phys_bits == 0 ||
+         cpu->guest_phys_bits > guest_phys_bits)) {
+        cpu->guest_phys_bits = guest_phys_bits;
+    }
+
     /*
      * kvm_hyperv_expand_features() is called here for the second time in case
      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle