
[v6,5/5] x86/kvm: Avoid dynamic allocation of pvclock data when SEV is active

Message ID 1536343050-18532-6-git-send-email-brijesh.singh@amd.com (mailing list archive)
State New, archived
Series x86: Fix SEV guest regression

Commit Message

Brijesh Singh Sept. 7, 2018, 5:57 p.m. UTC
Currently, the per-cpu pvclock data is allocated dynamically when
cpu > HVC_BOOT_ARRAY_SIZE. The physical address of this variable is
shared between the guest and the hypervisor hence it must be mapped as
unencrypted (ie. C=0) when SEV is active.

The C-bit works on a page, hence we will be required to perform a
full 4k page allocation to store a single 32-byte pvclock variable. It
will waste fairly sizeable amount of memory since each CPU will be doing
a separate 4k allocation. Let's define a second array for the SEV case to
statically allocate for NR_CPUS and put this array in .data..decrypted
section so that its mapped with C=0 during boot. The .data..decrypted
section has a big chunk of memory that is currently unused. And since
second array will be used only when memory encryption is active hence
free it when encryption is not active.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: kvm@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: kvm@vger.kernel.org
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
---
 arch/x86/include/asm/mem_encrypt.h |  4 ++++
 arch/x86/kernel/kvmclock.c         | 14 ++++++++++++++
 arch/x86/kernel/vmlinux.lds.S      |  3 +++
 arch/x86/mm/init.c                 |  3 +++
 arch/x86/mm/mem_encrypt.c          | 10 ++++++++++
 5 files changed, 34 insertions(+)

Comments

Borislav Petkov Sept. 10, 2018, 12:27 p.m. UTC | #1
On Fri, Sep 07, 2018 at 12:57:30PM -0500, Brijesh Singh wrote:
> Currently, the per-cpu pvclock data is allocated dynamically when
> cpu > HVC_BOOT_ARRAY_SIZE.

Well no, you need to write this correctly - what is "cpu >
HVC_BOOT_ARRAY_SIZE" ?!

( I know what it is but I know it only because I've looked at that code before.  )

So no, please explain it in English not in code.

> The physical address of this variable is
> shared between the guest and the hypervisor hence it must be mapped as
> unencrypted (ie. C=0) when SEV is active.

This sentence is a good example about how to explain stuff in commit
messages.

> The C-bit works on a page,

"The C-bit determines the encryption status of a 4K page."

> hence we will be required to perform a

Use passive tone in your commit message: no "we", etc...

> full 4k page allocation to store a single 32-byte pvclock variable. It
> will waste fairly sizeable amount of memory since each CPU will be doing

"... will waste *a* fairly sizeable amount of ..."

> a separate 4k allocation.

Start new paragraph here and use passive tone.

> Let's define a second array for the SEV case to
> statically allocate for NR_CPUS and put this array in .data..decrypted

NR_CPUS needs explaining for the unenlightened reader. Also,

	"... put this array in *the* .data..decrypted section... "

> section so that its mapped with C=0 during boot.

<---- newline here.

> The .data..decrypted
> section has a big chunk of memory that is currently unused. And since
> second array will be used only when memory encryption is active hence

"... since *the* second array... "

s/hence //

> free it when encryption is not active.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: kvm@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: kvm@vger.kernel.org
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> ---
>  arch/x86/include/asm/mem_encrypt.h |  4 ++++
>  arch/x86/kernel/kvmclock.c         | 14 ++++++++++++++
>  arch/x86/kernel/vmlinux.lds.S      |  3 +++
>  arch/x86/mm/init.c                 |  3 +++
>  arch/x86/mm/mem_encrypt.c          | 10 ++++++++++
>  5 files changed, 34 insertions(+)
> 
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index 802b2eb..cc46584 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
>  
>  /* Architecture __weak replacement functions */
>  void __init mem_encrypt_init(void);
> +void __init free_decrypted_mem(void);

Proper prefixing:

"mem_encrypt_free_decrypted"

or so

>  bool sme_active(void);
>  bool sev_active(void);
>  
>  #define __decrypted __attribute__((__section__(".data..decrypted")))
> +#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux")))
>  
>  #else	/* !CONFIG_AMD_MEM_ENCRYPT */
>  
> @@ -80,6 +82,7 @@ static inline int __init
>  early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
>  
>  #define __decrypted
> +#define __decrypted_aux
>  
>  #endif	/* CONFIG_AMD_MEM_ENCRYPT */
>  
> @@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
>  #define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
>  
>  extern char __start_data_decrypted[], __end_data_decrypted[];
> +extern char __start_data_decrypted_aux[];
>  
>  #endif	/* __ASSEMBLY__ */
>  
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 376fd3a..6086b56 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info
>  static struct pvclock_wall_clock wall_clock __decrypted;
>  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>  
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +/*
> + * The auxiliary array will be used when SEV is active. In non-SEV case,
> + * it will be freed by free_decrypted_mem().
> + */
> +static struct pvclock_vsyscall_time_info
> +			hv_clock_aux[NR_CPUS] __decrypted_aux;

Hmm, so worst case that's 64 4K pages:

(8192*32)/4096 = 64 4K pages.

Now, the real question from all this SNAFU is, why can't all those point
to a single struct pvclock_vsyscall_time_info and all CPUs read a single
thing? Why do they have to be per-CPU and thus waste so much memory?
Paolo Bonzini Sept. 10, 2018, 12:28 p.m. UTC | #2
On 07/09/2018 19:57, Brijesh Singh wrote:
> Currently, the per-cpu pvclock data is allocated dynamically when
> cpu > HVC_BOOT_ARRAY_SIZE. The physical address of this variable is
> shared between the guest and the hypervisor hence it must be mapped as
> unencrypted (ie. C=0) when SEV is active.
> 
> The C-bit works on a page, hence we will be required to perform a
> full 4k page allocation to store a single 32-byte pvclock variable. It
> will waste fairly sizeable amount of memory since each CPU will be doing
> a separate 4k allocation. Let's define a second array for the SEV case to
> statically allocate for NR_CPUS and put this array in .data..decrypted
> section so that its mapped with C=0 during boot. The .data..decrypted
> section has a big chunk of memory that is currently unused. And since
> second array will be used only when memory encryption is active hence
> free it when encryption is not active.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: kvm@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
> Cc: kvm@vger.kernel.org
> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
> ---
>  arch/x86/include/asm/mem_encrypt.h |  4 ++++
>  arch/x86/kernel/kvmclock.c         | 14 ++++++++++++++
>  arch/x86/kernel/vmlinux.lds.S      |  3 +++
>  arch/x86/mm/init.c                 |  3 +++
>  arch/x86/mm/mem_encrypt.c          | 10 ++++++++++
>  5 files changed, 34 insertions(+)
> 
> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
> index 802b2eb..cc46584 100644
> --- a/arch/x86/include/asm/mem_encrypt.h
> +++ b/arch/x86/include/asm/mem_encrypt.h
> @@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
>  
>  /* Architecture __weak replacement functions */
>  void __init mem_encrypt_init(void);
> +void __init free_decrypted_mem(void);
>  
>  bool sme_active(void);
>  bool sev_active(void);
>  
>  #define __decrypted __attribute__((__section__(".data..decrypted")))
> +#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux")))
>  
>  #else	/* !CONFIG_AMD_MEM_ENCRYPT */
>  
> @@ -80,6 +82,7 @@ static inline int __init
>  early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
>  
>  #define __decrypted
> +#define __decrypted_aux
>  
>  #endif	/* CONFIG_AMD_MEM_ENCRYPT */
>  
> @@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
>  #define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
>  
>  extern char __start_data_decrypted[], __end_data_decrypted[];
> +extern char __start_data_decrypted_aux[];
>  
>  #endif	/* __ASSEMBLY__ */
>  
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 376fd3a..6086b56 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info
>  static struct pvclock_wall_clock wall_clock __decrypted;
>  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>  
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +/*
> + * The auxiliary array will be used when SEV is active. In non-SEV case,
> + * it will be freed by free_decrypted_mem().
> + */
> +static struct pvclock_vsyscall_time_info
> +			hv_clock_aux[NR_CPUS] __decrypted_aux;
> +#endif
> +
>  static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
>  {
>  	return &this_cpu_read(hv_clock_per_cpu)->pvti;
> @@ -269,6 +278,11 @@ static int kvmclock_setup_percpu(unsigned int cpu)
>  	/* Use the static page for the first CPUs, allocate otherwise */
>  	if (cpu < HVC_BOOT_ARRAY_SIZE)
>  		p = &hv_clock_boot[cpu];
> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +	/* Use the static page from auxiliary array instead of allocating it. */
> +	else if (sev_active())
> +		p = &hv_clock_aux[cpu - HVC_BOOT_ARRAY_SIZE];
> +#endif
>  	else
>  		p = kzalloc(sizeof(*p), GFP_KERNEL);
>  
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index 4cb1064..bde287a 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -77,6 +77,9 @@ jiffies_64 = jiffies;
>  	. = ALIGN(PMD_SIZE);					\
>  	__start_data_decrypted = .;				\
>  	*(.data..decrypted);					\
> +	. = ALIGN(PAGE_SIZE);					\
> +	__start_data_decrypted_aux = .;				\
> +	*(.data..decrypted.aux);				\
>  	. = ALIGN(PMD_SIZE);					\
>  	__end_data_decrypted = .;				\
>  
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 7a8fc26..052b279 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -815,9 +815,12 @@ void free_kernel_image_pages(void *begin, void *end)
>  		set_memory_np_noalias(begin_ul, len_pages);
>  }
>  
> +void __weak free_decrypted_mem(void) { }
> +
>  void __ref free_initmem(void)
>  {
>  	e820__reallocate_tables();
> +	free_decrypted_mem();
>  
>  	free_kernel_image_pages(&__init_begin, &__init_end);
>  }
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index b2de398..9a08c52 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -348,6 +348,16 @@ bool sev_active(void)
>  EXPORT_SYMBOL(sev_active);
>  
>  /* Architecture __weak replacement functions */
> +void __init free_decrypted_mem(void)
> +{
> +	if (mem_encrypt_active())
> +		return;
> +
> +	free_init_pages("unused decrypted",
> +			(unsigned long)__start_data_decrypted_aux,
> +			(unsigned long)__end_data_decrypted);
> +}
> +
>  void __init mem_encrypt_init(void)
>  {
>  	if (!sme_me_mask)
> 

Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Brijesh Singh Sept. 10, 2018, 1:15 p.m. UTC | #3
On 9/10/18 7:27 AM, Borislav Petkov wrote:
> On Fri, Sep 07, 2018 at 12:57:30PM -0500, Brijesh Singh wrote:
>> Currently, the per-cpu pvclock data is allocated dynamically when
>> cpu > HVC_BOOT_ARRAY_SIZE.
> Well no, you need to write this correctly - what is "cpu >
> HVC_BOOT_ARRAY_SIZE" ?!
>
> ( I know what it is but I know it only because I've looked at that code before.  )
>
> So no, please explain it in English not in code.


>> The physical address of this variable is
>> shared between the guest and the hypervisor hence it must be mapped as
>> unencrypted (ie. C=0) when SEV is active.
> This sentence is a good example about how to explain stuff in commit
> messages.
>
>> The C-bit works on a page,
> "The C-bit determines the encryption status of a 4K page."
>
>> hence we will be required to perform a
> Use passive tone in your commit message: no "we", etc...
>
>> full 4k page allocation to store a single 32-byte pvclock variable. It
>> will waste fairly sizeable amount of memory since each CPU will be doing
> "... will waste *a* fairly sizeable amount of ..."
>
>> a separate 4k allocation.
> Start new paragraph here and use passive tone.
>
>> Let's define a second array for the SEV case to
>> statically allocate for NR_CPUS and put this array in .data..decrypted
> NR_CPUS needs explaining for the unenlightened reader. Also,
>
> 	"... put this array in *the* .data..decrypted section... "
>
>> section so that its mapped with C=0 during boot.
> <---- newline here.
>
>> The .data..decrypted
>> section has a big chunk of memory that is currently unused. And since
>> second array will be used only when memory encryption is active hence
> "... since *the* second array... "
>
> s/hence //
>
>> free it when encryption is not active.
>>
>> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
>> Suggested-by: Sean Christopherson <sean.j.christopherson@intel.com>
>> Cc: Tom Lendacky <thomas.lendacky@amd.com>
>> Cc: kvm@vger.kernel.org
>> Cc: Thomas Gleixner <tglx@linutronix.de>
>> Cc: Borislav Petkov <bp@suse.de>
>> Cc: "H. Peter Anvin" <hpa@zytor.com>
>> Cc: linux-kernel@vger.kernel.org
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: Sean Christopherson <sean.j.christopherson@intel.com>
>> Cc: kvm@vger.kernel.org
>> Cc: "Radim Krčmář" <rkrcmar@redhat.com>
>> ---
>>  arch/x86/include/asm/mem_encrypt.h |  4 ++++
>>  arch/x86/kernel/kvmclock.c         | 14 ++++++++++++++
>>  arch/x86/kernel/vmlinux.lds.S      |  3 +++
>>  arch/x86/mm/init.c                 |  3 +++
>>  arch/x86/mm/mem_encrypt.c          | 10 ++++++++++
>>  5 files changed, 34 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
>> index 802b2eb..cc46584 100644
>> --- a/arch/x86/include/asm/mem_encrypt.h
>> +++ b/arch/x86/include/asm/mem_encrypt.h
>> @@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
>>  
>>  /* Architecture __weak replacement functions */
>>  void __init mem_encrypt_init(void);
>> +void __init free_decrypted_mem(void);
> Proper prefixing:
>
> "mem_encrypt_free_decrypted"
>
> or so
>
>>  bool sme_active(void);
>>  bool sev_active(void);
>>  
>>  #define __decrypted __attribute__((__section__(".data..decrypted")))
>> +#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux")))
>>  
>>  #else	/* !CONFIG_AMD_MEM_ENCRYPT */
>>  
>> @@ -80,6 +82,7 @@ static inline int __init
>>  early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
>>  
>>  #define __decrypted
>> +#define __decrypted_aux
>>  
>>  #endif	/* CONFIG_AMD_MEM_ENCRYPT */
>>  
>> @@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
>>  #define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
>>  
>>  extern char __start_data_decrypted[], __end_data_decrypted[];
>> +extern char __start_data_decrypted_aux[];
>>  
>>  #endif	/* __ASSEMBLY__ */
>>  
>> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
>> index 376fd3a..6086b56 100644
>> --- a/arch/x86/kernel/kvmclock.c
>> +++ b/arch/x86/kernel/kvmclock.c
>> @@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info
>>  static struct pvclock_wall_clock wall_clock __decrypted;
>>  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
>>  
>> +#ifdef CONFIG_AMD_MEM_ENCRYPT
>> +/*
>> + * The auxiliary array will be used when SEV is active. In non-SEV case,
>> + * it will be freed by free_decrypted_mem().
>> + */
>> +static struct pvclock_vsyscall_time_info
>> +			hv_clock_aux[NR_CPUS] __decrypted_aux;
> Hmm, so worst case that's 64 4K pages:
>
> (8192*32)/4096 = 64 4K pages.

We can minimize the worst case memory usage. The number of VCPUs
supported by KVM may be less than NR_CPUS, e.g. currently KVM_MAX_VCPUS is
set to 288.

(288 * 64)/4096 = 4 4K pages.

(pvclock_vsyscall_time_info is cache aligned so it will be 64 bytes)


#if NR_CPUS > KVM_MAX_VCPUS
#define HV_AUX_ARRAY_SIZE  KVM_MAX_VCPUS
#else
#define HV_AUX_ARRAY_SIZE NR_CPUS
#endif

static struct pvclock_vsyscall_time_info
                        hv_clock_aux[HV_AUX_ARRAY_SIZE] __decrypted_aux;


> Now, the real question from all this SNAFU is, why can't all those point
> to a single struct pvclock_vsyscall_time_info and all CPUs read a single
> thing? Why do they have to be per-CPU and thus waste so much memory?
>
Sean Christopherson Sept. 10, 2018, 1:29 p.m. UTC | #4
On Mon, 2018-09-10 at 08:15 -0500, Brijesh Singh wrote:
> 
> On 9/10/18 7:27 AM, Borislav Petkov wrote:
> > 
> > On Fri, Sep 07, 2018 at 12:57:30PM -0500, Brijesh Singh wrote:
> > > 
> > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> > > index 376fd3a..6086b56 100644
> > > --- a/arch/x86/kernel/kvmclock.c
> > > +++ b/arch/x86/kernel/kvmclock.c
> > > @@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info
> > >  static struct pvclock_wall_clock wall_clock __decrypted;
> > >  static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
> > >  
> > > +#ifdef CONFIG_AMD_MEM_ENCRYPT
> > > +/*
> > > + * The auxiliary array will be used when SEV is active. In non-SEV case,
> > > + * it will be freed by free_decrypted_mem().
> > > + */
> > > +static struct pvclock_vsyscall_time_info
> > > +			hv_clock_aux[NR_CPUS] __decrypted_aux;
> > Hmm, so worst case that's 64 4K pages:
> > 
> > (8192*32)/4096 = 64 4K pages.
> We can minimize the worst case memory usage. The number of VCPUs
> supported by KVM may be less than NR_CPUS, e.g. currently KVM_MAX_VCPUS is
> set to 288.

KVM_MAX_VCPUS is a property of the host, whereas this code runs in the
guest, e.g. KVM_MAX_VCPUS could be 2048 in the host for all we know.

> (288 * 64)/4096 = 4 4K pages.
> 
> (pvclock_vsyscall_time_info is cache aligned so it will be 64 bytes)

Ah, I was wondering why my calculations were always different than
yours.  I was looking at struct pvclock_vcpu_time_info, which is 32
bytes.
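
For completeness, with the 64-byte, cache-aligned struct and the same
NR_CPUS=8192 worst case used above, the calculation becomes:

    (8192 * 64) / 4096 = 128 4K pages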

> #if NR_CPUS > KVM_MAX_VCPUS
> #define HV_AUX_ARRAY_SIZE  KVM_MAX_VCPUS
> #else
> #define HV_AUX_ARRAY_SIZE NR_CPUS
> #endif
> 
> static struct pvclock_vsyscall_time_info
>                         hv_clock_aux[HV_AUX_ARRAY_SIZE] __decrypted_aux;
Brijesh Singh Sept. 10, 2018, 3:10 p.m. UTC | #5
On 09/10/2018 08:29 AM, Sean Christopherson wrote:
...

>>>> + */
>>>> +static struct pvclock_vsyscall_time_info
>>>> +			hv_clock_aux[NR_CPUS] __decrypted_aux;
>>> Hmm, so worst case that's 64 4K pages:
>>>
>>> (8192*32)/4096 = 64 4K pages.
>> We can minimize the worst case memory usage. The number of VCPUs
>> supported by KVM may be less than NR_CPUS, e.g. currently KVM_MAX_VCPUS is
>> set to 288.
> 
> KVM_MAX_VCPUS is a property of the host, whereas this code runs in the
> guest, e.g. KVM_MAX_VCPUS could be 2048 in the host for all we know.
> 


IIRC, during guest creation time qemu will check the host supported
VCPUS count. If count is greater than KVM_MAX_VCPUS then it will
fail to launch guest (or fail to hot plug vcpus). In other words, the
number of vcpus in a KVM guest will never be > KVM_MAX_VCPUS.

Am I missing something ?
Sean Christopherson Sept. 10, 2018, 3:28 p.m. UTC | #6
On Mon, 2018-09-10 at 10:10 -0500, Brijesh Singh wrote:
> 
> On 09/10/2018 08:29 AM, Sean Christopherson wrote:
> ...
> 
> > > > > + */
> > > > > +static struct pvclock_vsyscall_time_info
> > > > > +			hv_clock_aux[NR_CPUS] __decrypted_aux;
> > > > Hmm, so worst case that's 64 4K pages:
> > > > 
> > > > (8192*32)/4096 = 64 4K pages.
> > > We can minimize the worst case memory usage. The number of VCPUs
> > > supported by KVM may be less than NR_CPUS, e.g. currently KVM_MAX_VCPUS is
> > > set to 288.
> > KVM_MAX_VCPUS is a property of the host, whereas this code runs in the
> > guest, e.g. KVM_MAX_VCPUS could be 2048 in the host for all we know.
> > 
> 
> IIRC, during guest creation time qemu will check the host supported
> VCPUS count. If count is greater than KVM_MAX_VCPUS then it will
> fail to launch guest (or fail to hot plug vcpus). In other words, the
> number of vcpus in a KVM guest will never be > KVM_MAX_VCPUS.
> 
> Am I missing something ?

KVM_MAX_VCPUS is a definition for use in the *host*, it's even defined
in kvm_host.h.  The guest's pvclock code won't get magically recompiled
if KVM_MAX_VCPUS is changed in the host.  KVM_MAX_VCPUS is an arbitrary
value in the sense that there isn't a fundamental hard limit, i.e. the
value can be changed, either for a custom KVM build or in mainline,
e.g. it was bumped in 2016:

commit 682f732ecf7396e9d6fe24d44738966699fae6c0
Author: Radim Krčmář <rkrcmar@redhat.com>
Date:   Tue Jul 12 22:09:29 2016 +0200

    KVM: x86: bump MAX_VCPUS to 288
    
    288 is in high demand because of Knights Landing CPU.
    We cannot set the limit to 640k, because that would be wasting space.
    
    Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
    Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 074b5c760327..21a40dc7aad6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -34,7 +34,7 @@
 #include <asm/asm.h>
 #include <asm/kvm_page_track.h>
 
-#define KVM_MAX_VCPUS 255
+#define KVM_MAX_VCPUS 288
 #define KVM_SOFT_MAX_VCPUS 240
 #define KVM_USER_MEM_SLOTS 509
 /* memory slots that are not exposed to userspace */
Brijesh Singh Sept. 10, 2018, 3:30 p.m. UTC | #7
On 09/10/2018 10:28 AM, Sean Christopherson wrote:
...

>>
>> IIRC, during guest creation time qemu will check the host supported
>> VCPUS count. If count is greater than KVM_MAX_VCPUS then it will
>> fail to launch guest (or fail to hot plug vcpus). In other words, the
>> number of vcpus in a KVM guest will never be > KVM_MAX_VCPUS.
>>
>> Am I missing something ?
> 
> KVM_MAX_VCPUS is a definition for use in the *host*, it's even defined
> in kvm_host.h.  The guest's pvclock code won't get magically recompiled
> if KVM_MAX_VCPUS is changed in the host.  KVM_MAX_VCPUS is an arbitrary
> value in the sense that there isn't a fundamental hard limit, i.e. the
> value can be changed, either for a custom KVM build or in mainline,
> e.g. it was bumped in 2016:
> 

Ah, I see your point. Thanks for clarifying it.
Borislav Petkov Sept. 10, 2018, 3:53 p.m. UTC | #8
On Mon, Sep 10, 2018 at 08:15:38AM -0500, Brijesh Singh wrote:
> > Now, the real question from all this SNAFU is, why can't all those point
> > to a single struct pvclock_vsyscall_time_info and all CPUs read a single
> > thing? Why do they have to be per-CPU and thus waste so much memory?

You forgot to answer to the real question - why do we need those things
to be perCPU and why can't we use a single instance to share with *all*
CPUs?

Because if we can, then we're golden!
Sean Christopherson Sept. 10, 2018, 4:13 p.m. UTC | #9
On Mon, 2018-09-10 at 17:53 +0200, Borislav Petkov wrote:
> On Mon, Sep 10, 2018 at 08:15:38AM -0500, Brijesh Singh wrote:
> > 
> > > 
> > > Now, the real question from all this SNAFU is, why can't all those point
> > > to a single struct pvclock_vsyscall_time_info and all CPUs read a single
> > > thing? Why do they have to be per-CPU and thus waste so much memory?
> You forgot to answer to the real question - why do we need those things
> to be perCPU and why can't we use a single instance to share with *all*
> CPUs?

I can't speak to the actual TSC stuff, but...

The pvclock ABI includes a per-vCPU bit, PVCLOCK_GUEST_STOPPED, to indicate
that the VM has been paused by the host.  The guest uses this information to
update its watchdogs to avoid false positives.  I have no idea if there are
use cases for setting STOPPED on a subset of vCPUs, but the ABI allows it so
here we are...
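
For reference, a rough sketch of the guest-side consumer of that bit,
loosely based on kvm_check_and_clear_guest_paused() in kvmclock.c
(simplified here, not part of this patch) - it is also why the flag has
to live in a per-vCPU structure:

/*
 * Illustrative only: the host sets PVCLOCK_GUEST_STOPPED in this vCPU's
 * pvclock struct and the guest's watchdog path clears it. The real
 * function also checks that the per-cpu pointer has been set up.
 */
bool kvm_check_and_clear_guest_paused(void)
{
	struct pvclock_vcpu_time_info *src = this_cpu_pvti();

	if (!(src->flags & PVCLOCK_GUEST_STOPPED))
		return false;

	src->flags &= ~PVCLOCK_GUEST_STOPPED;
	pvclock_touch_watchdogs();
	return true;
}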
Brijesh Singh Sept. 10, 2018, 4:14 p.m. UTC | #10
On 09/10/2018 10:53 AM, Borislav Petkov wrote:
> On Mon, Sep 10, 2018 at 08:15:38AM -0500, Brijesh Singh wrote:
>>> Now, the real question from all this SNAFU is, why can't all those point
>>> to a single struct pvclock_vsyscall_time_info and all CPUs read a single
>>> thing? Why do they have to be per-CPU and thus waste so much memory?
> 
> You forgot to answer to the real question - why do we need those things
> to be perCPU and why can't we use a single instance to share with *all*
> CPUs?
> 
> Because if we can, then we're golden!
> 


I did not forget to answer it. Maybe Paolo or someone more knowledgeable
in that area of the code can comment on why those are per-CPU.

-Brijesh
Borislav Petkov Sept. 10, 2018, 4:48 p.m. UTC | #11
Ok,

so *maybe* - and I pretty much have no clue about virt - but just
*maybe*, that kvmclock thing can be shared by all CPUs in a *guest*
then. As in: the guest should see stable clocks which are the same
regardless from which vCPU they're read and so on...

Just a dumb idea anyway - this is me thinking about baremetal and trying
to convert that to virt.

But I'd say the memory savings aspect is something we can discuss later,
when there's time and after the regression at hand has been addressed...

Thx.
Paolo Bonzini Sept. 11, 2018, 9:26 a.m. UTC | #12
On 10/09/2018 18:48, Borislav Petkov wrote:
> 
> so *maybe* - and I pretty much have no clue about virt - but just
> *maybe*, that kvmclock thing can be shared by all CPUs in a *guest*
> then. As in: the guest should see stable clocks which are the same
> regardless from which vCPU they're read and so on...

Usually the kvmclock structs are all the same, but there is support for
old machines with inconsistent TSCs (across different sockets typically).

Paolo
Borislav Petkov Sept. 11, 2018, 10:01 a.m. UTC | #13
On Tue, Sep 11, 2018 at 11:26:21AM +0200, Paolo Bonzini wrote:
> Usually the kvmclock structs are all the same, but there is support for
> old machines with inconsistent TSCs (across different sockets typically).

Would that be a problem, though? Sounds like an "improvement" to me. :-)

I mean, if we keep using the same TSC across all vCPUs, the guest will
actually see a single TSC and thus have stable and synchronized TSCs.
Unlike the host.

I.e., the guest will be better than the host! :-)
Paolo Bonzini Sept. 11, 2018, 10:19 a.m. UTC | #14
On 11/09/2018 12:01, Borislav Petkov wrote:
> On Tue, Sep 11, 2018 at 11:26:21AM +0200, Paolo Bonzini wrote:
>> Usually the kvmclock structs are all the same, but there is support for
>> old machines with inconsistent TSCs (across different sockets typically).
> 
> Would that be a problem, though? Sounds like an "improvement" to me. :-)
> 
> I mean, if we keep using the same TSC across all vCPUs, the guest will
> actually see a single TSC and thus have stable and synchronized TSCs.
> Unlike the host.

That's exactly what kvmclock is for, it provides a stable and
synchronized clock on top of unsynchronized TSCs.  But that's also why
you need one struct per vCPU, at least in the synchronized case.

Paolo

> I.e., the guest will be better than the host! :-)
>
Borislav Petkov Sept. 11, 2018, 10:25 a.m. UTC | #15
On Tue, Sep 11, 2018 at 12:19:13PM +0200, Paolo Bonzini wrote:
> That's exactly what kvmclock is for, it provides a stable and
> synchronized clock on top of unsynchronized TSCs.  But that's also why
> you need one struct per vCPU, at least in the synchronized case.

Why?

Why can't it be a single pointer to a struct pvclock_vsyscall_time_info
shared between all vCPUs?

Or does each vCPU write its own specific stuff into it so it has to be
per-vCPU?
Paolo Bonzini Sept. 11, 2018, 11:07 a.m. UTC | #16
On 11/09/2018 12:25, Borislav Petkov wrote:
> On Tue, Sep 11, 2018 at 12:19:13PM +0200, Paolo Bonzini wrote:
>> That's exactly what kvmclock is for, it provides a stable and
>> synchronized clock on top of unsynchronized TSCs.  But that's also why
>> you need one struct per vCPU, at least in the synchronized case.
                                                 ^^^^^^^^^^^^

unsynchronized

> 
> Why?
> 
> Why can't it be a single pointer to a struct pvclock_vsyscall_time_info
> shared between all vCPUs?
> 
> Or does each vCPU write its own specific stuff into it so it has to be
> per-vCPU?

If the host TSCs are unsynchronized then yes, that's what happens.  And
you can do live migration from synchronized to unsynchronized.

Paolo
Borislav Petkov Sept. 11, 2018, 1:55 p.m. UTC | #17
On Tue, Sep 11, 2018 at 01:07:06PM +0200, Paolo Bonzini wrote:
> If the host TSCs are unsynchronized then yes, that's what happens.  And
> you can do live migration from synchronized to unsynchronized.

Which brings us back to my original question: why would we *ever* want
to support unsynchronized TSCs in a guest? Such machines are a real
abomination for baremetal - it doesn't make *any* sense to me to have
that in guests too, if it can be helped...
Paolo Bonzini Sept. 11, 2018, 2 p.m. UTC | #18
On 11/09/2018 15:55, Borislav Petkov wrote:
> On Tue, Sep 11, 2018 at 01:07:06PM +0200, Paolo Bonzini wrote:
>> If the host TSCs are unsynchronized then yes, that's what happens.  And
>> you can do live migration from synchronized to unsynchronized.
> 
> Which brings us back to my original question: why would we *ever* want
> to support unsynchronized TSCs in a guest? Such machines are a real
> abomination for baremetal - it doesn't make *any* sense to me to have
> that in guests too, if it can be helped...

No, wait.  The host TSC is unsynchronized, _so_ you need one kvmclock
struct per vCPU.  The resulting kvmclock is synchronized.
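
To make that concrete, here is a simplified sketch of the read side
(loosely based on __pvclock_read_cycles(); not part of this patch). Every
input comes from the calling vCPU's own pvclock_vcpu_time_info, which is
what lets kvmclock present one synchronized clock on top of
unsynchronized host TSCs:

	/* src is this vCPU's pvclock_vcpu_time_info */
	u64 delta = rdtsc() - src->tsc_timestamp;
	u64 ns    = src->system_time +
		    pvclock_scale_delta(delta, src->tsc_to_system_mul,
					src->tsc_shift);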

Paolo

Patch

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 802b2eb..cc46584 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,11 +48,13 @@  int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
 
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void);
+void __init free_decrypted_mem(void);
 
 bool sme_active(void);
 bool sev_active(void);
 
 #define __decrypted __attribute__((__section__(".data..decrypted")))
+#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux")))
 
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
@@ -80,6 +82,7 @@  static inline int __init
 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
 
 #define __decrypted
+#define __decrypted_aux
 
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
@@ -93,6 +96,7 @@  early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
 #define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
 
 extern char __start_data_decrypted[], __end_data_decrypted[];
+extern char __start_data_decrypted_aux[];
 
 #endif	/* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 376fd3a..6086b56 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -65,6 +65,15 @@  static struct pvclock_vsyscall_time_info
 static struct pvclock_wall_clock wall_clock __decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * The auxiliary array will be used when SEV is active. In non-SEV case,
+ * it will be freed by free_decrypted_mem().
+ */
+static struct pvclock_vsyscall_time_info
+			hv_clock_aux[NR_CPUS] __decrypted_aux;
+#endif
+
 static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
 {
 	return &this_cpu_read(hv_clock_per_cpu)->pvti;
@@ -269,6 +278,11 @@  static int kvmclock_setup_percpu(unsigned int cpu)
 	/* Use the static page for the first CPUs, allocate otherwise */
 	if (cpu < HVC_BOOT_ARRAY_SIZE)
 		p = &hv_clock_boot[cpu];
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	/* Use the static page from auxiliary array instead of allocating it. */
+	else if (sev_active())
+		p = &hv_clock_aux[cpu - HVC_BOOT_ARRAY_SIZE];
+#endif
 	else
 		p = kzalloc(sizeof(*p), GFP_KERNEL);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4cb1064..bde287a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -77,6 +77,9 @@  jiffies_64 = jiffies;
 	. = ALIGN(PMD_SIZE);					\
 	__start_data_decrypted = .;				\
 	*(.data..decrypted);					\
+	. = ALIGN(PAGE_SIZE);					\
+	__start_data_decrypted_aux = .;				\
+	*(.data..decrypted.aux);				\
 	. = ALIGN(PMD_SIZE);					\
 	__end_data_decrypted = .;				\
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26..052b279 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,9 +815,12 @@  void free_kernel_image_pages(void *begin, void *end)
 		set_memory_np_noalias(begin_ul, len_pages);
 }
 
+void __weak free_decrypted_mem(void) { }
+
 void __ref free_initmem(void)
 {
 	e820__reallocate_tables();
+	free_decrypted_mem();
 
 	free_kernel_image_pages(&__init_begin, &__init_end);
 }
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b2de398..9a08c52 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -348,6 +348,16 @@  bool sev_active(void)
 EXPORT_SYMBOL(sev_active);
 
 /* Architecture __weak replacement functions */
+void __init free_decrypted_mem(void)
+{
+	if (mem_encrypt_active())
+		return;
+
+	free_init_pages("unused decrypted",
+			(unsigned long)__start_data_decrypted_aux,
+			(unsigned long)__end_data_decrypted);
+}
+
 void __init mem_encrypt_init(void)
 {
 	if (!sme_me_mask)