
[PATCHv2,2/4] arm64: add guest pvstate support

Message ID 20210709043713.887098-3-senozhatsky@chromium.org (mailing list archive)
State New, archived
Series arm64:kvm: teach guest sched that VCPUs can be preempted

Commit Message

Sergey Senozhatsky July 9, 2021, 4:37 a.m. UTC
PV-vcpu-state is a per-CPU struct, which, for the time being,
holds the boolean `preempted' vCPU state. During startup,
given that the host supports PV-state, each guest vCPU sends
a pointer to its per-CPU variable to the host as a payload
of the SMCCC HV call, so that the host can update vCPU state
when it puts or loads the vCPU.

This has an impact on the guest's scheduler:

[..]
  wake_up_process()
   try_to_wake_up()
    select_task_rq_fair()
     available_idle_cpu()
      vcpu_is_preempted()

Some sched benchmark data is available on the GitHub page [0].

[0] https://github.com/sergey-senozhatsky/arm64-vcpu_is_preempted

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 arch/arm64/include/asm/paravirt.h | 19 +++++++
 arch/arm64/kernel/paravirt.c      | 94 +++++++++++++++++++++++++++++++
 arch/arm64/kernel/smp.c           |  4 ++
 3 files changed, 117 insertions(+)
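
For reference, the generic vcpu_is_preempted() fallback in
include/linux/sched.h simply returns false unless the architecture overrides
it. A minimal sketch of how that override is presumably wired up to the
helper added here (the exact file and guards are assumptions; the actual
hook-up lives in another patch of this series):

    #ifdef CONFIG_PARAVIRT
    /* e.g. in arch/arm64/include/asm/spinlock.h */
    #define vcpu_is_preempted vcpu_is_preempted
    static inline bool vcpu_is_preempted(int cpu)
    {
            return paravirt_vcpu_is_preempted(cpu);
    }
    #endif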

Comments

David Edmondson July 9, 2021, 7:39 a.m. UTC | #1
On Friday, 2021-07-09 at 13:37:11 +09, Sergey Senozhatsky wrote:

> PV-vcpu-state is a per-CPU struct, which, for the time being,
> holds the boolean `preempted' vCPU state. During startup,
> given that the host supports PV-state, each guest vCPU sends
> a pointer to its per-CPU variable to the host as a payload
> of the SMCCC HV call, so that the host can update vCPU state
> when it puts or loads the vCPU.
>
> This has an impact on the guest's scheduler:
>
> [..]
>   wake_up_process()
>    try_to_wake_up()
>     select_task_rq_fair()
>      available_idle_cpu()
>       vcpu_is_preempted()
>
> Some sched benchmark data is available on the GitHub page [0].
>
> [0] https://github.com/sergey-senozhatsky/arm64-vcpu_is_preempted
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  arch/arm64/include/asm/paravirt.h | 19 +++++++
>  arch/arm64/kernel/paravirt.c      | 94 +++++++++++++++++++++++++++++++
>  arch/arm64/kernel/smp.c           |  4 ++
>  3 files changed, 117 insertions(+)
>
> diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
> index 9aa193e0e8f2..a3f7665dff38 100644
> --- a/arch/arm64/include/asm/paravirt.h
> +++ b/arch/arm64/include/asm/paravirt.h
> @@ -2,6 +2,11 @@
>  #ifndef _ASM_ARM64_PARAVIRT_H
>  #define _ASM_ARM64_PARAVIRT_H
>  
> +struct vcpu_state {
> +	bool	preempted;
> +	u8	reserved[63];
> +};
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <linux/static_call_types.h>
>  
> @@ -20,8 +25,22 @@ static inline u64 paravirt_steal_clock(int cpu)
>  
>  int __init pv_time_init(void);
>  
> +bool dummy_vcpu_is_preempted(unsigned int cpu);
> +
> +extern struct static_key pv_vcpu_is_preempted_enabled;
> +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> +
> +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> +{
> +	return static_call(pv_vcpu_is_preempted)(cpu);
> +}
> +
> +int __init pv_vcpu_state_init(void);
> +
>  #else
>  
> +#define pv_vcpu_state_init() do {} while (0)
> +
>  #define pv_time_init() do {} while (0)
>  
>  #endif // CONFIG_PARAVIRT
> diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> index 75fed4460407..d8fc46795d94 100644
> --- a/arch/arm64/kernel/paravirt.c
> +++ b/arch/arm64/kernel/paravirt.c
> @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
>  
>  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
>  
> +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
> +struct static_key pv_vcpu_is_preempted_enabled;
> +
> +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> +
>  static bool steal_acc = true;
>  static int __init parse_no_stealacc(char *arg)
>  {
> @@ -165,3 +170,92 @@ int __init pv_time_init(void)
>  
>  	return 0;
>  }
> +
> +bool dummy_vcpu_is_preempted(unsigned int cpu)
> +{
> +	return false;
> +}
> +
> +static bool __vcpu_is_preempted(unsigned int cpu)
> +{
> +	struct vcpu_state *st;
> +
> +	st = &per_cpu(vcpus_states, cpu);
> +	return READ_ONCE(st->preempted);
> +}
> +
> +static bool has_pv_vcpu_state(void)
> +{
> +	struct arm_smccc_res res;
> +
> +	/* To detect the presence of PV time support we require SMCCC 1.1+ */

"PV VCPU state support" rather than "PV time support".

> +	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
> +		return false;
> +
> +	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
> +			     ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES,
> +			     &res);
> +
> +	if (res.a0 != SMCCC_RET_SUCCESS)
> +		return false;
> +	return true;
> +}
> +
> +static int __pv_vcpu_state_hook(unsigned int cpu, int event)
> +{
> +	struct arm_smccc_res res;
> +	struct vcpu_state *st;
> +
> +	st = &per_cpu(vcpus_states, cpu);
> +	arm_smccc_1_1_invoke(event, virt_to_phys(st), &res);
> +	if (res.a0 != SMCCC_RET_SUCCESS)
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +static int vcpu_state_init(unsigned int cpu)
> +{
> +	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_INIT);
> +
> +	if (ret)
> +		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_INIT\n");
> +	return ret;
> +}
> +
> +static int vcpu_state_release(unsigned int cpu)
> +{
> +	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE);
> +
> +	if (ret)
> +		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_RELEASE\n");
> +	return ret;
> +}
> +
> +static int pv_vcpu_state_register_hooks(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +				"hypervisor/arm/pvstate:starting",
> +				vcpu_state_init,
> +				vcpu_state_release);
> +	if (ret < 0)
> +		pr_warn("Failed to register CPU hooks\n");

Include that it's PV VCPU state hooks?

> +	return 0;
> +}
> +
> +int __init pv_vcpu_state_init(void)
> +{
> +	int ret;
> +
> +	if (!has_pv_vcpu_state())
> +		return 0;
> +
> +	ret = pv_vcpu_state_register_hooks();
> +	if (ret)
> +		return ret;
> +
> +	static_call_update(pv_vcpu_is_preempted, __vcpu_is_preempted);
> +	static_key_slow_inc(&pv_vcpu_is_preempted_enabled);
> +	return 0;
> +}
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 6f6ff072acbd..20d42e0f2a99 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -50,6 +50,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/ptrace.h>
>  #include <asm/virt.h>
> +#include <asm/paravirt.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/ipi.h>
> @@ -756,6 +757,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	numa_store_cpu_info(this_cpu);
>  	numa_add_cpu(this_cpu);
>  
> +	/* Init paravirt CPU state */
> +	pv_vcpu_state_init();
> +
>  	/*
>  	 * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set
>  	 * secondary CPUs present.
> -- 
> 2.32.0.93.g670b81a890-goog

dme.
Sergey Senozhatsky July 9, 2021, 7:52 a.m. UTC | #2
On (21/07/09 08:39), David Edmondson wrote:
[..]
> > +
> > +static bool has_pv_vcpu_state(void)
> > +{
> > +	struct arm_smccc_res res;
> > +
> > +	/* To detect the presence of PV time support we require SMCCC 1.1+ */
> 
> "PV VCPU state support" rather than "PV time support".

Indeed. Thanks.

[..]
> > +static int pv_vcpu_state_register_hooks(void)
> > +{
> > +	int ret;
> > +
> > +	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > +				"hypervisor/arm/pvstate:starting",
> > +				vcpu_state_init,
> > +				vcpu_state_release);
> > +	if (ret < 0)
> > +		pr_warn("Failed to register CPU hooks\n");
> 
> Include that it's PV VCPU state hooks?

Ack.
Joel Fernandes July 9, 2021, 6:58 p.m. UTC | #3
Hi, just a few nits, the patch itself LGTM:

On Fri, Jul 9, 2021 at 12:37 AM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:
>
> PV-vcpu-state is a per-CPU struct, which, for the time being,
> holds the boolean `preempted' vCPU state. During startup,
> given that the host supports PV-state, each guest vCPU sends
> a pointer to its per-CPU variable to the host as a payload
> of the SMCCC HV call, so that the host can update vCPU state
> when it puts or loads the vCPU.
>
> This has an impact on the guest's scheduler:
>
> [..]
>   wake_up_process()
>    try_to_wake_up()
>     select_task_rq_fair()
>      available_idle_cpu()
>       vcpu_is_preempted()
>
> Some sched benchmark data is available on the GitHub page [0].
>
> [0] https://github.com/sergey-senozhatsky/arm64-vcpu_is_preempted
>
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  arch/arm64/include/asm/paravirt.h | 19 +++++++
>  arch/arm64/kernel/paravirt.c      | 94 +++++++++++++++++++++++++++++++
>  arch/arm64/kernel/smp.c           |  4 ++
>  3 files changed, 117 insertions(+)
>
> diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
> index 9aa193e0e8f2..a3f7665dff38 100644
> --- a/arch/arm64/include/asm/paravirt.h
> +++ b/arch/arm64/include/asm/paravirt.h
> @@ -2,6 +2,11 @@
>  #ifndef _ASM_ARM64_PARAVIRT_H
>  #define _ASM_ARM64_PARAVIRT_H
>
> +struct vcpu_state {
> +       bool    preempted;
> +       u8      reserved[63];
> +};
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <linux/static_call_types.h>
>
> @@ -20,8 +25,22 @@ static inline u64 paravirt_steal_clock(int cpu)
>
>  int __init pv_time_init(void);
>
> +bool dummy_vcpu_is_preempted(unsigned int cpu);
> +
> +extern struct static_key pv_vcpu_is_preempted_enabled;

pv_vcpu_is_preempted_enabled static_key is not used in any patch.
Maybe it is stale?

> +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> +
> +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> +{
> +       return static_call(pv_vcpu_is_preempted)(cpu);
> +}
> +
> +int __init pv_vcpu_state_init(void);
> +
>  #else
>
> +#define pv_vcpu_state_init() do {} while (0)
> +
>  #define pv_time_init() do {} while (0)
>
>  #endif // CONFIG_PARAVIRT
> diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> index 75fed4460407..d8fc46795d94 100644
> --- a/arch/arm64/kernel/paravirt.c
> +++ b/arch/arm64/kernel/paravirt.c
> @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
>
>  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
>
> +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
> +struct static_key pv_vcpu_is_preempted_enabled;
> +
> +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);

Could we use DEFINE_STATIC_CALL_NULL and get rid of the dummy
function? I believe that makes the function trampoline a return
instruction until it is updated.

> +
>  static bool steal_acc = true;
>  static int __init parse_no_stealacc(char *arg)
>  {
> @@ -165,3 +170,92 @@ int __init pv_time_init(void)
>
>         return 0;
>  }
> +
> +bool dummy_vcpu_is_preempted(unsigned int cpu)
> +{
> +       return false;
> +}
> +
> +static bool __vcpu_is_preempted(unsigned int cpu)
> +{
> +       struct vcpu_state *st;
> +
> +       st = &per_cpu(vcpus_states, cpu);
> +       return READ_ONCE(st->preempted);

I guess you could just do:
{
  return READ_ONCE(per_cpu(vcpus_states, cpu).preempted);
}

> +}
> +
> +static bool has_pv_vcpu_state(void)
> +{
> +       struct arm_smccc_res res;
> +
> +       /* To detect the presence of PV time support we require SMCCC 1.1+ */
> +       if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
> +               return false;
> +
> +       arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
> +                            ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES,
> +                            &res);
> +
> +       if (res.a0 != SMCCC_RET_SUCCESS)
> +               return false;
> +       return true;
> +}
> +
> +static int __pv_vcpu_state_hook(unsigned int cpu, int event)
> +{
> +       struct arm_smccc_res res;
> +       struct vcpu_state *st;
> +
> +       st = &per_cpu(vcpus_states, cpu);
> +       arm_smccc_1_1_invoke(event, virt_to_phys(st), &res);
> +       if (res.a0 != SMCCC_RET_SUCCESS)
> +               return -EINVAL;
> +       return 0;
> +}
> +
> +static int vcpu_state_init(unsigned int cpu)
> +{
> +       int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_INIT);
> +
> +       if (ret)
> +               pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_INIT\n");
> +       return ret;
> +}
> +
> +static int vcpu_state_release(unsigned int cpu)
> +{
> +       int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE);
> +
> +       if (ret)
> +               pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_RELEASE\n");
> +       return ret;
> +}
> +
> +static int pv_vcpu_state_register_hooks(void)
> +{
> +       int ret;
> +
> +       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +                               "hypervisor/arm/pvstate:starting",
> +                               vcpu_state_init,
> +                               vcpu_state_release);
> +       if (ret < 0)
> +               pr_warn("Failed to register CPU hooks\n");
> +       return 0;
> +}
> +
> +int __init pv_vcpu_state_init(void)
> +{
> +       int ret;
> +
> +       if (!has_pv_vcpu_state())
> +               return 0;
> +
> +       ret = pv_vcpu_state_register_hooks();
> +       if (ret)
> +               return ret;
> +
> +       static_call_update(pv_vcpu_is_preempted, __vcpu_is_preempted);
> +       static_key_slow_inc(&pv_vcpu_is_preempted_enabled);

I think this static key inc is also stale.

thanks,

-Joel
Sergey Senozhatsky July 9, 2021, 9:53 p.m. UTC | #4
Hi Joel,

On (21/07/09 14:58), Joel Fernandes wrote:
[..]
> > +struct vcpu_state {
> > +       bool    preempted;
> > +       u8      reserved[63];
> > +};
> > +
> >  #ifdef CONFIG_PARAVIRT
> >  #include <linux/static_call_types.h>
> >
> > @@ -20,8 +25,22 @@ static inline u64 paravirt_steal_clock(int cpu)
> >
> >  int __init pv_time_init(void);
> >
> > +bool dummy_vcpu_is_preempted(unsigned int cpu);
> > +
> > +extern struct static_key pv_vcpu_is_preempted_enabled;
> 
> pv_vcpu_is_preempted_enabled static_key is not used in any patch.
> Maybe it is stale?

Oh, it is, thanks.

> > +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> > +
> > +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> > +{
> > +       return static_call(pv_vcpu_is_preempted)(cpu);
> > +}
> > +
> > +int __init pv_vcpu_state_init(void);
> > +
> >  #else
> >
> > +#define pv_vcpu_state_init() do {} while (0)
> > +
> >  #define pv_time_init() do {} while (0)
> >
> >  #endif // CONFIG_PARAVIRT
> > diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> > index 75fed4460407..d8fc46795d94 100644
> > --- a/arch/arm64/kernel/paravirt.c
> > +++ b/arch/arm64/kernel/paravirt.c
> > @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
> >
> >  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
> >
> > +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
> > +struct static_key pv_vcpu_is_preempted_enabled;
> > +
> > +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> 
> Could we use DEFINE_STATIC_CALL_NULL and get rid of the dummy
> function? I believe that makes the function trampoline a return
> instruction until it is updated.

Is DEFINE_STATIC_CALL_NULL for cases where the function returns void?

We need something that returns `false` to vcpu_is_preempted() before
pv vcpu-state is initialised, and reads per_cpu(vcpus_states) once it is.

[..]
> > +static bool __vcpu_is_preempted(unsigned int cpu)
> > +{
> > +       struct vcpu_state *st;
> > +
> > +       st = &per_cpu(vcpus_states, cpu);
> > +       return READ_ONCE(st->preempted);
> 
> I guess you could just do:
> {
>   return READ_ONCE(per_cpu(vcpus_states, cpu).preempted);
> }

Ack.
Joel Fernandes July 11, 2021, 4:58 p.m. UTC | #5
On Fri, Jul 9, 2021 at 5:53 PM Sergey Senozhatsky
<senozhatsky@chromium.org> wrote:

> > > +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> > > +
> > > +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> > > +{
> > > +       return static_call(pv_vcpu_is_preempted)(cpu);
> > > +}
> > > +
> > > +int __init pv_vcpu_state_init(void);
> > > +
> > >  #else
> > >
> > > +#define pv_vcpu_state_init() do {} while (0)
> > > +
> > >  #define pv_time_init() do {} while (0)
> > >
> > >  #endif // CONFIG_PARAVIRT
> > > diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> > > index 75fed4460407..d8fc46795d94 100644
> > > --- a/arch/arm64/kernel/paravirt.c
> > > +++ b/arch/arm64/kernel/paravirt.c
> > > @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
> > >
> > >  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
> > >
> > > +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
> > > +struct static_key pv_vcpu_is_preempted_enabled;
> > > +
> > > +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> >
> > Could we use DEFINE_STATIC_CALL_NULL and get rid of the dummy
> > function? I believe that makes the function trampoline a return
> > instruction until it is updated.
>
> Is DEFINE_STATIC_CALL_NULL for cases where the function returns void?
>
> We need something that returns `false` to vcpu_is_preempted() before
> pv vcpu-state is initialised, and reads per_cpu(vcpus_states) once it is.

Ah, that might be problematic. In which case what you did is fine. Thanks,

- Joel



>
> [..]
> > > +static bool __vcpu_is_preempted(unsigned int cpu)
> > > +{
> > > +       struct vcpu_state *st;
> > > +
> > > +       st = &per_cpu(vcpus_states, cpu);
> > > +       return READ_ONCE(st->preempted);
> >
> > I guess you could just do:
> > {
> >   return READ_ONCE(per_cpu(vcpus_states, cpu).preempted);
> > }
>
> Ack.
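
A note on the static_call discussion above: DEFINE_STATIC_CALL_NULL makes an
uninitialised call act as a no-op, so it is only suitable when the return
value is never consumed. A call that must read as `false' before
initialisation could instead start from the ret0 trampoline that mainline
gained around v5.12. A sketch using this patch's names (an alternative, not
something posted in the thread):

    bool __vcpu_is_preempted(unsigned int cpu); /* typed prototype for typeof() */

    /* header side */
    DECLARE_STATIC_CALL(pv_vcpu_is_preempted, __vcpu_is_preempted);

    /* paravirt.c: initially a trampoline that returns 0, i.e. `false' */
    DEFINE_STATIC_CALL_RET0(pv_vcpu_is_preempted, __vcpu_is_preempted);

    /* once PV vcpu-state discovery succeeds, as in pv_vcpu_state_init(): */
    static_call_update(pv_vcpu_is_preempted, __vcpu_is_preempted);

Either way the dummy function becomes unnecessary.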
Marc Zyngier July 12, 2021, 3:42 p.m. UTC | #6
On Fri, 09 Jul 2021 05:37:11 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> PV-vcpu-state is a per-CPU struct, which, for the time being,
> holds the boolean `preempted' vCPU state. During startup,
> given that the host supports PV-state, each guest vCPU sends
> a pointer to its per-CPU variable to the host as a payload

What is the expected memory type for this memory region? What is its
life cycle? Where is it allocated from?

> of the SMCCC HV call, so that the host can update vCPU state
> when it puts or loads the vCPU.
> 
> This has an impact on the guest's scheduler:
> 
> [..]
>   wake_up_process()
>    try_to_wake_up()
>     select_task_rq_fair()
>      available_idle_cpu()
>       vcpu_is_preempted()
> 
> Some sched benchmark data is available on the GitHub page [0].
> 
> [0] https://github.com/sergey-senozhatsky/arm64-vcpu_is_preempted

Please include these results in the cover letter. I tend to reply to
email while offline, and I can't comment on GH.

> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  arch/arm64/include/asm/paravirt.h | 19 +++++++
>  arch/arm64/kernel/paravirt.c      | 94 +++++++++++++++++++++++++++++++
>  arch/arm64/kernel/smp.c           |  4 ++
>  3 files changed, 117 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
> index 9aa193e0e8f2..a3f7665dff38 100644
> --- a/arch/arm64/include/asm/paravirt.h
> +++ b/arch/arm64/include/asm/paravirt.h
> @@ -2,6 +2,11 @@
>  #ifndef _ASM_ARM64_PARAVIRT_H
>  #define _ASM_ARM64_PARAVIRT_H
>  
> +struct vcpu_state {

If this is KVM specific (which it most likely is), please name-space
it correctly, and move it to a KVM-specific location.

> +	bool	preempted;
> +	u8	reserved[63];

Why 63? Do you attach any particular meaning to a 64byte structure
(and before you say "cache line size", please look at some of the
cache line sizes we have to deal with...).

This should also be versioned from day-1, one way or another.

> +};
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <linux/static_call_types.h>
>  
> @@ -20,8 +25,22 @@ static inline u64 paravirt_steal_clock(int cpu)
>  
>  int __init pv_time_init(void);
>  
> +bool dummy_vcpu_is_preempted(unsigned int cpu);
> +
> +extern struct static_key pv_vcpu_is_preempted_enabled;
> +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> +
> +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> +{
> +	return static_call(pv_vcpu_is_preempted)(cpu);
> +}
> +
> +int __init pv_vcpu_state_init(void);
> +
>  #else
>  
> +#define pv_vcpu_state_init() do {} while (0)
> +
>  #define pv_time_init() do {} while (0)
>  
>  #endif // CONFIG_PARAVIRT
> diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> index 75fed4460407..d8fc46795d94 100644
> --- a/arch/arm64/kernel/paravirt.c
> +++ b/arch/arm64/kernel/paravirt.c
> @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
>  
>  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
>  
> +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);

nit: there is only one 'state' structure per CPU, so I'd prefer the
singular form.

> +struct static_key pv_vcpu_is_preempted_enabled;
> +
> +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> +
>  static bool steal_acc = true;
>  static int __init parse_no_stealacc(char *arg)
>  {
> @@ -165,3 +170,92 @@ int __init pv_time_init(void)
>  
>  	return 0;
>  }
> +
> +bool dummy_vcpu_is_preempted(unsigned int cpu)

Why does this have to be global?

> +{
> +	return false;
> +}
> +
> +static bool __vcpu_is_preempted(unsigned int cpu)
> +{
> +	struct vcpu_state *st;
> +
> +	st = &per_cpu(vcpus_states, cpu);
> +	return READ_ONCE(st->preempted);
> +}
> +
> +static bool has_pv_vcpu_state(void)
> +{
> +	struct arm_smccc_res res;
> +
> +	/* To detect the presence of PV time support we require SMCCC 1.1+ */
> +	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
> +		return false;
> +
> +	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
> +			     ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES,
> +			     &res);
> +
> +	if (res.a0 != SMCCC_RET_SUCCESS)
> +		return false;
> +	return true;

Please move all this over the the KVM-specific discovery mechanism.

> +}
> +
> +static int __pv_vcpu_state_hook(unsigned int cpu, int event)
> +{
> +	struct arm_smccc_res res;
> +	struct vcpu_state *st;
> +
> +	st = &per_cpu(vcpus_states, cpu);
> +	arm_smccc_1_1_invoke(event, virt_to_phys(st), &res);
> +	if (res.a0 != SMCCC_RET_SUCCESS)
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +static int vcpu_state_init(unsigned int cpu)
> +{
> +	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_INIT);
> +
> +	if (ret)
> +		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_INIT\n");

pr_warn_once(), please.

> +	return ret;
> +}
> +
> +static int vcpu_state_release(unsigned int cpu)
> +{
> +	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE);
> +
> +	if (ret)
> +		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_RELEASE\n");
> +	return ret;
> +}
> +
> +static int pv_vcpu_state_register_hooks(void)
> +{
> +	int ret;
> +
> +	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +				"hypervisor/arm/pvstate:starting",
> +				vcpu_state_init,
> +				vcpu_state_release);
> +	if (ret < 0)
> +		pr_warn("Failed to register CPU hooks\n");
> +	return 0;
> +}
> +
> +int __init pv_vcpu_state_init(void)
> +{
> +	int ret;
> +
> +	if (!has_pv_vcpu_state())
> +		return 0;
> +
> +	ret = pv_vcpu_state_register_hooks();
> +	if (ret)
> +		return ret;
> +
> +	static_call_update(pv_vcpu_is_preempted, __vcpu_is_preempted);
> +	static_key_slow_inc(&pv_vcpu_is_preempted_enabled);
> +	return 0;
> +}
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 6f6ff072acbd..20d42e0f2a99 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -50,6 +50,7 @@
>  #include <asm/tlbflush.h>
>  #include <asm/ptrace.h>
>  #include <asm/virt.h>
> +#include <asm/paravirt.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/ipi.h>
> @@ -756,6 +757,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>  	numa_store_cpu_info(this_cpu);
>  	numa_add_cpu(this_cpu);
>  
> +	/* Init paravirt CPU state */
> +	pv_vcpu_state_init();
> +
>  	/*
>  	 * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set
>  	 * secondary CPUs present.

Thanks,

	M.
Sergey Senozhatsky July 21, 2021, 2:05 a.m. UTC | #7
On (21/07/12 16:42), Marc Zyngier wrote:
> > 
> > PV-vcpu-state is a per-CPU struct, which, for the time being,
> > holds the boolean `preempted' vCPU state. During startup,
> > given that the host supports PV-state, each guest vCPU sends
> > a pointer to its per-CPU variable to the host as a payload
> 
> What is the expected memory type for this memory region? What is its
> life cycle? Where is it allocated from?

Guest per-CPU area, whose physical address is shared with the host.

> > of the SMCCC HV call, so that the host can update vCPU state
> > when it puts or loads the vCPU.
> > 
> > This has an impact on the guest's scheduler:
> > 
> > [..]
> >   wake_up_process()
> >    try_to_wake_up()
> >     select_task_rq_fair()
> >      available_idle_cpu()
> >       vcpu_is_preempted()
> > 
> > Some sched benchmark data is available on the GitHub page [0].
> > 
> > [0] https://github.com/sergey-senozhatsky/arm64-vcpu_is_preempted
> 
> Please include these results in the cover letter. I tend to reply to
> email while offline, and I can't comment on GH.

ACK.

> > +struct vcpu_state {
> 
> If this is KVM specific (which it most likely is), please name-space
> it correctly, and move it to a KVM-specific location.

ACK.

> > +	bool	preempted;
> > +	u8	reserved[63];
> 
> Why 63? Do you attach any particular meaning to a 64byte structure
> (and before you say "cache line size", please look at some of the
> cache line sizes we have to deal with...).

We do have some future plans to share some bits of the guest's context
with the host.

> This should also be versioned from day-1, one way or another.

Makes sense.
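
One hypothetical way to version the layout from day one (an illustration
only, not part of the posted patch) is to embed a version field that the
guest advertises when it registers the area, so the host knows which layout
it is writing and can reject versions it does not understand:

    struct vcpu_state {
            __le16  version;        /* layout version, starting at 1 */
            u8      preempted;      /* u8 rather than bool: this is shared ABI */
            u8      reserved[61];   /* keeps the struct at 64 bytes */
    };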

> > +};
> > +
> >  #ifdef CONFIG_PARAVIRT
> >  #include <linux/static_call_types.h>
> >  
> > @@ -20,8 +25,22 @@ static inline u64 paravirt_steal_clock(int cpu)
> >  
> >  int __init pv_time_init(void);
> >  
> > +bool dummy_vcpu_is_preempted(unsigned int cpu);
> > +
> > +extern struct static_key pv_vcpu_is_preempted_enabled;
> > +DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> > +
> > +static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
> > +{
> > +	return static_call(pv_vcpu_is_preempted)(cpu);
> > +}
> > +
> > +int __init pv_vcpu_state_init(void);
> > +
> >  #else
> >  
> > +#define pv_vcpu_state_init() do {} while (0)
> > +
> >  #define pv_time_init() do {} while (0)
> >  
> >  #endif // CONFIG_PARAVIRT
> > diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
> > index 75fed4460407..d8fc46795d94 100644
> > --- a/arch/arm64/kernel/paravirt.c
> > +++ b/arch/arm64/kernel/paravirt.c
> > @@ -40,6 +40,11 @@ struct pv_time_stolen_time_region {
> >  
> >  static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
> >  
> > +static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
> 
> nit: there is only one 'state' structure per CPU, so I'd prefer the
> singular form.

ACK.

> > +struct static_key pv_vcpu_is_preempted_enabled;
> > +
> > +DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
> > +
> >  static bool steal_acc = true;
> >  static int __init parse_no_stealacc(char *arg)
> >  {
> > @@ -165,3 +170,92 @@ int __init pv_time_init(void)
> >  
> >  	return 0;
> >  }
> > +
> > +bool dummy_vcpu_is_preempted(unsigned int cpu)
> 
> Why does this have to be global?

I think this can be moved away from the header, so that we don't need
to DECLARE_STATIC_CALL() with a dummy function.

> > +static bool has_pv_vcpu_state(void)
> > +{
> > +	struct arm_smccc_res res;
> > +
> > +	/* To detect the presence of PV time support we require SMCCC 1.1+ */
> > +	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
> > +		return false;
> > +
> > +	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
> > +			     ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES,
> > +			     &res);
> > +
> > +	if (res.a0 != SMCCC_RET_SUCCESS)
> > +		return false;
> > +	return true;
> 
> Please move all this over the the KVM-specific discovery mechanism.

Will take a look.

> > +static int __pv_vcpu_state_hook(unsigned int cpu, int event)
> > +{
> > +	struct arm_smccc_res res;
> > +	struct vcpu_state *st;
> > +
> > +	st = &per_cpu(vcpus_states, cpu);
> > +	arm_smccc_1_1_invoke(event, virt_to_phys(st), &res);
> > +	if (res.a0 != SMCCC_RET_SUCCESS)
> > +		return -EINVAL;
> > +	return 0;
> > +}
> > +
> > +static int vcpu_state_init(unsigned int cpu)
> > +{
> > +	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_INIT);
> > +
> > +	if (ret)
> > +		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_INIT\n");
> 
> pr_warn_once(), please.

ACK.
Marc Zyngier July 21, 2021, 8:22 a.m. UTC | #8
On Wed, 21 Jul 2021 03:05:25 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> On (21/07/12 16:42), Marc Zyngier wrote:
> > > 
> > > PV-vcpu-state is a per-CPU struct, which, for the time being,
> > > holds the boolean `preempted' vCPU state. During startup,
> > > given that the host supports PV-state, each guest vCPU sends
> > > a pointer to its per-CPU variable to the host as a payload
> > 
> > What is the expected memory type for this memory region? What is its
> > life cycle? Where is it allocated from?
> 
> Guest per-CPU area, whose physical address is shared with the
> host.

Again: what are the memory types you expect this to be used with? When
will the hypervisor ever stop accessing this? How does it work across
reset?

I'm sorry to be that pressing, but these are the gory details that
actually matter if you want this thing to be maintainable.

Thanks,

	M.
Sergey Senozhatsky July 21, 2021, 8:47 a.m. UTC | #9
On (21/07/21 09:22), Marc Zyngier wrote:
> On Wed, 21 Jul 2021 03:05:25 +0100,
> Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> > 
> > On (21/07/12 16:42), Marc Zyngier wrote:
> > > > 
> > > > PV-vcpu-state is a per-CPU struct, which, for the time being,
> > > > holds the boolean `preempted' vCPU state. During startup,
> > > > given that the host supports PV-state, each guest vCPU sends
> > > > a pointer to its per-CPU variable to the host as a payload
> > > 
> > > What is the expected memory type for this memory region? What is its
> > > life cycle? Where is it allocated from?
> > 
> > Guest per-CPU area, whose physical address is shared with the
> > host.
> 
> Again: what are the memory types you expect this to be used with?

I heard your questions, I'm trying to figure out the answers now.

As for the memory type - I presume you are talking about coherent vs
non-coherent memory. Can guest per-CPU memory be non-coherent? Guest
never writes anything to the region of memory it shares with the host,
it only reads what the host writes to it. All reads and writes are
done from CPU (no devices DMA access, etc).

Do we need any cache flushes/syncs in this case?

> When will the hypervisor ever stop accessing this?

KVM always accesses it for the vCPUs that are getting scheduled out or
scheduled in on the host side.

> How does it work across reset?

I need to figure out what happens during reset/migration in the first
place.

> I'm sorry to be that pressing, but these are the gory details that
> actually matter if you want this thing to be maintainable.

Sure, no problem.
Marc Zyngier July 21, 2021, 10:16 a.m. UTC | #10
On Wed, 21 Jul 2021 09:47:52 +0100,
Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> 
> On (21/07/21 09:22), Marc Zyngier wrote:
> > On Wed, 21 Jul 2021 03:05:25 +0100,
> > Sergey Senozhatsky <senozhatsky@chromium.org> wrote:
> > > 
> > > On (21/07/12 16:42), Marc Zyngier wrote:
> > > > > 
> > > > > PV-vcpu-state is a per-CPU struct, which, for the time being,
> > > > > holds the boolean `preempted' vCPU state. During startup,
> > > > > given that the host supports PV-state, each guest vCPU sends
> > > > > a pointer to its per-CPU variable to the host as a payload
> > > > 
> > > > What is the expected memory type for this memory region? What is its
> > > > life cycle? Where is it allocated from?
> > > 
> > > Guest per-CPU area, whose physical address is shared with the
> > > host.
> > 
> > Again: what are the memory types you expect this to be used with?
> 
> I heard your questions, I'm trying to figure out the answers now.
> 
> As for the memory type - I presume you are talking about coherent vs
> non-coherent memory.

No. I'm talking about cacheable vs non-cacheable. The ARM architecture
is always coherent for memory that is inner-shareable, which applies
to any system running Linux. On the other hand, there is no
architected cache snooping when using non-cacheable accesses.

> Can guest per-CPU memory be non-coherent? Guest never writes
> anything to the region of memory it shares with the host, it only
> reads what the host writes to it. All reads and writes are done from
> CPU (no devices DMA access, etc).
> 
> Do we need any cache flushes/syncs in this case?

If you expect the guest to have non-cacheable mappings (or to run with
its MMU off at any point, which amounts to the same thing) *and* still
be able to access the shared page, then *someone* will have to perform
CMOs to make these writes visible to the PoC (unless you have FWB).

Needless to say, this would kill any sort of performance gain this
feature could hypothetically bring. Defining the scope for the access
would help mitigate this, even if that's just a sentence saying "the
shared page *must* be accessed from a cacheable mapping".

> 
> > When will the hypervisor ever stop accessing this?
> 
> KVM always accesses it for the vCPUs that are getting scheduled out or
> scheduled in on the host side.

I was more hinting at whether there was a way to disable this at
runtime. Think of a guest using kexec, for example, where you really
don't want the hypervisor to start messing with memory that has since
been reallocated by the guest.

> > How does it work across reset?
> 
> I need to figure out what happens during reset/migration in the first
> place.

Yup.

	M.
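
To illustrate the put/load side discussed above, a host-side sketch with
hypothetical field and function names (the real host support belongs to the
remaining patches of this series). kvm_write_guest() stores through a
cacheable CPU mapping of guest memory, which is one more reason to require
that the guest maps the shared page as cacheable too:

    static void kvm_update_vcpu_preempted(struct kvm_vcpu *vcpu, bool preempted)
    {
            u64 base = vcpu->arch.vcpu_state_gpa; /* recorded at INIT time */
            u8 val = preempted;

            if (base == ~0ULL) /* vCPU state was never registered */
                    return;

            kvm_write_guest(vcpu->kvm,
                            base + offsetof(struct vcpu_state, preempted),
                            &val, sizeof(val));
    }

This would be called from kvm_arch_vcpu_put() and kvm_arch_vcpu_load()
respectively.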

Patch

diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
index 9aa193e0e8f2..a3f7665dff38 100644
--- a/arch/arm64/include/asm/paravirt.h
+++ b/arch/arm64/include/asm/paravirt.h
@@ -2,6 +2,11 @@ 
 #ifndef _ASM_ARM64_PARAVIRT_H
 #define _ASM_ARM64_PARAVIRT_H
 
+struct vcpu_state {
+	bool	preempted;
+	u8	reserved[63];
+};
+
 #ifdef CONFIG_PARAVIRT
 #include <linux/static_call_types.h>
 
@@ -20,8 +25,22 @@  static inline u64 paravirt_steal_clock(int cpu)
 
 int __init pv_time_init(void);
 
+bool dummy_vcpu_is_preempted(unsigned int cpu);
+
+extern struct static_key pv_vcpu_is_preempted_enabled;
+DECLARE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
+
+static inline bool paravirt_vcpu_is_preempted(unsigned int cpu)
+{
+	return static_call(pv_vcpu_is_preempted)(cpu);
+}
+
+int __init pv_vcpu_state_init(void);
+
 #else
 
+#define pv_vcpu_state_init() do {} while (0)
+
 #define pv_time_init() do {} while (0)
 
 #endif // CONFIG_PARAVIRT
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 75fed4460407..d8fc46795d94 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -40,6 +40,11 @@  struct pv_time_stolen_time_region {
 
 static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
 
+static DEFINE_PER_CPU(struct vcpu_state, vcpus_states);
+struct static_key pv_vcpu_is_preempted_enabled;
+
+DEFINE_STATIC_CALL(pv_vcpu_is_preempted, dummy_vcpu_is_preempted);
+
 static bool steal_acc = true;
 static int __init parse_no_stealacc(char *arg)
 {
@@ -165,3 +170,92 @@  int __init pv_time_init(void)
 
 	return 0;
 }
+
+bool dummy_vcpu_is_preempted(unsigned int cpu)
+{
+	return false;
+}
+
+static bool __vcpu_is_preempted(unsigned int cpu)
+{
+	struct vcpu_state *st;
+
+	st = &per_cpu(vcpus_states, cpu);
+	return READ_ONCE(st->preempted);
+}
+
+static bool has_pv_vcpu_state(void)
+{
+	struct arm_smccc_res res;
+
+	/* To detect the presence of PV time support we require SMCCC 1.1+ */
+	if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
+		return false;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+			     ARM_SMCCC_HV_PV_VCPU_STATE_FEATURES,
+			     &res);
+
+	if (res.a0 != SMCCC_RET_SUCCESS)
+		return false;
+	return true;
+}
+
+static int __pv_vcpu_state_hook(unsigned int cpu, int event)
+{
+	struct arm_smccc_res res;
+	struct vcpu_state *st;
+
+	st = &per_cpu(vcpus_states, cpu);
+	arm_smccc_1_1_invoke(event, virt_to_phys(st), &res);
+	if (res.a0 != SMCCC_RET_SUCCESS)
+		return -EINVAL;
+	return 0;
+}
+
+static int vcpu_state_init(unsigned int cpu)
+{
+	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_INIT);
+
+	if (ret)
+		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_INIT\n");
+	return ret;
+}
+
+static int vcpu_state_release(unsigned int cpu)
+{
+	int ret = __pv_vcpu_state_hook(cpu, ARM_SMCCC_HV_PV_VCPU_STATE_RELEASE);
+
+	if (ret)
+		pr_warn("Unable to ARM_SMCCC_HV_PV_STATE_RELEASE\n");
+	return ret;
+}
+
+static int pv_vcpu_state_register_hooks(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				"hypervisor/arm/pvstate:starting",
+				vcpu_state_init,
+				vcpu_state_release);
+	if (ret < 0)
+		pr_warn("Failed to register CPU hooks\n");
+	return 0;
+}
+
+int __init pv_vcpu_state_init(void)
+{
+	int ret;
+
+	if (!has_pv_vcpu_state())
+		return 0;
+
+	ret = pv_vcpu_state_register_hooks();
+	if (ret)
+		return ret;
+
+	static_call_update(pv_vcpu_is_preempted, __vcpu_is_preempted);
+	static_key_slow_inc(&pv_vcpu_is_preempted_enabled);
+	return 0;
+}
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6f6ff072acbd..20d42e0f2a99 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -50,6 +50,7 @@ 
 #include <asm/tlbflush.h>
 #include <asm/ptrace.h>
 #include <asm/virt.h>
+#include <asm/paravirt.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ipi.h>
@@ -756,6 +757,9 @@  void __init smp_prepare_cpus(unsigned int max_cpus)
 	numa_store_cpu_info(this_cpu);
 	numa_add_cpu(this_cpu);
 
+	/* Init paravirt CPU state */
+	pv_vcpu_state_init();
+
 	/*
 	 * If UP is mandated by "nosmp" (which implies "maxcpus=0"), don't set
 	 * secondary CPUs present.