Message ID | 20230815183903.2735724-15-maz@kernel.org (mailing list archive)
---|---
State | New, archived
Series | KVM: arm64: NV trap forwarding infrastructure
A significant part of what an NV hypervisor needs to do is to decide
whether a trap from an L2+ guest has to be forwarded to an L1 guest
or handled locally. This is done by checking the trap bits that
the guest hypervisor has set and acting accordingly, as described by
the architecture.

A previous approach was to sprinkle a bunch of checks in all the
system register accessors, but this is pretty error-prone and doesn't
help with getting an overview of what is happening.

Instead, implement a set of global tables that describe a trap bit,
combinations of trap bits, behaviours on trap, and what bits must
be evaluated on a system register trap.

Although this is painful to describe, it allows each and every
control bit to be specified in a static manner. To make it efficient,
the table is inserted in an xarray that is global to the system,
and checked each time we trap a system register while running
an L2 guest.

Add the basic infrastructure for now; additional patches will
implement the configuration registers.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h   |   1 +
 arch/arm64/include/asm/kvm_nested.h |   2 +
 arch/arm64/kvm/emulate-nested.c     | 282 ++++++++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.c           |   6 +
 arch/arm64/kvm/trace_arm.h          |  26 +++
 5 files changed, 317 insertions(+)
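To make the mechanism concrete before the diff itself: here is a sketch of how these (deliberately empty) tables are meant to be filled in. The CGT_HCR_* identifiers and the choice of registers below are hypothetical and not part of this patch; the real entries only arrive with later patches in the series.

enum cgt_group_id {
	__RESERVED__,

	/* Hypothetical simple controls, one per trap bit */
	CGT_HCR_TID2,
	CGT_HCR_TID4,

	__MULTIPLE_CONTROL_BITS__,
	/* Hypothetical combination: either bit forwards the trap */
	CGT_HCR_TID2_TID4 = __MULTIPLE_CONTROL_BITS__,

	__COMPLEX_CONDITIONS__,
	__NR_CGT_GROUP_IDS__
};

static const struct trap_bits coarse_trap_bits[] = {
	[CGT_HCR_TID2] = {
		.index		= HCR_EL2,	/* vcpu_sysreg holding the control */
		.value		= HCR_TID2,	/* trap if TID2 is set... */
		.mask		= HCR_TID2,	/* ...looking at this bit only */
		.behaviour	= BEHAVE_FORWARD_ANY,
	},
	[CGT_HCR_TID4] = {
		.index		= HCR_EL2,
		.value		= HCR_TID4,
		.mask		= HCR_TID4,
		.behaviour	= BEHAVE_FORWARD_ANY,
	},
};

static const enum cgt_group_id *coarse_control_combo[] = {
	/* MCB() terminates the id list with __RESERVED__ for the walker */
	MCB(CGT_HCR_TID2_TID4, CGT_HCR_TID2, CGT_HCR_TID4),
};

static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
	/* Accesses to CCSIDR_EL1 trap if HCR_EL2.TID2 or TID4 is set */
	SR_TRAP(SYS_CCSIDR_EL1, CGT_HCR_TID2_TID4),
};

With entries like these in place, populate_nv_trap_config() maps the CCSIDR_EL1 encoding to CGT_HCR_TID2_TID4 in the xarray at init time, and __check_nv_sr_forward() evaluates both HCR_EL2 bits on every matching trap from L2.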
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 721680da1011..cb1c5c54cedd 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -988,6 +988,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);
 void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
 
 int __init kvm_sys_reg_table_init(void);
+int __init populate_nv_trap_config(void);
 
 bool lock_all_vcpus(struct kvm *kvm);
 void unlock_all_vcpus(struct kvm *kvm);
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 8fb67f032fd1..fa23cc9c2adc 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -11,6 +11,8 @@ static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu)
 		test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features));
 }
 
+extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu);
+
 struct sys_reg_params;
 struct sys_reg_desc;
 
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
index b96662029fb1..d5837ed0077c 100644
--- a/arch/arm64/kvm/emulate-nested.c
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -14,6 +14,288 @@
 
 #include "trace.h"
 
+enum trap_behaviour {
+	BEHAVE_HANDLE_LOCALLY	= 0,
+	BEHAVE_FORWARD_READ	= BIT(0),
+	BEHAVE_FORWARD_WRITE	= BIT(1),
+	BEHAVE_FORWARD_ANY	= BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,
+};
+
+struct trap_bits {
+	const enum vcpu_sysreg		index;
+	const enum trap_behaviour	behaviour;
+	const u64			value;
+	const u64			mask;
+};
+
+/* Coarse Grained Trap definitions */
+enum cgt_group_id {
+	/* Indicates no coarse trap control */
+	__RESERVED__,
+
+	/*
+	 * The first batch of IDs denote coarse trapping that are used
+	 * on their own instead of being part of a combination of
+	 * trap controls.
+	 */
+
+	/*
+	 * Anything after this point is a combination of coarse trap
+	 * controls, which must all be evaluated to decide what to do.
+	 */
+	__MULTIPLE_CONTROL_BITS__,
+
+	/*
+	 * Anything after this point requires a callback evaluating a
+	 * complex trap condition. Hopefully we'll never need this...
+	 */
+	__COMPLEX_CONDITIONS__,
+
+	/* Must be last */
+	__NR_CGT_GROUP_IDS__
+};
+
+static const struct trap_bits coarse_trap_bits[] = {
+};
+
+#define MCB(id, ...)						\
+	[id - __MULTIPLE_CONTROL_BITS__]	=		\
+		(const enum cgt_group_id[]){			\
+			__VA_ARGS__, __RESERVED__		\
+		}
+
+static const enum cgt_group_id *coarse_control_combo[] = {
+};
+
+typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);
+
+#define CCC(id, fn)				\
+	[id - __COMPLEX_CONDITIONS__] = fn
+
+static const complex_condition_check ccc[] = {
+};
+
+/*
+ * Bit assignment for the trap controls. We use a 64bit word with the
+ * following layout for each trapped sysreg:
+ *
+ * [9:0]	enum cgt_group_id (10 bits)
+ * [62:10]	Unused (53 bits)
+ * [63]		RES0 - Must be zero, as lost on insertion in the xarray
+ */
+#define TC_CGT_BITS	10
+
+union trap_config {
+	u64	val;
+	struct {
+		unsigned long	cgt:TC_CGT_BITS; /* Coarse Grained Trap id */
+		unsigned long	unused:53;	 /* Unused, should be zero */
+		unsigned long	mbz:1;		 /* Must Be Zero */
+	};
+};
+
+struct encoding_to_trap_config {
+	const u32			encoding;
+	const u32			end;
+	const union trap_config		tc;
+	const unsigned int		line;
+};
+
+#define SR_RANGE_TRAP(sr_start, sr_end, trap_id)			\
+	{								\
+		.encoding	= sr_start,				\
+		.end		= sr_end,				\
+		.tc		= {					\
+			.cgt		= trap_id,			\
+		},							\
+		.line = __LINE__,					\
+	}
+
+#define SR_TRAP(sr, trap_id)		SR_RANGE_TRAP(sr, sr, trap_id)
+
+/*
+ * Map encoding to trap bits for exception reported with EC=0x18.
+ * These must only be evaluated when running a nested hypervisor, but
+ * that the current context is not a hypervisor context. When the
+ * trapped access matches one of the trap controls, the exception is
+ * re-injected in the nested hypervisor.
+ */
+static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
+};
+
+static DEFINE_XARRAY(sr_forward_xa);
+
+static union trap_config get_trap_config(u32 sysreg)
+{
+	return (union trap_config) {
+		.val = xa_to_value(xa_load(&sr_forward_xa, sysreg)),
+	};
+}
+
+static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc,
+				       const char *type, int err)
+{
+	kvm_err("%s line %d encoding range "
+		"(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n",
+		type, tc->line,
+		sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding),
+		sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding),
+		sys_reg_Op2(tc->encoding),
+		sys_reg_Op0(tc->end), sys_reg_Op1(tc->end),
+		sys_reg_CRn(tc->end), sys_reg_CRm(tc->end),
+		sys_reg_Op2(tc->end),
+		err);
+}
+
+int __init populate_nv_trap_config(void)
+{
+	int ret = 0;
+
+	BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *));
+	BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));
+
+	for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
+		const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
+		void *prev;
+
+		if (cgt->tc.val & BIT(63)) {
+			kvm_err("CGT[%d] has MBZ bit set\n", i);
+			ret = -EINVAL;
+		}
+
+		if (cgt->encoding != cgt->end) {
+			prev = xa_store_range(&sr_forward_xa,
+					      cgt->encoding, cgt->end,
+					      xa_mk_value(cgt->tc.val),
+					      GFP_KERNEL);
+		} else {
+			prev = xa_store(&sr_forward_xa, cgt->encoding,
+					xa_mk_value(cgt->tc.val), GFP_KERNEL);
+			if (prev && !xa_is_err(prev)) {
+				ret = -EINVAL;
+				print_nv_trap_error(cgt, "Duplicate CGT", ret);
+			}
+		}
+
+		if (xa_is_err(prev)) {
+			ret = xa_err(prev);
+			print_nv_trap_error(cgt, "Failed CGT insertion", ret);
+		}
+	}
+
+	kvm_info("nv: %ld coarse grained trap handlers\n",
+		 ARRAY_SIZE(encoding_to_cgt));
+
+	for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) {
+		const enum cgt_group_id *cgids;
+
+		cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
+
+		for (int i = 0; cgids[i] != __RESERVED__; i++) {
+			if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) {
+				kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
+				ret = -EINVAL;
+			}
+		}
+	}
+
+	if (ret)
+		xa_destroy(&sr_forward_xa);
+
+	return ret;
+}
+
+static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu,
+					 const struct trap_bits *tb)
+{
+	enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;
+	u64 val;
+
+	val = __vcpu_sys_reg(vcpu, tb->index);
+	if ((val & tb->mask) == tb->value)
+		b |= tb->behaviour;
+
+	return b;
+}
+
+static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu,
+						    const enum cgt_group_id id,
+						    enum trap_behaviour b)
+{
+	switch (id) {
+		const enum cgt_group_id *cgids;
+
+	case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1:
+		if (likely(id != __RESERVED__))
+			b |= get_behaviour(vcpu, &coarse_trap_bits[id]);
+		break;
+	case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1:
+		/* Yes, this is recursive. Don't do anything stupid. */
+		cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
+		for (int i = 0; cgids[i] != __RESERVED__; i++)
+			b |= __compute_trap_behaviour(vcpu, cgids[i], b);
+		break;
+	default:
+		if (ARRAY_SIZE(ccc))
+			b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu);
+		break;
+	}
+
+	return b;
+}
+
+static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu,
+						  const union trap_config tc)
+{
+	enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY;
+
+	return __compute_trap_behaviour(vcpu, tc.cgt, b);
+}
+
+bool __check_nv_sr_forward(struct kvm_vcpu *vcpu)
+{
+	union trap_config tc;
+	enum trap_behaviour b;
+	bool is_read;
+	u32 sysreg;
+	u64 esr;
+
+	if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
+		return false;
+
+	esr = kvm_vcpu_get_esr(vcpu);
+	sysreg = esr_sys64_to_sysreg(esr);
+	is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+	tc = get_trap_config(sysreg);
+
+	/*
+	 * A value of 0 for the whole entry means that we know nothing
+	 * for this sysreg, and that it cannot be re-injected into the
+	 * nested hypervisor. In this situation, let's cut it short.
+	 *
+	 * Note that ultimately, we could also make use of the xarray
+	 * to store the index of the sysreg in the local descriptor
+	 * array, avoiding another search... Hint, hint...
+	 */
+	if (!tc.val)
+		return false;
+
+	b = compute_trap_behaviour(vcpu, tc);
+
+	if (((b & BEHAVE_FORWARD_READ) && is_read) ||
+	    ((b & BEHAVE_FORWARD_WRITE) && !is_read))
+		goto inject;
+
+	return false;
+
+inject:
+	trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read);
+
+	kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+	return true;
+}
+
 static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
 {
 	u64 mode = spsr & PSR_MODE_MASK;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index f5baaa508926..9556896311db 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -3177,6 +3177,9 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
 
 	trace_kvm_handle_sys_reg(esr);
 
+	if (__check_nv_sr_forward(vcpu))
+		return 1;
+
 	params = esr_sys64_to_params(esr);
 	params.regval = vcpu_get_reg(vcpu, Rt);
 
@@ -3594,5 +3597,8 @@ int __init kvm_sys_reg_table_init(void)
 	if (!first_idreg)
 		return -EINVAL;
 
+	if (kvm_get_mode() == KVM_MODE_NV)
+		return populate_nv_trap_config();
+
 	return 0;
 }
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 6ce5c025218d..8ad53104934d 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -364,6 +364,32 @@ TRACE_EVENT(kvm_inject_nested_exception,
 		  __entry->hcr_el2)
 );
 
+TRACE_EVENT(kvm_forward_sysreg_trap,
+	    TP_PROTO(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read),
+	    TP_ARGS(vcpu, sysreg, is_read),
+
+	    TP_STRUCT__entry(
+		__field(u64,	pc)
+		__field(u32,	sysreg)
+		__field(bool,	is_read)
+	    ),
+
+	    TP_fast_assign(
+		__entry->pc = *vcpu_pc(vcpu);
+		__entry->sysreg = sysreg;
+		__entry->is_read = is_read;
+	    ),
+
+	    TP_printk("%llx %c (%d,%d,%d,%d,%d)",
+		      __entry->pc,
+		      __entry->is_read ? 'R' : 'W',
+		      sys_reg_Op0(__entry->sysreg),
+		      sys_reg_Op1(__entry->sysreg),
+		      sys_reg_CRn(__entry->sysreg),
+		      sys_reg_CRm(__entry->sysreg),
+		      sys_reg_Op2(__entry->sysreg))
+);
+
 #endif /* _TRACE_ARM_ARM64_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
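A note on the "RES0" bit [63] of union trap_config and the BIT(63) check in populate_nv_trap_config(): the xarray does not store the 64-bit word directly, but as a tagged integer, which can only carry 63 bits of payload. A minimal sketch of the round-trip (the function below is hypothetical demonstration code, not part of the patch):

static void trap_config_roundtrip(u32 sysreg, union trap_config tc)
{
	void *entry;

	/*
	 * xa_mk_value() encodes its argument as (val << 1) | 1, so a
	 * set bit [63] cannot survive insertion - hence the "mbz"
	 * field and the sanity check at table-population time.
	 */
	WARN_ON(tc.mbz);

	entry = xa_mk_value(tc.val);
	xa_store(&sr_forward_xa, sysreg, entry, GFP_KERNEL);

	/* ...later, on an EC=0x18 trap from L2... */
	tc.val = xa_to_value(xa_load(&sr_forward_xa, sysreg));
}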
Hi Marc,

On Tue, Aug 15, 2023 at 11:47 AM Marc Zyngier <maz@kernel.org> wrote:
> [...]

Reviewed-by: Jing Zhang <jingzhangos@google.com>


Hi Marc,

> On 15 Aug 2023, at 18:38, Marc Zyngier <maz@kernel.org> wrote:
> [...]

Reviewed-by: Miguel Luis <miguel.luis@oracle.com>

Thanks
Miguel