diff mbox series

[v6,3/6] KVM: arm64: Enable writable for ID_AA64DFR0_EL1 and ID_DFR0_EL1

Message ID 20230718164522.3498236-4-jingzhangos@google.com (mailing list archive)
State New, archived
Headers show
Series Enable writable for idregs DFR0,PFR0, MMFR{0,1,2, 3} | expand

Commit Message

Jing Zhang July 18, 2023, 4:45 p.m. UTC
All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
from usrespace with this change.

Signed-off-by: Jing Zhang <jingzhangos@google.com>
---
 arch/arm64/kvm/sys_regs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Comments

Cornelia Huck July 20, 2023, 8:52 a.m. UTC | #1
On Tue, Jul 18 2023, Jing Zhang <jingzhangos@google.com> wrote:

> All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
> from usrespace with this change.

Typo: s/usrespace/userspace/

>
> Signed-off-by: Jing Zhang <jingzhangos@google.com>
> ---
>  arch/arm64/kvm/sys_regs.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 053d8057ff1e..f33aec83f1b4 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -2008,7 +2008,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	  .set_user = set_id_dfr0_el1,
>  	  .visibility = aa32_id_visibility,
>  	  .reset = read_sanitised_id_dfr0_el1,
> -	  .val = ID_DFR0_EL1_PerfMon_MASK, },
> +	  .val = GENMASK(63, 0), },
>  	ID_HIDDEN(ID_AFR0_EL1),
>  	AA32_ID_SANITISED(ID_MMFR0_EL1),
>  	AA32_ID_SANITISED(ID_MMFR1_EL1),
> @@ -2057,7 +2057,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>  	  .get_user = get_id_reg,
>  	  .set_user = set_id_aa64dfr0_el1,
>  	  .reset = read_sanitised_id_aa64dfr0_el1,
> -	  .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
> +	  .val = GENMASK(63, 0), },
>  	ID_SANITISED(ID_AA64DFR1_EL1),
>  	ID_UNALLOCATED(5,2),
>  	ID_UNALLOCATED(5,3),

How does userspace find out whether a given id reg is actually writable,
other than trying to write to it?
Jing Zhang July 20, 2023, 4:39 p.m. UTC | #2
Hi Cornelia,

On Thu, Jul 20, 2023 at 1:52 AM Cornelia Huck <cohuck@redhat.com> wrote:
>
> On Tue, Jul 18 2023, Jing Zhang <jingzhangos@google.com> wrote:
>
> > All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
> > from usrespace with this change.
>
> Typo: s/usrespace/userspace/
Thanks.
>
> >
> > Signed-off-by: Jing Zhang <jingzhangos@google.com>
> > ---
> >  arch/arm64/kvm/sys_regs.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> > index 053d8057ff1e..f33aec83f1b4 100644
> > --- a/arch/arm64/kvm/sys_regs.c
> > +++ b/arch/arm64/kvm/sys_regs.c
> > @@ -2008,7 +2008,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> >         .set_user = set_id_dfr0_el1,
> >         .visibility = aa32_id_visibility,
> >         .reset = read_sanitised_id_dfr0_el1,
> > -       .val = ID_DFR0_EL1_PerfMon_MASK, },
> > +       .val = GENMASK(63, 0), },
> >       ID_HIDDEN(ID_AFR0_EL1),
> >       AA32_ID_SANITISED(ID_MMFR0_EL1),
> >       AA32_ID_SANITISED(ID_MMFR1_EL1),
> > @@ -2057,7 +2057,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> >         .get_user = get_id_reg,
> >         .set_user = set_id_aa64dfr0_el1,
> >         .reset = read_sanitised_id_aa64dfr0_el1,
> > -       .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
> > +       .val = GENMASK(63, 0), },
> >       ID_SANITISED(ID_AA64DFR1_EL1),
> >       ID_UNALLOCATED(5,2),
> >       ID_UNALLOCATED(5,3),
>
> How does userspace find out whether a given id reg is actually writable,
> other than trying to write to it?
>
No mechanism is provided for userspace to discover whether a given idreg,
or any field of a given idreg, is writable. A write to a read-only idreg
can also succeed (write ignored) without any error if the value written
is exactly the same as what the idreg holds, or if it is a write to an
AArch32 idreg on an AArch64-only system.
I'm not sure it is worth adding an API to return the writable masks for
idregs, since we want to make all allocated, unhidden idregs writable
eventually.

Thanks,
Jing
Cornelia Huck July 21, 2023, 8:38 a.m. UTC | #3
On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:

> Hi Cornelia,
>
> On Thu, Jul 20, 2023 at 1:52 AM Cornelia Huck <cohuck@redhat.com> wrote:
>>
>> On Tue, Jul 18 2023, Jing Zhang <jingzhangos@google.com> wrote:
>>
>> > All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
>> > from usrespace with this change.
>>
>> Typo: s/usrespace/userspace/
> Thanks.
>>
>> >
>> > Signed-off-by: Jing Zhang <jingzhangos@google.com>
>> > ---
>> >  arch/arm64/kvm/sys_regs.c | 4 ++--
>> >  1 file changed, 2 insertions(+), 2 deletions(-)
>> >
>> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
>> > index 053d8057ff1e..f33aec83f1b4 100644
>> > --- a/arch/arm64/kvm/sys_regs.c
>> > +++ b/arch/arm64/kvm/sys_regs.c
>> > @@ -2008,7 +2008,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>> >         .set_user = set_id_dfr0_el1,
>> >         .visibility = aa32_id_visibility,
>> >         .reset = read_sanitised_id_dfr0_el1,
>> > -       .val = ID_DFR0_EL1_PerfMon_MASK, },
>> > +       .val = GENMASK(63, 0), },
>> >       ID_HIDDEN(ID_AFR0_EL1),
>> >       AA32_ID_SANITISED(ID_MMFR0_EL1),
>> >       AA32_ID_SANITISED(ID_MMFR1_EL1),
>> > @@ -2057,7 +2057,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
>> >         .get_user = get_id_reg,
>> >         .set_user = set_id_aa64dfr0_el1,
>> >         .reset = read_sanitised_id_aa64dfr0_el1,
>> > -       .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
>> > +       .val = GENMASK(63, 0), },
>> >       ID_SANITISED(ID_AA64DFR1_EL1),
>> >       ID_UNALLOCATED(5,2),
>> >       ID_UNALLOCATED(5,3),
>>
>> How does userspace find out whether a given id reg is actually writable,
>> other than trying to write to it?
>>
> No mechanism was provided to userspace to discover if a given idreg or
> any fields of a given idreg is writable. The write to a readonly idreg
> can also succeed (write ignored) without any error if what's written
> is exactly the same as what the idreg holds or if it is a write to
> AArch32 idregs on an AArch64-only system.

Hm, I'm not sure that's a good thing for the cases where we want to
support mix-and-match userspace and kernels. Userspace may want to know
upfront whether it can actually tweak the contents of an idreg or not
(for example, in the context of using CPU models for compatibility), so
that it can reject or warn about certain configurations that may not
turn out as the user expects.

> Not sure if it is worth adding an API to return the writable mask for
> idregs, since we want to enable the writable for all allocated
> unhidden idregs eventually.

We'd enable any new idregs for writing from the start in the future, I
guess?

I see two approaches here:
- add an API to get a list of idregs with their writable masks
- add a capability "you can write to all idregs whatever you'd expect to
  be able to write there architecture wise", which would require to add
  support for all idregs prior to exposing that cap

The second option would be the easier one (if we don't manage to break
it in the future :)
Marc Zyngier July 21, 2023, 9:31 a.m. UTC | #4
On Fri, 21 Jul 2023 09:38:23 +0100,
Cornelia Huck <cohuck@redhat.com> wrote:
> 
> On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:
> 
> > Hi Cornelia,
> >
> > On Thu, Jul 20, 2023 at 1:52 AM Cornelia Huck <cohuck@redhat.com> wrote:
> >>
> >> On Tue, Jul 18 2023, Jing Zhang <jingzhangos@google.com> wrote:
> >>
> >> > All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
> >> > from usrespace with this change.
> >>
> >> Typo: s/usrespace/userspace/
> > Thanks.
> >>
> >> >
> >> > Signed-off-by: Jing Zhang <jingzhangos@google.com>
> >> > ---
> >> >  arch/arm64/kvm/sys_regs.c | 4 ++--
> >> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >> >
> >> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> >> > index 053d8057ff1e..f33aec83f1b4 100644
> >> > --- a/arch/arm64/kvm/sys_regs.c
> >> > +++ b/arch/arm64/kvm/sys_regs.c
> >> > @@ -2008,7 +2008,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> >> >         .set_user = set_id_dfr0_el1,
> >> >         .visibility = aa32_id_visibility,
> >> >         .reset = read_sanitised_id_dfr0_el1,
> >> > -       .val = ID_DFR0_EL1_PerfMon_MASK, },
> >> > +       .val = GENMASK(63, 0), },
> >> >       ID_HIDDEN(ID_AFR0_EL1),
> >> >       AA32_ID_SANITISED(ID_MMFR0_EL1),
> >> >       AA32_ID_SANITISED(ID_MMFR1_EL1),
> >> > @@ -2057,7 +2057,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> >> >         .get_user = get_id_reg,
> >> >         .set_user = set_id_aa64dfr0_el1,
> >> >         .reset = read_sanitised_id_aa64dfr0_el1,
> >> > -       .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
> >> > +       .val = GENMASK(63, 0), },
> >> >       ID_SANITISED(ID_AA64DFR1_EL1),
> >> >       ID_UNALLOCATED(5,2),
> >> >       ID_UNALLOCATED(5,3),
> >>
> >> How does userspace find out whether a given id reg is actually writable,
> >> other than trying to write to it?
> >>
> > No mechanism was provided to userspace to discover if a given idreg or
> > any fields of a given idreg is writable. The write to a readonly idreg
> > can also succeed (write ignored) without any error if what's written
> > is exactly the same as what the idreg holds or if it is a write to
> > AArch32 idregs on an AArch64-only system.
> 
> Hm, I'm not sure that's a good thing for the cases where we want to
> support mix-and-match userspace and kernels. Userspace may want to know
> upfront whether it can actually tweak the contents of an idreg or not
> (for example, in the context of using CPU models for compatibility), so
> that it can reject or warn about certain configurations that may not
> turn out as the user expects.
> 
> > Not sure if it is worth adding an API to return the writable mask for
> > idregs, since we want to enable the writable for all allocated
> > unhidden idregs eventually.
> 
> We'd enable any new idregs for writing from the start in the future, I
> guess?
> 
> I see two approaches here:
> - add an API to get a list of idregs with their writable masks
> - add a capability "you can write to all idregs whatever you'd expect to
>   be able to write there architecture wise", which would require to add
>   support for all idregs prior to exposing that cap
> 
> The second option would be the easier one (if we don't manage to break
> it in the future :)

I'm not sure the last option is even possible. The architecture keeps
allocating new ID registers in the op0==3, op1=={0, 1, 3}, CRn==0,
CRm=={0-7}, op2=={0-7} space, so fields that were RES0 until then
start having a non-0 value.

This could lead to a situation where you move from a system that
didn't know about ID_AA64MMFR6_EL1.XYZ to a system that advertises it,
and for which the XYZ instruction has another behaviour. Bad things
follow.

My preference would be a single ioctl that returns the full list of
writeable masks in the ID reg range. It is big, but not crazy big
(1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.

It would allow the VMM to actively write zeroes to any writable ID
register it doesn't know about, or for which it doesn't have anything
to restore. It is also relatively future proof, as it covers
*everything* the architecture has provisioned for the future (by the
time that space is exhausted, I hope none of us will still be involved
with this crap).

Thanks,

	M.
Cornelia Huck July 21, 2023, 9:48 a.m. UTC | #5
On Fri, Jul 21 2023, Marc Zyngier <maz@kernel.org> wrote:

> On Fri, 21 Jul 2023 09:38:23 +0100,
> Cornelia Huck <cohuck@redhat.com> wrote:
>> 
>> On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:
>> > No mechanism was provided to userspace to discover if a given idreg or
>> > any fields of a given idreg is writable. The write to a readonly idreg
>> > can also succeed (write ignored) without any error if what's written
>> > is exactly the same as what the idreg holds or if it is a write to
>> > AArch32 idregs on an AArch64-only system.
>> 
>> Hm, I'm not sure that's a good thing for the cases where we want to
>> support mix-and-match userspace and kernels. Userspace may want to know
>> upfront whether it can actually tweak the contents of an idreg or not
>> (for example, in the context of using CPU models for compatibility), so
>> that it can reject or warn about certain configurations that may not
>> turn out as the user expects.
>> 
>> > Not sure if it is worth adding an API to return the writable mask for
>> > idregs, since we want to enable the writable for all allocated
>> > unhidden idregs eventually.
>> 
>> We'd enable any new idregs for writing from the start in the future, I
>> guess?
>> 
>> I see two approaches here:
>> - add an API to get a list of idregs with their writable masks
>> - add a capability "you can write to all idregs whatever you'd expect to
>>   be able to write there architecture wise", which would require to add
>>   support for all idregs prior to exposing that cap
>> 
>> The second option would be the easier one (if we don't manage to break
>> it in the future :)
>
> I'm not sure the last option is even possible. The architecture keeps
> allocating new ID registers in the op0==3, op1=={0, 1, 3}, CRn==0,
> CRm=={0-7}, op2=={0-7} space, so fields that were RES0 until then
> start having a non-0 value.
>
> This could lead to a situation where you move from a system that
> didn't know about ID_AA64MMFR6_EL1.XYZ to a system that advertises it,
> and for which the XYZ instruction has another behaviour. Bad things
> follow.

Hrm :(

>
> My preference would be a single ioctl that returns the full list of
> writeable masks in the ID reg range. It is big, but not crazy big
> (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
> sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
>
> It would allow the VMM to actively write zeroes to any writable ID
> register it doesn't know about, or for which it doesn't have anything
> to restore. It is also relatively future proof, as it covers
> *everything* the architecture has provisioned for the future (by the
> time that space is exhausted, I hope none of us will still be involved
> with this crap).

Famous last words :)

But yes, that should work. This wouldn't be the first ioctl returning a
long list, and the VMM would just call it once on VM startup to figure
things out anyway.
Jing Zhang July 21, 2023, 6:22 p.m. UTC | #6
Hi Marc,

On Fri, Jul 21, 2023 at 2:31 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Fri, 21 Jul 2023 09:38:23 +0100,
> Cornelia Huck <cohuck@redhat.com> wrote:
> >
> > On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:
> >
> > > Hi Cornelia,
> > >
> > > On Thu, Jul 20, 2023 at 1:52 AM Cornelia Huck <cohuck@redhat.com> wrote:
> > >>
> > >> On Tue, Jul 18 2023, Jing Zhang <jingzhangos@google.com> wrote:
> > >>
> > >> > All valid fields in ID_AA64DFR0_EL1 and ID_DFR0_EL1 are writable
> > >> > from usrespace with this change.
> > >>
> > >> Typo: s/usrespace/userspace/
> > > Thanks.
> > >>
> > >> >
> > >> > Signed-off-by: Jing Zhang <jingzhangos@google.com>
> > >> > ---
> > >> >  arch/arm64/kvm/sys_regs.c | 4 ++--
> > >> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > >> >
> > >> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> > >> > index 053d8057ff1e..f33aec83f1b4 100644
> > >> > --- a/arch/arm64/kvm/sys_regs.c
> > >> > +++ b/arch/arm64/kvm/sys_regs.c
> > >> > @@ -2008,7 +2008,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> > >> >         .set_user = set_id_dfr0_el1,
> > >> >         .visibility = aa32_id_visibility,
> > >> >         .reset = read_sanitised_id_dfr0_el1,
> > >> > -       .val = ID_DFR0_EL1_PerfMon_MASK, },
> > >> > +       .val = GENMASK(63, 0), },
> > >> >       ID_HIDDEN(ID_AFR0_EL1),
> > >> >       AA32_ID_SANITISED(ID_MMFR0_EL1),
> > >> >       AA32_ID_SANITISED(ID_MMFR1_EL1),
> > >> > @@ -2057,7 +2057,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
> > >> >         .get_user = get_id_reg,
> > >> >         .set_user = set_id_aa64dfr0_el1,
> > >> >         .reset = read_sanitised_id_aa64dfr0_el1,
> > >> > -       .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
> > >> > +       .val = GENMASK(63, 0), },
> > >> >       ID_SANITISED(ID_AA64DFR1_EL1),
> > >> >       ID_UNALLOCATED(5,2),
> > >> >       ID_UNALLOCATED(5,3),
> > >>
> > >> How does userspace find out whether a given id reg is actually writable,
> > >> other than trying to write to it?
> > >>
> > > No mechanism was provided to userspace to discover if a given idreg or
> > > any fields of a given idreg is writable. The write to a readonly idreg
> > > can also succeed (write ignored) without any error if what's written
> > > is exactly the same as what the idreg holds or if it is a write to
> > > AArch32 idregs on an AArch64-only system.
> >
> > Hm, I'm not sure that's a good thing for the cases where we want to
> > support mix-and-match userspace and kernels. Userspace may want to know
> > upfront whether it can actually tweak the contents of an idreg or not
> > (for example, in the context of using CPU models for compatibility), so
> > that it can reject or warn about certain configurations that may not
> > turn out as the user expects.
> >
> > > Not sure if it is worth adding an API to return the writable mask for
> > > idregs, since we want to enable the writable for all allocated
> > > unhidden idregs eventually.
> >
> > We'd enable any new idregs for writing from the start in the future, I
> > guess?
> >
> > I see two approaches here:
> > - add an API to get a list of idregs with their writable masks
> > - add a capability "you can write to all idregs whatever you'd expect to
> >   be able to write there architecture wise", which would require to add
> >   support for all idregs prior to exposing that cap
> >
> > The second option would be the easier one (if we don't manage to break
> > it in the future :)
>
> I'm not sure the last option is even possible. The architecture keeps
> allocating new ID registers in the op0==3, op1=={0, 1, 3}, CRn==0,
> CRm=={0-7}, op2=={0-7} space, so fields that were RES0 until then
> start having a non-0 value.
For now, the per-VM emulated ID register support only covers the space
op0==3, op1==0, CRn==0, CRm=={1-7}, op2=={0-7}. For the others, a mask
value of 0 would be returned by the new ioctl.
>
> This could lead to a situation where you move from a system that
> didn't know about ID_AA64MMFR6_EL1.XYZ to a system that advertises it,
> and for which the XYZ instruction has another behaviour. Bad things
> follow.
>
> My preference would be a single ioctl that returns the full list of
> writeable masks in the ID reg range. It is big, but not crazy big
> (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
> sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
Just want to double-check: would the ioctl return only the list of
writable masks, not a list of {idreg_name, mask} pairs? So the VMM
would need to index an idreg's writable mask by op1, CRm, op2?
>
> It would allow the VMM to actively write zeroes to any writable ID
> register it doesn't know about, or for which it doesn't have anything
> to restore. It is also relatively future proof, as it covers
> *everything* the architecture has provisioned for the future (by the
> time that space is exhausted, I hope none of us will still be involved
> with this crap).
>
> Thanks,
>
>         M.
>
> --
> Without deviation from the norm, progress is not possible.
>
Thanks,
Jing
Oliver Upton July 21, 2023, 9:10 p.m. UTC | #7
On Fri, Jul 21, 2023 at 11:22:35AM -0700, Jing Zhang wrote:
> On Fri, Jul 21, 2023 at 2:31 AM Marc Zyngier <maz@kernel.org> wrote:
> > My preference would be a single ioctl that returns the full list of
> > writeable masks in the ID reg range. It is big, but not crazy big
> > (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
> > sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
> Just want to double confirm that would the ioclt return the list of
> only writable masks, not the list of {idreg_name, mask} pair? So, the
> VMM will need to index idreg's writable mask by op1, CRm, op2?

I generally agree with the approach Marc is proposing, but I wonder if
it makes sense to have userspace ask the kernel for this information on
a per-register basis.

What I had in mind was something similar to the KVM_GET_ONE_REG ioctl,
but instead of returning the register value it'd return the mask of the
register. This would keep the kernel implementation dead simple (I'm
lazy) and more easily allow for future expansion in case we want to
start describing more registers this way. Userspace would iterate the ID
register space and ask the kernel for the mask of registers it wants to
change.

Thoughts?
Cornelia Huck July 24, 2023, 8:45 a.m. UTC | #8
On Fri, Jul 21 2023, Oliver Upton <oliver.upton@linux.dev> wrote:

> On Fri, Jul 21, 2023 at 11:22:35AM -0700, Jing Zhang wrote:
>> On Fri, Jul 21, 2023 at 2:31 AM Marc Zyngier <maz@kernel.org> wrote:
>> > My preference would be a single ioctl that returns the full list of
>> > writeable masks in the ID reg range. It is big, but not crazy big
>> > (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
>> > sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
>> Just want to double confirm that would the ioclt return the list of
>> only writable masks, not the list of {idreg_name, mask} pair? So, the
>> VMM will need to index idreg's writable mask by op1, CRm, op2?
>
> I generally agree with the approach Marc is proposing, but I wonder if
> it makes sense to have userspace ask the kernel for this information on
> a per-register basis.
>
> What I had in mind was something similar to the KVM_GET_ONE_REG ioctl,
> but instead of returning the register value it'd return the mask of the
> register. This would keep the kernel implementation dead simple (I'm
> lazy) and more easily allow for future expansion in case we want to
> start describing more registers this way. Userspace would iterate the ID
> register space and ask the kernel for the mask of registers it wants to
> change.

Hm... for userspace it might be easier to get one big list and then
parse it afterwards? Similar to what GET_REG_LIST does today.

Are you thinking more of a KVM_GET_REG_INFO or so ioctl, that could
support different kinds of extra info (and might also make sense for
other architectures?) If we end up with something more versatile, it
might make sense going that route.
Oliver Upton July 26, 2023, 5:24 p.m. UTC | #9
Hi Cornelia,

On Mon, Jul 24, 2023 at 10:45:44AM +0200, Cornelia Huck wrote:
> On Fri, Jul 21 2023, Oliver Upton <oliver.upton@linux.dev> wrote:
> > What I had in mind was something similar to the KVM_GET_ONE_REG ioctl,
> > but instead of returning the register value it'd return the mask of the
> > register. This would keep the kernel implementation dead simple (I'm
> > lazy) and more easily allow for future expansion in case we want to
> > start describing more registers this way. Userspace would iterate the ID
> > register space and ask the kernel for the mask of registers it wants to
> > change.
> 
> Hm... for userspace it might be easier to get one big list and then
> parse it afterwards? Similar to what GET_REG_LIST does today.

Possibly, but I felt like it was a bit different from GET_REG_LIST since
this would actually be a list of key-value pairs (reg_id, mask) instead
of a pure enumeration of IDs. My worry is that if/when we wind up describing
more registers in this list-based ioctl then userspace is going to wind
up traversing that structure a lot to find the register masks it actually
cares about.

> Are you thinking more of a KVM_GET_REG_INFO or so ioctl, that could
> support different kinds of extra info (and might also make sense for
> other architectures?) If we end up with something more versatile, it
> might make sense going that route.

TBH, I hadn't considered the extensibility of a per-register ioctl, but
that does seem like a good point.

--
Thanks,
Oliver
Cornelia Huck July 27, 2023, 9:34 a.m. UTC | #10
On Wed, Jul 26 2023, Oliver Upton <oliver.upton@linux.dev> wrote:

> Hi Cornelia,
>
> On Mon, Jul 24, 2023 at 10:45:44AM +0200, Cornelia Huck wrote:
>> On Fri, Jul 21 2023, Oliver Upton <oliver.upton@linux.dev> wrote:
>> > What I had in mind was something similar to the KVM_GET_ONE_REG ioctl,
>> > but instead of returning the register value it'd return the mask of the
>> > register. This would keep the kernel implementation dead simple (I'm
>> > lazy) and more easily allow for future expansion in case we want to
>> > start describing more registers this way. Userspace would iterate the ID
>> > register space and ask the kernel for the mask of registers it wants to
>> > change.
>> 
>> Hm... for userspace it might be easier to get one big list and then
>> parse it afterwards? Similar to what GET_REG_LIST does today.
>
> Possibly, but I felt like it was a bit different from GET_REG_LIST since
> this would actually be a list of key-value pairs (reg_id, mask) instead
> of a pure enumeration of IDs. My worry is that if/when we wind up describing
> more registers in this list-based ioctl then userspace is going to wind
> up traversing that structure a lot to find the register masks it actually
> cares about.

Depends on how userspace actually digests it, but point taken.

>
>> Are you thinking more of a KVM_GET_REG_INFO or so ioctl, that could
>> support different kinds of extra info (and might also make sense for
>> other architectures?) If we end up with something more versatile, it
>> might make sense going that route.
>
> TBH, I hadn't considered the extensibililty of a per-register ioctl, but
> that does seem like a good point.

Maybe something like

/* available with KVM_CAP_GET_REG_INFO */
struct kvm_reg_info {
	__u64 id;	/* register being queried */
	__u32 op;	/* kind of info requested, one of KVM_REG_INFO_* */
	__u32 len;	/* presumably the size of data[] in bytes -- TODO confirm */
	__u8 data[];	/* returned info; sys_reg_desc->val for KVM_REG_INFO_ARM_ID_REG */
};

/* operations for kvm_reg_info->op */
#define KVM_REG_INFO_ARM_ID_REG 0

#define KVM_GET_REG_INFO _IOW(KVMIO, 0xd2, struct kvm_reg_info)

and returning sys_reg_desc->val in data if id points to a valid id reg.
Marc Zyngier July 29, 2023, 10:36 a.m. UTC | #11
On Fri, 21 Jul 2023 10:48:27 +0100,
Cornelia Huck <cohuck@redhat.com> wrote:
> 
> On Fri, Jul 21 2023, Marc Zyngier <maz@kernel.org> wrote:
> 
> > On Fri, 21 Jul 2023 09:38:23 +0100,
> > Cornelia Huck <cohuck@redhat.com> wrote:
> >> 
> >> On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:
> >> > No mechanism was provided to userspace to discover if a given idreg or
> >> > any fields of a given idreg is writable. The write to a readonly idreg
> >> > can also succeed (write ignored) without any error if what's written
> >> > is exactly the same as what the idreg holds or if it is a write to
> >> > AArch32 idregs on an AArch64-only system.
> >> 
> >> Hm, I'm not sure that's a good thing for the cases where we want to
> >> support mix-and-match userspace and kernels. Userspace may want to know
> >> upfront whether it can actually tweak the contents of an idreg or not
> >> (for example, in the context of using CPU models for compatibility), so
> >> that it can reject or warn about certain configurations that may not
> >> turn out as the user expects.
> >> 
> >> > Not sure if it is worth adding an API to return the writable mask for
> >> > idregs, since we want to enable the writable for all allocated
> >> > unhidden idregs eventually.
> >> 
> >> We'd enable any new idregs for writing from the start in the future, I
> >> guess?
> >> 
> >> I see two approaches here:
> >> - add an API to get a list of idregs with their writable masks
> >> - add a capability "you can write to all idregs whatever you'd expect to
> >>   be able to write there architecture wise", which would require to add
> >>   support for all idregs prior to exposing that cap
> >> 
> >> The second option would be the easier one (if we don't manage to break
> >> it in the future :)
> >
> > I'm not sure the last option is even possible. The architecture keeps
> > allocating new ID registers in the op0==3, op1=={0, 1, 3}, CRn==0,
> > CRm=={0-7}, op2=={0-7} space, so fields that were RES0 until then
> > start having a non-0 value.
> >
> > This could lead to a situation where you move from a system that
> > didn't know about ID_AA64MMFR6_EL1.XYZ to a system that advertises it,
> > and for which the XYZ instruction has another behaviour. Bad things
> > follow.
> 
> Hrm :(
> 
> >
> > My preference would be a single ioctl that returns the full list of
> > writeable masks in the ID reg range. It is big, but not crazy big
> > (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
> > sysreg such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
> >
> > It would allow the VMM to actively write zeroes to any writable ID
> > register it doesn't know about, or for which it doesn't have anything
> > to restore. It is also relatively future proof, as it covers
> > *everything* the architecture has provisioned for the future (by the
> > time that space is exhausted, I hope none of us will still be involved
> > with this crap).
> 
> Famous last words :)
> 
> But yes, that should work. This wouldn't be the first ioctl returning a
> long list, and the VMM would just call it once on VM startup to figure
> things out anyway.

To be clear, see below for what I had in mind. It is of course
untested, and is probably overlooking a number of details, but you'll
hopefully get my drift. I think this has some benefit over the
per-sysreg ioctl, as it covers everything in one go, and is guaranteed
to be exhaustive (until the architecture grows another range of ID
crap).

Note that we don't necessarily need to restrict ourselves to a single
range either. We could also return some other ranges depending on
additional parameters (Oliver mentioned offline the case of the
PCMEIDx_EL0 registers).

Thanks,

	M.

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 2ca2973abe66..fa79f3651423 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -3589,3 +3589,91 @@ int __init kvm_sys_reg_table_init(void)
 
 	return 0;
 }
+
+/*
+ * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model
+ * Feature Register 2"):
+ *
+ * "The Feature ID space is defined as the System register space in
+ * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7},
+ * op2=={0-7}."
+ *
+ * This covers all R/O registers that indicate anything useful feature
+ * wise, including the ID registers.
+ */
+
+/* Userspace-visible definitions */
+#define ARM64_FEATURE_ID_SPACE_SIZE	(3 * 8 * 8)
+#define __ARM64_FEATURE_ID_SPACE_IDX(op0, op1, crn, crm, op2)		\
+	({								\
+		__u64 __op1 = op1 & 3;					\
+		__op1 -= (__op1 == 3);					\
+		((ARM64_SYS_REG_SHIFT_MASK(3, OP0) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(__op1, OP1) |		\
+		  ARM64_SYS_REG_SHIFT_MASK(0, CRN) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(crm & 7, CRM) |		\
+		  ARM64_SYS_REG_SHIFT_MASK(op2, OP2)) -			\
+		 (ARM64_SYS_REG_SHIFT_MASK(3, OP0) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(0, OP1) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(0, CRN) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(0, CRM) |			\
+		  ARM64_SYS_REG_SHIFT_MASK(0, OP2)));			\
+	})
+
+#define ARM64_FEATURE_ID_SPACE_INDEX(r)					\
+	__ARM64_FEATURE_ID_SPACE_IDX(sys_reg_Op0(r),			\
+				     sys_reg_Op1(r),			\
+				     sys_reg_CRn(r),			\
+				     sys_reg_CRm(r),			\
+				     sys_reg_Op2(r))
+
+struct feature_id_writeable_masks {
+	u64	mask[ARM64_FEATURE_ID_SPACE_SIZE];
+};
+
+static bool is_feature_id_reg(u32 encoding)
+{
+	return (sys_reg_Op0(encoding) == 3 &&
+		(sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) &&
+		sys_reg_CRn(encoding) == 0 &&
+		sys_reg_CRm(encoding) <= 7);
+}
+
+int kvm_get_writeable_feature_regs(struct kvm *kvm, u64 __user *masks)
+{
+	/* Wipe the whole thing first */
+	for (int i = 0; i < ARM64_FEATURE_ID_SPACE_SIZE; i++)
+		if (put_user(0, masks + i))
+			return -EFAULT;
+
+	for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+		const struct sys_reg_desc *reg = &sys_reg_descs[i];
+		u32 encoding = reg_to_encoding(reg);
+		u64 val;
+
+		if (!is_feature_id_reg(encoding) || !reg->set_user)
+			continue;
+
+		/*
+		 * For ID registers, we return the writable mask.
+		 * Other feature registers return a full 64bit mask.
+		 * That's not necessarily compliant with a given
+		 * revision of the architecture, but the RES0/RES1
+		 * definitions allow us to do that
+		 */
+		if (is_id_reg(encoding)) {
+			if (!reg->val)
+				continue;
+
+			val = reg->val;
+		} else {
+			val = ~0UL;
+		}
+
+		if (put_user(val,
+			     (masks + ARM64_FEATURE_ID_SPACE_INDEX(encoding))))
+			return -EFAULT;
+	}
+
+	return 0;
+}
Jing Zhang July 31, 2023, 8:51 p.m. UTC | #12
On Sat, Jul 29, 2023 at 3:36 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Fri, 21 Jul 2023 10:48:27 +0100,
> Cornelia Huck <cohuck@redhat.com> wrote:
> >
> > On Fri, Jul 21 2023, Marc Zyngier <maz@kernel.org> wrote:
> >
> > > On Fri, 21 Jul 2023 09:38:23 +0100,
> > > Cornelia Huck <cohuck@redhat.com> wrote:
> > >>
> > >> On Thu, Jul 20 2023, Jing Zhang <jingzhangos@google.com> wrote:
> > >> > No mechanism was provided to userspace to discover if a given idreg or
> > >> > any field of a given idreg is writable. The write to a readonly idreg
> > >> > can also succeed (write ignored) without any error if what's written
> > >> > is exactly the same as what the idreg holds or if it is a write to
> > >> > AArch32 idregs on an AArch64-only system.
> > >>
> > >> Hm, I'm not sure that's a good thing for the cases where we want to
> > >> support mix-and-match userspace and kernels. Userspace may want to know
> > >> upfront whether it can actually tweak the contents of an idreg or not
> > >> (for example, in the context of using CPU models for compatibility), so
> > >> that it can reject or warn about certain configurations that may not
> > >> turn out as the user expects.
> > >>
> > >> > Not sure if it is worth adding an API to return the writable mask for
> > >> > idregs, since we want to enable the writable for all allocated
> > >> > unhidden idregs eventually.
> > >>
> > >> We'd enable any new idregs for writing from the start in the future, I
> > >> guess?
> > >>
> > >> I see two approaches here:
> > >> - add an API to get a list of idregs with their writable masks
> > >> - add a capability "you can write to all idregs whatever you'd expect to
> > >>   be able to write there architecture wise", which would require adding
> > >>   support for all idregs prior to exposing that cap
> > >>
> > >> The second option would be the easier one (if we don't manage to break
> > >> it in the future :)
> > >
> > > I'm not sure the last option is even possible. The architecture keeps
> > > allocating new ID registers in the op0==3, op1=={0, 1, 3}, CRn==0,
> > > CRm=={0-7}, op2=={0-7} space, so fields that were RES0 until then
> > > start having a non-0 value.
> > >
> > > This could lead to a situation where you move from a system that
> > > didn't know about ID_AA64MMFR6_EL1.XYZ to a system that advertises it,
> > > and for which the XYZ instruction has another behaviour. Bad things
> > > follow.
> >
> > Hrm :(
> >
> > >
> > > My preference would be a single ioctl that returns the full list of
> > > writeable masks in the ID reg range. It is big, but not crazy big
> > > (1536 bytes, if I haven't messed up), and includes the non ID_*_EL1
> > > > sysregs such as MPIDR_EL1, CTR_EL1, SMIDR_EL1.
> > >
> > > It would allow the VMM to actively write zeroes to any writable ID
> > > register it doesn't know about, or for which it doesn't have anything
> > > to restore. It is also relatively future proof, as it covers
> > > *everything* the architecture has provisioned for the future (by the
> > > time that space is exhausted, I hope none of us will still be involved
> > > with this crap).
> >
> > Famous last words :)
> >
> > But yes, that should work. This wouldn't be the first ioctl returning a
> > long list, and the VMM would just call it once on VM startup to figure
> > things out anyway.
>
> To be clear, see below for what I had in mind. It is of course
> untested, and is probably overlooking a number of details, but you'll
> hopefully get my drift. I think this has some benefit over the
> per-sysreg ioctl, as it covers everything in one go, and is guaranteed
> to be exhaustive (until the architecture grows another range of ID
> crap).
>
> Note that we don't necessarily need to restrict ourselves to a single
> range either. We could also return some other ranges depending on
> additional parameters (Oliver mentioned offline the case of the
> PCMEIDx_EL0 registers).
>
> Thanks,
>
>         M.
>
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 2ca2973abe66..fa79f3651423 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -3589,3 +3589,91 @@ int __init kvm_sys_reg_table_init(void)
>
>         return 0;
>  }
> +
> +/*
> + * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model
> + * Feature Register 2"):
> + *
> + * "The Feature ID space is defined as the System register space in
> + * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7},
> + * op2=={0-7}."
> + *
> + * This covers all R/O registers that indicate anything useful feature
> + * wise, including the ID registers.
> + */
> +
> +/* Userspace-visible definitions */
> +#define ARM64_FEATURE_ID_SPACE_SIZE    (3 * 8 * 8)
> +#define __ARM64_FEATURE_ID_SPACE_IDX(op0, op1, crn, crm, op2)          \
> +       ({                                                              \
> +               __u64 __op1 = op1 & 3;                                  \
> +               __op1 -= (__op1 == 3);                                  \
> +               ((ARM64_SYS_REG_SHIFT_MASK(3, OP0) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(__op1, OP1) |                \
> +                 ARM64_SYS_REG_SHIFT_MASK(0, CRN) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(crm & 7, CRM) |              \
> +                 ARM64_SYS_REG_SHIFT_MASK(op2, OP2)) -                 \
> +                (ARM64_SYS_REG_SHIFT_MASK(3, OP0) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(0, OP1) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(0, CRN) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(0, CRM) |                    \
> +                 ARM64_SYS_REG_SHIFT_MASK(0, OP2)));                   \
> +       })
> +
> +#define ARM64_FEATURE_ID_SPACE_INDEX(r)                                        \
> +       __ARM64_FEATURE_ID_SPACE_IDX(sys_reg_Op0(r),                    \
> +                                    sys_reg_Op1(r),                    \
> +                                    sys_reg_CRn(r),                    \
> +                                    sys_reg_CRm(r),                    \
> +                                    sys_reg_Op2(r))
> +
> +struct feature_id_writeable_masks {
> +       u64     mask[ARM64_FEATURE_ID_SPACE_SIZE];
> +};
> +
> +static bool is_feature_id_reg(u32 encoding)
> +{
> +       return (sys_reg_Op0(encoding) == 3 &&
> +               (sys_reg_Op1(encoding) < 2 || sys_reg_Op1(encoding) == 3) &&
> +               sys_reg_CRn(encoding) == 0 &&
> +               sys_reg_CRm(encoding) <= 7);
> +}
> +
> +int kvm_get_writeable_feature_regs(struct kvm *kvm, u64 __user *masks)
> +{
> +       /* Wipe the whole thing first */
> +       for (int i = 0; i < ARM64_FEATURE_ID_SPACE_SIZE; i++)
> +               if (put_user(0, masks + i))
> +                       return -EFAULT;
> +
> +       for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
> +               const struct sys_reg_desc *reg = &sys_reg_descs[i];
> +               u32 encoding = reg_to_encoding(reg);
> +               u64 val;
> +
> +               if (!is_feature_id_reg(encoding) || !reg->set_user)
> +                       continue;
> +
> +               /*
> +                * For ID registers, we return the writable mask.
> +                * Other feature registers return a full 64bit mask.
> +                * That's not necessarily compliant with a given
> +                * revision of the architecture, but the RES0/RES1
> +                * definitions allow us to do that
> +                */
> +               if (is_id_reg(encoding)) {
> +                       if (!reg->val)
> +                               continue;
> +
> +                       val = reg->val;
> +               } else {
> +                       val = ~0UL;
> +               }
> +
> +               if (put_user(val,
> +                            (masks + ARM64_FEATURE_ID_SPACE_INDEX(encoding))))
> +                       return -EFAULT;
> +       }
> +
> +       return 0;
> +}
Thanks Marc.
The whole idea is clear to me now. I'll implement this in the next version.

Jing
>
>
>
> --
> Without deviation from the norm, progress is not possible.
diff mbox series

Patch

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 053d8057ff1e..f33aec83f1b4 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2008,7 +2008,7 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	  .set_user = set_id_dfr0_el1,
 	  .visibility = aa32_id_visibility,
 	  .reset = read_sanitised_id_dfr0_el1,
-	  .val = ID_DFR0_EL1_PerfMon_MASK, },
+	  .val = GENMASK(63, 0), },
 	ID_HIDDEN(ID_AFR0_EL1),
 	AA32_ID_SANITISED(ID_MMFR0_EL1),
 	AA32_ID_SANITISED(ID_MMFR1_EL1),
@@ -2057,7 +2057,7 @@  static const struct sys_reg_desc sys_reg_descs[] = {
 	  .get_user = get_id_reg,
 	  .set_user = set_id_aa64dfr0_el1,
 	  .reset = read_sanitised_id_aa64dfr0_el1,
-	  .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
+	  .val = GENMASK(63, 0), },
 	ID_SANITISED(ID_AA64DFR1_EL1),
 	ID_UNALLOCATED(5,2),
 	ID_UNALLOCATED(5,3),