diff mbox

[v4,5/8] arm/vm_event: get/set registers

Message ID 1464561430-13465-5-git-send-email-tamas@tklengyel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Tamas K Lengyel May 29, 2016, 10:37 p.m. UTC
Add support for getting/setting registers through vm_event on ARM.

Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
---
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Julien Grall <julien.grall@arm.com>
Cc: Razvan Cojocaru <rcojocaru@bitdefender.com>

v4: Use psr mode to determine whether to full 32-bit or 64-bit structs
---
 xen/arch/arm/Makefile          |   1 +
 xen/arch/arm/vm_event.c        | 139 +++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/vm_event.h |  11 ----
 xen/include/asm-x86/vm_event.h |   4 --
 xen/include/public/vm_event.h  |  58 ++++++++++++++++-
 xen/include/xen/vm_event.h     |   3 +
 6 files changed, 199 insertions(+), 17 deletions(-)
 create mode 100644 xen/arch/arm/vm_event.c

Comments

Razvan Cojocaru May 30, 2016, 7:09 a.m. UTC | #1
On 05/30/2016 01:37 AM, Tamas K Lengyel wrote:
> Add support for getting/setting registers through vm_event on ARM.
> 
> Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
> ---
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Julien Grall <julien.grall@arm.com>
> Cc: Razvan Cojocaru <rcojocaru@bitdefender.com>
> 
> v4: Use psr mode to determine whether to full 32-bit or 64-bit structs
> ---
>  xen/arch/arm/Makefile          |   1 +
>  xen/arch/arm/vm_event.c        | 139 +++++++++++++++++++++++++++++++++++++++++
>  xen/include/asm-arm/vm_event.h |  11 ----
>  xen/include/asm-x86/vm_event.h |   4 --
>  xen/include/public/vm_event.h  |  58 ++++++++++++++++-
>  xen/include/xen/vm_event.h     |   3 +
>  6 files changed, 199 insertions(+), 17 deletions(-)
>  create mode 100644 xen/arch/arm/vm_event.c

Acked-by: Razvan Cojocaru <rcojocaru@bitdefender.com>
Jan Beulich May 30, 2016, 11:50 a.m. UTC | #2
>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
> +struct vm_event_regs_arm32 {
> +    uint32_t r0_usr;
> +    uint32_t r1_usr;
> +    uint32_t r2_usr;
> +    uint32_t r3_usr;
> +    uint32_t r4_usr;
> +    uint32_t r5_usr;
> +    uint32_t r6_usr;
> +    uint32_t r7_usr;
> +    uint32_t r8_usr;
> +    uint32_t r9_usr;
> +    uint32_t r10_usr;
> +    uint32_t r12_usr;
> +    uint32_t lr_usr;
> +    uint32_t fp;
> +    uint32_t pc;
> +    uint32_t sp_usr;
> +    uint32_t sp_svc;
> +    uint32_t spsr_svc;
> +};

It would seem more natural for the "ordinary" registers to be
arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.

> +struct vm_event_regs_arm64 {
> +    uint64_t x0;
> +    uint64_t x1;
> +    uint64_t x2;
> +    uint64_t x3;
> +    uint64_t x4;
> +    uint64_t x5;
> +    uint64_t x6;
> +    uint64_t x7;
> +    uint64_t x8;
> +    uint64_t x9;
> +    uint64_t x10;
> +    uint64_t x16;
> +    uint64_t lr;
> +    uint64_t fp;
> +    uint64_t pc;
> +    uint64_t sp_el0;
> +    uint64_t sp_el1;
> +    uint32_t spsr_el1;
> +    uint32_t _pad;
> +};

My ARM knowledge is certainly quite limited, but the incomplete set
of GPRs here is quite obvious: Is there a reason to not make all of
them available here? And if there is, could the criteria of which
registers got put here please be documented in a comment (so that
the next person noticing the incomplete set won't ask again)?

I'm also puzzled by fp and lr - I'm not aware of registers of those
names (and gas also doesn't accept them afaict).

> +struct vm_event_regs_arm {
> +    uint32_t cpsr; /* PSR_MODE_BIT is set iff arm32 is used below */

Explicit padding missing after this one.

Jan
Tamas K Lengyel May 30, 2016, 7:47 p.m. UTC | #3
On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>> +struct vm_event_regs_arm32 {
>> +    uint32_t r0_usr;
>> +    uint32_t r1_usr;
>> +    uint32_t r2_usr;
>> +    uint32_t r3_usr;
>> +    uint32_t r4_usr;
>> +    uint32_t r5_usr;
>> +    uint32_t r6_usr;
>> +    uint32_t r7_usr;
>> +    uint32_t r8_usr;
>> +    uint32_t r9_usr;
>> +    uint32_t r10_usr;
>> +    uint32_t r12_usr;
>> +    uint32_t lr_usr;
>> +    uint32_t fp;
>> +    uint32_t pc;
>> +    uint32_t sp_usr;
>> +    uint32_t sp_svc;
>> +    uint32_t spsr_svc;
>> +};
>
> It would seem more natural for the "ordinary" registers to be
> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.

Not sure I follow.

>
>> +struct vm_event_regs_arm64 {
>> +    uint64_t x0;
>> +    uint64_t x1;
>> +    uint64_t x2;
>> +    uint64_t x3;
>> +    uint64_t x4;
>> +    uint64_t x5;
>> +    uint64_t x6;
>> +    uint64_t x7;
>> +    uint64_t x8;
>> +    uint64_t x9;
>> +    uint64_t x10;
>> +    uint64_t x16;
>> +    uint64_t lr;
>> +    uint64_t fp;
>> +    uint64_t pc;
>> +    uint64_t sp_el0;
>> +    uint64_t sp_el1;
>> +    uint32_t spsr_el1;
>> +    uint32_t _pad;
>> +};
>
> My ARM knowledge is certainly quite limited, but the incomplete set
> of GPRs here is quite obvious: Is there a reason to not make all of
> them available here? And if there is, could the criteria of which
> registers got put here please be documented in a comment (so that
> the next person noticing the incomplete set won't ask again)?

There already is a comment in place that explains why we are not
sending the full set of registers here.

>
> I'm also puzzled by fp and lr - I'm not aware of registers of those
> names (and gas also doesn't accept them afaict).

Not sure why but Xen names x29 fp and x30 lr. See
include/asm-arm/arm64/processor.h.

>
>> +struct vm_event_regs_arm {
>> +    uint32_t cpsr; /* PSR_MODE_BIT is set iff arm32 is used below */
>
> Explicit padding missing after this one.

Ack.

Thanks,
Tamas
Julien Grall May 30, 2016, 8:20 p.m. UTC | #4
Hi Tamas,

On 30/05/2016 20:47, Tamas K Lengyel wrote:
> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>> +struct vm_event_regs_arm64 {
>>> +    uint64_t x0;
>>> +    uint64_t x1;
>>> +    uint64_t x2;
>>> +    uint64_t x3;
>>> +    uint64_t x4;
>>> +    uint64_t x5;
>>> +    uint64_t x6;
>>> +    uint64_t x7;
>>> +    uint64_t x8;
>>> +    uint64_t x9;
>>> +    uint64_t x10;
>>> +    uint64_t x16;
>>> +    uint64_t lr;
>>> +    uint64_t fp;
>>> +    uint64_t pc;
>>> +    uint64_t sp_el0;
>>> +    uint64_t sp_el1;
>>> +    uint32_t spsr_el1;
>>> +    uint32_t _pad;
>>> +};
>>
>> My ARM knowledge is certainly quite limited, but the incomplete set
>> of GPRs here is quite obvious: Is there a reason to not make all of
>> them available here? And if there is, could the criteria of which
>> registers got put here please be documented in a comment (so that
>> the next person noticing the incomplete set won't ask again)?
>
> There already is a comment in place that explains why we are not
> sending the full set of registers here.

Your comment only says:
"Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
so as to not fill the vm_event ring buffer too quickly." it does not 
explain the criteria of which registers got put here.

I already mentioned it on a previous version, and it seems to have been 
dropped from the commit message.

Anyway, we should avoid to be stingy with comments. They are helpful for 
anyone interested by the vm_event subsystem and even you in few years if 
you decide to modify this code.

>>
>> I'm also puzzled by fp and lr - I'm not aware of registers of those
>> names (and gas also doesn't accept them afaict).
>
> Not sure why but Xen names x29 fp and x30 lr. See
> include/asm-arm/arm64/processor.h.

They are not officially called "lr" and "fp" in the ARM ARM. However the 
AAPCS64 [1] (5.1.1) defines x29 and x30 as special registers holding 
resp. "lr" and "fp".

The convention is used internally in Xen for convenience. However, the 
public header uses x29 and x30 (see include/public/arch-arm.h).

I would prefer if you use the official name in the public headers.

Regards,

[1] 
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
Tamas K Lengyel May 30, 2016, 8:37 p.m. UTC | #5
On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com> wrote:
> Hi Tamas,
>
> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>
>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>
>>>> +struct vm_event_regs_arm64 {
>>>> +    uint64_t x0;
>>>> +    uint64_t x1;
>>>> +    uint64_t x2;
>>>> +    uint64_t x3;
>>>> +    uint64_t x4;
>>>> +    uint64_t x5;
>>>> +    uint64_t x6;
>>>> +    uint64_t x7;
>>>> +    uint64_t x8;
>>>> +    uint64_t x9;
>>>> +    uint64_t x10;
>>>> +    uint64_t x16;
>>>> +    uint64_t lr;
>>>> +    uint64_t fp;
>>>> +    uint64_t pc;
>>>> +    uint64_t sp_el0;
>>>> +    uint64_t sp_el1;
>>>> +    uint32_t spsr_el1;
>>>> +    uint32_t _pad;
>>>> +};
>>>
>>>
>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>> them available here? And if there is, could the criteria of which
>>> registers got put here please be documented in a comment (so that
>>> the next person noticing the incomplete set won't ask again)?
>>
>>
>> There already is a comment in place that explains why we are not
>> sending the full set of registers here.
>
>
> Your comment only says:
> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
> so as to not fill the vm_event ring buffer too quickly." it does not explain
> the criteria of which registers got put here.

Well, as we discussed it in the previous revision, there is no
hard-set rule of what can and cannot be transmitted here. The only
thing to keep in mind is to not grow this struct to be too large. The
registers sent right now represent a "best guess" of what may be
useful for performance-sensitive vm_event applications on ARM. It can
be adjusted in the future if applications require other registers.
Right now there are no applications at all in this space so we don't
have any specifications to rely on for making this selection. I'm
happy to make adjustments to the selection if others have a better
idea what to send, the only registers I certainly find very useful are
TTBR0/1, cpsr and pc at this time.

>
> I already mentioned it on a previous version, and it seems to have been
> dropped from the commit message.
>
> Anyway, we should avoid to be stingy with comments. They are helpful for
> anyone interested by the vm_event subsystem and even you in few years if you
> decide to modify this code.

Certainly, I would be happy to put the above description into a comment here.

>
>>>
>>> I'm also puzzled by fp and lr - I'm not aware of registers of those
>>> names (and gas also doesn't accept them afaict).
>>
>>
>> Not sure why but Xen names x29 fp and x30 lr. See
>> include/asm-arm/arm64/processor.h.
>
>
> They are not officially called "lr" and "fp" in the ARM ARM. However the
> AAPCS64 [1] (5.1.1) defines x29 and x30 as special registers holding resp.
> "lr" and "fp".
>
> The convention is used internally in Xen for convenience. However, the
> public header uses x29 and x30 (see include/public/arch-arm.h).
>
> I would prefer if you use the official name in the public headers.

Sounds good to me.

Thanks,
Tamas
Razvan Cojocaru May 30, 2016, 8:46 p.m. UTC | #6
On 05/30/16 23:37, Tamas K Lengyel wrote:
> Well, as we discussed it in the previous revision, there is no
> hard-set rule of what can and cannot be transmitted here. The only
> thing to keep in mind is to not grow this struct to be too large. The
> registers sent right now represent a "best guess" of what may be
> useful for performance-sensitive vm_event applications on ARM. It can
> be adjusted in the future if applications require other registers.

Obviously I agree with Tamas here, my only comment would be that in my
humble opinion multipage vm_event ring buffers are probably the way to
go in the long run. I'm not sure if other parts of Xen already employ
such devices (or if indeed there's support for this at all at this
time), but this might be something worth looking into.

Again, this does not imply that there's anything wrong with this patch
or that this is an issue this series should address.


Thanks,
Razvan
Tamas K Lengyel May 30, 2016, 8:53 p.m. UTC | #7
On Mon, May 30, 2016 at 2:46 PM, Razvan Cojocaru
<rcojocaru@bitdefender.com> wrote:
> On 05/30/16 23:37, Tamas K Lengyel wrote:
>> Well, as we discussed it in the previous revision, there is no
>> hard-set rule of what can and cannot be transmitted here. The only
>> thing to keep in mind is to not grow this struct to be too large. The
>> registers sent right now represent a "best guess" of what may be
>> useful for performance-sensitive vm_event applications on ARM. It can
>> be adjusted in the future if applications require other registers.
>
> Obviously I agree with Tamas here, my only comment would be that in my
> humble opinion multipage vm_event ring buffers are probably the way to
> go in the long run. I'm not sure if other parts of Xen already employ
> such devices (or if indeed there's support for this at all at this
> time), but this might be something worth looking into.
>
> Again, this does not imply that there's anything wrong with this patch
> or that this is an issue this series should address.

Yeap, something ackin to libvchan would be very handy to have for
guest-hypervisor communication as well. I recall it popping up on the
mailinglist before, not sure if there was any consensus on it in the
end.

Tamas
Julien Grall May 30, 2016, 9:35 p.m. UTC | #8
On 30/05/2016 21:37, Tamas K Lengyel wrote:
> On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com> wrote:
>> Hi Tamas,
>>
>> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>>
>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>
>>>>> +struct vm_event_regs_arm64 {
>>>>> +    uint64_t x0;
>>>>> +    uint64_t x1;
>>>>> +    uint64_t x2;
>>>>> +    uint64_t x3;
>>>>> +    uint64_t x4;
>>>>> +    uint64_t x5;
>>>>> +    uint64_t x6;
>>>>> +    uint64_t x7;
>>>>> +    uint64_t x8;
>>>>> +    uint64_t x9;
>>>>> +    uint64_t x10;
>>>>> +    uint64_t x16;
>>>>> +    uint64_t lr;
>>>>> +    uint64_t fp;
>>>>> +    uint64_t pc;
>>>>> +    uint64_t sp_el0;
>>>>> +    uint64_t sp_el1;
>>>>> +    uint32_t spsr_el1;
>>>>> +    uint32_t _pad;
>>>>> +};
>>>>
>>>>
>>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>>> them available here? And if there is, could the criteria of which
>>>> registers got put here please be documented in a comment (so that
>>>> the next person noticing the incomplete set won't ask again)?
>>>
>>>
>>> There already is a comment in place that explains why we are not
>>> sending the full set of registers here.
>>
>>
>> Your comment only says:
>> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
>> so as to not fill the vm_event ring buffer too quickly." it does not explain
>> the criteria of which registers got put here.
>
> Well, as we discussed it in the previous revision, there is no
> hard-set rule of what can and cannot be transmitted here. The only
> thing to keep in mind is to not grow this struct to be too large. The
> registers sent right now represent a "best guess" of what may be
> useful for performance-sensitive vm_event applications on ARM. It can
> be adjusted in the future if applications require other registers.
> Right now there are no applications at all in this space so we don't
> have any specifications to rely on for making this selection. I'm
> happy to make adjustments to the selection if others have a better
> idea what to send, the only registers I certainly find very useful are
> TTBR0/1, cpsr and pc at this time.

Please log it in the commit message and the code. If someone emitted 
multiple time the same concern on previous version, it likely means that 
your commit message was not clear enough and should be updated.

The number of patch to review on Xen-devel is very consequence, so we 
cannot really afford to spend a lot of time digging into previous 
threads. As a maintainer of a subsystem, you should be aware of that.

We are trying, at least on ARM, to get as much details as possible in 
the commit message and document any possible unclear code to help 
developer understanding why it has been done like that. It also helps us 
(the reviewers and maintainers) to give useful advice later on.

Regards,
Tamas K Lengyel May 30, 2016, 9:41 p.m. UTC | #9
On Mon, May 30, 2016 at 3:35 PM, Julien Grall <julien.grall@arm.com> wrote:
>
>
> On 30/05/2016 21:37, Tamas K Lengyel wrote:
>>
>> On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com>
>> wrote:
>>>
>>> Hi Tamas,
>>>
>>> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>>>
>>>>
>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>
>>>>>>
>>>>>> +struct vm_event_regs_arm64 {
>>>>>> +    uint64_t x0;
>>>>>> +    uint64_t x1;
>>>>>> +    uint64_t x2;
>>>>>> +    uint64_t x3;
>>>>>> +    uint64_t x4;
>>>>>> +    uint64_t x5;
>>>>>> +    uint64_t x6;
>>>>>> +    uint64_t x7;
>>>>>> +    uint64_t x8;
>>>>>> +    uint64_t x9;
>>>>>> +    uint64_t x10;
>>>>>> +    uint64_t x16;
>>>>>> +    uint64_t lr;
>>>>>> +    uint64_t fp;
>>>>>> +    uint64_t pc;
>>>>>> +    uint64_t sp_el0;
>>>>>> +    uint64_t sp_el1;
>>>>>> +    uint32_t spsr_el1;
>>>>>> +    uint32_t _pad;
>>>>>> +};
>>>>>
>>>>>
>>>>>
>>>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>>>> them available here? And if there is, could the criteria of which
>>>>> registers got put here please be documented in a comment (so that
>>>>> the next person noticing the incomplete set won't ask again)?
>>>>
>>>>
>>>>
>>>> There already is a comment in place that explains why we are not
>>>> sending the full set of registers here.
>>>
>>>
>>>
>>> Your comment only says:
>>> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
>>> so as to not fill the vm_event ring buffer too quickly." it does not
>>> explain
>>> the criteria of which registers got put here.
>>
>>
>> Well, as we discussed it in the previous revision, there is no
>> hard-set rule of what can and cannot be transmitted here. The only
>> thing to keep in mind is to not grow this struct to be too large. The
>> registers sent right now represent a "best guess" of what may be
>> useful for performance-sensitive vm_event applications on ARM. It can
>> be adjusted in the future if applications require other registers.
>> Right now there are no applications at all in this space so we don't
>> have any specifications to rely on for making this selection. I'm
>> happy to make adjustments to the selection if others have a better
>> idea what to send, the only registers I certainly find very useful are
>> TTBR0/1, cpsr and pc at this time.
>
>
> Please log it in the commit message and the code. If someone emitted
> multiple time the same concern on previous version, it likely means that
> your commit message was not clear enough and should be updated.
>
> The number of patch to review on Xen-devel is very consequence, so we cannot
> really afford to spend a lot of time digging into previous threads. As a
> maintainer of a subsystem, you should be aware of that.
>
> We are trying, at least on ARM, to get as much details as possible in the
> commit message and document any possible unclear code to help developer
> understanding why it has been done like that. It also helps us (the
> reviewers and maintainers) to give useful advice later on.
>

Of course, thanks.

Tamas
Jan Beulich May 31, 2016, 7:48 a.m. UTC | #10
>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>> +struct vm_event_regs_arm32 {
>>> +    uint32_t r0_usr;
>>> +    uint32_t r1_usr;
>>> +    uint32_t r2_usr;
>>> +    uint32_t r3_usr;
>>> +    uint32_t r4_usr;
>>> +    uint32_t r5_usr;
>>> +    uint32_t r6_usr;
>>> +    uint32_t r7_usr;
>>> +    uint32_t r8_usr;
>>> +    uint32_t r9_usr;
>>> +    uint32_t r10_usr;
>>> +    uint32_t r12_usr;
>>> +    uint32_t lr_usr;
>>> +    uint32_t fp;
>>> +    uint32_t pc;
>>> +    uint32_t sp_usr;
>>> +    uint32_t sp_svc;
>>> +    uint32_t spsr_svc;
>>> +};
>>
>> It would seem more natural for the "ordinary" registers to be
>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
> 
> Not sure I follow.

For one it is quite natural for someone looking at a sequence of
register values to assume / expect them to be in natural order.
And then, having them in natural (numeric) order allows e.g.
extracting the register identifying bits from an instruction to use
them as an array index into (part of) this structure.

(For some background: I've been bitten a number of times by
people sorting x86 registers alphabetically instead or naturally,
i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).

>>> +struct vm_event_regs_arm64 {
>>> +    uint64_t x0;
>>> +    uint64_t x1;
>>> +    uint64_t x2;
>>> +    uint64_t x3;
>>> +    uint64_t x4;
>>> +    uint64_t x5;
>>> +    uint64_t x6;
>>> +    uint64_t x7;
>>> +    uint64_t x8;
>>> +    uint64_t x9;
>>> +    uint64_t x10;
>>> +    uint64_t x16;
>>> +    uint64_t lr;
>>> +    uint64_t fp;
>>> +    uint64_t pc;
>>> +    uint64_t sp_el0;
>>> +    uint64_t sp_el1;
>>> +    uint32_t spsr_el1;
>>> +    uint32_t _pad;
>>> +};
>>
>> My ARM knowledge is certainly quite limited, but the incomplete set
>> of GPRs here is quite obvious: Is there a reason to not make all of
>> them available here? And if there is, could the criteria of which
>> registers got put here please be documented in a comment (so that
>> the next person noticing the incomplete set won't ask again)?
> 
> There already is a comment in place that explains why we are not
> sending the full set of registers here.

I've just gone through the entire patch again, and I didn't find any.
Are you perhaps referring to "Using custom vCPU structs (i.e. not
hvm_hw_cpu) for both x86 and ARM so as to not fill the vm_event
ring buffer too quickly"? If so, that's irrelevant here: It explains why
e.g. floating point registers don't get sent, but it doesn't explain in
any way why some GPRs are more important than others.

>> I'm also puzzled by fp and lr - I'm not aware of registers of those
>> names (and gas also doesn't accept them afaict).
> 
> Not sure why but Xen names x29 fp and x30 lr. See
> include/asm-arm/arm64/processor.h.

See Julien's reply.

Jan
Jan Beulich May 31, 2016, 7:54 a.m. UTC | #11
>>> On 30.05.16 at 22:37, <tamas@tklengyel.com> wrote:
> On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com> wrote:
>> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>> +struct vm_event_regs_arm64 {
>>>>> +    uint64_t x0;
>>>>> +    uint64_t x1;
>>>>> +    uint64_t x2;
>>>>> +    uint64_t x3;
>>>>> +    uint64_t x4;
>>>>> +    uint64_t x5;
>>>>> +    uint64_t x6;
>>>>> +    uint64_t x7;
>>>>> +    uint64_t x8;
>>>>> +    uint64_t x9;
>>>>> +    uint64_t x10;
>>>>> +    uint64_t x16;
>>>>> +    uint64_t lr;
>>>>> +    uint64_t fp;
>>>>> +    uint64_t pc;
>>>>> +    uint64_t sp_el0;
>>>>> +    uint64_t sp_el1;
>>>>> +    uint32_t spsr_el1;
>>>>> +    uint32_t _pad;
>>>>> +};
>>>>
>>>>
>>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>>> them available here? And if there is, could the criteria of which
>>>> registers got put here please be documented in a comment (so that
>>>> the next person noticing the incomplete set won't ask again)?
>>>
>>>
>>> There already is a comment in place that explains why we are not
>>> sending the full set of registers here.
>>
>>
>> Your comment only says:
>> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
>> so as to not fill the vm_event ring buffer too quickly." it does not explain
>> the criteria of which registers got put here.
> 
> Well, as we discussed it in the previous revision, there is no
> hard-set rule of what can and cannot be transmitted here. The only
> thing to keep in mind is to not grow this struct to be too large. The
> registers sent right now represent a "best guess" of what may be
> useful for performance-sensitive vm_event applications on ARM. It can
> be adjusted in the future if applications require other registers.
> Right now there are no applications at all in this space so we don't
> have any specifications to rely on for making this selection. I'm
> happy to make adjustments to the selection if others have a better
> idea what to send, the only registers I certainly find very useful are
> TTBR0/1, cpsr and pc at this time.

Not being an ARM maintainer I'd say "Then include just those and no
(other) GPRs at all, or include all GPRs." Such a "best guess" approach
can really only end up being wrong from some future consumer. And
in that consideration, please also include the aspects that lead to all
x86 GPRs to get included here (not to speak of even various MSR
values). IOW the same criteria should be applied to all architectures.

Jan
Razvan Cojocaru May 31, 2016, 8:06 a.m. UTC | #12
On 05/31/2016 10:54 AM, Jan Beulich wrote:
>>>> On 30.05.16 at 22:37, <tamas@tklengyel.com> wrote:
>> On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com> wrote:
>>> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>> +struct vm_event_regs_arm64 {
>>>>>> +    uint64_t x0;
>>>>>> +    uint64_t x1;
>>>>>> +    uint64_t x2;
>>>>>> +    uint64_t x3;
>>>>>> +    uint64_t x4;
>>>>>> +    uint64_t x5;
>>>>>> +    uint64_t x6;
>>>>>> +    uint64_t x7;
>>>>>> +    uint64_t x8;
>>>>>> +    uint64_t x9;
>>>>>> +    uint64_t x10;
>>>>>> +    uint64_t x16;
>>>>>> +    uint64_t lr;
>>>>>> +    uint64_t fp;
>>>>>> +    uint64_t pc;
>>>>>> +    uint64_t sp_el0;
>>>>>> +    uint64_t sp_el1;
>>>>>> +    uint32_t spsr_el1;
>>>>>> +    uint32_t _pad;
>>>>>> +};
>>>>>
>>>>>
>>>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>>>> them available here? And if there is, could the criteria of which
>>>>> registers got put here please be documented in a comment (so that
>>>>> the next person noticing the incomplete set won't ask again)?
>>>>
>>>>
>>>> There already is a comment in place that explains why we are not
>>>> sending the full set of registers here.
>>>
>>>
>>> Your comment only says:
>>> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
>>> so as to not fill the vm_event ring buffer too quickly." it does not explain
>>> the criteria of which registers got put here.
>>
>> Well, as we discussed it in the previous revision, there is no
>> hard-set rule of what can and cannot be transmitted here. The only
>> thing to keep in mind is to not grow this struct to be too large. The
>> registers sent right now represent a "best guess" of what may be
>> useful for performance-sensitive vm_event applications on ARM. It can
>> be adjusted in the future if applications require other registers.
>> Right now there are no applications at all in this space so we don't
>> have any specifications to rely on for making this selection. I'm
>> happy to make adjustments to the selection if others have a better
>> idea what to send, the only registers I certainly find very useful are
>> TTBR0/1, cpsr and pc at this time.
> 
> Not being an ARM maintainer I'd say "Then include just those and no
> (other) GPRs at all, or include all GPRs." Such a "best guess" approach
> can really only end up being wrong from some future consumer. And
> in that consideration, please also include the aspects that lead to all
> x86 GPRs to get included here (not to speak of even various MSR
> values). IOW the same criteria should be applied to all architectures.

The x86 GPRS are already all included in the vm_event request:

133 struct vm_event_regs_x86 {
134     uint64_t rax;
135     uint64_t rcx;
136     uint64_t rdx;
137     uint64_t rbx;
138     uint64_t rsp;
139     uint64_t rbp;
140     uint64_t rsi;
141     uint64_t rdi;


Thanks,
Razvan
Jan Beulich May 31, 2016, 8:30 a.m. UTC | #13
>>> On 31.05.16 at 10:06, <rcojocaru@bitdefender.com> wrote:
> On 05/31/2016 10:54 AM, Jan Beulich wrote:
>>>>> On 30.05.16 at 22:37, <tamas@tklengyel.com> wrote:
>>> On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com> wrote:
>>>> On 30/05/2016 20:47, Tamas K Lengyel wrote:
>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>> +struct vm_event_regs_arm64 {
>>>>>>> +    uint64_t x0;
>>>>>>> +    uint64_t x1;
>>>>>>> +    uint64_t x2;
>>>>>>> +    uint64_t x3;
>>>>>>> +    uint64_t x4;
>>>>>>> +    uint64_t x5;
>>>>>>> +    uint64_t x6;
>>>>>>> +    uint64_t x7;
>>>>>>> +    uint64_t x8;
>>>>>>> +    uint64_t x9;
>>>>>>> +    uint64_t x10;
>>>>>>> +    uint64_t x16;
>>>>>>> +    uint64_t lr;
>>>>>>> +    uint64_t fp;
>>>>>>> +    uint64_t pc;
>>>>>>> +    uint64_t sp_el0;
>>>>>>> +    uint64_t sp_el1;
>>>>>>> +    uint32_t spsr_el1;
>>>>>>> +    uint32_t _pad;
>>>>>>> +};
>>>>>>
>>>>>>
>>>>>> My ARM knowledge is certainly quite limited, but the incomplete set
>>>>>> of GPRs here is quite obvious: Is there a reason to not make all of
>>>>>> them available here? And if there is, could the criteria of which
>>>>>> registers got put here please be documented in a comment (so that
>>>>>> the next person noticing the incomplete set won't ask again)?
>>>>>
>>>>>
>>>>> There already is a comment in place that explains why we are not
>>>>> sending the full set of registers here.
>>>>
>>>>
>>>> Your comment only says:
>>>> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
>>>> so as to not fill the vm_event ring buffer too quickly." it does not explain
>>>> the criteria of which registers got put here.
>>>
>>> Well, as we discussed it in the previous revision, there is no
>>> hard-set rule of what can and cannot be transmitted here. The only
>>> thing to keep in mind is to not grow this struct to be too large. The
>>> registers sent right now represent a "best guess" of what may be
>>> useful for performance-sensitive vm_event applications on ARM. It can
>>> be adjusted in the future if applications require other registers.
>>> Right now there are no applications at all in this space so we don't
>>> have any specifications to rely on for making this selection. I'm
>>> happy to make adjustments to the selection if others have a better
>>> idea what to send, the only registers I certainly find very useful are
>>> TTBR0/1, cpsr and pc at this time.
>> 
>> Not being an ARM maintainer I'd say "Then include just those and no
>> (other) GPRs at all, or include all GPRs." Such a "best guess" approach
>> can really only end up being wrong from some future consumer. And
>> in that consideration, please also include the aspects that lead to all
>> x86 GPRs to get included here (not to speak of even various MSR
>> values). IOW the same criteria should be applied to all architectures.
> 
> The x86 GPRS are already all included in the vm_event request:

Well, that's what I've been saying, or at least I had tried to: I
realize I've typoed the past tense of "lead" - should have been
"led". Sorry for the confusion.

Jan
Tamas K Lengyel May 31, 2016, 4:20 p.m. UTC | #14
On May 31, 2016 01:54, "Jan Beulich" <JBeulich@suse.com> wrote:
>
> >>> On 30.05.16 at 22:37, <tamas@tklengyel.com> wrote:
> > On Mon, May 30, 2016 at 2:20 PM, Julien Grall <julien.grall@arm.com>
wrote:
> >> On 30/05/2016 20:47, Tamas K Lengyel wrote:
> >>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com>
wrote:
> >>>>> +struct vm_event_regs_arm64 {
> >>>>> +    uint64_t x0;
> >>>>> +    uint64_t x1;
> >>>>> +    uint64_t x2;
> >>>>> +    uint64_t x3;
> >>>>> +    uint64_t x4;
> >>>>> +    uint64_t x5;
> >>>>> +    uint64_t x6;
> >>>>> +    uint64_t x7;
> >>>>> +    uint64_t x8;
> >>>>> +    uint64_t x9;
> >>>>> +    uint64_t x10;
> >>>>> +    uint64_t x16;
> >>>>> +    uint64_t lr;
> >>>>> +    uint64_t fp;
> >>>>> +    uint64_t pc;
> >>>>> +    uint64_t sp_el0;
> >>>>> +    uint64_t sp_el1;
> >>>>> +    uint32_t spsr_el1;
> >>>>> +    uint32_t _pad;
> >>>>> +};
> >>>>
> >>>>
> >>>> My ARM knowledge is certainly quite limited, but the incomplete set
> >>>> of GPRs here is quite obvious: Is there a reason to not make all of
> >>>> them available here? And if there is, could the criteria of which
> >>>> registers got put here please be documented in a comment (so that
> >>>> the next person noticing the incomplete set won't ask again)?
> >>>
> >>>
> >>> There already is a comment in place that explains why we are not
> >>> sending the full set of registers here.
> >>
> >>
> >> Your comment only says:
> >> "Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
> >> so as to not fill the vm_event ring buffer too quickly." it does not
explain
> >> the criteria of which registers got put here.
> >
> > Well, as we discussed it in the previous revision, there is no
> > hard-set rule of what can and cannot be transmitted here. The only
> > thing to keep in mind is to not grow this struct to be too large. The
> > registers sent right now represent a "best guess" of what may be
> > useful for performance-sensitive vm_event applications on ARM. It can
> > be adjusted in the future if applications require other registers.
> > Right now there are no applications at all in this space so we don't
> > have any specifications to rely on for making this selection. I'm
> > happy to make adjustments to the selection if others have a better
> > idea what to send, the only registers I certainly find very useful are
> > TTBR0/1, cpsr and pc at this time.
>
> Not being an ARM maintainer I'd say "Then include just those and no
> (other) GPRs at all, or include all GPRs." Such a "best guess" approach
> can really only end up being wrong from some future consumer. And
> in that consideration, please also include the aspects that lead to all
> x86 GPRs to get included here (not to speak of even various MSR
> values). IOW the same criteria should be applied to all architectures.
>

I don't think there is such a thing as being wrong here. The user always
has access to the full set of registers plus there is ample room here to
add other registers in the future while we are smaller then the x86 struct.
For an initial set I think it's perfectly fine to do a subset and add more
as we go forward and learn about the usecases. So since there is no
technical reason that doing a subset would be incorrect I don't see an
issue here. As I said, I'm happy to explain it in the commit message and a
comment in the code that the register set is expandable and adjustable.

Tamas
Tamas K Lengyel May 31, 2016, 4:28 p.m. UTC | #15
On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>
> >>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
> > On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
> >>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
> >>> +struct vm_event_regs_arm32 {
> >>> +    uint32_t r0_usr;
> >>> +    uint32_t r1_usr;
> >>> +    uint32_t r2_usr;
> >>> +    uint32_t r3_usr;
> >>> +    uint32_t r4_usr;
> >>> +    uint32_t r5_usr;
> >>> +    uint32_t r6_usr;
> >>> +    uint32_t r7_usr;
> >>> +    uint32_t r8_usr;
> >>> +    uint32_t r9_usr;
> >>> +    uint32_t r10_usr;
> >>> +    uint32_t r12_usr;
> >>> +    uint32_t lr_usr;
> >>> +    uint32_t fp;
> >>> +    uint32_t pc;
> >>> +    uint32_t sp_usr;
> >>> +    uint32_t sp_svc;
> >>> +    uint32_t spsr_svc;
> >>> +};
> >>
> >> It would seem more natural for the "ordinary" registers to be
> >> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
> >
> > Not sure I follow.
>
> For one it is quite natural for someone looking at a sequence of
> register values to assume / expect them to be in natural order.
> And then, having them in natural (numeric) order allows e.g.
> extracting the register identifying bits from an instruction to use
> them as an array index into (part of) this structure.
>
> (For some background: I've been bitten a number of times by
> people sorting x86 registers alphabetically instead or naturally,
> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).

I see, however I believe that would be a very careless use of this struct
from the user as the register sizes are not even necessarily match the
architecture. For example we only define the 64bit x86 registers, so trying
to access it as an array of 32bit registers wouldn't work at all. Plus we
are not doing a full set of registers, and I rather not imply that every
element in the "natural sequence" is present. It may be, it may be not.

>
> >>> +struct vm_event_regs_arm64 {
> >>> +    uint64_t x0;
> >>> +    uint64_t x1;
> >>> +    uint64_t x2;
> >>> +    uint64_t x3;
> >>> +    uint64_t x4;
> >>> +    uint64_t x5;
> >>> +    uint64_t x6;
> >>> +    uint64_t x7;
> >>> +    uint64_t x8;
> >>> +    uint64_t x9;
> >>> +    uint64_t x10;
> >>> +    uint64_t x16;
> >>> +    uint64_t lr;
> >>> +    uint64_t fp;
> >>> +    uint64_t pc;
> >>> +    uint64_t sp_el0;
> >>> +    uint64_t sp_el1;
> >>> +    uint32_t spsr_el1;
> >>> +    uint32_t _pad;
> >>> +};
> >>
> >> My ARM knowledge is certainly quite limited, but the incomplete set
> >> of GPRs here is quite obvious: Is there a reason to not make all of
> >> them available here? And if there is, could the criteria of which
> >> registers got put here please be documented in a comment (so that
> >> the next person noticing the incomplete set won't ask again)?
> >
> > There already is a comment in place that explains why we are not
> > sending the full set of registers here.
>
> I've just gone through the entire patch again, and I didn't find any.
> Are you perhaps referring to "Using custom vCPU structs (i.e. not
> hvm_hw_cpu) for both x86 and ARM so as to not fill the vm_event
> ring buffer too quickly"? If so, that's irrelevant here: It explains why
> e.g. floating point registers don't get sent, but it doesn't explain in
> any way why some GPRs are more important than others.
>
> >> I'm also puzzled by fp and lr - I'm not aware of registers of those
> >> names (and gas also doesn't accept them afaict).
> >
> > Not sure why but Xen names x29 fp and x30 lr. See
> > include/asm-arm/arm64/processor.h.
>
> See Julien's reply.
>
> Jan
>
Jan Beulich June 1, 2016, 8:41 a.m. UTC | #16
>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>
>> >>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>> > On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>> >>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>> >>> +struct vm_event_regs_arm32 {
>> >>> +    uint32_t r0_usr;
>> >>> +    uint32_t r1_usr;
>> >>> +    uint32_t r2_usr;
>> >>> +    uint32_t r3_usr;
>> >>> +    uint32_t r4_usr;
>> >>> +    uint32_t r5_usr;
>> >>> +    uint32_t r6_usr;
>> >>> +    uint32_t r7_usr;
>> >>> +    uint32_t r8_usr;
>> >>> +    uint32_t r9_usr;
>> >>> +    uint32_t r10_usr;
>> >>> +    uint32_t r12_usr;
>> >>> +    uint32_t lr_usr;
>> >>> +    uint32_t fp;
>> >>> +    uint32_t pc;
>> >>> +    uint32_t sp_usr;
>> >>> +    uint32_t sp_svc;
>> >>> +    uint32_t spsr_svc;
>> >>> +};
>> >>
>> >> It would seem more natural for the "ordinary" registers to be
>> >> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>> >
>> > Not sure I follow.
>>
>> For one it is quite natural for someone looking at a sequence of
>> register values to assume / expect them to be in natural order.
>> And then, having them in natural (numeric) order allows e.g.
>> extracting the register identifying bits from an instruction to use
>> them as an array index into (part of) this structure.
>>
>> (For some background: I've been bitten a number of times by
>> people sorting x86 registers alphabetically instead or naturally,
>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
> 
> I see, however I believe that would be a very careless use of this struct
> from the user as the register sizes are not even necessarily match the
> architecture. For example we only define the 64bit x86 registers, so trying
> to access it as an array of 32bit registers wouldn't work at all. Plus we
> are not doing a full set of registers, and I rather not imply that every
> element in the "natural sequence" is present. It may be, it may be not.

Once an ABI is set in stone, and if that ABI allows for optimizations
(by consumers) like the one mentioned, I don't think this would be
careless use. The resulting code is very clearly much more efficient
than e.g. a switch() statement with a case label for each and every
register. Well, yes, I already hear the "memory is cheap and hence
code size doesn't matter" argument, but as said elsewhere quite
recently I don't buy this.

Jan
Julien Grall June 1, 2016, 11:24 a.m. UTC | #17
Hi,

On 01/06/16 09:41, Jan Beulich wrote:
>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>
>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>> +struct vm_event_regs_arm32 {
>>>>>> +    uint32_t r0_usr;
>>>>>> +    uint32_t r1_usr;
>>>>>> +    uint32_t r2_usr;
>>>>>> +    uint32_t r3_usr;
>>>>>> +    uint32_t r4_usr;
>>>>>> +    uint32_t r5_usr;
>>>>>> +    uint32_t r6_usr;
>>>>>> +    uint32_t r7_usr;
>>>>>> +    uint32_t r8_usr;
>>>>>> +    uint32_t r9_usr;
>>>>>> +    uint32_t r10_usr;
>>>>>> +    uint32_t r12_usr;
>>>>>> +    uint32_t lr_usr;
>>>>>> +    uint32_t fp;
>>>>>> +    uint32_t pc;
>>>>>> +    uint32_t sp_usr;
>>>>>> +    uint32_t sp_svc;
>>>>>> +    uint32_t spsr_svc;
>>>>>> +};
>>>>>
>>>>> It would seem more natural for the "ordinary" registers to be
>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>
>>>> Not sure I follow.
>>>
>>> For one it is quite natural for someone looking at a sequence of
>>> register values to assume / expect them to be in natural order.
>>> And then, having them in natural (numeric) order allows e.g.
>>> extracting the register identifying bits from an instruction to use
>>> them as an array index into (part of) this structure.
>>>
>>> (For some background: I've been bitten a number of times by
>>> people sorting x86 registers alphabetically instead or naturally,
>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>
>> I see, however I believe that would be a very careless use of this struct
>> from the user as the register sizes are not even necessarily match the
>> architecture. For example we only define the 64bit x86 registers, so trying
>> to access it as an array of 32bit registers wouldn't work at all. Plus we
>> are not doing a full set of registers, and I rather not imply that every
>> element in the "natural sequence" is present. It may be, it may be not.
>
> Once an ABI is set in stone, and if that ABI allows for optimizations
> (by consumers) like the one mentioned, I don't think this would be
> careless use. The resulting code is very clearly much more efficient
> than e.g. a switch() statement with a case label for each and every
> register. Well, yes, I already hear the "memory is cheap and hence
> code size doesn't matter" argument, but as said elsewhere quite
> recently I don't buy this.

I agree with Jan here.

ARM64 has 31 general purposes registers (x0-x30). The switch to find a 
register based on the index will be quite big.

If you order the register and provide all the general purposes registers 
(x0 - x30), you will be able to replace by a single line (for instance 
see select_user_reg in arch/arm/traps.c).

This will also likely speed up your introspection application.

Regards,
Tamas K Lengyel June 1, 2016, 6:21 p.m. UTC | #18
On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
> Hi,
>
>
> On 01/06/16 09:41, Jan Beulich wrote:
>>>>>
>>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>>>
>>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>
>>>>
>>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>>>
>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>>>
>>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>>>
>>>>>>> +struct vm_event_regs_arm32 {
>>>>>>> +    uint32_t r0_usr;
>>>>>>> +    uint32_t r1_usr;
>>>>>>> +    uint32_t r2_usr;
>>>>>>> +    uint32_t r3_usr;
>>>>>>> +    uint32_t r4_usr;
>>>>>>> +    uint32_t r5_usr;
>>>>>>> +    uint32_t r6_usr;
>>>>>>> +    uint32_t r7_usr;
>>>>>>> +    uint32_t r8_usr;
>>>>>>> +    uint32_t r9_usr;
>>>>>>> +    uint32_t r10_usr;
>>>>>>> +    uint32_t r12_usr;
>>>>>>> +    uint32_t lr_usr;
>>>>>>> +    uint32_t fp;
>>>>>>> +    uint32_t pc;
>>>>>>> +    uint32_t sp_usr;
>>>>>>> +    uint32_t sp_svc;
>>>>>>> +    uint32_t spsr_svc;
>>>>>>> +};
>>>>>>
>>>>>>
>>>>>> It would seem more natural for the "ordinary" registers to be
>>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>>
>>>>>
>>>>> Not sure I follow.
>>>>
>>>>
>>>> For one it is quite natural for someone looking at a sequence of
>>>> register values to assume / expect them to be in natural order.
>>>> And then, having them in natural (numeric) order allows e.g.
>>>> extracting the register identifying bits from an instruction to use
>>>> them as an array index into (part of) this structure.
>>>>
>>>> (For some background: I've been bitten a number of times by
>>>> people sorting x86 registers alphabetically instead or naturally,
>>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>>
>>>
>>> I see, however I believe that would be a very careless use of this struct
>>> from the user as the register sizes are not even necessarily match the
>>> architecture. For example we only define the 64bit x86 registers, so
>>> trying
>>> to access it as an array of 32bit registers wouldn't work at all. Plus we
>>> are not doing a full set of registers, and I rather not imply that every
>>> element in the "natural sequence" is present. It may be, it may be not.
>>
>>
>> Once an ABI is set in stone, and if that ABI allows for optimizations
>> (by consumers) like the one mentioned, I don't think this would be
>> careless use. The resulting code is very clearly much more efficient
>> than e.g. a switch() statement with a case label for each and every
>> register. Well, yes, I already hear the "memory is cheap and hence
>> code size doesn't matter" argument, but as said elsewhere quite
>> recently I don't buy this.
>
>
> I agree with Jan here.
>
> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
> register based on the index will be quite big.
>
> If you order the register and provide all the general purposes registers (x0
> - x30), you will be able to replace by a single line (for instance see
> select_user_reg in arch/arm/traps.c).

The issue is that the x86 vm_event struct right now has 32*uint64_t
size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
TTBR0/1 we would end up growing this structure beyond what it is
currently. I want to avoid that as it affects both ARM32 and x86
introspection applications as well as now we can hold fewer events on
the ring. I would say it is better to avoid that then potentially
saving some on a switch in ARM64 introspection applications.

>
> This will also likely speed up your introspection application.

Win some, lose some. I would prefer if these changes had no
cross-architectural effect unless absolutely required. Razvan, what do
you think?

Tamas
Razvan Cojocaru June 1, 2016, 7:34 p.m. UTC | #19
On 06/01/16 21:21, Tamas K Lengyel wrote:
> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
>> Hi,
>>
>>
>> On 01/06/16 09:41, Jan Beulich wrote:
>>>>>>
>>>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>>>>
>>>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>>
>>>>>
>>>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>>>>
>>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>>>>
>>>>>>>> +struct vm_event_regs_arm32 {
>>>>>>>> +    uint32_t r0_usr;
>>>>>>>> +    uint32_t r1_usr;
>>>>>>>> +    uint32_t r2_usr;
>>>>>>>> +    uint32_t r3_usr;
>>>>>>>> +    uint32_t r4_usr;
>>>>>>>> +    uint32_t r5_usr;
>>>>>>>> +    uint32_t r6_usr;
>>>>>>>> +    uint32_t r7_usr;
>>>>>>>> +    uint32_t r8_usr;
>>>>>>>> +    uint32_t r9_usr;
>>>>>>>> +    uint32_t r10_usr;
>>>>>>>> +    uint32_t r12_usr;
>>>>>>>> +    uint32_t lr_usr;
>>>>>>>> +    uint32_t fp;
>>>>>>>> +    uint32_t pc;
>>>>>>>> +    uint32_t sp_usr;
>>>>>>>> +    uint32_t sp_svc;
>>>>>>>> +    uint32_t spsr_svc;
>>>>>>>> +};
>>>>>>>
>>>>>>>
>>>>>>> It would seem more natural for the "ordinary" registers to be
>>>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>>>
>>>>>>
>>>>>> Not sure I follow.
>>>>>
>>>>>
>>>>> For one it is quite natural for someone looking at a sequence of
>>>>> register values to assume / expect them to be in natural order.
>>>>> And then, having them in natural (numeric) order allows e.g.
>>>>> extracting the register identifying bits from an instruction to use
>>>>> them as an array index into (part of) this structure.
>>>>>
>>>>> (For some background: I've been bitten a number of times by
>>>>> people sorting x86 registers alphabetically instead or naturally,
>>>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>>>
>>>>
>>>> I see, however I believe that would be a very careless use of this struct
>>>> from the user as the register sizes are not even necessarily match the
>>>> architecture. For example we only define the 64bit x86 registers, so
>>>> trying
>>>> to access it as an array of 32bit registers wouldn't work at all. Plus we
>>>> are not doing a full set of registers, and I rather not imply that every
>>>> element in the "natural sequence" is present. It may be, it may be not.
>>>
>>>
>>> Once an ABI is set in stone, and if that ABI allows for optimizations
>>> (by consumers) like the one mentioned, I don't think this would be
>>> careless use. The resulting code is very clearly much more efficient
>>> than e.g. a switch() statement with a case label for each and every
>>> register. Well, yes, I already hear the "memory is cheap and hence
>>> code size doesn't matter" argument, but as said elsewhere quite
>>> recently I don't buy this.
>>
>>
>> I agree with Jan here.
>>
>> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
>> register based on the index will be quite big.
>>
>> If you order the register and provide all the general purposes registers (x0
>> - x30), you will be able to replace by a single line (for instance see
>> select_user_reg in arch/arm/traps.c).
> 
> The issue is that the x86 vm_event struct right now has 32*uint64_t
> size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
> TTBR0/1 we would end up growing this structure beyond what it is
> currently. I want to avoid that as it affects both ARM32 and x86
> introspection applications as well as now we can hold fewer events on
> the ring. I would say it is better to avoid that then potentially
> saving some on a switch in ARM64 introspection applications.
> 
>>
>> This will also likely speed up your introspection application.
> 
> Win some, lose some. I would prefer if these changes had no
> cross-architectural effect unless absolutely required. Razvan, what do
> you think?

I think that if we keep a single page sized event ring buffer it would
be best to only send what consumers need right now.

The only purpose of having that information in the request is to quickly
get things that are immediately necessary - otherwise the full context
can be obtained with xc_domain_hvm_getcontext_partial().

As long as there's a comment present that says that this is all the
information needed at this time, and the struct can grow if needs
change, it might be best not to add unnecessary data just in case
somebody will need it later.

Having said that, growing the struct by about 16 bytes or so shouldn't
pose huge problems (I'm not sure I've calculated the correct size based
on your previous message). Our application only uses sync-style events,
so the ring buffer can only be full when all VCPUs are blocked, so even
a one page ring buffer should be enough for most sensible applications
right now.

But, the way to go for the future, as I've said before, is IMHO to grow
the ring buffer and possibly add all the data that
xc_domain_hvm_getcontext_partial() now retrieves to the events.

To summarize, FWIW I think we should hold back on including all
available data to requests as long as we keep using a one page ring
buffer, but growing the structure by not much is acceptable.


Thanks,
Razvan
Julien Grall June 1, 2016, 7:38 p.m. UTC | #20
Hi Tamas,

On 01/06/16 19:21, Tamas K Lengyel wrote:
> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
>> Hi,
>>
>>
>> On 01/06/16 09:41, Jan Beulich wrote:
>>>>>>
>>>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>>>>
>>>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>>
>>>>>
>>>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>>>>
>>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>>>>
>>>>>>>> +struct vm_event_regs_arm32 {
>>>>>>>> +    uint32_t r0_usr;
>>>>>>>> +    uint32_t r1_usr;
>>>>>>>> +    uint32_t r2_usr;
>>>>>>>> +    uint32_t r3_usr;
>>>>>>>> +    uint32_t r4_usr;
>>>>>>>> +    uint32_t r5_usr;
>>>>>>>> +    uint32_t r6_usr;
>>>>>>>> +    uint32_t r7_usr;
>>>>>>>> +    uint32_t r8_usr;
>>>>>>>> +    uint32_t r9_usr;
>>>>>>>> +    uint32_t r10_usr;
>>>>>>>> +    uint32_t r12_usr;
>>>>>>>> +    uint32_t lr_usr;
>>>>>>>> +    uint32_t fp;
>>>>>>>> +    uint32_t pc;
>>>>>>>> +    uint32_t sp_usr;
>>>>>>>> +    uint32_t sp_svc;
>>>>>>>> +    uint32_t spsr_svc;
>>>>>>>> +};
>>>>>>>
>>>>>>>
>>>>>>> It would seem more natural for the "ordinary" registers to be
>>>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>>>
>>>>>>
>>>>>> Not sure I follow.
>>>>>
>>>>>
>>>>> For one it is quite natural for someone looking at a sequence of
>>>>> register values to assume / expect them to be in natural order.
>>>>> And then, having them in natural (numeric) order allows e.g.
>>>>> extracting the register identifying bits from an instruction to use
>>>>> them as an array index into (part of) this structure.
>>>>>
>>>>> (For some background: I've been bitten a number of times by
>>>>> people sorting x86 registers alphabetically instead or naturally,
>>>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>>>
>>>>
>>>> I see, however I believe that would be a very careless use of this struct
>>>> from the user as the register sizes are not even necessarily match the
>>>> architecture. For example we only define the 64bit x86 registers, so
>>>> trying
>>>> to access it as an array of 32bit registers wouldn't work at all. Plus we
>>>> are not doing a full set of registers, and I rather not imply that every
>>>> element in the "natural sequence" is present. It may be, it may be not.
>>>
>>>
>>> Once an ABI is set in stone, and if that ABI allows for optimizations
>>> (by consumers) like the one mentioned, I don't think this would be
>>> careless use. The resulting code is very clearly much more efficient
>>> than e.g. a switch() statement with a case label for each and every
>>> register. Well, yes, I already hear the "memory is cheap and hence
>>> code size doesn't matter" argument, but as said elsewhere quite
>>> recently I don't buy this.
>>
>>
>> I agree with Jan here.
>>
>> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
>> register based on the index will be quite big.
>>
>> If you order the register and provide all the general purposes registers (x0
>> - x30), you will be able to replace by a single line (for instance see
>> select_user_reg in arch/arm/traps.c).
>
> The issue is that the x86 vm_event struct right now has 32*uint64_t
> size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
> TTBR0/1 we would end up growing this structure beyond what it is
> currently. I want to avoid that as it affects both ARM32 and x86
> introspection applications as well as now we can hold fewer events on
> the ring. I would say it is better to avoid that then potentially
> saving some on a switch in ARM64 introspection applications.

The design choice made for x86 should not impact the ARM design (and 
vice-versa). There are key structures in the public interface which 
differ between x86 and ARM (see arch_vcpu_info and arch_shared_info). 
And this is fine because Xen is not meant to run an x86 guest on an ARM 
hypervisor.

As far as I can tell, we currently support VM_EVENT_REASON_MEM_ACCESS 
and VM_EVENT_REASON_GUEST_REQUEST. So technically the structure is set 
in stone. However, we have an interface version that could be bumped, we 
can still decide what is the sensible choice.

With your suggestions only a part of the general-purpose registers will 
be present in the vm_event. I understand that the ring has a limited 
size. If I counted correctly, the current size of the vm_event structure 
is 304 bytes. So 15 vm_event slots are available.

If we grow the structure for ARM64, i.e 3 64-bits registers (PC, TTBR0, 
TTBR1) and 1 32-bit register (CPSR). Which mean 311 bytes, i.e 13 
vm_event slots.

If the vm event subsystem is under pressure, I admit that 2 slots could 
be a lot. However, as not all the GPRs will be available in the 
structure you may have to fetch them, through XEN_DOMCTL_getvcpucontext 
I guess (?). The impact of the introspection to the guest will be 
significant.

I cannot tell how often this will be the case, but I can tell you a 
compiler can do anything with the register allocation. I.e it could 
decide to privilege allocation in registers x20-x30 (because big number 
are nicer).

If you are still concern about the pressure on the ring page, Razvan 
suggested to support multi-ring page.

Regards,
Julien Grall June 1, 2016, 7:43 p.m. UTC | #21
Hi Razvan,

On 01/06/16 20:34, Razvan Cojocaru wrote:
> On 06/01/16 21:21, Tamas K Lengyel wrote:
>> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
> The only purpose of having that information in the request is to quickly
> get things that are immediately necessary - otherwise the full context
> can be obtained with xc_domain_hvm_getcontext_partial().

xc_domain_hvm_getcontext_partial is not yet implemented on ARM. So 
currently the only solution to get the other registers will be 
XEN_DOMCTL_getvcpucontext.

Regards,
Julien Grall June 1, 2016, 7:49 p.m. UTC | #22
On 01/06/16 20:38, Julien Grall wrote:
> Hi Tamas,
>
> On 01/06/16 19:21, Tamas K Lengyel wrote:
>> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com>
>> wrote:
>>> Hi,
>>>
>>>
>>> On 01/06/16 09:41, Jan Beulich wrote:
>>>>>>>
>>>>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>>>>>
>>>>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>>>
>>>>>>
>>>>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>>>>>
>>>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com>
>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>>>>>
>>>>>>>>> +struct vm_event_regs_arm32 {
>>>>>>>>> +    uint32_t r0_usr;
>>>>>>>>> +    uint32_t r1_usr;
>>>>>>>>> +    uint32_t r2_usr;
>>>>>>>>> +    uint32_t r3_usr;
>>>>>>>>> +    uint32_t r4_usr;
>>>>>>>>> +    uint32_t r5_usr;
>>>>>>>>> +    uint32_t r6_usr;
>>>>>>>>> +    uint32_t r7_usr;
>>>>>>>>> +    uint32_t r8_usr;
>>>>>>>>> +    uint32_t r9_usr;
>>>>>>>>> +    uint32_t r10_usr;
>>>>>>>>> +    uint32_t r12_usr;
>>>>>>>>> +    uint32_t lr_usr;
>>>>>>>>> +    uint32_t fp;
>>>>>>>>> +    uint32_t pc;
>>>>>>>>> +    uint32_t sp_usr;
>>>>>>>>> +    uint32_t sp_svc;
>>>>>>>>> +    uint32_t spsr_svc;
>>>>>>>>> +};
>>>>>>>>
>>>>>>>>
>>>>>>>> It would seem more natural for the "ordinary" registers to be
>>>>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>>>>
>>>>>>>
>>>>>>> Not sure I follow.
>>>>>>
>>>>>>
>>>>>> For one it is quite natural for someone looking at a sequence of
>>>>>> register values to assume / expect them to be in natural order.
>>>>>> And then, having them in natural (numeric) order allows e.g.
>>>>>> extracting the register identifying bits from an instruction to use
>>>>>> them as an array index into (part of) this structure.
>>>>>>
>>>>>> (For some background: I've been bitten a number of times by
>>>>>> people sorting x86 registers alphabetically instead or naturally,
>>>>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>>>>
>>>>>
>>>>> I see, however I believe that would be a very careless use of this
>>>>> struct
>>>>> from the user as the register sizes are not even necessarily match the
>>>>> architecture. For example we only define the 64bit x86 registers, so
>>>>> trying
>>>>> to access it as an array of 32bit registers wouldn't work at all.
>>>>> Plus we
>>>>> are not doing a full set of registers, and I rather not imply that
>>>>> every
>>>>> element in the "natural sequence" is present. It may be, it may be
>>>>> not.
>>>>
>>>>
>>>> Once an ABI is set in stone, and if that ABI allows for optimizations
>>>> (by consumers) like the one mentioned, I don't think this would be
>>>> careless use. The resulting code is very clearly much more efficient
>>>> than e.g. a switch() statement with a case label for each and every
>>>> register. Well, yes, I already hear the "memory is cheap and hence
>>>> code size doesn't matter" argument, but as said elsewhere quite
>>>> recently I don't buy this.
>>>
>>>
>>> I agree with Jan here.
>>>
>>> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
>>> register based on the index will be quite big.
>>>
>>> If you order the register and provide all the general purposes
>>> registers (x0
>>> - x30), you will be able to replace by a single line (for instance see
>>> select_user_reg in arch/arm/traps.c).
>>
>> The issue is that the x86 vm_event struct right now has 32*uint64_t
>> size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
>> TTBR0/1 we would end up growing this structure beyond what it is
>> currently. I want to avoid that as it affects both ARM32 and x86
>> introspection applications as well as now we can hold fewer events on
>> the ring. I would say it is better to avoid that then potentially
>> saving some on a switch in ARM64 introspection applications.
>
> The design choice made for x86 should not impact the ARM design (and
> vice-versa). There are key structures in the public interface which
> differ between x86 and ARM (see arch_vcpu_info and arch_shared_info).
> And this is fine because Xen is not meant to run an x86 guest on an ARM
> hypervisor.
>
> As far as I can tell, we currently support VM_EVENT_REASON_MEM_ACCESS
> and VM_EVENT_REASON_GUEST_REQUEST. So technically the structure is set
> in stone. However, we have an interface version that could be bumped, we
> can still decide what is the sensible choice.
>
> With your suggestions only a part of the general-purpose registers will
> be present in the vm_event. I understand that the ring has a limited
> size. If I counted correctly, the current size of the vm_event structure
> is 304 bytes. So 15 vm_event slots are available.
>
> If we grow the structure for ARM64, i.e 3 64-bits registers (PC, TTBR0,
> TTBR1) and 1 32-bit register (CPSR). Which mean 311 bytes, i.e 13

Sorry I miscalculated. So it should be:
   - 332 bytes
   - 12 slots

> vm_event slots.
>
> If the vm event subsystem is under pressure, I admit that 2 slots could
> be a lot. However, as not all the GPRs will be available in the
> structure you may have to fetch them, through XEN_DOMCTL_getvcpucontext
> I guess (?). The impact of the introspection to the guest will be
> significant.
>
> I cannot tell how often this will be the case, but I can tell you a
> compiler can do anything with the register allocation. I.e it could
> decide to privilege allocation in registers x20-x30 (because big number
> are nicer).
>
> If you are still concern about the pressure on the ring page, Razvan
> suggested to support multi-ring page.
>
> Regards,
>
Tamas K Lengyel June 1, 2016, 7:50 p.m. UTC | #23
On Wed, Jun 1, 2016 at 1:38 PM, Julien Grall <julien.grall@arm.com> wrote:
> Hi Tamas,
>
>
> On 01/06/16 19:21, Tamas K Lengyel wrote:
>>
>> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
>>>
>>> Hi,
>>>
>>>
>>> On 01/06/16 09:41, Jan Beulich wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 31.05.16 at 18:28, <tamas@tklengyel.com> wrote:
>>>>>
>>>>>
>>>>> On May 31, 2016 01:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>>>> On 30.05.16 at 21:47, <tamas@tklengyel.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>> On Mon, May 30, 2016 at 5:50 AM, Jan Beulich <JBeulich@suse.com>
>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 30.05.16 at 00:37, <tamas@tklengyel.com> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> +struct vm_event_regs_arm32 {
>>>>>>>>> +    uint32_t r0_usr;
>>>>>>>>> +    uint32_t r1_usr;
>>>>>>>>> +    uint32_t r2_usr;
>>>>>>>>> +    uint32_t r3_usr;
>>>>>>>>> +    uint32_t r4_usr;
>>>>>>>>> +    uint32_t r5_usr;
>>>>>>>>> +    uint32_t r6_usr;
>>>>>>>>> +    uint32_t r7_usr;
>>>>>>>>> +    uint32_t r8_usr;
>>>>>>>>> +    uint32_t r9_usr;
>>>>>>>>> +    uint32_t r10_usr;
>>>>>>>>> +    uint32_t r12_usr;
>>>>>>>>> +    uint32_t lr_usr;
>>>>>>>>> +    uint32_t fp;
>>>>>>>>> +    uint32_t pc;
>>>>>>>>> +    uint32_t sp_usr;
>>>>>>>>> +    uint32_t sp_svc;
>>>>>>>>> +    uint32_t spsr_svc;
>>>>>>>>> +};
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> It would seem more natural for the "ordinary" registers to be
>>>>>>>> arranged in the numerical sequence, i.e. fp, r12, sp, lr, pc.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Not sure I follow.
>>>>>>
>>>>>>
>>>>>>
>>>>>> For one it is quite natural for someone looking at a sequence of
>>>>>> register values to assume / expect them to be in natural order.
>>>>>> And then, having them in natural (numeric) order allows e.g.
>>>>>> extracting the register identifying bits from an instruction to use
>>>>>> them as an array index into (part of) this structure.
>>>>>>
>>>>>> (For some background: I've been bitten a number of times by
>>>>>> people sorting x86 registers alphabetically instead or naturally,
>>>>>> i.e. EAX, EBX, ECX, EDX instead of EAX, ECX, EDX, EBX).
>>>>>
>>>>>
>>>>>
>>>>> I see, however I believe that would be a very careless use of this
>>>>> struct
>>>>> from the user as the register sizes are not even necessarily match the
>>>>> architecture. For example we only define the 64bit x86 registers, so
>>>>> trying
>>>>> to access it as an array of 32bit registers wouldn't work at all. Plus
>>>>> we
>>>>> are not doing a full set of registers, and I rather not imply that
>>>>> every
>>>>> element in the "natural sequence" is present. It may be, it may be not.
>>>>
>>>>
>>>>
>>>> Once an ABI is set in stone, and if that ABI allows for optimizations
>>>> (by consumers) like the one mentioned, I don't think this would be
>>>> careless use. The resulting code is very clearly much more efficient
>>>> than e.g. a switch() statement with a case label for each and every
>>>> register. Well, yes, I already hear the "memory is cheap and hence
>>>> code size doesn't matter" argument, but as said elsewhere quite
>>>> recently I don't buy this.
>>>
>>>
>>>
>>> I agree with Jan here.
>>>
>>> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
>>> register based on the index will be quite big.
>>>
>>> If you order the register and provide all the general purposes registers
>>> (x0
>>> - x30), you will be able to replace by a single line (for instance see
>>> select_user_reg in arch/arm/traps.c).
>>
>>
>> The issue is that the x86 vm_event struct right now has 32*uint64_t
>> size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
>> TTBR0/1 we would end up growing this structure beyond what it is
>> currently. I want to avoid that as it affects both ARM32 and x86
>> introspection applications as well as now we can hold fewer events on
>> the ring. I would say it is better to avoid that then potentially
>> saving some on a switch in ARM64 introspection applications.
>
>
> The design choice made for x86 should not impact the ARM design (and
> vice-versa). There are key structures in the public interface which differ
> between x86 and ARM (see arch_vcpu_info and arch_shared_info). And this is
> fine because Xen is not meant to run an x86 guest on an ARM hypervisor.
>
> As far as I can tell, we currently support VM_EVENT_REASON_MEM_ACCESS and
> VM_EVENT_REASON_GUEST_REQUEST. So technically the structure is set in stone.
> However, we have an interface version that could be bumped, we can still
> decide what is the sensible choice.
>
> With your suggestions only a part of the general-purpose registers will be
> present in the vm_event. I understand that the ring has a limited size. If I
> counted correctly, the current size of the vm_event structure is 304 bytes.
> So 15 vm_event slots are available.
>
> If we grow the structure for ARM64, i.e 3 64-bits registers (PC, TTBR0,
> TTBR1) and 1 32-bit register (CPSR). Which mean 311 bytes, i.e 13 vm_event
> slots.
>
> If the vm event subsystem is under pressure, I admit that 2 slots could be a
> lot. However, as not all the GPRs will be available in the structure you may
> have to fetch them, through XEN_DOMCTL_getvcpucontext I guess (?). The
> impact of the introspection to the guest will be significant.
>
> I cannot tell how often this will be the case, but I can tell you a compiler
> can do anything with the register allocation. I.e it could decide to
> privilege allocation in registers x20-x30 (because big number are nicer).

Fair point.

>
> If you are still concern about the pressure on the ring page, Razvan
> suggested to support multi-ring page.

Going the multi-page ring route is a bit beyond of what I would like
to get reasonable done right now. As per Razvan's comment growing the
struct size is not that big of a concern so I'll include all ARM64
GPRs in the next revision.

Thanks,
Tamas
Jan Beulich June 2, 2016, 7:35 a.m. UTC | #24
>>> On 01.06.16 at 21:34, <rcojocaru@bitdefender.com> wrote:
> On 06/01/16 21:21, Tamas K Lengyel wrote:
>> On Wed, Jun 1, 2016 at 5:24 AM, Julien Grall <julien.grall@arm.com> wrote:
>>> On 01/06/16 09:41, Jan Beulich wrote:
>>>> Once an ABI is set in stone, and if that ABI allows for optimizations
>>>> (by consumers) like the one mentioned, I don't think this would be
>>>> careless use. The resulting code is very clearly much more efficient
>>>> than e.g. a switch() statement with a case label for each and every
>>>> register. Well, yes, I already hear the "memory is cheap and hence
>>>> code size doesn't matter" argument, but as said elsewhere quite
>>>> recently I don't buy this.
>>>
>>>
>>> I agree with Jan here.
>>>
>>> ARM64 has 31 general purposes registers (x0-x30). The switch to find a
>>> register based on the index will be quite big.
>>>
>>> If you order the register and provide all the general purposes registers (x0
>>> - x30), you will be able to replace by a single line (for instance see
>>> select_user_reg in arch/arm/traps.c).
>> 
>> The issue is that the x86 vm_event struct right now has 32*uint64_t
>> size. So if we would want to transmit all ARM64 GPRs + cpsr, PC and
>> TTBR0/1 we would end up growing this structure beyond what it is
>> currently. I want to avoid that as it affects both ARM32 and x86
>> introspection applications as well as now we can hold fewer events on
>> the ring. I would say it is better to avoid that then potentially
>> saving some on a switch in ARM64 introspection applications.
>> 
>>>
>>> This will also likely speed up your introspection application.
>> 
>> Win some, lose some. I would prefer if these changes had no
>> cross-architectural effect unless absolutely required. Razvan, what do
>> you think?

Perhaps that cross architectural effect then hints at some design
shortcoming? I.e. things should have been arranged for changes
to one arch'es register set to _never_ affect others.

> I think that if we keep a single page sized event ring buffer it would
> be best to only send what consumers need right now.
> 
> The only purpose of having that information in the request is to quickly
> get things that are immediately necessary - otherwise the full context
> can be obtained with xc_domain_hvm_getcontext_partial().
> 
> As long as there's a comment present that says that this is all the
> information needed at this time, and the struct can grow if needs
> change, it might be best not to add unnecessary data just in case
> somebody will need it later.

Well, that's not quite enough I would say (and this is as a general
remark, as I've peeked ahead and hence did see that Tamas agreed
to include all GPRs): The criteria for inclusion or exclusion should
follow a predictable model. I.e. if someone comes along and says
"I need register Y", then there should be rules that (s)he can apply
up front to determine what (at least in the vast majority of cases)
the response is going to be. You saying "I need only this arbitrary
subset of registers" is completely in-transparent, as it leaves open
why this is so, and why this would also be so for others. I.e. you'd
at least need to answer the question why (as an example) you
need x5 included, but not x25, despite both registers being equal
from an architecture pov. An answer like "My application gets away
with it" is not acceptable here.

Jan
Razvan Cojocaru June 2, 2016, 8:26 a.m. UTC | #25
On 06/02/2016 10:35 AM, Jan Beulich wrote:
> The criteria for inclusion or exclusion should
> follow a predictable model. I.e. if someone comes along and says
> "I need register Y", then there should be rules that (s)he can apply
> up front to determine what (at least in the vast majority of cases)
> the response is going to be. You saying "I need only this arbitrary
> subset of registers" is completely in-transparent, as it leaves open
> why this is so, and why this would also be so for others. I.e. you'd
> at least need to answer the question why (as an example) you
> need x5 included, but not x25, despite both registers being equal
> from an architecture pov. An answer like "My application gets away
> with it" is not acceptable here.

There will be trade-offs. Again, my preference would be to have a
multi-page ring buffer with all the VCPU context included in the
request, always. This would work best as far as both completeness and
performance goes.

But that is obviously not a trivial change and I can see how Tamas would
prefer to get things done sooner along the lines of the pre-existing
design (whatever its merits and demerits are).

As long as we're using a smaller ring buffer, the answer (at least as
I've meant it) is not "_my_ application gets away with it" - which is
indeed not the most reasonable statement, but "_current_ introspection
applications - of which my application is one of - don't need more than
this at this time". So in the spirit of "the rest of the context is a
hypercall away" + we don't have that much space available in the ring
buffer, the whole context is not included. The assumption here being
that once another application (or even our application, at a later time)
needs to enlarge the set of registers sent with a request, it will be
requested here and we can discuss what that entails.


Thanks,
Razvan
Jan Beulich June 2, 2016, 9:38 a.m. UTC | #26
>>> On 02.06.16 at 10:26, <rcojocaru@bitdefender.com> wrote:
> On 06/02/2016 10:35 AM, Jan Beulich wrote:
>> The criteria for inclusion or exclusion should
>> follow a predictable model. I.e. if someone comes along and says
>> "I need register Y", then there should be rules that (s)he can apply
>> up front to determine what (at least in the vast majority of cases)
>> the response is going to be. You saying "I need only this arbitrary
>> subset of registers" is completely in-transparent, as it leaves open
>> why this is so, and why this would also be so for others. I.e. you'd
>> at least need to answer the question why (as an example) you
>> need x5 included, but not x25, despite both registers being equal
>> from an architecture pov. An answer like "My application gets away
>> with it" is not acceptable here.
> 
> There will be trade-offs. Again, my preference would be to have a
> multi-page ring buffer with all the VCPU context included in the
> request, always. This would work best as far as both completeness and
> performance goes.
> 
> But that is obviously not a trivial change and I can see how Tamas would
> prefer to get things done sooner along the lines of the pre-existing
> design (whatever its merits and demerits are).
> 
> As long as we're using a smaller ring buffer, the answer (at least as
> I've meant it) is not "_my_ application gets away with it" - which is
> indeed not the most reasonable statement, but "_current_ introspection
> applications - of which my application is one of - don't need more than
> this at this time".

But then could one of you finally say _why_ that is? I.e. why some
GPRs are "better" to your applications than others? If you need
access to _any_ GPRs that don't have a special purpose (like PC,
SP, LR, and maybe FP; I realize this is more applicable to ARM32
than ARM64), I would suspect you need them to e.g. obtain
instruction operands. Yet then I can't see why some of them are
needed while others aren't. I could mayve see you wanting access
to function argument and return value registers, but afaict the
proposed set isn't matching that set either, plus that's a guest ABI
thing anyway, i.e. would again be arbitrary due to you limiting
things to work with just certain guests.

Jan
Razvan Cojocaru June 2, 2016, 9:42 a.m. UTC | #27
On 06/02/2016 12:38 PM, Jan Beulich wrote:
>>>> On 02.06.16 at 10:26, <rcojocaru@bitdefender.com> wrote:
>> On 06/02/2016 10:35 AM, Jan Beulich wrote:
>>> The criteria for inclusion or exclusion should
>>> follow a predictable model. I.e. if someone comes along and says
>>> "I need register Y", then there should be rules that (s)he can apply
>>> up front to determine what (at least in the vast majority of cases)
>>> the response is going to be. You saying "I need only this arbitrary
>>> subset of registers" is completely in-transparent, as it leaves open
>>> why this is so, and why this would also be so for others. I.e. you'd
>>> at least need to answer the question why (as an example) you
>>> need x5 included, but not x25, despite both registers being equal
>>> from an architecture pov. An answer like "My application gets away
>>> with it" is not acceptable here.
>>
>> There will be trade-offs. Again, my preference would be to have a
>> multi-page ring buffer with all the VCPU context included in the
>> request, always. This would work best as far as both completeness and
>> performance goes.
>>
>> But that is obviously not a trivial change and I can see how Tamas would
>> prefer to get things done sooner along the lines of the pre-existing
>> design (whatever its merits and demerits are).
>>
>> As long as we're using a smaller ring buffer, the answer (at least as
>> I've meant it) is not "_my_ application gets away with it" - which is
>> indeed not the most reasonable statement, but "_current_ introspection
>> applications - of which my application is one of - don't need more than
>> this at this time".
> 
> But then could one of you finally say _why_ that is? I.e. why some
> GPRs are "better" to your applications than others? If you need
> access to _any_ GPRs that don't have a special purpose (like PC,
> SP, LR, and maybe FP; I realize this is more applicable to ARM32
> than ARM64), I would suspect you need them to e.g. obtain
> instruction operands. Yet then I can't see why some of them are
> needed while others aren't. I could mayve see you wanting access
> to function argument and return value registers, but afaict the
> proposed set isn't matching that set either, plus that's a guest ABI
> thing anyway, i.e. would again be arbitrary due to you limiting
> things to work with just certain guests.

Fair point. Unfortunately I can't speak for Tamas' introspection needs
on ARM so he'll have to answer there - for my part, I do include all x86
GPRs in the request.


Thanks,
Razvan
diff mbox

Patch

diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index 344d3ad..7d2641c 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -42,6 +42,7 @@  obj-y += processor.o
 obj-y += smc.o
 obj-$(CONFIG_XSPLICE) += xsplice.o
 obj-y += monitor.o
+obj-y += vm_event.o
 
 #obj-bin-y += ....o
 
diff --git a/xen/arch/arm/vm_event.c b/xen/arch/arm/vm_event.c
new file mode 100644
index 0000000..dcf9f1c
--- /dev/null
+++ b/xen/arch/arm/vm_event.c
@@ -0,0 +1,139 @@ 
+/*
+ * arch/arm/vm_event.c
+ *
+ * Architecture-specific vm_event handling routines
+ *
+ * Copyright (c) 2016 Tamas K Lengyel (tamas@tklengyel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <asm/vm_event.h>
+
+void vm_event_fill_regs(vm_event_request_t *req)
+{
+    const struct cpu_user_regs *regs = guest_cpu_user_regs();
+
+    req->data.regs.arm.cpsr = regs->cpsr;
+    req->data.regs.arm.ttbr0 = READ_SYSREG64(TTBR0_EL1);
+    req->data.regs.arm.ttbr1 = READ_SYSREG64(TTBR1_EL1);
+
+    if ( psr_mode_is_32bit(regs->cpsr) )
+    {
+        req->data.regs.arm.arch.arm32.r0_usr = regs->r0;
+        req->data.regs.arm.arch.arm32.r1_usr = regs->r1;
+        req->data.regs.arm.arch.arm32.r2_usr = regs->r2;
+        req->data.regs.arm.arch.arm32.r3_usr = regs->r3;
+        req->data.regs.arm.arch.arm32.r4_usr = regs->r4;
+        req->data.regs.arm.arch.arm32.r5_usr = regs->r5;
+        req->data.regs.arm.arch.arm32.r6_usr = regs->r6;
+        req->data.regs.arm.arch.arm32.r7_usr = regs->r7;
+        req->data.regs.arm.arch.arm32.r8_usr = regs->r8;
+        req->data.regs.arm.arch.arm32.r9_usr = regs->r9;
+        req->data.regs.arm.arch.arm32.r10_usr = regs->r10;
+        req->data.regs.arm.arch.arm32.r12_usr = regs->r12;
+        req->data.regs.arm.arch.arm32.lr_usr = regs->lr_usr;
+        req->data.regs.arm.arch.arm32.pc = regs->pc32;
+        req->data.regs.arm.arch.arm32.fp = regs->fp;
+        req->data.regs.arm.arch.arm32.sp_usr = regs->sp_usr;
+        req->data.regs.arm.arch.arm32.sp_svc = regs->sp_svc;
+        req->data.regs.arm.arch.arm32.spsr_svc = regs->spsr_svc;
+    }
+#ifdef CONFIG_ARM_64
+    else
+    {
+        req->data.regs.arm.arch.arm64.x0 = regs->x0;
+        req->data.regs.arm.arch.arm64.x1 = regs->x1;
+        req->data.regs.arm.arch.arm64.x2 = regs->x2;
+        req->data.regs.arm.arch.arm64.x3 = regs->x3;
+        req->data.regs.arm.arch.arm64.x4 = regs->x4;
+        req->data.regs.arm.arch.arm64.x5 = regs->x5;
+        req->data.regs.arm.arch.arm64.x6 = regs->x6;
+        req->data.regs.arm.arch.arm64.x7 = regs->x7;
+        req->data.regs.arm.arch.arm64.x8 = regs->x8;
+        req->data.regs.arm.arch.arm64.x9 = regs->x9;
+        req->data.regs.arm.arch.arm64.x10 = regs->x10;
+        req->data.regs.arm.arch.arm64.x16 = regs->x16;
+        req->data.regs.arm.arch.arm64.pc = regs->pc;
+        req->data.regs.arm.arch.arm64.sp_el0 = regs->sp_el0;
+        req->data.regs.arm.arch.arm64.sp_el1 = regs->sp_el1;
+        req->data.regs.arm.arch.arm64.fp = regs->fp;
+        req->data.regs.arm.arch.arm64.lr = regs->lr;
+        req->data.regs.arm.arch.arm64.spsr_el1 = regs->spsr_svc;
+    }
+#endif
+}
+
+void vm_event_set_registers(struct vcpu *v, vm_event_response_t *rsp)
+{
+    struct cpu_user_regs *regs = &v->arch.cpu_info->guest_cpu_user_regs;
+
+    regs->cpsr = rsp->data.regs.arm.cpsr;
+    v->arch.ttbr0 = rsp->data.regs.arm.ttbr0;
+    v->arch.ttbr1 = rsp->data.regs.arm.ttbr1;
+
+    if ( psr_mode_is_32bit(regs->cpsr) )
+    {
+        regs->r0 = rsp->data.regs.arm.arch.arm32.r0_usr;
+        regs->r1 = rsp->data.regs.arm.arch.arm32.r1_usr;
+        regs->r2 = rsp->data.regs.arm.arch.arm32.r2_usr;
+        regs->r3 = rsp->data.regs.arm.arch.arm32.r3_usr;
+        regs->r4 = rsp->data.regs.arm.arch.arm32.r4_usr;
+        regs->r5 = rsp->data.regs.arm.arch.arm32.r5_usr;
+        regs->r6 = rsp->data.regs.arm.arch.arm32.r6_usr;
+        regs->r7 = rsp->data.regs.arm.arch.arm32.r7_usr;
+        regs->r8 = rsp->data.regs.arm.arch.arm32.r8_usr;
+        regs->r9 = rsp->data.regs.arm.arch.arm32.r9_usr;
+        regs->r10 = rsp->data.regs.arm.arch.arm32.r10_usr;
+        regs->r12 = rsp->data.regs.arm.arch.arm32.r12_usr;
+        regs->pc32 = rsp->data.regs.arm.arch.arm32.pc;
+        regs->fp = rsp->data.regs.arm.arch.arm32.fp;
+        regs->lr_usr = rsp->data.regs.arm.arch.arm32.lr_usr;
+        regs->sp_usr = rsp->data.regs.arm.arch.arm32.sp_usr;
+        regs->sp_svc = rsp->data.regs.arm.arch.arm32.sp_svc;
+        regs->spsr_svc = rsp->data.regs.arm.arch.arm32.spsr_svc;
+    }
+#ifdef CONFIG_ARM_64
+    else
+    {
+        regs->x0 = rsp->data.regs.arm.arch.arm64.x0;
+        regs->x1 = rsp->data.regs.arm.arch.arm64.x1;
+        regs->x2 = rsp->data.regs.arm.arch.arm64.x2;
+        regs->x3 = rsp->data.regs.arm.arch.arm64.x3;
+        regs->x4 = rsp->data.regs.arm.arch.arm64.x4;
+        regs->x5 = rsp->data.regs.arm.arch.arm64.x5;
+        regs->x6 = rsp->data.regs.arm.arch.arm64.x6;
+        regs->x7 = rsp->data.regs.arm.arch.arm64.x7;
+        regs->x8 = rsp->data.regs.arm.arch.arm64.x8;
+        regs->x9 = rsp->data.regs.arm.arch.arm64.x9;
+        regs->x10 = rsp->data.regs.arm.arch.arm64.x10;
+        regs->x16 = rsp->data.regs.arm.arch.arm64.x16;
+        regs->pc = rsp->data.regs.arm.arch.arm64.pc;
+        regs->sp_el0 = rsp->data.regs.arm.arch.arm64.sp_el0;
+        regs->sp_el1 = rsp->data.regs.arm.arch.arm64.sp_el1;
+        regs->fp = rsp->data.regs.arm.arch.arm64.fp;
+        regs->lr = rsp->data.regs.arm.arch.arm64.lr;
+        regs->spsr_svc = rsp->data.regs.arm.arch.arm64.spsr_el1;
+    }
+#endif
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-arm/vm_event.h b/xen/include/asm-arm/vm_event.h
index a3fc4ce..a4922b3 100644
--- a/xen/include/asm-arm/vm_event.h
+++ b/xen/include/asm-arm/vm_event.h
@@ -48,15 +48,4 @@  void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp)
     /* Not supported on ARM. */
 }
 
-static inline
-void vm_event_set_registers(struct vcpu *v, vm_event_response_t *rsp)
-{
-    /* Not supported on ARM. */
-}
-
-static inline void vm_event_fill_regs(vm_event_request_t *req)
-{
-    /* Not supported on ARM. */
-}
-
 #endif /* __ASM_ARM_VM_EVENT_H__ */
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 026f42e..cf2077c 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -40,8 +40,4 @@  void vm_event_toggle_singlestep(struct domain *d, struct vcpu *v);
 
 void vm_event_register_write_resume(struct vcpu *v, vm_event_response_t *rsp);
 
-void vm_event_set_registers(struct vcpu *v, vm_event_response_t *rsp);
-
-void vm_event_fill_regs(vm_event_request_t *req);
-
 #endif /* __ASM_X86_VM_EVENT_H__ */
diff --git a/xen/include/public/vm_event.h b/xen/include/public/vm_event.h
index 3acf217..6ff7cc0 100644
--- a/xen/include/public/vm_event.h
+++ b/xen/include/public/vm_event.h
@@ -129,8 +129,8 @@ 
 #define VM_EVENT_X86_XCR0   3
 
 /*
- * Using a custom struct (not hvm_hw_cpu) so as to not fill
- * the vm_event ring buffer too quickly.
+ * Using custom vCPU structs (i.e. not hvm_hw_cpu) for both x86 and ARM
+ * so as to not fill the vm_event ring buffer too quickly.
  */
 struct vm_event_regs_x86 {
     uint64_t rax;
@@ -168,6 +168,59 @@  struct vm_event_regs_x86 {
     uint32_t _pad;
 };
 
+struct vm_event_regs_arm32 {
+    uint32_t r0_usr;
+    uint32_t r1_usr;
+    uint32_t r2_usr;
+    uint32_t r3_usr;
+    uint32_t r4_usr;
+    uint32_t r5_usr;
+    uint32_t r6_usr;
+    uint32_t r7_usr;
+    uint32_t r8_usr;
+    uint32_t r9_usr;
+    uint32_t r10_usr;
+    uint32_t r12_usr;
+    uint32_t lr_usr;
+    uint32_t fp;
+    uint32_t pc;
+    uint32_t sp_usr;
+    uint32_t sp_svc;
+    uint32_t spsr_svc;
+};
+
+struct vm_event_regs_arm64 {
+    uint64_t x0;
+    uint64_t x1;
+    uint64_t x2;
+    uint64_t x3;
+    uint64_t x4;
+    uint64_t x5;
+    uint64_t x6;
+    uint64_t x7;
+    uint64_t x8;
+    uint64_t x9;
+    uint64_t x10;
+    uint64_t x16;
+    uint64_t lr;
+    uint64_t fp;
+    uint64_t pc;
+    uint64_t sp_el0;
+    uint64_t sp_el1;
+    uint32_t spsr_el1;
+    uint32_t _pad;
+};
+
+struct vm_event_regs_arm {
+    uint32_t cpsr; /* PSR_MODE_BIT is set iff arm32 is used below */
+    uint64_t ttbr0;
+    uint64_t ttbr1;
+    union {
+        struct vm_event_regs_arm32 arm32;
+        struct vm_event_regs_arm64 arm64;
+    } arch;
+};
+
 /*
  * mem_access flag definitions
  *
@@ -256,6 +309,7 @@  typedef struct vm_event_st {
     union {
         union {
             struct vm_event_regs_x86 x86;
+            struct vm_event_regs_arm arm;
         } regs;
 
         struct vm_event_emul_read_data emul_read_data;
diff --git a/xen/include/xen/vm_event.h b/xen/include/xen/vm_event.h
index 89e6243..a5767ab 100644
--- a/xen/include/xen/vm_event.h
+++ b/xen/include/xen/vm_event.h
@@ -75,6 +75,9 @@  int vm_event_domctl(struct domain *d, xen_domctl_vm_event_op_t *vec,
 void vm_event_vcpu_pause(struct vcpu *v);
 void vm_event_vcpu_unpause(struct vcpu *v);
 
+void vm_event_fill_regs(vm_event_request_t *req);
+void vm_event_set_registers(struct vcpu *v, vm_event_response_t *rsp);
+
 /*
  * Monitor vm-events
  */