diff mbox series

[V3,21/23] xen/arm: Add mapcache invalidation handling

Message ID 1606732298-22107-22-git-send-email-olekstysh@gmail.com (mailing list archive)
State New, archived
Headers show
Series [V3,01/23] x86/ioreq: Prepare IOREQ feature for making it common | expand

Commit Message

Oleksandr Tyshchenko Nov. 30, 2020, 10:31 a.m. UTC
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>

We need to send mapcache invalidation request to qemu/demu everytime
the page gets removed from a guest.

At the moment, the Arm code doesn't explicitely remove the existing
mapping before inserting the new mapping. Instead, this is done
implicitely by __p2m_set_entry().

So we need to recognize a case when old entry is a RAM page *and*
the new MFN is different in order to set the corresponding flag.
The most suitable place to do this is p2m_free_entry(), there
we can find the correct leaf type. The invalidation request
will be sent in do_trap_hypercall() later on.

Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
CC: Julien Grall <julien.grall@arm.com>

---
Please note, this is a split/cleanup/hardening of Julien's PoC:
"Add support for Guest IO forwarding to a device emulator"

Changes V1 -> V2:
   - new patch, some changes were derived from (+ new explanation):
     xen/ioreq: Make x86's invalidate qemu mapcache handling common
   - put setting of the flag into __p2m_set_entry()
   - clarify the conditions when the flag should be set
   - use domain_has_ioreq_server()
   - update do_trap_hypercall() by adding local variable

Changes V2 -> V3:
   - update patch description
   - move check to p2m_free_entry()
   - add a comment
   - use "curr" instead of "v" in do_trap_hypercall()
---
---
 xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
 xen/arch/arm/traps.c | 13 ++++++++++---
 2 files changed, 26 insertions(+), 11 deletions(-)

Comments

Stefano Stabellini Dec. 10, 2020, 2:30 a.m. UTC | #1
On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> 
> We need to send mapcache invalidation request to qemu/demu everytime
> the page gets removed from a guest.
> 
> At the moment, the Arm code doesn't explicitely remove the existing
> mapping before inserting the new mapping. Instead, this is done
> implicitely by __p2m_set_entry().
> 
> So we need to recognize a case when old entry is a RAM page *and*
> the new MFN is different in order to set the corresponding flag.
> The most suitable place to do this is p2m_free_entry(), there
> we can find the correct leaf type. The invalidation request
> will be sent in do_trap_hypercall() later on.

Why is it sent in do_trap_hypercall() ?


> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> CC: Julien Grall <julien.grall@arm.com>
> 
> ---
> Please note, this is a split/cleanup/hardening of Julien's PoC:
> "Add support for Guest IO forwarding to a device emulator"
> 
> Changes V1 -> V2:
>    - new patch, some changes were derived from (+ new explanation):
>      xen/ioreq: Make x86's invalidate qemu mapcache handling common
>    - put setting of the flag into __p2m_set_entry()
>    - clarify the conditions when the flag should be set
>    - use domain_has_ioreq_server()
>    - update do_trap_hypercall() by adding local variable
> 
> Changes V2 -> V3:
>    - update patch description
>    - move check to p2m_free_entry()
>    - add a comment
>    - use "curr" instead of "v" in do_trap_hypercall()
> ---
> ---
>  xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
>  xen/arch/arm/traps.c | 13 ++++++++++---
>  2 files changed, 26 insertions(+), 11 deletions(-)
> 
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 5b8d494..9674f6f 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -1,6 +1,7 @@
>  #include <xen/cpu.h>
>  #include <xen/domain_page.h>
>  #include <xen/iocap.h>
> +#include <xen/ioreq.h>
>  #include <xen/lib.h>
>  #include <xen/sched.h>
>  #include <xen/softirq.h>
> @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain *p2m,
>      if ( !p2m_is_valid(entry) )
>          return;
>  
> -    /* Nothing to do but updating the stats if the entry is a super-page. */
> -    if ( p2m_is_superpage(entry, level) )
> +    if ( p2m_is_superpage(entry, level) || (level == 3) )
>      {
> -        p2m->stats.mappings[level]--;
> -        return;
> -    }
> +#ifdef CONFIG_IOREQ_SERVER
> +        /*
> +         * If this gets called (non-recursively) then either the entry
> +         * was replaced by an entry with a different base (valid case) or
> +         * the shattering of a superpage was failed (error case).
> +         * So, at worst, the spurious mapcache invalidation might be sent.
> +         */
> +        if ( domain_has_ioreq_server(p2m->domain) &&
> +             (p2m->domain == current->domain) && p2m_is_ram(entry.p2m.type) )
> +            p2m->domain->mapcache_invalidate = true;

Why the (p2m->domain == current->domain) check? Shouldn't we set
mapcache_invalidate to true anyway? What happens if p2m->domain !=
current->domain? We wouldn't want the domain to lose the
mapcache_invalidate notification.


> +#endif
>  
> -    if ( level == 3 )
> -    {
>          p2m->stats.mappings[level]--;
> -        p2m_put_l3_page(entry);
> +        /* Nothing to do if the entry is a super-page. */
> +        if ( level == 3 )
> +            p2m_put_l3_page(entry);
>          return;
>      }
>  
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index b6077d2..151c626 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1443,6 +1443,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>                                const union hsr hsr)
>  {
>      arm_hypercall_fn_t call = NULL;
> +    struct vcpu *curr = current;

Is this just to save 3 characters?


>      BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
>  
> @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>          return;
>      }
>  
> -    current->hcall_preempted = false;
> +    curr->hcall_preempted = false;
>  
>      perfc_incra(hypercalls, *nr);
>      call = arm_hypercall_table[*nr].fn;
> @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>      HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
>  
>  #ifndef NDEBUG
> -    if ( !current->hcall_preempted )
> +    if ( !curr->hcall_preempted )
>      {
>          /* Deliberately corrupt parameter regs used by this hypercall. */
>          switch ( arm_hypercall_table[*nr].nr_args ) {
> @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>  #endif
>  
>      /* Ensure the hypercall trap instruction is re-executed. */
> -    if ( current->hcall_preempted )
> +    if ( curr->hcall_preempted )
>          regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
> +
> +#ifdef CONFIG_IOREQ_SERVER
> +    if ( unlikely(curr->domain->mapcache_invalidate) &&
> +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
> +        ioreq_signal_mapcache_invalidate();

Why not just:

if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
    ioreq_signal_mapcache_invalidate();
Julien Grall Dec. 10, 2020, 6:50 p.m. UTC | #2
Hi Stefano,

On 10/12/2020 02:30, Stefano Stabellini wrote:
> On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> We need to send mapcache invalidation request to qemu/demu everytime
>> the page gets removed from a guest.
>>
>> At the moment, the Arm code doesn't explicitely remove the existing
>> mapping before inserting the new mapping. Instead, this is done
>> implicitely by __p2m_set_entry().
>>
>> So we need to recognize a case when old entry is a RAM page *and*
>> the new MFN is different in order to set the corresponding flag.
>> The most suitable place to do this is p2m_free_entry(), there
>> we can find the correct leaf type. The invalidation request
>> will be sent in do_trap_hypercall() later on.
> 
> Why is it sent in do_trap_hypercall() ?

I believe this is following the approach used by x86. There are actually 
some discussion about it (see [1]).

Leaving aside the toolstack case for now, AFAIK, the only way a guest 
can modify its p2m is via an hypercall. Do you have an example otherwise?

When sending the invalidation request, the vCPU will be blocked until 
all the IOREQ server have acknowledged the invalidation. So the 
hypercall seems to be the best position to do it.

Alternatively, we could use check_for_vcpu_work() to check if the 
mapcache needs to be invalidated. The inconvenience is we would execute 
a few more instructions in each entry/exit path.

> 
> 
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> CC: Julien Grall <julien.grall@arm.com>
>>
>> ---
>> Please note, this is a split/cleanup/hardening of Julien's PoC:
>> "Add support for Guest IO forwarding to a device emulator"
>>
>> Changes V1 -> V2:
>>     - new patch, some changes were derived from (+ new explanation):
>>       xen/ioreq: Make x86's invalidate qemu mapcache handling common
>>     - put setting of the flag into __p2m_set_entry()
>>     - clarify the conditions when the flag should be set
>>     - use domain_has_ioreq_server()
>>     - update do_trap_hypercall() by adding local variable
>>
>> Changes V2 -> V3:
>>     - update patch description
>>     - move check to p2m_free_entry()
>>     - add a comment
>>     - use "curr" instead of "v" in do_trap_hypercall()
>> ---
>> ---
>>   xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
>>   xen/arch/arm/traps.c | 13 ++++++++++---
>>   2 files changed, 26 insertions(+), 11 deletions(-)
>>
>> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
>> index 5b8d494..9674f6f 100644
>> --- a/xen/arch/arm/p2m.c
>> +++ b/xen/arch/arm/p2m.c
>> @@ -1,6 +1,7 @@
>>   #include <xen/cpu.h>
>>   #include <xen/domain_page.h>
>>   #include <xen/iocap.h>
>> +#include <xen/ioreq.h>
>>   #include <xen/lib.h>
>>   #include <xen/sched.h>
>>   #include <xen/softirq.h>
>> @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain *p2m,
>>       if ( !p2m_is_valid(entry) )
>>           return;
>>   
>> -    /* Nothing to do but updating the stats if the entry is a super-page. */
>> -    if ( p2m_is_superpage(entry, level) )
>> +    if ( p2m_is_superpage(entry, level) || (level == 3) )
>>       {
>> -        p2m->stats.mappings[level]--;
>> -        return;
>> -    }
>> +#ifdef CONFIG_IOREQ_SERVER
>> +        /*
>> +         * If this gets called (non-recursively) then either the entry
>> +         * was replaced by an entry with a different base (valid case) or
>> +         * the shattering of a superpage was failed (error case).
>> +         * So, at worst, the spurious mapcache invalidation might be sent.
>> +         */
>> +        if ( domain_has_ioreq_server(p2m->domain) &&
>> +             (p2m->domain == current->domain) && p2m_is_ram(entry.p2m.type) )
>> +            p2m->domain->mapcache_invalidate = true;
> 
> Why the (p2m->domain == current->domain) check? Shouldn't we set
> mapcache_invalidate to true anyway? What happens if p2m->domain !=
> current->domain? We wouldn't want the domain to lose the
> mapcache_invalidate notification.

This is also discussed in [1]. :) The main question is why would a 
toolstack/device model modify the guest memory after boot?

If we assume it does, then the device model would need to pause the 
domain before modifying the RAM.

We also need to make sure that all the IOREQ servers have invalidated
the mapcache before the domain run again.

This would require quite a bit of work. I am not sure the effort is 
worth if there are no active users today.

> 
> 
>> +#endif
>>   
>> -    if ( level == 3 )
>> -    {
>>           p2m->stats.mappings[level]--;
>> -        p2m_put_l3_page(entry);
>> +        /* Nothing to do if the entry is a super-page. */
>> +        if ( level == 3 )
>> +            p2m_put_l3_page(entry);
>>           return;
>>       }
>>   
>> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
>> index b6077d2..151c626 100644
>> --- a/xen/arch/arm/traps.c
>> +++ b/xen/arch/arm/traps.c
>> @@ -1443,6 +1443,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>>                                 const union hsr hsr)
>>   {
>>       arm_hypercall_fn_t call = NULL;
>> +    struct vcpu *curr = current;
> 
> Is this just to save 3 characters?

Because current is not cheap to read and the compiler cannot optimize it 
(we obfuscate it as this is a per-cpu variable). So we commonly store 
'current'  in a local variable if there are multiple use.

> 
> 
>>       BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
>>   
>> @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>>           return;
>>       }
>>   
>> -    current->hcall_preempted = false;
>> +    curr->hcall_preempted = false;
>>   
>>       perfc_incra(hypercalls, *nr);
>>       call = arm_hypercall_table[*nr].fn;
>> @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>>       HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
>>   
>>   #ifndef NDEBUG
>> -    if ( !current->hcall_preempted )
>> +    if ( !curr->hcall_preempted )
>>       {
>>           /* Deliberately corrupt parameter regs used by this hypercall. */
>>           switch ( arm_hypercall_table[*nr].nr_args ) {
>> @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
>>   #endif
>>   
>>       /* Ensure the hypercall trap instruction is re-executed. */
>> -    if ( current->hcall_preempted )
>> +    if ( curr->hcall_preempted )
>>           regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
>> +
>> +#ifdef CONFIG_IOREQ_SERVER
>> +    if ( unlikely(curr->domain->mapcache_invalidate) &&
>> +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
>> +        ioreq_signal_mapcache_invalidate();
> 
> Why not just:
> 
> if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
>      ioreq_signal_mapcache_invalidate();
> 

This seems to match the x86 code. My guess is they tried to prevent the 
cost of the atomic operation if there is no chance mapcache_invalidate 
is true.

I am split whether the first check is worth it. The atomic operation 
should be uncontended most of the time, so it should be quick. But it 
will always be slower than just a read because there is always a store 
involved.

On a related topic, Jan pointed out that the invalidation would not work 
properly if you have multiple vCPU modifying the P2M at the same time.

Cheers,

[1] 
https://lore.kernel.org/xen-devel/f92f62bf-2f8d-34db-4be5-d3e6a4b9d580@suse.com/
Stefano Stabellini Dec. 11, 2020, 1:28 a.m. UTC | #3
On Thu, 10 Dec 2020, Julien Grall wrote:
> On 10/12/2020 02:30, Stefano Stabellini wrote:
> > On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
> > > From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > 
> > > We need to send mapcache invalidation request to qemu/demu everytime
> > > the page gets removed from a guest.
> > > 
> > > At the moment, the Arm code doesn't explicitely remove the existing
> > > mapping before inserting the new mapping. Instead, this is done
> > > implicitely by __p2m_set_entry().
> > > 
> > > So we need to recognize a case when old entry is a RAM page *and*
> > > the new MFN is different in order to set the corresponding flag.
> > > The most suitable place to do this is p2m_free_entry(), there
> > > we can find the correct leaf type. The invalidation request
> > > will be sent in do_trap_hypercall() later on.
> > 
> > Why is it sent in do_trap_hypercall() ?
> 
> I believe this is following the approach used by x86. There are actually some
> discussion about it (see [1]).
> 
> Leaving aside the toolstack case for now, AFAIK, the only way a guest can
> modify its p2m is via an hypercall. Do you have an example otherwise?

OK this is a very important assumption. We should write it down for sure.
I think it is true today on ARM.


> When sending the invalidation request, the vCPU will be blocked until all the
> IOREQ server have acknowledged the invalidation. So the hypercall seems to be
> the best position to do it.
> 
> Alternatively, we could use check_for_vcpu_work() to check if the mapcache
> needs to be invalidated. The inconvenience is we would execute a few more
> instructions in each entry/exit path.

Yeah it would be more natural to call it from check_for_vcpu_work(). If
we put it between #ifdef CONFIG_IOREQ_SERVER it wouldn't be bad. But I
am not a fan of increasing the instructions on the exit path either.
From this point of view, putting it at the end of do_trap_hypercall is a
nice trick actually. Let's just make sure it has a good comment on top.


> > > Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > CC: Julien Grall <julien.grall@arm.com>
> > > 
> > > ---
> > > Please note, this is a split/cleanup/hardening of Julien's PoC:
> > > "Add support for Guest IO forwarding to a device emulator"
> > > 
> > > Changes V1 -> V2:
> > >     - new patch, some changes were derived from (+ new explanation):
> > >       xen/ioreq: Make x86's invalidate qemu mapcache handling common
> > >     - put setting of the flag into __p2m_set_entry()
> > >     - clarify the conditions when the flag should be set
> > >     - use domain_has_ioreq_server()
> > >     - update do_trap_hypercall() by adding local variable
> > > 
> > > Changes V2 -> V3:
> > >     - update patch description
> > >     - move check to p2m_free_entry()
> > >     - add a comment
> > >     - use "curr" instead of "v" in do_trap_hypercall()
> > > ---
> > > ---
> > >   xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
> > >   xen/arch/arm/traps.c | 13 ++++++++++---
> > >   2 files changed, 26 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> > > index 5b8d494..9674f6f 100644
> > > --- a/xen/arch/arm/p2m.c
> > > +++ b/xen/arch/arm/p2m.c
> > > @@ -1,6 +1,7 @@
> > >   #include <xen/cpu.h>
> > >   #include <xen/domain_page.h>
> > >   #include <xen/iocap.h>
> > > +#include <xen/ioreq.h>
> > >   #include <xen/lib.h>
> > >   #include <xen/sched.h>
> > >   #include <xen/softirq.h>
> > > @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain *p2m,
> > >       if ( !p2m_is_valid(entry) )
> > >           return;
> > >   -    /* Nothing to do but updating the stats if the entry is a
> > > super-page. */
> > > -    if ( p2m_is_superpage(entry, level) )
> > > +    if ( p2m_is_superpage(entry, level) || (level == 3) )
> > >       {
> > > -        p2m->stats.mappings[level]--;
> > > -        return;
> > > -    }
> > > +#ifdef CONFIG_IOREQ_SERVER
> > > +        /*
> > > +         * If this gets called (non-recursively) then either the entry
> > > +         * was replaced by an entry with a different base (valid case) or
> > > +         * the shattering of a superpage was failed (error case).
> > > +         * So, at worst, the spurious mapcache invalidation might be
> > > sent.
> > > +         */
> > > +        if ( domain_has_ioreq_server(p2m->domain) &&
> > > +             (p2m->domain == current->domain) &&
> > > p2m_is_ram(entry.p2m.type) )
> > > +            p2m->domain->mapcache_invalidate = true;
> > 
> > Why the (p2m->domain == current->domain) check? Shouldn't we set
> > mapcache_invalidate to true anyway? What happens if p2m->domain !=
> > current->domain? We wouldn't want the domain to lose the
> > mapcache_invalidate notification.
> 
> This is also discussed in [1]. :) The main question is why would a
> toolstack/device model modify the guest memory after boot?
> 
> If we assume it does, then the device model would need to pause the domain
> before modifying the RAM.
> 
> We also need to make sure that all the IOREQ servers have invalidated
> the mapcache before the domain run again.
> 
> This would require quite a bit of work. I am not sure the effort is worth if
> there are no active users today.

OK, that explains why we think p2m->domain == current->domain, but why
do we need to have a check for it right here?

In other words, we don't think it is realistc to get here with
p2m->domain != current->domain, but let's say that we do somehow. What's
the best course of action? Probably, set mapcache_invalidate to true and
possibly print a warning?

Leaving mapcache_invalidate to false doesn't seem to be what we want to
do?

 
> > >       BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
> > >   @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct cpu_user_regs
> > > *regs, register_t *nr,
> > >           return;
> > >       }
> > >   -    current->hcall_preempted = false;
> > > +    curr->hcall_preempted = false;
> > >         perfc_incra(hypercalls, *nr);
> > >       call = arm_hypercall_table[*nr].fn;
> > > @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct cpu_user_regs
> > > *regs, register_t *nr,
> > >       HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
> > >     #ifndef NDEBUG
> > > -    if ( !current->hcall_preempted )
> > > +    if ( !curr->hcall_preempted )
> > >       {
> > >           /* Deliberately corrupt parameter regs used by this hypercall.
> > > */
> > >           switch ( arm_hypercall_table[*nr].nr_args ) {
> > > @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct cpu_user_regs
> > > *regs, register_t *nr,
> > >   #endif
> > >         /* Ensure the hypercall trap instruction is re-executed. */
> > > -    if ( current->hcall_preempted )
> > > +    if ( curr->hcall_preempted )
> > >           regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
> > > +
> > > +#ifdef CONFIG_IOREQ_SERVER
> > > +    if ( unlikely(curr->domain->mapcache_invalidate) &&
> > > +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
> > > +        ioreq_signal_mapcache_invalidate();
> > 
> > Why not just:
> > 
> > if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
> >      ioreq_signal_mapcache_invalidate();
> > 
> 
> This seems to match the x86 code. My guess is they tried to prevent the cost
> of the atomic operation if there is no chance mapcache_invalidate is true.
> 
> I am split whether the first check is worth it. The atomic operation should be
> uncontended most of the time, so it should be quick. But it will always be
> slower than just a read because there is always a store involved.

I am not a fun of optimizations with unclear benefits :-)


> On a related topic, Jan pointed out that the invalidation would not work
> properly if you have multiple vCPU modifying the P2M at the same time.

Uhm, yes.
Oleksandr Tyshchenko Dec. 11, 2020, 11:21 a.m. UTC | #4
On 11.12.20 03:28, Stefano Stabellini wrote:

Hi Julien, Stefano

> On Thu, 10 Dec 2020, Julien Grall wrote:
>> On 10/12/2020 02:30, Stefano Stabellini wrote:
>>> On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>
>>>> We need to send mapcache invalidation request to qemu/demu everytime
>>>> the page gets removed from a guest.
>>>>
>>>> At the moment, the Arm code doesn't explicitely remove the existing
>>>> mapping before inserting the new mapping. Instead, this is done
>>>> implicitely by __p2m_set_entry().
>>>>
>>>> So we need to recognize a case when old entry is a RAM page *and*
>>>> the new MFN is different in order to set the corresponding flag.
>>>> The most suitable place to do this is p2m_free_entry(), there
>>>> we can find the correct leaf type. The invalidation request
>>>> will be sent in do_trap_hypercall() later on.
>>> Why is it sent in do_trap_hypercall() ?
>> I believe this is following the approach used by x86. There are actually some
>> discussion about it (see [1]).
>>
>> Leaving aside the toolstack case for now, AFAIK, the only way a guest can
>> modify its p2m is via an hypercall. Do you have an example otherwise?
> OK this is a very important assumption. We should write it down for sure.
> I think it is true today on ARM.
>
>
>> When sending the invalidation request, the vCPU will be blocked until all the
>> IOREQ server have acknowledged the invalidation. So the hypercall seems to be
>> the best position to do it.
>>
>> Alternatively, we could use check_for_vcpu_work() to check if the mapcache
>> needs to be invalidated. The inconvenience is we would execute a few more
>> instructions in each entry/exit path.
> Yeah it would be more natural to call it from check_for_vcpu_work(). If
> we put it between #ifdef CONFIG_IOREQ_SERVER it wouldn't be bad. But I
> am not a fan of increasing the instructions on the exit path either.
>  From this point of view, putting it at the end of do_trap_hypercall is a
> nice trick actually. Let's just make sure it has a good comment on top.
>
>
>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>> CC: Julien Grall <julien.grall@arm.com>
>>>>
>>>> ---
>>>> Please note, this is a split/cleanup/hardening of Julien's PoC:
>>>> "Add support for Guest IO forwarding to a device emulator"
>>>>
>>>> Changes V1 -> V2:
>>>>      - new patch, some changes were derived from (+ new explanation):
>>>>        xen/ioreq: Make x86's invalidate qemu mapcache handling common
>>>>      - put setting of the flag into __p2m_set_entry()
>>>>      - clarify the conditions when the flag should be set
>>>>      - use domain_has_ioreq_server()
>>>>      - update do_trap_hypercall() by adding local variable
>>>>
>>>> Changes V2 -> V3:
>>>>      - update patch description
>>>>      - move check to p2m_free_entry()
>>>>      - add a comment
>>>>      - use "curr" instead of "v" in do_trap_hypercall()
>>>> ---
>>>> ---
>>>>    xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
>>>>    xen/arch/arm/traps.c | 13 ++++++++++---
>>>>    2 files changed, 26 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
>>>> index 5b8d494..9674f6f 100644
>>>> --- a/xen/arch/arm/p2m.c
>>>> +++ b/xen/arch/arm/p2m.c
>>>> @@ -1,6 +1,7 @@
>>>>    #include <xen/cpu.h>
>>>>    #include <xen/domain_page.h>
>>>>    #include <xen/iocap.h>
>>>> +#include <xen/ioreq.h>
>>>>    #include <xen/lib.h>
>>>>    #include <xen/sched.h>
>>>>    #include <xen/softirq.h>
>>>> @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain *p2m,
>>>>        if ( !p2m_is_valid(entry) )
>>>>            return;
>>>>    -    /* Nothing to do but updating the stats if the entry is a
>>>> super-page. */
>>>> -    if ( p2m_is_superpage(entry, level) )
>>>> +    if ( p2m_is_superpage(entry, level) || (level == 3) )
>>>>        {
>>>> -        p2m->stats.mappings[level]--;
>>>> -        return;
>>>> -    }
>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>> +        /*
>>>> +         * If this gets called (non-recursively) then either the entry
>>>> +         * was replaced by an entry with a different base (valid case) or
>>>> +         * the shattering of a superpage was failed (error case).
>>>> +         * So, at worst, the spurious mapcache invalidation might be
>>>> sent.
>>>> +         */
>>>> +        if ( domain_has_ioreq_server(p2m->domain) &&
>>>> +             (p2m->domain == current->domain) &&
>>>> p2m_is_ram(entry.p2m.type) )
>>>> +            p2m->domain->mapcache_invalidate = true;
>>> Why the (p2m->domain == current->domain) check? Shouldn't we set
>>> mapcache_invalidate to true anyway? What happens if p2m->domain !=
>>> current->domain? We wouldn't want the domain to lose the
>>> mapcache_invalidate notification.
>> This is also discussed in [1]. :) The main question is why would a
>> toolstack/device model modify the guest memory after boot?
>>
>> If we assume it does, then the device model would need to pause the domain
>> before modifying the RAM.
>>
>> We also need to make sure that all the IOREQ servers have invalidated
>> the mapcache before the domain run again.
>>
>> This would require quite a bit of work. I am not sure the effort is worth if
>> there are no active users today.
> OK, that explains why we think p2m->domain == current->domain, but why
> do we need to have a check for it right here?
>
> In other words, we don't think it is realistc to get here with
> p2m->domain != current->domain, but let's say that we do somehow. What's
> the best course of action? Probably, set mapcache_invalidate to true and
> possibly print a warning?
>
> Leaving mapcache_invalidate to false doesn't seem to be what we want to
> do?
>
>   
>>>>        BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
>>>>    @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>            return;
>>>>        }
>>>>    -    current->hcall_preempted = false;
>>>> +    curr->hcall_preempted = false;
>>>>          perfc_incra(hypercalls, *nr);
>>>>        call = arm_hypercall_table[*nr].fn;
>>>> @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>        HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
>>>>      #ifndef NDEBUG
>>>> -    if ( !current->hcall_preempted )
>>>> +    if ( !curr->hcall_preempted )
>>>>        {
>>>>            /* Deliberately corrupt parameter regs used by this hypercall.
>>>> */
>>>>            switch ( arm_hypercall_table[*nr].nr_args ) {
>>>> @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>    #endif
>>>>          /* Ensure the hypercall trap instruction is re-executed. */
>>>> -    if ( current->hcall_preempted )
>>>> +    if ( curr->hcall_preempted )
>>>>            regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
>>>> +
>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>> +    if ( unlikely(curr->domain->mapcache_invalidate) &&
>>>> +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
>>>> +        ioreq_signal_mapcache_invalidate();
>>> Why not just:
>>>
>>> if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
>>>       ioreq_signal_mapcache_invalidate();
>>>
>> This seems to match the x86 code. My guess is they tried to prevent the cost
>> of the atomic operation if there is no chance mapcache_invalidate is true.
>>
>> I am split whether the first check is worth it. The atomic operation should be
>> uncontended most of the time, so it should be quick. But it will always be
>> slower than just a read because there is always a store involved.
> I am not a fun of optimizations with unclear benefits :-)
>
>
>> On a related topic, Jan pointed out that the invalidation would not work
>> properly if you have multiple vCPU modifying the P2M at the same time.
>>
Thanks to Julien, he explained all bits in detail. Indeed I followed how 
it was done on x86 (place where to send the invalidation request, the 
code to check whether the flag is set, which at first glance, appears 
odd, etc)
and review comments (to latch current into the local variable, and make 
sure that domain sends invalidation request on itself).
Regarding what to do if p2m->domain != current->domain in 
p2m_free_entry(). Probably we could set flag only if guest is paused, 
otherwise just print a warning. Thoughts?
Stefano Stabellini Dec. 11, 2020, 7:07 p.m. UTC | #5
On Fri, 11 Dec 2020, Oleksandr wrote:
> On 11.12.20 03:28, Stefano Stabellini wrote:
> > On Thu, 10 Dec 2020, Julien Grall wrote:
> > > On 10/12/2020 02:30, Stefano Stabellini wrote:
> > > > On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
> > > > > From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > > 
> > > > > We need to send mapcache invalidation request to qemu/demu everytime
> > > > > the page gets removed from a guest.
> > > > > 
> > > > > At the moment, the Arm code doesn't explicitely remove the existing
> > > > > mapping before inserting the new mapping. Instead, this is done
> > > > > implicitely by __p2m_set_entry().
> > > > > 
> > > > > So we need to recognize a case when old entry is a RAM page *and*
> > > > > the new MFN is different in order to set the corresponding flag.
> > > > > The most suitable place to do this is p2m_free_entry(), there
> > > > > we can find the correct leaf type. The invalidation request
> > > > > will be sent in do_trap_hypercall() later on.
> > > > Why is it sent in do_trap_hypercall() ?
> > > I believe this is following the approach used by x86. There are actually
> > > some
> > > discussion about it (see [1]).
> > > 
> > > Leaving aside the toolstack case for now, AFAIK, the only way a guest can
> > > modify its p2m is via an hypercall. Do you have an example otherwise?
> > OK this is a very important assumption. We should write it down for sure.
> > I think it is true today on ARM.
> > 
> > 
> > > When sending the invalidation request, the vCPU will be blocked until all
> > > the
> > > IOREQ server have acknowledged the invalidation. So the hypercall seems to
> > > be
> > > the best position to do it.
> > > 
> > > Alternatively, we could use check_for_vcpu_work() to check if the mapcache
> > > needs to be invalidated. The inconvenience is we would execute a few more
> > > instructions in each entry/exit path.
> > Yeah it would be more natural to call it from check_for_vcpu_work(). If
> > we put it between #ifdef CONFIG_IOREQ_SERVER it wouldn't be bad. But I
> > am not a fan of increasing the instructions on the exit path either.
> >  From this point of view, putting it at the end of do_trap_hypercall is a
> > nice trick actually. Let's just make sure it has a good comment on top.
> > 
> > 
> > > > > Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> > > > > CC: Julien Grall <julien.grall@arm.com>
> > > > > 
> > > > > ---
> > > > > Please note, this is a split/cleanup/hardening of Julien's PoC:
> > > > > "Add support for Guest IO forwarding to a device emulator"
> > > > > 
> > > > > Changes V1 -> V2:
> > > > >      - new patch, some changes were derived from (+ new explanation):
> > > > >        xen/ioreq: Make x86's invalidate qemu mapcache handling common
> > > > >      - put setting of the flag into __p2m_set_entry()
> > > > >      - clarify the conditions when the flag should be set
> > > > >      - use domain_has_ioreq_server()
> > > > >      - update do_trap_hypercall() by adding local variable
> > > > > 
> > > > > Changes V2 -> V3:
> > > > >      - update patch description
> > > > >      - move check to p2m_free_entry()
> > > > >      - add a comment
> > > > >      - use "curr" instead of "v" in do_trap_hypercall()
> > > > > ---
> > > > > ---
> > > > >    xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
> > > > >    xen/arch/arm/traps.c | 13 ++++++++++---
> > > > >    2 files changed, 26 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> > > > > index 5b8d494..9674f6f 100644
> > > > > --- a/xen/arch/arm/p2m.c
> > > > > +++ b/xen/arch/arm/p2m.c
> > > > > @@ -1,6 +1,7 @@
> > > > >    #include <xen/cpu.h>
> > > > >    #include <xen/domain_page.h>
> > > > >    #include <xen/iocap.h>
> > > > > +#include <xen/ioreq.h>
> > > > >    #include <xen/lib.h>
> > > > >    #include <xen/sched.h>
> > > > >    #include <xen/softirq.h>
> > > > > @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain
> > > > > *p2m,
> > > > >        if ( !p2m_is_valid(entry) )
> > > > >            return;
> > > > >    -    /* Nothing to do but updating the stats if the entry is a
> > > > > super-page. */
> > > > > -    if ( p2m_is_superpage(entry, level) )
> > > > > +    if ( p2m_is_superpage(entry, level) || (level == 3) )
> > > > >        {
> > > > > -        p2m->stats.mappings[level]--;
> > > > > -        return;
> > > > > -    }
> > > > > +#ifdef CONFIG_IOREQ_SERVER
> > > > > +        /*
> > > > > +         * If this gets called (non-recursively) then either the
> > > > > entry
> > > > > +         * was replaced by an entry with a different base (valid
> > > > > case) or
> > > > > +         * the shattering of a superpage was failed (error case).
> > > > > +         * So, at worst, the spurious mapcache invalidation might be
> > > > > sent.
> > > > > +         */
> > > > > +        if ( domain_has_ioreq_server(p2m->domain) &&
> > > > > +             (p2m->domain == current->domain) &&
> > > > > p2m_is_ram(entry.p2m.type) )
> > > > > +            p2m->domain->mapcache_invalidate = true;
> > > > Why the (p2m->domain == current->domain) check? Shouldn't we set
> > > > mapcache_invalidate to true anyway? What happens if p2m->domain !=
> > > > current->domain? We wouldn't want the domain to lose the
> > > > mapcache_invalidate notification.
> > > This is also discussed in [1]. :) The main question is why would a
> > > toolstack/device model modify the guest memory after boot?
> > > 
> > > If we assume it does, then the device model would need to pause the domain
> > > before modifying the RAM.
> > > 
> > > We also need to make sure that all the IOREQ servers have invalidated
> > > the mapcache before the domain run again.
> > > 
> > > This would require quite a bit of work. I am not sure the effort is worth
> > > if
> > > there are no active users today.
> > OK, that explains why we think p2m->domain == current->domain, but why
> > do we need to have a check for it right here?
> > 
> > In other words, we don't think it is realistc to get here with
> > p2m->domain != current->domain, but let's say that we do somehow. What's
> > the best course of action? Probably, set mapcache_invalidate to true and
> > possibly print a warning?
> > 
> > Leaving mapcache_invalidate to false doesn't seem to be what we want to
> > do?
> > 
> >   
> > > > >        BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
> > > > >    @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct
> > > > > cpu_user_regs
> > > > > *regs, register_t *nr,
> > > > >            return;
> > > > >        }
> > > > >    -    current->hcall_preempted = false;
> > > > > +    curr->hcall_preempted = false;
> > > > >          perfc_incra(hypercalls, *nr);
> > > > >        call = arm_hypercall_table[*nr].fn;
> > > > > @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct
> > > > > cpu_user_regs
> > > > > *regs, register_t *nr,
> > > > >        HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
> > > > >      #ifndef NDEBUG
> > > > > -    if ( !current->hcall_preempted )
> > > > > +    if ( !curr->hcall_preempted )
> > > > >        {
> > > > >            /* Deliberately corrupt parameter regs used by this
> > > > > hypercall.
> > > > > */
> > > > >            switch ( arm_hypercall_table[*nr].nr_args ) {
> > > > > @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct
> > > > > cpu_user_regs
> > > > > *regs, register_t *nr,
> > > > >    #endif
> > > > >          /* Ensure the hypercall trap instruction is re-executed. */
> > > > > -    if ( current->hcall_preempted )
> > > > > +    if ( curr->hcall_preempted )
> > > > >            regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
> > > > > +
> > > > > +#ifdef CONFIG_IOREQ_SERVER
> > > > > +    if ( unlikely(curr->domain->mapcache_invalidate) &&
> > > > > +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
> > > > > +        ioreq_signal_mapcache_invalidate();
> > > > Why not just:
> > > > 
> > > > if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
> > > >       ioreq_signal_mapcache_invalidate();
> > > > 
> > > This seems to match the x86 code. My guess is they tried to prevent the
> > > cost
> > > of the atomic operation if there is no chance mapcache_invalidate is true.
> > > 
> > > I am split whether the first check is worth it. The atomic operation
> > > should be
> > > uncontended most of the time, so it should be quick. But it will always be
> > > slower than just a read because there is always a store involved.
> > I am not a fun of optimizations with unclear benefits :-)
> > 
> > 
> > > On a related topic, Jan pointed out that the invalidation would not work
> > > properly if you have multiple vCPU modifying the P2M at the same time.
> > > 
> Thanks to Julien, he explained all bits in detail. Indeed I followed how it
> was done on x86 (place where to send the invalidation request, the code to
> check whether the flag is set, which at first glance, appears odd, etc)
> and review comments (to latch current into the local variable, and make sure
> that domain sends invalidation request on itself).
> Regarding what to do if p2m->domain != current->domain in p2m_free_entry().
> Probably we could set flag only if guest is paused, otherwise just print a
> warning. Thoughts?

I'd do something like:

if ( domain_has_ioreq_server(p2m->domain) && p2m_is_ram(entry.p2m.type) )
{
    WARN_ON(p2m->domain != current->domain);
    p2m->domain->mapcache_invalidate = true;
}

but maybe Julien has a better idea.
Julien Grall Dec. 11, 2020, 7:27 p.m. UTC | #6
Hi Stefano,

On 11/12/2020 01:28, Stefano Stabellini wrote:
>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>> CC: Julien Grall <julien.grall@arm.com>
>>>>
>>>> ---
>>>> Please note, this is a split/cleanup/hardening of Julien's PoC:
>>>> "Add support for Guest IO forwarding to a device emulator"
>>>>
>>>> Changes V1 -> V2:
>>>>      - new patch, some changes were derived from (+ new explanation):
>>>>        xen/ioreq: Make x86's invalidate qemu mapcache handling common
>>>>      - put setting of the flag into __p2m_set_entry()
>>>>      - clarify the conditions when the flag should be set
>>>>      - use domain_has_ioreq_server()
>>>>      - update do_trap_hypercall() by adding local variable
>>>>
>>>> Changes V2 -> V3:
>>>>      - update patch description
>>>>      - move check to p2m_free_entry()
>>>>      - add a comment
>>>>      - use "curr" instead of "v" in do_trap_hypercall()
>>>> ---
>>>> ---
>>>>    xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
>>>>    xen/arch/arm/traps.c | 13 ++++++++++---
>>>>    2 files changed, 26 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
>>>> index 5b8d494..9674f6f 100644
>>>> --- a/xen/arch/arm/p2m.c
>>>> +++ b/xen/arch/arm/p2m.c
>>>> @@ -1,6 +1,7 @@
>>>>    #include <xen/cpu.h>
>>>>    #include <xen/domain_page.h>
>>>>    #include <xen/iocap.h>
>>>> +#include <xen/ioreq.h>
>>>>    #include <xen/lib.h>
>>>>    #include <xen/sched.h>
>>>>    #include <xen/softirq.h>
>>>> @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain *p2m,
>>>>        if ( !p2m_is_valid(entry) )
>>>>            return;
>>>>    -    /* Nothing to do but updating the stats if the entry is a
>>>> super-page. */
>>>> -    if ( p2m_is_superpage(entry, level) )
>>>> +    if ( p2m_is_superpage(entry, level) || (level == 3) )
>>>>        {
>>>> -        p2m->stats.mappings[level]--;
>>>> -        return;
>>>> -    }
>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>> +        /*
>>>> +         * If this gets called (non-recursively) then either the entry
>>>> +         * was replaced by an entry with a different base (valid case) or
>>>> +         * the shattering of a superpage was failed (error case).
>>>> +         * So, at worst, the spurious mapcache invalidation might be
>>>> sent.
>>>> +         */
>>>> +        if ( domain_has_ioreq_server(p2m->domain) &&

Hmmm... I didn't realize that you were going to call 
domain_has_ioreq_server() here. Per your comment, this can only be 
called when on p2m->domain == current.

One way would be to switch the two check. However, I am not entirely 
sure this is necessary. I see no issue to always set mapcache_invalidate 
even if there are no IOREQ server available.

>>>> +             (p2m->domain == current->domain) &&
>>>> p2m_is_ram(entry.p2m.type) )
>>>> +            p2m->domain->mapcache_invalidate = true;
>>>
>>> Why the (p2m->domain == current->domain) check? Shouldn't we set
>>> mapcache_invalidate to true anyway? What happens if p2m->domain !=
>>> current->domain? We wouldn't want the domain to lose the
>>> mapcache_invalidate notification.
>>
>> This is also discussed in [1]. :) The main question is why would a
>> toolstack/device model modify the guest memory after boot?
>>
>> If we assume it does, then the device model would need to pause the domain
>> before modifying the RAM.
>>
>> We also need to make sure that all the IOREQ servers have invalidated
>> the mapcache before the domain run again.
>>
>> This would require quite a bit of work. I am not sure the effort is worth if
>> there are no active users today.
> 
> OK, that explains why we think p2m->domain == current->domain, but why
> do we need to have a check for it right here?
> 
> In other words, we don't think it is realistc to get here with
> p2m->domain != current->domain, but let's say that we do somehow.

I am guessing by "here", you mean in the situation where a RAM entry 
would be removed. Is it correct? If so yes, I don't believe this should 
happen today (even at domain creation/destruction).

> What's
> the best course of action?

The best course of action would be to forward the invalidation to *all* 
the IOREQ servers and wait for it before the domain can run again.

> Probably, set mapcache_invalidate to true and
> possibly print a warning?
So if the toolstack (or an IOREQ server ends to up use it), then we need 
to make sure all the IOREQ server have invalidated the mapcache before 
the domain can run again.

> 
> Leaving mapcache_invalidate to false doesn't seem to be what we want to
> do?

Setting to true/false is not going to be very helful because the guest 
may never issue an hypercall.

Without any more work, the guest may get corrupted. So I would suggest 
to either prevent the P2M to be modified after the domain has been 
created and before it is destroyed (it more a stopgap) or fix it properly.

>>>>        BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
>>>>    @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>            return;
>>>>        }
>>>>    -    current->hcall_preempted = false;
>>>> +    curr->hcall_preempted = false;
>>>>          perfc_incra(hypercalls, *nr);
>>>>        call = arm_hypercall_table[*nr].fn;
>>>> @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>        HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
>>>>      #ifndef NDEBUG
>>>> -    if ( !current->hcall_preempted )
>>>> +    if ( !curr->hcall_preempted )
>>>>        {
>>>>            /* Deliberately corrupt parameter regs used by this hypercall.
>>>> */
>>>>            switch ( arm_hypercall_table[*nr].nr_args ) {
>>>> @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct cpu_user_regs
>>>> *regs, register_t *nr,
>>>>    #endif
>>>>          /* Ensure the hypercall trap instruction is re-executed. */
>>>> -    if ( current->hcall_preempted )
>>>> +    if ( curr->hcall_preempted )
>>>>            regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
>>>> +
>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>> +    if ( unlikely(curr->domain->mapcache_invalidate) &&
>>>> +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
>>>> +        ioreq_signal_mapcache_invalidate();
>>>
>>> Why not just:
>>>
>>> if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
>>>       ioreq_signal_mapcache_invalidate();
>>>
>>
>> This seems to match the x86 code. My guess is they tried to prevent the cost
>> of the atomic operation if there is no chance mapcache_invalidate is true.
>>
>> I am split whether the first check is worth it. The atomic operation should be
>> uncontended most of the time, so it should be quick. But it will always be
>> slower than just a read because there is always a store involved.
> 
> I am not a fun of optimizations with unclear benefits :-)

I thought a bit more about it and I am actually leaning towards keeping 
the first check.

The common implementation of the hypercall path is mostly (if not all) 
accessing per-vCPU variables. So the hypercalls can mostly work 
independently (at least in the common part).

Assuming we drop the first check, we would now be writing to a 
per-domain variable at every hypercall. You are probably going to notice 
performance impact if you were going to benchmark concurrent no-op 
hypercall, because the cache line is going to bounce (because of the write).

Arguably, this may become noise if you execute a full hypercall. But I 
would still like to treat the common hypercall path as the entry/exit 
path. IOW, I would like to be careful in what we add there.

The main reason is hypercalls may be used quite a lot by PV backends or 
device emulator (if we think about Virtio).

If we decided to move the change in the entry/exit path, then it would
definitely be an issue for the reasons I explained above. So I would 
also like to avoid the write in shared variable if we can.

FAOD, I am not saying this optmization will save the world :). I am sure 
there will be more (in particular in the vGIC part) in order to get 
Virtio performance in par with PV backends on Xen.

This discussion would also be moot if ...

> 
>> On a related topic, Jan pointed out that the invalidation would not work
>> properly if you have multiple vCPU modifying the P2M at the same time.

... we had a per-vCPU flag instead.

Cheers,
Julien Grall Dec. 11, 2020, 7:37 p.m. UTC | #7
On 11/12/2020 19:07, Stefano Stabellini wrote:
> On Fri, 11 Dec 2020, Oleksandr wrote:
>> On 11.12.20 03:28, Stefano Stabellini wrote:
>>> On Thu, 10 Dec 2020, Julien Grall wrote:
>>>> On 10/12/2020 02:30, Stefano Stabellini wrote:
>>>>> On Mon, 30 Nov 2020, Oleksandr Tyshchenko wrote:
>>>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>>>
>>>>>> We need to send mapcache invalidation request to qemu/demu everytime
>>>>>> the page gets removed from a guest.
>>>>>>
>>>>>> At the moment, the Arm code doesn't explicitely remove the existing
>>>>>> mapping before inserting the new mapping. Instead, this is done
>>>>>> implicitely by __p2m_set_entry().
>>>>>>
>>>>>> So we need to recognize a case when old entry is a RAM page *and*
>>>>>> the new MFN is different in order to set the corresponding flag.
>>>>>> The most suitable place to do this is p2m_free_entry(), there
>>>>>> we can find the correct leaf type. The invalidation request
>>>>>> will be sent in do_trap_hypercall() later on.
>>>>> Why is it sent in do_trap_hypercall() ?
>>>> I believe this is following the approach used by x86. There are actually
>>>> some
>>>> discussion about it (see [1]).
>>>>
>>>> Leaving aside the toolstack case for now, AFAIK, the only way a guest can
>>>> modify its p2m is via an hypercall. Do you have an example otherwise?
>>> OK this is a very important assumption. We should write it down for sure.
>>> I think it is true today on ARM.
>>>
>>>
>>>> When sending the invalidation request, the vCPU will be blocked until all
>>>> the
>>>> IOREQ server have acknowledged the invalidation. So the hypercall seems to
>>>> be
>>>> the best position to do it.
>>>>
>>>> Alternatively, we could use check_for_vcpu_work() to check if the mapcache
>>>> needs to be invalidated. The inconvenience is we would execute a few more
>>>> instructions in each entry/exit path.
>>> Yeah it would be more natural to call it from check_for_vcpu_work(). If
>>> we put it between #ifdef CONFIG_IOREQ_SERVER it wouldn't be bad. But I
>>> am not a fan of increasing the instructions on the exit path either.
>>>   From this point of view, putting it at the end of do_trap_hypercall is a
>>> nice trick actually. Let's just make sure it has a good comment on top.
>>>
>>>
>>>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>>>>> CC: Julien Grall <julien.grall@arm.com>
>>>>>>
>>>>>> ---
>>>>>> Please note, this is a split/cleanup/hardening of Julien's PoC:
>>>>>> "Add support for Guest IO forwarding to a device emulator"
>>>>>>
>>>>>> Changes V1 -> V2:
>>>>>>       - new patch, some changes were derived from (+ new explanation):
>>>>>>         xen/ioreq: Make x86's invalidate qemu mapcache handling common
>>>>>>       - put setting of the flag into __p2m_set_entry()
>>>>>>       - clarify the conditions when the flag should be set
>>>>>>       - use domain_has_ioreq_server()
>>>>>>       - update do_trap_hypercall() by adding local variable
>>>>>>
>>>>>> Changes V2 -> V3:
>>>>>>       - update patch description
>>>>>>       - move check to p2m_free_entry()
>>>>>>       - add a comment
>>>>>>       - use "curr" instead of "v" in do_trap_hypercall()
>>>>>> ---
>>>>>> ---
>>>>>>     xen/arch/arm/p2m.c   | 24 ++++++++++++++++--------
>>>>>>     xen/arch/arm/traps.c | 13 ++++++++++---
>>>>>>     2 files changed, 26 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
>>>>>> index 5b8d494..9674f6f 100644
>>>>>> --- a/xen/arch/arm/p2m.c
>>>>>> +++ b/xen/arch/arm/p2m.c
>>>>>> @@ -1,6 +1,7 @@
>>>>>>     #include <xen/cpu.h>
>>>>>>     #include <xen/domain_page.h>
>>>>>>     #include <xen/iocap.h>
>>>>>> +#include <xen/ioreq.h>
>>>>>>     #include <xen/lib.h>
>>>>>>     #include <xen/sched.h>
>>>>>>     #include <xen/softirq.h>
>>>>>> @@ -749,17 +750,24 @@ static void p2m_free_entry(struct p2m_domain
>>>>>> *p2m,
>>>>>>         if ( !p2m_is_valid(entry) )
>>>>>>             return;
>>>>>>     -    /* Nothing to do but updating the stats if the entry is a
>>>>>> super-page. */
>>>>>> -    if ( p2m_is_superpage(entry, level) )
>>>>>> +    if ( p2m_is_superpage(entry, level) || (level == 3) )
>>>>>>         {
>>>>>> -        p2m->stats.mappings[level]--;
>>>>>> -        return;
>>>>>> -    }
>>>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>>>> +        /*
>>>>>> +         * If this gets called (non-recursively) then either the
>>>>>> entry
>>>>>> +         * was replaced by an entry with a different base (valid
>>>>>> case) or
>>>>>> +         * the shattering of a superpage was failed (error case).
>>>>>> +         * So, at worst, the spurious mapcache invalidation might be
>>>>>> sent.
>>>>>> +         */
>>>>>> +        if ( domain_has_ioreq_server(p2m->domain) &&
>>>>>> +             (p2m->domain == current->domain) &&
>>>>>> p2m_is_ram(entry.p2m.type) )
>>>>>> +            p2m->domain->mapcache_invalidate = true;
>>>>> Why the (p2m->domain == current->domain) check? Shouldn't we set
>>>>> mapcache_invalidate to true anyway? What happens if p2m->domain !=
>>>>> current->domain? We wouldn't want the domain to lose the
>>>>> mapcache_invalidate notification.
>>>> This is also discussed in [1]. :) The main question is why would a
>>>> toolstack/device model modify the guest memory after boot?
>>>>
>>>> If we assume it does, then the device model would need to pause the domain
>>>> before modifying the RAM.
>>>>
>>>> We also need to make sure that all the IOREQ servers have invalidated
>>>> the mapcache before the domain run again.
>>>>
>>>> This would require quite a bit of work. I am not sure the effort is worth
>>>> if
>>>> there are no active users today.
>>> OK, that explains why we think p2m->domain == current->domain, but why
>>> do we need to have a check for it right here?
>>>
>>> In other words, we don't think it is realistc to get here with
>>> p2m->domain != current->domain, but let's say that we do somehow. What's
>>> the best course of action? Probably, set mapcache_invalidate to true and
>>> possibly print a warning?
>>>
>>> Leaving mapcache_invalidate to false doesn't seem to be what we want to
>>> do?
>>>
>>>    
>>>>>>         BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
>>>>>>     @@ -1459,7 +1460,7 @@ static void do_trap_hypercall(struct
>>>>>> cpu_user_regs
>>>>>> *regs, register_t *nr,
>>>>>>             return;
>>>>>>         }
>>>>>>     -    current->hcall_preempted = false;
>>>>>> +    curr->hcall_preempted = false;
>>>>>>           perfc_incra(hypercalls, *nr);
>>>>>>         call = arm_hypercall_table[*nr].fn;
>>>>>> @@ -1472,7 +1473,7 @@ static void do_trap_hypercall(struct
>>>>>> cpu_user_regs
>>>>>> *regs, register_t *nr,
>>>>>>         HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
>>>>>>       #ifndef NDEBUG
>>>>>> -    if ( !current->hcall_preempted )
>>>>>> +    if ( !curr->hcall_preempted )
>>>>>>         {
>>>>>>             /* Deliberately corrupt parameter regs used by this
>>>>>> hypercall.
>>>>>> */
>>>>>>             switch ( arm_hypercall_table[*nr].nr_args ) {
>>>>>> @@ -1489,8 +1490,14 @@ static void do_trap_hypercall(struct
>>>>>> cpu_user_regs
>>>>>> *regs, register_t *nr,
>>>>>>     #endif
>>>>>>           /* Ensure the hypercall trap instruction is re-executed. */
>>>>>> -    if ( current->hcall_preempted )
>>>>>> +    if ( curr->hcall_preempted )
>>>>>>             regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
>>>>>> +
>>>>>> +#ifdef CONFIG_IOREQ_SERVER
>>>>>> +    if ( unlikely(curr->domain->mapcache_invalidate) &&
>>>>>> +         test_and_clear_bool(curr->domain->mapcache_invalidate) )
>>>>>> +        ioreq_signal_mapcache_invalidate();
>>>>> Why not just:
>>>>>
>>>>> if ( unlikely(test_and_clear_bool(curr->domain->mapcache_invalidate)) )
>>>>>        ioreq_signal_mapcache_invalidate();
>>>>>
>>>> This seems to match the x86 code. My guess is they tried to prevent the
>>>> cost
>>>> of the atomic operation if there is no chance mapcache_invalidate is true.
>>>>
>>>> I am split whether the first check is worth it. The atomic operation
>>>> should be
>>>> uncontended most of the time, so it should be quick. But it will always be
>>>> slower than just a read because there is always a store involved.
>>> I am not a fun of optimizations with unclear benefits :-)
>>>
>>>
>>>> On a related topic, Jan pointed out that the invalidation would not work
>>>> properly if you have multiple vCPU modifying the P2M at the same time.
>>>>
>> Thanks to Julien, he explained all bits in detail. Indeed I followed how it
>> was done on x86 (place where to send the invalidation request, the code to
>> check whether the flag is set, which at first glance, appears odd, etc)
>> and review comments (to latch current into the local variable, and make sure
>> that domain sends invalidation request on itself).
>> Regarding what to do if p2m->domain != current->domain in p2m_free_entry().
>> Probably we could set flag only if guest is paused, otherwise just print a
>> warning. Thoughts?
> 
> I'd do something like:
> 
> if ( domain_has_ioreq_server(p2m->domain) && p2m_is_ram(entry.p2m.type) )
> {
>      WARN_ON(p2m->domain != current->domain)

IOREQ server are not trusted. Yet they will be able to reach this patch 
if one re-use the stubdomain model (they are allowed to modify guest 
layout).

So this change would hand a DoS attack to the IOREQ server on a silver 
platter :).

In general, we should avoid to use WARN_ON() for things that can be 
triggered by a domain. Instead we should use gprintk(XENLOG_WARNING, 
"...") to allow rate-limit.

On the cons side, it would be more difficult to spot any misue with a 
gprintk().

>      p2m->domain->mapcache_invalidate = true;
> }
> 
> but maybe Julien has a better idea.

I suggested a different approach and some rationale in answer to your 
e-mail. Although, I am not sure if we could call it a better approach 
:). We can continue the discusison there.

Cheers,
diff mbox series

Patch

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 5b8d494..9674f6f 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1,6 +1,7 @@ 
 #include <xen/cpu.h>
 #include <xen/domain_page.h>
 #include <xen/iocap.h>
+#include <xen/ioreq.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/softirq.h>
@@ -749,17 +750,24 @@  static void p2m_free_entry(struct p2m_domain *p2m,
     if ( !p2m_is_valid(entry) )
         return;
 
-    /* Nothing to do but updating the stats if the entry is a super-page. */
-    if ( p2m_is_superpage(entry, level) )
+    if ( p2m_is_superpage(entry, level) || (level == 3) )
     {
-        p2m->stats.mappings[level]--;
-        return;
-    }
+#ifdef CONFIG_IOREQ_SERVER
+        /*
+         * If this gets called (non-recursively) then either the entry
+         * was replaced by an entry with a different base (valid case) or
+         * the shattering of a superpage was failed (error case).
+         * So, at worst, the spurious mapcache invalidation might be sent.
+         */
+        if ( domain_has_ioreq_server(p2m->domain) &&
+             (p2m->domain == current->domain) && p2m_is_ram(entry.p2m.type) )
+            p2m->domain->mapcache_invalidate = true;
+#endif
 
-    if ( level == 3 )
-    {
         p2m->stats.mappings[level]--;
-        p2m_put_l3_page(entry);
+        /* Nothing to do if the entry is a super-page. */
+        if ( level == 3 )
+            p2m_put_l3_page(entry);
         return;
     }
 
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index b6077d2..151c626 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1443,6 +1443,7 @@  static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
                               const union hsr hsr)
 {
     arm_hypercall_fn_t call = NULL;
+    struct vcpu *curr = current;
 
     BUILD_BUG_ON(NR_hypercalls < ARRAY_SIZE(arm_hypercall_table) );
 
@@ -1459,7 +1460,7 @@  static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
         return;
     }
 
-    current->hcall_preempted = false;
+    curr->hcall_preempted = false;
 
     perfc_incra(hypercalls, *nr);
     call = arm_hypercall_table[*nr].fn;
@@ -1472,7 +1473,7 @@  static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
     HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs));
 
 #ifndef NDEBUG
-    if ( !current->hcall_preempted )
+    if ( !curr->hcall_preempted )
     {
         /* Deliberately corrupt parameter regs used by this hypercall. */
         switch ( arm_hypercall_table[*nr].nr_args ) {
@@ -1489,8 +1490,14 @@  static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr,
 #endif
 
     /* Ensure the hypercall trap instruction is re-executed. */
-    if ( current->hcall_preempted )
+    if ( curr->hcall_preempted )
         regs->pc -= 4;  /* re-execute 'hvc #XEN_HYPERCALL_TAG' */
+
+#ifdef CONFIG_IOREQ_SERVER
+    if ( unlikely(curr->domain->mapcache_invalidate) &&
+         test_and_clear_bool(curr->domain->mapcache_invalidate) )
+        ioreq_signal_mapcache_invalidate();
+#endif
 }
 
 void arch_hypercall_tasklet_result(struct vcpu *v, long res)