diff mbox series

[2/8] x86/shadow: properly handle get_page() failing

Message ID c5797fa8-8ee1-7668-936c-392f8298dff1@suse.com (mailing list archive)
State New, archived
Headers show
Series x86: XSA-40{1,2,8} follow-up | expand

Commit Message

Jan Beulich July 26, 2022, 4:04 p.m. UTC
We should not blindly (in a release build) insert the new entry in the
hash if a reference to the guest page cannot be obtained, or else an
excess reference would be put when removing the hash entry again. Crash
the domain in that case instead. The sole caller doesn't further care
about the state of the guest page: All it does is return the
corresponding shadow page (which was obtained successfully before) to
its caller.

To compensate we further need to adjust hash removal: Since the shadow
page already has had its backlink set, domain cleanup code would try to
destroy the shadow, and hence still cause a put_page() without
corresponding get_page(). Leverage that the failed get_page() leads to
no hash insertion, making shadow_hash_delete() no longer assume it will
find the requested entry. Instead return back whether the entry was
found. This way delete_shadow_status() can avoid calling put_page() in
the problem scenario.

For the other caller of shadow_hash_delete() simply reinstate the
otherwise dropped assertion at the call site.

While touching the conditionals in {set,delete}_shadow_status() anyway,
also switch around their two pre-existing parts, to have the cheap one
first (frequently allowing to avoid evaluation of the expensive - due to
evaluate_nospec() - one altogether).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

Comments

Andrew Cooper July 27, 2022, 12:46 p.m. UTC | #1
On 26/07/2022 17:04, Jan Beulich wrote:
> We should not blindly (in a release build) insert the new entry in the
> hash if a reference to the guest page cannot be obtained, or else an
> excess reference would be put when removing the hash entry again. Crash
> the domain in that case instead. The sole caller doesn't further care
> about the state of the guest page: All it does is return the
> corresponding shadow page (which was obtained successfully before) to
> its caller.
>
> To compensate we further need to adjust hash removal: Since the shadow
> page already has had its backlink set, domain cleanup code would try to
> destroy the shadow, and hence still cause a put_page() without
> corresponding get_page(). Leverage that the failed get_page() leads to
> no hash insertion, making shadow_hash_delete() no longer assume it will
> find the requested entry. Instead return back whether the entry was
> found. This way delete_shadow_status() can avoid calling put_page() in
> the problem scenario.
>
> For the other caller of shadow_hash_delete() simply reinstate the
> otherwise dropped assertion at the call site.
>
> While touching the conditionals in {set,delete}_shadow_status() anyway,
> also switch around their two pre-existing parts, to have the cheap one
> first (frequently allowing to avoid evaluation of the expensive - due to
> evaluate_nospec() - one altogether).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>, although with
some observations and a suggestion.

>
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -1581,7 +1581,7 @@ void shadow_hash_insert(struct domain *d
>      sh_hash_audit_bucket(d, key);
>  }
>  
> -void shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
> +bool shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
>                          mfn_t smfn)
>  /* Excise the mapping (n,t)->smfn from the hash table */
>  {
> @@ -1606,10 +1606,8 @@ void shadow_hash_delete(struct domain *d
>      {
>          /* Need to search for the one we want */
>          x = d->arch.paging.shadow.hash_table[key];
> -        while ( 1 )
> +        while ( x )
>          {
> -            ASSERT(x); /* We can't have hit the end, since our target is
> -                        * still in the chain somehwere... */
>              if ( next_shadow(x) == sp )
>              {
>                  x->next_shadow = sp->next_shadow;
> @@ -1617,10 +1615,14 @@ void shadow_hash_delete(struct domain *d
>              }
>              x = next_shadow(x);
>          }
> +        if ( !x )
> +            return false;
>      }

This entire if/else block is "delete matching element from a singly linked
list", open-coded in a way that is very confusing to follow.  I can't help but
feel there's a way to simplify it.  (Not in this patch, but for future
clean-up.)

>      set_next_shadow(sp, NULL);
>  
>      sh_hash_audit_bucket(d, key);
> +
> +    return true;
>  }
>  
>  typedef int (*hash_vcpu_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
> --- a/xen/arch/x86/mm/shadow/multi.c
> +++ b/xen/arch/x86/mm/shadow/multi.c
> @@ -132,7 +132,8 @@ delete_fl1_shadow_status(struct domain *
>      SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
>                     gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
>      ASSERT(mfn_to_page(smfn)->u.sh.head);
> -    shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
> +    if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) )
> +        ASSERT_UNREACHABLE();

I'd recommend crashing the domain here too.  I know that we've already
got the return value we want, but this represents corruption in the
shadow pagetable metadata, and I highly doubt it is safe to let the
guest continue to execute in such circumstances.

>  }
>  
>  
> --- a/xen/arch/x86/mm/shadow/private.h
> +++ b/xen/arch/x86/mm/shadow/private.h
> @@ -375,7 +375,7 @@ shadow_size(unsigned int shadow_type)
>  mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t);
>  void  shadow_hash_insert(struct domain *d,
>                           unsigned long n, unsigned int t, mfn_t smfn);
> -void  shadow_hash_delete(struct domain *d,
> +bool  shadow_hash_delete(struct domain *d,
>                           unsigned long n, unsigned int t, mfn_t smfn);
>  
>  /* shadow promotion */
> @@ -773,18 +773,19 @@ static inline void
>  set_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
>  /* Put a shadow into the hash table */
>  {
> -    int res;
> -
>      SHADOW_PRINTK("d%d gmfn=%lx, type=%08x, smfn=%lx\n",
>                    d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
>  
>      ASSERT(mfn_to_page(smfn)->u.sh.head);
>  
>      /* 32-bit PV guests don't own their l4 pages so can't get_page them */
> -    if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
> +    if ( (shadow_type != SH_type_l4_64_shadow || !is_pv_32bit_domain(d)) &&

This is the sensible way around anyway, but this is also a great example
of a predicate which really doesn't want to be eval_nospec().

We've got a growing pile of such usecases, so really do need to find a
predicate we can use which indicates "safely not speculation relevant".

~Andrew
Jan Beulich July 27, 2022, 12:53 p.m. UTC | #2
On 27.07.2022 14:46, Andrew Cooper wrote:
> On 26/07/2022 17:04, Jan Beulich wrote:
>> We should not blindly (in a release build) insert the new entry in the
>> hash if a reference to the guest page cannot be obtained, or else an
>> excess reference would be put when removing the hash entry again. Crash
>> the domain in that case instead. The sole caller doesn't further care
>> about the state of the guest page: All it does is return the
>> corresponding shadow page (which was obtained successfully before) to
>> its caller.
>>
>> To compensate we further need to adjust hash removal: Since the shadow
>> page already has had its backlink set, domain cleanup code would try to
>> destroy the shadow, and hence still cause a put_page() without
>> corresponding get_page(). Leverage that the failed get_page() leads to
>> no hash insertion, making shadow_hash_delete() no longer assume it will
>> find the requested entry. Instead return back whether the entry was
>> found. This way delete_shadow_status() can avoid calling put_page() in
>> the problem scenario.
>>
>> For the other caller of shadow_hash_delete() simply reinstate the
>> otherwise dropped assertion at the call site.
>>
>> While touching the conditionals in {set,delete}_shadow_status() anyway,
>> also switch around their two pre-existing parts, to have the cheap one
>> first (frequently allowing to avoid evaluation of the expensive - due to
>> evaluate_nospec() - one altogether).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>, although with
> some observations and a suggestion.

Thanks.

>> --- a/xen/arch/x86/mm/shadow/multi.c
>> +++ b/xen/arch/x86/mm/shadow/multi.c
>> @@ -132,7 +132,8 @@ delete_fl1_shadow_status(struct domain *
>>      SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
>>                     gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
>>      ASSERT(mfn_to_page(smfn)->u.sh.head);
>> -    shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
>> +    if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) )
>> +        ASSERT_UNREACHABLE();
> 
> I'd recommend crashing the domain here too.  I know that we've already
> got the return value we want, but this represents corruption in the
> shadow pagetable metadata, and I highly doubt it is safe to let the
> guest continue to execute in such circumstances.

I'm happy to add or convert, but please clarify: DYM in addition to
the assertion or in place of it?

Jan
Andrew Cooper July 27, 2022, 7:06 p.m. UTC | #3
On 27/07/2022 13:53, Jan Beulich wrote:
> On 27.07.2022 14:46, Andrew Cooper wrote:
>> On 26/07/2022 17:04, Jan Beulich wrote:
>>> --- a/xen/arch/x86/mm/shadow/multi.c
>>> +++ b/xen/arch/x86/mm/shadow/multi.c
>>> @@ -132,7 +132,8 @@ delete_fl1_shadow_status(struct domain *
>>>      SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
>>>                     gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
>>>      ASSERT(mfn_to_page(smfn)->u.sh.head);
>>> -    shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
>>> +    if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) )
>>> +        ASSERT_UNREACHABLE();
>> I'd recommend crashing the domain here too.  I know that we've already
>> got the return value we want, but this represents corruption in the
>> shadow pagetable metadata, and I highly doubt it is safe to let the
>> guest continue to execute in such circumstances.
> I'm happy to add or convert, but please clarify: DYM in addition to
> the assertion or in place of it?

Erm pass.

We still don't have a sensible construct for ASSERT_OR_DOMAIN_CRASH(),
or a well-thought-through piece of guidance.

Probably keep the ASSERT() ?  Almost all state checking in the shadow
code is via asserts.


Mostly what I'm concerned by is it feeling weird to have one path which
only domain crashes, and one path which only asserts.

~Andrew
Jan Beulich July 28, 2022, 6:12 a.m. UTC | #4
On 27.07.2022 21:06, Andrew Cooper wrote:
> On 27/07/2022 13:53, Jan Beulich wrote:
>> On 27.07.2022 14:46, Andrew Cooper wrote:
>>> On 26/07/2022 17:04, Jan Beulich wrote:
>>>> --- a/xen/arch/x86/mm/shadow/multi.c
>>>> +++ b/xen/arch/x86/mm/shadow/multi.c
>>>> @@ -132,7 +132,8 @@ delete_fl1_shadow_status(struct domain *
>>>>      SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
>>>>                     gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
>>>>      ASSERT(mfn_to_page(smfn)->u.sh.head);
>>>> -    shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
>>>> +    if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) )
>>>> +        ASSERT_UNREACHABLE();
>>> I'd recommend crashing the domain here too.  I know the we've already
>>> got the return value we want, but this represents corruption in the
>>> shadow pagetable metadata, and I highly doubt it is safe to let the
>>> guest continue to execute in such circumstances.
>> I'm happy to add or convert, but please clarify: DYM in addition to
>> the assertion or in place of it?
> 
> Erm pass.
> 
> We still don't have a sensible construct for ASSERT_OR_DOMAIN_CRASH(),
> or a well-thought-through piece of guidance.
> 
> Probably keep the ASSERT() ?  Almost all state checking in the shadow
> code is via asserts.

Meanwhile I was actually leaning towards dropping the assertion. The
goal is to catch attention when things go wrong. A suddenly dying
domain surely ought to be as noticeable as an assertion triggering.

> Mostly what I'm concerned by is it feeling weird to have one path which
> only domain crashes, and one path which only asserts.

I'm afraid I've not been successful in determining which other path it
is you're referring to. If I knew, I might be up for converting that
as well (not in this same patch, perhaps). I don't think you mean
set_shadow_status(), since there we can't validly ASSERT(), as the
condition is (in principle) guest controllable.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -1581,7 +1581,7 @@  void shadow_hash_insert(struct domain *d
     sh_hash_audit_bucket(d, key);
 }
 
-void shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
+bool shadow_hash_delete(struct domain *d, unsigned long n, unsigned int t,
                         mfn_t smfn)
 /* Excise the mapping (n,t)->smfn from the hash table */
 {
@@ -1606,10 +1606,8 @@  void shadow_hash_delete(struct domain *d
     {
         /* Need to search for the one we want */
         x = d->arch.paging.shadow.hash_table[key];
-        while ( 1 )
+        while ( x )
         {
-            ASSERT(x); /* We can't have hit the end, since our target is
-                        * still in the chain somehwere... */
             if ( next_shadow(x) == sp )
             {
                 x->next_shadow = sp->next_shadow;
@@ -1617,10 +1615,14 @@  void shadow_hash_delete(struct domain *d
             }
             x = next_shadow(x);
         }
+        if ( !x )
+            return false;
     }
     set_next_shadow(sp, NULL);
 
     sh_hash_audit_bucket(d, key);
+
+    return true;
 }
 
 typedef int (*hash_vcpu_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -132,7 +132,8 @@  delete_fl1_shadow_status(struct domain *
     SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
                    gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
     ASSERT(mfn_to_page(smfn)->u.sh.head);
-    shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
+    if ( !shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn) )
+        ASSERT_UNREACHABLE();
 }
 
 
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -375,7 +375,7 @@  shadow_size(unsigned int shadow_type)
 mfn_t shadow_hash_lookup(struct domain *d, unsigned long n, unsigned int t);
 void  shadow_hash_insert(struct domain *d,
                          unsigned long n, unsigned int t, mfn_t smfn);
-void  shadow_hash_delete(struct domain *d,
+bool  shadow_hash_delete(struct domain *d,
                          unsigned long n, unsigned int t, mfn_t smfn);
 
 /* shadow promotion */
@@ -773,18 +773,19 @@  static inline void
 set_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
 /* Put a shadow into the hash table */
 {
-    int res;
-
     SHADOW_PRINTK("d%d gmfn=%lx, type=%08x, smfn=%lx\n",
                   d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
 
     ASSERT(mfn_to_page(smfn)->u.sh.head);
 
     /* 32-bit PV guests don't own their l4 pages so can't get_page them */
-    if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
+    if ( (shadow_type != SH_type_l4_64_shadow || !is_pv_32bit_domain(d)) &&
+         !get_page(mfn_to_page(gmfn), d) )
     {
-        res = get_page(mfn_to_page(gmfn), d);
-        ASSERT(res == 1);
+        printk(XENLOG_G_ERR "%pd: cannot get page for MFN %" PRI_mfn "\n",
+               d, mfn_x(gmfn));
+        domain_crash(d);
+        return;
     }
 
     shadow_hash_insert(d, mfn_x(gmfn), shadow_type, smfn);
@@ -797,9 +798,9 @@  delete_shadow_status(struct domain *d, m
     SHADOW_PRINTK("d%d gmfn=%"PRI_mfn", type=%08x, smfn=%"PRI_mfn"\n",
                   d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
     ASSERT(mfn_to_page(smfn)->u.sh.head);
-    shadow_hash_delete(d, mfn_x(gmfn), shadow_type, smfn);
-    /* 32-bit PV guests don't own their l4 pages; see set_shadow_status */
-    if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
+    if ( shadow_hash_delete(d, mfn_x(gmfn), shadow_type, smfn) &&
+         /* 32-bit PV guests don't own their l4 pages; see set_shadow_status */
+         (shadow_type != SH_type_l4_64_shadow || !is_pv_32bit_domain(d)) )
         put_page(mfn_to_page(gmfn));
 }