diff mbox series

[v3] x86/mm: Short circuit damage from "fishy" ref/typecount failure

Message ID 20210119130254.27058-1-andrew.cooper3@citrix.com (mailing list archive)
State New
Headers show
Series [v3] x86/mm: Short circuit damage from "fishy" ref/typecount failure | expand

Commit Message

Andrew Cooper Jan. 19, 2021, 1:02 p.m. UTC
This code has been copied in 3 places, but it is problematic.

All cases will hit a BUG() later in domain teardown, when a the missing
type/count reference is underflowed.

Don't complicated the logic by leaving a totally unqualified domain crash, and
a timebomb which will be triggered by the toolstack at a slightly later, and
seemingly unrelated, point.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Paul Durrant <paul@xen.org>
CC: Tamas K Lengyel <tamas@tklengyel.com>

v3:
 * Actually include the typo corrects to make it compile.
v2:
 * Reword the commit message.
 * Switch BUG() to BUG_ON() to further reduce code volume.
---
 xen/arch/x86/hvm/ioreq.c     | 11 ++---------
 xen/arch/x86/hvm/vmx/vmx.c   | 11 ++---------
 xen/arch/x86/mm/mem_paging.c | 17 ++++-------------
 3 files changed, 8 insertions(+), 31 deletions(-)

Comments

Paul Durrant Jan. 19, 2021, 1:06 p.m. UTC | #1
> -----Original Message-----
> From: Andrew Cooper <andrew.cooper3@citrix.com>
> Sent: 19 January 2021 13:03
> To: Xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>; Jan Beulich <JBeulich@suse.com>; Roger Pau Monné
> <roger.pau@citrix.com>; Wei Liu <wl@xen.org>; Paul Durrant <paul@xen.org>; Tamas K Lengyel
> <tamas@tklengyel.com>
> Subject: [PATCH v3] x86/mm: Short circuit damage from "fishy" ref/typecount failure
> 
> This code has been copied in 3 places, but it is problematic.
> 
> All cases will hit a BUG() later in domain teardown, when a the missing
> type/count reference is underflowed.
> 
> Don't complicated the logic by leaving a totally unqualified domain crash, and
> a timebomb which will be triggered by the toolstack at a slightly later, and
> seemingly unrelated, point.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Wei Liu <wl@xen.org>
> CC: Paul Durrant <paul@xen.org>
> CC: Tamas K Lengyel <tamas@tklengyel.com>
> 
> v3:
>  * Actually include the typo corrects to make it compile.

Looks ok now :-)

Reviewed-by: Paul Durrant <paul@xen.org>

> v2:
>  * Reword the commit message.
>  * Switch BUG() to BUG_ON() to further reduce code volume.
> ---
>  xen/arch/x86/hvm/ioreq.c     | 11 ++---------
>  xen/arch/x86/hvm/vmx/vmx.c   | 11 ++---------
>  xen/arch/x86/mm/mem_paging.c | 17 ++++-------------
>  3 files changed, 8 insertions(+), 31 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
> index 1cc27df87f..0c38cfa151 100644
> --- a/xen/arch/x86/hvm/ioreq.c
> +++ b/xen/arch/x86/hvm/ioreq.c
> @@ -366,15 +366,8 @@ static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
>      if ( !page )
>          return -ENOMEM;
> 
> -    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
> -    {
> -        /*
> -         * The domain can't possibly know about this page yet, so failure
> -         * here is a clear indication of something fishy going on.
> -         */
> -        domain_crash(s->emulator);
> -        return -ENODATA;
> -    }
> +    /* Domain can't know about this page yet - something fishy going on. */
> +    BUG_ON(!get_page_and_type(page, s->target, PGT_writable_page));
> 
>      iorp->va = __map_domain_page_global(page);
>      if ( !iorp->va )
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 2d4475ee3d..4120234c15 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -3042,15 +3042,8 @@ static int vmx_alloc_vlapic_mapping(struct domain *d)
>      if ( !pg )
>          return -ENOMEM;
> 
> -    if ( !get_page_and_type(pg, d, PGT_writable_page) )
> -    {
> -        /*
> -         * The domain can't possibly know about this page yet, so failure
> -         * here is a clear indication of something fishy going on.
> -         */
> -        domain_crash(d);
> -        return -ENODATA;
> -    }
> +    /* Domain can't know about this page yet - something fishy going on. */
> +    BUG_ON(!get_page_and_type(pg, d, PGT_writable_page));
> 
>      mfn = page_to_mfn(pg);
>      clear_domain_page(mfn);
> diff --git a/xen/arch/x86/mm/mem_paging.c b/xen/arch/x86/mm/mem_paging.c
> index 01281f786e..60d667ae94 100644
> --- a/xen/arch/x86/mm/mem_paging.c
> +++ b/xen/arch/x86/mm/mem_paging.c
> @@ -379,19 +379,10 @@ static int prepare(struct domain *d, gfn_t gfn,
>          page = alloc_domheap_page(d, 0);
>          if ( unlikely(page == NULL) )
>              goto out;
> -        if ( unlikely(!get_page(page, d)) )
> -        {
> -            /*
> -             * The domain can't possibly know about this page yet, so failure
> -             * here is a clear indication of something fishy going on.
> -             */
> -            gprintk(XENLOG_ERR,
> -                    "%pd: fresh page for GFN %"PRI_gfn" in unexpected state\n",
> -                    d, gfn_x(gfn));
> -            domain_crash(d);
> -            page = NULL;
> -            goto out;
> -        }
> +
> +        /* Domain can't know about this page yet - something fishy going on. */
> +        BUG_ON(!get_page(page, d));
> +
>          mfn = page_to_mfn(page);
>          page_extant = 0;
> 
> --
> 2.11.0
Jan Beulich Jan. 19, 2021, 4:48 p.m. UTC | #2
On 19.01.2021 14:02, Andrew Cooper wrote:
> This code has been copied in 3 places, but it is problematic.
> 
> All cases will hit a BUG() later in domain teardown, when a the missing
> type/count reference is underflowed.

I'm afraid I could use some help with this: Why would there
be a missing reference, when the getting of one failed? IOW
I'm not (yet) convinced you don't make the impact more
severe in the (supposedly) impossible case of these paths
getting hit, by converting a domain crash into a host one.
It is in particular relevant that a PV guest may be able to
cheat and "guess" an MFN to obtain references for before a
certain hypercall (or other operation) actually completed.

> v2:
>  * Reword the commit message.
>  * Switch BUG() to BUG_ON() to further reduce code volume.

Short of us explicitly agreeing that such is fine to use, I
think we ought to stick to the previously (long ago) agreed
rule that BUG_ON() controlling expressions should not have
side effects, for us not wanting to guarantee it will now
and forever _not_ behave like ASSERT() wrt to evaluating
the expression.

Jan
Andrew Cooper Jan. 19, 2021, 6:09 p.m. UTC | #3
On 19/01/2021 16:48, Jan Beulich wrote:
> On 19.01.2021 14:02, Andrew Cooper wrote:
>> This code has been copied in 3 places, but it is problematic.
>>
>> All cases will hit a BUG() later in domain teardown, when a the missing
>> type/count reference is underflowed.
> I'm afraid I could use some help with this: Why would there
> be a missing reference, when the getting of one failed?

Look at the cleanup logic for the associated fields.

Either the plain ref fails (impossible without other fatal refcounting
errors AFAICT), or the typeref fails (a concern, but impossible AFAICT).

When the plain ref fails, put_page_alloc_ref() spots the underflow with
a BUG, while if the typeref fails, it is _put_page_type()'s BUG which
spots the underflow.

> IOW
> I'm not (yet) convinced you don't make the impact more
> severe in the (supposedly) impossible case of these paths
> getting hit, by converting a domain crash into a host one.

I have test cases.  I didn't go searching for the BUG()s by code inspection.

> It is in particular relevant that a PV guest may be able to
> cheat and "guess" an MFN to obtain references for before a
> certain hypercall (or other operation) actually completed.

And do what with it?  PV guest's can't force typerefs for
pgtable/segdesc because we prohibited cross-pg_owner scenarios many XSAs
ago.  A PV guest also can't force it to none or shared.

That only leaves PGT_writable_page, which is the ref we're interested in
taking.

>> v2:
>>  * Reword the commit message.
>>  * Switch BUG() to BUG_ON() to further reduce code volume.
> Short of us explicitly agreeing that such is fine to use, I
> think we ought to stick to the previously (long ago) agreed
> rule that BUG_ON() controlling expressions should not have
> side effects, for us not wanting to guarantee it will now
> and forever _not_ behave like ASSERT() wrt to evaluating
> the expression.

So what you want is v1 of this patch.

~Andrew
Jan Beulich Jan. 20, 2021, 8:06 a.m. UTC | #4
On 19.01.2021 19:09, Andrew Cooper wrote:
> On 19/01/2021 16:48, Jan Beulich wrote:
>> On 19.01.2021 14:02, Andrew Cooper wrote:
>>> This code has been copied in 3 places, but it is problematic.
>>>
>>> All cases will hit a BUG() later in domain teardown, when a the missing
>>> type/count reference is underflowed.
>> I'm afraid I could use some help with this: Why would there
>> be a missing reference, when the getting of one failed?
> 
> Look at the cleanup logic for the associated fields.
> 
> Either the plain ref fails (impossible without other fatal refcounting
> errors AFAICT), or the typeref fails (a concern, but impossible AFAICT).

In principle I would agree, if there wasn't the question of
count overflows. The type count presently is 56 bits wide,
while the general refcount has 54 bits. It'll be a long time
until they overflow, but it's not impossible. The underlying
problem there that I see is - where do we draw the line
between "can't possibly overflow in practice" (as we would
typically assume for 64-bit counters) and "is to be expected
to overflow (as we would typically assume for 32-bit
counters)?

Also, as far as "impossible" here goes - the constructs all
anyway exist only to deal with what we consider impossible.
The question therefore really is of almost exclusively
theoretical nature, and hence something like a counter
possibly overflowing imo needs to be accounted for as
theoretically possible, albeit impossible with today's
computers and realistic timing assumptions. If a counter
overflow occurred, it definitely wouldn't be because of a
bug in Xen, but because of abnormal behavior elsewhere.
Hence I remain unconvinced it is appropriate to deal with
the situation by BUG().

But yes, if otoh we assume the failure here to be the result
of a bug elsewhere in Xen (and not an overflow), then BUG()
may be warranted. Yet afaic these constructs weren't meant
to deal with bugs elsewhere in Xen, but with the
"impossible". So if we change our collective mind here, I
think the conversion to BUG() would then need accompanying
by respective commentary.

> When the plain ref fails, put_page_alloc_ref() spots the underflow with
> a BUG, while if the typeref fails, it is _put_page_type()'s BUG which
> spots the underflow.

put_page_alloc_ref() puts the ref installed by assign_pages()
aiui. If that one underflows, a bad put must have been invoked
elsewhere, which then is the one to fix. _put_page_type()
spotting an underflow also means either that very invocation
shouldn't have happened, or an earlier one was issued in
error.

At the example of hvm_alloc_ioreq_mfn(), in case of hitting
the path in question the only ref to the page that exists is
the one from assign_pages(). And that's the only one that
will get dropped when the domain gets cleaned up. The page,
in particular, hasn't been recorded in iorp just yet. So I
don't see where any other put (general or type) would come
from.

>> IOW
>> I'm not (yet) convinced you don't make the impact more
>> severe in the (supposedly) impossible case of these paths
>> getting hit, by converting a domain crash into a host one.
> 
> I have test cases.  I didn't go searching for the BUG()s by code inspection.

I'd be curious to see what exactly they do, and why exactly
a BUG() results.

>> It is in particular relevant that a PV guest may be able to
>> cheat and "guess" an MFN to obtain references for before a
>> certain hypercall (or other operation) actually completed.
> 
> And do what with it?  PV guest's can't force typerefs for
> pgtable/segdesc because we prohibited cross-pg_owner scenarios many XSAs
> ago.  A PV guest also can't force it to none or shared.

Hmm, yes, the owning domain always is a HVM one. So even by
cooperating the type can't become other than PGT_writable_page.

> That only leaves PGT_writable_page, which is the ref we're interested in
> taking.
> 
>>> v2:
>>>  * Reword the commit message.
>>>  * Switch BUG() to BUG_ON() to further reduce code volume.
>> Short of us explicitly agreeing that such is fine to use, I
>> think we ought to stick to the previously (long ago) agreed
>> rule that BUG_ON() controlling expressions should not have
>> side effects, for us not wanting to guarantee it will now
>> and forever _not_ behave like ASSERT() wrt to evaluating
>> the expression.
> 
> So what you want is v1 of this patch.

Looks like so (assuming my concerns above can get sorted); the
versions came in so rapid succession that I didn't get around
to look at the earlier versions.

Jan
Andrew Cooper Jan. 25, 2021, 5:59 p.m. UTC | #5
On 20/01/2021 08:06, Jan Beulich wrote:
> On 19.01.2021 19:09, Andrew Cooper wrote:
>> On 19/01/2021 16:48, Jan Beulich wrote:
>>> On 19.01.2021 14:02, Andrew Cooper wrote:
>>>> This code has been copied in 3 places, but it is problematic.
>>>>
>>>> All cases will hit a BUG() later in domain teardown, when a the missing
>>>> type/count reference is underflowed.
>>> I'm afraid I could use some help with this: Why would there
>>> be a missing reference, when the getting of one failed?
>> Look at the cleanup logic for the associated fields.
>>
>> Either the plain ref fails (impossible without other fatal refcounting
>> errors AFAICT), or the typeref fails (a concern, but impossible AFAICT).
> In principle I would agree, if there wasn't the question of
> count overflows. The type count presently is 56 bits wide,
> while the general refcount has 54 bits. It'll be a long time
> until they overflow, but it's not impossible. The underlying
> problem there that I see is - where do we draw the line
> between "can't possibly overflow in practice" (as we would
> typically assume for 64-bit counters) and "is to be expected
> to overflow (as we would typically assume for 32-bit
> counters)?

Ok fine - I was treating 54 bits as "not going to happen in practice".

A PV guest needs 2^43 pages of RAM to turn into pagetables to approach
the general refcount limit.  This is more RAM than most people can
accord, and this is way in excess of our security supported limits.


Errors in this area are already hit BUGs in loads of cases, because that
is less bad than the alternatives.

In principle, and as previously discussed, some issues in this area
could be fixed by porting refcount_t from PaX/Linux KSPP which will turn
refcount overflows into memory leaks, which is an even less bad alternative.

>
> Also, as far as "impossible" here goes - the constructs all
> anyway exist only to deal with what we consider impossible.
> The question therefore really is of almost exclusively
> theoretical nature, and hence something like a counter
> possibly overflowing imo needs to be accounted for as
> theoretically possible, albeit impossible with today's
> computers and realistic timing assumptions. If a counter
> overflow occurred, it definitely wouldn't be because of a
> bug in Xen, but because of abnormal behavior elsewhere.
> Hence I remain unconvinced it is appropriate to deal with
> the situation by BUG().

I'm not sure how to be any clearer.

I am literally not changing the current behaviour.  Xen *will* hit a
BUG() if any of these domain_crash() paths are taken.

If you do not believe me, then please go and actually check what happens
when simulating a ref-acquisition failure.


What I am doing is removing complexity (the point of the change) which
gives a false sense of the error being survivable.

If you want to do something other than BUG() in these cases, then you
need to figure some way for the teardown logic to identify which ref
went missing, but this would be a different, follow-on patch.

> But yes, if otoh we assume the failure here to be the result
> of a bug elsewhere in Xen (and not an overflow), then BUG()
> may be warranted. Yet afaic these constructs weren't meant
> to deal with bugs elsewhere in Xen, but with the
> "impossible". So if we change our collective mind here, I
> think the conversion to BUG() would then need accompanying
> by respective commentary.

BUG() is, and has always been, Xen's way of dealing with impossibles,
particularly when it comes to memory handling.

This isn't a "changing minds" occasion.  Removals of BUG()s elsewhere
pertains to logical error based on guest state, which is indeed
inappropriate error handling.

~Andrew
Jan Beulich Jan. 26, 2021, 10:48 a.m. UTC | #6
On 25.01.2021 18:59, Andrew Cooper wrote:
> On 20/01/2021 08:06, Jan Beulich wrote:
>> On 19.01.2021 19:09, Andrew Cooper wrote:
>>> On 19/01/2021 16:48, Jan Beulich wrote:
>>>> On 19.01.2021 14:02, Andrew Cooper wrote:
>>>>> This code has been copied in 3 places, but it is problematic.
>>>>>
>>>>> All cases will hit a BUG() later in domain teardown, when a the missing
>>>>> type/count reference is underflowed.
>>>> I'm afraid I could use some help with this: Why would there
>>>> be a missing reference, when the getting of one failed?
>>> Look at the cleanup logic for the associated fields.
>>>
>>> Either the plain ref fails (impossible without other fatal refcounting
>>> errors AFAICT), or the typeref fails (a concern, but impossible AFAICT).
>> In principle I would agree, if there wasn't the question of
>> count overflows. The type count presently is 56 bits wide,
>> while the general refcount has 54 bits. It'll be a long time
>> until they overflow, but it's not impossible. The underlying
>> problem there that I see is - where do we draw the line
>> between "can't possibly overflow in practice" (as we would
>> typically assume for 64-bit counters) and "is to be expected
>> to overflow (as we would typically assume for 32-bit
>> counters)?
> 
> Ok fine - I was treating 54 bits as "not going to happen in practice".
> 
> A PV guest needs 2^43 pages of RAM to turn into pagetables to approach
> the general refcount limit.  This is more RAM than most people can
> accord, and this is way in excess of our security supported limits.
> 
> 
> Errors in this area are already hit BUGs in loads of cases, because that
> is less bad than the alternatives.
> 
> In principle, and as previously discussed, some issues in this area
> could be fixed by porting refcount_t from PaX/Linux KSPP which will turn
> refcount overflows into memory leaks, which is an even less bad alternative.
> 
>>
>> Also, as far as "impossible" here goes - the constructs all
>> anyway exist only to deal with what we consider impossible.
>> The question therefore really is of almost exclusively
>> theoretical nature, and hence something like a counter
>> possibly overflowing imo needs to be accounted for as
>> theoretically possible, albeit impossible with today's
>> computers and realistic timing assumptions. If a counter
>> overflow occurred, it definitely wouldn't be because of a
>> bug in Xen, but because of abnormal behavior elsewhere.
>> Hence I remain unconvinced it is appropriate to deal with
>> the situation by BUG().
> 
> I'm not sure how to be any clearer.
> 
> I am literally not changing the current behaviour.  Xen *will* hit a
> BUG() if any of these domain_crash() paths are taken.
> 
> If you do not believe me, then please go and actually check what happens
> when simulating a ref-acquisition failure.

Well, okay, if you don't want to share the knowledge you've
gained, I will have to go this route. Will likely take
longer though than if you had tried to clarify how a ref
would go "missing" and hence allow for an underflow. This
isn't a matter of believing you, but one of wanting to
understand the situation you say you've run into.

Jan

> What I am doing is removing complexity (the point of the change) which
> gives a false sense of the error being survivable.
> 
> If you want to do something other than BUG() in these cases, then you
> need to figure some way for the teardown logic to identify which ref
> went missing, but this would be a different, follow-on patch.
> 
>> But yes, if otoh we assume the failure here to be the result
>> of a bug elsewhere in Xen (and not an overflow), then BUG()
>> may be warranted. Yet afaic these constructs weren't meant
>> to deal with bugs elsewhere in Xen, but with the
>> "impossible". So if we change our collective mind here, I
>> think the conversion to BUG() would then need accompanying
>> by respective commentary.
> 
> BUG() is, and has always been, Xen's way of dealing with impossibles,
> particularly when it comes to memory handling.
> 
> This isn't a "changing minds" occasion.  Removals of BUG()s elsewhere
> pertains to logical error based on guest state, which is indeed
> inappropriate error handling.
> 
> ~Andrew
>
Jan Beulich Jan. 28, 2021, 2:48 p.m. UTC | #7
On 25.01.2021 18:59, Andrew Cooper wrote:
> I am literally not changing the current behaviour.  Xen *will* hit a
> BUG() if any of these domain_crash() paths are taken.
> 
> If you do not believe me, then please go and actually check what happens
> when simulating a ref-acquisition failure.

Okay, I've started with the debugging patch below. After the
other error handling fixed (see the patch just sent), this
works fine and - with the changes not marked "//temp" - even
stops leaking the page in that case. This latter fact proves
(to me) that at least on this path there's no ref lost
anywhere, and there's also no BUG() elsewhere that we would
trigger. As re-assurance I observed subsequent run attempts
of the same guest to end up re-using this same page for this
same purpose in a number of cases.

You patch altered two other, similar paths, and I can't
exclude there to be a problem there. Since the exercise was
useful for fixing the other two bugs anyway, I guess I'll do
the same for those paths later on and see what I get.

Jan

--- unstable.orig/xen/arch/x86/hvm/vmx/vmx.c
+++ unstable/xen/arch/x86/hvm/vmx/vmx.c
@@ -3076,13 +3076,22 @@ static int vmx_alloc_vlapic_mapping(stru
     if ( !pg )
         return -ENOMEM;
 
-    if ( !get_page_and_type(pg, d, PGT_writable_page) )
+printk("%pd: APIC access MFN: %lx (c=%lx t=%lx)\n", d, mfn_x(page_to_mfn(pg)), pg->count_info, pg->u.inuse.type_info);//temp
+//temp    if ( !get_page_and_type(pg, d, PGT_writable_page) )
     {
         /*
          * The domain can't possibly know about this page yet, so failure
          * here is a clear indication of something fishy going on.
          */
         domain_crash(d);
+        if ( get_page(pg, d) )
+        {
+            put_page_alloc_ref(pg);
+printk("%pd: MFN %lx: (c=%lx t=%lx)\n", d, mfn_x(page_to_mfn(pg)), pg->count_info, pg->u.inuse.type_info);//temp
+            put_page(pg);
+        }
+        else
+            printk("%pd: leaking MFN %lx\n", d, mfn_x(page_to_mfn(pg)));
         return -ENODATA;
     }
Jan Beulich Jan. 29, 2021, 11:29 a.m. UTC | #8
On 25.01.2021 18:59, Andrew Cooper wrote:
> On 20/01/2021 08:06, Jan Beulich wrote:
>> Also, as far as "impossible" here goes - the constructs all
>> anyway exist only to deal with what we consider impossible.
>> The question therefore really is of almost exclusively
>> theoretical nature, and hence something like a counter
>> possibly overflowing imo needs to be accounted for as
>> theoretically possible, albeit impossible with today's
>> computers and realistic timing assumptions. If a counter
>> overflow occurred, it definitely wouldn't be because of a
>> bug in Xen, but because of abnormal behavior elsewhere.
>> Hence I remain unconvinced it is appropriate to deal with
>> the situation by BUG().
> 
> I'm not sure how to be any clearer.
> 
> I am literally not changing the current behaviour.  Xen *will* hit a
> BUG() if any of these domain_crash() paths are taken.
> 
> If you do not believe me, then please go and actually check what happens
> when simulating a ref-acquisition failure.

So I've now also played the same game on the ioreq path (see
debugging patch below, and again with some non-"//temp"
changes actually improving overall behavior in that "impossible"
case). No BUG()s hit, no leaks (thanks to the extra changes),
no other anomalies observed.

Hence I'm afraid it is now really up to you to point out the
specific BUG()s (and additional context as necessary) that you
either believe could be hit, or that you have observed being hit.

Jan

--- unstable.orig/xen/arch/x86/hvm/ioreq.c
+++ unstable/xen/arch/x86/hvm/ioreq.c
@@ -366,13 +366,23 @@ static int hvm_alloc_ioreq_mfn(struct hv
     if ( !page )
         return -ENOMEM;
 
-    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
+printk("%pd: %sioreq MFN: %lx (c=%lx t=%lx)\n", s->target, buf ? "buf" : "", mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info);//temp
+//temp    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
+if((buf == (s->target->domain_id & 1)) || !get_page_and_type(page, s->target, PGT_writable_page))//temp
     {
         /*
          * The domain can't possibly know about this page yet, so failure
          * here is a clear indication of something fishy going on.
          */
-        domain_crash(s->emulator);
+        domain_crash(is_control_domain(s->emulator) ? s->target : s->emulator);
+        if ( get_page(page, s->target) )
+        {
+            put_page_alloc_ref(page);
+printk("%pd: MFN %lx: (c=%lx t=%lx)\n", s->target, mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info);//temp
+            put_page(page);
+        }
+        else
+            printk("%pd: leaking %pd MFN %lx\n", s->emulator, s->target, mfn_x(page_to_mfn(page)));
         return -ENODATA;
     }
 
@@ -385,6 +395,7 @@ static int hvm_alloc_ioreq_mfn(struct hv
     return 0;
 
  fail:
+printk("%pd: %sioreq mfn: %lx (c=%lx t=%lx)\n", s->target, buf ? "buf" : "", mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info);//temp
     put_page_alloc_ref(page);
     put_page_and_type(page);
 
@@ -404,6 +415,7 @@ static void hvm_free_ioreq_mfn(struct hv
     unmap_domain_page_global(iorp->va);
     iorp->va = NULL;
 
+printk("%pd: %sioreq mfn: %lx [c=%lx t=%lx]\n", s->target, buf ? "buf" : "", mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info);//temp
     put_page_alloc_ref(page);
     put_page_and_type(page);
 }
Andrew Cooper Jan. 29, 2021, 4:17 p.m. UTC | #9
On 29/01/2021 11:29, Jan Beulich wrote:
> On 25.01.2021 18:59, Andrew Cooper wrote:
>> On 20/01/2021 08:06, Jan Beulich wrote:
>>> Also, as far as "impossible" here goes - the constructs all
>>> anyway exist only to deal with what we consider impossible.
>>> The question therefore really is of almost exclusively
>>> theoretical nature, and hence something like a counter
>>> possibly overflowing imo needs to be accounted for as
>>> theoretically possible, albeit impossible with today's
>>> computers and realistic timing assumptions. If a counter
>>> overflow occurred, it definitely wouldn't be because of a
>>> bug in Xen, but because of abnormal behavior elsewhere.
>>> Hence I remain unconvinced it is appropriate to deal with
>>> the situation by BUG().
>> I'm not sure how to be any clearer.
>>
>> I am literally not changing the current behaviour.  Xen *will* hit a
>> BUG() if any of these domain_crash() paths are taken.
>>
>> If you do not believe me, then please go and actually check what happens
>> when simulating a ref-acquisition failure.
> So I've now also played the same game on the ioreq path (see
> debugging patch below, and again with some non-"//temp"
> changes actually improving overall behavior in that "impossible"
> case). No BUG()s hit, no leaks (thanks to the extra changes),
> no other anomalies observed.
>
> Hence I'm afraid it is now really up to you to point out the
> specific BUG()s (and additional context as necessary) that you
> either believe could be hit, or that you have observed being hit.

The refcounting logic was taken verbatim from ioreq, with the only
difference being an order greater than 0.  The logic is also identical
to the vlapic logic.

And the reason *why* it bugs is obvious - the cleanup logic
unconditionally put()'s refs it never took to begin with, and hits
underflow bugchecks.

For specifics, a simulated regular ref failure:

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 1051d86a20..314d258e31 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -171,9 +171,14 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
     v->vmtrace.buf = pg;
 
     for ( i = 0; i < d->vmtrace_frames; i++ )
+    {
+        if ( i == 0 )
+            return -ENOMEM;
+
         /* Domain can't know about this page yet - something fishy
going on. */
         if ( !get_page_and_type(&pg[i], d, PGT_writable_page) )
             BUG();
+    }
 
     return 0;
 }

and the simulated typeref failure:

diff --git a/xen/common/domain.c b/xen/common/domain.c
index db845ccc81..bd810157f4 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -172,8 +172,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
 
     for ( i = 0; i < d->vmtrace_frames; i++ )
     {
+        get_page(&pg[i], d);
+
         if ( i == 0 )
+        {
+            put_page(&pg[i]);
             return -ENOMEM;
+        }
+
+        get_page_type(&pg[i], PGT_writable_page);
+        continue;
 
         /* Domain can't know about this page yet - something fishy
going on. */
         if ( !get_page_and_type(&pg[i], d, PGT_writable_page) )

both yield:

(XEN) Xen BUG at /local/xen.git/xen/include/xen/mm.h:610
(XEN) RIP:    e008:[<ffff82d04020423e>]
common/domain.c#vmtrace_free_buffer+0x57/0xc1
(XEN) Xen call trace:
(XEN)    [<ffff82d04020423e>] R
common/domain.c#vmtrace_free_buffer+0x57/0xc1
(XEN)    [<ffff82d040205497>] F vcpu_create+0x245/0x32b
(XEN)    [<ffff82d04023ae5b>] F do_domctl+0xb48/0x1964
(XEN)    [<ffff82d04030c6b2>] F pv_hypercall+0x2e4/0x53d
(XEN)    [<ffff82d04039045d>] F lstar_enter+0x12d/0x140

~Andrew
Jan Beulich Jan. 29, 2021, 4:31 p.m. UTC | #10
On 29.01.2021 17:17, Andrew Cooper wrote:
> On 29/01/2021 11:29, Jan Beulich wrote:
>> On 25.01.2021 18:59, Andrew Cooper wrote:
>>> On 20/01/2021 08:06, Jan Beulich wrote:
>>>> Also, as far as "impossible" here goes - the constructs all
>>>> anyway exist only to deal with what we consider impossible.
>>>> The question therefore really is of almost exclusively
>>>> theoretical nature, and hence something like a counter
>>>> possibly overflowing imo needs to be accounted for as
>>>> theoretically possible, albeit impossible with today's
>>>> computers and realistic timing assumptions. If a counter
>>>> overflow occurred, it definitely wouldn't be because of a
>>>> bug in Xen, but because of abnormal behavior elsewhere.
>>>> Hence I remain unconvinced it is appropriate to deal with
>>>> the situation by BUG().
>>> I'm not sure how to be any clearer.
>>>
>>> I am literally not changing the current behaviour.  Xen *will* hit a
>>> BUG() if any of these domain_crash() paths are taken.
>>>
>>> If you do not believe me, then please go and actually check what happens
>>> when simulating a ref-acquisition failure.
>> So I've now also played the same game on the ioreq path (see
>> debugging patch below, and again with some non-"//temp"
>> changes actually improving overall behavior in that "impossible"
>> case). No BUG()s hit, no leaks (thanks to the extra changes),
>> no other anomalies observed.
>>
>> Hence I'm afraid it is now really up to you to point out the
>> specific BUG()s (and additional context as necessary) that you
>> either believe could be hit, or that you have observed being hit.
> 
> The refcounting logic was taken verbatim from ioreq, with the only
> difference being an order greater than 0.  The logic is also identical
> to the vlapic logic.
> 
> And the reason *why* it bugs is obvious - the cleanup logic
> unconditionally put()'s refs it never took to begin with, and hits
> underflow bugchecks.

In current staging, neither vmx_alloc_vlapic_mapping() nor
hvm_alloc_ioreq_mfn() put any refs they couldn't get. Hence
my failed attempt to repro your claimed misbehavior.

> For specifics, a simulated regular ref failure:
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 1051d86a20..314d258e31 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -171,9 +171,14 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
>      v->vmtrace.buf = pg;
>  
>      for ( i = 0; i < d->vmtrace_frames; i++ )
> +    {
> +        if ( i == 0 )
> +            return -ENOMEM;
> +
>          /* Domain can't know about this page yet - something fishy
> going on. */
>          if ( !get_page_and_type(&pg[i], d, PGT_writable_page) )
>              BUG();
> +    }

No bad put here. This should have BUG() replaced just like
the other two instances mentioned above have it.

> and the simulated typeref failure:
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index db845ccc81..bd810157f4 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -172,8 +172,16 @@ static int vmtrace_alloc_buffer(struct vcpu *v)
>  
>      for ( i = 0; i < d->vmtrace_frames; i++ )
>      {
> +        get_page(&pg[i], d);
> +
>          if ( i == 0 )
> +        {
> +            put_page(&pg[i]);

This of course if wrong is the get_page() failed. Assuming it
succeeds I don't see what's wrong here.

>              return -ENOMEM;
> +        }
> +
> +        get_page_type(&pg[i], PGT_writable_page);
> +        continue;
>  
>          /* Domain can't know about this page yet - something fishy
> going on. */
>          if ( !get_page_and_type(&pg[i], d, PGT_writable_page) )
> 
> both yield:
> 
> (XEN) Xen BUG at /local/xen.git/xen/include/xen/mm.h:610
> (XEN) RIP:    e008:[<ffff82d04020423e>]
> common/domain.c#vmtrace_free_buffer+0x57/0xc1
> (XEN) Xen call trace:
> (XEN)    [<ffff82d04020423e>] R
> common/domain.c#vmtrace_free_buffer+0x57/0xc1
> (XEN)    [<ffff82d040205497>] F vcpu_create+0x245/0x32b
> (XEN)    [<ffff82d04023ae5b>] F do_domctl+0xb48/0x1964
> (XEN)    [<ffff82d04030c6b2>] F pv_hypercall+0x2e4/0x53d
> (XEN)    [<ffff82d04039045d>] F lstar_enter+0x12d/0x140

So maybe vmtrace_free_buffer() is broken?

Jan
Andrew Cooper Jan. 29, 2021, 5:17 p.m. UTC | #11
On 29/01/2021 16:31, Jan Beulich wrote:
> On 29.01.2021 17:17, Andrew Cooper wrote:
>> On 29/01/2021 11:29, Jan Beulich wrote:
>>> On 25.01.2021 18:59, Andrew Cooper wrote:
>>>> On 20/01/2021 08:06, Jan Beulich wrote:
>>>>> Also, as far as "impossible" here goes - the constructs all
>>>>> anyway exist only to deal with what we consider impossible.
>>>>> The question therefore really is of almost exclusively
>>>>> theoretical nature, and hence something like a counter
>>>>> possibly overflowing imo needs to be accounted for as
>>>>> theoretically possible, albeit impossible with today's
>>>>> computers and realistic timing assumptions. If a counter
>>>>> overflow occurred, it definitely wouldn't be because of a
>>>>> bug in Xen, but because of abnormal behavior elsewhere.
>>>>> Hence I remain unconvinced it is appropriate to deal with
>>>>> the situation by BUG().
>>>> I'm not sure how to be any clearer.
>>>>
>>>> I am literally not changing the current behaviour.  Xen *will* hit a
>>>> BUG() if any of these domain_crash() paths are taken.
>>>>
>>>> If you do not believe me, then please go and actually check what happens
>>>> when simulating a ref-acquisition failure.
>>> So I've now also played the same game on the ioreq path (see
>>> debugging patch below, and again with some non-"//temp"
>>> changes actually improving overall behavior in that "impossible"
>>> case). No BUG()s hit, no leaks (thanks to the extra changes),
>>> no other anomalies observed.
>>>
>>> Hence I'm afraid it is now really up to you to point out the
>>> specific BUG()s (and additional context as necessary) that you
>>> either believe could be hit, or that you have observed being hit.
>> The refcounting logic was taken verbatim from ioreq, with the only
>> difference being an order greater than 0.  The logic is also identical
>> to the vlapic logic.
>>
>> And the reason *why* it bugs is obvious - the cleanup logic
>> unconditionally put()'s refs it never took to begin with, and hits
>> underflow bugchecks.
> In current staging, neither vmx_alloc_vlapic_mapping() nor
> hvm_alloc_ioreq_mfn() put any refs they couldn't get. Hence
> my failed attempt to repro your claimed misbehavior.

I think I've figured out what is going on.

They *look* as if they do, but the logic is deceptive.

We skip both puts in free_*() if the typeref failed, and rely on the
fact that the frame(s) are *also* on the domheap list for
relinquish_resources() to put the acquire ref.

Yet another bizzare recounting rule/behaviour which isn't written down.

My bug really was setting v->vmtrace.buf too early.  Furthermore, this
pattern is not safe for use in the domain_create() path, because it
currently depends on the domain being put on the domain list and
relinquish_resources() being called to avoid leaking memory.

I still argue that a typeref failure in these circumstances is the same
kind of impossibility that we use BUG for elsewhere, and therefore that
is what we should be doing.

~Andrew
Jan Beulich Feb. 1, 2021, 12:50 p.m. UTC | #12
On 29.01.2021 18:17, Andrew Cooper wrote:
> On 29/01/2021 16:31, Jan Beulich wrote:
>> On 29.01.2021 17:17, Andrew Cooper wrote:
>>> On 29/01/2021 11:29, Jan Beulich wrote:
>>>> On 25.01.2021 18:59, Andrew Cooper wrote:
>>>>> On 20/01/2021 08:06, Jan Beulich wrote:
>>>>>> Also, as far as "impossible" here goes - the constructs all
>>>>>> anyway exist only to deal with what we consider impossible.
>>>>>> The question therefore really is of almost exclusively
>>>>>> theoretical nature, and hence something like a counter
>>>>>> possibly overflowing imo needs to be accounted for as
>>>>>> theoretically possible, albeit impossible with today's
>>>>>> computers and realistic timing assumptions. If a counter
>>>>>> overflow occurred, it definitely wouldn't be because of a
>>>>>> bug in Xen, but because of abnormal behavior elsewhere.
>>>>>> Hence I remain unconvinced it is appropriate to deal with
>>>>>> the situation by BUG().
>>>>> I'm not sure how to be any clearer.
>>>>>
>>>>> I am literally not changing the current behaviour.  Xen *will* hit a
>>>>> BUG() if any of these domain_crash() paths are taken.
>>>>>
>>>>> If you do not believe me, then please go and actually check what happens
>>>>> when simulating a ref-acquisition failure.
>>>> So I've now also played the same game on the ioreq path (see
>>>> debugging patch below, and again with some non-"//temp"
>>>> changes actually improving overall behavior in that "impossible"
>>>> case). No BUG()s hit, no leaks (thanks to the extra changes),
>>>> no other anomalies observed.
>>>>
>>>> Hence I'm afraid it is now really up to you to point out the
>>>> specific BUG()s (and additional context as necessary) that you
>>>> either believe could be hit, or that you have observed being hit.
>>> The refcounting logic was taken verbatim from ioreq, with the only
>>> difference being an order greater than 0.  The logic is also identical
>>> to the vlapic logic.
>>>
>>> And the reason *why* it bugs is obvious - the cleanup logic
>>> unconditionally put()'s refs it never took to begin with, and hits
>>> underflow bugchecks.
>> In current staging, neither vmx_alloc_vlapic_mapping() nor
>> hvm_alloc_ioreq_mfn() put any refs they couldn't get. Hence
>> my failed attempt to repro your claimed misbehavior.
> 
> I think I've figured out what is going on.
> 
> They *look* as if they do, but the logic is deceptive.
> 
> We skip both puts in free_*() if the typeref failed, and rely on the
> fact that the frame(s) are *also* on the domheap list for
> relinquish_resources() to put the acquire ref.
> 
> Yet another bizzare recounting rule/behaviour which isn't written down.

But that's not the case - extra pages land on their own
list, which relinquish_resources() doesn't iterate. Hence
me saying we leak these pages on the domain_crash() paths,
and hence my repro attempt patches containing adjustments
to at least try to free those pages on those paths.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 1cc27df87f..0c38cfa151 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -366,15 +366,8 @@  static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
     if ( !page )
         return -ENOMEM;
 
-    if ( !get_page_and_type(page, s->target, PGT_writable_page) )
-    {
-        /*
-         * The domain can't possibly know about this page yet, so failure
-         * here is a clear indication of something fishy going on.
-         */
-        domain_crash(s->emulator);
-        return -ENODATA;
-    }
+    /* Domain can't know about this page yet - something fishy going on. */
+    BUG_ON(!get_page_and_type(page, s->target, PGT_writable_page));
 
     iorp->va = __map_domain_page_global(page);
     if ( !iorp->va )
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2d4475ee3d..4120234c15 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3042,15 +3042,8 @@  static int vmx_alloc_vlapic_mapping(struct domain *d)
     if ( !pg )
         return -ENOMEM;
 
-    if ( !get_page_and_type(pg, d, PGT_writable_page) )
-    {
-        /*
-         * The domain can't possibly know about this page yet, so failure
-         * here is a clear indication of something fishy going on.
-         */
-        domain_crash(d);
-        return -ENODATA;
-    }
+    /* Domain can't know about this page yet - something fishy going on. */
+    BUG_ON(!get_page_and_type(pg, d, PGT_writable_page));
 
     mfn = page_to_mfn(pg);
     clear_domain_page(mfn);
diff --git a/xen/arch/x86/mm/mem_paging.c b/xen/arch/x86/mm/mem_paging.c
index 01281f786e..60d667ae94 100644
--- a/xen/arch/x86/mm/mem_paging.c
+++ b/xen/arch/x86/mm/mem_paging.c
@@ -379,19 +379,10 @@  static int prepare(struct domain *d, gfn_t gfn,
         page = alloc_domheap_page(d, 0);
         if ( unlikely(page == NULL) )
             goto out;
-        if ( unlikely(!get_page(page, d)) )
-        {
-            /*
-             * The domain can't possibly know about this page yet, so failure
-             * here is a clear indication of something fishy going on.
-             */
-            gprintk(XENLOG_ERR,
-                    "%pd: fresh page for GFN %"PRI_gfn" in unexpected state\n",
-                    d, gfn_x(gfn));
-            domain_crash(d);
-            page = NULL;
-            goto out;
-        }
+
+        /* Domain can't know about this page yet - something fishy going on. */
+        BUG_ON(!get_page(page, d));
+
         mfn = page_to_mfn(page);
         page_extant = 0;