[v2] KVM: kvm_io_bus_unregister_dev() should never fail
diff mbox

Message ID 20170323172419.21435-1-david@redhat.com
State New
Headers show

Commit Message

David Hildenbrand March 23, 2017, 5:24 p.m. UTC
No caller currently checks the return value of
kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
freeing their device. A stale reference will remain in the io_bus,
getting at least used again, when the iobus gets teared down on
kvm_destroy_vm() - leading to use after free errors.

There is nothing the callers could do, except retrying over and over
again.

So let's simply remove the bus altogether, print an error and make
sure no one can access this broken bus again (returning -ENOMEM on any
attempt to access it).

Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
Cc: stable@vger.kernel.org # 3.4+
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---

Based on kvm/queue, where we just got 2a108a4e7c1 ("KVM: x86: clear bus
pointer when destroyed"), which added a check we need here.

v1 -> v2:
- added a check in kvm_destroy_vm()
- added a check in virt/kvm/eventfd.c

Using 'git grep -C 4 "kvm->buses"' should help to find all users. The other
user in virt/kvm/eventfd.c should be fine.

---
 include/linux/kvm_host.h |  4 ++--
 virt/kvm/eventfd.c       |  3 ++-
 virt/kvm/kvm_main.c      | 42 +++++++++++++++++++++++++-----------------
 3 files changed, 29 insertions(+), 20 deletions(-)

Comments

Cornelia Huck March 23, 2017, 5:31 p.m. UTC | #1
On Thu, 23 Mar 2017 18:24:19 +0100
David Hildenbrand <david@redhat.com> wrote:

> No caller currently checks the return value of
> kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
> freeing their device. A stale reference will remain in the io_bus,
> getting at least used again, when the iobus gets teared down on
> kvm_destroy_vm() - leading to use after free errors.
> 
> There is nothing the callers could do, except retrying over and over
> again.
> 
> So let's simply remove the bus altogether, print an error and make
> sure no one can access this broken bus again (returning -ENOMEM on any
> attempt to access it).
> 
> Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
> Cc: stable@vger.kernel.org # 3.4+
> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> 
> Based on kvm/queue, where we just got 2a108a4e7c1 ("KVM: x86: clear bus
> pointer when destroyed"), which added a check we need here.
> 
> v1 -> v2:
> - added a check in kvm_destroy_vm()
> - added a check in virt/kvm/eventfd.c
> 
> Using 'git grep -C 4 "kvm->buses"' should help to find all users. The other
> user in virt/kvm/eventfd.c should be fine.
> 
> ---
>  include/linux/kvm_host.h |  4 ++--
>  virt/kvm/eventfd.c       |  3 ++-
>  virt/kvm/kvm_main.c      | 42 +++++++++++++++++++++++++-----------------
>  3 files changed, 29 insertions(+), 20 deletions(-)

Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Marcelo Tosatti March 23, 2017, 8:42 p.m. UTC | #2
On Thu, Mar 23, 2017 at 06:24:19PM +0100, David Hildenbrand wrote:
> No caller currently checks the return value of
> kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
> freeing their device. A stale reference will remain in the io_bus,
> getting at least used again, when the iobus gets teared down on
> kvm_destroy_vm() - leading to use after free errors.
> 
> There is nothing the callers could do, except retrying over and over
> again.
> 
> So let's simply remove the bus altogether, print an error and make
> sure no one can access this broken bus again (returning -ENOMEM on any
> attempt to access it).
> 
> Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
> Cc: stable@vger.kernel.org # 3.4+
> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> 
> Based on kvm/queue, where we just got 2a108a4e7c1 ("KVM: x86: clear bus
> pointer when destroyed"), which added a check we need here.
> 
> v1 -> v2:
> - added a check in kvm_destroy_vm()
> - added a check in virt/kvm/eventfd.c
> 
> Using 'git grep -C 4 "kvm->buses"' should help to find all users. The other
> user in virt/kvm/eventfd.c should be fine.
> 
> ---
>  include/linux/kvm_host.h |  4 ++--
>  virt/kvm/eventfd.c       |  3 ++-
>  virt/kvm/kvm_main.c      | 42 +++++++++++++++++++++++++-----------------
>  3 files changed, 29 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2c14ad9..d025074 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>  		    int len, void *val);
>  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  			    int len, struct kvm_io_device *dev);
> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> -			      struct kvm_io_device *dev);
> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> +			       struct kvm_io_device *dev);
>  struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>  					 gpa_t addr);
>  
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index a29786d..4d28a9d 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
>  			continue;
>  
>  		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> -		kvm->buses[bus_idx]->ioeventfd_count--;
> +		if (kvm->buses[bus_idx])
> +			kvm->buses[bus_idx]->ioeventfd_count--;
>  		ioeventfd_release(p);
>  		ret = 0;
>  		break;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7445566..ef1aa7f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -728,7 +728,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	spin_unlock(&kvm_lock);
>  	kvm_free_irq_routing(kvm);
>  	for (i = 0; i < KVM_NR_BUSES; i++) {
> -		kvm_io_bus_destroy(kvm->buses[i]);
> +		if (kvm->buses[i])
> +			kvm_io_bus_destroy(kvm->buses[i]);
>  		kvm->buses[i] = NULL;
>  	}
>  	kvm_coalesced_mmio_free(kvm);
> @@ -3476,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>  	};
>  
>  	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> +	if (!bus)
> +		return -ENOMEM;
>  	r = __kvm_io_bus_write(vcpu, bus, &range, val);
>  	return r < 0 ? r : 0;
>  }
> @@ -3493,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
>  	};
>  
>  	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> +	if (!bus)
> +		return -ENOMEM;
>  
>  	/* First try the device referenced by cookie. */
>  	if ((cookie >= 0) && (cookie < bus->dev_count) &&
> @@ -3543,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>  	};
>  
>  	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> +	if (!bus)
> +		return -ENOMEM;
>  	r = __kvm_io_bus_read(vcpu, bus, &range, val);
>  	return r < 0 ? r : 0;
>  }
> @@ -3555,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  	struct kvm_io_bus *new_bus, *bus;
>  
>  	bus = kvm->buses[bus_idx];
> +	if (!bus)
> +		return -ENOMEM;
> +
>  	/* exclude ioeventfd which is limited by maximum fd */
>  	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
>  		return -ENOSPC;
> @@ -3574,45 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  }
>  
>  /* Caller must hold slots_lock. */
> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> -			      struct kvm_io_device *dev)
> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> +			       struct kvm_io_device *dev)
>  {
> -	int i, r;
> +	int i;
>  	struct kvm_io_bus *new_bus, *bus;
>  
>  	bus = kvm->buses[bus_idx];
> -
> -	/*
> -	 * It's possible the bus being released before hand. If so,
> -	 * we're done here.
> -	 */
>  	if (!bus)
> -		return 0;
> +		return;
>  
> -	r = -ENOENT;
>  	for (i = 0; i < bus->dev_count; i++)
>  		if (bus->range[i].dev == dev) {
> -			r = 0;
>  			break;
>  		}
>  
> -	if (r)
> -		return r;
> +	if (i == bus->dev_count)
> +		return;
>  
>  	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
>  			  sizeof(struct kvm_io_range)), GFP_KERNEL);
> -	if (!new_bus)
> -		return -ENOMEM;
> +	if (!new_bus)  {
> +		pr_err("kvm: failed to shrink bus, removing it completely\n");
> +		goto broken;

The guest will fail in mysterious ways, if you do this (and
io_bus_unregister_dev can be called during runtime): in-kernel device
accesses will fail with unknown behaviour in the guest.

Can't you retry a handful of times with GFP_KERNEL before switching to GFP_ATOMIC? 
(which in case fails the machine is likely to be crashing soon).
Dmitry Vyukov March 24, 2017, 8:48 a.m. UTC | #3
On Thu, Mar 23, 2017 at 9:42 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Mar 23, 2017 at 06:24:19PM +0100, David Hildenbrand wrote:
>> No caller currently checks the return value of
>> kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
>> freeing their device. A stale reference will remain in the io_bus,
>> getting at least used again, when the iobus gets teared down on
>> kvm_destroy_vm() - leading to use after free errors.
>>
>> There is nothing the callers could do, except retrying over and over
>> again.
>>
>> So let's simply remove the bus altogether, print an error and make
>> sure no one can access this broken bus again (returning -ENOMEM on any
>> attempt to access it).
>>
>> Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
>> Cc: stable@vger.kernel.org # 3.4+
>> Reported-by: Dmitry Vyukov <dvyukov@google.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>
>> Based on kvm/queue, where we just got 2a108a4e7c1 ("KVM: x86: clear bus
>> pointer when destroyed"), which added a check we need here.
>>
>> v1 -> v2:
>> - added a check in kvm_destroy_vm()
>> - added a check in virt/kvm/eventfd.c
>>
>> Using 'git grep -C 4 "kvm->buses"' should help to find all users. The other
>> user in virt/kvm/eventfd.c should be fine.
>>
>> ---
>>  include/linux/kvm_host.h |  4 ++--
>>  virt/kvm/eventfd.c       |  3 ++-
>>  virt/kvm/kvm_main.c      | 42 +++++++++++++++++++++++++-----------------
>>  3 files changed, 29 insertions(+), 20 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 2c14ad9..d025074 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>>                   int len, void *val);
>>  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>>                           int len, struct kvm_io_device *dev);
>> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>> -                           struct kvm_io_device *dev);
>> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>> +                            struct kvm_io_device *dev);
>>  struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>>                                        gpa_t addr);
>>
>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
>> index a29786d..4d28a9d 100644
>> --- a/virt/kvm/eventfd.c
>> +++ b/virt/kvm/eventfd.c
>> @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
>>                       continue;
>>
>>               kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
>> -             kvm->buses[bus_idx]->ioeventfd_count--;
>> +             if (kvm->buses[bus_idx])
>> +                     kvm->buses[bus_idx]->ioeventfd_count--;
>>               ioeventfd_release(p);
>>               ret = 0;
>>               break;
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 7445566..ef1aa7f 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -728,7 +728,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
>>       spin_unlock(&kvm_lock);
>>       kvm_free_irq_routing(kvm);
>>       for (i = 0; i < KVM_NR_BUSES; i++) {
>> -             kvm_io_bus_destroy(kvm->buses[i]);
>> +             if (kvm->buses[i])
>> +                     kvm_io_bus_destroy(kvm->buses[i]);
>>               kvm->buses[i] = NULL;
>>       }
>>       kvm_coalesced_mmio_free(kvm);
>> @@ -3476,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>>       };
>>
>>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
>> +     if (!bus)
>> +             return -ENOMEM;
>>       r = __kvm_io_bus_write(vcpu, bus, &range, val);
>>       return r < 0 ? r : 0;
>>  }
>> @@ -3493,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
>>       };
>>
>>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
>> +     if (!bus)
>> +             return -ENOMEM;
>>
>>       /* First try the device referenced by cookie. */
>>       if ((cookie >= 0) && (cookie < bus->dev_count) &&
>> @@ -3543,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>>       };
>>
>>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
>> +     if (!bus)
>> +             return -ENOMEM;
>>       r = __kvm_io_bus_read(vcpu, bus, &range, val);
>>       return r < 0 ? r : 0;
>>  }
>> @@ -3555,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>>       struct kvm_io_bus *new_bus, *bus;
>>
>>       bus = kvm->buses[bus_idx];
>> +     if (!bus)
>> +             return -ENOMEM;
>> +
>>       /* exclude ioeventfd which is limited by maximum fd */
>>       if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
>>               return -ENOSPC;
>> @@ -3574,45 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>>  }
>>
>>  /* Caller must hold slots_lock. */
>> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>> -                           struct kvm_io_device *dev)
>> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>> +                            struct kvm_io_device *dev)
>>  {
>> -     int i, r;
>> +     int i;
>>       struct kvm_io_bus *new_bus, *bus;
>>
>>       bus = kvm->buses[bus_idx];
>> -
>> -     /*
>> -      * It's possible the bus being released before hand. If so,
>> -      * we're done here.
>> -      */
>>       if (!bus)
>> -             return 0;
>> +             return;
>>
>> -     r = -ENOENT;
>>       for (i = 0; i < bus->dev_count; i++)
>>               if (bus->range[i].dev == dev) {
>> -                     r = 0;
>>                       break;
>>               }
>>
>> -     if (r)
>> -             return r;
>> +     if (i == bus->dev_count)
>> +             return;
>>
>>       new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
>>                         sizeof(struct kvm_io_range)), GFP_KERNEL);
>> -     if (!new_bus)
>> -             return -ENOMEM;
>> +     if (!new_bus)  {
>> +             pr_err("kvm: failed to shrink bus, removing it completely\n");
>> +             goto broken;
>
> The guest will fail in mysterious ways, if you do this (and
> io_bus_unregister_dev can be called during runtime): in-kernel device
> accesses will fail with unknown behaviour in the guest.
>
> Can't you retry a handful of times with GFP_KERNEL before switching to GFP_ATOMIC?
> (which in case fails the machine is likely to be crashing soon).

The process can run in a cgroup, then kmalloc failure has nothing to
do with overall memory consumption. Machine can be perfectly fine.
Also, this very process can be chosen as an OOM kill target, then it
needs to gracefully deal with kmalloc failure and proceed to a
termination point.
Generally retrying something in a loop does not look like a solid plan
to deal with errors.
David Hildenbrand March 24, 2017, 8:55 a.m. UTC | #4
>>> -             return r;
>>> +     if (i == bus->dev_count)
>>> +             return;
>>>
>>>       new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
>>>                         sizeof(struct kvm_io_range)), GFP_KERNEL);
>>> -     if (!new_bus)
>>> -             return -ENOMEM;
>>> +     if (!new_bus)  {
>>> +             pr_err("kvm: failed to shrink bus, removing it completely\n");
>>> +             goto broken;
>>
>> The guest will fail in mysterious ways, if you do this (and
>> io_bus_unregister_dev can be called during runtime): in-kernel device
>> accesses will fail with unknown behaviour in the guest.

Actually, the next access to the BUS should result in -ENOMEM. And the
error message should be enough to then figure out what went wrong.
However, to hit this scenario at all feels very unlikely. So I would
like to avoid advanced allocation schemes.

>>
>> Can't you retry a handful of times with GFP_KERNEL before switching to GFP_ATOMIC?
>> (which in case fails the machine is likely to be crashing soon).
> 
> The process can run in a cgroup, then kmalloc failure has nothing to
> do with overall memory consumption. Machine can be perfectly fine.
> Also, this very process can be chosen as an OOM kill target, then it
> needs to gracefully deal with kmalloc failure and proceed to a
> termination point.
> Generally retrying something in a loop does not look like a solid plan
> to deal with errors.
> 

I agree, looping on memory allocations never feels like the right thing
to do.
Cornelia Huck March 24, 2017, 9:33 a.m. UTC | #5
On Fri, 24 Mar 2017 09:55:15 +0100
David Hildenbrand <david@redhat.com> wrote:

> 
> >>> -             return r;
> >>> +     if (i == bus->dev_count)
> >>> +             return;
> >>>
> >>>       new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
> >>>                         sizeof(struct kvm_io_range)), GFP_KERNEL);
> >>> -     if (!new_bus)
> >>> -             return -ENOMEM;
> >>> +     if (!new_bus)  {
> >>> +             pr_err("kvm: failed to shrink bus, removing it completely\n");
> >>> +             goto broken;
> >>
> >> The guest will fail in mysterious ways, if you do this (and
> >> io_bus_unregister_dev can be called during runtime): in-kernel device
> >> accesses will fail with unknown behaviour in the guest.
> 
> Actually, the next access to the BUS should result in -ENOMEM. And the
> error message should be enough to then figure out what went wrong.

Hopefully, an admin will look at the logs :)

But yes, the patch should have caught all issues in the host, and the
guest will basically be presented with broken "hardware".

> However, to hit this scenario at all feels very unlikely. So I would
> like to avoid advanced allocation schemes.

Agreed, spending too much time on complex recovery scenarios is
overkill for this unlikely case.
Marcelo Tosatti March 24, 2017, 10:33 a.m. UTC | #6
On Fri, Mar 24, 2017 at 09:48:31AM +0100, Dmitry Vyukov wrote:
> On Thu, Mar 23, 2017 at 9:42 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Thu, Mar 23, 2017 at 06:24:19PM +0100, David Hildenbrand wrote:
> >> No caller currently checks the return value of
> >> kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
> >> freeing their device. A stale reference will remain in the io_bus,
> >> getting at least used again, when the iobus gets teared down on
> >> kvm_destroy_vm() - leading to use after free errors.
> >>
> >> There is nothing the callers could do, except retrying over and over
> >> again.
> >>
> >> So let's simply remove the bus altogether, print an error and make
> >> sure no one can access this broken bus again (returning -ENOMEM on any
> >> attempt to access it).
> >>
> >> Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
> >> Cc: stable@vger.kernel.org # 3.4+
> >> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> >> Signed-off-by: David Hildenbrand <david@redhat.com>
> >> ---
> >>
> >> Based on kvm/queue, where we just got 2a108a4e7c1 ("KVM: x86: clear bus
> >> pointer when destroyed"), which added a check we need here.
> >>
> >> v1 -> v2:
> >> - added a check in kvm_destroy_vm()
> >> - added a check in virt/kvm/eventfd.c
> >>
> >> Using 'git grep -C 4 "kvm->buses"' should help to find all users. The other
> >> user in virt/kvm/eventfd.c should be fine.
> >>
> >> ---
> >>  include/linux/kvm_host.h |  4 ++--
> >>  virt/kvm/eventfd.c       |  3 ++-
> >>  virt/kvm/kvm_main.c      | 42 +++++++++++++++++++++++++-----------------
> >>  3 files changed, 29 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index 2c14ad9..d025074 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
> >>                   int len, void *val);
> >>  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >>                           int len, struct kvm_io_device *dev);
> >> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> >> -                           struct kvm_io_device *dev);
> >> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> >> +                            struct kvm_io_device *dev);
> >>  struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> >>                                        gpa_t addr);
> >>
> >> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> >> index a29786d..4d28a9d 100644
> >> --- a/virt/kvm/eventfd.c
> >> +++ b/virt/kvm/eventfd.c
> >> @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
> >>                       continue;
> >>
> >>               kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> >> -             kvm->buses[bus_idx]->ioeventfd_count--;
> >> +             if (kvm->buses[bus_idx])
> >> +                     kvm->buses[bus_idx]->ioeventfd_count--;
> >>               ioeventfd_release(p);
> >>               ret = 0;
> >>               break;
> >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >> index 7445566..ef1aa7f 100644
> >> --- a/virt/kvm/kvm_main.c
> >> +++ b/virt/kvm/kvm_main.c
> >> @@ -728,7 +728,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
> >>       spin_unlock(&kvm_lock);
> >>       kvm_free_irq_routing(kvm);
> >>       for (i = 0; i < KVM_NR_BUSES; i++) {
> >> -             kvm_io_bus_destroy(kvm->buses[i]);
> >> +             if (kvm->buses[i])
> >> +                     kvm_io_bus_destroy(kvm->buses[i]);
> >>               kvm->buses[i] = NULL;
> >>       }
> >>       kvm_coalesced_mmio_free(kvm);
> >> @@ -3476,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
> >>       };
> >>
> >>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> >> +     if (!bus)
> >> +             return -ENOMEM;
> >>       r = __kvm_io_bus_write(vcpu, bus, &range, val);
> >>       return r < 0 ? r : 0;
> >>  }
> >> @@ -3493,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
> >>       };
> >>
> >>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> >> +     if (!bus)
> >> +             return -ENOMEM;
> >>
> >>       /* First try the device referenced by cookie. */
> >>       if ((cookie >= 0) && (cookie < bus->dev_count) &&
> >> @@ -3543,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
> >>       };
> >>
> >>       bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
> >> +     if (!bus)
> >> +             return -ENOMEM;
> >>       r = __kvm_io_bus_read(vcpu, bus, &range, val);
> >>       return r < 0 ? r : 0;
> >>  }
> >> @@ -3555,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >>       struct kvm_io_bus *new_bus, *bus;
> >>
> >>       bus = kvm->buses[bus_idx];
> >> +     if (!bus)
> >> +             return -ENOMEM;
> >> +
> >>       /* exclude ioeventfd which is limited by maximum fd */
> >>       if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
> >>               return -ENOSPC;
> >> @@ -3574,45 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >>  }
> >>
> >>  /* Caller must hold slots_lock. */
> >> -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> >> -                           struct kvm_io_device *dev)
> >> +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> >> +                            struct kvm_io_device *dev)
> >>  {
> >> -     int i, r;
> >> +     int i;
> >>       struct kvm_io_bus *new_bus, *bus;
> >>
> >>       bus = kvm->buses[bus_idx];
> >> -
> >> -     /*
> >> -      * It's possible the bus being released before hand. If so,
> >> -      * we're done here.
> >> -      */
> >>       if (!bus)
> >> -             return 0;
> >> +             return;
> >>
> >> -     r = -ENOENT;
> >>       for (i = 0; i < bus->dev_count; i++)
> >>               if (bus->range[i].dev == dev) {
> >> -                     r = 0;
> >>                       break;
> >>               }
> >>
> >> -     if (r)
> >> -             return r;
> >> +     if (i == bus->dev_count)
> >> +             return;
> >>
> >>       new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
> >>                         sizeof(struct kvm_io_range)), GFP_KERNEL);
> >> -     if (!new_bus)
> >> -             return -ENOMEM;
> >> +     if (!new_bus)  {
> >> +             pr_err("kvm: failed to shrink bus, removing it completely\n");
> >> +             goto broken;
> >
> > The guest will fail in mysterious ways, if you do this (and
> > io_bus_unregister_dev can be called during runtime): in-kernel device
> > accesses will fail with unknown behaviour in the guest.
> >
> > Can't you retry a handful of times with GFP_KERNEL before switching to GFP_ATOMIC?
> > (which in case fails the machine is likely to be crashing soon).
> 
> The process can run in a cgroup, then kmalloc failure has nothing to
> do with overall memory consumption. Machine can be perfectly fine.
> Also, this very process can be chosen as an OOM kill target, then it
> needs to gracefully deal with kmalloc failure and proceed to a
> termination point.
> Generally retrying something in a loop does not look like a solid plan
> to deal with errors.

Thats a good point.

Still dislike the "unregister all bus devices" as that renders the guest
unuseable. Can't you fail gracefully? Say force qemu error.
Cornelia Huck March 24, 2017, 2:40 p.m. UTC | #7
On Fri, 24 Mar 2017 07:33:01 -0300
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> Still dislike the "unregister all bus devices" as that renders the guest
> unuseable. Can't you fail gracefully? Say force qemu error.

The callers are probably too varied to find a common way to do that
(multiply with different user space tools).

At least for some cases, the guest will be terminated in the near
future anyway: For example, on s390 a NULL bus will cause an error to
be propagated through KVM_RUN on the next guest->host virtio
notification, which will cause qemu to terminate the guest.

This situation is not ideal, but this is probably the most reasonable
approach.

Patch
diff mbox

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c14ad9..d025074 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -162,8 +162,8 @@  int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
-int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			      struct kvm_io_device *dev);
+void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			       struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 					 gpa_t addr);
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a29786d..4d28a9d 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -870,7 +870,8 @@  kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 			continue;
 
 		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
-		kvm->buses[bus_idx]->ioeventfd_count--;
+		if (kvm->buses[bus_idx])
+			kvm->buses[bus_idx]->ioeventfd_count--;
 		ioeventfd_release(p);
 		ret = 0;
 		break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7445566..ef1aa7f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -728,7 +728,8 @@  static void kvm_destroy_vm(struct kvm *kvm)
 	spin_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
-		kvm_io_bus_destroy(kvm->buses[i]);
+		if (kvm->buses[i])
+			kvm_io_bus_destroy(kvm->buses[i]);
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
@@ -3476,6 +3477,8 @@  int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return -ENOMEM;
 	r = __kvm_io_bus_write(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
@@ -3493,6 +3496,8 @@  int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
 	};
 
 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return -ENOMEM;
 
 	/* First try the device referenced by cookie. */
 	if ((cookie >= 0) && (cookie < bus->dev_count) &&
@@ -3543,6 +3548,8 @@  int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 	};
 
 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	if (!bus)
+		return -ENOMEM;
 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
@@ -3555,6 +3562,9 @@  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
+	if (!bus)
+		return -ENOMEM;
+
 	/* exclude ioeventfd which is limited by maximum fd */
 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
@@ -3574,45 +3584,41 @@  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			      struct kvm_io_device *dev)
+void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			       struct kvm_io_device *dev)
 {
-	int i, r;
+	int i;
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
-
-	/*
-	 * It's possible the bus being released before hand. If so,
-	 * we're done here.
-	 */
 	if (!bus)
-		return 0;
+		return;
 
-	r = -ENOENT;
 	for (i = 0; i < bus->dev_count; i++)
 		if (bus->range[i].dev == dev) {
-			r = 0;
 			break;
 		}
 
-	if (r)
-		return r;
+	if (i == bus->dev_count)
+		return;
 
 	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
-	if (!new_bus)
-		return -ENOMEM;
+	if (!new_bus)  {
+		pr_err("kvm: failed to shrink bus, removing it completely\n");
+		goto broken;
+	}
 
 	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
 	new_bus->dev_count--;
 	memcpy(new_bus->range + i, bus->range + i + 1,
 	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
 
+broken:
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
 	kfree(bus);
-	return r;
+	return;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -3625,6 +3631,8 @@  struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	if (!bus)
+		goto out_unlock;
 
 	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
 	if (dev_idx < 0)