Message ID: 1493028624-29837-2-git-send-email-suzuki.poulose@arm.com (mailing list archive)
State: New, archived
On Mon, Apr 24, 2017 at 11:10:23AM +0100, Suzuki K Poulose wrote:
> The KVM uses mmu_notifier (wherever available) to keep track
> of the changes to the mm of the guest. The guest shadow page
> tables are released when the VM exits via mmu_notifier->ops.release().
> There is a rare chance that the mmu_notifier->release could be
> called more than once via two different paths, which could end
> up in use-after-free of kvm instance (such as [0]).
>
> e.g:
>
> thread A                               thread B
> -------                                --------------
>
>  get_signal->                          kvm_destroy_vm()->
>   do_exit->                             mmu_notifier_unregister->
>    exit_mm->                             kvm_arch_flush_shadow_all()->
>     exit_mmap->                            spin_lock(&kvm->mmu_lock)
>      mmu_notifier_release->                ....
>       kvm_arch_flush_shadow_all()->        .....
>        ...  spin_lock(&kvm->mmu_lock)      .....
>                                            spin_unlock(&kvm->mmu_lock)
>                                         kvm_arch_free_kvm()
>  *** use after free of kvm ***
>
> This patch attempts to solve the problem by holding a reference to the KVM
> for the mmu_notifier, which is dropped only from notifier->ops.release().
> This will ensure that the KVM struct is available till we reach the
> kvm_mmu_notifier_release, and the kvm_destroy_vm is called only from/after
> it. So, we can unregister the notifier with no_release option and hence
> avoiding the race above. However, we need to make sure that the KVM is
> freed only after the mmu_notifier has finished processing the notifier due to
> the following possible path of execution :
>
> mmu_notifier_release -> kvm_mmu_notifier_release -> kvm_put_kvm ->
>  kvm_destroy_vm -> kvm_arch_free_kvm
>
> [0] http://lkml.kernel.org/r/CAAeHK+x8udHKq9xa1zkTO6ax5E8Dk32HYWfaT05FMchL2cr48g@mail.gmail.com
>
> Fixes: commit 85db06e514422 ("KVM: mmu_notifiers release method")
> Reported-by: andreyknvl@google.com
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Cc: Christoffer Dall <christoffer.dall@linaro.org>
> Cc: andreyknvl@google.com
> Cc: Marc Zyngier <marc.zyngier@arm.com>
> Tested-by: Mark Rutland <mark.rutland@arm.com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>

This looks good to me, but we should have some KVM generic experts look at it
as well.
Reviewed-by: Christoffer Dall <cdall@linaro.org> > --- > include/linux/kvm_host.h | 1 + > virt/kvm/kvm_main.c | 59 ++++++++++++++++++++++++++++++++++++++++++------ > 2 files changed, 53 insertions(+), 7 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index d025074..561e968 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -424,6 +424,7 @@ struct kvm { > struct mmu_notifier mmu_notifier; > unsigned long mmu_notifier_seq; > long mmu_notifier_count; > + struct rcu_head mmu_notifier_rcu; > #endif > long tlbs_dirty; > struct list_head devices; > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 88257b3..2c3fdd4 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -471,6 +471,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, > idx = srcu_read_lock(&kvm->srcu); > kvm_arch_flush_shadow_all(kvm); > srcu_read_unlock(&kvm->srcu, idx); > + kvm_put_kvm(kvm); > } > > static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { > @@ -486,8 +487,46 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { > > static int kvm_init_mmu_notifier(struct kvm *kvm) > { > + int rc; > kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; > - return mmu_notifier_register(&kvm->mmu_notifier, current->mm); > + rc = mmu_notifier_register(&kvm->mmu_notifier, current->mm); > + /* > + * We hold a reference to KVM here to make sure that the KVM > + * doesn't get free'd before ops->release() completes. > + */ > + if (!rc) > + kvm_get_kvm(kvm); > + return rc; > +} > + > +static void kvm_free_vm_rcu(struct rcu_head *rcu) > +{ > + struct kvm *kvm = container_of(rcu, struct kvm, mmu_notifier_rcu); > + kvm_arch_free_vm(kvm); > +} > + > +static void kvm_flush_shadow_mmu(struct kvm *kvm) > +{ > + /* > + * We hold a reference to kvm instance for mmu_notifier and is > + * only released when ops->release() is called via exit_mmap path. > + * So, when we reach here ops->release() has been called already, which > + * flushes the shadow page tables. Hence there is no need to call the > + * release() again when we unregister the notifier. However, we need > + * to delay freeing up the kvm until the release() completes, since > + * we could reach here via : > + * kvm_mmu_notifier_release() -> kvm_put_kvm() -> kvm_destroy_vm() > + */ > + mmu_notifier_unregister_no_release(&kvm->mmu_notifier, kvm->mm); > +} > + > +static void kvm_free_vm(struct kvm *kvm) > +{ > + /* > + * Wait until the mmu_notifier has finished the release(). > + * See comments above in kvm_flush_shadow_mmu. 
> + */ > + mmu_notifier_call_srcu(&kvm->mmu_notifier_rcu, kvm_free_vm_rcu); > } > > #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ > @@ -497,6 +536,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) > return 0; > } > > +static void kvm_flush_shadow_mmu(struct kvm *kvm) > +{ > + kvm_arch_flush_shadow_all(kvm); > +} > + > +static void kvm_free_vm(struct kvm *kvm) > +{ > + kvm_arch_free_vm(kvm); > +} > + > #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ > > static struct kvm_memslots *kvm_alloc_memslots(void) > @@ -733,18 +782,14 @@ static void kvm_destroy_vm(struct kvm *kvm) > kvm->buses[i] = NULL; > } > kvm_coalesced_mmio_free(kvm); > -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) > - mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); > -#else > - kvm_arch_flush_shadow_all(kvm); > -#endif > + kvm_flush_shadow_mmu(kvm); > kvm_arch_destroy_vm(kvm); > kvm_destroy_devices(kvm); > for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) > kvm_free_memslots(kvm, kvm->memslots[i]); > cleanup_srcu_struct(&kvm->irq_srcu); > cleanup_srcu_struct(&kvm->srcu); > - kvm_arch_free_vm(kvm); > + kvm_free_vm(kvm); > preempt_notifier_dec(); > hardware_disable_all(); > mmdrop(mm); > -- > 2.7.4 >
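The reference-counting half of the scheme in the patch above can be modelled outside the kernel. The following is a minimal userspace sketch, not KVM code: struct vm, vm_get()/vm_put() and release_callback() are made-up names, and the SRCU-deferred free (what mmu_notifier_call_srcu() provides in the patch) is not modelled. It only shows that whichever side drops the last reference performs the free, so a late "release" callback never touches freed memory.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vm {
            atomic_int refcount;
    };

    static void vm_get(struct vm *vm)
    {
            atomic_fetch_add(&vm->refcount, 1);
    }

    static void vm_put(struct vm *vm)
    {
            /* Whoever drops the last reference frees the object, so a late
             * "release" callback can never run against freed memory. */
            if (atomic_fetch_sub(&vm->refcount, 1) == 1) {
                    printf("freeing vm\n");
                    free(vm);
            }
    }

    /* Stands in for mmu_notifier ->release(): flush, then drop our reference. */
    static void *release_callback(void *arg)
    {
            struct vm *vm = arg;
            /* ... flush shadow page tables ... */
            vm_put(vm);
            return NULL;
    }

    int main(void)
    {
            struct vm *vm = calloc(1, sizeof(*vm));
            pthread_t t;

            atomic_init(&vm->refcount, 1);  /* reference owned by the teardown path */
            vm_get(vm);                     /* extra reference for the "notifier" */

            pthread_create(&t, NULL, release_callback, vm);
            pthread_join(t, NULL);

            vm_put(vm);                     /* teardown drops its reference last here */
            return 0;
    }

Built with -pthread; regardless of which vm_put() turns out to be the last one, the free happens exactly once.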
2017-04-24 11:10+0100, Suzuki K Poulose:
> The KVM uses mmu_notifier (wherever available) to keep track
> of the changes to the mm of the guest. The guest shadow page
> tables are released when the VM exits via mmu_notifier->ops.release().
> There is a rare chance that the mmu_notifier->release could be
> called more than once via two different paths, which could end
> up in use-after-free of kvm instance (such as [0]).
>
> e.g:
>
> thread A                               thread B
> -------                                --------------
>
>  get_signal->                          kvm_destroy_vm()->
>   do_exit->                             mmu_notifier_unregister->
>    exit_mm->                             kvm_arch_flush_shadow_all()->
>     exit_mmap->                            spin_lock(&kvm->mmu_lock)
>      mmu_notifier_release->                ....
>       kvm_arch_flush_shadow_all()->        .....
>        ...  spin_lock(&kvm->mmu_lock)      .....
>                                            spin_unlock(&kvm->mmu_lock)
>                                         kvm_arch_free_kvm()
>  *** use after free of kvm ***

I don't understand this race ...
a piece of code in mmu_notifier_unregister() says:

        /*
         * Wait for any running method to finish, of course including
         * ->release if it was run by mmu_notifier_release instead of us.
         */
        synchronize_srcu(&srcu);

and code before that removes the notifier from the list, so it cannot be
called after we pass this point.  mmu_notifier_release() does roughly
the same and explains it as:

        /*
         * synchronize_srcu here prevents mmu_notifier_release from returning to
         * exit_mmap (which would proceed with freeing all pages in the mm)
         * until the ->release method returns, if it was invoked by
         * mmu_notifier_unregister.
         *
         * The mmu_notifier_mm can't go away from under us because one mm_count
         * is held by exit_mmap.
         */
        synchronize_srcu(&srcu);

The call of mmu_notifier->release is protected by srcu in both cases and
while it seems possible that mmu_notifier->release would be called
twice, I don't see a combination that could result in use-after-free
from mmu_notifier_release after mmu_notifier_unregister() has returned.

Doesn't [2/2] solve the exact same issue (that the release method cannot
be called twice in parallel)?

Thanks.
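The guarantee Radim is relying on can be stated compactly. The sketch below is kernel-style but illustrative only: example_srcu, release_cb, reader_side() and updater_side() are made-up names, and the real code unhashes an hlist entry rather than clearing a pointer. The point is that once the updater has hidden the callback and synchronize_srcu() has returned, no reader that could still have seen it is left inside its read-side critical section.

    #include <linux/compiler.h>
    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(example_srcu);

    static void (*release_cb)(void);        /* stands in for mn->ops->release */

    static void reader_side(void)
    {
            void (*cb)(void);
            int idx;

            idx = srcu_read_lock(&example_srcu);
            cb = READ_ONCE(release_cb);
            if (cb)
                    cb();                   /* runs inside the read-side section */
            srcu_read_unlock(&example_srcu, idx);
    }

    static void updater_side(void)
    {
            WRITE_ONCE(release_cb, NULL);   /* "remove the notifier from the list" */
            synchronize_srcu(&example_srcu);
            /*
             * Every reader_side() that might have seen the old callback has now
             * left its SRCU read-side critical section, so the callback cannot
             * still be running past this point.
             */
    }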
On 25/04/17 19:49, Radim Krčmář wrote:
> 2017-04-24 11:10+0100, Suzuki K Poulose:
>> The KVM uses mmu_notifier (wherever available) to keep track
>> of the changes to the mm of the guest. The guest shadow page
>> tables are released when the VM exits via mmu_notifier->ops.release().
>> There is a rare chance that the mmu_notifier->release could be
>> called more than once via two different paths, which could end
>> up in use-after-free of kvm instance (such as [0]).
>>
>> e.g:
>>
>> thread A                               thread B
>> -------                                --------------
>>
>>  get_signal->                          kvm_destroy_vm()->
>>   do_exit->                             mmu_notifier_unregister->
>>    exit_mm->                             kvm_arch_flush_shadow_all()->
>>     exit_mmap->                            spin_lock(&kvm->mmu_lock)
>>      mmu_notifier_release->                ....
>>       kvm_arch_flush_shadow_all()->        .....
>>        ...  spin_lock(&kvm->mmu_lock)      .....
>>                                            spin_unlock(&kvm->mmu_lock)
>>                                         kvm_arch_free_kvm()
>>  *** use after free of kvm ***
>
> I don't understand this race ...
> a piece of code in mmu_notifier_unregister() says:
>
>         /*
>          * Wait for any running method to finish, of course including
>          * ->release if it was run by mmu_notifier_release instead of us.
>          */
>         synchronize_srcu(&srcu);
>
> and code before that removes the notifier from the list, so it cannot be
> called after we pass this point.  mmu_notifier_release() does roughly
> the same and explains it as:
>
>         /*
>          * synchronize_srcu here prevents mmu_notifier_release from returning to
>          * exit_mmap (which would proceed with freeing all pages in the mm)
>          * until the ->release method returns, if it was invoked by
>          * mmu_notifier_unregister.
>          *
>          * The mmu_notifier_mm can't go away from under us because one mm_count
>          * is held by exit_mmap.
>          */
>         synchronize_srcu(&srcu);
>
> The call of mmu_notifier->release is protected by srcu in both cases and
> while it seems possible that mmu_notifier->release would be called
> twice, I don't see a combination that could result in use-after-free
> from mmu_notifier_release after mmu_notifier_unregister() has returned.

Thanks for bringing it up. Even I am wondering why this is triggered ! (But it
does get triggered for sure !!)

The only difference I can spot with _unregister & _release paths are the way
we use src_read_lock across the deletion of the entry from the list.

In mmu_notifier_unregister() we do :

        id = srcu_read_lock(&srcu);
        /*
         * exit_mmap will block in mmu_notifier_release to guarantee
         * that ->release is called before freeing the pages.
         */
        if (mn->ops->release)
                mn->ops->release(mn, mm);
        srcu_read_unlock(&srcu, id);

  ## Releases the srcu lock here and then goes on to grab the spin_lock.

        spin_lock(&mm->mmu_notifier_mm->lock);
        /*
         * Can not use list_del_rcu() since __mmu_notifier_release
         * can delete it before we hold the lock.
         */
        hlist_del_init_rcu(&mn->hlist);
        spin_unlock(&mm->mmu_notifier_mm->lock);

While in mmu_notifier_release() we hold it until the node(s) are deleted from the
list :

        /*
         * SRCU here will block mmu_notifier_unregister until
         * ->release returns.
         */
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
                /*
                 * If ->release runs before mmu_notifier_unregister it must be
                 * handled, as it's the only way for the driver to flush all
                 * existing sptes and stop the driver from establishing any more
                 * sptes before all the pages in the mm are freed.
                 */
                if (mn->ops->release)
                        mn->ops->release(mn, mm);

        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
                                 struct mmu_notifier,
                                 hlist);
                /*
                 * We arrived before mmu_notifier_unregister so
                 * mmu_notifier_unregister will do nothing other than to wait
                 * for ->release to finish and for mmu_notifier_unregister to
                 * return.
                 */
                hlist_del_init_rcu(&mn->hlist);
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
        srcu_read_unlock(&srcu, id);

  ## The lock is release only after the deletion of the node.

Both are followed by a synchronize_srcu(). Now, I am wondering if the unregister path
could potentially miss SRCU read lock held in _release() path and go onto finish the
synchronize_srcu before the item is deleted ? May be we should do the read_unlock
after the deletion of the node in _unregister (like we do in the _release()) ?

>
> Doesn't [2/2] solve the exact same issue (that the release method cannot
> be called twice in parallel)?

Not really. This could be a race between a release() and one of the other notifier
callbacks. e.g, In [0], we were hitting a use-after-free in kvm_unmap_hva() where,
the unregister could have succeeded and released the KVM.


[0] http://lkml.kernel.org/r/febea966-3767-21ff-3c40-1a76d1399138@suse.de

In effect this all could be due to the same reason, the synchronize in unregister
missing another reader.

Suzuki

>
> Thanks.
>
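Suzuki's suggestion in the paragraph above, written out: keep the unregister side's SRCU read-side critical section open across the list deletion, mirroring what mmu_notifier_release() already does. The fragment below only rearranges the lines quoted above; it is a sketch of the idea, not a tested kernel change.

    id = srcu_read_lock(&srcu);
    /*
     * exit_mmap will block in mmu_notifier_release to guarantee
     * that ->release is called before freeing the pages.
     */
    if (mn->ops->release)
            mn->ops->release(mn, mm);

    spin_lock(&mm->mmu_notifier_mm->lock);
    hlist_del_init_rcu(&mn->hlist);
    spin_unlock(&mm->mmu_notifier_mm->lock);

    /* Unlock only after the entry is off the list, as _release() does. */
    srcu_read_unlock(&srcu, id);

    synchronize_srcu(&srcu);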
On Wed, Apr 26, 2017 at 05:03:44PM +0100, Suzuki K Poulose wrote: > On 25/04/17 19:49, Radim Krčmář wrote: > >2017-04-24 11:10+0100, Suzuki K Poulose: > >>The KVM uses mmu_notifier (wherever available) to keep track > >>of the changes to the mm of the guest. The guest shadow page > >>tables are released when the VM exits via mmu_notifier->ops.release(). > >>There is a rare chance that the mmu_notifier->release could be > >>called more than once via two different paths, which could end > >>up in use-after-free of kvm instance (such as [0]). > >> > >>e.g: > >> > >>thread A thread B > >>------- -------------- > >> > >> get_signal-> kvm_destroy_vm()-> > >> do_exit-> mmu_notifier_unregister-> > >> exit_mm-> kvm_arch_flush_shadow_all()-> > >> exit_mmap-> spin_lock(&kvm->mmu_lock) > >> mmu_notifier_release-> .... > >> kvm_arch_flush_shadow_all()-> ..... > >> ... spin_lock(&kvm->mmu_lock) ..... > >> spin_unlock(&kvm->mmu_lock) > >> kvm_arch_free_kvm() > >> *** use after free of kvm *** > > > >I don't understand this race ... > >a piece of code in mmu_notifier_unregister() says: > > > > /* > > * Wait for any running method to finish, of course including > > * ->release if it was run by mmu_notifier_release instead of us. > > */ > > synchronize_srcu(&srcu); > > > >and code before that removes the notifier from the list, so it cannot be > >called after we pass this point. mmu_notifier_release() does roughly > >the same and explains it as: > > > > /* > > * synchronize_srcu here prevents mmu_notifier_release from returning to > > * exit_mmap (which would proceed with freeing all pages in the mm) > > * until the ->release method returns, if it was invoked by > > * mmu_notifier_unregister. > > * > > * The mmu_notifier_mm can't go away from under us because one mm_count > > * is held by exit_mmap. > > */ > > synchronize_srcu(&srcu); > > > >The call of mmu_notifier->release is protected by srcu in both cases and > >while it seems possible that mmu_notifier->release would be called > >twice, I don't see a combination that could result in use-after-free > >from mmu_notifier_release after mmu_notifier_unregister() has returned. > > Thanks for bringing it up. Even I am wondering why this is triggered ! (But it > does get triggered for sure !!) > > The only difference I can spot with _unregister & _release paths are the way > we use src_read_lock across the deletion of the entry from the list. > > In mmu_notifier_unregister() we do : > > id = srcu_read_lock(&srcu); > /* > * exit_mmap will block in mmu_notifier_release to guarantee > * that ->release is called before freeing the pages. > */ > if (mn->ops->release) > mn->ops->release(mn, mm); > srcu_read_unlock(&srcu, id); > > ## Releases the srcu lock here and then goes on to grab the spin_lock. > > spin_lock(&mm->mmu_notifier_mm->lock); > /* > * Can not use list_del_rcu() since __mmu_notifier_release > * can delete it before we hold the lock. > */ > hlist_del_init_rcu(&mn->hlist); > spin_unlock(&mm->mmu_notifier_mm->lock); > > While in mmu_notifier_release() we hold it until the node(s) are deleted from the > list : > /* > * SRCU here will block mmu_notifier_unregister until > * ->release returns. > */ > id = srcu_read_lock(&srcu); > hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) > /* > * If ->release runs before mmu_notifier_unregister it must be > * handled, as it's the only way for the driver to flush all > * existing sptes and stop the driver from establishing any more > * sptes before all the pages in the mm are freed. 
> */ > if (mn->ops->release) > mn->ops->release(mn, mm); > > spin_lock(&mm->mmu_notifier_mm->lock); > while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { > mn = hlist_entry(mm->mmu_notifier_mm->list.first, > struct mmu_notifier, > hlist); > /* > * We arrived before mmu_notifier_unregister so > * mmu_notifier_unregister will do nothing other than to wait > * for ->release to finish and for mmu_notifier_unregister to > * return. > */ > hlist_del_init_rcu(&mn->hlist); > } > spin_unlock(&mm->mmu_notifier_mm->lock); > srcu_read_unlock(&srcu, id); > > ## The lock is release only after the deletion of the node. > > Both are followed by a synchronize_srcu(). Now, I am wondering if the unregister path > could potentially miss SRCU read lock held in _release() path and go onto finish the > synchronize_srcu before the item is deleted ? May be we should do the read_unlock > after the deletion of the node in _unregister (like we do in the _release()) ? > > > > >Doesn't [2/2] solve the exact same issue (that the release method cannot > >be called twice in parallel)? > > Not really. This could be a race between a release() and one of the other notifier > callbacks. e.g, In [0], we were hitting a use-after-free in kvm_unmap_hva() where, > the unregister could have succeeded and released the KVM. > > > [0] http://lkml.kernel.org/r/febea966-3767-21ff-3c40-1a76d1399138@suse.de > > In effect this all could be due to the same reason, the synchronize in unregister > missing another reader. If this is at all reproducible, I suggest use of ftrace or event tracing to work out exactly what is happening. Thanx, Paul
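A concrete way to follow Paul's suggestion, sketched below: drop trace_printk() calls into the suspect paths and read the ordering back from the ftrace buffer (/sys/kernel/debug/tracing/trace). The body mirrors the shape of kvm_mmu_notifier_release() only to show where the annotations would sit; it is illustrative, not a proposed patch.

    static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                         struct mm_struct *mm)
    {
            struct kvm *kvm = mmu_notifier_to_kvm(mn);
            int idx;

            /* Timestamped per-CPU entry, enough to order this call against
             * the mmu_notifier_unregister() path in the trace. */
            trace_printk("mmu release enter: kvm=%p pid=%d\n", kvm, current->pid);

            idx = srcu_read_lock(&kvm->srcu);
            kvm_arch_flush_shadow_all(kvm);
            srcu_read_unlock(&kvm->srcu, idx);

            trace_printk("mmu release exit: kvm=%p\n", kvm);
    }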
On 26/04/17 17:03, Suzuki K Poulose wrote: > On 25/04/17 19:49, Radim Krčmář wrote: >> 2017-04-24 11:10+0100, Suzuki K Poulose: >>> The KVM uses mmu_notifier (wherever available) to keep track >>> of the changes to the mm of the guest. The guest shadow page >>> tables are released when the VM exits via mmu_notifier->ops.release(). >>> There is a rare chance that the mmu_notifier->release could be >>> called more than once via two different paths, which could end >>> up in use-after-free of kvm instance (such as [0]). >>> >>> e.g: >>> >>> thread A thread B >>> ------- -------------- >>> >>> get_signal-> kvm_destroy_vm()-> >>> do_exit-> mmu_notifier_unregister-> >>> exit_mm-> kvm_arch_flush_shadow_all()-> >>> exit_mmap-> spin_lock(&kvm->mmu_lock) >>> mmu_notifier_release-> .... >>> kvm_arch_flush_shadow_all()-> ..... >>> ... spin_lock(&kvm->mmu_lock) ..... >>> spin_unlock(&kvm->mmu_lock) >>> kvm_arch_free_kvm() >>> *** use after free of kvm *** >> >> I don't understand this race ... >> a piece of code in mmu_notifier_unregister() says: >> >> /* >> * Wait for any running method to finish, of course including >> * ->release if it was run by mmu_notifier_release instead of us. >> */ >> synchronize_srcu(&srcu); >> >> and code before that removes the notifier from the list, so it cannot be >> called after we pass this point. mmu_notifier_release() does roughly >> the same and explains it as: >> >> /* >> * synchronize_srcu here prevents mmu_notifier_release from returning to >> * exit_mmap (which would proceed with freeing all pages in the mm) >> * until the ->release method returns, if it was invoked by >> * mmu_notifier_unregister. >> * >> * The mmu_notifier_mm can't go away from under us because one mm_count >> * is held by exit_mmap. >> */ >> synchronize_srcu(&srcu); >> >> The call of mmu_notifier->release is protected by srcu in both cases and >> while it seems possible that mmu_notifier->release would be called >> twice, I don't see a combination that could result in use-after-free >> from mmu_notifier_release after mmu_notifier_unregister() has returned. > > Thanks for bringing it up. Even I am wondering why this is triggered ! (But it > does get triggered for sure !!) > > The only difference I can spot with _unregister & _release paths are the way > we use src_read_lock across the deletion of the entry from the list. > > In mmu_notifier_unregister() we do : > > id = srcu_read_lock(&srcu); > /* > * exit_mmap will block in mmu_notifier_release to guarantee > * that ->release is called before freeing the pages. > */ > if (mn->ops->release) > mn->ops->release(mn, mm); > srcu_read_unlock(&srcu, id); > > ## Releases the srcu lock here and then goes on to grab the spin_lock. > > spin_lock(&mm->mmu_notifier_mm->lock); > /* > * Can not use list_del_rcu() since __mmu_notifier_release > * can delete it before we hold the lock. > */ > hlist_del_init_rcu(&mn->hlist); > spin_unlock(&mm->mmu_notifier_mm->lock); > > While in mmu_notifier_release() we hold it until the node(s) are deleted from the > list : > /* > * SRCU here will block mmu_notifier_unregister until > * ->release returns. > */ > id = srcu_read_lock(&srcu); > hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) > /* > * If ->release runs before mmu_notifier_unregister it must be > * handled, as it's the only way for the driver to flush all > * existing sptes and stop the driver from establishing any more > * sptes before all the pages in the mm are freed. 
> */ > if (mn->ops->release) > mn->ops->release(mn, mm); > > spin_lock(&mm->mmu_notifier_mm->lock); > while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { > mn = hlist_entry(mm->mmu_notifier_mm->list.first, > struct mmu_notifier, > hlist); > /* > * We arrived before mmu_notifier_unregister so > * mmu_notifier_unregister will do nothing other than to wait > * for ->release to finish and for mmu_notifier_unregister to > * return. > */ > hlist_del_init_rcu(&mn->hlist); > } > spin_unlock(&mm->mmu_notifier_mm->lock); > srcu_read_unlock(&srcu, id); > > ## The lock is release only after the deletion of the node. > > Both are followed by a synchronize_srcu(). Now, I am wondering if the unregister path > could potentially miss SRCU read lock held in _release() path and go onto finish the > synchronize_srcu before the item is deleted ? May be we should do the read_unlock > after the deletion of the node in _unregister (like we do in the _release()) ? I haven't been able to reproduce the mmu_notifier race condition, which leads to KVM free, reported at [1]. I will leave it running (with tracepoints/ftrace) over the weekend. > >> >> Doesn't [2/2] solve the exact same issue (that the release method cannot >> be called twice in parallel)? > > Not really. This could be a race between a release() and one of the other notifier > callbacks. e.g, In [0], we were hitting a use-after-free in kvm_unmap_hva() where, > the unregister could have succeeded and released the KVM. But I can reproduce this problem [0], and we need the [2/2] for arm/arm64. [0] http://lkml.kernel.org/r/febea966-3767-21ff-3c40-1a76d1399138@suse.de [1] http://lkml.kernel.org/r/CAAeHK+x8udHKq9xa1zkTO6ax5E8Dk32HYWfaT05FMchL2cr48g@mail.gmail.com Thanks Suzuki
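The "release() vs. another notifier callback" case referred to above has this shape. The sketch is schematic only: example_invalidate() is a made-up stand-in for a notifier method such as the one that ends up in kvm_unmap_hva(), not real code. Every method starts by turning the notifier back into its containing kvm, so if the teardown path has already freed the kvm, the callback itself is the use after free.

    static void example_invalidate(struct mmu_notifier *mn, struct mm_struct *mm,
                                   unsigned long start, unsigned long end)
    {
            struct kvm *kvm = mmu_notifier_to_kvm(mn);      /* container_of(mn, ...) */

            spin_lock(&kvm->mmu_lock);      /* freed memory in the racy case */
            /* ... drop the mappings covering [start, end) ... */
            spin_unlock(&kvm->mmu_lock);
    }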
On 28/04/17 18:20, Suzuki K Poulose wrote: > On 26/04/17 17:03, Suzuki K Poulose wrote: >> On 25/04/17 19:49, Radim Krčmář wrote: >>> 2017-04-24 11:10+0100, Suzuki K Poulose: >>>> The KVM uses mmu_notifier (wherever available) to keep track >>>> of the changes to the mm of the guest. The guest shadow page >>>> tables are released when the VM exits via mmu_notifier->ops.release(). >>>> There is a rare chance that the mmu_notifier->release could be >>>> called more than once via two different paths, which could end >>>> up in use-after-free of kvm instance (such as [0]). >>>> >>>> e.g: >>>> >>>> thread A thread B >>>> ------- -------------- >>>> >>>> get_signal-> kvm_destroy_vm()-> >>>> do_exit-> mmu_notifier_unregister-> >>>> exit_mm-> kvm_arch_flush_shadow_all()-> >>>> exit_mmap-> spin_lock(&kvm->mmu_lock) >>>> mmu_notifier_release-> .... >>>> kvm_arch_flush_shadow_all()-> ..... >>>> ... spin_lock(&kvm->mmu_lock) ..... >>>> spin_unlock(&kvm->mmu_lock) >>>> kvm_arch_free_kvm() >>>> *** use after free of kvm *** >>> >>> I don't understand this race ... >>> a piece of code in mmu_notifier_unregister() says: >>> >>> /* >>> * Wait for any running method to finish, of course including >>> * ->release if it was run by mmu_notifier_release instead of us. >>> */ >>> synchronize_srcu(&srcu); >>> >>> and code before that removes the notifier from the list, so it cannot be >>> called after we pass this point. mmu_notifier_release() does roughly >>> the same and explains it as: >>> >>> /* >>> * synchronize_srcu here prevents mmu_notifier_release from returning to >>> * exit_mmap (which would proceed with freeing all pages in the mm) >>> * until the ->release method returns, if it was invoked by >>> * mmu_notifier_unregister. >>> * >>> * The mmu_notifier_mm can't go away from under us because one mm_count >>> * is held by exit_mmap. >>> */ >>> synchronize_srcu(&srcu); >>> >>> The call of mmu_notifier->release is protected by srcu in both cases and >>> while it seems possible that mmu_notifier->release would be called >>> twice, I don't see a combination that could result in use-after-free >>> from mmu_notifier_release after mmu_notifier_unregister() has returned. >> >> Thanks for bringing it up. Even I am wondering why this is triggered ! (But it >> does get triggered for sure !!) >> >> The only difference I can spot with _unregister & _release paths are the way >> we use src_read_lock across the deletion of the entry from the list. >> >> In mmu_notifier_unregister() we do : >> >> id = srcu_read_lock(&srcu); >> /* >> * exit_mmap will block in mmu_notifier_release to guarantee >> * that ->release is called before freeing the pages. >> */ >> if (mn->ops->release) >> mn->ops->release(mn, mm); >> srcu_read_unlock(&srcu, id); >> >> ## Releases the srcu lock here and then goes on to grab the spin_lock. >> >> spin_lock(&mm->mmu_notifier_mm->lock); >> /* >> * Can not use list_del_rcu() since __mmu_notifier_release >> * can delete it before we hold the lock. >> */ >> hlist_del_init_rcu(&mn->hlist); >> spin_unlock(&mm->mmu_notifier_mm->lock); >> >> While in mmu_notifier_release() we hold it until the node(s) are deleted from the >> list : >> /* >> * SRCU here will block mmu_notifier_unregister until >> * ->release returns. 
>> */ >> id = srcu_read_lock(&srcu); >> hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) >> /* >> * If ->release runs before mmu_notifier_unregister it must be >> * handled, as it's the only way for the driver to flush all >> * existing sptes and stop the driver from establishing any more >> * sptes before all the pages in the mm are freed. >> */ >> if (mn->ops->release) >> mn->ops->release(mn, mm); >> >> spin_lock(&mm->mmu_notifier_mm->lock); >> while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { >> mn = hlist_entry(mm->mmu_notifier_mm->list.first, >> struct mmu_notifier, >> hlist); >> /* >> * We arrived before mmu_notifier_unregister so >> * mmu_notifier_unregister will do nothing other than to wait >> * for ->release to finish and for mmu_notifier_unregister to >> * return. >> */ >> hlist_del_init_rcu(&mn->hlist); >> } >> spin_unlock(&mm->mmu_notifier_mm->lock); >> srcu_read_unlock(&srcu, id); >> >> ## The lock is release only after the deletion of the node. >> >> Both are followed by a synchronize_srcu(). Now, I am wondering if the unregister path >> could potentially miss SRCU read lock held in _release() path and go onto finish the >> synchronize_srcu before the item is deleted ? May be we should do the read_unlock >> after the deletion of the node in _unregister (like we do in the _release()) ? > > I haven't been able to reproduce the mmu_notifier race condition, which leads to KVM > free, reported at [1]. I will leave it running (with tracepoints/ftrace) over the > weekend. > I couldn't reproduce the proposed "mmu_notifier race" reported in [0]. However I found some other use-after-free cases in the unmap_stage2_range() code due to the introduction of cond_resched_lock(). It may be just that the IP reported in [0] was for wrong line of code ? i.e, arch_spin_is_locked instead of unmap_stage2_range ? Anyways, I will send a new version of the patches in a separate series. [0] https://marc.info/?l=linux-kernel&m=149201399018791&w=2 Suzuki
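The unmap_stage2_range() problem mentioned above has a simple shape, shown schematically below; this is not the actual arm/arm64 code, and walk_stage2_example() is a made-up name. cond_resched_lock() may briefly drop and re-take kvm->mmu_lock, and anything freed by a concurrent teardown in that window is used after free when the walk resumes.

    static void walk_stage2_example(struct kvm *kvm, phys_addr_t addr,
                                    phys_addr_t end)
    {
            spin_lock(&kvm->mmu_lock);
            while (addr < end) {
                    /* ... unmap one chunk of the stage-2 page table ... */
                    addr += PAGE_SIZE;

                    /*
                     * May drop kvm->mmu_lock if a reschedule is due (or the
                     * lock is contended) and then re-take it; the page tables
                     * can be freed underneath us in that window.
                     */
                    cond_resched_lock(&kvm->mmu_lock);
            }
            spin_unlock(&kvm->mmu_lock);
    }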
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d025074..561e968 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -424,6 +424,7 @@ struct kvm {
 	struct mmu_notifier mmu_notifier;
 	unsigned long mmu_notifier_seq;
 	long mmu_notifier_count;
+	struct rcu_head mmu_notifier_rcu;
 #endif
 	long tlbs_dirty;
 	struct list_head devices;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 88257b3..2c3fdd4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -471,6 +471,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	idx = srcu_read_lock(&kvm->srcu);
 	kvm_arch_flush_shadow_all(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
+	kvm_put_kvm(kvm);
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
@@ -486,8 +487,46 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
+	int rc;
 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
-	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+	rc = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+	/*
+	 * We hold a reference to KVM here to make sure that the KVM
+	 * doesn't get free'd before ops->release() completes.
+	 */
+	if (!rc)
+		kvm_get_kvm(kvm);
+	return rc;
+}
+
+static void kvm_free_vm_rcu(struct rcu_head *rcu)
+{
+	struct kvm *kvm = container_of(rcu, struct kvm, mmu_notifier_rcu);
+	kvm_arch_free_vm(kvm);
+}
+
+static void kvm_flush_shadow_mmu(struct kvm *kvm)
+{
+	/*
+	 * We hold a reference to kvm instance for mmu_notifier and is
+	 * only released when ops->release() is called via exit_mmap path.
+	 * So, when we reach here ops->release() has been called already, which
+	 * flushes the shadow page tables. Hence there is no need to call the
+	 * release() again when we unregister the notifier. However, we need
+	 * to delay freeing up the kvm until the release() completes, since
+	 * we could reach here via :
+	 * kvm_mmu_notifier_release() -> kvm_put_kvm() -> kvm_destroy_vm()
+	 */
+	mmu_notifier_unregister_no_release(&kvm->mmu_notifier, kvm->mm);
+}
+
+static void kvm_free_vm(struct kvm *kvm)
+{
+	/*
+	 * Wait until the mmu_notifier has finished the release().
+	 * See comments above in kvm_flush_shadow_mmu.
+	 */
+	mmu_notifier_call_srcu(&kvm->mmu_notifier_rcu, kvm_free_vm_rcu);
 }
 
 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
@@ -497,6 +536,16 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 	return 0;
 }
 
+static void kvm_flush_shadow_mmu(struct kvm *kvm)
+{
+	kvm_arch_flush_shadow_all(kvm);
+}
+
+static void kvm_free_vm(struct kvm *kvm)
+{
+	kvm_arch_free_vm(kvm);
+}
+
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
 static struct kvm_memslots *kvm_alloc_memslots(void)
@@ -733,18 +782,14 @@ static void kvm_destroy_vm(struct kvm *kvm)
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
-	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
-#else
-	kvm_arch_flush_shadow_all(kvm);
-#endif
+	kvm_flush_shadow_mmu(kvm);
 	kvm_arch_destroy_vm(kvm);
 	kvm_destroy_devices(kvm);
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 		kvm_free_memslots(kvm, kvm->memslots[i]);
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
-	kvm_arch_free_vm(kvm);
+	kvm_free_vm(kvm);
 	preempt_notifier_dec();
 	hardware_disable_all();
 	mmdrop(mm);