diff mbox series

[4/4] kvm/x86: Allow to respond to generic signals during slow page faults

Message ID 20220622213656.81546-5-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series kvm/mm: Allow GUP to respond to non fatal signals | expand

Commit Message

Peter Xu June 22, 2022, 9:36 p.m. UTC
All the facilities should be ready for this, what we need to do is to add a
new KVM_GTP_INTERRUPTIBLE flag showing that we're willing to be interrupted
by common signals during the __gfn_to_pfn_memslot() request, and wire it up
with a FOLL_INTERRUPTIBLE flag that we've just introduced.

Note that only x86 slow page fault routine will set this new bit.  The new
bit is not used in non-x86 arch or on other gup paths even for x86.
However it can actually be used elsewhere too but not yet covered.

When we see the PFN fetching was interrupted, do early exit to userspace
with an KVM_EXIT_INTR exit reason.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c   | 9 +++++++++
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 4 ++++
 3 files changed, 14 insertions(+)

Comments

Sean Christopherson June 23, 2022, 2:46 p.m. UTC | #1
On Wed, Jun 22, 2022, Peter Xu wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e92f1ab63d6a..b39acb7cb16d 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3012,6 +3012,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
>  static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  			       unsigned int access)
>  {
> +	/* NOTE: not all error pfn is fatal; handle intr before the other ones */
> +	if (unlikely(is_intr_pfn(fault->pfn))) {
> +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> +		++vcpu->stat.signal_exits;
> +		return -EINTR;
> +	}
> +
>  	/* The pfn is invalid, report the error! */
>  	if (unlikely(is_error_pfn(fault->pfn)))
>  		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
> @@ -4017,6 +4024,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>  		}
>  	}
>  
> +	/* Allow to respond to generic signals in slow page faults */

"slow" is being overloaded here.  The previous call __gfn_to_pfn_memslot() will
end up in hva_to_pfn_slow(), but because of passing a non-null async it won't wait.
This code really should have a more extensive comment irrespective of the interruptible
stuff, now would be a good time to add that.

Comments aside, isn't this series incomplete from the perspective that there are
still many flows where KVM will hang if gfn_to_pfn() gets stuck in gup?  E.g. if
KVM is retrieving a page pointed at by vmcs12.

> +	flags |= KVM_GTP_INTERRUPTIBLE;
>  	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, flags, NULL,
>  					  &fault->map_writable, &fault->hva);
>  	return RET_PF_CONTINUE;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4f84a442f67f..c8d98e435537 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1163,6 +1163,7 @@ typedef unsigned int __bitwise kvm_gtp_flag_t;
>  
>  #define  KVM_GTP_WRITE          ((__force kvm_gtp_flag_t) BIT(0))
>  #define  KVM_GTP_ATOMIC         ((__force kvm_gtp_flag_t) BIT(1))
> +#define  KVM_GTP_INTERRUPTIBLE  ((__force kvm_gtp_flag_t) BIT(2))
>  
>  kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
>  			       kvm_gtp_flag_t gtp_flags, bool *async,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 952400b42ee9..b3873cac5672 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2462,6 +2462,8 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async,
>  		flags |= FOLL_WRITE;
>  	if (async)
>  		flags |= FOLL_NOWAIT;
> +	if (gtp_flags & KVM_GTP_INTERRUPTIBLE)
> +		flags |= FOLL_INTERRUPTIBLE;
>  
>  	npages = get_user_pages_unlocked(addr, 1, &page, flags);
>  	if (npages != 1)
> @@ -2599,6 +2601,8 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, kvm_gtp_flag_t gtp_flags, bool *async,
>  	npages = hva_to_pfn_slow(addr, async, gtp_flags, writable, &pfn);
>  	if (npages == 1)
>  		return pfn;
> +	if (npages == -EINTR)
> +		return KVM_PFN_ERR_INTR;
>  
>  	mmap_read_lock(current->mm);
>  	if (npages == -EHWPOISON ||
> -- 
> 2.32.0
>
Peter Xu June 23, 2022, 7:31 p.m. UTC | #2
Hi, Sean,

On Thu, Jun 23, 2022 at 02:46:08PM +0000, Sean Christopherson wrote:
> On Wed, Jun 22, 2022, Peter Xu wrote:
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index e92f1ab63d6a..b39acb7cb16d 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -3012,6 +3012,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> >  static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> >  			       unsigned int access)
> >  {
> > +	/* NOTE: not all error pfn is fatal; handle intr before the other ones */
> > +	if (unlikely(is_intr_pfn(fault->pfn))) {
> > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > +		++vcpu->stat.signal_exits;
> > +		return -EINTR;
> > +	}
> > +
> >  	/* The pfn is invalid, report the error! */
> >  	if (unlikely(is_error_pfn(fault->pfn)))
> >  		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
> > @@ -4017,6 +4024,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >  		}
> >  	}
> >  
> > +	/* Allow to respond to generic signals in slow page faults */
> 
> "slow" is being overloaded here.  The previous call __gfn_to_pfn_memslot() will
> end up in hva_to_pfn_slow(), but because of passing a non-null async it won't wait.
> This code really should have a more extensive comment irrespective of the interruptible
> stuff, now would be a good time to add that.

Yes I agree, especially the "async" parameter along with "atomic" makes it
even more confusing as you said.  But isn't that also means the "slow" here
is spot-on?  I mean imho it's the "elsewhere" needs cleanup not this
comment itself since it's really stating the fact that this is the real
slow path?

Or any other suggestions greatly welcomed on how I should improve this
comment.

> 
> Comments aside, isn't this series incomplete from the perspective that there are
> still many flows where KVM will hang if gfn_to_pfn() gets stuck in gup?  E.g. if
> KVM is retrieving a page pointed at by vmcs12.

Right.  The thing is I'm not confident I can make it complete easily in one
shot..

I mentioned some of that in cover letter or commit message of patch 1, in
that I don't think all the gup call sites are okay with being interrupted
by a non-fatal signal.

So what I want to do is doing it step by step, at least by introducing
FOLL_INTERRUPTIBLE and having one valid user of it that covers a very valid
use case.  I'm also pretty sure the page fault path is really the most
cases that will happen with GUP, so it already helps in many ways for me
when running with a patched kernel.

So when the complete picture is non-trivial to achieve in one shot, I think
this could be one option we go for.  With the facility (and example code on
x86 slow page fault) ready, hopefully we could start to convert many other
call sites to be signal-aware, outside page faults, or even outside x86,
because it's really a more generic problem, which I fully agree.

Does it sound reasonable to you?

Thanks,
Sean Christopherson June 23, 2022, 8:07 p.m. UTC | #3
On Thu, Jun 23, 2022, Peter Xu wrote:
> Hi, Sean,
> 
> On Thu, Jun 23, 2022 at 02:46:08PM +0000, Sean Christopherson wrote:
> > On Wed, Jun 22, 2022, Peter Xu wrote:
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index e92f1ab63d6a..b39acb7cb16d 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -3012,6 +3012,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > >  static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> > >  			       unsigned int access)
> > >  {
> > > +	/* NOTE: not all error pfn is fatal; handle intr before the other ones */
> > > +	if (unlikely(is_intr_pfn(fault->pfn))) {
> > > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > > +		++vcpu->stat.signal_exits;
> > > +		return -EINTR;
> > > +	}
> > > +
> > >  	/* The pfn is invalid, report the error! */
> > >  	if (unlikely(is_error_pfn(fault->pfn)))
> > >  		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
> > > @@ -4017,6 +4024,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > >  		}
> > >  	}
> > >  
> > > +	/* Allow to respond to generic signals in slow page faults */
> > 
> > "slow" is being overloaded here.  The previous call __gfn_to_pfn_memslot() will
> > end up in hva_to_pfn_slow(), but because of passing a non-null async it won't wait.
> > This code really should have a more extensive comment irrespective of the interruptible
> > stuff, now would be a good time to add that.
> 
> Yes I agree, especially the "async" parameter along with "atomic" makes it
> even more confusing as you said.  But isn't that also means the "slow" here
> is spot-on?  I mean imho it's the "elsewhere" needs cleanup not this
> comment itself since it's really stating the fact that this is the real
> slow path?

No, because atomic=false here, i.e. KVM will try hva_to_pfn_slow() if hva_to_pfn_fast()
fails.  So even if we agree that the "wait for IO" path is the true slow path,
when reading KVM code the vast, vast majority of developers will associate "slow"
with hva_to_pfn_slow().

> Or any other suggestions greatly welcomed on how I should improve this
> comment.

Something along these lines?

	/*
	 * Allow gup to bail on pending non-fatal signals when it's also allowed
	 * to wait for IO.  Note, gup always bails if it is unable to quickly
	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
	 */
> 
> > 
> > Comments aside, isn't this series incomplete from the perspective that there are
> > still many flows where KVM will hang if gfn_to_pfn() gets stuck in gup?  E.g. if
> > KVM is retrieving a page pointed at by vmcs12.
> 
> Right.  The thing is I'm not confident I can make it complete easily in one
> shot..
> 
> I mentioned some of that in cover letter or commit message of patch 1, in
> that I don't think all the gup call sites are okay with being interrupted
> by a non-fatal signal.
> 
> So what I want to do is doing it step by step, at least by introducing
> FOLL_INTERRUPTIBLE and having one valid user of it that covers a very valid
> use case.  I'm also pretty sure the page fault path is really the most
> cases that will happen with GUP, so it already helps in many ways for me
> when running with a patched kernel.
> 
> So when the complete picture is non-trivial to achieve in one shot, I think
> this could be one option we go for.  With the facility (and example code on
> x86 slow page fault) ready, hopefully we could start to convert many other
> call sites to be signal-aware, outside page faults, or even outside x86,
> because it's really a more generic problem, which I fully agree.
> 
> Does it sound reasonable to you?

Yep.  In fact, I'd be totally ok keeping this to just the page fault path.  I
missed one cruicial detail on my first read through: gup already bails on SIGKILL,
it's only these technically-not-fatal signals that gup ignores by default.  In
other words, using FOLL_INTERRUPTIBLE is purely opportunsitically as userspace
can always resort to SIGKILL if the VM really needs to die.

It would be very helpful to explicit call that out in the changelog, that way
it's (hopefully) clear that KVM uses FOLL_INTERRUPTIBLE to be user friendly when
it's easy to do so, and that it's not required for correctness/robustness.
Peter Xu June 23, 2022, 8:18 p.m. UTC | #4
On Thu, Jun 23, 2022 at 08:07:00PM +0000, Sean Christopherson wrote:
> On Thu, Jun 23, 2022, Peter Xu wrote:
> > Hi, Sean,
> > 
> > On Thu, Jun 23, 2022 at 02:46:08PM +0000, Sean Christopherson wrote:
> > > On Wed, Jun 22, 2022, Peter Xu wrote:
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index e92f1ab63d6a..b39acb7cb16d 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -3012,6 +3012,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > > >  static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> > > >  			       unsigned int access)
> > > >  {
> > > > +	/* NOTE: not all error pfn is fatal; handle intr before the other ones */
> > > > +	if (unlikely(is_intr_pfn(fault->pfn))) {
> > > > +		vcpu->run->exit_reason = KVM_EXIT_INTR;
> > > > +		++vcpu->stat.signal_exits;
> > > > +		return -EINTR;
> > > > +	}
> > > > +
> > > >  	/* The pfn is invalid, report the error! */
> > > >  	if (unlikely(is_error_pfn(fault->pfn)))
> > > >  		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
> > > > @@ -4017,6 +4024,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > > >  		}
> > > >  	}
> > > >  
> > > > +	/* Allow to respond to generic signals in slow page faults */
> > > 
> > > "slow" is being overloaded here.  The previous call __gfn_to_pfn_memslot() will
> > > end up in hva_to_pfn_slow(), but because of passing a non-null async it won't wait.
> > > This code really should have a more extensive comment irrespective of the interruptible
> > > stuff, now would be a good time to add that.
> > 
> > Yes I agree, especially the "async" parameter along with "atomic" makes it
> > even more confusing as you said.  But isn't that also means the "slow" here
> > is spot-on?  I mean imho it's the "elsewhere" needs cleanup not this
> > comment itself since it's really stating the fact that this is the real
> > slow path?
> 
> No, because atomic=false here, i.e. KVM will try hva_to_pfn_slow() if hva_to_pfn_fast()
> fails.  So even if we agree that the "wait for IO" path is the true slow path,
> when reading KVM code the vast, vast majority of developers will associate "slow"
> with hva_to_pfn_slow().

Okay.  I think how we define slow matters, here my take is "when a major
fault happens" (as defined in the mm term), but probably that definition is
a bit far away from kvm as the hypervisor level indeed.

> 
> > Or any other suggestions greatly welcomed on how I should improve this
> > comment.
> 
> Something along these lines?
> 
> 	/*
> 	 * Allow gup to bail on pending non-fatal signals when it's also allowed
> 	 * to wait for IO.  Note, gup always bails if it is unable to quickly
> 	 * get a page and a fatal signal, i.e. SIGKILL, is pending.
> 	 */

Taken.

> > 
> > > 
> > > Comments aside, isn't this series incomplete from the perspective that there are
> > > still many flows where KVM will hang if gfn_to_pfn() gets stuck in gup?  E.g. if
> > > KVM is retrieving a page pointed at by vmcs12.
> > 
> > Right.  The thing is I'm not confident I can make it complete easily in one
> > shot..
> > 
> > I mentioned some of that in cover letter or commit message of patch 1, in
> > that I don't think all the gup call sites are okay with being interrupted
> > by a non-fatal signal.
> > 
> > So what I want to do is doing it step by step, at least by introducing
> > FOLL_INTERRUPTIBLE and having one valid user of it that covers a very valid
> > use case.  I'm also pretty sure the page fault path is really the most
> > cases that will happen with GUP, so it already helps in many ways for me
> > when running with a patched kernel.
> > 
> > So when the complete picture is non-trivial to achieve in one shot, I think
> > this could be one option we go for.  With the facility (and example code on
> > x86 slow page fault) ready, hopefully we could start to convert many other
> > call sites to be signal-aware, outside page faults, or even outside x86,
> > because it's really a more generic problem, which I fully agree.
> > 
> > Does it sound reasonable to you?
> 
> Yep.  In fact, I'd be totally ok keeping this to just the page fault path.  I
> missed one cruicial detail on my first read through: gup already bails on SIGKILL,
> it's only these technically-not-fatal signals that gup ignores by default.  In
> other words, using FOLL_INTERRUPTIBLE is purely opportunsitically as userspace
> can always resort to SIGKILL if the VM really needs to die.
> 
> It would be very helpful to explicit call that out in the changelog, that way
> it's (hopefully) clear that KVM uses FOLL_INTERRUPTIBLE to be user friendly when
> it's easy to do so, and that it's not required for correctness/robustness.

Yes that's the case, sigkill is special. I can mention that somewhere in
the cover letter too besides the comment you suggested above.  Thanks,
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e92f1ab63d6a..b39acb7cb16d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3012,6 +3012,13 @@  static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 			       unsigned int access)
 {
+	/* NOTE: not all error pfn is fatal; handle intr before the other ones */
+	if (unlikely(is_intr_pfn(fault->pfn))) {
+		vcpu->run->exit_reason = KVM_EXIT_INTR;
+		++vcpu->stat.signal_exits;
+		return -EINTR;
+	}
+
 	/* The pfn is invalid, report the error! */
 	if (unlikely(is_error_pfn(fault->pfn)))
 		return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
@@ -4017,6 +4024,8 @@  static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		}
 	}
 
+	/* Allow to respond to generic signals in slow page faults */
+	flags |= KVM_GTP_INTERRUPTIBLE;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, flags, NULL,
 					  &fault->map_writable, &fault->hva);
 	return RET_PF_CONTINUE;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4f84a442f67f..c8d98e435537 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1163,6 +1163,7 @@  typedef unsigned int __bitwise kvm_gtp_flag_t;
 
 #define  KVM_GTP_WRITE          ((__force kvm_gtp_flag_t) BIT(0))
 #define  KVM_GTP_ATOMIC         ((__force kvm_gtp_flag_t) BIT(1))
+#define  KVM_GTP_INTERRUPTIBLE  ((__force kvm_gtp_flag_t) BIT(2))
 
 kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 			       kvm_gtp_flag_t gtp_flags, bool *async,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 952400b42ee9..b3873cac5672 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2462,6 +2462,8 @@  static int hva_to_pfn_slow(unsigned long addr, bool *async,
 		flags |= FOLL_WRITE;
 	if (async)
 		flags |= FOLL_NOWAIT;
+	if (gtp_flags & KVM_GTP_INTERRUPTIBLE)
+		flags |= FOLL_INTERRUPTIBLE;
 
 	npages = get_user_pages_unlocked(addr, 1, &page, flags);
 	if (npages != 1)
@@ -2599,6 +2601,8 @@  kvm_pfn_t hva_to_pfn(unsigned long addr, kvm_gtp_flag_t gtp_flags, bool *async,
 	npages = hva_to_pfn_slow(addr, async, gtp_flags, writable, &pfn);
 	if (npages == 1)
 		return pfn;
+	if (npages == -EINTR)
+		return KVM_PFN_ERR_INTR;
 
 	mmap_read_lock(current->mm);
 	if (npages == -EHWPOISON ||