diff mbox series

[v3,17/22] KVM: Introduce KVM_CAP_ABSENT_MAPPING_FAULT without implementation

Message ID 20230412213510.1220557-18-amoorthy@google.com (mailing list archive)
State New, archived
Headers show
Series Improve scalability of KVM + userfaultfd live migration via annotated memory faults. | expand

Commit Message

Anish Moorthy April 12, 2023, 9:35 p.m. UTC
Add documentation, memslot flags, useful helper functions, and the
actual new capability itself.

Memory fault exits on absent mappings are particularly useful for
userfaultfd-based postcopy live migration. When many vCPUs fault on a
single userfaultfd the faults can take a while to surface to userspace
due to having to contend for uffd wait queue locks. Bypassing the uffd
entirely by returning information directly to the vCPU exit avoids this
contention and improves the fault rate.

Suggested-by: James Houghton <jthoughton@google.com>
Signed-off-by: Anish Moorthy <amoorthy@google.com>
---
 Documentation/virt/kvm/api.rst | 31 ++++++++++++++++++++++++++++---
 include/linux/kvm_host.h       |  7 +++++++
 include/uapi/linux/kvm.h       |  2 ++
 tools/include/uapi/linux/kvm.h |  1 +
 virt/kvm/kvm_main.c            |  3 +++
 5 files changed, 41 insertions(+), 3 deletions(-)

Comments

Robert Hoo April 19, 2023, 2 p.m. UTC | #1
On 4/13/2023 5:35 AM, Anish Moorthy wrote:
> Add documentation, memslot flags, useful helper functions, and the
> actual new capability itself.
> 
> Memory fault exits on absent mappings are particularly useful for
> userfaultfd-based postcopy live migration. When many vCPUs fault on a
> single userfaultfd the faults can take a while to surface to userspace
> due to having to contend for uffd wait queue locks. Bypassing the uffd
> entirely by returning information directly to the vCPU exit avoids this
> contention and improves the fault rate.
> 
> Suggested-by: James Houghton <jthoughton@google.com>
> Signed-off-by: Anish Moorthy <amoorthy@google.com>
> ---
>   Documentation/virt/kvm/api.rst | 31 ++++++++++++++++++++++++++++---
>   include/linux/kvm_host.h       |  7 +++++++
>   include/uapi/linux/kvm.h       |  2 ++
>   tools/include/uapi/linux/kvm.h |  1 +
>   virt/kvm/kvm_main.c            |  3 +++
>   5 files changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index f174f43c38d45..7967b9909e28b 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -1312,6 +1312,7 @@ yet and must be cleared on entry.
>     /* for kvm_userspace_memory_region::flags */
>     #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>     #define KVM_MEM_READONLY	(1UL << 1)
> +  #define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)
>   
>   This ioctl allows the user to create, modify or delete a guest physical
>   memory slot.  Bits 0-15 of "slot" specify the slot id and this value
> @@ -1342,12 +1343,15 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
>   be identical.  This allows large pages in the guest to be backed by large
>   pages in the host.
>   
> -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
> -KVM_MEM_READONLY.  The former can be set to instruct KVM to keep track of
> +The flags field supports three flags
> +
> +1.  KVM_MEM_LOG_DIRTY_PAGES: can be set to instruct KVM to keep track of
>   writes to memory within the slot.  See KVM_GET_DIRTY_LOG ioctl to know how to
> -use it.  The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
> +use it.
> +2.  KVM_MEM_READONLY: can be set, if KVM_CAP_READONLY_MEM capability allows it,
>   to make a new slot read-only.  In this case, writes to this memory will be
>   posted to userspace as KVM_EXIT_MMIO exits.
> +3.  KVM_MEM_ABSENT_MAPPING_FAULT: see KVM_CAP_ABSENT_MAPPING_FAULT for details.
>   
>   When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
>   the memory region are automatically reflected into the guest.  For example, an
> @@ -7705,6 +7709,27 @@ userspace may receive "bare" EFAULTs (i.e. exit reason !=
>   KVM_EXIT_MEMORY_FAULT) from KVM_RUN. These should be considered bugs and
>   reported to the maintainers.
>   
> +7.35 KVM_CAP_ABSENT_MAPPING_FAULT
> +---------------------------------
> +
> +:Architectures: None
> +:Returns: -EINVAL.
> +
> +The presence of this capability indicates that userspace may pass the
> +KVM_MEM_ABSENT_MAPPING_FAULT flag to KVM_SET_USER_MEMORY_REGION to cause KVM_RUN
> +to fail (-EFAULT) in response to page faults for which the userspace page tables
> +do not contain present mappings. Attempting to enable the capability directly
> +will fail.
> +
> +The range of guest physical memory causing the fault is advertised to userspace
> +through KVM_CAP_MEMORY_FAULT_INFO (if it is enabled).
> +
> +Userspace should determine how best to make the mapping present, then take
> +appropriate action. For instance, in the case of absent mappings this might
> +involve establishing the mapping for the first time via UFFDIO_COPY/CONTINUE or
> +faulting the mapping in using MADV_POPULATE_READ/WRITE. After establishing the
> +mapping, userspace can return to KVM to retry the previous memory access.
> +
>   8. Other capabilities.
>   ======================
>   
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 776f9713f3921..2407fc1e52ab8 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2289,4 +2289,11 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
>    */
>   inline void kvm_populate_efault_info(struct kvm_vcpu *vcpu,
>   					uint64_t gpa, uint64_t len);
> +
> +static inline bool kvm_slot_fault_on_absent_mapping(
> +							const struct kvm_memory_slot *slot)

Strange line break.

> +{
> +	return slot->flags & KVM_MEM_ABSENT_MAPPING_FAULT;
> +}
> +
>   #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index bc73e8381a2bb..21df449e74648 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -102,6 +102,7 @@ struct kvm_userspace_memory_region {
>    */
>   #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>   #define KVM_MEM_READONLY	(1UL << 1)
> +#define KVM_MEM_ABSENT_MAPPING_FAULT	(1UL << 2)
>   
>   /* for KVM_IRQ_LINE */
>   struct kvm_irq_level {
> @@ -1196,6 +1197,7 @@ struct kvm_ppc_resize_hpt {
>   #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
>   #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
>   #define KVM_CAP_MEMORY_FAULT_INFO 227
> +#define KVM_CAP_ABSENT_MAPPING_FAULT 228
>   
>   #ifdef KVM_CAP_IRQ_ROUTING
>   
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index 5c57796364d65..59219da95634c 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
> @@ -102,6 +102,7 @@ struct kvm_userspace_memory_region {
>    */
>   #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>   #define KVM_MEM_READONLY	(1UL << 1)
> +#define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)
>   
>   /* for KVM_IRQ_LINE */
>   struct kvm_irq_level {
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index f3be5aa49829a..7cd0ad94726df 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1525,6 +1525,9 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
>   	valid_flags |= KVM_MEM_READONLY;

Would it be better to also gate this via kvm_vm_ioctl_check_extension() rather than
#ifdef __KVM_HAVE_READONLY_MEM?

>   #endif
>   
> +	if (kvm_vm_ioctl_check_extension(NULL, KVM_CAP_ABSENT_MAPPING_FAULT))
> +		valid_flags |= KVM_MEM_ABSENT_MAPPING_FAULT;
> +
>   	if (mem->flags & ~valid_flags)
>   		return -EINVAL;
>
Anish Moorthy April 20, 2023, 6:23 p.m. UTC | #2
On Wed, Apr 19, 2023 at 7:00 AM Hoo Robert <robert.hoo.linux@gmail.com> wrote:
> > +static inline bool kvm_slot_fault_on_absent_mapping(
> > +                                                     const struct kvm_memory_slot *slot)
>
> Strange line break.

Fixed: there's now a single indent on the second line.

> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index f3be5aa49829a..7cd0ad94726df 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -1525,6 +1525,9 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
> >       valid_flags |= KVM_MEM_READONLY;
>
> Is it better to also via kvm_vm_ioctl_check_extension() rather than
> #ifdef __KVM_HAVE_READONLY_MEM?

Probably, that's unrelated though so I won't change it here
Sean Christopherson April 24, 2023, 9:02 p.m. UTC | #3
On Wed, Apr 12, 2023, Anish Moorthy wrote:
> Add documentation, memslot flags, useful helper functions, and the
> actual new capability itself.
> 
> Memory fault exits on absent mappings are particularly useful for
> userfaultfd-based postcopy live migration. When many vCPUs fault on a
> single userfaultfd the faults can take a while to surface to userspace
> due to having to contend for uffd wait queue locks. Bypassing the uffd
> entirely by returning information directly to the vCPU exit avoids this
> contention and improves the fault rate.
> 
> Suggested-by: James Houghton <jthoughton@google.com>
> Signed-off-by: Anish Moorthy <amoorthy@google.com>
> ---
>  Documentation/virt/kvm/api.rst | 31 ++++++++++++++++++++++++++++---
>  include/linux/kvm_host.h       |  7 +++++++
>  include/uapi/linux/kvm.h       |  2 ++
>  tools/include/uapi/linux/kvm.h |  1 +
>  virt/kvm/kvm_main.c            |  3 +++
>  5 files changed, 41 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index f174f43c38d45..7967b9909e28b 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -1312,6 +1312,7 @@ yet and must be cleared on entry.
>    /* for kvm_userspace_memory_region::flags */
>    #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>    #define KVM_MEM_READONLY	(1UL << 1)
> +  #define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)

This name is both too specific and too vague.  It's too specific because it affects
more than just "absent" mappings, it will affect any page fault that can't be
resolved by fast GUP, i.e. I'm objecting for all the same reasons I objected to
the exit reason being name KVM_MEMFAULT_REASON_ABSENT_MAPPING.  It's too vague
because it doesn't describe what behavior the flag actually enables in any way.

I liked the "nowait" verbiage from the RFC.  "fast_only" is an ok alternative,
but that's much more of a kernel-internal name.

Oliver, you had concerns with using "fault" in the name, is something like
KVM_MEM_NOWAIT_ON_PAGE_FAULT or KVM_MEM_NOWAIT_ON_FAULT palatable?  IMO, "fault"
is perfectly ok, we just need to ensure it's unlikely to be ambiguous for userspace.
Oliver Upton June 1, 2023, 4:04 p.m. UTC | #4
Better late than never, right? :)

On Mon, Apr 24, 2023 at 02:02:49PM -0700, Sean Christopherson wrote:
> On Wed, Apr 12, 2023, Anish Moorthy wrote:
> > Add documentation, memslot flags, useful helper functions, and the
> > actual new capability itself.
> > 
> > Memory fault exits on absent mappings are particularly useful for
> > userfaultfd-based postcopy live migration. When many vCPUs fault on a
> > single userfaultfd the faults can take a while to surface to userspace
> > due to having to contend for uffd wait queue locks. Bypassing the uffd
> > entirely by returning information directly to the vCPU exit avoids this
> > contention and improves the fault rate.
> > 
> > Suggested-by: James Houghton <jthoughton@google.com>
> > Signed-off-by: Anish Moorthy <amoorthy@google.com>
> > ---
> >  Documentation/virt/kvm/api.rst | 31 ++++++++++++++++++++++++++++---
> >  include/linux/kvm_host.h       |  7 +++++++
> >  include/uapi/linux/kvm.h       |  2 ++
> >  tools/include/uapi/linux/kvm.h |  1 +
> >  virt/kvm/kvm_main.c            |  3 +++
> >  5 files changed, 41 insertions(+), 3 deletions(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index f174f43c38d45..7967b9909e28b 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -1312,6 +1312,7 @@ yet and must be cleared on entry.
> >    /* for kvm_userspace_memory_region::flags */
> >    #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
> >    #define KVM_MEM_READONLY	(1UL << 1)
> > +  #define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)
> 
> This name is both too specific and too vague.  It's too specific because it affects
> more than just "absent" mappings, it will affect any page fault that can't be
> resolved by fast GUP, i.e. I'm objecting for all the same reasons I objected to
> the exit reason being name KVM_MEMFAULT_REASON_ABSENT_MAPPING.  It's too vague
> because it doesn't describe what behavior the flag actually enables in any way.
> 
> I liked the "nowait" verbiage from the RFC.  "fast_only" is an ok alternative,
> but that's much more of a kernel-internal name.
> 
> Oliver, you had concerns with using "fault" in the name, is something like
> KVM_MEM_NOWAIT_ON_PAGE_FAULT or KVM_MEM_NOWAIT_ON_FAULT palatable?  IMO, "fault"
> is perfectly ok, we just need to ensure it's unlikely to be ambiguous for userspace.

Yeah, I can get over it. Slight preference towards KVM_MEM_NOWAIT_ON_FAULT,
fewer characters and still gets the point across.
Oliver Upton June 1, 2023, 6:19 p.m. UTC | #5
Anish,

On Wed, Apr 12, 2023 at 09:35:05PM +0000, Anish Moorthy wrote:
> +7.35 KVM_CAP_ABSENT_MAPPING_FAULT
> +---------------------------------
> +
> +:Architectures: None
> +:Returns: -EINVAL.
> +
> +The presence of this capability indicates that userspace may pass the
> +KVM_MEM_ABSENT_MAPPING_FAULT flag to KVM_SET_USER_MEMORY_REGION to cause KVM_RUN
> +to fail (-EFAULT) in response to page faults for which the userspace page tables
> +do not contain present mappings. Attempting to enable the capability directly
> +will fail.
> +
> +The range of guest physical memory causing the fault is advertised to userspace
> +through KVM_CAP_MEMORY_FAULT_INFO (if it is enabled).

Maybe third time is the charm. I *really* do not like the
interdependence between NOWAIT exits and the completely orthogonal
annotation of existing EFAULT exits.

How do we support a userspace that only cares about NOWAIT exits but
doesn't want other EFAULT exits to be annotated? It is very likely that
userspace will only know how to resolve NOWAIT exits anyway. Since we do
not provide a precise description of the conditions that caused an exit,
there's no way for userspace to differentiate between NOWAIT exits and
other exits it couldn't care less about.

NOWAIT exits w/o annotation (i.e. a 'bare' EFAULT) make even less sense
since userspace cannot even tell what address needs fixing at that
point.

This is why I had been suggesting we separate the two capabilities and
make annotated exits an unconditional property of NOWAIT exits. It
aligns with the practical use you're proposing for the series, and still
puts userspace in the driver's seat for other issues it may or may not
care about.
Sean Christopherson June 1, 2023, 6:59 p.m. UTC | #6
On Thu, Jun 01, 2023, Oliver Upton wrote:
> Anish,
> 
> On Wed, Apr 12, 2023 at 09:35:05PM +0000, Anish Moorthy wrote:
> > +7.35 KVM_CAP_ABSENT_MAPPING_FAULT
> > +---------------------------------
> > +
> > +:Architectures: None
> > +:Returns: -EINVAL.
> > +
> > +The presence of this capability indicates that userspace may pass the
> > +KVM_MEM_ABSENT_MAPPING_FAULT flag to KVM_SET_USER_MEMORY_REGION to cause KVM_RUN
> > +to fail (-EFAULT) in response to page faults for which the userspace page tables
> > +do not contain present mappings. Attempting to enable the capability directly
> > +will fail.
> > +
> > +The range of guest physical memory causing the fault is advertised to userspace
> > +through KVM_CAP_MEMORY_FAULT_INFO (if it is enabled).
> 
> Maybe third time is the charm. I *really* do not like the
> interdependence between NOWAIT exits and the completely orthogonal
> annotation of existing EFAULT exits.

They're not completely orthogonal, because the touchpoints for NOWAIT are themselves
existing EFAULT exits.

> How do we support a userspace that only cares about NOWAIT exits but
> doesn't want other EFAULT exits to be annotated?

We don't.  The proposed approach is to not change the return value, and the
vcpu->run union currently holds random garbage on -EFAULT, so I don't see any reason
to require userspace to opt-in, or to let userspace opt-out.  I.e. fill
vcpu->run->memory_fault unconditionally (for the paths that are converted) and
advertise to userspace that vcpu->run->memory_fault *may* contain useful info on
-EFAULT when KVM_CAP_MEMORY_FAULT_INFO is supported.  And then we define KVM's
ABI such that vcpu->run->memory_fault is guaranteed to be valid if an -EFAULT occurs
when faulting in guest memory (on supported architectures).

> It is very likely that userspace will only know how to resolve NOWAIT exits
> anyway. Since we do not provide a precise description of the conditions that
> caused an exit, there's no way for userspace to differentiate between NOWAIT
> exits and other exits it couldn't care less about.
> 
> NOWAIT exits w/o annotation (i.e. a 'bare' EFAULT) make even less sense
> since userspace cannot even tell what address needs fixing at that
> point.
> 
> This is why I had been suggesting we separate the two capabilities and
> make annotated exits an unconditional property of NOWAIT exits.

No, because as I've been stating ad nauseam, KVM cannot differentiate between a
NOWAIT -EFAULT and an -EFAULT that would have occurred regardless of the NOWAIT
behavior.  Defining the ABI to be that KVM fills memory_fault if and only if the
slot has NOWAIT will create a mess, e.g. if an -EFAULT occurs while userspace
is doing a KVM_SET_USER_MEMORY_REGION to set NOWAIT, userspace may or may not see
valid memory_fault information depending on when the vCPU grabbed its memslot
snapshot.
Oliver Upton June 1, 2023, 7:29 p.m. UTC | #7
On Thu, Jun 01, 2023 at 11:59:29AM -0700, Sean Christopherson wrote:
> On Thu, Jun 01, 2023, Oliver Upton wrote:
> > How do we support a userspace that only cares about NOWAIT exits but
> > doesn't want other EFAULT exits to be annotated?
> 
> We don't.  The proposed approach is to not change the return value, and the
> vcpu->run union currently holds random garbage on -EFAULT, so I don't see any reason
> to require userspace to opt-in, or to let userspace opt-out.  I.e. fill
> vcpu->run->memory_fault unconditionally (for the paths that are converted) and
> advertise to userspace that vcpu->run->memory_fault *may* contain useful info on
> -EFAULT when KVM_CAP_MEMORY_FAULT_INFO is supported.  And then we define KVM's
> ABI such that vcpu->run->memory_fault is guaranteed to be valid if an -EFAULT occurs
> when faulting in guest memory (on supported architectures).

Sure, but the series currently gives userspace an explicit opt-in for
existing EFAULT paths. Hold your breath, I'll reply over there so we
don't mix context.

> > It is very likely that userspace will only know how to resolve NOWAIT exits
> > anyway. Since we do not provide a precise description of the conditions that
> > caused an exit, there's no way for userspace to differentiate between NOWAIT
> > exits and other exits it couldn't care less about.
> > 
> > NOWAIT exits w/o annotation (i.e. a 'bare' EFAULT) make even less sense
> > since userspace cannot even tell what address needs fixing at that
> > point.
> > 
> > This is why I had been suggesting we separate the two capabilities and
> > make annotated exits an unconditional property of NOWAIT exits.
> 
> No, because as I've been stating ad nauseam, KVM cannot differentiate between a
> NOWAIT -EFAULT and an -EFAULT that would have occurred regardless of the NOWAIT
> behavior.

IOW: "If you engage brain for more than a second, you'll actually see
the point"

Ok, I'm on board now and sorry for the noise.
Sean Christopherson June 1, 2023, 7:34 p.m. UTC | #8
On Thu, Jun 01, 2023, Oliver Upton wrote:
> On Thu, Jun 01, 2023 at 11:59:29AM -0700, Sean Christopherson wrote:
> > On Thu, Jun 01, 2023, Oliver Upton wrote:
> > > How do we support a userspace that only cares about NOWAIT exits but
> > > doesn't want other EFAULT exits to be annotated?
> > 
> > We don't.  The proposed approach is to not change the return value, and the
> > vcpu->run union currently holds random garbage on -EFAULT, so I don't see any reason
> > to require userspace to opt-in, or to let userspace opt-out.  I.e. fill
> > vcpu->run->memory_fault unconditionally (for the paths that are converted) and
> > advertise to userspace that vcpu->run->memory_fault *may* contain useful info on
> > -EFAULT when KVM_CAP_MEMORY_FAULT_INFO is supported.  And then we define KVM's
> > ABI such that vcpu->run->memory_fault is guaranteed to be valid if an -EFAULT occurs
> > when faulting in guest memory (on supported architectures).
> 
> Sure, but the series currently gives userspace an explicit opt-in for
> existing EFAULT paths. 

Yeah, that's one of the things I am/was going to provide feedback on, I've been
really slow getting into reviews for this cycle :-/
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index f174f43c38d45..7967b9909e28b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1312,6 +1312,7 @@  yet and must be cleared on entry.
   /* for kvm_userspace_memory_region::flags */
   #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
   #define KVM_MEM_READONLY	(1UL << 1)
+  #define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)
 
 This ioctl allows the user to create, modify or delete a guest physical
 memory slot.  Bits 0-15 of "slot" specify the slot id and this value
@@ -1342,12 +1343,15 @@  It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
 be identical.  This allows large pages in the guest to be backed by large
 pages in the host.
 
-The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and
-KVM_MEM_READONLY.  The former can be set to instruct KVM to keep track of
+The flags field supports three flags
+
+1.  KVM_MEM_LOG_DIRTY_PAGES: can be set to instruct KVM to keep track of
 writes to memory within the slot.  See KVM_GET_DIRTY_LOG ioctl to know how to
-use it.  The latter can be set, if KVM_CAP_READONLY_MEM capability allows it,
+use it.
+2.  KVM_MEM_READONLY: can be set, if KVM_CAP_READONLY_MEM capability allows it,
 to make a new slot read-only.  In this case, writes to this memory will be
 posted to userspace as KVM_EXIT_MMIO exits.
+3.  KVM_MEM_ABSENT_MAPPING_FAULT: see KVM_CAP_ABSENT_MAPPING_FAULT for details.
 
 When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of
 the memory region are automatically reflected into the guest.  For example, an
@@ -7705,6 +7709,27 @@  userspace may receive "bare" EFAULTs (i.e. exit reason !=
 KVM_EXIT_MEMORY_FAULT) from KVM_RUN. These should be considered bugs and
 reported to the maintainers.
 
+7.35 KVM_CAP_ABSENT_MAPPING_FAULT
+---------------------------------
+
+:Architectures: None
+:Returns: -EINVAL.
+
+The presence of this capability indicates that userspace may pass the
+KVM_MEM_ABSENT_MAPPING_FAULT flag to KVM_SET_USER_MEMORY_REGION to cause KVM_RUN
+to fail (-EFAULT) in response to page faults for which the userspace page tables
+do not contain present mappings. Attempting to enable the capability directly
+will fail.
+
+The range of guest physical memory causing the fault is advertised to userspace
+through KVM_CAP_MEMORY_FAULT_INFO (if it is enabled).
+
+Userspace should determine how best to make the mapping present, then take
+appropriate action. For instance, in the case of absent mappings this might
+involve establishing the mapping for the first time via UFFDIO_COPY/CONTINUE or
+faulting the mapping in using MADV_POPULATE_READ/WRITE. After establishing the
+mapping, userspace can return to KVM to retry the previous memory access.
+
 8. Other capabilities.
 ======================
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 776f9713f3921..2407fc1e52ab8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2289,4 +2289,11 @@  static inline void kvm_account_pgtable_pages(void *virt, int nr)
  */
 inline void kvm_populate_efault_info(struct kvm_vcpu *vcpu,
 					uint64_t gpa, uint64_t len);
+
+static inline bool kvm_slot_fault_on_absent_mapping(
+							const struct kvm_memory_slot *slot)
+{
+	return slot->flags & KVM_MEM_ABSENT_MAPPING_FAULT;
+}
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index bc73e8381a2bb..21df449e74648 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,6 +102,7 @@  struct kvm_userspace_memory_region {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
+#define KVM_MEM_ABSENT_MAPPING_FAULT	(1UL << 2)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -1196,6 +1197,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
 #define KVM_CAP_MEMORY_FAULT_INFO 227
+#define KVM_CAP_ABSENT_MAPPING_FAULT 228
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 5c57796364d65..59219da95634c 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -102,6 +102,7 @@  struct kvm_userspace_memory_region {
  */
 #define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
 #define KVM_MEM_READONLY	(1UL << 1)
+#define KVM_MEM_ABSENT_MAPPING_FAULT (1UL << 2)
 
 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f3be5aa49829a..7cd0ad94726df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1525,6 +1525,9 @@  static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
 	valid_flags |= KVM_MEM_READONLY;
 #endif
 
+	if (kvm_vm_ioctl_check_extension(NULL, KVM_CAP_ABSENT_MAPPING_FAULT))
+		valid_flags |= KVM_MEM_ABSENT_MAPPING_FAULT;
+
 	if (mem->flags & ~valid_flags)
 		return -EINVAL;