diff mbox series

[v10,9/9] KVM: Enable and expose KVM_MEM_PRIVATE

Message ID 20221202061347.1070246-10-chao.p.peng@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM | expand

Commit Message

Chao Peng Dec. 2, 2022, 6:13 a.m. UTC
Register/unregister private memslot to fd-based memory backing store
restrictedmem and implement the callbacks for restrictedmem_notifier:
  - invalidate_start()/invalidate_end() to zap the existing memory
    mappings in the KVM page table.
  - error() to request KVM_REQ_MEMORY_MCE and later exit to userspace
    with KVM_EXIT_SHUTDOWN.

Expose KVM_MEM_PRIVATE for memslot and KVM_MEMORY_ATTRIBUTE_PRIVATE for
KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES to userspace but either are
controlled by kvm_arch_has_private_mem() which should be rewritten by
architecture code.

Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
Reviewed-by: Fuad Tabba <tabba@google.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/x86.c              |  13 +++
 include/linux/kvm_host.h        |   3 +
 virt/kvm/kvm_main.c             | 179 +++++++++++++++++++++++++++++++-
 4 files changed, 191 insertions(+), 5 deletions(-)

Comments

Fuad Tabba Dec. 9, 2022, 9:11 a.m. UTC | #1
Hi,

On Fri, Dec 2, 2022 at 6:20 AM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> Register/unregister private memslot to fd-based memory backing store
> restrictedmem and implement the callbacks for restrictedmem_notifier:
>   - invalidate_start()/invalidate_end() to zap the existing memory
>     mappings in the KVM page table.
>   - error() to request KVM_REQ_MEMORY_MCE and later exit to userspace
>     with KVM_EXIT_SHUTDOWN.
>
> Expose KVM_MEM_PRIVATE for memslot and KVM_MEMORY_ATTRIBUTE_PRIVATE for
> KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES to userspace but either are
> controlled by kvm_arch_has_private_mem() which should be rewritten by
> architecture code.
>
> Co-developed-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> Reviewed-by: Fuad Tabba <tabba@google.com>

With the code to port it to pKVM/arm64:
Tested-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad


> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/kvm/x86.c              |  13 +++
>  include/linux/kvm_host.h        |   3 +
>  virt/kvm/kvm_main.c             | 179 +++++++++++++++++++++++++++++++-
>  4 files changed, 191 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7772ab37ac89..27ef31133352 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -114,6 +114,7 @@
>         KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_HV_TLB_FLUSH \
>         KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> +#define KVM_REQ_MEMORY_MCE             KVM_ARCH_REQ(33)
>
>  #define CR0_RESERVED_BITS                                               \
>         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5aefcff614d2..c67e22f3e2ee 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6587,6 +6587,13 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>
> +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
> +void kvm_arch_memory_mce(struct kvm *kvm)
> +{
> +       kvm_make_all_cpus_request(kvm, KVM_REQ_MEMORY_MCE);
> +}
> +#endif
> +
>  static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
>  {
>         struct kvm_clock_data data = { 0 };
> @@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
>                 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
>                         static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> +
> +               if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
> +                       vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> +                       r = 0;
> +                       goto out;
> +               }
>         }
>
>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 153842bb33df..f032d878e034 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -590,6 +590,7 @@ struct kvm_memory_slot {
>         struct file *restricted_file;
>         loff_t restricted_offset;
>         struct restrictedmem_notifier notifier;
> +       struct kvm *kvm;
>  };
>
>  static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> @@ -2363,6 +2364,8 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
>         *pfn = page_to_pfn(page);
>         return ret;
>  }
> +
> +void kvm_arch_memory_mce(struct kvm *kvm);
>  #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
>
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e107afea32f0..ac835fc77273 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -936,6 +936,121 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
>
>  #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
>
> +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
> +static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
> +                                        pgoff_t start, pgoff_t end,
> +                                        gfn_t *gfn_start, gfn_t *gfn_end)
> +{
> +       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
> +
> +       if (start > base_pgoff)
> +               *gfn_start = slot->base_gfn + start - base_pgoff;
> +       else
> +               *gfn_start = slot->base_gfn;
> +
> +       if (end < base_pgoff + slot->npages)
> +               *gfn_end = slot->base_gfn + end - base_pgoff;
> +       else
> +               *gfn_end = slot->base_gfn + slot->npages;
> +
> +       if (*gfn_start >= *gfn_end)
> +               return false;
> +
> +       return true;
> +}
> +
> +static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *notifier,
> +                                              pgoff_t start, pgoff_t end)
> +{
> +       struct kvm_memory_slot *slot = container_of(notifier,
> +                                                   struct kvm_memory_slot,
> +                                                   notifier);
> +       struct kvm *kvm = slot->kvm;
> +       gfn_t gfn_start, gfn_end;
> +       struct kvm_gfn_range gfn_range;
> +       int idx;
> +
> +       if (!restrictedmem_range_is_valid(slot, start, end,
> +                                         &gfn_start, &gfn_end))
> +               return;
> +
> +       gfn_range.start = gfn_start;
> +       gfn_range.end = gfn_end;
> +       gfn_range.slot = slot;
> +       gfn_range.pte = __pte(0);
> +       gfn_range.may_block = true;
> +
> +       idx = srcu_read_lock(&kvm->srcu);
> +       KVM_MMU_LOCK(kvm);
> +
> +       kvm_mmu_invalidate_begin(kvm);
> +       kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
> +       if (kvm_unmap_gfn_range(kvm, &gfn_range))
> +               kvm_flush_remote_tlbs(kvm);
> +
> +       KVM_MMU_UNLOCK(kvm);
> +       srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> +static void kvm_restrictedmem_invalidate_end(struct restrictedmem_notifier *notifier,
> +                                            pgoff_t start, pgoff_t end)
> +{
> +       struct kvm_memory_slot *slot = container_of(notifier,
> +                                                   struct kvm_memory_slot,
> +                                                   notifier);
> +       struct kvm *kvm = slot->kvm;
> +       gfn_t gfn_start, gfn_end;
> +
> +       if (!restrictedmem_range_is_valid(slot, start, end,
> +                                         &gfn_start, &gfn_end))
> +               return;
> +
> +       KVM_MMU_LOCK(kvm);
> +       kvm_mmu_invalidate_end(kvm);
> +       KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static void kvm_restrictedmem_error(struct restrictedmem_notifier *notifier,
> +                                   pgoff_t start, pgoff_t end)
> +{
> +       struct kvm_memory_slot *slot = container_of(notifier,
> +                                                   struct kvm_memory_slot,
> +                                                   notifier);
> +       kvm_arch_memory_mce(slot->kvm);
> +}
> +
> +static struct restrictedmem_notifier_ops kvm_restrictedmem_notifier_ops = {
> +       .invalidate_start = kvm_restrictedmem_invalidate_begin,
> +       .invalidate_end = kvm_restrictedmem_invalidate_end,
> +       .error = kvm_restrictedmem_error,
> +};
> +
> +static inline void kvm_restrictedmem_register(struct kvm_memory_slot *slot)
> +{
> +       slot->notifier.ops = &kvm_restrictedmem_notifier_ops;
> +       restrictedmem_register_notifier(slot->restricted_file, &slot->notifier);
> +}
> +
> +static inline void kvm_restrictedmem_unregister(struct kvm_memory_slot *slot)
> +{
> +       restrictedmem_unregister_notifier(slot->restricted_file,
> +                                         &slot->notifier);
> +}
> +
> +#else /* !CONFIG_HAVE_KVM_RESTRICTED_MEM */
> +
> +static inline void kvm_restrictedmem_register(struct kvm_memory_slot *slot)
> +{
> +       WARN_ON_ONCE(1);
> +}
> +
> +static inline void kvm_restrictedmem_unregister(struct kvm_memory_slot *slot)
> +{
> +       WARN_ON_ONCE(1);
> +}
> +
> +#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>  static int kvm_pm_notifier_call(struct notifier_block *bl,
>                                 unsigned long state,
> @@ -980,6 +1095,11 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
>  /* This does not remove the slot from struct kvm_memslots data structures */
>  static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>  {
> +       if (slot->flags & KVM_MEM_PRIVATE) {
> +               kvm_restrictedmem_unregister(slot);
> +               fput(slot->restricted_file);
> +       }
> +
>         kvm_destroy_dirty_bitmap(slot);
>
>         kvm_arch_free_memslot(kvm, slot);
> @@ -1551,10 +1671,14 @@ static void kvm_replace_memslot(struct kvm *kvm,
>         }
>  }
>
> -static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
> +static int check_memory_region_flags(struct kvm *kvm,
> +                                    const struct kvm_user_mem_region *mem)
>  {
>         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>
> +       if (kvm_arch_has_private_mem(kvm))
> +               valid_flags |= KVM_MEM_PRIVATE;
> +
>  #ifdef __KVM_HAVE_READONLY_MEM
>         valid_flags |= KVM_MEM_READONLY;
>  #endif
> @@ -1630,6 +1754,9 @@ static int kvm_prepare_memory_region(struct kvm *kvm,
>  {
>         int r;
>
> +       if (change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE)
> +               kvm_restrictedmem_register(new);
> +
>         /*
>          * If dirty logging is disabled, nullify the bitmap; the old bitmap
>          * will be freed on "commit".  If logging is enabled in both old and
> @@ -1658,6 +1785,9 @@ static int kvm_prepare_memory_region(struct kvm *kvm,
>         if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
>                 kvm_destroy_dirty_bitmap(new);
>
> +       if (r && change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE)
> +               kvm_restrictedmem_unregister(new);
> +
>         return r;
>  }
>
> @@ -1963,7 +2093,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>         int as_id, id;
>         int r;
>
> -       r = check_memory_region_flags(mem);
> +       r = check_memory_region_flags(kvm, mem);
>         if (r)
>                 return r;
>
> @@ -1982,6 +2112,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
>              !access_ok((void __user *)(unsigned long)mem->userspace_addr,
>                         mem->memory_size))
>                 return -EINVAL;
> +       if (mem->flags & KVM_MEM_PRIVATE &&
> +               (mem->restricted_offset & (PAGE_SIZE - 1) ||
> +                mem->restricted_offset > U64_MAX - mem->memory_size))
> +               return -EINVAL;
>         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
>                 return -EINVAL;
>         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> @@ -2020,6 +2154,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
>                 if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
>                         return -EINVAL;
>         } else { /* Modify an existing slot. */
> +               /* Private memslots are immutable, they can only be deleted. */
> +               if (mem->flags & KVM_MEM_PRIVATE)
> +                       return -EINVAL;
>                 if ((mem->userspace_addr != old->userspace_addr) ||
>                     (npages != old->npages) ||
>                     ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> @@ -2048,10 +2185,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
>         new->npages = npages;
>         new->flags = mem->flags;
>         new->userspace_addr = mem->userspace_addr;
> +       if (mem->flags & KVM_MEM_PRIVATE) {
> +               new->restricted_file = fget(mem->restricted_fd);
> +               if (!new->restricted_file ||
> +                   !file_is_restrictedmem(new->restricted_file)) {
> +                       r = -EINVAL;
> +                       goto out;
> +               }
> +               new->restricted_offset = mem->restricted_offset;
> +       }
> +
> +       new->kvm = kvm;
>
>         r = kvm_set_memslot(kvm, old, new, change);
>         if (r)
> -               kfree(new);
> +               goto out;
> +
> +       return 0;
> +
> +out:
> +       if (new->restricted_file)
> +               fput(new->restricted_file);
> +       kfree(new);
>         return r;
>  }
>  EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
> @@ -2351,6 +2506,8 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
>  #ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES
>  static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>  {
> +       if (kvm_arch_has_private_mem(kvm))
> +               return KVM_MEMORY_ATTRIBUTE_PRIVATE;
>         return 0;
>  }
>
> @@ -4822,16 +4979,28 @@ static long kvm_vm_ioctl(struct file *filp,
>         }
>         case KVM_SET_USER_MEMORY_REGION: {
>                 struct kvm_user_mem_region mem;
> -               unsigned long size = sizeof(struct kvm_userspace_memory_region);
> +               unsigned int flags_offset = offsetof(typeof(mem), flags);
> +               unsigned long size;
> +               u32 flags;
>
>                 kvm_sanity_check_user_mem_region_alias();
>
> +               memset(&mem, 0, sizeof(mem));
> +
>                 r = -EFAULT;
> +               if (get_user(flags, (u32 __user *)(argp + flags_offset)))
> +                       goto out;
> +
> +               if (flags & KVM_MEM_PRIVATE)
> +                       size = sizeof(struct kvm_userspace_memory_region_ext);
> +               else
> +                       size = sizeof(struct kvm_userspace_memory_region);
> +
>                 if (copy_from_user(&mem, argp, size))
>                         goto out;
>
>                 r = -EINVAL;
> -               if (mem.flags & KVM_MEM_PRIVATE)
> +               if ((flags ^ mem.flags) & KVM_MEM_PRIVATE)
>                         goto out;
>
>                 r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> --
> 2.25.1
>
Vishal Annapurve Jan. 5, 2023, 8:38 p.m. UTC | #2
On Thu, Dec 1, 2022 at 10:20 PM Chao Peng <chao.p.peng@linux.intel.com> wrote:
>
> +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
> +static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
> +                                        pgoff_t start, pgoff_t end,
> +                                        gfn_t *gfn_start, gfn_t *gfn_end)
> +{
> +       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
> +
> +       if (start > base_pgoff)
> +               *gfn_start = slot->base_gfn + start - base_pgoff;

There should be a check for overflow here in case start is a very big
value. Additional check can look like:
if (start >= base_pgoff + slot->npages)
       return false;

> +       else
> +               *gfn_start = slot->base_gfn;
> +
> +       if (end < base_pgoff + slot->npages)
> +               *gfn_end = slot->base_gfn + end - base_pgoff;

If "end" is smaller than base_pgoff, this can cause overflow and
return the range as valid. There should be additional check:
if (end < base_pgoff)
         return false;


> +       else
> +               *gfn_end = slot->base_gfn + slot->npages;
> +
> +       if (*gfn_start >= *gfn_end)
> +               return false;
> +
> +       return true;
> +}
> +
Chao Peng Jan. 6, 2023, 4:13 a.m. UTC | #3
On Thu, Jan 05, 2023 at 12:38:30PM -0800, Vishal Annapurve wrote:
> On Thu, Dec 1, 2022 at 10:20 PM Chao Peng <chao.p.peng@linux.intel.com> wrote:
> >
> > +#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
> > +static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
> > +                                        pgoff_t start, pgoff_t end,
> > +                                        gfn_t *gfn_start, gfn_t *gfn_end)
> > +{
> > +       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
> > +
> > +       if (start > base_pgoff)
> > +               *gfn_start = slot->base_gfn + start - base_pgoff;
> 
> There should be a check for overflow here in case start is a very big
> value. Additional check can look like:
> if (start >= base_pgoff + slot->npages)
>        return false;
> 
> > +       else
> > +               *gfn_start = slot->base_gfn;
> > +
> > +       if (end < base_pgoff + slot->npages)
> > +               *gfn_end = slot->base_gfn + end - base_pgoff;
> 
> If "end" is smaller than base_pgoff, this can cause overflow and
> return the range as valid. There should be additional check:
> if (end < base_pgoff)
>          return false;

Thanks! Both are good catches. The improved code:

static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
					 pgoff_t start, pgoff_t end,
					 gfn_t *gfn_start, gfn_t *gfn_end)
{
	unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;

	if (start >= base_pgoff + slot->npages)
		return false;
	else if (start <= base_pgoff)
		*gfn_start = slot->base_gfn;
	else
		*gfn_start = start - base_pgoff + slot->base_gfn;

	if (end <= base_pgoff)
		return false;
	else if (end >= base_pgoff + slot->npages)
		*gfn_end = slot->base_gfn + slot->npages;
	else
		*gfn_end = end - base_pgoff + slot->base_gfn;

	if (*gfn_start >= *gfn_end)
		return false;

	return true;
}

Thanks,
Chao
> 
> 
> > +       else
> > +               *gfn_end = slot->base_gfn + slot->npages;
> > +
> > +       if (*gfn_start >= *gfn_end)
> > +               return false;
> > +
> > +       return true;
> > +}
> > +
Sean Christopherson Jan. 14, 2023, 12:01 a.m. UTC | #4
On Fri, Dec 02, 2022, Chao Peng wrote:
> @@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
>  			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> +
> +		if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
> +			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;

Synthesizing triple fault shutdown is not the right approach.  Even with TDX's
MCE "architecture" (heavy sarcasm), it's possible that host userspace and the
guest have a paravirt interface for handling memory errors without killing the
host.

> +			r = 0;
> +			goto out;
> +		}
>  	}


> @@ -1982,6 +2112,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
>  			mem->memory_size))
>  		return -EINVAL;
> +	if (mem->flags & KVM_MEM_PRIVATE &&
> +		(mem->restricted_offset & (PAGE_SIZE - 1) ||

Align indentation.

> +		 mem->restricted_offset > U64_MAX - mem->memory_size))

Strongly prefer to use similar logic to existing code that detects wraps:

		mem->restricted_offset + mem->memory_size < mem->restricted_offset

This is also where I'd like to add the "gfn is aligned to offset" check, though
my brain is too fried to figure that out right now.

> +		return -EINVAL;
>  	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
>  		return -EINVAL;
>  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> @@ -2020,6 +2154,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
>  			return -EINVAL;
>  	} else { /* Modify an existing slot. */
> +		/* Private memslots are immutable, they can only be deleted. */

I'm 99% certain I suggested this, but if we're going to make these memslots
immutable, then we should straight up disallow dirty logging, otherwise we'll
end up with a bizarre uAPI.

> +		if (mem->flags & KVM_MEM_PRIVATE)
> +			return -EINVAL;
>  		if ((mem->userspace_addr != old->userspace_addr) ||
>  		    (npages != old->npages) ||
>  		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> @@ -2048,10 +2185,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  	new->npages = npages;
>  	new->flags = mem->flags;
>  	new->userspace_addr = mem->userspace_addr;
> +	if (mem->flags & KVM_MEM_PRIVATE) {
> +		new->restricted_file = fget(mem->restricted_fd);
> +		if (!new->restricted_file ||
> +		    !file_is_restrictedmem(new->restricted_file)) {
> +			r = -EINVAL;
> +			goto out;
> +		}
> +		new->restricted_offset = mem->restricted_offset;
> +	}
> +
> +	new->kvm = kvm;

Set this above, just so that the code flows better.
Chao Peng Jan. 17, 2023, 1:12 p.m. UTC | #5
On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
> > @@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> >  
> >  		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
> >  			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> > +
> > +		if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
> > +			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> 
> Synthesizing triple fault shutdown is not the right approach.  Even with TDX's
> MCE "architecture" (heavy sarcasm), it's possible that host userspace and the
> guest have a paravirt interface for handling memory errors without killing the
> host.

Agree shutdown is not the correct choice. I see you made below change:

send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current)

The MCE may happen in any thread than KVM thread, sending siginal to
'current' thread may not be the expected behavior. Also how userspace
can tell is the MCE on the shared page or private page? Do we care?

> 
> > +			r = 0;
> > +			goto out;
> > +		}
> >  	}
> 
> 
> > @@ -1982,6 +2112,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >  	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
> >  			mem->memory_size))
> >  		return -EINVAL;
> > +	if (mem->flags & KVM_MEM_PRIVATE &&
> > +		(mem->restricted_offset & (PAGE_SIZE - 1) ||
> 
> Align indentation.
> 
> > +		 mem->restricted_offset > U64_MAX - mem->memory_size))
> 
> Strongly prefer to use similar logic to existing code that detects wraps:
> 
> 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> 
> This is also where I'd like to add the "gfn is aligned to offset" check, though
> my brain is too fried to figure that out right now.
> 
> > +		return -EINVAL;
> >  	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
> >  		return -EINVAL;
> >  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> > @@ -2020,6 +2154,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >  		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
> >  			return -EINVAL;
> >  	} else { /* Modify an existing slot. */
> > +		/* Private memslots are immutable, they can only be deleted. */
> 
> I'm 99% certain I suggested this, but if we're going to make these memslots
> immutable, then we should straight up disallow dirty logging, otherwise we'll
> end up with a bizarre uAPI.

But in my mind dirty logging will be needed in the very short time, when
live migration gets supported?

> 
> > +		if (mem->flags & KVM_MEM_PRIVATE)
> > +			return -EINVAL;
> >  		if ((mem->userspace_addr != old->userspace_addr) ||
> >  		    (npages != old->npages) ||
> >  		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> > @@ -2048,10 +2185,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >  	new->npages = npages;
> >  	new->flags = mem->flags;
> >  	new->userspace_addr = mem->userspace_addr;
> > +	if (mem->flags & KVM_MEM_PRIVATE) {
> > +		new->restricted_file = fget(mem->restricted_fd);
> > +		if (!new->restricted_file ||
> > +		    !file_is_restrictedmem(new->restricted_file)) {
> > +			r = -EINVAL;
> > +			goto out;
> > +		}
> > +		new->restricted_offset = mem->restricted_offset;

I see you changed slot->restricted_offset type from loff_t to gfn_t and
used pgoff_t when doing the restrictedmem_bind/unbind(). Using page
index is reasonable KVM internally and sounds simpler than loff_t. But
we also need initialize it to page index here as well as changes in
another two cases. This is needed when restricted_offset != 0.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 547b92215002..49e375e78f30 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2364,8 +2364,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
                                             gfn_t gfn, kvm_pfn_t *pfn,
                                             int *order)
 {
-       pgoff_t index = gfn - slot->base_gfn +
-                       (slot->restricted_offset >> PAGE_SHIFT);
+       pgoff_t index = gfn - slot->base_gfn + slot->restricted_offset;
        struct page *page;
        int ret;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 01db35ddd5b3..7439bdcb0d04 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -935,7 +935,7 @@ static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
                                         pgoff_t start, pgoff_t end,
                                         gfn_t *gfn_start, gfn_t *gfn_end)
 {
-       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
+       unsigned long base_pgoff = slot->restricted_offset;
 
        if (start > base_pgoff)
                *gfn_start = slot->base_gfn + start - base_pgoff;
@@ -2275,7 +2275,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                        r = -EINVAL;
                        goto out;
                }
-               new->restricted_offset = mem->restricted_offset;
+               new->restricted_offset = mem->restricted_offset >> PAGE_SHIFT;
        }
 
        r = kvm_set_memslot(kvm, old, new, change);

Chao
> > +	}
> > +
> > +	new->kvm = kvm;
> 
> Set this above, just so that the code flows better.
Sean Christopherson Jan. 17, 2023, 7:35 p.m. UTC | #6
On Tue, Jan 17, 2023, Chao Peng wrote:
> On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > @@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > >  
> > >  		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
> > >  			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> > > +
> > > +		if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
> > > +			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> > 
> > Synthesizing triple fault shutdown is not the right approach.  Even with TDX's
> > MCE "architecture" (heavy sarcasm), it's possible that host userspace and the
> > guest have a paravirt interface for handling memory errors without killing the
> > host.
> 
> Agree shutdown is not the correct choice. I see you made below change:
> 
> send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current)
> 
> The MCE may happen in any thread than KVM thread, sending siginal to
> 'current' thread may not be the expected behavior.

This is already true today, e.g. a #MC in memory that is mapped into the guest can
be triggered by a host access.  Hrm, but in this case we actually have a KVM
instance, and we know that the #MC is relevant to the KVM instance, so I agree
that signaling 'current' is kludgy.

>  Also how userspace can tell is the MCE on the shared page or private page?
>  Do we care?

We care.  I was originally thinking we could require userspace to keep track of
things, but that's quite prescriptive and flawed, e.g. could race with conversions.

One option would be to KVM_EXIT_MEMORY_FAULT, and then wire up a generic (not x86
specific) KVM request to exit to userspace, e.g.

		/* KVM_EXIT_MEMORY_FAULT */
		struct {
#define KVM_MEMORY_EXIT_FLAG_PRIVATE	(1ULL << 3)
#define KVM_MEMORY_EXIT_FLAG_HW_ERROR	(1ULL << 4)
			__u64 flags;
			__u64 gpa;
			__u64 size;
		} memory;

But I'm not sure that's the correct approach.  It kinda feels like we're reinventing
the wheel.  It seems like restrictedmem_get_page() _must_ be able to reject attempts
to get a poisoned page, i.e. restrictedmem_get_page() should yield KVM_PFN_ERR_HWPOISON.
Assuming that's the case, then I believe KVM simply needs to zap SPTEs in response
to an error notification in order to force vCPUs to fault on the poisoned page.

> > > +		return -EINVAL;
> > >  	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
> > >  		return -EINVAL;
> > >  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> > > @@ -2020,6 +2154,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
> > >  		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
> > >  			return -EINVAL;
> > >  	} else { /* Modify an existing slot. */
> > > +		/* Private memslots are immutable, they can only be deleted. */
> > 
> > I'm 99% certain I suggested this, but if we're going to make these memslots
> > immutable, then we should straight up disallow dirty logging, otherwise we'll
> > end up with a bizarre uAPI.
> 
> But in my mind dirty logging will be needed in the very short time, when
> live migration gets supported?

Ya, but if/when live migration support is added, private memslots will no longer
be immutable as userspace will want to enable dirty logging only when a VM is
being migrated, i.e. something will need to change.

Given that it looks like we have clear line of sight to SEV+UPM guests, my
preference would be to allow toggling dirty logging from the get-go.  It doesn't
necessarily have to be in the first patch, e.g. KVM could initially reject
KVM_MEM_LOG_DIRTY_PAGES + KVM_MEM_PRIVATE and then add support separately to make
the series easier to review, test, and bisect.

static int check_memory_region_flags(struct kvm *kvm,
				     const struct kvm_userspace_memory_region2 *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

	if (kvm_arch_has_private_mem(kvm) &&
	    ~(mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
		valid_flags |= KVM_MEM_PRIVATE;


	...
}

> > > +		if (mem->flags & KVM_MEM_PRIVATE)
> > > +			return -EINVAL;
> > >  		if ((mem->userspace_addr != old->userspace_addr) ||
> > >  		    (npages != old->npages) ||
> > >  		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> > > @@ -2048,10 +2185,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
> > >  	new->npages = npages;
> > >  	new->flags = mem->flags;
> > >  	new->userspace_addr = mem->userspace_addr;
> > > +	if (mem->flags & KVM_MEM_PRIVATE) {
> > > +		new->restricted_file = fget(mem->restricted_fd);
> > > +		if (!new->restricted_file ||
> > > +		    !file_is_restrictedmem(new->restricted_file)) {
> > > +			r = -EINVAL;
> > > +			goto out;
> > > +		}
> > > +		new->restricted_offset = mem->restricted_offset;
> 
> I see you changed slot->restricted_offset type from loff_t to gfn_t and
> used pgoff_t when doing the restrictedmem_bind/unbind(). Using page
> index is reasonable KVM internally and sounds simpler than loff_t. But
> we also need initialize it to page index here as well as changes in
> another two cases. This is needed when restricted_offset != 0.

Oof.  I'm pretty sure I completely missed that loff_t is used for byte offsets,
whereas pgoff_t is a frame index. 

Given that the restrictmem APIs take pgoff_t, I definitely think it makes sense
to the index, but I'm very tempted to store pgoff_t instead of gfn_t, and name
the field "index" to help connect the dots to the rest of kernel, where "pgoff_t index"
is quite common.

And looking at those bits again, we should wrap all of the restrictedmem fields
with CONFIG_KVM_PRIVATE_MEM.  It'll require minor tweaks to __kvm_set_memory_region(),
but I think will yield cleaner code (and internal APIs) overall.

And wrap the three fields in an anonymous struct?  E.g. this is a little more
versbose (restrictedmem instead restricted), but at first glance it doesn't seem
to cause widespared line length issues.

#ifdef CONFIG_KVM_PRIVATE_MEM
	struct {
		struct file *file;
		pgoff_t index;
		struct restrictedmem_notifier notifier;
	} restrictedmem;
#endif

> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 547b92215002..49e375e78f30 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2364,8 +2364,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
>                                              gfn_t gfn, kvm_pfn_t *pfn,
>                                              int *order)
>  {
> -       pgoff_t index = gfn - slot->base_gfn +
> -                       (slot->restricted_offset >> PAGE_SHIFT);
> +       pgoff_t index = gfn - slot->base_gfn + slot->restricted_offset;
>         struct page *page;
>         int ret;
>  
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 01db35ddd5b3..7439bdcb0d04 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -935,7 +935,7 @@ static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
>                                          pgoff_t start, pgoff_t end,
>                                          gfn_t *gfn_start, gfn_t *gfn_end)
>  {
> -       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
> +       unsigned long base_pgoff = slot->restricted_offset;
>  
>         if (start > base_pgoff)
>                 *gfn_start = slot->base_gfn + start - base_pgoff;
> @@ -2275,7 +2275,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>                         r = -EINVAL;
>                         goto out;
>                 }
> -               new->restricted_offset = mem->restricted_offset;
> +               new->restricted_offset = mem->restricted_offset >> PAGE_SHIFT;
>         }
>  
>         r = kvm_set_memslot(kvm, old, new, change);
> 
> Chao
> > > +	}
> > > +
> > > +	new->kvm = kvm;
> > 
> > Set this above, just so that the code flows better.
Chao Peng Jan. 18, 2023, 8:23 a.m. UTC | #7
On Tue, Jan 17, 2023 at 07:35:58PM +0000, Sean Christopherson wrote:
> On Tue, Jan 17, 2023, Chao Peng wrote:
> > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > @@ -10357,6 +10364,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
> > > >  
> > > >  		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
> > > >  			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> > > > +
> > > > +		if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
> > > > +			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
> > > 
> > > Synthesizing triple fault shutdown is not the right approach.  Even with TDX's
> > > MCE "architecture" (heavy sarcasm), it's possible that host userspace and the
> > > guest have a paravirt interface for handling memory errors without killing the
> > > host.
> > 
> > Agree shutdown is not the correct choice. I see you made below change:
> > 
> > send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current)
> > 
> > The MCE may happen in any thread than KVM thread, sending siginal to
> > 'current' thread may not be the expected behavior.
> 
> This is already true today, e.g. a #MC in memory that is mapped into the guest can
> be triggered by a host access.  Hrm, but in this case we actually have a KVM
> instance, and we know that the #MC is relevant to the KVM instance, so I agree
> that signaling 'current' is kludgy.
> 
> >  Also how userspace can tell is the MCE on the shared page or private page?
> >  Do we care?
> 
> We care.  I was originally thinking we could require userspace to keep track of
> things, but that's quite prescriptive and flawed, e.g. could race with conversions.
> 
> One option would be to KVM_EXIT_MEMORY_FAULT, and then wire up a generic (not x86
> specific) KVM request to exit to userspace, e.g.
> 
> 		/* KVM_EXIT_MEMORY_FAULT */
> 		struct {
> #define KVM_MEMORY_EXIT_FLAG_PRIVATE	(1ULL << 3)
> #define KVM_MEMORY_EXIT_FLAG_HW_ERROR	(1ULL << 4)
> 			__u64 flags;
> 			__u64 gpa;
> 			__u64 size;
> 		} memory;
> 
> But I'm not sure that's the correct approach.  It kinda feels like we're reinventing
> the wheel.  It seems like restrictedmem_get_page() _must_ be able to reject attempts
> to get a poisoned page, i.e. restrictedmem_get_page() should yield KVM_PFN_ERR_HWPOISON.

Yes, I see there is -EHWPOISON handling for hva_to_pfn() for shared
memory. It makes sense doing similar for private page.

> Assuming that's the case, then I believe KVM simply needs to zap SPTEs in response
> to an error notification in order to force vCPUs to fault on the poisoned page.

Agree, this is waht we should do anyway.

> 
> > > > +		return -EINVAL;
> > > >  	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
> > > >  		return -EINVAL;
> > > >  	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> > > > @@ -2020,6 +2154,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
> > > >  		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
> > > >  			return -EINVAL;
> > > >  	} else { /* Modify an existing slot. */
> > > > +		/* Private memslots are immutable, they can only be deleted. */
> > > 
> > > I'm 99% certain I suggested this, but if we're going to make these memslots
> > > immutable, then we should straight up disallow dirty logging, otherwise we'll
> > > end up with a bizarre uAPI.
> > 
> > But in my mind dirty logging will be needed in the very short time, when
> > live migration gets supported?
> 
> Ya, but if/when live migration support is added, private memslots will no longer
> be immutable as userspace will want to enable dirty logging only when a VM is
> being migrated, i.e. something will need to change.
> 
> Given that it looks like we have clear line of sight to SEV+UPM guests, my
> preference would be to allow toggling dirty logging from the get-go.  It doesn't
> necessarily have to be in the first patch, e.g. KVM could initially reject
> KVM_MEM_LOG_DIRTY_PAGES + KVM_MEM_PRIVATE and then add support separately to make
> the series easier to review, test, and bisect.
> 
> static int check_memory_region_flags(struct kvm *kvm,
> 				     const struct kvm_userspace_memory_region2 *mem)
> {
> 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> 
> 	if (kvm_arch_has_private_mem(kvm) &&
> 	    ~(mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
> 		valid_flags |= KVM_MEM_PRIVATE;

Adding this limitation is OK to me. It's not too hard to remove it when
live migration gets added.

> 
> 
> 	...
> }
> 
> > > > +		if (mem->flags & KVM_MEM_PRIVATE)
> > > > +			return -EINVAL;
> > > >  		if ((mem->userspace_addr != old->userspace_addr) ||
> > > >  		    (npages != old->npages) ||
> > > >  		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> > > > @@ -2048,10 +2185,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
> > > >  	new->npages = npages;
> > > >  	new->flags = mem->flags;
> > > >  	new->userspace_addr = mem->userspace_addr;
> > > > +	if (mem->flags & KVM_MEM_PRIVATE) {
> > > > +		new->restricted_file = fget(mem->restricted_fd);
> > > > +		if (!new->restricted_file ||
> > > > +		    !file_is_restrictedmem(new->restricted_file)) {
> > > > +			r = -EINVAL;
> > > > +			goto out;
> > > > +		}
> > > > +		new->restricted_offset = mem->restricted_offset;
> > 
> > I see you changed slot->restricted_offset type from loff_t to gfn_t and
> > used pgoff_t when doing the restrictedmem_bind/unbind(). Using page
> > index is reasonable KVM internally and sounds simpler than loff_t. But
> > we also need initialize it to page index here as well as changes in
> > another two cases. This is needed when restricted_offset != 0.
> 
> Oof.  I'm pretty sure I completely missed that loff_t is used for byte offsets,
> whereas pgoff_t is a frame index. 
> 
> Given that the restrictmem APIs take pgoff_t, I definitely think it makes sense
> to the index, but I'm very tempted to store pgoff_t instead of gfn_t, and name
> the field "index" to help connect the dots to the rest of kernel, where "pgoff_t index"
> is quite common.
> 
> And looking at those bits again, we should wrap all of the restrictedmem fields
> with CONFIG_KVM_PRIVATE_MEM.  It'll require minor tweaks to __kvm_set_memory_region(),
> but I think will yield cleaner code (and internal APIs) overall.
> 
> And wrap the three fields in an anonymous struct?  E.g. this is a little more
> versbose (restrictedmem instead restricted), but at first glance it doesn't seem
> to cause widespared line length issues.
> 
> #ifdef CONFIG_KVM_PRIVATE_MEM
> 	struct {
> 		struct file *file;
> 		pgoff_t index;
> 		struct restrictedmem_notifier notifier;
> 	} restrictedmem;
> #endif

Looks better.

Thanks,
Chao
> 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 547b92215002..49e375e78f30 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2364,8 +2364,7 @@ static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
> >                                              gfn_t gfn, kvm_pfn_t *pfn,
> >                                              int *order)
> >  {
> > -       pgoff_t index = gfn - slot->base_gfn +
> > -                       (slot->restricted_offset >> PAGE_SHIFT);
> > +       pgoff_t index = gfn - slot->base_gfn + slot->restricted_offset;
> >         struct page *page;
> >         int ret;
> >  
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 01db35ddd5b3..7439bdcb0d04 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -935,7 +935,7 @@ static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
> >                                          pgoff_t start, pgoff_t end,
> >                                          gfn_t *gfn_start, gfn_t *gfn_end)
> >  {
> > -       unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
> > +       unsigned long base_pgoff = slot->restricted_offset;
> >  
> >         if (start > base_pgoff)
> >                 *gfn_start = slot->base_gfn + start - base_pgoff;
> > @@ -2275,7 +2275,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >                         r = -EINVAL;
> >                         goto out;
> >                 }
> > -               new->restricted_offset = mem->restricted_offset;
> > +               new->restricted_offset = mem->restricted_offset >> PAGE_SHIFT;
> >         }
> >  
> >         r = kvm_set_memslot(kvm, old, new, change);
> > 
> > Chao
> > > > +	}
> > > > +
> > > > +	new->kvm = kvm;
> > > 
> > > Set this above, just so that the code flows better.
Chao Peng Jan. 28, 2023, 2 p.m. UTC | #8
On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
... 
> Strongly prefer to use similar logic to existing code that detects wraps:
> 
> 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> 
> This is also where I'd like to add the "gfn is aligned to offset" check, though
> my brain is too fried to figure that out right now.

Used count_trailing_zeros() for this TODO, unsure we have other better
approach.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index afc8c26fa652..fd34c5f7cd2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 #include <asm/processor.h>
 #include <asm/ioctl.h>
 #include <linux/uaccess.h>
+#include <linux/count_zeros.h>
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
 	return false;
 }
 
+/*
+ * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
+ */
+static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
+{
+	if (!offset)
+		return true;
+	if (!gpa)
+		return false;
+
+	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -2128,7 +2142,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (mem->flags & KVM_MEM_PRIVATE &&
 	    (mem->restrictedmem_offset & (PAGE_SIZE - 1) ||
 	     mem->restrictedmem_offset + mem->memory_size < mem->restrictedmem_offset ||
-	     0 /* TODO: require gfn be aligned with restricted offset */))
+	     !kvm_check_rmem_offset_alignment(mem->restrictedmem_offset,
+					      mem->guest_phys_addr)))
 		return -EINVAL;
 	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
 		return -EINVAL;
Sean Christopherson March 7, 2023, 8:27 p.m. UTC | #9
Please trim your replies so that readers don't need to scan through a hundred or
so lines of quotes just to confirm there's nothing there.

On Tue, Mar 07, 2023, Ackerley Tng wrote:
> Chao Peng <chao.p.peng@linux.intel.com> writes:
> 
> > Register/unregister private memslot to fd-based memory backing store
> > restrictedmem and implement the callbacks for restrictedmem_notifier:
> >    - invalidate_start()/invalidate_end() to zap the existing memory
> >      mappings in the KVM page table.
> >    - error() to request KVM_REQ_MEMORY_MCE and later exit to userspace
> >      with KVM_EXIT_SHUTDOWN.
> 
> > Expose KVM_MEM_PRIVATE for memslot and KVM_MEMORY_ATTRIBUTE_PRIVATE for
> > KVM_GET_SUPPORTED_MEMORY_ATTRIBUTES to userspace but either are
> > controlled by kvm_arch_has_private_mem() which should be rewritten by
> > architecture code.
> 
> Could we perhaps rename KVM_MEM_PRIVATE to KVM_MEM_PROTECTED, to be in
> line with KVM_X86_PROTECTED_VM?
> 
> I feel that a memslot that has the KVM_MEM_PRIVATE flag need not always
> be private; It can sometimes be providing memory that is shared and
> also accessible from the host.
> 
> KVM_MEMORY_ATTRIBUTE_PRIVATE is fine as-is because this flag is set when
> the guest memory is meant to be backed by private memory.
> 
> KVM_MEMORY_EXIT_FLAG_PRIVATE is also okay because the flag is used to
> indicate when the memory error is caused by a private access (as opposed
> to a shared access).
> 
> kvm_slot_can_be_private() could perhaps be renamed kvm_is_protected_slot()?

No to this suggestion.  I agree that KVM_MEM_PRIVATE is a bad name, but
kvm_is_protected_slot() is just as wrong.  The _only_ thing that the flag controls
is whether whether or not the memslot has an fd that is bound to restricted memory.
The memslot itself is not protected in any way, and if the entire memslot is mapped
shared, then the data backed by the memslot isn't protected either.

What about KVM_MEM_CAN_BE_PRIVATE?  KVM_MEM_PRIVATIZABLE is more succinct, but
AFAICT that's a made up word, and IMO is unnecessarily fancy.
Chao Peng March 8, 2023, 7:40 a.m. UTC | #10
On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
> Chao Peng <chao.p.peng@linux.intel.com> writes:
> 
> > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > ...
> > > Strongly prefer to use similar logic to existing code that detects wraps:
> 
> > > 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> 
> > > This is also where I'd like to add the "gfn is aligned to offset"
> > > check, though
> > > my brain is too fried to figure that out right now.
> 
> > Used count_trailing_zeros() for this TODO, unsure we have other better
> > approach.
> 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index afc8c26fa652..fd34c5f7cd2f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> >   #include <asm/processor.h>
> >   #include <asm/ioctl.h>
> >   #include <linux/uaccess.h>
> > +#include <linux/count_zeros.h>
> 
> >   #include "coalesced_mmio.h"
> >   #include "async_pf.h"
> > @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
> > kvm_memslots *slots, int id,
> >   	return false;
> >   }
> 
> > +/*
> > + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > + */
> > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > +{
> > +	if (!offset)
> > +		return true;
> > +	if (!gpa)
> > +		return false;
> > +
> > +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> 
> Perhaps we could do something like
> 
> #define lowest_set_bit(val) (val & -val)
> 
> and use
> 
> return lowest_set_bit(offset) >= lowest_set_bit(gpa);

I see kernel already has fls64(), that looks what we need ;)

> 
> Please help me to understand: why must ALIGNMENT(offset) >=
> ALIGNMENT(gpa)? Why is it not sufficient to have both gpa and offset be
> aligned to PAGE_SIZE?

Yes, it's sufficient. Here we just want to be conservative on the uAPI
as Sean explained this at [1]:

  I would rather reject memslot if the gfn has lesser alignment than the
  offset. I'm totally ok with this approach _if_ there's a use case. 
  Until such a use case presents itself, I would rather be conservative
  from a uAPI perspective.

[1] https://lore.kernel.org/all/Y8HldeHBrw+OOZVm@google.com/

Chao
> 
> > +}
> > +
> >   /*
> >    * Allocate some memory and give it an address in the guest physical
> > address
> >    * space.
> > @@ -2128,7 +2142,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >   	if (mem->flags & KVM_MEM_PRIVATE &&
> >   	    (mem->restrictedmem_offset & (PAGE_SIZE - 1) ||
> >   	     mem->restrictedmem_offset + mem->memory_size <
> > mem->restrictedmem_offset ||
> > -	     0 /* TODO: require gfn be aligned with restricted offset */))
> > +	     !kvm_check_rmem_offset_alignment(mem->restrictedmem_offset,
> > +					      mem->guest_phys_addr)))
> >   		return -EINVAL;
> >   	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
> >   		return -EINVAL;
Isaku Yamahata March 23, 2023, 12:41 a.m. UTC | #11
On Wed, Mar 08, 2023 at 03:40:26PM +0800,
Chao Peng <chao.p.peng@linux.intel.com> wrote:

> On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
> > Chao Peng <chao.p.peng@linux.intel.com> writes:
> > 
> > > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > ...
> > > > Strongly prefer to use similar logic to existing code that detects wraps:
> > 
> > > > 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> > 
> > > > This is also where I'd like to add the "gfn is aligned to offset"
> > > > check, though
> > > > my brain is too fried to figure that out right now.
> > 
> > > Used count_trailing_zeros() for this TODO, unsure we have other better
> > > approach.
> > 
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index afc8c26fa652..fd34c5f7cd2f 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -56,6 +56,7 @@
> > >   #include <asm/processor.h>
> > >   #include <asm/ioctl.h>
> > >   #include <linux/uaccess.h>
> > > +#include <linux/count_zeros.h>
> > 
> > >   #include "coalesced_mmio.h"
> > >   #include "async_pf.h"
> > > @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
> > > kvm_memslots *slots, int id,
> > >   	return false;
> > >   }
> > 
> > > +/*
> > > + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > > + */
> > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > +{
> > > +	if (!offset)
> > > +		return true;
> > > +	if (!gpa)
> > > +		return false;
> > > +
> > > +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));

This check doesn't work expected. For example, offset = 2GB, gpa=4GB
this check fails.
I come up with the following.

From ec87e25082f0497431b732702fae82c6a05071bf Mon Sep 17 00:00:00 2001
Message-Id: <ec87e25082f0497431b732702fae82c6a05071bf.1679531995.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <isaku.yamahata@intel.com>
Date: Wed, 22 Mar 2023 15:32:56 -0700
Subject: [PATCH] KVM: Relax alignment check for restricted mem

kvm_check_rmem_offset_alignment() only checks based on offset alignment
and GPA alignment.  However, the actual alignment for offset depends
on architecture.  For x86 case, it can be 1G, 2M or 4K.  So even if
GPA is aligned for 1G+, only 1G-alignment is required for offset.

Without this patch, gpa=4G, offset=2G results in failure of memory slot
creation.

Fixes: edc8814b2c77 ("KVM: Require gfn be aligned with restricted offset")
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
 arch/x86/include/asm/kvm_host.h | 15 +++++++++++++++
 virt/kvm/kvm_main.c             |  9 ++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 88e11dd3afde..03af44650f24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/irq_work.h>
 #include <linux/irq.h>
 #include <linux/workqueue.h>
+#include <linux/count_zeros.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -143,6 +144,20 @@
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
+#define kvm_arch_required_alignment	kvm_arch_required_alignment
+static inline int kvm_arch_required_alignment(u64 gpa)
+{
+	int zeros = count_trailing_zeros(gpa);
+
+	WARN_ON_ONCE(!PAGE_ALIGNED(gpa));
+	if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_1G))
+		return KVM_HPAGE_SHIFT(PG_LEVEL_1G);
+	else if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_2M))
+		return KVM_HPAGE_SHIFT(PG_LEVEL_2M);
+
+	return PAGE_SHIFT;
+}
+
 #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
 #define KVM_MIN_ALLOC_MMU_PAGES 64UL
 #define KVM_MMU_HASH_SHIFT 12
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c9c4eef457b0..f4ff96171d24 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2113,6 +2113,13 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
 	return false;
 }
 
+#ifndef kvm_arch_required_alignment
+__weak int kvm_arch_required_alignment(u64 gpa)
+{
+	return PAGE_SHIFT
+}
+#endif
+
 /*
  * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
  */
@@ -2123,7 +2130,7 @@ static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
 	if (!gpa)
 		return false;
 
-	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
+	return !!(count_trailing_zeros(offset) >= kvm_arch_required_alignment(gpa));
 }
 
 /*
Chao Peng March 24, 2023, 2:10 a.m. UTC | #12
On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
> On Wed, Mar 08, 2023 at 03:40:26PM +0800,
> Chao Peng <chao.p.peng@linux.intel.com> wrote:
> 
> > On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
> > > Chao Peng <chao.p.peng@linux.intel.com> writes:
> > > 
> > > > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > ...
> > > > > Strongly prefer to use similar logic to existing code that detects wraps:
> > > 
> > > > > 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> > > 
> > > > > This is also where I'd like to add the "gfn is aligned to offset"
> > > > > check, though
> > > > > my brain is too fried to figure that out right now.
> > > 
> > > > Used count_trailing_zeros() for this TODO, unsure we have other better
> > > > approach.
> > > 
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index afc8c26fa652..fd34c5f7cd2f 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -56,6 +56,7 @@
> > > >   #include <asm/processor.h>
> > > >   #include <asm/ioctl.h>
> > > >   #include <linux/uaccess.h>
> > > > +#include <linux/count_zeros.h>
> > > 
> > > >   #include "coalesced_mmio.h"
> > > >   #include "async_pf.h"
> > > > @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
> > > > kvm_memslots *slots, int id,
> > > >   	return false;
> > > >   }
> > > 
> > > > +/*
> > > > + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > > > + */
> > > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > > +{
> > > > +	if (!offset)
> > > > +		return true;
> > > > +	if (!gpa)
> > > > +		return false;
> > > > +
> > > > +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> 
> This check doesn't work expected. For example, offset = 2GB, gpa=4GB
> this check fails.

This case is expected to fail as Sean initially suggested[*]:
  I would rather reject memslot if the gfn has lesser alignment than
  the offset. I'm totally ok with this approach _if_ there's a use case.
  Until such a use case presents itself, I would rather be conservative
  from a uAPI perspective.

I understand that we put tighter restriction on this but if you see such
restriction is really a big issue for real usage, instead of a
theoretical problem, then we can loosen the check here. But at that time
below code is kind of x86 specific and may need improve.

BTW, in latest code, I replaced count_trailing_zeros() with fls64():
  return !!(fls64(offset) >= fls64(gpa));

[*] https://lore.kernel.org/all/Y8HldeHBrw+OOZVm@google.com/

Chao
> I come up with the following.
> 
> >From ec87e25082f0497431b732702fae82c6a05071bf Mon Sep 17 00:00:00 2001
> Message-Id: <ec87e25082f0497431b732702fae82c6a05071bf.1679531995.git.isaku.yamahata@intel.com>
> From: Isaku Yamahata <isaku.yamahata@intel.com>
> Date: Wed, 22 Mar 2023 15:32:56 -0700
> Subject: [PATCH] KVM: Relax alignment check for restricted mem
> 
> kvm_check_rmem_offset_alignment() only checks based on offset alignment
> and GPA alignment.  However, the actual alignment for offset depends
> on architecture.  For x86 case, it can be 1G, 2M or 4K.  So even if
> GPA is aligned for 1G+, only 1G-alignment is required for offset.
> 
> Without this patch, gpa=4G, offset=2G results in failure of memory slot
> creation.
> 
> Fixes: edc8814b2c77 ("KVM: Require gfn be aligned with restricted offset")
> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 15 +++++++++++++++
>  virt/kvm/kvm_main.c             |  9 ++++++++-
>  2 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 88e11dd3afde..03af44650f24 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -16,6 +16,7 @@
>  #include <linux/irq_work.h>
>  #include <linux/irq.h>
>  #include <linux/workqueue.h>
> +#include <linux/count_zeros.h>
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> @@ -143,6 +144,20 @@
>  #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
>  #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
>  
> +#define kvm_arch_required_alignment	kvm_arch_required_alignment
> +static inline int kvm_arch_required_alignment(u64 gpa)
> +{
> +	int zeros = count_trailing_zeros(gpa);
> +
> +	WARN_ON_ONCE(!PAGE_ALIGNED(gpa));
> +	if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_1G))
> +		return KVM_HPAGE_SHIFT(PG_LEVEL_1G);
> +	else if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_2M))
> +		return KVM_HPAGE_SHIFT(PG_LEVEL_2M);
> +
> +	return PAGE_SHIFT;
> +}
> +
>  #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
>  #define KVM_MIN_ALLOC_MMU_PAGES 64UL
>  #define KVM_MMU_HASH_SHIFT 12
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c9c4eef457b0..f4ff96171d24 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2113,6 +2113,13 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
>  	return false;
>  }
>  
> +#ifndef kvm_arch_required_alignment
> +__weak int kvm_arch_required_alignment(u64 gpa)
> +{
> +	return PAGE_SHIFT
> +}
> +#endif
> +
>  /*
>   * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
>   */
> @@ -2123,7 +2130,7 @@ static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
>  	if (!gpa)
>  		return false;
>  
> -	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> +	return !!(count_trailing_zeros(offset) >= kvm_arch_required_alignment(gpa));
>  }
>  
>  /*
> -- 
> 2.25.1
> 
> 
> 
> -- 
> Isaku Yamahata <isaku.yamahata@gmail.com>
Xiaoyao Li March 24, 2023, 2:29 a.m. UTC | #13
On 3/24/2023 10:10 AM, Chao Peng wrote:
> On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
>> On Wed, Mar 08, 2023 at 03:40:26PM +0800,
>> Chao Peng <chao.p.peng@linux.intel.com> wrote:
>>
>>> On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
>>>> Chao Peng <chao.p.peng@linux.intel.com> writes:
>>>>
>>>>> On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
>>>>>> On Fri, Dec 02, 2022, Chao Peng wrote:
>>>>> ...
>>>>>> Strongly prefer to use similar logic to existing code that detects wraps:
>>>>
>>>>>> 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
>>>>
>>>>>> This is also where I'd like to add the "gfn is aligned to offset"
>>>>>> check, though
>>>>>> my brain is too fried to figure that out right now.
>>>>
>>>>> Used count_trailing_zeros() for this TODO, unsure we have other better
>>>>> approach.
>>>>
>>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>>> index afc8c26fa652..fd34c5f7cd2f 100644
>>>>> --- a/virt/kvm/kvm_main.c
>>>>> +++ b/virt/kvm/kvm_main.c
>>>>> @@ -56,6 +56,7 @@
>>>>>    #include <asm/processor.h>
>>>>>    #include <asm/ioctl.h>
>>>>>    #include <linux/uaccess.h>
>>>>> +#include <linux/count_zeros.h>
>>>>
>>>>>    #include "coalesced_mmio.h"
>>>>>    #include "async_pf.h"
>>>>> @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
>>>>> kvm_memslots *slots, int id,
>>>>>    	return false;
>>>>>    }
>>>>
>>>>> +/*
>>>>> + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
>>>>> + */
>>>>> +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
>>>>> +{
>>>>> +	if (!offset)
>>>>> +		return true;
>>>>> +	if (!gpa)
>>>>> +		return false;
>>>>> +
>>>>> +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
>>
>> This check doesn't work expected. For example, offset = 2GB, gpa=4GB
>> this check fails.
> 
> This case is expected to fail as Sean initially suggested[*]:
>    I would rather reject memslot if the gfn has lesser alignment than
>    the offset. I'm totally ok with this approach _if_ there's a use case.
>    Until such a use case presents itself, I would rather be conservative
>    from a uAPI perspective.
> 
> I understand that we put tighter restriction on this but if you see such
> restriction is really a big issue for real usage, instead of a
> theoretical problem, then we can loosen the check here. But at that time
> below code is kind of x86 specific and may need improve.
> 
> BTW, in latest code, I replaced count_trailing_zeros() with fls64():
>    return !!(fls64(offset) >= fls64(gpa));

wouldn't it be !!(ffs64(offset) <= ffs64(gpa)) ?

> [*] https://lore.kernel.org/all/Y8HldeHBrw+OOZVm@google.com/
> 
> Chao
>> I come up with the following.
>>
>> >From ec87e25082f0497431b732702fae82c6a05071bf Mon Sep 17 00:00:00 2001
>> Message-Id: <ec87e25082f0497431b732702fae82c6a05071bf.1679531995.git.isaku.yamahata@intel.com>
>> From: Isaku Yamahata <isaku.yamahata@intel.com>
>> Date: Wed, 22 Mar 2023 15:32:56 -0700
>> Subject: [PATCH] KVM: Relax alignment check for restricted mem
>>
>> kvm_check_rmem_offset_alignment() only checks based on offset alignment
>> and GPA alignment.  However, the actual alignment for offset depends
>> on architecture.  For x86 case, it can be 1G, 2M or 4K.  So even if
>> GPA is aligned for 1G+, only 1G-alignment is required for offset.
>>
>> Without this patch, gpa=4G, offset=2G results in failure of memory slot
>> creation.
>>
>> Fixes: edc8814b2c77 ("KVM: Require gfn be aligned with restricted offset")
>> Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
>> ---
>>   arch/x86/include/asm/kvm_host.h | 15 +++++++++++++++
>>   virt/kvm/kvm_main.c             |  9 ++++++++-
>>   2 files changed, 23 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 88e11dd3afde..03af44650f24 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -16,6 +16,7 @@
>>   #include <linux/irq_work.h>
>>   #include <linux/irq.h>
>>   #include <linux/workqueue.h>
>> +#include <linux/count_zeros.h>
>>   
>>   #include <linux/kvm.h>
>>   #include <linux/kvm_para.h>
>> @@ -143,6 +144,20 @@
>>   #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
>>   #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
>>   
>> +#define kvm_arch_required_alignment	kvm_arch_required_alignment
>> +static inline int kvm_arch_required_alignment(u64 gpa)
>> +{
>> +	int zeros = count_trailing_zeros(gpa);
>> +
>> +	WARN_ON_ONCE(!PAGE_ALIGNED(gpa));
>> +	if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_1G))
>> +		return KVM_HPAGE_SHIFT(PG_LEVEL_1G);
>> +	else if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_2M))
>> +		return KVM_HPAGE_SHIFT(PG_LEVEL_2M);
>> +
>> +	return PAGE_SHIFT;
>> +}
>> +
>>   #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
>>   #define KVM_MIN_ALLOC_MMU_PAGES 64UL
>>   #define KVM_MMU_HASH_SHIFT 12
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index c9c4eef457b0..f4ff96171d24 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -2113,6 +2113,13 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
>>   	return false;
>>   }
>>   
>> +#ifndef kvm_arch_required_alignment
>> +__weak int kvm_arch_required_alignment(u64 gpa)
>> +{
>> +	return PAGE_SHIFT
>> +}
>> +#endif
>> +
>>   /*
>>    * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
>>    */
>> @@ -2123,7 +2130,7 @@ static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
>>   	if (!gpa)
>>   		return false;
>>   
>> -	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
>> +	return !!(count_trailing_zeros(offset) >= kvm_arch_required_alignment(gpa));
>>   }
>>   
>>   /*
>> -- 
>> 2.25.1
>>
>>
>>
>> -- 
>> Isaku Yamahata <isaku.yamahata@gmail.com>
Chao Peng March 28, 2023, 10:41 a.m. UTC | #14
On Fri, Mar 24, 2023 at 10:29:25AM +0800, Xiaoyao Li wrote:
> On 3/24/2023 10:10 AM, Chao Peng wrote:
> > On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
> > > On Wed, Mar 08, 2023 at 03:40:26PM +0800,
> > > Chao Peng <chao.p.peng@linux.intel.com> wrote:
> > > 
> > > > On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
> > > > > Chao Peng <chao.p.peng@linux.intel.com> writes:
> > > > > 
> > > > > > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > > > ...
> > > > > > > Strongly prefer to use similar logic to existing code that detects wraps:
> > > > > 
> > > > > > > 		mem->restricted_offset + mem->memory_size < mem->restricted_offset
> > > > > 
> > > > > > > This is also where I'd like to add the "gfn is aligned to offset"
> > > > > > > check, though
> > > > > > > my brain is too fried to figure that out right now.
> > > > > 
> > > > > > Used count_trailing_zeros() for this TODO, unsure we have other better
> > > > > > approach.
> > > > > 
> > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > > > index afc8c26fa652..fd34c5f7cd2f 100644
> > > > > > --- a/virt/kvm/kvm_main.c
> > > > > > +++ b/virt/kvm/kvm_main.c
> > > > > > @@ -56,6 +56,7 @@
> > > > > >    #include <asm/processor.h>
> > > > > >    #include <asm/ioctl.h>
> > > > > >    #include <linux/uaccess.h>
> > > > > > +#include <linux/count_zeros.h>
> > > > > 
> > > > > >    #include "coalesced_mmio.h"
> > > > > >    #include "async_pf.h"
> > > > > > @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
> > > > > > kvm_memslots *slots, int id,
> > > > > >    	return false;
> > > > > >    }
> > > > > 
> > > > > > +/*
> > > > > > + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > > > > > + */
> > > > > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > > > > +{
> > > > > > +	if (!offset)
> > > > > > +		return true;
> > > > > > +	if (!gpa)
> > > > > > +		return false;
> > > > > > +
> > > > > > +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> > > 
> > > This check doesn't work expected. For example, offset = 2GB, gpa=4GB
> > > this check fails.
> > 
> > This case is expected to fail as Sean initially suggested[*]:
> >    I would rather reject memslot if the gfn has lesser alignment than
> >    the offset. I'm totally ok with this approach _if_ there's a use case.
> >    Until such a use case presents itself, I would rather be conservative
> >    from a uAPI perspective.
> > 
> > I understand that we put tighter restriction on this but if you see such
> > restriction is really a big issue for real usage, instead of a
> > theoretical problem, then we can loosen the check here. But at that time
> > below code is kind of x86 specific and may need improve.
> > 
> > BTW, in latest code, I replaced count_trailing_zeros() with fls64():
> >    return !!(fls64(offset) >= fls64(gpa));
> 
> wouldn't it be !!(ffs64(offset) <= ffs64(gpa)) ?

As the function document explains, here we want to return true when
ALIGNMENT(offset) >= ALIGNMENT(gpa), so '>=' is what we need.

It's worthy clarifying that in Sean's original suggestion he actually
mentioned the opposite. He said 'reject memslot if the gfn has lesser
alignment than the offset', but I wonder this is his purpose, since
if ALIGNMENT(offset) < ALIGNMENT(gpa), we wouldn't be possible to map
the page as largepage. Consider we have below config:

  gpa=2M, offset=1M

In this case KVM tries to map gpa at 2M as 2M hugepage but the physical
page at the offset(1M) in private_fd cannot provide the 2M page due to
misalignment.

But as we discussed in the off-list thread, here we do find a real use
case indicating this check is too strict. i.e. QEMU immediately fails
when launch a guest > 2G memory. For this case QEMU splits guest memory
space into two slots:

  Slot#1(ram_below_4G): gpa=0x0, offset=0x0, size=2G
  Slot#2(ram_above_4G): gpa=4G,  offset=2G,  size=totalsize-2G

This strict alignment check fails for slot#2 because offset(2G) has less
alignment than gpa(4G). To allow this, one solution can revert to my
previous change in kvm_alloc_memslot_metadata() to disallow hugepage
only when the offset/gpa are not aligned to related page size.

Sean, How do you think?

Chao
> 
> > [*] https://lore.kernel.org/all/Y8HldeHBrw+OOZVm@google.com/
> > 
> > Chao
> > > I come up with the following.
> > > 
> > > >From ec87e25082f0497431b732702fae82c6a05071bf Mon Sep 17 00:00:00 2001
> > > Message-Id: <ec87e25082f0497431b732702fae82c6a05071bf.1679531995.git.isaku.yamahata@intel.com>
> > > From: Isaku Yamahata <isaku.yamahata@intel.com>
> > > Date: Wed, 22 Mar 2023 15:32:56 -0700
> > > Subject: [PATCH] KVM: Relax alignment check for restricted mem
> > > 
> > > kvm_check_rmem_offset_alignment() only checks based on offset alignment
> > > and GPA alignment.  However, the actual alignment for offset depends
> > > on architecture.  For x86 case, it can be 1G, 2M or 4K.  So even if
> > > GPA is aligned for 1G+, only 1G-alignment is required for offset.
> > > 
> > > Without this patch, gpa=4G, offset=2G results in failure of memory slot
> > > creation.
> > > 
> > > Fixes: edc8814b2c77 ("KVM: Require gfn be aligned with restricted offset")
> > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
> > > ---
> > >   arch/x86/include/asm/kvm_host.h | 15 +++++++++++++++
> > >   virt/kvm/kvm_main.c             |  9 ++++++++-
> > >   2 files changed, 23 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 88e11dd3afde..03af44650f24 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -16,6 +16,7 @@
> > >   #include <linux/irq_work.h>
> > >   #include <linux/irq.h>
> > >   #include <linux/workqueue.h>
> > > +#include <linux/count_zeros.h>
> > >   #include <linux/kvm.h>
> > >   #include <linux/kvm_para.h>
> > > @@ -143,6 +144,20 @@
> > >   #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
> > >   #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
> > > +#define kvm_arch_required_alignment	kvm_arch_required_alignment
> > > +static inline int kvm_arch_required_alignment(u64 gpa)
> > > +{
> > > +	int zeros = count_trailing_zeros(gpa);
> > > +
> > > +	WARN_ON_ONCE(!PAGE_ALIGNED(gpa));
> > > +	if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_1G))
> > > +		return KVM_HPAGE_SHIFT(PG_LEVEL_1G);
> > > +	else if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_2M))
> > > +		return KVM_HPAGE_SHIFT(PG_LEVEL_2M);
> > > +
> > > +	return PAGE_SHIFT;
> > > +}
> > > +
> > >   #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
> > >   #define KVM_MIN_ALLOC_MMU_PAGES 64UL
> > >   #define KVM_MMU_HASH_SHIFT 12
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index c9c4eef457b0..f4ff96171d24 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2113,6 +2113,13 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> > >   	return false;
> > >   }
> > > +#ifndef kvm_arch_required_alignment
> > > +__weak int kvm_arch_required_alignment(u64 gpa)
> > > +{
> > > +	return PAGE_SHIFT
> > > +}
> > > +#endif
> > > +
> > >   /*
> > >    * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > >    */
> > > @@ -2123,7 +2130,7 @@ static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > >   	if (!gpa)
> > >   		return false;
> > > -	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> > > +	return !!(count_trailing_zeros(offset) >= kvm_arch_required_alignment(gpa));
> > >   }
> > >   /*
> > > -- 
> > > 2.25.1
> > > 
> > > 
> > > 
> > > -- 
> > > Isaku Yamahata <isaku.yamahata@gmail.com>
Sean Christopherson April 14, 2023, 9:08 p.m. UTC | #15
On Tue, Mar 28, 2023, Chao Peng wrote:
> On Fri, Mar 24, 2023 at 10:29:25AM +0800, Xiaoyao Li wrote:
> > On 3/24/2023 10:10 AM, Chao Peng wrote:
> > > On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
> > > > On Wed, Mar 08, 2023 at 03:40:26PM +0800,
> > > > Chao Peng <chao.p.peng@linux.intel.com> wrote:
> > > > 
> > > > > On Wed, Mar 08, 2023 at 12:13:24AM +0000, Ackerley Tng wrote:
> > > > > > Chao Peng <chao.p.peng@linux.intel.com> writes:
> > > > > > 
> > > > > > > On Sat, Jan 14, 2023 at 12:01:01AM +0000, Sean Christopherson wrote:
> > > > > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > > > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > > > > > +{
> > > > > > > +	if (!offset)
> > > > > > > +		return true;
> > > > > > > +	if (!gpa)
> > > > > > > +		return false;
> > > > > > > +
> > > > > > > +	return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
> > > > 
> > > > This check doesn't work expected. For example, offset = 2GB, gpa=4GB
> > > > this check fails.
> > > 
> > > This case is expected to fail as Sean initially suggested[*]:
> > >    I would rather reject memslot if the gfn has lesser alignment than
> > >    the offset. I'm totally ok with this approach _if_ there's a use case.
> > >    Until such a use case presents itself, I would rather be conservative
> > >    from a uAPI perspective.
> > > 
> > > I understand that we put tighter restriction on this but if you see such
> > > restriction is really a big issue for real usage, instead of a
> > > theoretical problem, then we can loosen the check here. But at that time
> > > below code is kind of x86 specific and may need improve.
> > > 
> > > BTW, in latest code, I replaced count_trailing_zeros() with fls64():
> > >    return !!(fls64(offset) >= fls64(gpa));
> > 
> > wouldn't it be !!(ffs64(offset) <= ffs64(gpa)) ?
> 
> As the function document explains, here we want to return true when
> ALIGNMENT(offset) >= ALIGNMENT(gpa), so '>=' is what we need.
> 
> It's worthy clarifying that in Sean's original suggestion he actually
> mentioned the opposite. He said 'reject memslot if the gfn has lesser
> alignment than the offset', but I wonder this is his purpose, since
> if ALIGNMENT(offset) < ALIGNMENT(gpa), we wouldn't be possible to map
> the page as largepage. Consider we have below config:
> 
>   gpa=2M, offset=1M
> 
> In this case KVM tries to map gpa at 2M as 2M hugepage but the physical
> page at the offset(1M) in private_fd cannot provide the 2M page due to
> misalignment.
> 
> But as we discussed in the off-list thread, here we do find a real use
> case indicating this check is too strict. i.e. QEMU immediately fails
> when launch a guest > 2G memory. For this case QEMU splits guest memory
> space into two slots:
> 
>   Slot#1(ram_below_4G): gpa=0x0, offset=0x0, size=2G
>   Slot#2(ram_above_4G): gpa=4G,  offset=2G,  size=totalsize-2G
> 
> This strict alignment check fails for slot#2 because offset(2G) has less
> alignment than gpa(4G). To allow this, one solution can revert to my
> previous change in kvm_alloc_memslot_metadata() to disallow hugepage
> only when the offset/gpa are not aligned to related page size.
> 
> Sean, How do you think?

I agree, a pure alignment check is too restrictive, and not really what I intended
despite past me literally saying that's what I wanted :-)  I think I may have also
inverted the "less alignment" statement, but luckily I believe that ends up being
a moot point.

The goal is to avoid having to juggle scenarios where KVM wants to create a hugepage,
but restrictedmem can't provide one because of a misaligned file offset.  I think
the rule we want is that the offset must be aligned to the largest page size allowed
by the memslot _size_.  E.g. on x86, if the memslot size is >=1GiB then the offset
must be 1GiB or beter, ditto for >=2MiB and >=4KiB (ignoring that 4KiB is already a
requirement).

We could loosen that to say the largest size allowed by the memslot, but I don't
think that's worth the effort unless it's trivially easy to implement in code,
e.g. KVM could technically allow a 4KiB aligned offset if the memslot is 2MiB
sized but only 4KiB aligned on the GPA.  I doubt there's a real use case for such
a memslot, so I want to disallow that unless it's super easy to implement.
Sean Christopherson April 25, 2023, 11:01 p.m. UTC | #16
On Tue, Apr 18, 2023, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> > I agree, a pure alignment check is too restrictive, and not really what I
> > intended despite past me literally saying that's what I wanted :-)  I think
> > I may have also inverted the "less alignment" statement, but luckily I
> > believe that ends up being a moot point.
> 
> > The goal is to avoid having to juggle scenarios where KVM wants to create a
> > hugepage, but restrictedmem can't provide one because of a misaligned file
> > offset.  I think the rule we want is that the offset must be aligned to the
> > largest page size allowed by the memslot _size_.  E.g. on x86, if the
> > memslot size is >=1GiB then the offset must be 1GiB or beter, ditto for
> > >=2MiB and >=4KiB (ignoring that 4KiB is already a requirement).
> 
> > We could loosen that to say the largest size allowed by the memslot, but I
> > don't think that's worth the effort unless it's trivially easy to implement
> > in code, e.g. KVM could technically allow a 4KiB aligned offset if the
> > memslot is 2MiB sized but only 4KiB aligned on the GPA.  I doubt there's a
> > real use case for such a memslot, so I want to disallow that unless it's
> > super easy to implement.
> 
> Checking my understanding here about why we need this alignment check:
> 
> When KVM requests a page from restrictedmem, KVM will provide an offset
> into the file in terms of 4K pages.
> 
> When shmem is configured to use hugepages, shmem_get_folio() will round
> the requested offset down to the nearest hugepage-aligned boundary in
> shmem_alloc_hugefolio().
> 
> Example of problematic configuration provided to
> KVM_SET_USER_MEMORY_REGION2:
> 
> + shmem configured to use 1GB pages
> + restrictedmem_offset provided to KVM_SET_USER_MEMORY_REGION2: 0x4000
> + memory_size provided in KVM_SET_USER_MEMORY_REGION2: 1GB
> + KVM requests offset (pgoff_t) 0x8, which translates to offset 0x8000
> 
> restrictedmem_get_page() and shmem_get_folio() returns the page for
> offset 0x0 in the file, since rounding down 0x8000 to the nearest 1GB is
> 0x0. This is allocating outside the range that KVM is supposed to use,
> since the parameters provided in KVM_SET_USER_MEMORY_REGION2 is only
> supposed to be offset 0x4000 to (0x4000 + 1GB = 0x40004000) in the file.
> 
> IIUC shmem will actually just round down (0x4000 rounded down to nearest
> 1GB will be 0x0) and allocate without checking bounds, so if offset 0x0
> to 0x4000 in the file were supposed to be used by something else, there
> might be issues.
> 
> Hence, this alignment check ensures that rounding down of any offsets
> provided by KVM (based on page size configured in the backing file
> provided) to restrictedmem_get_page() must not go below the offset
> provided to KVM_SET_USER_MEMORY_REGION2.
> 
> Enforcing alignment of restrictedmem_offset based on the currently-set
> page size in the backing file (i.e. shmem) may not be effective, since
> the size of the pages in the backing file can be adjusted to a larger
> size after KVM_SET_USER_MEMORY_REGION2 succeeds. With that, we may still
> end up allocating outside the range that KVM was provided with.
> 
> Hence, to be safe, we should check alignment to the max page size across
> all backing filesystems, so the constraint is
> 
>     rounding down restrictedmem_offset to
>     min(max page size across all backing filesystems,
>         max page size that fits in memory_size) == restrictedmem_offset
> 
> which is the same check as
> 
>     restrictedmem_offset must be aligned to min(max page size across all
>     backing filesystems, max page size that fits in memory_size)
> 
> which can safely reduce to
> 
>     restrictedmem_offset must be aligned to max page size that fits in
>     memory_size
> 
> since "max page size that fits in memory_size" is probably <= to "max
> page size across all backing filesystems", and if it's larger, it'll
> just be a tighter constraint.

Yes?  The alignment check isn't strictly required, KVM _could_ deal with the above
scenario, it's just a lot simpler and safer for KVM if the file offset needs to
be sanely aligned.

> If the above understanding is correct:
> 
> + We must enforce this in the KVM_SET_USER_MEMORY_REGION2 handler, since
>   IIUC shmem will just round down and allocate without checking bounds.
> 
>     + I think this is okay because holes in the restrictedmem file (in
>       terms of offset) made to accommodate this constraint don't cost us
>       anything anyway(?) Are they just arbitrary offsets in a file? In
>       our case, this file is usually a new and empty file.
> 
>     + In the case of migration of a restrictedmem file between two KVM
>       VMs, this constraint would cause a problem is if the largest
>       possible page size on the destination machine is larger than that
>       of the source machine. In that case, we might have to move the
>       data in the file to a different offset (a separate problem).

Hmm, I was thinking this would be a non-issue because the check would be tied to
the max page _possible_ page size irrespective of hardware support, but that would
be problematic if KVM ever supports 512GiB pages.  I'm not sure that speculatively
requiring super huge memslots to be 512GiB aligned is sensible.

Aha!  If we go with a KVM ioctl(), a clean way around this is tie the alignment
requirement to the memfd flags, e.g. if userspace requests the memfd to be backed
by PMD hugepages, then the memslot offset needs to be 2MiB aligned on x86.  That
will continue to work if (big if) KVM supports 512GiB pages because the "legacy"
memfd would still be capped at 2MiB pages.

Architectures that support variable hugepage sizes might need to do something
else, but I don't think that possibility affects what x86 can/can't do.

> + On this note, it seems like there is no check for when the range is
>   smaller than the allocated page? Like if the range provided is 4KB in
>   size, but shmem is then configured to use a 1GB page, will we end up
>   allocating past the end of the range?

No, KVM already gracefully handles situations like this.  Well, x86 does, I assume
other architectures do too :-)

As above, the intent of the extra restriction is so that KVM doen't need even more
weird code (read: math) to gracefully handle the new edge cases that would come with
fd-only memslots.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7772ab37ac89..27ef31133352 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -114,6 +114,7 @@ 
 	KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_HV_TLB_FLUSH \
 	KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_MEMORY_MCE		KVM_ARCH_REQ(33)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5aefcff614d2..c67e22f3e2ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6587,6 +6587,13 @@  int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
 }
 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 
+#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
+void kvm_arch_memory_mce(struct kvm *kvm)
+{
+	kvm_make_all_cpus_request(kvm, KVM_REQ_MEMORY_MCE);
+}
+#endif
+
 static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_clock_data data = { 0 };
@@ -10357,6 +10364,12 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
 			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+
+		if (kvm_check_request(KVM_REQ_MEMORY_MCE, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+			r = 0;
+			goto out;
+		}
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 153842bb33df..f032d878e034 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -590,6 +590,7 @@  struct kvm_memory_slot {
 	struct file *restricted_file;
 	loff_t restricted_offset;
 	struct restrictedmem_notifier notifier;
+	struct kvm *kvm;
 };
 
 static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
@@ -2363,6 +2364,8 @@  static inline int kvm_restricted_mem_get_pfn(struct kvm_memory_slot *slot,
 	*pfn = page_to_pfn(page);
 	return ret;
 }
+
+void kvm_arch_memory_mce(struct kvm *kvm);
 #endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
 
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e107afea32f0..ac835fc77273 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -936,6 +936,121 @@  static int kvm_init_mmu_notifier(struct kvm *kvm)
 
 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
 
+#ifdef CONFIG_HAVE_KVM_RESTRICTED_MEM
+static bool restrictedmem_range_is_valid(struct kvm_memory_slot *slot,
+					 pgoff_t start, pgoff_t end,
+					 gfn_t *gfn_start, gfn_t *gfn_end)
+{
+	unsigned long base_pgoff = slot->restricted_offset >> PAGE_SHIFT;
+
+	if (start > base_pgoff)
+		*gfn_start = slot->base_gfn + start - base_pgoff;
+	else
+		*gfn_start = slot->base_gfn;
+
+	if (end < base_pgoff + slot->npages)
+		*gfn_end = slot->base_gfn + end - base_pgoff;
+	else
+		*gfn_end = slot->base_gfn + slot->npages;
+
+	if (*gfn_start >= *gfn_end)
+		return false;
+
+	return true;
+}
+
+static void kvm_restrictedmem_invalidate_begin(struct restrictedmem_notifier *notifier,
+					       pgoff_t start, pgoff_t end)
+{
+	struct kvm_memory_slot *slot = container_of(notifier,
+						    struct kvm_memory_slot,
+						    notifier);
+	struct kvm *kvm = slot->kvm;
+	gfn_t gfn_start, gfn_end;
+	struct kvm_gfn_range gfn_range;
+	int idx;
+
+	if (!restrictedmem_range_is_valid(slot, start, end,
+					  &gfn_start, &gfn_end))
+		return;
+
+	gfn_range.start = gfn_start;
+	gfn_range.end = gfn_end;
+	gfn_range.slot = slot;
+	gfn_range.pte = __pte(0);
+	gfn_range.may_block = true;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	KVM_MMU_LOCK(kvm);
+
+	kvm_mmu_invalidate_begin(kvm);
+	kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
+	if (kvm_unmap_gfn_range(kvm, &gfn_range))
+		kvm_flush_remote_tlbs(kvm);
+
+	KVM_MMU_UNLOCK(kvm);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static void kvm_restrictedmem_invalidate_end(struct restrictedmem_notifier *notifier,
+					     pgoff_t start, pgoff_t end)
+{
+	struct kvm_memory_slot *slot = container_of(notifier,
+						    struct kvm_memory_slot,
+						    notifier);
+	struct kvm *kvm = slot->kvm;
+	gfn_t gfn_start, gfn_end;
+
+	if (!restrictedmem_range_is_valid(slot, start, end,
+					  &gfn_start, &gfn_end))
+		return;
+
+	KVM_MMU_LOCK(kvm);
+	kvm_mmu_invalidate_end(kvm);
+	KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_restrictedmem_error(struct restrictedmem_notifier *notifier,
+				    pgoff_t start, pgoff_t end)
+{
+	struct kvm_memory_slot *slot = container_of(notifier,
+						    struct kvm_memory_slot,
+						    notifier);
+	kvm_arch_memory_mce(slot->kvm);
+}
+
+static struct restrictedmem_notifier_ops kvm_restrictedmem_notifier_ops = {
+	.invalidate_start = kvm_restrictedmem_invalidate_begin,
+	.invalidate_end = kvm_restrictedmem_invalidate_end,
+	.error = kvm_restrictedmem_error,
+};
+
+static inline void kvm_restrictedmem_register(struct kvm_memory_slot *slot)
+{
+	slot->notifier.ops = &kvm_restrictedmem_notifier_ops;
+	restrictedmem_register_notifier(slot->restricted_file, &slot->notifier);
+}
+
+static inline void kvm_restrictedmem_unregister(struct kvm_memory_slot *slot)
+{
+	restrictedmem_unregister_notifier(slot->restricted_file,
+					  &slot->notifier);
+}
+
+#else /* !CONFIG_HAVE_KVM_RESTRICTED_MEM */
+
+static inline void kvm_restrictedmem_register(struct kvm_memory_slot *slot)
+{
+	WARN_ON_ONCE(1);
+}
+
+static inline void kvm_restrictedmem_unregister(struct kvm_memory_slot *slot)
+{
+	WARN_ON_ONCE(1);
+}
+
+#endif /* CONFIG_HAVE_KVM_RESTRICTED_MEM */
+
 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
 static int kvm_pm_notifier_call(struct notifier_block *bl,
 				unsigned long state,
@@ -980,6 +1095,11 @@  static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 /* This does not remove the slot from struct kvm_memslots data structures */
 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
+	if (slot->flags & KVM_MEM_PRIVATE) {
+		kvm_restrictedmem_unregister(slot);
+		fput(slot->restricted_file);
+	}
+
 	kvm_destroy_dirty_bitmap(slot);
 
 	kvm_arch_free_memslot(kvm, slot);
@@ -1551,10 +1671,14 @@  static void kvm_replace_memslot(struct kvm *kvm,
 	}
 }
 
-static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
+static int check_memory_region_flags(struct kvm *kvm,
+				     const struct kvm_user_mem_region *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
+	if (kvm_arch_has_private_mem(kvm))
+		valid_flags |= KVM_MEM_PRIVATE;
+
 #ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
@@ -1630,6 +1754,9 @@  static int kvm_prepare_memory_region(struct kvm *kvm,
 {
 	int r;
 
+	if (change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE)
+		kvm_restrictedmem_register(new);
+
 	/*
 	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
 	 * will be freed on "commit".  If logging is enabled in both old and
@@ -1658,6 +1785,9 @@  static int kvm_prepare_memory_region(struct kvm *kvm,
 	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
 		kvm_destroy_dirty_bitmap(new);
 
+	if (r && change == KVM_MR_CREATE && new->flags & KVM_MEM_PRIVATE)
+		kvm_restrictedmem_unregister(new);
+
 	return r;
 }
 
@@ -1963,7 +2093,7 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	int as_id, id;
 	int r;
 
-	r = check_memory_region_flags(mem);
+	r = check_memory_region_flags(kvm, mem);
 	if (r)
 		return r;
 
@@ -1982,6 +2112,10 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size))
 		return -EINVAL;
+	if (mem->flags & KVM_MEM_PRIVATE &&
+		(mem->restricted_offset & (PAGE_SIZE - 1) ||
+		 mem->restricted_offset > U64_MAX - mem->memory_size))
+		return -EINVAL;
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
 		return -EINVAL;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -2020,6 +2154,9 @@  int __kvm_set_memory_region(struct kvm *kvm,
 		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
 			return -EINVAL;
 	} else { /* Modify an existing slot. */
+		/* Private memslots are immutable, they can only be deleted. */
+		if (mem->flags & KVM_MEM_PRIVATE)
+			return -EINVAL;
 		if ((mem->userspace_addr != old->userspace_addr) ||
 		    (npages != old->npages) ||
 		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2048,10 +2185,28 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	new->npages = npages;
 	new->flags = mem->flags;
 	new->userspace_addr = mem->userspace_addr;
+	if (mem->flags & KVM_MEM_PRIVATE) {
+		new->restricted_file = fget(mem->restricted_fd);
+		if (!new->restricted_file ||
+		    !file_is_restrictedmem(new->restricted_file)) {
+			r = -EINVAL;
+			goto out;
+		}
+		new->restricted_offset = mem->restricted_offset;
+	}
+
+	new->kvm = kvm;
 
 	r = kvm_set_memslot(kvm, old, new, change);
 	if (r)
-		kfree(new);
+		goto out;
+
+	return 0;
+
+out:
+	if (new->restricted_file)
+		fput(new->restricted_file);
+	kfree(new);
 	return r;
 }
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -2351,6 +2506,8 @@  static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
 #ifdef CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES
 static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
+	if (kvm_arch_has_private_mem(kvm))
+		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
 	return 0;
 }
 
@@ -4822,16 +4979,28 @@  static long kvm_vm_ioctl(struct file *filp,
 	}
 	case KVM_SET_USER_MEMORY_REGION: {
 		struct kvm_user_mem_region mem;
-		unsigned long size = sizeof(struct kvm_userspace_memory_region);
+		unsigned int flags_offset = offsetof(typeof(mem), flags);
+		unsigned long size;
+		u32 flags;
 
 		kvm_sanity_check_user_mem_region_alias();
 
+		memset(&mem, 0, sizeof(mem));
+
 		r = -EFAULT;
+		if (get_user(flags, (u32 __user *)(argp + flags_offset)))
+			goto out;
+
+		if (flags & KVM_MEM_PRIVATE)
+			size = sizeof(struct kvm_userspace_memory_region_ext);
+		else
+			size = sizeof(struct kvm_userspace_memory_region);
+
 		if (copy_from_user(&mem, argp, size))
 			goto out;
 
 		r = -EINVAL;
-		if (mem.flags & KVM_MEM_PRIVATE)
+		if ((flags ^ mem.flags) & KVM_MEM_PRIVATE)
 			goto out;
 
 		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);