diff mbox series

[v4,12/12] KVM: Expose KVM_MEM_PRIVATE

Message ID 20220118132121.31388-13-chao.p.peng@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng Jan. 18, 2022, 1:21 p.m. UTC
KVM_MEM_PRIVATE is not exposed by default, but architecture code can turn
it on by implementing kvm_arch_private_memory_supported().

Also, a private memslot cannot be moved, and the same file+offset cannot
be mapped into different GFNs.

Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 49 ++++++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 7 deletions(-)

Comments

Maciej S. Szmigiero Jan. 25, 2022, 8:20 p.m. UTC | #1
On 18.01.2022 14:21, Chao Peng wrote:
> KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
> on it by implementing kvm_arch_private_memory_supported().
> 
> Also private memslot cannot be movable and the same file+offset can not
> be mapped into different GFNs.
> 
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
(..)
>   
>   static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> -				      gfn_t start, gfn_t end)
> +				      struct file *file,
> +				      gfn_t start, gfn_t end,
> +				      loff_t start_off, loff_t end_off)
>   {
>   	struct kvm_memslot_iter iter;
> +	struct kvm_memory_slot *slot;
> +	struct inode *inode;
> +	int bkt;
>   
>   	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
>   		if (iter.slot->id != id)
>   			return true;
>   	}
>   
> +	/* Disallow mapping the same file+offset into multiple gfns. */
> +	if (file) {
> +		inode = file_inode(file);
> +		kvm_for_each_memslot(slot, bkt, slots) {
> +			if (slot->private_file &&
> +			     file_inode(slot->private_file) == inode &&
> +			     !(end_off <= slot->private_offset ||
> +			       start_off >= slot->private_offset
> +					     + (slot->npages >> PAGE_SHIFT)))
> +				return true;
> +		}
> +	}

That's a linear scan of all memslots on each CREATE (and MOVE) operation
with a fd - we just spent more than a year rewriting similar linear scans
into more efficient operations in KVM.

Thanks,
Maciej
Chao Peng Feb. 17, 2022, 1:45 p.m. UTC | #2
On Tue, Jan 25, 2022 at 09:20:39PM +0100, Maciej S. Szmigiero wrote:
> On 18.01.2022 14:21, Chao Peng wrote:
> > KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
> > on it by implementing kvm_arch_private_memory_supported().
> > 
> > Also private memslot cannot be movable and the same file+offset can not
> > be mapped into different GFNs.
> > 
> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> (..)
> >   static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> > -				      gfn_t start, gfn_t end)
> > +				      struct file *file,
> > +				      gfn_t start, gfn_t end,
> > +				      loff_t start_off, loff_t end_off)
> >   {
> >   	struct kvm_memslot_iter iter;
> > +	struct kvm_memory_slot *slot;
> > +	struct inode *inode;
> > +	int bkt;
> >   	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> >   		if (iter.slot->id != id)
> >   			return true;
> >   	}
> > +	/* Disallow mapping the same file+offset into multiple gfns. */
> > +	if (file) {
> > +		inode = file_inode(file);
> > +		kvm_for_each_memslot(slot, bkt, slots) {
> > +			if (slot->private_file &&
> > +			     file_inode(slot->private_file) == inode &&
> > +			     !(end_off <= slot->private_offset ||
> > +			       start_off >= slot->private_offset
> > +					     + (slot->npages >> PAGE_SHIFT)))
> > +				return true;
> > +		}
> > +	}
> 
> That's a linear scan of all memslots on each CREATE (and MOVE) operation
> with a fd - we just spent more than a year rewriting similar linear scans
> into more efficient operations in KVM.

In the last version I tried to solve this problem by using an interval tree
(just like the existing hva_tree), but we eventually realized that one VM
can have multiple fds with overlapping offsets, so that approach is
incorrect. See https://lkml.org/lkml/2021/12/28/480 for the discussion.

So a linear scan is used until I can find a better way.

Chao
Maciej S. Szmigiero Feb. 22, 2022, 1:16 a.m. UTC | #3
On 17.02.2022 14:45, Chao Peng wrote:
> On Tue, Jan 25, 2022 at 09:20:39PM +0100, Maciej S. Szmigiero wrote:
>> On 18.01.2022 14:21, Chao Peng wrote:
>>> KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
>>> on it by implementing kvm_arch_private_memory_supported().
>>>
>>> Also private memslot cannot be movable and the same file+offset can not
>>> be mapped into different GFNs.
>>>
>>> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
>>> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
>>> ---
>> (..)
>>>    static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
>>> -				      gfn_t start, gfn_t end)
>>> +				      struct file *file,
>>> +				      gfn_t start, gfn_t end,
>>> +				      loff_t start_off, loff_t end_off)
>>>    {
>>>    	struct kvm_memslot_iter iter;
>>> +	struct kvm_memory_slot *slot;
>>> +	struct inode *inode;
>>> +	int bkt;
>>>    	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
>>>    		if (iter.slot->id != id)
>>>    			return true;
>>>    	}
>>> +	/* Disallow mapping the same file+offset into multiple gfns. */
>>> +	if (file) {
>>> +		inode = file_inode(file);
>>> +		kvm_for_each_memslot(slot, bkt, slots) {
>>> +			if (slot->private_file &&
>>> +			     file_inode(slot->private_file) == inode &&
>>> +			     !(end_off <= slot->private_offset ||
>>> +			       start_off >= slot->private_offset
>>> +					     + (slot->npages >> PAGE_SHIFT)))
>>> +				return true;
>>> +		}
>>> +	}
>>
>> That's a linear scan of all memslots on each CREATE (and MOVE) operation
>> with a fd - we just spent more than a year rewriting similar linear scans
>> into more efficient operations in KVM.
> 
> In the last version I tried to solve this problem by using interval tree
> (just like existing hva_tree), but finally we realized that in one VM we
> can have multiple fds with overlapped offsets so that approach is
> incorrect. See https://lkml.org/lkml/2021/12/28/480 for the discussion.

That's right, in this case a two-level structure would be necessary:
the first level matching a file, then the second level matching that
file ranges.
However, if such data is going to be used just for checking possible
overlap at memslot add or move time it is almost certainly an overkill.

> So linear scan is used before I can find a better way.

Another option would be to simply not check for overlap at add or move
time, declare such configuration undefined behavior under KVM API and
make sure in MMU notifiers that nothing bad happens to the host kernel
if it turns out somebody actually set up a VM this way (it could be
inefficient in this case, since it's not supposed to ever happen
unless there is a bug somewhere in the userspace part).

> Chao

Thanks,
Maciej
Chao Peng Feb. 23, 2022, noon UTC | #4
On Tue, Feb 22, 2022 at 02:16:46AM +0100, Maciej S. Szmigiero wrote:
> On 17.02.2022 14:45, Chao Peng wrote:
> > On Tue, Jan 25, 2022 at 09:20:39PM +0100, Maciej S. Szmigiero wrote:
> > > On 18.01.2022 14:21, Chao Peng wrote:
> > > > KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
> > > > on it by implementing kvm_arch_private_memory_supported().
> > > > 
> > > > Also private memslot cannot be movable and the same file+offset can not
> > > > be mapped into different GFNs.
> > > > 
> > > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > > ---
> > > (..)
> > > >    static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> > > > -				      gfn_t start, gfn_t end)
> > > > +				      struct file *file,
> > > > +				      gfn_t start, gfn_t end,
> > > > +				      loff_t start_off, loff_t end_off)
> > > >    {
> > > >    	struct kvm_memslot_iter iter;
> > > > +	struct kvm_memory_slot *slot;
> > > > +	struct inode *inode;
> > > > +	int bkt;
> > > >    	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > > >    		if (iter.slot->id != id)
> > > >    			return true;
> > > >    	}
> > > > +	/* Disallow mapping the same file+offset into multiple gfns. */
> > > > +	if (file) {
> > > > +		inode = file_inode(file);
> > > > +		kvm_for_each_memslot(slot, bkt, slots) {
> > > > +			if (slot->private_file &&
> > > > +			     file_inode(slot->private_file) == inode &&
> > > > +			     !(end_off <= slot->private_offset ||
> > > > +			       start_off >= slot->private_offset
> > > > +					     + (slot->npages >> PAGE_SHIFT)))
> > > > +				return true;
> > > > +		}
> > > > +	}
> > > 
> > > That's a linear scan of all memslots on each CREATE (and MOVE) operation
> > > with a fd - we just spent more than a year rewriting similar linear scans
> > > into more efficient operations in KVM.
> > 
> > In the last version I tried to solve this problem by using interval tree
> > (just like existing hva_tree), but finally we realized that in one VM we
> > can have multiple fds with overlapped offsets so that approach is
> > incorrect. See https://lkml.org/lkml/2021/12/28/480 for the discussion.
> 
> That's right, in this case a two-level structure would be necessary:
> the first level matching a file, then the second level matching that
> file ranges.
> However, if such data is going to be used just for checking possible
> overlap at memslot add or move time it is almost certainly an overkill.

Yes, that is also what I'm seeing.

> 
> > So linear scan is used before I can find a better way.
> 
> Another option would be to simply not check for overlap at add or move
> time, declare such configuration undefined behavior under KVM API and
> make sure in MMU notifiers that nothing bad happens to the host kernel
> if it turns out somebody actually set up a VM this way (it could be
> inefficient in this case, since it's not supposed to ever happen
> unless there is a bug somewhere in the userspace part).

Specific to the TDX case, SEAMMODULE will fail the overlapping case and then
KVM prints a message to the kernel log. It will not cause any other side
effects, though it does look weird. Yes, warning about that in the API
documentation can help to some extent.

Thanks,
Chao
> 
> > Chao
> 
> Thanks,
> Maciej
Maciej S. Szmigiero Feb. 23, 2022, 6:32 p.m. UTC | #5
On 23.02.2022 13:00, Chao Peng wrote:
> On Tue, Feb 22, 2022 at 02:16:46AM +0100, Maciej S. Szmigiero wrote:
>> On 17.02.2022 14:45, Chao Peng wrote:
>>> On Tue, Jan 25, 2022 at 09:20:39PM +0100, Maciej S. Szmigiero wrote:
>>>> On 18.01.2022 14:21, Chao Peng wrote:
>>>>> KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
>>>>> on it by implementing kvm_arch_private_memory_supported().
>>>>>
>>>>> Also private memslot cannot be movable and the same file+offset can not
>>>>> be mapped into different GFNs.
>>>>>
>>>>> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
>>>>> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
>>>>> ---
>>>> (..)
>>>>>     static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
>>>>> -				      gfn_t start, gfn_t end)
>>>>> +				      struct file *file,
>>>>> +				      gfn_t start, gfn_t end,
>>>>> +				      loff_t start_off, loff_t end_off)
>>>>>     {
>>>>>     	struct kvm_memslot_iter iter;
>>>>> +	struct kvm_memory_slot *slot;
>>>>> +	struct inode *inode;
>>>>> +	int bkt;
>>>>>     	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
>>>>>     		if (iter.slot->id != id)
>>>>>     			return true;
>>>>>     	}
>>>>> +	/* Disallow mapping the same file+offset into multiple gfns. */
>>>>> +	if (file) {
>>>>> +		inode = file_inode(file);
>>>>> +		kvm_for_each_memslot(slot, bkt, slots) {
>>>>> +			if (slot->private_file &&
>>>>> +			     file_inode(slot->private_file) == inode &&
>>>>> +			     !(end_off <= slot->private_offset ||
>>>>> +			       start_off >= slot->private_offset
>>>>> +					     + (slot->npages >> PAGE_SHIFT)))
>>>>> +				return true;
>>>>> +		}
>>>>> +	}
>>>>
>>>> That's a linear scan of all memslots on each CREATE (and MOVE) operation
>>>> with a fd - we just spent more than a year rewriting similar linear scans
>>>> into more efficient operations in KVM.
>>>
(..)
>>> So linear scan is used before I can find a better way.
>>
>> Another option would be to simply not check for overlap at add or move
>> time, declare such configuration undefined behavior under KVM API and
>> make sure in MMU notifiers that nothing bad happens to the host kernel
>> if it turns out somebody actually set up a VM this way (it could be
>> inefficient in this case, since it's not supposed to ever happen
>> unless there is a bug somewhere in the userspace part).
> 
> Specific to TDX case, SEAMMODULE will fail the overlapping case and then
> KVM prints a message to the kernel log. It will not cause any other side
> effect, it does look weird however. Yes warn that in the API document
> can help to some extent.

So for the functionality you are adding this code for (TDX) this scan
isn't necessary and the overlapping case (not supported anyway) is safely
handled by the hardware (or firmware)?
Then I would simply remove the scan and, maybe, add a comment instead
that the overlap check is done by the hardware.

By the way, if a kernel log message could be triggered by (misbehaving)
userspace then it should be rate limited (if it isn't already).

> Thanks,
> Chao

Thanks,
Maciej
Chao Peng Feb. 24, 2022, 8:07 a.m. UTC | #6
On Wed, Feb 23, 2022 at 07:32:37PM +0100, Maciej S. Szmigiero wrote:
> On 23.02.2022 13:00, Chao Peng wrote:
> > On Tue, Feb 22, 2022 at 02:16:46AM +0100, Maciej S. Szmigiero wrote:
> > > On 17.02.2022 14:45, Chao Peng wrote:
> > > > On Tue, Jan 25, 2022 at 09:20:39PM +0100, Maciej S. Szmigiero wrote:
> > > > > On 18.01.2022 14:21, Chao Peng wrote:
> > > > > > KVM_MEM_PRIVATE is not exposed by default but architecture code can turn
> > > > > > on it by implementing kvm_arch_private_memory_supported().
> > > > > > 
> > > > > > Also private memslot cannot be movable and the same file+offset can not
> > > > > > be mapped into different GFNs.
> > > > > > 
> > > > > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > > > > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > > > > ---
> > > > > (..)
> > > > > >     static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
> > > > > > -				      gfn_t start, gfn_t end)
> > > > > > +				      struct file *file,
> > > > > > +				      gfn_t start, gfn_t end,
> > > > > > +				      loff_t start_off, loff_t end_off)
> > > > > >     {
> > > > > >     	struct kvm_memslot_iter iter;
> > > > > > +	struct kvm_memory_slot *slot;
> > > > > > +	struct inode *inode;
> > > > > > +	int bkt;
> > > > > >     	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
> > > > > >     		if (iter.slot->id != id)
> > > > > >     			return true;
> > > > > >     	}
> > > > > > +	/* Disallow mapping the same file+offset into multiple gfns. */
> > > > > > +	if (file) {
> > > > > > +		inode = file_inode(file);
> > > > > > +		kvm_for_each_memslot(slot, bkt, slots) {
> > > > > > +			if (slot->private_file &&
> > > > > > +			     file_inode(slot->private_file) == inode &&
> > > > > > +			     !(end_off <= slot->private_offset ||
> > > > > > +			       start_off >= slot->private_offset
> > > > > > +					     + (slot->npages >> PAGE_SHIFT)))
> > > > > > +				return true;
> > > > > > +		}
> > > > > > +	}
> > > > > 
> > > > > That's a linear scan of all memslots on each CREATE (and MOVE) operation
> > > > > with a fd - we just spent more than a year rewriting similar linear scans
> > > > > into more efficient operations in KVM.
> > > > 
> (..)
> > > > So linear scan is used before I can find a better way.
> > > 
> > > Another option would be to simply not check for overlap at add or move
> > > time, declare such configuration undefined behavior under KVM API and
> > > make sure in MMU notifiers that nothing bad happens to the host kernel
> > > if it turns out somebody actually set up a VM this way (it could be
> > > inefficient in this case, since it's not supposed to ever happen
> > > unless there is a bug somewhere in the userspace part).
> > 
> > Specific to TDX case, SEAMMODULE will fail the overlapping case and then
> > KVM prints a message to the kernel log. It will not cause any other side
> > effect, it does look weird however. Yes warn that in the API document
> > can help to some extent.
> 
> So for the functionality you are adding this code for (TDX) this scan
> isn't necessary and the overlapping case (not supported anyway) is safely
> handled by the hardware (or firmware)?

Yes, it will be handled by the firmware.

> Then I would simply remove the scan and, maybe, add a comment instead
> that the overlap check is done by the hardware.

Sure.

> 
> By the way, if a kernel log message could be triggered by (misbehaving)
> userspace then it should be rate limited (if it isn't already).

Thanks for mention.

Chao
> 
> > Thanks,
> > Chao
> 
> Thanks,
> Maciej
diff mbox series

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 117cf0da9c5e..444b390261c0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1328,6 +1328,7 @@  bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_post_init_vm(struct kvm *kvm);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 int kvm_arch_create_vm_debugfs(struct kvm *kvm);
+bool kvm_arch_private_memory_supported(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 10e553215618..51d0f08a8601 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1491,10 +1491,19 @@  static void kvm_replace_memslot(struct kvm *kvm,
 	}
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+bool __weak kvm_arch_private_memory_supported(struct kvm *kvm)
+{
+	return false;
+}
+
+static int check_memory_region_flags(struct kvm *kvm,
+				const struct kvm_userspace_memory_region *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
+	if (kvm_arch_private_memory_supported(kvm))
+		valid_flags |= KVM_MEM_PRIVATE;
+
 #ifdef __KVM_HAVE_READONLY_MEM
 	valid_flags |= KVM_MEM_READONLY;
 #endif
@@ -1873,15 +1882,39 @@  static int kvm_set_memslot(struct kvm *kvm,
 }
 
+/*
+ * Return true if the requested range conflicts with an existing memslot:
+ * either a slot with a different id is found in the gfn range @start..@end,
+ * or, when @file is set, a slot backed by the same inode has a private file
+ * range intersecting the byte range from @start_off up to (not incl.) @end_off.
+ */
 static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
-				      gfn_t start, gfn_t end)
+				      struct file *file,
+				      gfn_t start, gfn_t end,
+				      loff_t start_off, loff_t end_off)
 {
 	struct kvm_memslot_iter iter;
+	struct kvm_memory_slot *slot;
+	struct inode *inode;
+	int bkt;
 
 	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
 		if (iter.slot->id != id)
 			return true;
 	}
 
+	/* Disallow mapping the same file+offset into multiple gfns. */
+	if (file) {
+		inode = file_inode(file);
+		kvm_for_each_memslot(slot, bkt, slots) {
+			/* npages is a page count; scale it to bytes to match the offsets. */
+			if (slot->private_file &&
+			     file_inode(slot->private_file) == inode &&
+			     !(end_off <= slot->private_offset ||
+			       start_off >= slot->private_offset +
+				  ((loff_t)slot->npages << PAGE_SHIFT)))
+				return true;
+		}
+	}
 	return false;
 }
 
@@ -1906,7 +1932,7 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	int as_id, id;
 	int r;
 
-	r = check_memory_region_flags(mem);
+	r = check_memory_region_flags(kvm, mem);
 	if (r)
 		return r;
 
@@ -1919,10 +1945,12 @@  int __kvm_set_memory_region(struct kvm *kvm,
 		return -EINVAL;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		return -EINVAL;
-	/* We can read the guest memory with __xxx_user() later on. */
 	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
-	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
-	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
+	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)))
+		return -EINVAL;
+	/* We can read the guest memory with __xxx_user() later on. */
+	if (!(mem->flags & KVM_MEM_PRIVATE) &&
+	    !access_ok((void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size))
 		return -EINVAL;
 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
@@ -1963,6 +1991,9 @@  int __kvm_set_memory_region(struct kvm *kvm,
 		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
 			return -EINVAL;
 	} else { /* Modify an existing slot. */
+		/* Private memslots are immutable, they can only be deleted. */
+		if (mem->flags & KVM_MEM_PRIVATE)
+			return -EINVAL;
 		if ((mem->userspace_addr != old->userspace_addr) ||
 		    (npages != old->npages) ||
 		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -1983,7 +2014,11 @@  int __kvm_set_memory_region(struct kvm *kvm,
 	}
 
 	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
-	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) {
+	    kvm_check_memslot_overlap(slots, id, file,
+				      base_gfn, base_gfn + npages,
+				      region_ext->private_offset,
+				      region_ext->private_offset +
+						mem->memory_size)) {
 		r = -EEXIST;
 		goto out;
 	}