
[RFCv2,08/16] KVM: Use GUP instead of copy_from/to_user() to access guest memory

Message ID: 20201020061859.18385-9-kirill.shutemov@linux.intel.com
State: New, archived
Series: KVM protected memory extension

Commit Message

Kirill A. Shutemov Oct. 20, 2020, 6:18 a.m. UTC
New helpers copy_from_guest()/copy_to_guest() to be used if KVM memory
protection feature is enabled.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/kvm_host.h |  4 ++
 virt/kvm/kvm_main.c      | 90 +++++++++++++++++++++++++++++++---------
 2 files changed, 75 insertions(+), 19 deletions(-)

Comments

John Hubbard Oct. 20, 2020, 8:25 a.m. UTC | #1
On 10/19/20 11:18 PM, Kirill A. Shutemov wrote:
> New helpers copy_from_guest()/copy_to_guest() to be used if KVM memory
> protection feature is enabled.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>   include/linux/kvm_host.h |  4 ++
>   virt/kvm/kvm_main.c      | 90 +++++++++++++++++++++++++++++++---------
>   2 files changed, 75 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 05e3c2fb3ef7..380a64613880 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -504,6 +504,7 @@ struct kvm {
>   	struct srcu_struct irq_srcu;
>   	pid_t userspace_pid;
>   	unsigned int max_halt_poll_ns;
> +	bool mem_protected;
>   };
>   
>   #define kvm_err(fmt, ...) \
> @@ -728,6 +729,9 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
>   void kvm_set_pfn_accessed(kvm_pfn_t pfn);
>   void kvm_get_pfn(kvm_pfn_t pfn);
>   
> +int copy_from_guest(void *data, unsigned long hva, int len, bool protected);
> +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected);
> +
>   void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
>   int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
>   			int len);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index cf88233b819a..a9884cb8c867 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2313,19 +2313,70 @@ static int next_segment(unsigned long len, int offset)
>   		return len;
>   }
>   
> +int copy_from_guest(void *data, unsigned long hva, int len, bool protected)
> +{
> +	int offset = offset_in_page(hva);
> +	struct page *page;
> +	int npages, seg;
> +
> +	if (!protected)
> +		return __copy_from_user(data, (void __user *)hva, len);
> +
> +	might_fault();
> +	kasan_check_write(data, len);
> +	check_object_size(data, len, false);
> +
> +	while ((seg = next_segment(len, offset)) != 0) {
> +		npages = get_user_pages_unlocked(hva, 1, &page, 0);
> +		if (npages != 1)
> +			return -EFAULT;
> +		memcpy(data, page_address(page) + offset, seg);

Hi Kirill!

OK, so the copy_from_guest() is a read-only case for gup, which I think is safe
from a gup/pup + filesystem point of view, but see below about copy_to_guest()...

> +		put_page(page);
> +		len -= seg;
> +		hva += seg;
> +		offset = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected)
> +{
> +	int offset = offset_in_page(hva);
> +	struct page *page;
> +	int npages, seg;
> +
> +	if (!protected)
> +		return __copy_to_user((void __user *)hva, data, len);
> +
> +	might_fault();
> +	kasan_check_read(data, len);
> +	check_object_size(data, len, true);
> +
> +	while ((seg = next_segment(len, offset)) != 0) {
> +		npages = get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);


Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
situation, I think:


CASE 5: Pinning in order to write to the data within the page
-------------------------------------------------------------
Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
write to a page's data, unpin" can cause a problem. Case 5 may be considered a
superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
other words, if the code is neither Case 1 nor Case 2, it may still require
FOLL_PIN, for patterns like this:

Correct (uses FOLL_PIN calls):
     pin_user_pages()
     write to the data within the pages
     unpin_user_pages()


thanks,
Kirill A. Shutemov Oct. 20, 2020, 12:51 p.m. UTC | #2
On Tue, Oct 20, 2020 at 01:25:59AM -0700, John Hubbard wrote:
> On 10/19/20 11:18 PM, Kirill A. Shutemov wrote:
> > New helpers copy_from_guest()/copy_to_guest() to be used if KVM memory
> > protection feature is enabled.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >   include/linux/kvm_host.h |  4 ++
> >   virt/kvm/kvm_main.c      | 90 +++++++++++++++++++++++++++++++---------
> >   2 files changed, 75 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 05e3c2fb3ef7..380a64613880 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -504,6 +504,7 @@ struct kvm {
> >   	struct srcu_struct irq_srcu;
> >   	pid_t userspace_pid;
> >   	unsigned int max_halt_poll_ns;
> > +	bool mem_protected;
> >   };
> >   #define kvm_err(fmt, ...) \
> > @@ -728,6 +729,9 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
> >   void kvm_set_pfn_accessed(kvm_pfn_t pfn);
> >   void kvm_get_pfn(kvm_pfn_t pfn);
> > +int copy_from_guest(void *data, unsigned long hva, int len, bool protected);
> > +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected);
> > +
> >   void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
> >   int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
> >   			int len);
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index cf88233b819a..a9884cb8c867 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2313,19 +2313,70 @@ static int next_segment(unsigned long len, int offset)
> >   		return len;
> >   }
> > +int copy_from_guest(void *data, unsigned long hva, int len, bool protected)
> > +{
> > +	int offset = offset_in_page(hva);
> > +	struct page *page;
> > +	int npages, seg;
> > +
> > +	if (!protected)
> > +		return __copy_from_user(data, (void __user *)hva, len);
> > +
> > +	might_fault();
> > +	kasan_check_write(data, len);
> > +	check_object_size(data, len, false);
> > +
> > +	while ((seg = next_segment(len, offset)) != 0) {
> > +		npages = get_user_pages_unlocked(hva, 1, &page, 0);
> > +		if (npages != 1)
> > +			return -EFAULT;
> > +		memcpy(data, page_address(page) + offset, seg);
> 
> Hi Kirill!
> 
> OK, so the copy_from_guest() is a read-only case for gup, which I think is safe
> from a gup/pup + filesystem point of view, but see below about copy_to_guest()...
> 
> > +		put_page(page);
> > +		len -= seg;
> > +		hva += seg;
> > +		offset = 0;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected)
> > +{
> > +	int offset = offset_in_page(hva);
> > +	struct page *page;
> > +	int npages, seg;
> > +
> > +	if (!protected)
> > +		return __copy_to_user((void __user *)hva, data, len);
> > +
> > +	might_fault();
> > +	kasan_check_read(data, len);
> > +	check_object_size(data, len, true);
> > +
> > +	while ((seg = next_segment(len, offset)) != 0) {
> > +		npages = get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);
> 
> 
> Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
> We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
> situation, I think:
> 
> 
> CASE 5: Pinning in order to write to the data within the page
> -------------------------------------------------------------
> Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
> write to a page's data, unpin" can cause a problem. Case 5 may be considered a
> superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
> other words, if the code is neither Case 1 nor Case 2, it may still require
> FOLL_PIN, for patterns like this:
> 
> Correct (uses FOLL_PIN calls):
>     pin_user_pages()
>     write to the data within the pages
>     unpin_user_pages()

Right. I haven't internalized the changes in the GUP interface yet. Will update.
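
I assume the write-side loop body would then end up as something like this (untested sketch; I'm also assuming unpin_user_pages_dirty_lock() is the right way to drop the pin and dirty the page in one go):

	npages = pin_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);
	if (npages != 1)
		return -EFAULT;
	memcpy(page_address(page) + offset, data, seg);
	/* drop the FOLL_PIN reference and mark the page dirty */
	unpin_user_pages_dirty_lock(&page, 1, true);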
Ira Weiny Oct. 20, 2020, 5:29 p.m. UTC | #3
On Tue, Oct 20, 2020 at 09:18:51AM +0300, Kirill A. Shutemov wrote:
> New helpers copy_from_guest()/copy_to_guest() to be used if KVM memory
> protection feature is enabled.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  include/linux/kvm_host.h |  4 ++
>  virt/kvm/kvm_main.c      | 90 +++++++++++++++++++++++++++++++---------
>  2 files changed, 75 insertions(+), 19 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 05e3c2fb3ef7..380a64613880 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -504,6 +504,7 @@ struct kvm {
>  	struct srcu_struct irq_srcu;
>  	pid_t userspace_pid;
>  	unsigned int max_halt_poll_ns;
> +	bool mem_protected;
>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -728,6 +729,9 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
>  void kvm_set_pfn_accessed(kvm_pfn_t pfn);
>  void kvm_get_pfn(kvm_pfn_t pfn);
>  
> +int copy_from_guest(void *data, unsigned long hva, int len, bool protected);
> +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected);
> +
>  void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
>  int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
>  			int len);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index cf88233b819a..a9884cb8c867 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2313,19 +2313,70 @@ static int next_segment(unsigned long len, int offset)
>  		return len;
>  }
>  
> +int copy_from_guest(void *data, unsigned long hva, int len, bool protected)
> +{
> +	int offset = offset_in_page(hva);
> +	struct page *page;
> +	int npages, seg;
> +
> +	if (!protected)
> +		return __copy_from_user(data, (void __user *)hva, len);
> +
> +	might_fault();
> +	kasan_check_write(data, len);
> +	check_object_size(data, len, false);
> +
> +	while ((seg = next_segment(len, offset)) != 0) {
> +		npages = get_user_pages_unlocked(hva, 1, &page, 0);
> +		if (npages != 1)
> +			return -EFAULT;
> +		memcpy(data, page_address(page) + offset, seg);
> +		put_page(page);
> +		len -= seg;
> +		hva += seg;
> +		offset = 0;

Why is data not updated on each iteration?

> +	}
> +
> +	return 0;
> +}
> +
> +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected)
> +{
> +	int offset = offset_in_page(hva);
> +	struct page *page;
> +	int npages, seg;
> +
> +	if (!protected)
> +		return __copy_to_user((void __user *)hva, data, len);
> +
> +	might_fault();
> +	kasan_check_read(data, len);
> +	check_object_size(data, len, true);
> +
> +	while ((seg = next_segment(len, offset)) != 0) {
> +		npages = get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);
> +		if (npages != 1)
> +			return -EFAULT;
> +		memcpy(page_address(page) + offset, data, seg);
> +		put_page(page);
> +		len -= seg;
> +		hva += seg;
> +		offset = 0;

Same question?  Doesn't this result in *data being copied to multiple pages?

Ira

> +	}
> +
> +	return 0;
> +}
> +
>  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
> -				 void *data, int offset, int len)
> +				 void *data, int offset, int len,
> +				 bool protected)
>  {
> -	int r;
>  	unsigned long addr;
>  
>  	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
>  	if (kvm_is_error_hva(addr))
>  		return -EFAULT;
> -	r = __copy_from_user(data, (void __user *)addr + offset, len);
> -	if (r)
> -		return -EFAULT;
> -	return 0;
> +	return copy_from_guest(data, addr + offset, len, protected);
>  }
>  
>  int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
> @@ -2333,7 +2384,8 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
>  {
>  	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>  
> -	return __kvm_read_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_read_guest_page(slot, gfn, data, offset, len,
> +				     kvm->mem_protected);
>  }
>  EXPORT_SYMBOL_GPL(kvm_read_guest_page);
>  
> @@ -2342,7 +2394,8 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
>  {
>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>  
> -	return __kvm_read_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_read_guest_page(slot, gfn, data, offset, len,
> +				     vcpu->kvm->mem_protected);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
>  
> @@ -2415,7 +2468,8 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
>  
>  static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
> -			          const void *data, int offset, int len)
> +			          const void *data, int offset, int len,
> +				  bool protected)
>  {
>  	int r;
>  	unsigned long addr;
> @@ -2423,7 +2477,8 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
>  	addr = gfn_to_hva_memslot(memslot, gfn);
>  	if (kvm_is_error_hva(addr))
>  		return -EFAULT;
> -	r = __copy_to_user((void __user *)addr + offset, data, len);
> +
> +	r = copy_to_guest(addr + offset, data, len, protected);
>  	if (r)
>  		return -EFAULT;
>  	mark_page_dirty_in_slot(memslot, gfn);
> @@ -2435,7 +2490,8 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
>  {
>  	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>  
> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_write_guest_page(slot, gfn, data, offset, len,
> +				      kvm->mem_protected);
>  }
>  EXPORT_SYMBOL_GPL(kvm_write_guest_page);
>  
> @@ -2444,7 +2500,8 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
>  {
>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>  
> -	return __kvm_write_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_write_guest_page(slot, gfn, data, offset, len,
> +				      vcpu->kvm->mem_protected);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
>  
> @@ -2560,7 +2617,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  	if (unlikely(!ghc->memslot))
>  		return kvm_write_guest(kvm, gpa, data, len);
>  
> -	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
> +	r = copy_to_guest(ghc->hva + offset, data, len, kvm->mem_protected);
>  	if (r)
>  		return -EFAULT;
>  	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
> @@ -2581,7 +2638,6 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  				 unsigned long len)
>  {
>  	struct kvm_memslots *slots = kvm_memslots(kvm);
> -	int r;
>  	gpa_t gpa = ghc->gpa + offset;
>  
>  	BUG_ON(len + offset > ghc->len);
> @@ -2597,11 +2653,7 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  	if (unlikely(!ghc->memslot))
>  		return kvm_read_guest(kvm, gpa, data, len);
>  
> -	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
> -	if (r)
> -		return -EFAULT;
> -
> -	return 0;
> +	return copy_from_guest(data, ghc->hva + offset, len, kvm->mem_protected);
>  }
>  EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
>  
> -- 
> 2.26.2
> 
>
Kirill A. Shutemov Oct. 22, 2020, 11:37 a.m. UTC | #4
On Tue, Oct 20, 2020 at 10:29:44AM -0700, Ira Weiny wrote:
> On Tue, Oct 20, 2020 at 09:18:51AM +0300, Kirill A. Shutemov wrote:
> > New helpers copy_from_guest()/copy_to_guest() to be used if KVM memory
> > protection feature is enabled.
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  include/linux/kvm_host.h |  4 ++
> >  virt/kvm/kvm_main.c      | 90 +++++++++++++++++++++++++++++++---------
> >  2 files changed, 75 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 05e3c2fb3ef7..380a64613880 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -504,6 +504,7 @@ struct kvm {
> >  	struct srcu_struct irq_srcu;
> >  	pid_t userspace_pid;
> >  	unsigned int max_halt_poll_ns;
> > +	bool mem_protected;
> >  };
> >  
> >  #define kvm_err(fmt, ...) \
> > @@ -728,6 +729,9 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn);
> >  void kvm_set_pfn_accessed(kvm_pfn_t pfn);
> >  void kvm_get_pfn(kvm_pfn_t pfn);
> >  
> > +int copy_from_guest(void *data, unsigned long hva, int len, bool protected);
> > +int copy_to_guest(unsigned long hva, const void *data, int len, bool protected);
> > +
> >  void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
> >  int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
> >  			int len);
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index cf88233b819a..a9884cb8c867 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2313,19 +2313,70 @@ static int next_segment(unsigned long len, int offset)
> >  		return len;
> >  }
> >  
> > +int copy_from_guest(void *data, unsigned long hva, int len, bool protected)
> > +{
> > +	int offset = offset_in_page(hva);
> > +	struct page *page;
> > +	int npages, seg;
> > +
> > +	if (!protected)
> > +		return __copy_from_user(data, (void __user *)hva, len);
> > +
> > +	might_fault();
> > +	kasan_check_write(data, len);
> > +	check_object_size(data, len, false);
> > +
> > +	while ((seg = next_segment(len, offset)) != 0) {
> > +		npages = get_user_pages_unlocked(hva, 1, &page, 0);
> > +		if (npages != 1)
> > +			return -EFAULT;
> > +		memcpy(data, page_address(page) + offset, seg);
> > +		put_page(page);
> > +		len -= seg;
> > +		hva += seg;
> > +		offset = 0;
> 
> Why is data not updated on each iteration?

Ouch. I guess no caller actually steps over a page boundary. Will fix.
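
For the record, the fix is just to advance the destination pointer as well on each iteration, roughly (untested, and the same for the source pointer in copy_to_guest()):

	memcpy(data, page_address(page) + offset, seg);
	put_page(page);
	data += seg;
	len -= seg;
	hva += seg;
	offset = 0;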
Matthew Wilcox (Oracle) Oct. 22, 2020, 11:49 a.m. UTC | #5
On Tue, Oct 20, 2020 at 01:25:59AM -0700, John Hubbard wrote:
> Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
> We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
> situation, I think:
> 
> 
> CASE 5: Pinning in order to write to the data within the page
> -------------------------------------------------------------
> Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
> write to a page's data, unpin" can cause a problem. Case 5 may be considered a
> superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
> other words, if the code is neither Case 1 nor Case 2, it may still require
> FOLL_PIN, for patterns like this:
> 
> Correct (uses FOLL_PIN calls):
>     pin_user_pages()
>     write to the data within the pages
>     unpin_user_pages()

Case 5 is crap though.  That bug should have been fixed by getting
the locking right.  ie:

	get_user_pages_fast();
	lock_page();
	kmap();
	set_bit();
	kunmap();
	set_page_dirty()
	unlock_page();

I should have vetoed that patch at the time, but I was busy with other things.
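
Applied to the copy_to_guest() loop body in this patch, that would be roughly (untested sketch, using kmap() instead of the bare page_address()):

	npages = get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);
	if (npages != 1)
		return -EFAULT;
	lock_page(page);
	memcpy(kmap(page) + offset, data, seg);
	kunmap(page);
	set_page_dirty(page);
	unlock_page(page);
	put_page(page);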
John Hubbard Oct. 22, 2020, 7:58 p.m. UTC | #6
On 10/22/20 4:49 AM, Matthew Wilcox wrote:
> On Tue, Oct 20, 2020 at 01:25:59AM -0700, John Hubbard wrote:
>> Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
>> We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
>> situation, I think:
>>
>>
>> CASE 5: Pinning in order to write to the data within the page
>> -------------------------------------------------------------
>> Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
>> write to a page's data, unpin" can cause a problem. Case 5 may be considered a
>> superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
>> other words, if the code is neither Case 1 nor Case 2, it may still require
>> FOLL_PIN, for patterns like this:
>>
>> Correct (uses FOLL_PIN calls):
>>      pin_user_pages()
>>      write to the data within the pages
>>      unpin_user_pages()
> 
> Case 5 is crap though.  That bug should have been fixed by getting
> the locking right.  ie:
> 
> 	get_user_pages_fast();
> 	lock_page();
> 	kmap();
> 	set_bit();
> 	kunmap();
> 	set_page_dirty()
> 	unlock_page();
> 
> I should have vetoed that patch at the time, but I was busy with other things.
> 

It does seem like lock_page() is better, for now at least, because it
forces the kind of synchronization with file system writeback that is
still yet to be implemented for pin_user_pages().

Long term though, Case 5 provides an alternative way to do this
pattern--without using lock_page(). Also, note that Case 5, *in
general*, need not be done page-at-a-time, unlike the lock_page()
approach. Therefore, Case 5 might potentially help at some call sites,
either for deadlock avoidance or performance improvements.

In other words, once the other half of the pin_user_pages() plan is
implemented, either of these approaches should work.

Or, are you thinking that there is never a situation in which Case 5 is
valid?


thanks,
Matthew Wilcox (Oracle) Oct. 26, 2020, 4:21 a.m. UTC | #7
On Thu, Oct 22, 2020 at 12:58:14PM -0700, John Hubbard wrote:
> On 10/22/20 4:49 AM, Matthew Wilcox wrote:
> > On Tue, Oct 20, 2020 at 01:25:59AM -0700, John Hubbard wrote:
> > > Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
> > > We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
> > > situation, I think:
> > > 
> > > 
> > > CASE 5: Pinning in order to write to the data within the page
> > > -------------------------------------------------------------
> > > Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
> > > write to a page's data, unpin" can cause a problem. Case 5 may be considered a
> > > superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
> > > other words, if the code is neither Case 1 nor Case 2, it may still require
> > > FOLL_PIN, for patterns like this:
> > > 
> > > Correct (uses FOLL_PIN calls):
> > >      pin_user_pages()
> > >      write to the data within the pages
> > >      unpin_user_pages()
> > 
> > Case 5 is crap though.  That bug should have been fixed by getting
> > the locking right.  ie:
> > 
> > 	get_user_pages_fast();
> > 	lock_page();
> > 	kmap();
> > 	set_bit();
> > 	kunmap();
> > 	set_page_dirty()
> > 	unlock_page();
> > 
> > I should have vetoed that patch at the time, but I was busy with other things.
> 
> It does seem like lock_page() is better, for now at least, because it
> forces the kind of synchronization with file system writeback that is
> still yet to be implemented for pin_user_pages().
> 
> Long term though, Case 5 provides an alternative way to do this
> pattern--without using lock_page(). Also, note that Case 5, *in
> general*, need not be done page-at-a-time, unlike the lock_page()
> approach. Therefore, Case 5 might potentially help at some call sites,
> either for deadlock avoidance or performance improvements.
> 
> In other words, once the other half of the pin_user_pages() plan is
> implemented, either of these approaches should work.
> 
> Or, are you thinking that there is never a situation in which Case 5 is
> valid?

I don't think the page pinning approach is ever valid.  For file
mappings, the infiniband registration should take a lease on the inode,
blocking truncation.  DAX shouldn't be using struct pages at all, so
there shouldn't be anything there to pin.

It's been five years since DAX was merged, and page pinning still
doesn't work.  How much longer before the people who are pushing it
realise that it's fundamentally flawed?
John Hubbard Oct. 26, 2020, 4:44 a.m. UTC | #8
On 10/25/20 9:21 PM, Matthew Wilcox wrote:
> On Thu, Oct 22, 2020 at 12:58:14PM -0700, John Hubbard wrote:
>> On 10/22/20 4:49 AM, Matthew Wilcox wrote:
>>> On Tue, Oct 20, 2020 at 01:25:59AM -0700, John Hubbard wrote:
>>>> Should copy_to_guest() use pin_user_pages_unlocked() instead of gup_unlocked?
>>>> We wrote a  "Case 5" in Documentation/core-api/pin_user_pages.rst, just for this
>>>> situation, I think:
>>>>
>>>>
>>>> CASE 5: Pinning in order to write to the data within the page
>>>> -------------------------------------------------------------
>>>> Even though neither DMA nor Direct IO is involved, just a simple case of "pin,
>>>> write to a page's data, unpin" can cause a problem. Case 5 may be considered a
>>>> superset of Case 1, plus Case 2, plus anything that invokes that pattern. In
>>>> other words, if the code is neither Case 1 nor Case 2, it may still require
>>>> FOLL_PIN, for patterns like this:
>>>>
>>>> Correct (uses FOLL_PIN calls):
>>>>       pin_user_pages()
>>>>       write to the data within the pages
>>>>       unpin_user_pages()
>>>
>>> Case 5 is crap though.  That bug should have been fixed by getting
>>> the locking right.  ie:
>>>
>>> 	get_user_pages_fast();
>>> 	lock_page();
>>> 	kmap();
>>> 	set_bit();
>>> 	kunmap();
>>> 	set_page_dirty()
>>> 	unlock_page();
>>>
>>> I should have vetoed that patch at the time, but I was busy with other things.
>>
>> It does seem like lock_page() is better, for now at least, because it
>> forces the kind of synchronization with file system writeback that is
>> still yet to be implemented for pin_user_pages().
>>
>> Long term though, Case 5 provides an alternative way to do this
>> pattern--without using lock_page(). Also, note that Case 5, *in
>> general*, need not be done page-at-a-time, unlike the lock_page()
>> approach. Therefore, Case 5 might potentially help at some call sites,
>> either for deadlock avoidance or performance improvements.
>>
>> In other words, once the other half of the pin_user_pages() plan is
>> implemented, either of these approaches should work.
>>
>> Or, are you thinking that there is never a situation in which Case 5 is
>> valid?
> 
> I don't think the page pinning approach is ever valid.  For file

Could you qualify that? Surely you don't mean that the entire pin_user_pages
story is a waste of time--I would have expected you to make more noise
earlier if you thought that, yes?

> mappings, the infiniband registration should take a lease on the inode,
> blocking truncation.  DAX shouldn't be using struct pages at all, so
> there shouldn't be anything there to pin.
> 
> It's been five years since DAX was merged, and page pinning still
> doesn't work.  How much longer before the people who are pushing it
> realise that it's fundamentally flawed?
> 

Is this a separate rant about *only* DAX, or is general RDMA in your sights
too? :)



thanks,
Matthew Wilcox (Oracle) Oct. 26, 2020, 1:28 p.m. UTC | #9
On Sun, Oct 25, 2020 at 09:44:07PM -0700, John Hubbard wrote:
> On 10/25/20 9:21 PM, Matthew Wilcox wrote:
> > I don't think the page pinning approach is ever valid.  For file
> 
> Could you qualify that? Surely you don't mean that the entire pin_user_pages
> story is a waste of time--I would have expected you to make more noise
> earlier if you thought that, yes?

I do think page pinning is the wrong approach for everything.  I did say
so at the time, and I continue to say so when the opportunity presents
itself.  But shouting about it constantly only annoys people, so I don't
generally bother.  I have other things to work on, and they're productive,
so I don't need to spend my time arguing.

> > It's been five years since DAX was merged, and page pinning still
> > doesn't work.  How much longer before the people who are pushing it
> > realise that it's fundamentally flawed?
> 
> Is this a separate rant about *only* DAX, or is general RDMA in your sights
> too? :)

This is a case where it's not RDMA's _fault_ that there's no good API
for it to do what it needs to do.  There's a lot of work needed to wean
Linux device drivers off their assumption that there's a struct page
for every byte of memory.
Jason Gunthorpe Oct. 26, 2020, 2:16 p.m. UTC | #10
On Mon, Oct 26, 2020 at 01:28:30PM +0000, Matthew Wilcox wrote:

> > > It's been five years since DAX was merged, and page pinning still
> > > doesn't work.  How much longer before the people who are pushing it
> > > realise that it's fundamentally flawed?
> > 
> > Is this a separate rant about *only* DAX, or is general RDMA in your sights
> > too? :)
> 
> This is a case where it's not RDMA's _fault_ that there's no good API
> for it to do what it needs to do.  There's a lot of work needed to wean
> Linux device drivers off their assumption that there's a struct page
> for every byte of memory.

People who care seem to have just given up and are using RDMA ODP, so
I'm not optimistic this DAX issue will ever be solved. I've also
almost removed all the struct page references from this flow in RDMA,
so if there is some way that helps it is certainly doable.

Regardless of DAX the pinning indication is now being used during
fork() for some good reasons, and seems to make sense in other use
cases. It just doesn't seem like a way to solve the DAX issue.

More or less it seems to mean that pages pinned cannot be write
protected and more broadly the kernel should not change the PTEs for
those pages independently of the application. ie the more aggressive
COW on fork() caused data corruption regressions...

I wonder if the point here is that some page owners can't/won't
support DMA pinning and should just be blocked completely for them.

I'd much rather have write access pin_user_pages() just fail than oops
the kernel on ext4 owned VMAs, for instance.

Jason
John Hubbard Oct. 26, 2020, 8:52 p.m. UTC | #11
On 10/26/20 6:28 AM, Matthew Wilcox wrote:
> On Sun, Oct 25, 2020 at 09:44:07PM -0700, John Hubbard wrote:
>> On 10/25/20 9:21 PM, Matthew Wilcox wrote:
>>> I don't think the page pinning approach is ever valid.  For file
>>
>> Could you qualify that? Surely you don't mean that the entire pin_user_pages
>> story is a waste of time--I would have expected you to make more noise
>> earlier if you thought that, yes?
> 
> I do think page pinning is the wrong approach for everything.  I did say


Not *everything*, just "pinning for DMA", right? Because I don't recall
any viable solutions for Direct IO that avoided gup/pup!

Also, back to Case 5: I *could* create a small patchset to change over
the very few Case 5 call sites to use "gup, lock_page(), write to
page...etc", instead of pup. And also, update pin_user_pages.rst to
recommend that approach in similar situations. After all, it's not
really a long-term DMA pin, which is really what pin_user_pages*() is
intended for.

Would that be something you'd like to see happen? It's certainly easy
enough to fix that up. And your retroactive NAK is sufficient motivation
to do so.


> so at the time, and I continue to say so when the opportunity presents
> itself.  But shouting about it constantly only annoys people, so I don't
> generally bother.  I have other things to work on, and they're productive,
> so I don't need to spend my time arguing.


Sure. As a practical matter, I've assumed that page pinning is not going
to go away any time soon, so I want it to work properly while it's here.
But if there is a viable way to eventually replace dma-pinning with
something better, then let's keep thinking about it. I'm glad to help in
that area.


thanks,

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 05e3c2fb3ef7..380a64613880 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -504,6 +504,7 @@  struct kvm {
 	struct srcu_struct irq_srcu;
 	pid_t userspace_pid;
 	unsigned int max_halt_poll_ns;
+	bool mem_protected;
 };
 
 #define kvm_err(fmt, ...) \
@@ -728,6 +729,9 @@  void kvm_set_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
 void kvm_get_pfn(kvm_pfn_t pfn);
 
+int copy_from_guest(void *data, unsigned long hva, int len, bool protected);
+int copy_to_guest(unsigned long hva, const void *data, int len, bool protected);
+
 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cf88233b819a..a9884cb8c867 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2313,19 +2313,70 @@  static int next_segment(unsigned long len, int offset)
 		return len;
 }
 
+int copy_from_guest(void *data, unsigned long hva, int len, bool protected)
+{
+	int offset = offset_in_page(hva);
+	struct page *page;
+	int npages, seg;
+
+	if (!protected)
+		return __copy_from_user(data, (void __user *)hva, len);
+
+	might_fault();
+	kasan_check_write(data, len);
+	check_object_size(data, len, false);
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		npages = get_user_pages_unlocked(hva, 1, &page, 0);
+		if (npages != 1)
+			return -EFAULT;
+		memcpy(data, page_address(page) + offset, seg);
+		put_page(page);
+		len -= seg;
+		hva += seg;
+		offset = 0;
+	}
+
+	return 0;
+}
+
+int copy_to_guest(unsigned long hva, const void *data, int len, bool protected)
+{
+	int offset = offset_in_page(hva);
+	struct page *page;
+	int npages, seg;
+
+	if (!protected)
+		return __copy_to_user((void __user *)hva, data, len);
+
+	might_fault();
+	kasan_check_read(data, len);
+	check_object_size(data, len, true);
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		npages = get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE);
+		if (npages != 1)
+			return -EFAULT;
+		memcpy(page_address(page) + offset, data, seg);
+		put_page(page);
+		len -= seg;
+		hva += seg;
+		offset = 0;
+	}
+
+	return 0;
+}
+
 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
-				 void *data, int offset, int len)
+				 void *data, int offset, int len,
+				 bool protected)
 {
-	int r;
 	unsigned long addr;
 
 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
-	r = __copy_from_user(data, (void __user *)addr + offset, len);
-	if (r)
-		return -EFAULT;
-	return 0;
+	return copy_from_guest(data, addr + offset, len, protected);
 }
 
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
@@ -2333,7 +2384,8 @@  int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+	return __kvm_read_guest_page(slot, gfn, data, offset, len,
+				     kvm->mem_protected);
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
 
@@ -2342,7 +2394,8 @@  int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+	return __kvm_read_guest_page(slot, gfn, data, offset, len,
+				     vcpu->kvm->mem_protected);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
 
@@ -2415,7 +2468,8 @@  int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
 
 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
-			          const void *data, int offset, int len)
+			          const void *data, int offset, int len,
+				  bool protected)
 {
 	int r;
 	unsigned long addr;
@@ -2423,7 +2477,8 @@  static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
 	addr = gfn_to_hva_memslot(memslot, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
-	r = __copy_to_user((void __user *)addr + offset, data, len);
+
+	r = copy_to_guest(addr + offset, data, len, protected);
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(memslot, gfn);
@@ -2435,7 +2490,8 @@  int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(slot, gfn, data, offset, len,
+				      kvm->mem_protected);
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -2444,7 +2500,8 @@  int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(slot, gfn, data, offset, len,
+				      vcpu->kvm->mem_protected);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
 
@@ -2560,7 +2617,7 @@  int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	if (unlikely(!ghc->memslot))
 		return kvm_write_guest(kvm, gpa, data, len);
 
-	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
+	r = copy_to_guest(ghc->hva + offset, data, len, kvm->mem_protected);
 	if (r)
 		return -EFAULT;
 	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
@@ -2581,7 +2638,6 @@  int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 				 unsigned long len)
 {
 	struct kvm_memslots *slots = kvm_memslots(kvm);
-	int r;
 	gpa_t gpa = ghc->gpa + offset;
 
 	BUG_ON(len + offset > ghc->len);
@@ -2597,11 +2653,7 @@  int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	if (unlikely(!ghc->memslot))
 		return kvm_read_guest(kvm, gpa, data, len);
 
-	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
-	if (r)
-		return -EFAULT;
-
-	return 0;
+	return copy_from_guest(data, ghc->hva + offset, len, kvm->mem_protected);
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);