diff mbox series

[RFC] vhost: don't use kmap() to log dirty pages

Message ID 1557195809-12373-1-git-send-email-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series [RFC] vhost: don't use kmap() to log dirty pages | expand

Commit Message

Jason Wang May 7, 2019, 2:23 a.m. UTC
Vhost log dirty pages directly to a userspace bitmap through GUP and
kmap_atomic() since kernel doesn't have a set_bit_to_user()
helper. This will cause issues for the arch that has virtually tagged
caches. The way to fix is to keep using userspace virtual address.

Fortunately, futex has a cmpxchg to userspace memory helper
futex_atomic_cmpxchg_inatomic(). So switch to use it to exchange the
userspace bitmap with zero, set the bit and then write it back through
put_user().

Note: there're archs (few non popular ones) that don't implement
futex helper, we can't log dirty pages. We can fix them on top or
simply disable LOG_ALL features of vhost.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

Comments

Christoph Hellwig May 7, 2019, 3:47 p.m. UTC | #1
On Mon, May 06, 2019 at 10:23:29PM -0400, Jason Wang wrote:
> Note: there're archs (few non popular ones) that don't implement
> futex helper, we can't log dirty pages. We can fix them on top or
> simply disable LOG_ALL features of vhost.

That means vhost now has to depend on HAVE_FUTEX_CMPXCHG to make
sure we have a working implementation.


>  #include <linux/sched/signal.h>
>  #include <linux/interval_tree_generic.h>
>  #include <linux/nospec.h>
> +#include <asm/futex.h>

Also please include the futex maintainers to make sure they are fine
with this first usage of <asm/futex.h> outside of kernel/futex.c.


> +static int set_bit_to_user(int nr, u32 __user *addr)
>  {
>  	unsigned long log = (unsigned long)addr;
>  	struct page *page;
> +	u32 old_log;
>  	int r;
>  
>  	r = get_user_pages_fast(log, 1, 1, &page);
>  	if (r < 0)
>  		return r;
>  	BUG_ON(r != 1);
> +
> +	r = futex_atomic_cmpxchg_inatomic(&old_log, addr, 0, 0);
> +	if (r < 0)
> +		return r;
> +
> +	old_log |= 1 << nr;
> +	r = put_user(old_log, addr);
> +	if (r < 0)
> +		return r;

And this just looks odd to me.  Why do we need the futex call to
replace a 0 value with 0?  Why does it still duplicate the
put_user?  This doesn't look like actually working code to me.

Also don't we need a pagefault_disable() around
futex_atomic_cmpxchg_inatomic?
Jason Wang May 8, 2019, 3:43 a.m. UTC | #2
On 2019/5/7 下午11:47, Christoph Hellwig wrote:
> On Mon, May 06, 2019 at 10:23:29PM -0400, Jason Wang wrote:
>> Note: there're archs (few non popular ones) that don't implement
>> futex helper, we can't log dirty pages. We can fix them on top or
>> simply disable LOG_ALL features of vhost.
>
> That means vhost now has to depend on HAVE_FUTEX_CMPXCHG to make
> sure we have a working implementation.

I found HAVE_FUTEX_CMPXCHG is not a must for arch that has the
implementation and futex does some kind of runtime detection like:

static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
	u32 curval;

	/*
	 * This will fail and we want it. Some arch implementations do
	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
	 * functionality. We want to know that before we call in any
	 * of the complex code paths. Also we want to prevent
	 * registration of robust lists in that case. NULL is
	 * guaranteed to fault and we get -EFAULT on functional
	 * implementation, the non-functional ones will return
	 * -ENOSYS.
	 */
	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
		futex_cmpxchg_enabled = 1;
#endif
}


>
>
>>  #include <linux/sched/signal.h>
>>  #include <linux/interval_tree_generic.h>
>>  #include <linux/nospec.h>
>> +#include <asm/futex.h>
>
> Also please include the futex maintainers to make sure they are fine
> with this first usage of <asm/futex.h> outside of kernel/futex.c.
>

Thanks for ccing them. Will do for next version.

If we decide to go this way, we probably need to move it to uaccess
for a more generic helper.

>
>> +static int set_bit_to_user(int nr, u32 __user *addr)
>>  {
>>  	unsigned long log = (unsigned long)addr;
>>  	struct page *page;
>> +	u32 old_log;
>>  	int r;
>>  
>>  	r = get_user_pages_fast(log, 1, 1, &page);
>>  	if (r < 0)
>>  		return r;
>>  	BUG_ON(r != 1);
>> +
>> +	r = futex_atomic_cmpxchg_inatomic(&old_log, addr, 0, 0);
>> +	if (r < 0)
>> +		return r;
>> +
>> +	old_log |= 1 << nr;
>> +	r = put_user(old_log, addr);
>> +	if (r < 0)
>> +		return r;
>
> And this just looks odd to me.  Why do we need the futex call to
> replace a 0 value with 0?  Why does it still duplicate the
> put_user?  This doesn't look like actually working code to me.

Yes, this is a bug. Should be something like:

static int set_bit_to_user(int nr, u32 __user *addr)
{
        unsigned long log = (unsigned long)addr;
        struct page *page;
        u32 old_log, new_log, l;
        int r;

        r = get_user_pages_fast(log, 1, 1, &page);
        if (r < 0)
                return r;
	BUG_ON(r != 1);

        do {
                r = get_user(old_log, addr);
                if (r < 0)
                        return r;
                new_log = old_log | (1 << nr);
		r = futex_atomic_cmpxchg_inatomic(&l, addr, old_log, new_log);
                if (r < 0)
                        return r;
        } while(l != new_log);

	set_page_dirty_lock(page);
        put_page(page);
        return 0;
}

>
> Also don't we need a pagefault_disable() around
> futex_atomic_cmpxchg_inatomic?

Since we don't want to deal with pagefault, so the page has been
pinned before futex_atomic_cmpxchg_inatomic().

Thanks
Michael S. Tsirkin May 8, 2019, 4:12 a.m. UTC | #3
On Mon, May 06, 2019 at 10:23:29PM -0400, Jason Wang wrote:
> Vhost log dirty pages directly to a userspace bitmap through GUP and
> kmap_atomic() since kernel doesn't have a set_bit_to_user()
> helper. This will cause issues for the arch that has virtually tagged
> caches. The way to fix is to keep using userspace virtual address.
> 
> Fortunately, futex has a cmpxchg to userspace memory helper
> futex_atomic_cmpxchg_inatomic(). So switch to use it to exchange the
> userspace bitmap with zero, set the bit and then write it back through
> put_user().
> 
> Note: there're archs (few non popular ones) that don't implement
> futex helper, we can't log dirty pages. We can fix them on top or
> simply disable LOG_ALL features of vhost.

Or implement futex_atomic_cmpxchg using kmap if they don't have
virtually tagged caches.

> 
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/vhost.c | 27 +++++++++++++++------------
>  1 file changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 351af88..9c94c41 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -31,6 +31,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/interval_tree_generic.h>
>  #include <linux/nospec.h>
> +#include <asm/futex.h>
>  
>  #include "vhost.h"
>  
> @@ -1692,25 +1693,27 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
>  
> -/* TODO: This is really inefficient.  We need something like get_user()
> - * (instruction directly accesses the data, with an exception table entry
> - * returning -EFAULT). See Documentation/x86/exception-tables.txt.
> - */
> -static int set_bit_to_user(int nr, void __user *addr)
> +static int set_bit_to_user(int nr, u32 __user *addr)
>  {
>  	unsigned long log = (unsigned long)addr;
>  	struct page *page;
> -	void *base;
> -	int bit = nr + (log % PAGE_SIZE) * 8;
> +	u32 old_log;
>  	int r;
>  
>  	r = get_user_pages_fast(log, 1, 1, &page);
>  	if (r < 0)
>  		return r;
>  	BUG_ON(r != 1);
> -	base = kmap_atomic(page);
> -	set_bit(bit, base);
> -	kunmap_atomic(base);
> +
> +	r = futex_atomic_cmpxchg_inatomic(&old_log, addr, 0, 0);
> +	if (r < 0)
> +		return r;

So I think this is a great idea!

However one issue here is that futex_atomic_cmpxchg_inatomic will fail if the
page is swapped out. I suspect we need a variant that blocks the thread
instead.

> +
> +	old_log |= 1 << nr;
> +	r = put_user(old_log, addr);
> +	if (r < 0)
> +		return r;
> +
>  	set_page_dirty_lock(page);
>  	put_page(page);
>  	return 0;
> @@ -1727,8 +1730,8 @@ static int log_write(void __user *log_base,
>  	write_length += write_address % VHOST_PAGE_SIZE;
>  	for (;;) {
>  		u64 base = (u64)(unsigned long)log_base;
> -		u64 log = base + write_page / 8;
> -		int bit = write_page % 8;
> +		u64 log = base + write_page / 32;
> +		int bit = write_page % 32;
>  		if ((u64)(unsigned long)log != log)
>  			return -EFAULT;
>  		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
> -- 
> 1.8.3.1
Jason Wang May 8, 2019, 4:28 a.m. UTC | #4
On 2019/5/8 下午12:12, Michael S. Tsirkin wrote:
> On Mon, May 06, 2019 at 10:23:29PM -0400, Jason Wang wrote:
>> Vhost log dirty pages directly to a userspace bitmap through GUP and
>> kmap_atomic() since kernel doesn't have a set_bit_to_user()
>> helper. This will cause issues for the arch that has virtually tagged
>> caches. The way to fix is to keep using userspace virtual address.
>>
>> Fortunately, futex has a cmpxchg to userspace memory helper
>> futex_atomic_cmpxchg_inatomic(). So switch to use it to exchange the
>> userspace bitmap with zero, set the bit and then write it back through
>> put_user().
>>
>> Note: there're archs (few non popular ones) that don't implement
>> futex helper, we can't log dirty pages. We can fix them on top or
>> simply disable LOG_ALL features of vhost.
> Or implement futex_atomic_cmpxchg using kmap if they don't have
> virtually tagged caches.


Yes, this might work.


>
>> Cc: Christoph Hellwig <hch@infradead.org>
>> Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
>> Cc: Andrea Arcangeli <aarcange@redhat.com>
>> Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   drivers/vhost/vhost.c | 27 +++++++++++++++------------
>>   1 file changed, 15 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
>> index 351af88..9c94c41 100644
>> --- a/drivers/vhost/vhost.c
>> +++ b/drivers/vhost/vhost.c
>> @@ -31,6 +31,7 @@
>>   #include <linux/sched/signal.h>
>>   #include <linux/interval_tree_generic.h>
>>   #include <linux/nospec.h>
>> +#include <asm/futex.h>
>>   
>>   #include "vhost.h"
>>   
>> @@ -1692,25 +1693,27 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
>>   }
>>   EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
>>   
>> -/* TODO: This is really inefficient.  We need something like get_user()
>> - * (instruction directly accesses the data, with an exception table entry
>> - * returning -EFAULT). See Documentation/x86/exception-tables.txt.
>> - */
>> -static int set_bit_to_user(int nr, void __user *addr)
>> +static int set_bit_to_user(int nr, u32 __user *addr)
>>   {
>>   	unsigned long log = (unsigned long)addr;
>>   	struct page *page;
>> -	void *base;
>> -	int bit = nr + (log % PAGE_SIZE) * 8;
>> +	u32 old_log;
>>   	int r;
>>   
>>   	r = get_user_pages_fast(log, 1, 1, &page);
>>   	if (r < 0)
>>   		return r;
>>   	BUG_ON(r != 1);
>> -	base = kmap_atomic(page);
>> -	set_bit(bit, base);
>> -	kunmap_atomic(base);
>> +
>> +	r = futex_atomic_cmpxchg_inatomic(&old_log, addr, 0, 0);
>> +	if (r < 0)
>> +		return r;
> So I think this is a great idea!
>
> However one issue here is that futex_atomic_cmpxchg_inatomic will fail if the
> page is swapped out. I suspect we need a variant that blocks the thread
> instead.


I guess not since the patch still try to pin the page before.

Thanks


>
>> +
>> +	old_log |= 1 << nr;
>> +	r = put_user(old_log, addr);
>> +	if (r < 0)
>> +		return r;
>> +
>>   	set_page_dirty_lock(page);
>>   	put_page(page);
>>   	return 0;
>> @@ -1727,8 +1730,8 @@ static int log_write(void __user *log_base,
>>   	write_length += write_address % VHOST_PAGE_SIZE;
>>   	for (;;) {
>>   		u64 base = (u64)(unsigned long)log_base;
>> -		u64 log = base + write_page / 8;
>> -		int bit = write_page % 8;
>> +		u64 log = base + write_page / 32;
>> +		int bit = write_page % 32;
>>   		if ((u64)(unsigned long)log != log)
>>   			return -EFAULT;
>>   		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
>> -- 
>> 1.8.3.1
diff mbox series

Patch

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 351af88..9c94c41 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -31,6 +31,7 @@ 
 #include <linux/sched/signal.h>
 #include <linux/interval_tree_generic.h>
 #include <linux/nospec.h>
+#include <asm/futex.h>
 
 #include "vhost.h"
 
@@ -1692,25 +1693,27 @@  long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
 
-/* TODO: This is really inefficient.  We need something like get_user()
- * (instruction directly accesses the data, with an exception table entry
- * returning -EFAULT). See Documentation/x86/exception-tables.txt.
- */
-static int set_bit_to_user(int nr, void __user *addr)
+static int set_bit_to_user(int nr, u32 __user *addr)
 {
 	unsigned long log = (unsigned long)addr;
 	struct page *page;
-	void *base;
-	int bit = nr + (log % PAGE_SIZE) * 8;
+	u32 old_log;
 	int r;
 
 	r = get_user_pages_fast(log, 1, 1, &page);
 	if (r < 0)
 		return r;
 	BUG_ON(r != 1);
-	base = kmap_atomic(page);
-	set_bit(bit, base);
-	kunmap_atomic(base);
+
+	r = futex_atomic_cmpxchg_inatomic(&old_log, addr, 0, 0);
+	if (r < 0)
+		return r;
+
+	old_log |= 1 << nr;
+	r = put_user(old_log, addr);
+	if (r < 0)
+		return r;
+
 	set_page_dirty_lock(page);
 	put_page(page);
 	return 0;
@@ -1727,8 +1730,8 @@  static int log_write(void __user *log_base,
 	write_length += write_address % VHOST_PAGE_SIZE;
 	for (;;) {
 		u64 base = (u64)(unsigned long)log_base;
-		u64 log = base + write_page / 8;
-		int bit = write_page % 8;
+		u64 log = base + write_page / 32;
+		int bit = write_page % 32;
 		if ((u64)(unsigned long)log != log)
 			return -EFAULT;
 		r = set_bit_to_user(bit, (void __user *)(unsigned long)log);