
[2/2] 9p: v9fs new readpages.

Message ID 1476120280-30873-2-git-send-email-edward.shishkin@gmail.com
State New, archived

Commit Message

Edward Shishkin Oct. 10, 2016, 5:24 p.m. UTC
Modify v9fs private ->readpages() method of address_space
operations for merging pages into long 9p messages.

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
---
 fs/9p/vfs_addr.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 155 insertions(+), 1 deletion(-)

Comments

Alexander Graf Oct. 25, 2016, 2:13 p.m. UTC | #1
On 10/10/2016 07:24 PM, Edward Shishkin wrote:
> Modify v9fs private ->readpages() method of address_space
> operations for merging pages into long 9p messages.
>
> Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
> ---
>   fs/9p/vfs_addr.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 155 insertions(+), 1 deletion(-)
>
> diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
> index e871886..4ad248e 100644
> --- a/fs/9p/vfs_addr.c
> +++ b/fs/9p/vfs_addr.c
> @@ -34,6 +34,7 @@
>   #include <linux/idr.h>
>   #include <linux/sched.h>
>   #include <linux/uio.h>
> +#include <linux/slab.h>
>   #include <net/9p/9p.h>
>   #include <net/9p/client.h>
>   #include <trace/events/writeback.h>
> @@ -99,6 +100,148 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
>   	return v9fs_fid_readpage(filp->private_data, page);
>   }
>   
> +/*
> + * Context for "fast readpages"
> + */
> +struct v9fs_readpages_ctx {
> +	struct file *filp;
> +	struct address_space *mapping;
> +	pgoff_t start_index; /* index of the first page with actual data */
> +	char *buf; /* buffer with actual data */
> +	int len; /* length of the actual data */
> +	int num_pages; /* maximal data chunk (in pages) that can be
> +			  passed per transmission */
> +};
> +
> +static int init_readpages_ctx(struct v9fs_readpages_ctx *ctx,
> +			      struct file *filp,
> +			      struct address_space *mapping,
> +			      int num_pages)
> +{
> +	memset(ctx, 0, sizeof(*ctx));
> +	ctx->buf = kmalloc(num_pages << PAGE_SHIFT, GFP_USER);

Doesn't this a) have a potential information leak to user space and b) 
allow user space to allocate large amounts of kernel memory? A limited 
page pool would probably be better here.
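
Something along these lines, as a rough sketch -- the pool size and the
helper name are made up, not taken from this patch set:

#include <linux/mempool.h>

#define V9FS_CHUNK_PAGES 32	/* assumed fixed chunk size */

static mempool_t *v9fs_chunk_pool;

/*
 * Created once (e.g. at session setup): keeps a minimum of four
 * preallocated chunk buffers, so the fast path never depends on a
 * large kmalloc() succeeding under memory pressure.
 */
static int v9fs_create_chunk_pool(void)
{
	v9fs_chunk_pool = mempool_create_kmalloc_pool(
				4, V9FS_CHUNK_PAGES << PAGE_SHIFT);
	return v9fs_chunk_pool ? 0 : -ENOMEM;
}

Buffers would then come from mempool_alloc(v9fs_chunk_pool, GFP_NOFS)
and go back via mempool_free().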

I also don't quite grasp yet what pattern you're actually optimizing. 
Basically you're doing implicit read-ahead on behalf of the reader, 
right? So why would that be faster in a random 4k read scenario? Also, 
have you compared tmpfs-hosted files to benchmark only the transmission 
path?

> +	if (!ctx->buf)
> +		return -ENOMEM;
> +	ctx->filp = filp;
> +	ctx->mapping = mapping;
> +	ctx->num_pages = num_pages;
> +	return 0;
> +}
> +
> +static void done_readpages_ctx(struct v9fs_readpages_ctx *ctx)
> +{
> +	kfree(ctx->buf);
> +}
> +
> +static int receive_buffer(struct file *filp,
> +			  char *buf,
> +			  off_t offset, /* offset in the file */
> +			  int len,
> +			  int *err)
> +{
> +	struct kvec kvec;
> +	struct iov_iter iter;
> +
> +	kvec.iov_base = buf;
> +	kvec.iov_len = len;
> +	iov_iter_kvec(&iter, READ | ITER_KVEC, &kvec, 1, len);
> +
> +	return p9_client_read(filp->private_data, offset, &iter, err);
> +}
> +
> +static int fast_filler(struct v9fs_readpages_ctx *ctx, struct page *page)
> +{
> +	int err;
> +	int ret = 0;
> +	char *kdata;
> +	int to_page;
> +	off_t off_in_buf;
> +	struct inode *inode = page->mapping->host;
> +
> +	BUG_ON(!PageLocked(page));
> +	/*
> +	 * first, validate the buffer
> +	 */
> +	if (page->index < ctx->start_index ||
> +	    ctx->start_index + ctx->num_pages < page->index) {
> +		/*
> +		 * No actual data in the buffer,
> +		 * so actualize it
> +		 */
> +		ret = receive_buffer(ctx->filp,
> +				     ctx->buf,
> +				     page_offset(page),
> +				     ctx->num_pages << PAGE_SHIFT,

Doesn't this potentially read beyond the end of the file?


Alex

> +				     &err);
> +		if (err) {
> +			printk("failed to receive buffer off=%llu (%d)\n",
> +			       (unsigned long long)page_offset(page),
> +			       err);
> +			ret = err;
> +			goto done;
> +		}
> +		ctx->start_index = page->index;
> +		ctx->len = ret;
> +		ret = 0;
> +	}
> +	/*
> +	 * fill the page with buffer's data
> +	 */
> +	off_in_buf = (page->index - ctx->start_index) << PAGE_SHIFT;
> +	if (off_in_buf >= ctx->len) {
> +		/*
> +		 * No actual data to fill the page with
> +		 */
> +		ret = -1;
> +		goto done;
> +	}
> +	to_page = ctx->len - off_in_buf;
> +	if (to_page >= PAGE_SIZE)
> +		to_page = PAGE_SIZE;
> +
> +	kdata = kmap_atomic(page);
> +	memcpy(kdata, ctx->buf + off_in_buf, to_page);
> +	memset(kdata + to_page, 0, PAGE_SIZE - to_page);
> +	kunmap_atomic(kdata);
> +
> +	flush_dcache_page(page);
> +	SetPageUptodate(page);
> +	v9fs_readpage_to_fscache(inode, page);
> + done:
> +	unlock_page(page);
> +	return ret;
> +}
> +
> +/**
> + * Try to read pages by groups. For every such group we issue only one
> + * read request to the server.
> + * @num_pages: maximal chunk of data (in pages) that can be passed per
> + * such request
> + */
> +static int v9fs_readpages_tryfast(struct file *filp,
> +				  struct address_space *mapping,
> +				  struct list_head *pages,
> +				  int num_pages)
> +{
> +	int ret;
> +	struct v9fs_readpages_ctx ctx;
> +
> +	ret = init_readpages_ctx(&ctx, filp, mapping, num_pages);
> +	if (ret)
> +		/*
> +		 * Can not allocate resources for the fast path,
> +		 * so do it by slow way
> +		 */
> +		return read_cache_pages(mapping, pages,
> +					(void *)v9fs_vfs_readpage, filp);
> +
> +	else
> +		ret = read_cache_pages(mapping, pages,
> +				       (void *)fast_filler, &ctx);
> +	done_readpages_ctx(&ctx);
> +	return ret;
> +}
> +
>   /**
>    * v9fs_vfs_readpages - read a set of pages from 9P
>    *
> @@ -114,6 +257,7 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
>   {
>   	int ret = 0;
>   	struct inode *inode;
> +	struct v9fs_flush_set *fset;
>   
>   	inode = mapping->host;
>   	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
> @@ -122,7 +266,17 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
>   	if (ret == 0)
>   		return ret;
>   
> -	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
> +	fset = v9fs_inode2v9ses(mapping->host)->flush;
> +	if (!fset)
> +		/*
> +		 * Do it by slow way
> +		 */
> +		ret = read_cache_pages(mapping, pages,
> +				       (void *)v9fs_vfs_readpage, filp);
> +	else
> +		ret = v9fs_readpages_tryfast(filp, mapping,
> +					     pages, fset->num_pages);
> +
>   	p9_debug(P9_DEBUG_VFS, "  = %d\n", ret);
>   	return ret;
>   }

Edward Shishkin Dec. 9, 2016, 6:42 p.m. UTC | #2
Hello Alexander,

Thank you for the comments.
Please find my answers below.


On 10/25/2016 04:13 PM, Alexander Graf wrote:
> On 10/10/2016 07:24 PM, Edward Shishkin wrote:
>> Modify v9fs private ->readpages() method of address_space
>> operations for merging pages into long 9p messages.
>>
>> Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
>> ---
>>   fs/9p/vfs_addr.c | 156 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 155 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
>> index e871886..4ad248e 100644
>> --- a/fs/9p/vfs_addr.c
>> +++ b/fs/9p/vfs_addr.c
>> @@ -34,6 +34,7 @@
>>   #include <linux/idr.h>
>>   #include <linux/sched.h>
>>   #include <linux/uio.h>
>> +#include <linux/slab.h>
>>   #include <net/9p/9p.h>
>>   #include <net/9p/client.h>
>>   #include <trace/events/writeback.h>
>> @@ -99,6 +100,148 @@ static int v9fs_vfs_readpage(struct file *filp, 
>> struct page *page)
>>       return v9fs_fid_readpage(filp->private_data, page);
>>   }
>>   +/*
>> + * Context for "fast readpages"
>> + */
>> +struct v9fs_readpages_ctx {
>> +    struct file *filp;
>> +    struct address_space *mapping;
>> +    pgoff_t start_index; /* index of the first page with actual data */
>> +    char *buf; /* buffer with actual data */
>> +    int len; /* length of the actual data */
>> +    int num_pages; /* maximal data chunk (in pages) that can be
>> +              passed per transmission */
>> +};
>> +
>> +static int init_readpages_ctx(struct v9fs_readpages_ctx *ctx,
>> +                  struct file *filp,
>> +                  struct address_space *mapping,
>> +                  int num_pages)
>> +{
>> +    memset(ctx, 0, sizeof(*ctx));
>> +    ctx->buf = kmalloc(num_pages << PAGE_SHIFT, GFP_USER);
>
> Doesn't this a) have a potential information leak to user space


Yes, allocation with such a flag is a mistake. It will be fixed in v2.
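
Roughly like this, as a sketch (not the actual v2 code):

	/*
	 * GFP_KERNEL is the proper context for this allocation, and
	 * kzalloc() returns the buffer already zeroed, so stale
	 * kernel memory can not reach the page cache even if the
	 * length accounting ever goes wrong.
	 */
	ctx->buf = kzalloc((size_t)num_pages << PAGE_SHIFT, GFP_KERNEL);
	if (!ctx->buf)
		return -ENOMEM;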


> and b) allow user space to allocate large amounts of kernel memory?


Yes, I definitely missed a sanity check for num_pages. It will be fixed
in v2.
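
It could be as simple as this sketch; V9FS_MAX_CHUNK_PAGES is a made-up
limit, not a value from this patch set:

	#include <linux/kernel.h>	/* clamp() */

	#define V9FS_MAX_CHUNK_PAGES	32

	/*
	 * Bound the per-request chunk so that a pathological
	 * num_pages can not force arbitrarily large kernel
	 * allocations.
	 */
	num_pages = clamp(num_pages, 1, V9FS_MAX_CHUNK_PAGES);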


> A limited page pool would probably be better here.
>
> I also don't quite grasp yet what pattern you're actually optimizing.


Reading/writing a large file in a sequential or random manner.


> Basically you're doing implicit read-ahead on behalf of the reader, right?


Yes. As you can see, we always read ctx->num_pages pages per request,
so a single 9p message can populate a whole group of pages from
ctx->buf.


> So why would that be faster in a random 4k read scenario?


You mean 41 MB/s vs 64 MB/s?
With such read-ahead it's more likely that an up-to-date page will
already be present in the page cache. So, why not? After all, our
speedup for random 4K reads is not dramatic.


> Also, have you compared tmpfs-hosted files to benchmark only the 
> transmission path?


No, I haven't. Only the I/O speedup was a concern.
I can obtain such numbers if that would be interesting.


>
>> +    if (!ctx->buf)
>> +        return -ENOMEM;
>> +    ctx->filp = filp;
>> +    ctx->mapping = mapping;
>> +    ctx->num_pages = num_pages;
>> +    return 0;
>> +}
>> +
>> +static void done_readpages_ctx(struct v9fs_readpages_ctx *ctx)
>> +{
>> +    kfree(ctx->buf);
>> +}
>> +
>> +static int receive_buffer(struct file *filp,
>> +              char *buf,
>> +              off_t offset, /* offset in the file */
>> +              int len,
>> +              int *err)
>> +{
>> +    struct kvec kvec;
>> +    struct iov_iter iter;
>> +
>> +    kvec.iov_base = buf;
>> +    kvec.iov_len = len;
>> +    iov_iter_kvec(&iter, READ | ITER_KVEC, &kvec, 1, len);
>> +
>> +    return p9_client_read(filp->private_data, offset, &iter, err);
>> +}
>> +
>> +static int fast_filler(struct v9fs_readpages_ctx *ctx, struct page 
>> *page)
>> +{
>> +    int err;
>> +    int ret = 0;
>> +    char *kdata;
>> +    int to_page;
>> +    off_t off_in_buf;
>> +    struct inode *inode = page->mapping->host;
>> +
>> +    BUG_ON(!PageLocked(page));
>> +    /*
>> +     * first, validate the buffer
>> +     */
>> +    if (page->index < ctx->start_index ||
>> +        ctx->start_index + ctx->num_pages < page->index) {
>> +        /*
>> +         * No actual data in the buffer,
>> +         * so actualize it
>> +         */
>> +        ret = receive_buffer(ctx->filp,
>> +                     ctx->buf,
>> +                     page_offset(page),
>> +                     ctx->num_pages << PAGE_SHIFT,
>
> Doesn't this potentially read beyond the end of the file?


POSIX doesn't prohibit reading beyond the end of a file. The only
requirement is that the part past EOF be filled with zeros.
Full pages of that zero tail are filled with zeros from the buffer:
here we rely on the local file system on the host. Partial pages are
filled with zeros by us at the place I have marked with (*) below.
For example, with 4K pages and a 10 KB file, receive_buffer() returns
10240 bytes; the page at index 2 copies 2048 bytes from the buffer and
the memset at (*) zeroes the remaining 2048.
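
(If we ever wanted to avoid issuing reads past EOF altogether, the
request length could be trimmed to the file size. A hypothetical
variant, not what this patch does:)

	loff_t isize = i_size_read(inode);
	loff_t off = page_offset(page);
	size_t len = (size_t)ctx->num_pages << PAGE_SHIFT;

	/*
	 * Trim the request to EOF; the page itself lies below
	 * i_size, since the readahead window is clamped to the
	 * file size.
	 */
	if (off + len > isize)
		len = isize - off;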

Thanks,
Edward.

>
> Alex
>
>> +                     &err);
>> +        if (err) {
>> +            printk("failed to receive buffer off=%llu (%d)\n",
>> +                   (unsigned long long)page_offset(page),
>> +                   err);
>> +            ret = err;
>> +            goto done;
>> +        }
>> +        ctx->start_index = page->index;
>> +        ctx->len = ret;
>> +        ret = 0;
>> +    }
>> +    /*
>> +     * fill the page with buffer's data
>> +     */
>> +    off_in_buf = (page->index - ctx->start_index) << PAGE_SHIFT;
>> +    if (off_in_buf >= ctx->len) {
>> +        /*
>> +         * No actual data to fill the page with
>> +         */
>> +        ret = -1;
>> +        goto done;
>> +    }
>> +    to_page = ctx->len - off_in_buf;
>> +    if (to_page >= PAGE_SIZE)
>> +        to_page = PAGE_SIZE;
>> +
>> +    kdata = kmap_atomic(page);
>> +    memcpy(kdata, ctx->buf + off_in_buf, to_page);
>> +    memset(kdata + to_page, 0, PAGE_SIZE - to_page);

(*)

>> +    kunmap_atomic(kdata);
>> +
>> +    flush_dcache_page(page);
>> +    SetPageUptodate(page);
>> +    v9fs_readpage_to_fscache(inode, page);
>> + done:
>> +    unlock_page(page);
>> +    return ret;
>> +}
>> +
>> +/**
>> + * Try to read pages by groups. For every such group we issue only one
>> + * read request to the server.
>> + * @num_pages: maximal chunk of data (in pages) that can be passed per
>> + * such request
>> + */
>> +static int v9fs_readpages_tryfast(struct file *filp,
>> +                  struct address_space *mapping,
>> +                  struct list_head *pages,
>> +                  int num_pages)
>> +{
>> +    int ret;
>> +    struct v9fs_readpages_ctx ctx;
>> +
>> +    ret = init_readpages_ctx(&ctx, filp, mapping, num_pages);
>> +    if (ret)
>> +        /*
>> +         * Can not allocate resources for the fast path,
>> +         * so do it by slow way
>> +         */
>> +        return read_cache_pages(mapping, pages,
>> +                    (void *)v9fs_vfs_readpage, filp);
>> +
>> +    else
>> +        ret = read_cache_pages(mapping, pages,
>> +                       (void *)fast_filler, &ctx);
>> +    done_readpages_ctx(&ctx);
>> +    return ret;
>> +}
>> +
>>   /**
>>    * v9fs_vfs_readpages - read a set of pages from 9P
>>    *
>> @@ -114,6 +257,7 @@ static int v9fs_vfs_readpages(struct file *filp, 
>> struct address_space *mapping,
>>   {
>>       int ret = 0;
>>       struct inode *inode;
>> +    struct v9fs_flush_set *fset;
>>         inode = mapping->host;
>>       p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
>> @@ -122,7 +266,17 @@ static int v9fs_vfs_readpages(struct file *filp, 
>> struct address_space *mapping,
>>       if (ret == 0)
>>           return ret;
>>   -    ret = read_cache_pages(mapping, pages, (void 
>> *)v9fs_vfs_readpage, filp);
>> +    fset = v9fs_inode2v9ses(mapping->host)->flush;
>> +    if (!fset)
>> +        /*
>> +         * Do it by slow way
>> +         */
>> +        ret = read_cache_pages(mapping, pages,
>> +                       (void *)v9fs_vfs_readpage, filp);
>> +    else
>> +        ret = v9fs_readpages_tryfast(filp, mapping,
>> +                         pages, fset->num_pages);
>> +
>>       p9_debug(P9_DEBUG_VFS, "  = %d\n", ret);
>>       return ret;
>>   }
>

Patch

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e871886..4ad248e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -34,6 +34,7 @@ 
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <trace/events/writeback.h>
@@ -99,6 +100,148 @@  static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	return v9fs_fid_readpage(filp->private_data, page);
 }
 
+/*
+ * Context for "fast readpages"
+ */
+struct v9fs_readpages_ctx {
+	struct file *filp;
+	struct address_space *mapping;
+	pgoff_t start_index; /* index of the first page with actual data */
+	char *buf; /* buffer with actual data */
+	int len; /* length of the actual data */
+	int num_pages; /* maximal data chunk (in pages) that can be
+			  passed per transmission */
+};
+
+static int init_readpages_ctx(struct v9fs_readpages_ctx *ctx,
+			      struct file *filp,
+			      struct address_space *mapping,
+			      int num_pages)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->buf = kmalloc(num_pages << PAGE_SHIFT, GFP_USER);
+	if (!ctx->buf)
+		return -ENOMEM;
+	ctx->filp = filp;
+	ctx->mapping = mapping;
+	ctx->num_pages = num_pages;
+	return 0;
+}
+
+static void done_readpages_ctx(struct v9fs_readpages_ctx *ctx)
+{
+	kfree(ctx->buf);
+}
+
+static int receive_buffer(struct file *filp,
+			  char *buf,
+			  off_t offset, /* offset in the file */
+			  int len,
+			  int *err)
+{
+	struct kvec kvec;
+	struct iov_iter iter;
+
+	kvec.iov_base = buf;
+	kvec.iov_len = len;
+	iov_iter_kvec(&iter, READ | ITER_KVEC, &kvec, 1, len);
+
+	return p9_client_read(filp->private_data, offset, &iter, err);
+}
+
+static int fast_filler(struct v9fs_readpages_ctx *ctx, struct page *page)
+{
+	int err;
+	int ret = 0;
+	char *kdata;
+	int to_page;
+	off_t off_in_buf;
+	struct inode *inode = page->mapping->host;
+
+	BUG_ON(!PageLocked(page));
+	/*
+	 * first, validate the buffer
+	 */
+	if (page->index < ctx->start_index ||
+	    ctx->start_index + ctx->num_pages < page->index) {
+		/*
+		 * No actual data in the buffer,
+		 * so actualize it
+		 */
+		ret = receive_buffer(ctx->filp,
+				     ctx->buf,
+				     page_offset(page),
+				     ctx->num_pages << PAGE_SHIFT,
+				     &err);
+		if (err) {
+			printk("failed to receive buffer off=%llu (%d)\n",
+			       (unsigned long long)page_offset(page),
+			       err);
+			ret = err;
+			goto done;
+		}
+		ctx->start_index = page->index;
+		ctx->len = ret;
+		ret = 0;
+	}
+	/*
+	 * fill the page with buffer's data
+	 */
+	off_in_buf = (page->index - ctx->start_index) << PAGE_SHIFT;
+	if (off_in_buf >= ctx->len) {
+		/*
+		 * No actual data to fill the page with
+		 */
+		ret = -1;
+		goto done;
+	}
+	to_page = ctx->len - off_in_buf;
+	if (to_page >= PAGE_SIZE)
+		to_page = PAGE_SIZE;
+
+	kdata = kmap_atomic(page);
+	memcpy(kdata, ctx->buf + off_in_buf, to_page);
+	memset(kdata + to_page, 0, PAGE_SIZE - to_page);
+	kunmap_atomic(kdata);
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	v9fs_readpage_to_fscache(inode, page);
+ done:
+	unlock_page(page);
+	return ret;
+}
+
+/**
+ * Try to read pages by groups. For every such group we issue only one
+ * read request to the server.
+ * @num_pages: maximal chunk of data (in pages) that can be passed per
+ * such request
+ */
+static int v9fs_readpages_tryfast(struct file *filp,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  int num_pages)
+{
+	int ret;
+	struct v9fs_readpages_ctx ctx;
+
+	ret = init_readpages_ctx(&ctx, filp, mapping, num_pages);
+	if (ret)
+		/*
+		 * Can not allocate resources for the fast path,
+		 * so do it by slow way
+		 */
+		return read_cache_pages(mapping, pages,
+					(void *)v9fs_vfs_readpage, filp);
+
+	else
+		ret = read_cache_pages(mapping, pages,
+				       (void *)fast_filler, &ctx);
+	done_readpages_ctx(&ctx);
+	return ret;
+}
+
 /**
  * v9fs_vfs_readpages - read a set of pages from 9P
  *
@@ -114,6 +257,7 @@  static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
 {
 	int ret = 0;
 	struct inode *inode;
+	struct v9fs_flush_set *fset;
 
 	inode = mapping->host;
 	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
@@ -122,7 +266,17 @@  static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		return ret;
 
-	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	fset = v9fs_inode2v9ses(mapping->host)->flush;
+	if (!fset)
+		/*
+		 * Do it by slow way
+		 */
+		ret = read_cache_pages(mapping, pages,
+				       (void *)v9fs_vfs_readpage, filp);
+	else
+		ret = v9fs_readpages_tryfast(filp, mapping,
+					     pages, fset->num_pages);
+
 	p9_debug(P9_DEBUG_VFS, "  = %d\n", ret);
 	return ret;
 }