[V9fs-developer,1/2] 9p: v9fs add writepages.

Message ID 1476120280-30873-1-git-send-email-edward.shishkin@gmail.com (mailing list archive)
State Superseded, archived

Commit Message

Edward Shishkin Oct. 10, 2016, 5:24 p.m. UTC
Add a v9fs private ->writepages() method of address_space
operations for merging pages into long 9p messages.
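
For example, with 4 KiB pages and msize = 64 KiB, up to 16 dirty pages
with adjacent indices are copied into a single preallocated buffer and
sent together, instead of issuing one write request per page.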

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
---
 fs/9p/v9fs.c      |  46 +++++++
 fs/9p/v9fs.h      |  22 +++-
 fs/9p/vfs_addr.c  | 357 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/vfs_super.c |   8 +-
 4 files changed, 431 insertions(+), 2 deletions(-)

Comments

Alexander Graf Oct. 25, 2016, 2:01 p.m. UTC | #1
On 10/10/2016 07:24 PM, Edward Shishkin wrote:
> Add a v9fs private ->writepages() method of address_space
> operations for merging pages into long 9p messages.
> [...]
> +/**
> + * Helper function for managing 9pFS write requests.
> + * The main purpose of this function is to provide support for
> + * the coalescing of several pages into a single 9p message.
> + * This is similar to NFS's pagelist.
> + *
> + * Copy pages with adjacent indices to a buffer and send it to
> + * the server.

Why do you need to copy the pages? The transport below 9p - virtio in 
your case - has native scatter-gather support, so you don't need to copy 
anything, no?
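
A rough, untested sketch of that idea -- it assumes the transport
accepts ITER_BVEC iterators (p9_client_write() already takes a generic
iov_iter), and the send_pages() helper is purely hypothetical:

static int send_pages(struct page **pages, int nr, loff_t offset,
		      int len, struct flush_context *ctx)
{
	/*
	 * Hypothetical zero-copy variant of send_buffer(): describe
	 * the pages with a bio_vec array instead of memcpy()ing their
	 * data into fset->buf. Assumes nr full, contiguous pages;
	 * 'len' caps how many bytes the iterator exposes.
	 */
	struct bio_vec *bvec;
	struct iov_iter iter;
	int i, err = 0;
	struct v9fs_inode *v9inode = V9FS_I(ctx->mapping->host);

	bvec = kcalloc(nr, sizeof(*bvec), GFP_NOFS);
	if (!bvec)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		bvec[i].bv_page = pages[i];
		bvec[i].bv_offset = 0;
		bvec[i].bv_len = PAGE_SIZE;
	}
	iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, nr, len);
	p9_client_write(v9inode->writeback_fid, offset, &iter, &err);
	kfree(bvec);
	return err;
}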


Alex


Edward Shishkin Dec. 9, 2016, 6:43 p.m. UTC | #2
On 10/25/2016 04:01 PM, Alexander Graf wrote:
> On 10/10/2016 07:24 PM, Edward Shishkin wrote:
>> Add a v9fs private ->writepages() method of address_space
>> operations for merging pages into long 9p messages.
>> [...]
>> +/**
>> + * Helper function for managing 9pFS write requests.
>> + * The main purpose of this function is to provide support for
>> + * the coalescing of several pages into a single 9p message.
>> + * This is similar to NFS's pagelist.
>> + *
>> + * Copy pages with adjacent indices to a buffer and send it to
>> + * the server.
>
> Why do you need to copy the pages? The transport below 9p - virtio in 
> your case - has native scatter-gather support, so you don't need to 
> copy anything, no?
>


Perhaps we could avoid copying pages. However, that would mean modifying
the 9P file transfer protocol, which is not aware of pages. I suspect that
such an effort would require substantial sponsorship, and I am not sure it
would be interesting for Huawei.

Thanks,
Edward.


Patch

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 072e759..3b49daf 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -32,6 +32,7 @@ 
 #include <linux/parser.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -309,6 +310,49 @@  static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	return ret;
 }
 
+void put_flush_set(struct v9fs_flush_set *fset)
+{
+	if (!fset)
+		return;
+	if (fset->pages)
+		kfree(fset->pages);
+	if (fset->buf)
+		kfree(fset->buf);
+	kfree(fset);
+}
+
+/**
+ * Allocate and initialize a flush set
+ * Pre-conditions: valid msize is set
+ */
+int alloc_init_flush_set(struct v9fs_session_info *v9ses)
+{
+	int ret = -ENOMEM;
+	int num_pages;
+	struct v9fs_flush_set *fset = NULL;
+
+	num_pages = v9ses->clnt->msize >> PAGE_SHIFT;
+	if (num_pages < 2)
+		/* speedup impossible */
+		return 0;
+	fset = kzalloc(sizeof(*fset), GFP_KERNEL);
+	if (!fset)
+		goto error;
+	fset->num_pages = num_pages;
+	fset->pages = kzalloc(num_pages * sizeof(*fset->pages), GFP_KERNEL);
+	if (!fset->pages)
+		goto error;
+	fset->buf = kzalloc(num_pages << PAGE_SHIFT, GFP_USER);
+	if (!fset->buf)
+		goto error;
+	spin_lock_init(&(fset->lock));
+	v9ses->flush = fset;
+	return 0;
+ error:
+	put_flush_set(fset);
+	return ret;
+}
+
 /**
  * v9fs_session_init - initialize session
  * @v9ses: session information structure
@@ -444,6 +488,8 @@  void v9fs_session_close(struct v9fs_session_info *v9ses)
 	kfree(v9ses->uname);
 	kfree(v9ses->aname);
 
+	put_flush_set(v9ses->flush);
+
 	bdi_destroy(&v9ses->bdi);
 
 	spin_lock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6877050..d1092e4 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,6 +23,7 @@ 
 #ifndef FS_9P_V9FS_H
 #define FS_9P_V9FS_H
 
+#include <linux/kconfig.h>
 #include <linux/backing-dev.h>
 
 /**
@@ -69,6 +70,13 @@  enum p9_cache_modes {
 	CACHE_FSCACHE,
 };
 
+struct v9fs_flush_set {
+	struct page **pages;
+	int num_pages;
+	char *buf;
+	spinlock_t lock;
+};
+
 /**
  * struct v9fs_session_info - per-instance session information
  * @flags: session options of type &p9_session_flags
@@ -105,7 +113,7 @@  struct v9fs_session_info {
 	char *cachetag;
 	struct fscache_cookie *fscache;
 #endif
-
+	struct v9fs_flush_set *flush; /* flush set for writepages */
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
@@ -158,6 +166,8 @@  extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
 extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
 					      struct p9_fid *fid,
 					      struct super_block *sb, int new);
+extern int alloc_init_flush_set(struct v9fs_session_info *v9ses);
+extern void put_flush_set(struct v9fs_flush_set *fset);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -222,4 +232,14 @@  v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
 }
 
+static inline int spin_trylock_flush_set(struct v9fs_flush_set *fset)
+{
+	return spin_trylock(&(fset->lock));
+}
+
+static inline void spin_unlock_flush_set(struct v9fs_flush_set *fset)
+{
+	spin_unlock(&(fset->lock));
+}
+
 #endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad7..e871886 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -36,6 +36,7 @@ 
 #include <linux/uio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
+#include <trace/events/writeback.h>
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
@@ -209,6 +210,361 @@  static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
 	return retval;
 }
 
+static void redirty_pages_for_writeback(struct page **pages, int nr,
+					struct writeback_control *wbc)
+{
+	int i;
+	for (i = 0; i < nr; i++) {
+		lock_page(pages[i]);
+		redirty_page_for_writepage(wbc, pages[i]);
+		unlock_page(pages[i]);
+	}
+}
+
+static void set_pages_error(struct page **pages, int nr, int error)
+{
+	int i;
+	for (i = 0; i < nr; i++) {
+		lock_page(pages[i]);
+		SetPageError(pages[i]);
+		mapping_set_error(pages[i]->mapping, error);
+		unlock_page(pages[i]);
+	}
+}
+
+#define V9FS_WRITEPAGES_DEBUG   (0)
+
+struct flush_context {
+	struct writeback_control *wbc;
+	struct address_space *mapping;
+	struct v9fs_flush_set *fset;
+	pgoff_t done_index;
+	pgoff_t writeback_index;
+	pgoff_t index;
+	pgoff_t end; /* Inclusive */
+	const char *msg;
+	int cycled;
+	int range_whole;
+	int done;
+};
+
+/**
+ * Copy a page with file's data to buffer.
+ * Handle races with truncate, etc.
+ * Return number of copied bytes
+ *
+ * @page: page to copy data from;
+ * @page_nr: serial number of the page
+ */
+static int flush_page(struct page *page, int page_nr, struct flush_context *ctx)
+{
+	char *kdata;
+	loff_t isize;
+	int copied = 0;
+	struct writeback_control *wbc = ctx->wbc;
+	/*
+	 * At this point, the page may be truncated or
+	 * invalidated (changing page->mapping to NULL), or
+	 * even swizzled back from swapper_space to tmpfs file
+	 * mapping. However, page->index will not change
+	 * because we have a reference on the page.
+	 */
+	if (page->index > ctx->end) {
+		/*
+		 * can't be range_cyclic (1st pass) because
+		 * end == -1 in that case.
+		 */
+		ctx->done = 1;
+		ctx->msg = "page out of range";
+		goto exit;
+	}
+	ctx->done_index = page->index;
+	lock_page(page);
+	/*
+	 * Page truncated or invalidated. We can freely skip it
+	 * then, even for data integrity operations: the page
+	 * has disappeared concurrently, so there could be no
+	 * real expectation of this data integrity operation
+	 * even if there is now a new, dirty page at the same
+	 * pagecache address.
+	 */
+	if (unlikely(page->mapping != ctx->mapping)) {
+		unlock_page(page);
+		ctx->msg = "page truncated or invalidated";
+		goto exit;
+	}
+	if (!PageDirty(page)) {
+		/*
+		 * someone wrote it for us
+		 */
+		unlock_page(page);
+		ctx->msg = "page not dirty";
+		goto exit;
+	}
+	if (PageWriteback(page)) {
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+		else {
+			unlock_page(page);
+			ctx->msg = "page is writeback";
+			goto exit;
+		}
+	}
+	BUG_ON(PageWriteback(page));
+	if (!clear_page_dirty_for_io(page)) {
+		unlock_page(page);
+		ctx->msg = "failed to clear page dirty";
+		goto exit;
+	}
+	trace_wbc_writepage(wbc, inode_to_bdi(ctx->mapping->host));
+
+	set_page_writeback(page);
+	isize = i_size_read(ctx->mapping->host);
+	if (page->index == isize >> PAGE_SHIFT)
+		copied = isize & ~PAGE_MASK;
+	else
+		copied = PAGE_SIZE;
+	kdata = kmap_atomic(page);
+	memcpy(ctx->fset->buf + (page_nr << PAGE_SHIFT), kdata, copied);
+	kunmap_atomic(kdata);
+	end_page_writeback(page);
+
+	unlock_page(page);
+	/*
+	 * We stop writing back only if we are not doing
+	 * integrity sync. In case of integrity sync we have to
+	 * keep going until we have written all the pages
+	 * we tagged for writeback prior to entering this loop.
+	 */
+	if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
+		ctx->done = 1;
+ exit:
+	return copied;
+}
+
+static int send_buffer(off_t offset, int len, struct flush_context *ctx)
+{
+	int ret = 0;
+	struct kvec kvec;
+	struct iov_iter iter;
+	struct v9fs_inode *v9inode = V9FS_I(ctx->mapping->host);
+
+	kvec.iov_base = ctx->fset->buf;
+	kvec.iov_len = len;
+	iov_iter_kvec(&iter, WRITE | ITER_KVEC, &kvec, 1, len);
+	BUG_ON(!v9inode->writeback_fid);
+
+	p9_client_write(v9inode->writeback_fid, offset, &iter, &ret);
+	return ret;
+}
+
+/**
+ * Helper function for managing 9pFS write requests.
+ * The main purpose of this function is to provide support for
+ * the coalescing of several pages into a single 9p message.
+ * This is similar to NFS's pagelist.
+ *
+ * Copy pages with adjacent indices to a buffer and send it to
+ * the server.
+ *
+ * @pages: array of pages with ascending indices;
+ * @nr_pages: number of pages in the array;
+ */
+static int flush_pages(struct page **pages, int nr_pages,
+		       struct flush_context *ctx)
+{
+	int ret;
+	int pos = 0;
+	int iter_pos;
+	int iter_nrpages;
+	pgoff_t iter_page_idx;
+
+	while (pos < nr_pages) {
+
+		int i;
+		int iter_len = 0;
+		struct page *page;
+
+		iter_pos = pos;
+		iter_nrpages = 0;
+		iter_page_idx = pages[pos]->index;
+
+		for (i = 0; pos < nr_pages; i++) {
+			int from_page;
+
+			page = pages[pos];
+			if (page->index != iter_page_idx + i) {
+				/*
+				 * Hole in the indices, so further
+				 * coalescing is impossible.
+				 * Try to send what we have accumulated.
+				 * This page will be processed in the
+				 * next iteration.
+				 */
+				goto iter_send;
+			}
+			from_page = flush_page(page, i, ctx);
+
+			iter_len += from_page;
+			iter_nrpages++;
+			pos++;
+
+			if (from_page != PAGE_SIZE) {
+				/*
+				 * Not full page was flushed,
+				 * further coalesce impossible.
+				 * Try to send what we have accumulated.
+				 */
+#if V9FS_WRITEPAGES_DEBUG
+				if (from_page == 0)
+				    printk("9p: page %lu is not flushed (%s)\n",
+					   page->index, ctx->msg);
+#endif
+				goto iter_send;
+			}
+		}
+	iter_send:
+		if (iter_len == 0)
+			/*
+			 * Nothing to send
+			 */
+			goto next_iter;
+		ret = send_buffer(iter_page_idx << PAGE_SHIFT,
+				  iter_len, ctx);
+		if (ret == -EAGAIN) {
+			redirty_pages_for_writeback(pages + iter_pos,
+						    iter_nrpages, ctx->wbc);
+			ret = 0;
+		} else if (ret < 0) {
+			/*
+			 * Something bad happened.
+			 * done_index is set past this chunk,
+			 * so media errors will not choke
+			 * background writeout for the entire
+			 * file.
+			 */
+			printk("9p: send_buffer failed (%d)\n", ret);
+
+			set_pages_error(pages + iter_pos, iter_nrpages, ret);
+			ctx->done_index =
+				pages[iter_pos + iter_nrpages - 1]->index + 1;
+			ctx->done = 1;
+			return ret;
+		} else
+			ret = 0;
+	next_iter:
+		if (ctx->done)
+			return ret;
+	}
+	return 0;
+}
+
+static void init_flush_context(struct flush_context *ctx,
+			       struct address_space *mapping,
+			       struct writeback_control *wbc,
+			       struct v9fs_flush_set *fset)
+{
+	ctx->wbc = wbc;
+	ctx->mapping = mapping;
+	ctx->fset = fset;
+	ctx->done = 0;
+	ctx->range_whole = 0;
+
+	if (wbc->range_cyclic) {
+		ctx->writeback_index = mapping->writeback_index;
+		ctx->index = ctx->writeback_index;
+		if (ctx->index == 0)
+			ctx->cycled = 1;
+		else
+			ctx->cycled = 0;
+		ctx->end = -1;
+	} else {
+		ctx->index = wbc->range_start >> PAGE_SHIFT;
+		ctx->end = wbc->range_end >> PAGE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			ctx->range_whole = 1;
+		ctx->cycled = 1; /* ignore range_cyclic tests */
+	}
+}
+
+/**
+ * Pre-condition: flush set is locked
+ */
+static int v9fs_writepages_fastpath(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct v9fs_flush_set *fset)
+{
+	int ret = 0;
+	int tag;
+	int nr_pages;
+	struct page **pages = fset->pages;
+	struct flush_context ctx;
+
+	init_flush_context(&ctx, mapping, wbc, fset);
+
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, ctx.index, ctx.end);
+
+	ctx.done_index = ctx.index;
+
+	while (!ctx.done && (ctx.index <= ctx.end)) {
+		int i;
+		nr_pages = find_get_pages_tag(mapping, &ctx.index, tag,
+					      1 + min(ctx.end - ctx.index,
+					      (pgoff_t)(fset->num_pages - 1)),
+					      pages);
+		if (nr_pages == 0)
+			break;
+
+		ret = flush_pages(pages, nr_pages, &ctx);
+		/*
+		 * unpin pages
+		 */
+		for (i = 0; i < nr_pages; i++)
+			put_page(pages[i]);
+		if (ret < 0)
+			break;
+		cond_resched();
+	}
+	if (!ctx.cycled && !ctx.done) {
+		/*
+		 * range_cyclic:
+		 * We hit the last page and there is more work
+		 * to be done: wrap back to the start of the file
+		 */
+		ctx.cycled = 1;
+		ctx.index = 0;
+		ctx.end = ctx.writeback_index - 1;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (ctx.range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = ctx.done_index;
+	return ret;
+}
+
+static int v9fs_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	int ret;
+	struct v9fs_flush_set *fset;
+
+	fset = v9fs_inode2v9ses(mapping->host)->flush;
+	if (!fset || !spin_trylock_flush_set(fset))
+		/*
+		 * do it by slow way
+		 */
+		return generic_writepages(mapping, wbc);
+
+	ret = v9fs_writepages_fastpath(mapping, wbc, fset);
+	spin_unlock_flush_set(fset);
+	return ret;
+}
+
 /**
  * v9fs_launder_page - Writeback a dirty page
  * Returns 0 on success.
@@ -342,6 +698,7 @@  const struct address_space_operations v9fs_addr_operations = {
 	.readpages = v9fs_vfs_readpages,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.writepage = v9fs_vfs_writepage,
+	.writepages = v9fs_writepages,
 	.write_begin = v9fs_write_begin,
 	.write_end = v9fs_write_end,
 	.releasepage = v9fs_release_page,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed86..c1f9af1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -140,8 +140,14 @@  static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	}
 	v9fs_fill_super(sb, v9ses, flags, data);
 
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		retval = alloc_init_flush_set(v9ses);
+		if (retval) {
+			/* failed to allocate the flush set */
+			goto release_sb;
+		}
 		sb->s_d_op = &v9fs_cached_dentry_operations;
+	}
 	else
 		sb->s_d_op = &v9fs_dentry_operations;