diff mbox

[1/2] 9p: v9fs add writepages.

Message ID 1476120280-30873-1-git-send-email-edward.shishkin@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Edward Shishkin Oct. 10, 2016, 5:24 p.m. UTC
Add a v9fs private ->writepages() method of address_space
operations for merging pages into long 9p messages.

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
---
 fs/9p/v9fs.c      |  46 +++++++
 fs/9p/v9fs.h      |  22 +++-
 fs/9p/vfs_addr.c  | 357 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/9p/vfs_super.c |   8 +-
 4 files changed, 431 insertions(+), 2 deletions(-)

Comments

Alexander Graf Oct. 25, 2016, 2:01 p.m. UTC | #1
On 10/10/2016 07:24 PM, Edward Shishkin wrote:
> Add a v9fs private ->writepages() method of address_space
> operations for merging pages into long 9p messages.
>
> Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
> ---
>   fs/9p/v9fs.c      |  46 +++++++
>   fs/9p/v9fs.h      |  22 +++-
>   fs/9p/vfs_addr.c  | 357 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   fs/9p/vfs_super.c |   8 +-
>   4 files changed, 431 insertions(+), 2 deletions(-)
>
> diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
> index 072e759..3b49daf 100644
> --- a/fs/9p/v9fs.c
> +++ b/fs/9p/v9fs.c
> @@ -32,6 +32,7 @@
>   #include <linux/parser.h>
>   #include <linux/idr.h>
>   #include <linux/slab.h>
> +#include <linux/pagemap.h>
>   #include <net/9p/9p.h>
>   #include <net/9p/client.h>
>   #include <net/9p/transport.h>
> @@ -309,6 +310,49 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
>   	return ret;
>   }
>   
> +void put_flush_set(struct v9fs_flush_set *fset)
> +{
> +	if (!fset)
> +		return;
> +	if (fset->pages)
> +		kfree(fset->pages);
> +	if (fset->buf)
> +		kfree(fset->buf);
> +	kfree(fset);
> +}
> +
> +/**
> + * Allocate and initialize flush set
> + * Pre-conditions: valid msize is set
> + */
> +int alloc_init_flush_set(struct v9fs_session_info *v9ses)
> +{
> +	int ret = -ENOMEM;
> +	int num_pages;
> +	struct v9fs_flush_set *fset = NULL;
> +
> +	num_pages = v9ses->clnt->msize >> PAGE_SHIFT;
> +	if (num_pages < 2)
> +		/* speedup impossible */
> +		return 0;
> +	fset = kzalloc(sizeof(*fset), GFP_KERNEL);
> +	if (!fset)
> +		goto error;
> +	fset->num_pages = num_pages;
> +	fset->pages = kzalloc(num_pages * sizeof(*fset->pages), GFP_KERNEL);
> +	if (!fset->pages)
> +		goto error;
> +	fset->buf = kzalloc(num_pages << PAGE_SHIFT, GFP_USER);
> +	if (!fset->buf)
> +		goto error;
> +	spin_lock_init(&(fset->lock));
> +	v9ses->flush = fset;
> +	return 0;
> + error:
> +	put_flush_set(fset);
> +	return ret;
> +}
> +
>   /**
>    * v9fs_session_init - initialize session
>    * @v9ses: session information structure
> @@ -444,6 +488,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
>   	kfree(v9ses->uname);
>   	kfree(v9ses->aname);
>   
> +	put_flush_set(v9ses->flush);
> +
>   	bdi_destroy(&v9ses->bdi);
>   
>   	spin_lock(&v9fs_sessionlist_lock);
> diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
> index 6877050..d1092e4 100644
> --- a/fs/9p/v9fs.h
> +++ b/fs/9p/v9fs.h
> @@ -23,6 +23,7 @@
>   #ifndef FS_9P_V9FS_H
>   #define FS_9P_V9FS_H
>   
> +#include <linux/kconfig.h>
>   #include <linux/backing-dev.h>
>   
>   /**
> @@ -69,6 +70,13 @@ enum p9_cache_modes {
>   	CACHE_FSCACHE,
>   };
>   
> +struct v9fs_flush_set {
> +        struct page **pages;
> +	int num_pages;
> +        char *buf;
> +	spinlock_t lock;
> +};
> +
>   /**
>    * struct v9fs_session_info - per-instance session information
>    * @flags: session options of type &p9_session_flags
> @@ -105,7 +113,7 @@ struct v9fs_session_info {
>   	char *cachetag;
>   	struct fscache_cookie *fscache;
>   #endif
> -
> +	struct v9fs_flush_set *flush; /* flush set for writepages */
>   	char *uname;		/* user name to mount as */
>   	char *aname;		/* name of remote hierarchy being mounted */
>   	unsigned int maxdata;	/* max data for client interface */
> @@ -158,6 +166,8 @@ extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
>   extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
>   					      struct p9_fid *fid,
>   					      struct super_block *sb, int new);
> +extern int alloc_init_flush_set(struct v9fs_session_info *v9ses);
> +extern void put_flush_set(struct v9fs_flush_set *fset);
>   
>   /* other default globals */
>   #define V9FS_PORT	564
> @@ -222,4 +232,14 @@ v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
>   		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
>   }
>   
> +static inline int spin_trylock_flush_set(struct v9fs_flush_set *fset)
> +{
> +	return spin_trylock(&(fset->lock));
> +}
> +
> +static inline void spin_unlock_flush_set(struct v9fs_flush_set *fset)
> +{
> +	spin_unlock(&(fset->lock));
> +}
> +
>   #endif
> diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
> index 6181ad7..e871886 100644
> --- a/fs/9p/vfs_addr.c
> +++ b/fs/9p/vfs_addr.c
> @@ -36,6 +36,7 @@
>   #include <linux/uio.h>
>   #include <net/9p/9p.h>
>   #include <net/9p/client.h>
> +#include <trace/events/writeback.h>
>   
>   #include "v9fs.h"
>   #include "v9fs_vfs.h"
> @@ -209,6 +210,361 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
>   	return retval;
>   }
>   
> +static void redirty_pages_for_writeback(struct page **pages, int nr,
> +					struct writeback_control *wbc)
> +{
> +	int i;
> +	for (i = 0; i < nr; i++) {
> +		lock_page(pages[i]);
> +		redirty_page_for_writepage(wbc, pages[i]);
> +		unlock_page(pages[i]);
> +	}
> +}
> +
> +static void set_pages_error(struct page **pages, int nr, int error)
> +{
> +	int i;
> +	for (i = 0; i < nr; i++) {
> +		lock_page(pages[i]);
> +		SetPageError(pages[i]);
> +		mapping_set_error(pages[i]->mapping, error);
> +		unlock_page(pages[i]);
> +	}
> +}
> +
> +#define V9FS_WRITEPAGES_DEBUG   (0)
> +
> +struct flush_context {
> +	struct writeback_control *wbc;
> +	struct address_space *mapping;
> +	struct v9fs_flush_set *fset;
> +	pgoff_t done_index;
> +	pgoff_t writeback_index;
> +	pgoff_t index;
> +	pgoff_t end; /* Inclusive */
> +	const char *msg;
> +	int cycled;
> +	int range_whole;
> +	int done;
> +};
> +
> +/**
> + * Copy a page with file's data to buffer.
> + * Handle races with truncate, etc.
> + * Return number of copied bytes
> + *
> + * @page: page to copy data from;
> + * @page_nr: serial number of the page
> + */
> +static int flush_page(struct page *page, int page_nr, struct flush_context *ctx)
> +{
> +	char *kdata;
> +	loff_t isize;
> +	int copied = 0;
> +	struct writeback_control *wbc = ctx->wbc;
> +	/*
> +	 * At this point, the page may be truncated or
> +	 * invalidated (changing page->mapping to NULL), or
> +	 * even swizzled back from swapper_space to tmpfs file
> +	 * mapping. However, page->index will not change
> +	 * because we have a reference on the page.
> +	 */
> +	if (page->index > ctx->end) {
> +		/*
> +		 * can't be range_cyclic (1st pass) because
> +		 * end == -1 in that case.
> +		 */
> +		ctx->done = 1;
> +		ctx->msg = "page out of range";
> +		goto exit;
> +	}
> +	ctx->done_index = page->index;
> +	lock_page(page);
> +	/*
> +	 * Page truncated or invalidated. We can freely skip it
> +	 * then, even for data integrity operations: the page
> +	 * has disappeared concurrently, so there could be no
> +	 * real expectation of this data integrity operation
> +	 * even if there is now a new, dirty page at the same
> +	 * pagecache address.
> +	 */
> +	if (unlikely(page->mapping != ctx->mapping)) {
> +		unlock_page(page);
> +		ctx->msg = "page truncated or invalidated";
> +		goto exit;
> +	}
> +	if (!PageDirty(page)) {
> +		/*
> +		 * someone wrote it for us
> +		 */
> +		unlock_page(page);
> +		ctx->msg = "page not dirty";
> +		goto exit;
> +	}
> +	if (PageWriteback(page)) {
> +		if (wbc->sync_mode != WB_SYNC_NONE)
> +			wait_on_page_writeback(page);
> +		else {
> +			unlock_page(page);
> +			ctx->msg = "page is writeback";
> +			goto exit;
> +		}
> +	}
> +	BUG_ON(PageWriteback(page));
> +	if (!clear_page_dirty_for_io(page)) {
> +		unlock_page(page);
> +		ctx->msg = "failed to clear page dirty";
> +		goto exit;
> +	}
> +	trace_wbc_writepage(wbc, inode_to_bdi(ctx->mapping->host));
> +
> +	set_page_writeback(page);
> +	isize = i_size_read(ctx->mapping->host);
> +	if (page->index == isize >> PAGE_SHIFT)
> +		copied = isize & ~PAGE_MASK;
> +	else
> +		copied = PAGE_SIZE;
> +	kdata = kmap_atomic(page);
> +	memcpy(ctx->fset->buf + (page_nr << PAGE_SHIFT), kdata, copied);
> +	kunmap_atomic(kdata);
> +	end_page_writeback(page);
> +
> +	unlock_page(page);
> +	/*
> +	 * We stop writing back only if we are not doing
> +	 * integrity sync. In case of integrity sync we have to
> +	 * keep going until we have written all the pages
> +	 * we tagged for writeback prior to entering this loop.
> +	 */
> +	if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
> +		ctx->done = 1;
> + exit:
> +	return copied;
> +}
> +
> +static int send_buffer(off_t offset, int len, struct flush_context *ctx)
> +{
> +	int ret = 0;
> +	struct kvec kvec;
> +	struct iov_iter iter;
> +	struct v9fs_inode *v9inode = V9FS_I(ctx->mapping->host);
> +
> +	kvec.iov_base = ctx->fset->buf;
> +	kvec.iov_len = len;
> +	iov_iter_kvec(&iter, WRITE | ITER_KVEC, &kvec, 1, len);
> +	BUG_ON(!v9inode->writeback_fid);
> +
> +	p9_client_write(v9inode->writeback_fid, offset, &iter, &ret);
> +	return ret;
> +}
> +
> +/**
> + * Helper function for managing 9pFS write requests.
> + * The main purpose of this function is to provide support for
> + * the coalescing of several pages into a single 9p message.
> + * This is similar to NFS's pagelist.
> + *
> + * Copy pages with adjacent indices to a buffer and send it to
> + * the server.

Why do you need to copy the pages? The transport below 9p - virtio in 
your case - has native scatter-gather support, so you don't need to copy 
anything, no?


Alex

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Edward Shishkin Dec. 9, 2016, 6:43 p.m. UTC | #2
On 10/25/2016 04:01 PM, Alexander Graf wrote:
> On 10/10/2016 07:24 PM, Edward Shishkin wrote:
>> Add a v9fs private ->writepages() method of address_space
>> operations for merging pages into long 9p messages.
>>
>> Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
>> ---
>>   fs/9p/v9fs.c      |  46 +++++++
>>   fs/9p/v9fs.h      |  22 +++-
>>   fs/9p/vfs_addr.c  | 357 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/9p/vfs_super.c |   8 +-
>>   4 files changed, 431 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
>> index 072e759..3b49daf 100644
>> --- a/fs/9p/v9fs.c
>> +++ b/fs/9p/v9fs.c
>> @@ -32,6 +32,7 @@
>>   #include <linux/parser.h>
>>   #include <linux/idr.h>
>>   #include <linux/slab.h>
>> +#include <linux/pagemap.h>
>>   #include <net/9p/9p.h>
>>   #include <net/9p/client.h>
>>   #include <net/9p/transport.h>
>> @@ -309,6 +310,49 @@ static int v9fs_parse_options(struct 
>> v9fs_session_info *v9ses, char *opts)
>>       return ret;
>>   }
>>   +void put_flush_set(struct v9fs_flush_set *fset)
>> +{
>> +    if (!fset)
>> +        return;
>> +    if (fset->pages)
>> +        kfree(fset->pages);
>> +    if (fset->buf)
>> +        kfree(fset->buf);
>> +    kfree(fset);
>> +}
>> +
>> +/**
>> + * Allocate and initialize flush set
>> + * Pre-conditions: valid msize is set
>> + */
>> +int alloc_init_flush_set(struct v9fs_session_info *v9ses)
>> +{
>> +    int ret = -ENOMEM;
>> +    int num_pages;
>> +    struct v9fs_flush_set *fset = NULL;
>> +
>> +    num_pages = v9ses->clnt->msize >> PAGE_SHIFT;
>> +    if (num_pages < 2)
>> +        /* speedup impossible */
>> +        return 0;
>> +    fset = kzalloc(sizeof(*fset), GFP_KERNEL);
>> +    if (!fset)
>> +        goto error;
>> +    fset->num_pages = num_pages;
>> +    fset->pages = kzalloc(num_pages * sizeof(*fset->pages), 
>> GFP_KERNEL);
>> +    if (!fset->pages)
>> +        goto error;
>> +    fset->buf = kzalloc(num_pages << PAGE_SHIFT, GFP_USER);
>> +    if (!fset->buf)
>> +        goto error;
>> +    spin_lock_init(&(fset->lock));
>> +    v9ses->flush = fset;
>> +    return 0;
>> + error:
>> +    put_flush_set(fset);
>> +    return ret;
>> +}
>> +
>>   /**
>>    * v9fs_session_init - initialize session
>>    * @v9ses: session information structure
>> @@ -444,6 +488,8 @@ void v9fs_session_close(struct v9fs_session_info 
>> *v9ses)
>>       kfree(v9ses->uname);
>>       kfree(v9ses->aname);
>>   +    put_flush_set(v9ses->flush);
>> +
>>       bdi_destroy(&v9ses->bdi);
>>         spin_lock(&v9fs_sessionlist_lock);
>> diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
>> index 6877050..d1092e4 100644
>> --- a/fs/9p/v9fs.h
>> +++ b/fs/9p/v9fs.h
>> @@ -23,6 +23,7 @@
>>   #ifndef FS_9P_V9FS_H
>>   #define FS_9P_V9FS_H
>>   +#include <linux/kconfig.h>
>>   #include <linux/backing-dev.h>
>>     /**
>> @@ -69,6 +70,13 @@ enum p9_cache_modes {
>>       CACHE_FSCACHE,
>>   };
>>   +struct v9fs_flush_set {
>> +        struct page **pages;
>> +    int num_pages;
>> +        char *buf;
>> +    spinlock_t lock;
>> +};
>> +
>>   /**
>>    * struct v9fs_session_info - per-instance session information
>>    * @flags: session options of type &p9_session_flags
>> @@ -105,7 +113,7 @@ struct v9fs_session_info {
>>       char *cachetag;
>>       struct fscache_cookie *fscache;
>>   #endif
>> -
>> +    struct v9fs_flush_set *flush; /* flush set for writepages */
>>       char *uname;        /* user name to mount as */
>>       char *aname;        /* name of remote hierarchy being mounted */
>>       unsigned int maxdata;    /* max data for client interface */
>> @@ -158,6 +166,8 @@ extern const struct inode_operations 
>> v9fs_symlink_inode_operations_dotl;
>>   extern struct inode *v9fs_inode_from_fid_dotl(struct 
>> v9fs_session_info *v9ses,
>>                             struct p9_fid *fid,
>>                             struct super_block *sb, int new);
>> +extern int alloc_init_flush_set(struct v9fs_session_info *v9ses);
>> +extern void put_flush_set(struct v9fs_flush_set *fset);
>>     /* other default globals */
>>   #define V9FS_PORT    564
>> @@ -222,4 +232,14 @@ v9fs_get_new_inode_from_fid(struct 
>> v9fs_session_info *v9ses, struct p9_fid *fid,
>>           return v9fs_inode_from_fid(v9ses, fid, sb, 1);
>>   }
>>   +static inline int spin_trylock_flush_set(struct v9fs_flush_set *fset)
>> +{
>> +    return spin_trylock(&(fset->lock));
>> +}
>> +
>> +static inline void spin_unlock_flush_set(struct v9fs_flush_set *fset)
>> +{
>> +    spin_unlock(&(fset->lock));
>> +}
>> +
>>   #endif
>> diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
>> index 6181ad7..e871886 100644
>> --- a/fs/9p/vfs_addr.c
>> +++ b/fs/9p/vfs_addr.c
>> @@ -36,6 +36,7 @@
>>   #include <linux/uio.h>
>>   #include <net/9p/9p.h>
>>   #include <net/9p/client.h>
>> +#include <trace/events/writeback.h>
>>     #include "v9fs.h"
>>   #include "v9fs_vfs.h"
>> @@ -209,6 +210,361 @@ static int v9fs_vfs_writepage(struct page 
>> *page, struct writeback_control *wbc)
>>       return retval;
>>   }
>>   +static void redirty_pages_for_writeback(struct page **pages, int nr,
>> +                    struct writeback_control *wbc)
>> +{
>> +    int i;
>> +    for (i = 0; i < nr; i++) {
>> +        lock_page(pages[i]);
>> +        redirty_page_for_writepage(wbc, pages[i]);
>> +        unlock_page(pages[i]);
>> +    }
>> +}
>> +
>> +static void set_pages_error(struct page **pages, int nr, int error)
>> +{
>> +    int i;
>> +    for (i = 0; i < nr; i++) {
>> +        lock_page(pages[i]);
>> +        SetPageError(pages[i]);
>> +        mapping_set_error(pages[i]->mapping, error);
>> +        unlock_page(pages[i]);
>> +    }
>> +}
>> +
>> +#define V9FS_WRITEPAGES_DEBUG   (0)
>> +
>> +struct flush_context {
>> +    struct writeback_control *wbc;
>> +    struct address_space *mapping;
>> +    struct v9fs_flush_set *fset;
>> +    pgoff_t done_index;
>> +    pgoff_t writeback_index;
>> +    pgoff_t index;
>> +    pgoff_t end; /* Inclusive */
>> +    const char *msg;
>> +    int cycled;
>> +    int range_whole;
>> +    int done;
>> +};
>> +
>> +/**
>> + * Copy a page with file's data to buffer.
>> + * Handle races with truncate, etc.
>> + * Return number of copied bytes
>> + *
>> + * @page: page to copy data from;
>> + * @page_nr: serial number of the page
>> + */
>> +static int flush_page(struct page *page, int page_nr, struct 
>> flush_context *ctx)
>> +{
>> +    char *kdata;
>> +    loff_t isize;
>> +    int copied = 0;
>> +    struct writeback_control *wbc = ctx->wbc;
>> +    /*
>> +     * At this point, the page may be truncated or
>> +     * invalidated (changing page->mapping to NULL), or
>> +     * even swizzled back from swapper_space to tmpfs file
>> +     * mapping. However, page->index will not change
>> +     * because we have a reference on the page.
>> +     */
>> +    if (page->index > ctx->end) {
>> +        /*
>> +         * can't be range_cyclic (1st pass) because
>> +         * end == -1 in that case.
>> +         */
>> +        ctx->done = 1;
>> +        ctx->msg = "page out of range";
>> +        goto exit;
>> +    }
>> +    ctx->done_index = page->index;
>> +    lock_page(page);
>> +    /*
>> +     * Page truncated or invalidated. We can freely skip it
>> +     * then, even for data integrity operations: the page
>> +     * has disappeared concurrently, so there could be no
>> +     * real expectation of this data integrity operation
>> +     * even if there is now a new, dirty page at the same
>> +     * pagecache address.
>> +     */
>> +    if (unlikely(page->mapping != ctx->mapping)) {
>> +        unlock_page(page);
>> +        ctx->msg = "page truncated or invalidated";
>> +        goto exit;
>> +    }
>> +    if (!PageDirty(page)) {
>> +        /*
>> +         * someone wrote it for us
>> +         */
>> +        unlock_page(page);
>> +        ctx->msg = "page not dirty";
>> +        goto exit;
>> +    }
>> +    if (PageWriteback(page)) {
>> +        if (wbc->sync_mode != WB_SYNC_NONE)
>> +            wait_on_page_writeback(page);
>> +        else {
>> +            unlock_page(page);
>> +            ctx->msg = "page is writeback";
>> +            goto exit;
>> +        }
>> +    }
>> +    BUG_ON(PageWriteback(page));
>> +    if (!clear_page_dirty_for_io(page)) {
>> +        unlock_page(page);
>> +        ctx->msg = "failed to clear page dirty";
>> +        goto exit;
>> +    }
>> +    trace_wbc_writepage(wbc, inode_to_bdi(ctx->mapping->host));
>> +
>> +    set_page_writeback(page);
>> +    isize = i_size_read(ctx->mapping->host);
>> +    if (page->index == isize >> PAGE_SHIFT)
>> +        copied = isize & ~PAGE_MASK;
>> +    else
>> +        copied = PAGE_SIZE;
>> +    kdata = kmap_atomic(page);
>> +    memcpy(ctx->fset->buf + (page_nr << PAGE_SHIFT), kdata, copied);
>> +    kunmap_atomic(kdata);
>> +    end_page_writeback(page);
>> +
>> +    unlock_page(page);
>> +    /*
>> +     * We stop writing back only if we are not doing
>> +     * integrity sync. In case of integrity sync we have to
>> +     * keep going until we have written all the pages
>> +     * we tagged for writeback prior to entering this loop.
>> +     */
>> +    if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
>> +        ctx->done = 1;
>> + exit:
>> +    return copied;
>> +}
>> +
>> +static int send_buffer(off_t offset, int len, struct flush_context 
>> *ctx)
>> +{
>> +    int ret = 0;
>> +    struct kvec kvec;
>> +    struct iov_iter iter;
>> +    struct v9fs_inode *v9inode = V9FS_I(ctx->mapping->host);
>> +
>> +    kvec.iov_base = ctx->fset->buf;
>> +    kvec.iov_len = len;
>> +    iov_iter_kvec(&iter, WRITE | ITER_KVEC, &kvec, 1, len);
>> +    BUG_ON(!v9inode->writeback_fid);
>> +
>> +    p9_client_write(v9inode->writeback_fid, offset, &iter, &ret);
>> +    return ret;
>> +}
>> +
>> +/**
>> + * Helper function for managing 9pFS write requests.
>> + * The main purpose of this function is to provide support for
>> + * the coalescing of several pages into a single 9p message.
>> + * This is similar to NFS's pagelist.
>> + *
>> + * Copy pages with adjacent indices to a buffer and send it to
>> + * the server.
>
> Why do you need to copy the pages? The transport below 9p - virtio in 
> your case - has native scatter-gather support, so you don't need to 
> copy anything, no?
>


Perhaps we can avoid copying pages. However it would mean modification
of the 9P file transfer protocol, which is not aware of pages. I suspect 
that
such activity requires substantial sponsorship, and I am not sure that 
it will
be interesting for Huawei.

Thanks,
Edward.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 072e759..3b49daf 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -32,6 +32,7 @@ 
 #include <linux/parser.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
@@ -309,6 +310,49 @@  static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	return ret;
 }
 
+void put_flush_set(struct v9fs_flush_set *fset)
+{
+	if (!fset)
+		return;
+	if (fset->pages)
+		kfree(fset->pages);
+	if (fset->buf)
+		kfree(fset->buf);
+	kfree(fset);
+}
+
+/**
+ * Allocate and initialize flush set
+ * Pre-conditions: valid msize is set
+ */
+int alloc_init_flush_set(struct v9fs_session_info *v9ses)
+{
+	int ret = -ENOMEM;
+	int num_pages;
+	struct v9fs_flush_set *fset = NULL;
+
+	num_pages = v9ses->clnt->msize >> PAGE_SHIFT;
+	if (num_pages < 2)
+		/* speedup impossible */
+		return 0;
+	fset = kzalloc(sizeof(*fset), GFP_KERNEL);
+	if (!fset)
+		goto error;
+	fset->num_pages = num_pages;
+	fset->pages = kzalloc(num_pages * sizeof(*fset->pages), GFP_KERNEL);
+	if (!fset->pages)
+		goto error;
+	fset->buf = kzalloc(num_pages << PAGE_SHIFT, GFP_USER);
+	if (!fset->buf)
+		goto error;
+	spin_lock_init(&(fset->lock));
+	v9ses->flush = fset;
+	return 0;
+ error:
+	put_flush_set(fset);
+	return ret;
+}
+
 /**
  * v9fs_session_init - initialize session
  * @v9ses: session information structure
@@ -444,6 +488,8 @@  void v9fs_session_close(struct v9fs_session_info *v9ses)
 	kfree(v9ses->uname);
 	kfree(v9ses->aname);
 
+	put_flush_set(v9ses->flush);
+
 	bdi_destroy(&v9ses->bdi);
 
 	spin_lock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6877050..d1092e4 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -23,6 +23,7 @@ 
 #ifndef FS_9P_V9FS_H
 #define FS_9P_V9FS_H
 
+#include <linux/kconfig.h>
 #include <linux/backing-dev.h>
 
 /**
@@ -69,6 +70,13 @@  enum p9_cache_modes {
 	CACHE_FSCACHE,
 };
 
+struct v9fs_flush_set {
+        struct page **pages;
+	int num_pages;
+        char *buf;
+	spinlock_t lock;
+};
+
 /**
  * struct v9fs_session_info - per-instance session information
  * @flags: session options of type &p9_session_flags
@@ -105,7 +113,7 @@  struct v9fs_session_info {
 	char *cachetag;
 	struct fscache_cookie *fscache;
 #endif
-
+	struct v9fs_flush_set *flush; /* flush set for writepages */
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
@@ -158,6 +166,8 @@  extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
 extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
 					      struct p9_fid *fid,
 					      struct super_block *sb, int new);
+extern int alloc_init_flush_set(struct v9fs_session_info *v9ses);
+extern void put_flush_set(struct v9fs_flush_set *fset);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -222,4 +232,14 @@  v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
 }
 
+static inline int spin_trylock_flush_set(struct v9fs_flush_set *fset)
+{
+	return spin_trylock(&(fset->lock));
+}
+
+static inline void spin_unlock_flush_set(struct v9fs_flush_set *fset)
+{
+	spin_unlock(&(fset->lock));
+}
+
 #endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad7..e871886 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -36,6 +36,7 @@ 
 #include <linux/uio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
+#include <trace/events/writeback.h>
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
@@ -209,6 +210,361 @@  static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
 	return retval;
 }
 
+static void redirty_pages_for_writeback(struct page **pages, int nr,
+					struct writeback_control *wbc)
+{
+	int i;
+	for (i = 0; i < nr; i++) {
+		lock_page(pages[i]);
+		redirty_page_for_writepage(wbc, pages[i]);
+		unlock_page(pages[i]);
+	}
+}
+
+static void set_pages_error(struct page **pages, int nr, int error)
+{
+	int i;
+	for (i = 0; i < nr; i++) {
+		lock_page(pages[i]);
+		SetPageError(pages[i]);
+		mapping_set_error(pages[i]->mapping, error);
+		unlock_page(pages[i]);
+	}
+}
+
+#define V9FS_WRITEPAGES_DEBUG   (0)
+
+struct flush_context {
+	struct writeback_control *wbc;
+	struct address_space *mapping;
+	struct v9fs_flush_set *fset;
+	pgoff_t done_index;
+	pgoff_t writeback_index;
+	pgoff_t index;
+	pgoff_t end; /* Inclusive */
+	const char *msg;
+	int cycled;
+	int range_whole;
+	int done;
+};
+
+/**
+ * Copy a page with file's data to buffer.
+ * Handle races with truncate, etc.
+ * Return number of copied bytes
+ *
+ * @page: page to copy data from;
+ * @page_nr: serial number of the page
+ */
+static int flush_page(struct page *page, int page_nr, struct flush_context *ctx)
+{
+	char *kdata;
+	loff_t isize;
+	int copied = 0;
+	struct writeback_control *wbc = ctx->wbc;
+	/*
+	 * At this point, the page may be truncated or
+	 * invalidated (changing page->mapping to NULL), or
+	 * even swizzled back from swapper_space to tmpfs file
+	 * mapping. However, page->index will not change
+	 * because we have a reference on the page.
+	 */
+	if (page->index > ctx->end) {
+		/*
+		 * can't be range_cyclic (1st pass) because
+		 * end == -1 in that case.
+		 */
+		ctx->done = 1;
+		ctx->msg = "page out of range";
+		goto exit;
+	}
+	ctx->done_index = page->index;
+	lock_page(page);
+	/*
+	 * Page truncated or invalidated. We can freely skip it
+	 * then, even for data integrity operations: the page
+	 * has disappeared concurrently, so there could be no
+	 * real expectation of this data integrity operation
+	 * even if there is now a new, dirty page at the same
+	 * pagecache address.
+	 */
+	if (unlikely(page->mapping != ctx->mapping)) {
+		unlock_page(page);
+		ctx->msg = "page truncated or invalidated";
+		goto exit;
+	}
+	if (!PageDirty(page)) {
+		/*
+		 * someone wrote it for us
+		 */
+		unlock_page(page);
+		ctx->msg = "page not dirty";
+		goto exit;
+	}
+	if (PageWriteback(page)) {
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(page);
+		else {
+			unlock_page(page);
+			ctx->msg = "page is writeback";
+			goto exit;
+		}
+	}
+	BUG_ON(PageWriteback(page));
+	if (!clear_page_dirty_for_io(page)) {
+		unlock_page(page);
+		ctx->msg = "failed to clear page dirty";
+		goto exit;
+	}
+	trace_wbc_writepage(wbc, inode_to_bdi(ctx->mapping->host));
+
+	set_page_writeback(page);
+	isize = i_size_read(ctx->mapping->host);
+	if (page->index == isize >> PAGE_SHIFT)
+		copied = isize & ~PAGE_MASK;
+	else
+		copied = PAGE_SIZE;
+	kdata = kmap_atomic(page);
+	memcpy(ctx->fset->buf + (page_nr << PAGE_SHIFT), kdata, copied);
+	kunmap_atomic(kdata);
+	end_page_writeback(page);
+
+	unlock_page(page);
+	/*
+	 * We stop writing back only if we are not doing
+	 * integrity sync. In case of integrity sync we have to
+	 * keep going until we have written all the pages
+	 * we tagged for writeback prior to entering this loop.
+	 */
+	if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
+		ctx->done = 1;
+ exit:
+	return copied;
+}
+
+static int send_buffer(off_t offset, int len, struct flush_context *ctx)
+{
+	int ret = 0;
+	struct kvec kvec;
+	struct iov_iter iter;
+	struct v9fs_inode *v9inode = V9FS_I(ctx->mapping->host);
+
+	kvec.iov_base = ctx->fset->buf;
+	kvec.iov_len = len;
+	iov_iter_kvec(&iter, WRITE | ITER_KVEC, &kvec, 1, len);
+	BUG_ON(!v9inode->writeback_fid);
+
+	p9_client_write(v9inode->writeback_fid, offset, &iter, &ret);
+	return ret;
+}
+
+/**
+ * Helper function for managing 9pFS write requests.
+ * The main purpose of this function is to provide support for
+ * the coalescing of several pages into a single 9p message.
+ * This is similar to NFS's pagelist.
+ *
+ * Copy pages with adjacent indices to a buffer and send it to
+ * the server.
+ *
+ * @pages: array of pages with ascending indices;
+ * @nr_pages: number of pages in the array;
+ */
+static int flush_pages(struct page **pages, int nr_pages,
+		       struct flush_context *ctx)
+{
+	int ret;
+	int pos = 0;
+	int iter_pos;
+	int iter_nrpages;
+	pgoff_t iter_page_idx;
+
+	while (pos < nr_pages) {
+
+		int i;
+		int iter_len = 0;
+		struct page *page;
+
+		iter_pos = pos;
+		iter_nrpages = 0;
+		iter_page_idx = pages[pos]->index;
+
+		for (i = 0; pos < nr_pages; i++) {
+			int from_page;
+
+			page = pages[pos];
+			if (page->index != iter_page_idx + i) {
+				/*
+				 * Hole in the indices,
+				 * further coalesce impossible.
+				 * Try to send what we have accumulated.
+				 * This page will be processed in the next
+				 * iteration
+				 */
+				goto iter_send;
+			}
+			from_page = flush_page(page, i, ctx);
+
+			iter_len += from_page;
+			iter_nrpages++;
+			pos++;
+
+			if (from_page != PAGE_SIZE) {
+				/*
+				 * Not full page was flushed,
+				 * further coalesce impossible.
+				 * Try to send what we have accumulated.
+				 */
+#if V9FS_WRITEPAGES_DEBUG
+				if (from_page == 0)
+				    printk("9p: page %lu is not flushed (%s)\n",
+					   page->index, ctx->msg);
+#endif
+				goto iter_send;
+			}
+		};
+	iter_send:
+		if (iter_len == 0)
+			/*
+			 * Nothing to send
+			 */
+			goto next_iter;
+		ret = send_buffer(iter_page_idx << PAGE_SHIFT,
+				  iter_len, ctx);
+		if (ret == -EAGAIN) {
+			redirty_pages_for_writeback(pages + iter_pos,
+						    iter_nrpages, ctx->wbc);
+			ret = 0;
+		} else if (ret < 0) {
+			/*
+			 * Something bad happened.
+			 * done_index is set past this chunk,
+			 * so media errors will not choke
+			 * background writeout for the entire
+			 * file.
+			 */
+			printk("9p: send_buffer failed (%d)\n", ret);
+
+			set_pages_error(pages + iter_pos, iter_nrpages, ret);
+			ctx->done_index =
+				pages[iter_pos + iter_nrpages - 1]->index + 1;
+			ctx->done = 1;
+			return ret;
+		} else
+			ret = 0;
+	next_iter:
+		if (ctx->done)
+			return ret;
+	};
+	return 0;
+}
+
+static void init_flush_context(struct flush_context *ctx,
+			       struct address_space *mapping,
+			       struct writeback_control *wbc,
+			       struct v9fs_flush_set *fset)
+{
+	ctx->wbc = wbc;
+	ctx->mapping = mapping;
+	ctx->fset = fset;
+	ctx->done = 0;
+	ctx->range_whole = 0;
+
+	if (wbc->range_cyclic) {
+		ctx->writeback_index = mapping->writeback_index;
+		ctx->index = ctx->writeback_index;
+		if (ctx->index == 0)
+			ctx->cycled = 1;
+		else
+			ctx->cycled = 0;
+		ctx->end = -1;
+	} else {
+		ctx->index = wbc->range_start >> PAGE_SHIFT;
+		ctx->end = wbc->range_end >> PAGE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			ctx->range_whole = 1;
+		ctx->cycled = 1; /* ignore range_cyclic tests */
+	}
+}
+
+/**
+ * Pre-condition: flush set is locked
+ */
+static int v9fs_writepages_fastpath(struct address_space *mapping,
+				    struct writeback_control *wbc,
+				    struct v9fs_flush_set *fset)
+{
+	int ret = 0;
+	int tag;
+	int nr_pages;
+        struct page **pages = fset->pages;
+	struct flush_context ctx;
+
+	init_flush_context(&ctx, mapping, wbc, fset);
+
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, ctx.index, ctx.end);
+
+	ctx.done_index = ctx.index;
+
+	while (!ctx.done && (ctx.index <= ctx.end)) {
+		int i;
+		nr_pages = find_get_pages_tag(mapping, &ctx.index, tag,
+					      1 + min(ctx.end - ctx.index,
+					      (pgoff_t)(fset->num_pages - 1)),
+					      pages);
+		if (nr_pages == 0)
+			break;
+
+		ret = flush_pages(pages, nr_pages, &ctx);
+		/*
+		 * unpin pages
+		 */
+		for (i = 0; i < nr_pages; i++)
+			put_page(pages[i]);
+		if (ret < 0)
+			break;
+		cond_resched();
+	}
+	if (!ctx.cycled && !ctx.done) {
+		/*
+		 * range_cyclic:
+		 * We hit the last page and there is more work
+		 * to be done: wrap back to the start of the file
+		 */
+		ctx.cycled = 1;
+		ctx.index = 0;
+		ctx.end = ctx.writeback_index - 1;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (ctx.range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = ctx.done_index;
+	return ret;
+}
+
+static int v9fs_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	int ret;
+	struct v9fs_flush_set *fset;
+
+	fset = v9fs_inode2v9ses(mapping->host)->flush;
+	if (!fset || !spin_trylock_flush_set(fset))
+		/*
+		 * do it by slow way
+		 */
+		return generic_writepages(mapping, wbc);
+
+	ret = v9fs_writepages_fastpath(mapping, wbc, fset);
+	spin_unlock_flush_set(fset);
+	return ret;
+}
+
 /**
  * v9fs_launder_page - Writeback a dirty page
  * Returns 0 on success.
@@ -342,6 +698,7 @@  const struct address_space_operations v9fs_addr_operations = {
 	.readpages = v9fs_vfs_readpages,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.writepage = v9fs_vfs_writepage,
+	.writepages = v9fs_writepages,
 	.write_begin = v9fs_write_begin,
 	.write_end = v9fs_write_end,
 	.releasepage = v9fs_release_page,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed86..c1f9af1 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -140,8 +140,14 @@  static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	}
 	v9fs_fill_super(sb, v9ses, flags, data);
 
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		retval = alloc_init_flush_set(v9ses);
+		if (IS_ERR(v9ses->flush)) {
+			retval = PTR_ERR(fid);
+			goto release_sb;
+		}
 		sb->s_d_op = &v9fs_cached_dentry_operations;
+	}
 	else
 		sb->s_d_op = &v9fs_dentry_operations;