diff mbox

[04/34] pnfs: hook nfs_write_begin/end to allow layout driver manipulation

Message ID ea5e2bf59d726fbfec6f63af9eda83dbc15cb96d.1307921137.git.rees@umich.edu (mailing list archive)
State New, archived
Headers show

Commit Message

Jim Rees June 12, 2011, 11:43 p.m. UTC
From: Peng Tao <bergwolf@gmail.com>

Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Reported-by: Alexandros Batsakis <batsakis@netapp.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Peng Tao <bergwolf@gmail.com>
---
 fs/nfs/file.c          |   26 ++++++++++-
 fs/nfs/pnfs.c          |   41 +++++++++++++++++
 fs/nfs/pnfs.h          |  115 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/write.c         |   12 +++--
 include/linux/nfs_fs.h |    3 +-
 5 files changed, 189 insertions(+), 8 deletions(-)

Comments

Fred Isaman June 13, 2011, 2:44 p.m. UTC | #1
On Sun, Jun 12, 2011 at 7:43 PM, Jim Rees <rees@umich.edu> wrote:
> From: Peng Tao <bergwolf@gmail.com>
>
> Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> Reported-by: Alexandros Batsakis <batsakis@netapp.com>
> Signed-off-by: Andy Adamson <andros@netapp.com>
> Signed-off-by: Fred Isaman <iisaman@netapp.com>
> Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> Signed-off-by: Peng Tao <bergwolf@gmail.com>
> ---
>  fs/nfs/file.c          |   26 ++++++++++-
>  fs/nfs/pnfs.c          |   41 +++++++++++++++++
>  fs/nfs/pnfs.h          |  115 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfs/write.c         |   12 +++--
>  include/linux/nfs_fs.h |    3 +-
>  5 files changed, 189 insertions(+), 8 deletions(-)
>
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 2f093ed..1768762 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -384,12 +384,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
>        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
>        struct page *page;
>        int once_thru = 0;
> +       struct pnfs_layout_segment *lseg;
>
>        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
>                file->f_path.dentry->d_parent->d_name.name,
>                file->f_path.dentry->d_name.name,
>                mapping->host->i_ino, len, (long long) pos);
> -
> +       lseg = pnfs_update_layout(mapping->host,
> +                                 nfs_file_open_context(file),
> +                                 pos, len, IOMODE_RW, GFP_NOFS);


This looks like it is left over from before the rearrangements done to
where pnfs_update_layout.
In particular, we don't want to hold the reference on the lseg from
here until flush time.  And there
seems to be no reason to.  If the client needs a layout to deal with
read-in here, it should instead
trigger the nfs_want_read_modify_write clause.

Fred

>  start:
>        /*
>         * Prevent starvation issues if someone is doing a consistency
> @@ -409,6 +412,9 @@ start:
>        if (ret) {
>                unlock_page(page);
>                page_cache_release(page);
> +               *pagep = NULL;
> +               *fsdata = NULL;
> +               goto out;
>        } else if (!once_thru &&
>                   nfs_want_read_modify_write(file, page, pos, len)) {
>                once_thru = 1;
> @@ -417,6 +423,12 @@ start:
>                if (!ret)
>                        goto start;
>        }
> +       ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
> + out:
> +       if (ret) {
> +               put_lseg(lseg);
> +               *fsdata = NULL;
> +       }
>        return ret;
>  }
>
> @@ -426,6 +438,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
>  {
>        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
>        int status;
> +       struct pnfs_layout_segment *lseg;
>
>        dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
>                file->f_path.dentry->d_parent->d_name.name,
> @@ -452,10 +465,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
>                        zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
>        }
>
> -       status = nfs_updatepage(file, page, offset, copied);
> +       lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
> +       status = pnfs_write_end(file, page, pos, len, copied, lseg);
> +       if (status)
> +               goto out;
> +       status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
>
> +out:
>        unlock_page(page);
>        page_cache_release(page);
> +       pnfs_write_end_cleanup(file, fsdata);
> +       put_lseg(lseg);
>
>        if (status < 0)
>                return status;
> @@ -577,7 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>
>        ret = VM_FAULT_LOCKED;
>        if (nfs_flush_incompatible(filp, page) == 0 &&
> -           nfs_updatepage(filp, page, 0, pagelen) == 0)
> +           nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0)
>                goto out;
>
>        ret = VM_FAULT_SIGBUS;
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index f03a5e0..e693718 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1138,6 +1138,41 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
>  }
>
>  /*
> + * This gives the layout driver an opportunity to read in page "around"
> + * the data to be written.  It returns 0 on success, otherwise an error code
> + * which will either be passed up to user, or ignored if
> + * some previous part of write succeeded.
> + * Note the range [pos, pos+len-1] is entirely within the page.
> + */
> +int _pnfs_write_begin(struct inode *inode, struct page *page,
> +                     loff_t pos, unsigned len,
> +                     struct pnfs_layout_segment *lseg,
> +                     struct pnfs_fsdata **fsdata)
> +{
> +       struct pnfs_fsdata *data;
> +       int status = 0;
> +
> +       dprintk("--> %s: pos=%llu len=%u\n",
> +               __func__, (unsigned long long)pos, len);
> +       data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
> +       if (!data) {
> +               status = -ENOMEM;
> +               goto out;
> +       }
> +       data->lseg = lseg; /* refcount passed into data to be managed there */
> +       status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
> +                                               lseg, page, pos, len, data);
> +       if (status) {
> +               kfree(data);
> +               data = NULL;
> +       }
> +out:
> +       *fsdata = data;
> +       dprintk("<-- %s: status=%d\n", __func__, status);
> +       return status;
> +}
> +
> +/*
>  * Called by non rpc-based layout drivers
>  */
>  int
> @@ -1237,6 +1272,12 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
>  }
>  EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
>
> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
> +{
> +       /* lseg refcounting handled directly in nfs_write_end */
> +       kfree(fsdata);
> +}
> +
>  /*
>  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
>  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index a3fc0f2..525ec55 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -54,6 +54,12 @@ enum pnfs_try_status {
>        PNFS_NOT_ATTEMPTED = 1,
>  };
>
> +struct pnfs_fsdata {
> +       struct pnfs_layout_segment *lseg;
> +       int bypass_eof;
> +       void *private;
> +};
> +
>  #ifdef CONFIG_NFS_V4_1
>
>  #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
> @@ -106,6 +112,14 @@ struct pnfs_layoutdriver_type {
>         */
>        enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
>        enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
> +       int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
> +                           loff_t pos, unsigned count,
> +                           struct pnfs_fsdata *fsdata);
> +       int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
> +                        unsigned count, unsigned copied,
> +                        struct pnfs_layout_segment *lseg);
> +       void (*write_end_cleanup)(struct file *filp,
> +                                 struct pnfs_fsdata *fsdata);
>
>        void (*free_deviceid_node) (struct nfs4_deviceid_node *);
>
> @@ -175,6 +189,7 @@ enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
>  enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
>                                            const struct rpc_call_ops *);
>  bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
> +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
>  int pnfs_layout_process(struct nfs4_layoutget *lgp);
>  void pnfs_free_lseg_list(struct list_head *tmp_list);
>  void pnfs_destroy_layout(struct nfs_inode *);
> @@ -186,6 +201,10 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
>  int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
>                                  struct pnfs_layout_hdr *lo,
>                                  struct nfs4_state *open_state);
> +int _pnfs_write_begin(struct inode *inode, struct page *page,
> +                     loff_t pos, unsigned len,
> +                     struct pnfs_layout_segment *lseg,
> +                     struct pnfs_fsdata **fsdata);
>  int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
>                                struct list_head *tmp_list,
>                                struct pnfs_layout_range *recall_range);
> @@ -287,6 +306,13 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
>                put_lseg(req->wb_commit_lseg);
>  }
>
> +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
> +                              struct pnfs_fsdata *fsdata)
> +{
> +       return !fsdata  || ((struct pnfs_layout_segment *)fsdata == lseg) ||
> +               !fsdata->bypass_eof;
> +}
> +
>  /* Should the pNFS client commit and return the layout upon a setattr */
>  static inline bool
>  pnfs_ld_layoutret_on_setattr(struct inode *inode)
> @@ -297,6 +323,49 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
>                PNFS_LAYOUTRET_ON_SETATTR;
>  }
>
> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
> +                                  loff_t pos, unsigned len,
> +                                  struct pnfs_layout_segment *lseg,
> +                                  void **fsdata)
> +{
> +       struct inode *inode = filp->f_dentry->d_inode;
> +       struct nfs_server *nfss = NFS_SERVER(inode);
> +       int status = 0;
> +
> +       *fsdata = lseg;
> +       if (lseg && nfss->pnfs_curr_ld->write_begin)
> +               status = _pnfs_write_begin(inode, page, pos, len, lseg,
> +                                          (struct pnfs_fsdata **) fsdata);
> +       return status;
> +}
> +
> +/* CAREFUL - what happens if copied < len??? */
> +static inline int pnfs_write_end(struct file *filp, struct page *page,
> +                                loff_t pos, unsigned len, unsigned copied,
> +                                struct pnfs_layout_segment *lseg)
> +{
> +       struct inode *inode = filp->f_dentry->d_inode;
> +       struct nfs_server *nfss = NFS_SERVER(inode);
> +
> +       if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
> +               return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
> +                                                    copied, lseg);
> +       else
> +               return 0;
> +}
> +
> +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
> +{
> +       struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
> +
> +       if (fsdata && nfss->pnfs_curr_ld) {
> +               if (nfss->pnfs_curr_ld->write_end_cleanup)
> +                       nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
> +               if (nfss->pnfs_curr_ld->write_begin)
> +                       pnfs_free_fsdata(fsdata);
> +       }
> +}
> +
>  static inline int pnfs_return_layout(struct inode *ino)
>  {
>        struct nfs_inode *nfsi = NFS_I(ino);
> @@ -317,6 +386,19 @@ static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
>                pgio->pg_test = ld->pg_test;
>  }
>
> +static inline struct pnfs_layout_segment *
> +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
> +{
> +       if (fsdata) {
> +               struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
> +
> +               if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
> +                       return ((struct pnfs_fsdata *) fsdata)->lseg;
> +               return (struct pnfs_layout_segment *)fsdata;
> +       }
> +       return NULL;
> +}
> +
>  #else  /* CONFIG_NFS_V4_1 */
>
>  static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
> @@ -345,6 +427,12 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>        return NULL;
>  }
>
> +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
> +                              struct pnfs_fsdata *fsdata)
> +{
> +       return 1;
> +}
> +
>  static inline enum pnfs_try_status
>  pnfs_try_to_read_data(struct nfs_read_data *data,
>                      const struct rpc_call_ops *call_ops)
> @@ -364,6 +452,26 @@ static inline int pnfs_return_layout(struct inode *ino)
>        return 0;
>  }
>
> +static inline int pnfs_write_begin(struct file *filp, struct page *page,
> +                                  loff_t pos, unsigned len,
> +                                  struct pnfs_layout_segment *lseg,
> +                                  void **fsdata)
> +{
> +       *fsdata = NULL;
> +       return 0;
> +}
> +
> +static inline int pnfs_write_end(struct file *filp, struct page *page,
> +                                loff_t pos, unsigned len, unsigned copied,
> +                                struct pnfs_layout_segment *lseg)
> +{
> +       return 0;
> +}
> +
> +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
> +{
> +}
> +
>  static inline bool
>  pnfs_ld_layoutret_on_setattr(struct inode *inode)
>  {
> @@ -435,6 +543,13 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
>  static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
>  {
>  }
> +
> +static inline struct pnfs_layout_segment *
> +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
> +{
> +       return NULL;
> +}
> +
>  #endif /* CONFIG_NFS_V4_1 */
>
>  #endif /* FS_NFS_PNFS_H */
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index e268e3b..75e2a6b 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -673,7 +673,9 @@ out:
>  }
>
>  static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
> -               unsigned int offset, unsigned int count)
> +               unsigned int offset, unsigned int count,
> +               struct pnfs_layout_segment *lseg, void *fsdata)
> +
>  {
>        struct nfs_page *req;
>
> @@ -681,7 +683,8 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
>        if (IS_ERR(req))
>                return PTR_ERR(req);
>        /* Update file length */
> -       nfs_grow_file(page, offset, count);
> +       if (pnfs_grow_ok(lseg, fsdata))
> +               nfs_grow_file(page, offset, count);
>        nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
>        nfs_mark_request_dirty(req);
>        nfs_clear_page_tag_locked(req);
> @@ -734,7 +737,8 @@ static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
>  * things with a page scheduled for an RPC call (e.g. invalidate it).
>  */
>  int nfs_updatepage(struct file *file, struct page *page,
> -               unsigned int offset, unsigned int count)
> +               unsigned int offset, unsigned int count,
> +               struct pnfs_layout_segment *lseg, void *fsdata)
>  {
>        struct nfs_open_context *ctx = nfs_file_open_context(file);
>        struct inode    *inode = page->mapping->host;
> @@ -759,7 +763,7 @@ int nfs_updatepage(struct file *file, struct page *page,
>                offset = 0;
>        }
>
> -       status = nfs_writepage_setup(ctx, page, offset, count);
> +       status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
>        if (status < 0)
>                nfs_set_pageerror(page);
>
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index 1b93b9c..be1ac1d 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -510,7 +510,8 @@ extern int  nfs_congestion_kb;
>  extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
>  extern int  nfs_writepages(struct address_space *, struct writeback_control *);
>  extern int  nfs_flush_incompatible(struct file *file, struct page *page);
> -extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
> +extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int,
> +                       struct pnfs_layout_segment *, void *);
>  extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
>
>  /*
> --
> 1.7.4.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
tao.peng@emc.com June 14, 2011, 11:01 a.m. UTC | #2
Hi, Fred,

> -----Original Message-----
> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org]
> On Behalf Of Fred Isaman
> Sent: Monday, June 13, 2011 10:44 PM
> To: Jim Rees
> Cc: linux-nfs@vger.kernel.org; peter honeyman
> Subject: Re: [PATCH 04/34] pnfs: hook nfs_write_begin/end to allow layout driver
> manipulation
>
> On Sun, Jun 12, 2011 at 7:43 PM, Jim Rees <rees@umich.edu> wrote:
> > From: Peng Tao <bergwolf@gmail.com>
> >
> > Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > Reported-by: Alexandros Batsakis <batsakis@netapp.com>
> > Signed-off-by: Andy Adamson <andros@netapp.com>
> > Signed-off-by: Fred Isaman <iisaman@netapp.com>
> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
> > Signed-off-by: Peng Tao <bergwolf@gmail.com>
> > ---
> >  fs/nfs/file.c          |   26 ++++++++++-
> >  fs/nfs/pnfs.c          |   41 +++++++++++++++++
> >  fs/nfs/pnfs.h          |  115
> ++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfs/write.c         |   12 +++--
> >  include/linux/nfs_fs.h |    3 +-
> >  5 files changed, 189 insertions(+), 8 deletions(-)
> >
> > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > index 2f093ed..1768762 100644
> > --- a/fs/nfs/file.c
> > +++ b/fs/nfs/file.c
> > @@ -384,12 +384,15 @@ static int nfs_write_begin(struct file *file, struct
> address_space *mapping,
> >        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
> >        struct page *page;
> >        int once_thru = 0;
> > +       struct pnfs_layout_segment *lseg;
> >
> >        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
> >                file->f_path.dentry->d_parent->d_name.name,
> >                file->f_path.dentry->d_name.name,
> >                mapping->host->i_ino, len, (long long) pos);
> > -
> > +       lseg = pnfs_update_layout(mapping->host,
> > +                                 nfs_file_open_context(file),
> > +                                 pos, len, IOMODE_RW, GFP_NOFS);
>
>
> This looks like it is left over from before the rearrangements done to
> where pnfs_update_layout.
> In particular, we don't want to hold the reference on the lseg from
> here until flush time.  And there
> seems to be no reason to.  If the client needs a layout to deal with
> read-in here, it should instead
> trigger the nfs_want_read_modify_write clause.
Yes, you are right. Directly calling pnfs_update_layout here can be avoided.
But it seems triggering nfs_want_read_modify_write will acquire a read-only layout segment via readpage code path.
For write, client will need a read-write layout segment so it would mean two layoutget for each new segment (one in nfs_readpage and one at flush time). It may not be good for performance.
Does current generic code have method to avoid this?

Thanks,
Tao

>
> Fred
>
> >  start:
> >        /*
> >         * Prevent starvation issues if someone is doing a consistency
> > @@ -409,6 +412,9 @@ start:
> >        if (ret) {
> >                unlock_page(page);
> >                page_cache_release(page);
> > +               *pagep = NULL;
> > +               *fsdata = NULL;
> > +               goto out;
> >        } else if (!once_thru &&
> >                   nfs_want_read_modify_write(file, page, pos, len)) {
> >                once_thru = 1;
> > @@ -417,6 +423,12 @@ start:
> >                if (!ret)
> >                        goto start;
> >        }
> > +       ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
> > + out:
> > +       if (ret) {
> > +               put_lseg(lseg);
> > +               *fsdata = NULL;
> > +       }
> >        return ret;
> >  }
> >
> > @@ -426,6 +438,7 @@ static int nfs_write_end(struct file *file, struct
> address_space *mapping,
> >  {
> >        unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
> >        int status;
> > +       struct pnfs_layout_segment *lseg;
> >
> >        dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
> >                file->f_path.dentry->d_parent->d_name.name,
> > @@ -452,10 +465,17 @@ static int nfs_write_end(struct file *file, struct
> address_space *mapping,
> >                        zero_user_segment(page, pglen,
> PAGE_CACHE_SIZE);
> >        }
> >
> > -       status = nfs_updatepage(file, page, offset, copied);
> > +       lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
> > +       status = pnfs_write_end(file, page, pos, len, copied, lseg);
> > +       if (status)
> > +               goto out;
> > +       status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
> >
> > +out:
> >        unlock_page(page);
> >        page_cache_release(page);
> > +       pnfs_write_end_cleanup(file, fsdata);
> > +       put_lseg(lseg);
> >
> >        if (status < 0)
> >                return status;
> > @@ -577,7 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct
> *vma, struct vm_fault *vmf)
> >
> >        ret = VM_FAULT_LOCKED;
> >        if (nfs_flush_incompatible(filp, page) == 0 &&
> > -           nfs_updatepage(filp, page, 0, pagelen) == 0)
> > +           nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0)
> >                goto out;
> >
> >        ret = VM_FAULT_SIGBUS;
> > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> > index f03a5e0..e693718 100644
> > --- a/fs/nfs/pnfs.c
> > +++ b/fs/nfs/pnfs.c
> > @@ -1138,6 +1138,41 @@ pnfs_try_to_write_data(struct nfs_write_data
> *wdata,
> >  }
> >
> >  /*
> > + * This gives the layout driver an opportunity to read in page "around"
> > + * the data to be written.  It returns 0 on success, otherwise an error code
> > + * which will either be passed up to user, or ignored if
> > + * some previous part of write succeeded.
> > + * Note the range [pos, pos+len-1] is entirely within the page.
> > + */
> > +int _pnfs_write_begin(struct inode *inode, struct page *page,
> > +                     loff_t pos, unsigned len,
> > +                     struct pnfs_layout_segment *lseg,
> > +                     struct pnfs_fsdata **fsdata)
> > +{
> > +       struct pnfs_fsdata *data;
> > +       int status = 0;
> > +
> > +       dprintk("--> %s: pos=%llu len=%u\n",
> > +               __func__, (unsigned long long)pos, len);
> > +       data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
> > +       if (!data) {
> > +               status = -ENOMEM;
> > +               goto out;
> > +       }
> > +       data->lseg = lseg; /* refcount passed into data to be managed there */
> > +       status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
> > +                                               lseg, page, pos,
> len, data);
> > +       if (status) {
> > +               kfree(data);
> > +               data = NULL;
> > +       }
> > +out:
> > +       *fsdata = data;
> > +       dprintk("<-- %s: status=%d\n", __func__, status);
> > +       return status;
> > +}
> > +
> > +/*
> >  * Called by non rpc-based layout drivers
> >  */
> >  int
> > @@ -1237,6 +1272,12 @@ pnfs_set_layoutcommit(struct nfs_write_data *wdata)
> >  }
> >  EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
> >
> > +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
> > +{
> > +       /* lseg refcounting handled directly in nfs_write_end */
> > +       kfree(fsdata);
> > +}
> > +
> >  /*
> >  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
> >  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
> > diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> > index a3fc0f2..525ec55 100644
> > --- a/fs/nfs/pnfs.h
> > +++ b/fs/nfs/pnfs.h
> > @@ -54,6 +54,12 @@ enum pnfs_try_status {
> >        PNFS_NOT_ATTEMPTED = 1,
> >  };
> >
> > +struct pnfs_fsdata {
> > +       struct pnfs_layout_segment *lseg;
> > +       int bypass_eof;
> > +       void *private;
> > +};
> > +
> >  #ifdef CONFIG_NFS_V4_1
> >
> >  #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
> > @@ -106,6 +112,14 @@ struct pnfs_layoutdriver_type {
> >         */
> >        enum pnfs_try_status (*read_pagelist) (struct nfs_read_data
> *nfs_data);
> >        enum pnfs_try_status (*write_pagelist) (struct nfs_write_data
> *nfs_data, int how);
> > +       int (*write_begin) (struct pnfs_layout_segment *lseg, struct page
> *page,
> > +                           loff_t pos, unsigned count,
> > +                           struct pnfs_fsdata *fsdata);
> > +       int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
> > +                        unsigned count, unsigned copied,
> > +                        struct pnfs_layout_segment *lseg);
> > +       void (*write_end_cleanup)(struct file *filp,
> > +                                 struct pnfs_fsdata *fsdata);
> >
> >        void (*free_deviceid_node) (struct nfs4_deviceid_node *);
> >
> > @@ -175,6 +189,7 @@ enum pnfs_try_status pnfs_try_to_write_data(struct
> nfs_write_data *,
> >  enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
> >                                            const struct
> rpc_call_ops *);
> >  bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page
> *prev, struct nfs_page *req);
> > +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
> >  int pnfs_layout_process(struct nfs4_layoutget *lgp);
> >  void pnfs_free_lseg_list(struct list_head *tmp_list);
> >  void pnfs_destroy_layout(struct nfs_inode *);
> > @@ -186,6 +201,10 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
> >  int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
> >                                  struct pnfs_layout_hdr *lo,
> >                                  struct nfs4_state *open_state);
> > +int _pnfs_write_begin(struct inode *inode, struct page *page,
> > +                     loff_t pos, unsigned len,
> > +                     struct pnfs_layout_segment *lseg,
> > +                     struct pnfs_fsdata **fsdata);
> >  int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
> >                                struct list_head *tmp_list,
> >                                struct pnfs_layout_range
> *recall_range);
> > @@ -287,6 +306,13 @@ static inline void pnfs_clear_request_commit(struct
> nfs_page *req)
> >                put_lseg(req->wb_commit_lseg);
> >  }
> >
> > +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
> > +                              struct pnfs_fsdata *fsdata)
> > +{
> > +       return !fsdata  || ((struct pnfs_layout_segment *)fsdata == lseg) ||
> > +               !fsdata->bypass_eof;
> > +}
> > +
> >  /* Should the pNFS client commit and return the layout upon a setattr */
> >  static inline bool
> >  pnfs_ld_layoutret_on_setattr(struct inode *inode)
> > @@ -297,6 +323,49 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
> >                PNFS_LAYOUTRET_ON_SETATTR;
> >  }
> >
> > +static inline int pnfs_write_begin(struct file *filp, struct page *page,
> > +                                  loff_t pos, unsigned len,
> > +                                  struct pnfs_layout_segment *lseg,
> > +                                  void **fsdata)
> > +{
> > +       struct inode *inode = filp->f_dentry->d_inode;
> > +       struct nfs_server *nfss = NFS_SERVER(inode);
> > +       int status = 0;
> > +
> > +       *fsdata = lseg;
> > +       if (lseg && nfss->pnfs_curr_ld->write_begin)
> > +               status = _pnfs_write_begin(inode, page, pos, len, lseg,
> > +                                          (struct pnfs_fsdata **)
> fsdata);
> > +       return status;
> > +}
> > +
> > +/* CAREFUL - what happens if copied < len??? */
> > +static inline int pnfs_write_end(struct file *filp, struct page *page,
> > +                                loff_t pos, unsigned len, unsigned
> copied,
> > +                                struct pnfs_layout_segment *lseg)
> > +{
> > +       struct inode *inode = filp->f_dentry->d_inode;
> > +       struct nfs_server *nfss = NFS_SERVER(inode);
> > +
> > +       if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
> > +               return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
> > +                                                    copied,
> lseg);
> > +       else
> > +               return 0;
> > +}
> > +
> > +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
> > +{
> > +       struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
> > +
> > +       if (fsdata && nfss->pnfs_curr_ld) {
> > +               if (nfss->pnfs_curr_ld->write_end_cleanup)
> > +                       nfss->pnfs_curr_ld->write_end_cleanup(filp,
> fsdata);
> > +               if (nfss->pnfs_curr_ld->write_begin)
> > +                       pnfs_free_fsdata(fsdata);
> > +       }
> > +}
> > +
> >  static inline int pnfs_return_layout(struct inode *ino)
> >  {
> >        struct nfs_inode *nfsi = NFS_I(ino);
> > @@ -317,6 +386,19 @@ static inline void pnfs_pageio_init(struct
> nfs_pageio_descriptor *pgio,
> >                pgio->pg_test = ld->pg_test;
> >  }
> >
> > +static inline struct pnfs_layout_segment *
> > +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
> > +{
> > +       if (fsdata) {
> > +               struct nfs_server *nfss =
> NFS_SERVER(filp->f_dentry->d_inode);
> > +
> > +               if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
> > +                       return ((struct pnfs_fsdata *) fsdata)->lseg;
> > +               return (struct pnfs_layout_segment *)fsdata;
> > +       }
> > +       return NULL;
> > +}
> > +
> >  #else  /* CONFIG_NFS_V4_1 */
> >
> >  static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
> > @@ -345,6 +427,12 @@ pnfs_update_layout(struct inode *ino, struct
> nfs_open_context *ctx,
> >        return NULL;
> >  }
> >
> > +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
> > +                              struct pnfs_fsdata *fsdata)
> > +{
> > +       return 1;
> > +}
> > +
> >  static inline enum pnfs_try_status
> >  pnfs_try_to_read_data(struct nfs_read_data *data,
> >                      const struct rpc_call_ops *call_ops)
> > @@ -364,6 +452,26 @@ static inline int pnfs_return_layout(struct inode *ino)
> >        return 0;
> >  }
> >
> > +static inline int pnfs_write_begin(struct file *filp, struct page *page,
> > +                                  loff_t pos, unsigned len,
> > +                                  struct pnfs_layout_segment *lseg,
> > +                                  void **fsdata)
> > +{
> > +       *fsdata = NULL;
> > +       return 0;
> > +}
> > +
> > +static inline int pnfs_write_end(struct file *filp, struct page *page,
> > +                                loff_t pos, unsigned len, unsigned
> copied,
> > +                                struct pnfs_layout_segment *lseg)
> > +{
> > +       return 0;
> > +}
> > +
> > +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
> > +{
> > +}
> > +
> >  static inline bool
> >  pnfs_ld_layoutret_on_setattr(struct inode *inode)
> >  {
> > @@ -435,6 +543,13 @@ static inline int pnfs_layoutcommit_inode(struct inode
> *inode, bool sync)
> >  static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
> >  {
> >  }
> > +
> > +static inline struct pnfs_layout_segment *
> > +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
> > +{
> > +       return NULL;
> > +}
> > +
> >  #endif /* CONFIG_NFS_V4_1 */
> >
> >  #endif /* FS_NFS_PNFS_H */
> > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > index e268e3b..75e2a6b 100644
> > --- a/fs/nfs/write.c
> > +++ b/fs/nfs/write.c
> > @@ -673,7 +673,9 @@ out:
> >  }
> >
> >  static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
> > -               unsigned int offset, unsigned int count)
> > +               unsigned int offset, unsigned int count,
> > +               struct pnfs_layout_segment *lseg, void *fsdata)
> > +
> >  {
> >        struct nfs_page *req;
> >
> > @@ -681,7 +683,8 @@ static int nfs_writepage_setup(struct nfs_open_context
> *ctx, struct page *page,
> >        if (IS_ERR(req))
> >                return PTR_ERR(req);
> >        /* Update file length */
> > -       nfs_grow_file(page, offset, count);
> > +       if (pnfs_grow_ok(lseg, fsdata))
> > +               nfs_grow_file(page, offset, count);
> >        nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
> >        nfs_mark_request_dirty(req);
> >        nfs_clear_page_tag_locked(req);
> > @@ -734,7 +737,8 @@ static int nfs_write_pageuptodate(struct page *page,
> struct inode *inode)
> >  * things with a page scheduled for an RPC call (e.g. invalidate it).
> >  */
> >  int nfs_updatepage(struct file *file, struct page *page,
> > -               unsigned int offset, unsigned int count)
> > +               unsigned int offset, unsigned int count,
> > +               struct pnfs_layout_segment *lseg, void *fsdata)
> >  {
> >        struct nfs_open_context *ctx = nfs_file_open_context(file);
> >        struct inode    *inode = page->mapping->host;
> > @@ -759,7 +763,7 @@ int nfs_updatepage(struct file *file, struct page *page,
> >                offset = 0;
> >        }
> >
> > -       status = nfs_writepage_setup(ctx, page, offset, count);
> > +       status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
> >        if (status < 0)
> >                nfs_set_pageerror(page);
> >
> > diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> > index 1b93b9c..be1ac1d 100644
> > --- a/include/linux/nfs_fs.h
> > +++ b/include/linux/nfs_fs.h
> > @@ -510,7 +510,8 @@ extern int  nfs_congestion_kb;
> >  extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
> >  extern int  nfs_writepages(struct address_space *, struct writeback_control *);
> >  extern int  nfs_flush_incompatible(struct file *file, struct page *page);
> > -extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
> int);
> > +extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned
> int,
> > +                       struct pnfs_layout_segment *, void *);
> >  extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
> >
> >  /*
> > --
> > 1.7.4.1
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Fred Isaman June 14, 2011, 2:05 p.m. UTC | #3
On Tue, Jun 14, 2011 at 7:01 AM,  <tao.peng@emc.com> wrote:
> Hi, Fred,
>
>> -----Original Message-----
>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org]
>> On Behalf Of Fred Isaman
>> Sent: Monday, June 13, 2011 10:44 PM
>> To: Jim Rees
>> Cc: linux-nfs@vger.kernel.org; peter honeyman
>> Subject: Re: [PATCH 04/34] pnfs: hook nfs_write_begin/end to allow layout driver
>> manipulation
>>
>> On Sun, Jun 12, 2011 at 7:43 PM, Jim Rees <rees@umich.edu> wrote:
>> > From: Peng Tao <bergwolf@gmail.com>
>> >
>> > Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> > Reported-by: Alexandros Batsakis <batsakis@netapp.com>
>> > Signed-off-by: Andy Adamson <andros@netapp.com>
>> > Signed-off-by: Fred Isaman <iisaman@netapp.com>
>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>> > Signed-off-by: Peng Tao <bergwolf@gmail.com>
>> > ---
>> >  fs/nfs/file.c          |   26 ++++++++++-
>> >  fs/nfs/pnfs.c          |   41 +++++++++++++++++
>> >  fs/nfs/pnfs.h          |  115
>> ++++++++++++++++++++++++++++++++++++++++++++++++
>> >  fs/nfs/write.c         |   12 +++--
>> >  include/linux/nfs_fs.h |    3 +-
>> >  5 files changed, 189 insertions(+), 8 deletions(-)
>> >
>> > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > index 2f093ed..1768762 100644
>> > --- a/fs/nfs/file.c
>> > +++ b/fs/nfs/file.c
>> > @@ -384,12 +384,15 @@ static int nfs_write_begin(struct file *file, struct
>> address_space *mapping,
>> >        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
>> >        struct page *page;
>> >        int once_thru = 0;
>> > +       struct pnfs_layout_segment *lseg;
>> >
>> >        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
>> >                file->f_path.dentry->d_parent->d_name.name,
>> >                file->f_path.dentry->d_name.name,
>> >                mapping->host->i_ino, len, (long long) pos);
>> > -
>> > +       lseg = pnfs_update_layout(mapping->host,
>> > +                                 nfs_file_open_context(file),
>> > +                                 pos, len, IOMODE_RW, GFP_NOFS);
>>
>>
>> This looks like it is left over from before the rearrangements done to
>> where pnfs_update_layout.
>> In particular, we don't want to hold the reference on the lseg from
>> here until flush time.  And there
>> seems to be no reason to.  If the client needs a layout to deal with
>> read-in here, it should instead
>> trigger the nfs_want_read_modify_write clause.
> Yes, you are right. Directly calling pnfs_update_layout here can be avoided.
> But it seems triggering nfs_want_read_modify_write will acquire a read-only layout segment via readpage code path.
> For write, client will need a read-write layout segment so it would mean two layoutget for each new segment (one in nfs_readpage and one at flush time). It may not be good for performance.
> Does current generic code have method to avoid this?
>
> Thanks,
> Tao
>

No.  However, note that this only hits in the case where you are doing
subpage writes.

Fred
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peng Tao June 14, 2011, 3:53 p.m. UTC | #4
On Tue, Jun 14, 2011 at 10:05 PM, Fred Isaman <iisaman@netapp.com> wrote:
> On Tue, Jun 14, 2011 at 7:01 AM,  <tao.peng@emc.com> wrote:
>> Hi, Fred,
>>
>>> -----Original Message-----
>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org]
>>> On Behalf Of Fred Isaman
>>> Sent: Monday, June 13, 2011 10:44 PM
>>> To: Jim Rees
>>> Cc: linux-nfs@vger.kernel.org; peter honeyman
>>> Subject: Re: [PATCH 04/34] pnfs: hook nfs_write_begin/end to allow layout driver
>>> manipulation
>>>
>>> On Sun, Jun 12, 2011 at 7:43 PM, Jim Rees <rees@umich.edu> wrote:
>>> > From: Peng Tao <bergwolf@gmail.com>
>>> >
>>> > Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
>>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>> > Reported-by: Alexandros Batsakis <batsakis@netapp.com>
>>> > Signed-off-by: Andy Adamson <andros@netapp.com>
>>> > Signed-off-by: Fred Isaman <iisaman@netapp.com>
>>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>> > Signed-off-by: Peng Tao <bergwolf@gmail.com>
>>> > ---
>>> >  fs/nfs/file.c          |   26 ++++++++++-
>>> >  fs/nfs/pnfs.c          |   41 +++++++++++++++++
>>> >  fs/nfs/pnfs.h          |  115
>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>> >  fs/nfs/write.c         |   12 +++--
>>> >  include/linux/nfs_fs.h |    3 +-
>>> >  5 files changed, 189 insertions(+), 8 deletions(-)
>>> >
>>> > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>>> > index 2f093ed..1768762 100644
>>> > --- a/fs/nfs/file.c
>>> > +++ b/fs/nfs/file.c
>>> > @@ -384,12 +384,15 @@ static int nfs_write_begin(struct file *file, struct
>>> address_space *mapping,
>>> >        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
>>> >        struct page *page;
>>> >        int once_thru = 0;
>>> > +       struct pnfs_layout_segment *lseg;
>>> >
>>> >        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
>>> >                file->f_path.dentry->d_parent->d_name.name,
>>> >                file->f_path.dentry->d_name.name,
>>> >                mapping->host->i_ino, len, (long long) pos);
>>> > -
>>> > +       lseg = pnfs_update_layout(mapping->host,
>>> > +                                 nfs_file_open_context(file),
>>> > +                                 pos, len, IOMODE_RW, GFP_NOFS);
>>>
>>>
>>> This looks like it is left over from before the rearrangements done to
>>> where pnfs_update_layout.
>>> In particular, we don't want to hold the reference on the lseg from
>>> here until flush time.  And there
>>> seems to be no reason to.  If the client needs a layout to deal with
>>> read-in here, it should instead
>>> trigger the nfs_want_read_modify_write clause.
>> Yes, you are right. Directly calling pnfs_update_layout here can be avoided.
>> But it seems triggering nfs_want_read_modify_write will acquire a read-only layout segment via readpage code path.
>> For write, client will need a read-write layout segment so it would mean two layoutget for each new segment (one in nfs_readpage and one at flush time). It may not be good for performance.
>> Does current generic code have method to avoid this?
>>
>> Thanks,
>> Tao
>>
>
> No.  However, note that this only hits in the case where you are doing
> subpage writes.
block layout driver need the segment to determine if it should dirty
other pages in the same fsblock based on if it is a first write to an
INVALID extent. So it still hits whenever an fsblock is dirtied for
the first time.

Thanks,
Tao

>
> Fred
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Fred Isaman June 14, 2011, 4:02 p.m. UTC | #5
On Tue, Jun 14, 2011 at 11:53 AM, Peng Tao <bergwolf@gmail.com> wrote:
> On Tue, Jun 14, 2011 at 10:05 PM, Fred Isaman <iisaman@netapp.com> wrote:
>> On Tue, Jun 14, 2011 at 7:01 AM,  <tao.peng@emc.com> wrote:
>>> Hi, Fred,
>>>
>>>> -----Original Message-----
>>>> From: linux-nfs-owner@vger.kernel.org [mailto:linux-nfs-owner@vger.kernel.org]
>>>> On Behalf Of Fred Isaman
>>>> Sent: Monday, June 13, 2011 10:44 PM
>>>> To: Jim Rees
>>>> Cc: linux-nfs@vger.kernel.org; peter honeyman
>>>> Subject: Re: [PATCH 04/34] pnfs: hook nfs_write_begin/end to allow layout driver
>>>> manipulation
>>>>
>>>> On Sun, Jun 12, 2011 at 7:43 PM, Jim Rees <rees@umich.edu> wrote:
>>>> > From: Peng Tao <bergwolf@gmail.com>
>>>> >
>>>> > Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
>>>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>> > Reported-by: Alexandros Batsakis <batsakis@netapp.com>
>>>> > Signed-off-by: Andy Adamson <andros@netapp.com>
>>>> > Signed-off-by: Fred Isaman <iisaman@netapp.com>
>>>> > Signed-off-by: Benny Halevy <bhalevy@panasas.com>
>>>> > Signed-off-by: Peng Tao <bergwolf@gmail.com>
>>>> > ---
>>>> >  fs/nfs/file.c          |   26 ++++++++++-
>>>> >  fs/nfs/pnfs.c          |   41 +++++++++++++++++
>>>> >  fs/nfs/pnfs.h          |  115
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++
>>>> >  fs/nfs/write.c         |   12 +++--
>>>> >  include/linux/nfs_fs.h |    3 +-
>>>> >  5 files changed, 189 insertions(+), 8 deletions(-)
>>>> >
>>>> > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>>>> > index 2f093ed..1768762 100644
>>>> > --- a/fs/nfs/file.c
>>>> > +++ b/fs/nfs/file.c
>>>> > @@ -384,12 +384,15 @@ static int nfs_write_begin(struct file *file, struct
>>>> address_space *mapping,
>>>> >        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
>>>> >        struct page *page;
>>>> >        int once_thru = 0;
>>>> > +       struct pnfs_layout_segment *lseg;
>>>> >
>>>> >        dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
>>>> >                file->f_path.dentry->d_parent->d_name.name,
>>>> >                file->f_path.dentry->d_name.name,
>>>> >                mapping->host->i_ino, len, (long long) pos);
>>>> > -
>>>> > +       lseg = pnfs_update_layout(mapping->host,
>>>> > +                                 nfs_file_open_context(file),
>>>> > +                                 pos, len, IOMODE_RW, GFP_NOFS);
>>>>
>>>>
>>>> This looks like it is left over from before the rearrangements done to
>>>> where pnfs_update_layout.
>>>> In particular, we don't want to hold the reference on the lseg from
>>>> here until flush time.  And there
>>>> seems to be no reason to.  If the client needs a layout to deal with
>>>> read-in here, it should instead
>>>> trigger the nfs_want_read_modify_write clause.
>>> Yes, you are right. Directly calling pnfs_update_layout here can be avoided.
>>> But it seems triggering nfs_want_read_modify_write will acquire a read-only layout segment via readpage code path.
>>> For write, client will need a read-write layout segment so it would mean two layoutget for each new segment (one in nfs_readpage and one at flush time). It may not be good for performance.
>>> Does current generic code have method to avoid this?
>>>
>>> Thanks,
>>> Tao
>>>
>>
>> No.  However, note that this only hits in the case where you are doing
>> subpage writes.
> block layout driver need the segment to determine if it should dirty
> other pages in the same fsblock based on if it is a first write to an
> INVALID extent. So it still hits whenever an fsblock is dirtied for
> the first time.
>
> Thanks,
> Tao
>

Why can't this be delayed until flush?

Fred
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2f093ed..1768762 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -384,12 +384,15 @@  static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct page *page;
 	int once_thru = 0;
+	struct pnfs_layout_segment *lseg;
 
 	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
-
+	lseg = pnfs_update_layout(mapping->host,
+				  nfs_file_open_context(file),
+				  pos, len, IOMODE_RW, GFP_NOFS);
 start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
@@ -409,6 +412,9 @@  start:
 	if (ret) {
 		unlock_page(page);
 		page_cache_release(page);
+		*pagep = NULL;
+		*fsdata = NULL;
+		goto out;
 	} else if (!once_thru &&
 		   nfs_want_read_modify_write(file, page, pos, len)) {
 		once_thru = 1;
@@ -417,6 +423,12 @@  start:
 		if (!ret)
 			goto start;
 	}
+	ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata);
+ out:
+	if (ret) {
+		put_lseg(lseg);
+		*fsdata = NULL;
+	}
 	return ret;
 }
 
@@ -426,6 +438,7 @@  static int nfs_write_end(struct file *file, struct address_space *mapping,
 {
 	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
 	int status;
+	struct pnfs_layout_segment *lseg;
 
 	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
@@ -452,10 +465,17 @@  static int nfs_write_end(struct file *file, struct address_space *mapping,
 			zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
 	}
 
-	status = nfs_updatepage(file, page, offset, copied);
+	lseg = nfs4_pull_lseg_from_fsdata(file, fsdata);
+	status = pnfs_write_end(file, page, pos, len, copied, lseg);
+	if (status)
+		goto out;
+	status = nfs_updatepage(file, page, offset, copied, lseg, fsdata);
 
+out:
 	unlock_page(page);
 	page_cache_release(page);
+	pnfs_write_end_cleanup(file, fsdata);
+	put_lseg(lseg);
 
 	if (status < 0)
 		return status;
@@ -577,7 +597,7 @@  static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	ret = VM_FAULT_LOCKED;
 	if (nfs_flush_incompatible(filp, page) == 0 &&
-	    nfs_updatepage(filp, page, 0, pagelen) == 0)
+	    nfs_updatepage(filp, page, 0, pagelen, NULL, NULL) == 0)
 		goto out;
 
 	ret = VM_FAULT_SIGBUS;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f03a5e0..e693718 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1138,6 +1138,41 @@  pnfs_try_to_write_data(struct nfs_write_data *wdata,
 }
 
 /*
+ * This gives the layout driver an opportunity to read in page "around"
+ * the data to be written.  It returns 0 on success, otherwise an error code
+ * which will either be passed up to user, or ignored if
+ * some previous part of write succeeded.
+ * Note the range [pos, pos+len-1] is entirely within the page.
+ */
+int _pnfs_write_begin(struct inode *inode, struct page *page,
+		      loff_t pos, unsigned len,
+		      struct pnfs_layout_segment *lseg,
+		      struct pnfs_fsdata **fsdata)
+{
+	struct pnfs_fsdata *data;
+	int status = 0;
+
+	dprintk("--> %s: pos=%llu len=%u\n",
+		__func__, (unsigned long long)pos, len);
+	data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
+	if (!data) {
+		status = -ENOMEM;
+		goto out;
+	}
+	data->lseg = lseg; /* refcount passed into data to be managed there */
+	status = NFS_SERVER(inode)->pnfs_curr_ld->write_begin(
+						lseg, page, pos, len, data);
+	if (status) {
+		kfree(data);
+		data = NULL;
+	}
+out:
+	*fsdata = data;
+	dprintk("<-- %s: status=%d\n", __func__, status);
+	return status;
+}
+
+/*
  * Called by non rpc-based layout drivers
  */
 int
@@ -1237,6 +1272,12 @@  pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata)
+{
+	/* lseg refcounting handled directly in nfs_write_end */
+	kfree(fsdata);
+}
+
 /*
  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a3fc0f2..525ec55 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -54,6 +54,12 @@  enum pnfs_try_status {
 	PNFS_NOT_ATTEMPTED = 1,
 };
 
+struct pnfs_fsdata {
+	struct pnfs_layout_segment *lseg;
+	int bypass_eof;
+	void *private;
+};
+
 #ifdef CONFIG_NFS_V4_1
 
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -106,6 +112,14 @@  struct pnfs_layoutdriver_type {
 	 */
 	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
 	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+	int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
+			    loff_t pos, unsigned count,
+			    struct pnfs_fsdata *fsdata);
+	int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
+			 unsigned count, unsigned copied,
+			 struct pnfs_layout_segment *lseg);
+	void (*write_end_cleanup)(struct file *filp,
+				  struct pnfs_fsdata *fsdata);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
 
@@ -175,6 +189,7 @@  enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
 					    const struct rpc_call_ops *);
 bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+void pnfs_free_fsdata(struct pnfs_fsdata *fsdata);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -186,6 +201,10 @@  void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
 				  struct pnfs_layout_hdr *lo,
 				  struct nfs4_state *open_state);
+int _pnfs_write_begin(struct inode *inode, struct page *page,
+		      loff_t pos, unsigned len,
+		      struct pnfs_layout_segment *lseg,
+		      struct pnfs_fsdata **fsdata);
 int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
 				struct pnfs_layout_range *recall_range);
@@ -287,6 +306,13 @@  static inline void pnfs_clear_request_commit(struct nfs_page *req)
 		put_lseg(req->wb_commit_lseg);
 }
 
+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
+			       struct pnfs_fsdata *fsdata)
+{
+	return !fsdata  || ((struct pnfs_layout_segment *)fsdata == lseg) ||
+		!fsdata->bypass_eof;
+}
+
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -297,6 +323,49 @@  pnfs_ld_layoutret_on_setattr(struct inode *inode)
 		PNFS_LAYOUTRET_ON_SETATTR;
 }
 
+static inline int pnfs_write_begin(struct file *filp, struct page *page,
+				   loff_t pos, unsigned len,
+				   struct pnfs_layout_segment *lseg,
+				   void **fsdata)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	int status = 0;
+
+	*fsdata = lseg;
+	if (lseg && nfss->pnfs_curr_ld->write_begin)
+		status = _pnfs_write_begin(inode, page, pos, len, lseg,
+					   (struct pnfs_fsdata **) fsdata);
+	return status;
+}
+
+/* CAREFUL - what happens if copied < len??? */
+static inline int pnfs_write_end(struct file *filp, struct page *page,
+				 loff_t pos, unsigned len, unsigned copied,
+				 struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+	if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_end)
+		return nfss->pnfs_curr_ld->write_end(inode, page, pos, len,
+						     copied, lseg);
+	else
+		return 0;
+}
+
+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
+{
+	struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
+	if (fsdata && nfss->pnfs_curr_ld) {
+		if (nfss->pnfs_curr_ld->write_end_cleanup)
+			nfss->pnfs_curr_ld->write_end_cleanup(filp, fsdata);
+		if (nfss->pnfs_curr_ld->write_begin)
+			pnfs_free_fsdata(fsdata);
+	}
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
@@ -317,6 +386,19 @@  static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
 		pgio->pg_test = ld->pg_test;
 }
 
+static inline struct pnfs_layout_segment *
+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
+{
+	if (fsdata) {
+		struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode);
+
+		if (nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->write_begin)
+			return ((struct pnfs_fsdata *) fsdata)->lseg;
+		return (struct pnfs_layout_segment *)fsdata;
+	}
+	return NULL;
+}
+
 #else  /* CONFIG_NFS_V4_1 */
 
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -345,6 +427,12 @@  pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 	return NULL;
 }
 
+static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg,
+			       struct pnfs_fsdata *fsdata)
+{
+	return 1;
+}
+
 static inline enum pnfs_try_status
 pnfs_try_to_read_data(struct nfs_read_data *data,
 		      const struct rpc_call_ops *call_ops)
@@ -364,6 +452,26 @@  static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline int pnfs_write_begin(struct file *filp, struct page *page,
+				   loff_t pos, unsigned len,
+				   struct pnfs_layout_segment *lseg,
+				   void **fsdata)
+{
+	*fsdata = NULL;
+	return 0;
+}
+
+static inline int pnfs_write_end(struct file *filp, struct page *page,
+				 loff_t pos, unsigned len, unsigned copied,
+				 struct pnfs_layout_segment *lseg)
+{
+	return 0;
+}
+
+static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata)
+{
+}
+
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
 {
@@ -435,6 +543,13 @@  static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
 {
 }
+
+static inline struct pnfs_layout_segment *
+nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e268e3b..75e2a6b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -673,7 +673,9 @@  out:
 }
 
 static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
-		unsigned int offset, unsigned int count)
+		unsigned int offset, unsigned int count,
+		struct pnfs_layout_segment *lseg, void *fsdata)
+
 {
 	struct nfs_page	*req;
 
@@ -681,7 +683,8 @@  static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	/* Update file length */
-	nfs_grow_file(page, offset, count);
+	if (pnfs_grow_ok(lseg, fsdata))
+		nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
 	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
@@ -734,7 +737,8 @@  static int nfs_write_pageuptodate(struct page *page, struct inode *inode)
  * things with a page scheduled for an RPC call (e.g. invalidate it).
  */
 int nfs_updatepage(struct file *file, struct page *page,
-		unsigned int offset, unsigned int count)
+		unsigned int offset, unsigned int count,
+		struct pnfs_layout_segment *lseg, void *fsdata)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode	*inode = page->mapping->host;
@@ -759,7 +763,7 @@  int nfs_updatepage(struct file *file, struct page *page,
 		offset = 0;
 	}
 
-	status = nfs_writepage_setup(ctx, page, offset, count);
+	status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata);
 	if (status < 0)
 		nfs_set_pageerror(page);
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1b93b9c..be1ac1d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -510,7 +510,8 @@  extern int  nfs_congestion_kb;
 extern int  nfs_writepage(struct page *page, struct writeback_control *wbc);
 extern int  nfs_writepages(struct address_space *, struct writeback_control *);
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
-extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
+extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int,
+			struct pnfs_layout_segment *, void *);
 extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
 
 /*