nfs: track writeback errors with errseq_t
diff mbox

Message ID 20170720194227.7830-1-jlayton@kernel.org
State New
Headers show

Commit Message

Jeff Layton July 20, 2017, 7:42 p.m. UTC
From: Jeff Layton <jlayton@redhat.com>

There is some ambiguity in nfs about how writeback errors are tracked.

For instance, nfs_pageio_add_request calls mapping_set_error when the
add fails, but we track errors that occur after adding the request
with a dedicated int error in the open context.

Now that we have better infrastructure for the vfs layer, this
latter int is now unnecessary. Just have nfs_context_set_write_error set
the error in the mapping when one occurs.

Have NFS use file_write_and_wait_range to initiate and wait on writeback
of the data, and then check again after issuing the commit(s).

With this, we also don't need to pay attention to the ERROR_WRITE
flag for reporting, and just clear it to indicate to subsequent
writers that they should try to go asynchronous again.

In nfs_page_async_flush, sample the error before locking and joining
the requests, and check for errors since that point.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/nfs/file.c          | 24 +++++++++++-------------
 fs/nfs/inode.c         |  3 +--
 fs/nfs/write.c         |  8 ++++++--
 include/linux/nfs_fs.h |  1 -
 4 files changed, 18 insertions(+), 18 deletions(-)

I have a baling wire and duct tape solution for testing this with
xfstests (using iptables REJECT targets and soft mounts). This seems to
make nfs do the right thing.

Comments

Jeff Layton Aug. 25, 2017, 5:59 p.m. UTC | #1
On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> From: Jeff Layton <jlayton@redhat.com>
> 
> There is some ambiguity in nfs about how writeback errors are tracked.
> 
> For instance, nfs_pageio_add_request calls mapping_set_error when the
> add fails, but we track errors that occur after adding the request
> with a dedicated int error in the open context.
> 
> Now that we have better infrastructure for the vfs layer, this
> latter int is now unnecessary. Just have nfs_context_set_write_error set
> the error in the mapping when one occurs.
> 
> Have NFS use file_write_and_wait_range to initiate and wait on writeback
> of the data, and then check again after issuing the commit(s).
> 
> With this, we also don't need to pay attention to the ERROR_WRITE
> flag for reporting, and just clear it to indicate to subsequent
> writers that they should try to go asynchronous again.
> 
> In nfs_page_async_flush, sample the error before locking and joining
> the requests, and check for errors since that point.
> 
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  fs/nfs/file.c          | 24 +++++++++++-------------
>  fs/nfs/inode.c         |  3 +--
>  fs/nfs/write.c         |  8 ++++++--
>  include/linux/nfs_fs.h |  1 -
>  4 files changed, 18 insertions(+), 18 deletions(-)
> 
> I have a baling wire and duct tape solution for testing this with
> xfstests (using iptables REJECT targets and soft mounts). This seems to
> make nfs do the right thing.
> 
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 5713eb32a45e..15d3c6faafd3 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>  {
>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>  	struct inode *inode = file_inode(file);
> -	int have_error, do_resend, status;
> -	int ret = 0;
> +	int do_resend, status;
> +	int ret;
>  
>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>  
>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> -	if (have_error) {
> -		ret = xchg(&ctx->error, 0);
> -		if (ret)
> -			goto out;
> -	}
> -	if (status < 0) {
> +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> +
> +	/* Recheck and advance after the commit */
> +	status = file_check_and_advance_wb_err(file);
> +	if (!ret)
>  		ret = status;
> +	if (ret)
>  		goto out;
> -	}
> +
>  	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>  	if (do_resend)
>  		ret = -EAGAIN;
> @@ -247,7 +245,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>  	trace_nfs_fsync_enter(inode);
>  
>  	do {
> -		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
> +		ret = file_write_and_wait_range(file, start, end);
>  		if (ret != 0)
>  			break;
>  		ret = nfs_file_fsync_commit(file, start, end, datasync);
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 109279d6d91b..c48f673c5cc9 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -900,7 +900,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
>  	ctx->state = NULL;
>  	ctx->mode = f_mode;
>  	ctx->flags = 0;
> -	ctx->error = 0;
>  	ctx->flock_owner = (fl_owner_t)filp;
>  	nfs_init_lock_context(&ctx->lock_context);
>  	ctx->lock_context.open_context = ctx;
> @@ -1009,7 +1008,7 @@ void nfs_file_clear_open_context(struct file *filp)
>  		 * We fatal error on write before. Try to writeback
>  		 * every page again.
>  		 */
> -		if (ctx->error < 0)
> +		if (filemap_check_wb_err(inode->i_mapping, filp->f_wb_err))
>  			invalidate_inode_pages2(inode->i_mapping);
>  		filp->private_data = NULL;
>  		spin_lock(&inode->i_lock);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b1af5dee5e0a..c2fcaf07cd24 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -149,7 +149,9 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
>  
>  static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
>  {
> -	ctx->error = error;
> +	struct inode *inode = d_inode(ctx->dentry);
> +
> +	mapping_set_error(inode->i_mapping, error);
>  	smp_wmb();
>  	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>  }
> @@ -628,6 +630,8 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
>  {
>  	struct nfs_page *req;
>  	int ret = 0;
> +	struct address_space *mapping = page_file_mapping(page);
> +	errseq_t since = filemap_sample_wb_err(mapping);
>  
>  	req = nfs_lock_and_join_requests(page, nonblock);
>  	if (!req)
> @@ -641,7 +645,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
>  
>  	ret = 0;
>  	/* If there is a fatal error that covers this write, just exit */
> -	if (nfs_error_is_fatal_on_server(req->wb_context->error))
> +	if (nfs_error_is_fatal_on_server(filemap_check_wb_err(mapping, since)))
>  		goto out_launder;
>  
>  	if (!nfs_pageio_add_request(pgio, req)) {
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index e52cc55ac300..a96b0bd52b32 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -77,7 +77,6 @@ struct nfs_open_context {
>  #define NFS_CONTEXT_RESEND_WRITES	(1)
>  #define NFS_CONTEXT_BAD			(2)
>  #define NFS_CONTEXT_UNLOCK	(3)
> -	int error;
>  
>  	struct list_head list;
>  	struct nfs4_threshold	*mdsthreshold;

Anna and Trond,

Ping? I haven't seen any word on this patch, and it hasn't shown up in
any branches. Do you have concerns with it, or is this good to go for
v4.14?

Thanks,
NeilBrown Aug. 27, 2017, 11:24 p.m. UTC | #2
On Fri, Aug 25 2017, Jeff Layton wrote:

> On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> From: Jeff Layton <jlayton@redhat.com>
>> 
>> There is some ambiguity in nfs about how writeback errors are tracked.
>> 
>> For instance, nfs_pageio_add_request calls mapping_set_error when the
>> add fails, but we track errors that occur after adding the request
>> with a dedicated int error in the open context.
>> 
>> Now that we have better infrastructure for the vfs layer, this
>> latter int is now unnecessary. Just have nfs_context_set_write_error set
>> the error in the mapping when one occurs.
>> 
>> Have NFS use file_write_and_wait_range to initiate and wait on writeback
>> of the data, and then check again after issuing the commit(s).
>> 
>> With this, we also don't need to pay attention to the ERROR_WRITE
>> flag for reporting, and just clear it to indicate to subsequent
>> writers that they should try to go asynchronous again.
>> 
>> In nfs_page_async_flush, sample the error before locking and joining
>> the requests, and check for errors since that point.
>> 
>> Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> ---
>>  fs/nfs/file.c          | 24 +++++++++++-------------
>>  fs/nfs/inode.c         |  3 +--
>>  fs/nfs/write.c         |  8 ++++++--
>>  include/linux/nfs_fs.h |  1 -
>>  4 files changed, 18 insertions(+), 18 deletions(-)
>> 
>> I have a baling wire and duct tape solution for testing this with
>> xfstests (using iptables REJECT targets and soft mounts). This seems to
>> make nfs do the right thing.
>> 
>> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> index 5713eb32a45e..15d3c6faafd3 100644
>> --- a/fs/nfs/file.c
>> +++ b/fs/nfs/file.c
>> @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
>>  {
>>  	struct nfs_open_context *ctx = nfs_file_open_context(file);
>>  	struct inode *inode = file_inode(file);
>> -	int have_error, do_resend, status;
>> -	int ret = 0;
>> +	int do_resend, status;
>> +	int ret;
>>  
>>  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
>>  
>>  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>>  	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> -	if (have_error) {
>> -		ret = xchg(&ctx->error, 0);
>> -		if (ret)
>> -			goto out;
>> -	}
>> -	if (status < 0) {
>> +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> +
>> +	/* Recheck and advance after the commit */
>> +	status = file_check_and_advance_wb_err(file);

This change makes the code inconsistent with the comment above the
function, which still references ctx->error.  The intent of the comment
is still correct, but the details have changed.

Also, there is a call to mapping_set_error() in
nfs_pageio_add_request().
I wonder if that should be changed to
  nfs_context_set_write_error(req->wb_context, desc->pg_error)
??

Otherwise, patch looks good to me.
Thanks,
NeilBrown
Jeff Layton Aug. 28, 2017, 11:47 a.m. UTC | #3
On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> On Fri, Aug 25 2017, Jeff Layton wrote:
> 
> > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > From: Jeff Layton <jlayton@redhat.com>
> > > 
> > > There is some ambiguity in nfs about how writeback errors are
> > > tracked.
> > > 
> > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > the
> > > add fails, but we track errors that occur after adding the
> > > request
> > > with a dedicated int error in the open context.
> > > 
> > > Now that we have better infrastructure for the vfs layer, this
> > > latter int is now unnecessary. Just have
> > > nfs_context_set_write_error set
> > > the error in the mapping when one occurs.
> > > 
> > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > writeback
> > > of the data, and then check again after issuing the commit(s).
> > > 
> > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > flag for reporting, and just clear it to indicate to subsequent
> > > writers that they should try to go asynchronous again.
> > > 
> > > In nfs_page_async_flush, sample the error before locking and
> > > joining
> > > the requests, and check for errors since that point.
> > > 
> > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > ---
> > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > >  fs/nfs/inode.c         |  3 +--
> > >  fs/nfs/write.c         |  8 ++++++--
> > >  include/linux/nfs_fs.h |  1 -
> > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > 
> > > I have a baling wire and duct tape solution for testing this with
> > > xfstests (using iptables REJECT targets and soft mounts). This
> > > seems to
> > > make nfs do the right thing.
> > > 
> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > index 5713eb32a45e..15d3c6faafd3 100644
> > > --- a/fs/nfs/file.c
> > > +++ b/fs/nfs/file.c
> > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > loff_t start, loff_t end, int datasync)
> > >  {
> > >  	struct nfs_open_context *ctx =
> > > nfs_file_open_context(file);
> > >  	struct inode *inode = file_inode(file);
> > > -	int have_error, do_resend, status;
> > > -	int ret = 0;
> > > +	int do_resend, status;
> > > +	int ret;
> > >  
> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > datasync);
> > >  
> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > >  	do_resend =
> > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > &ctx->flags);
> > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > >flags);
> > > -	if (have_error) {
> > > -		ret = xchg(&ctx->error, 0);
> > > -		if (ret)
> > > -			goto out;
> > > -	}
> > > -	if (status < 0) {
> > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > +
> > > +	/* Recheck and advance after the commit */
> > > +	status = file_check_and_advance_wb_err(file);
> 
> This change makes the code inconsistent with the comment above the
> function, which still references ctx->error.  The intent of the
> comment
> is still correct, but the details have changed.
> 

Good catch. I'll fix that up in a respin.

> Also, there is a call to mapping_set_error() in
> nfs_pageio_add_request().
> I wonder if that should be changed to
>   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> ??
> 

Trickier question...

I'm not quite sure what semantics we're looking for with
NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
synchronous, but I'm not quite sure why it gets cleared the way it
does. It's set on any error but cleared before issuing a commit.

I added a similar flag to Ceph inodes recently, but only clear it when
a write succeeds. Wouldn't that make more sense here as well?
NeilBrown Aug. 29, 2017, 1:23 a.m. UTC | #4
On Mon, Aug 28 2017, Jeff Layton wrote:

> On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> On Fri, Aug 25 2017, Jeff Layton wrote:
>> 
>> > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > From: Jeff Layton <jlayton@redhat.com>
>> > > 
>> > > There is some ambiguity in nfs about how writeback errors are
>> > > tracked.
>> > > 
>> > > For instance, nfs_pageio_add_request calls mapping_set_error when
>> > > the
>> > > add fails, but we track errors that occur after adding the
>> > > request
>> > > with a dedicated int error in the open context.
>> > > 
>> > > Now that we have better infrastructure for the vfs layer, this
>> > > latter int is now unnecessary. Just have
>> > > nfs_context_set_write_error set
>> > > the error in the mapping when one occurs.
>> > > 
>> > > Have NFS use file_write_and_wait_range to initiate and wait on
>> > > writeback
>> > > of the data, and then check again after issuing the commit(s).
>> > > 
>> > > With this, we also don't need to pay attention to the ERROR_WRITE
>> > > flag for reporting, and just clear it to indicate to subsequent
>> > > writers that they should try to go asynchronous again.
>> > > 
>> > > In nfs_page_async_flush, sample the error before locking and
>> > > joining
>> > > the requests, and check for errors since that point.
>> > > 
>> > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > ---
>> > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > >  fs/nfs/inode.c         |  3 +--
>> > >  fs/nfs/write.c         |  8 ++++++--
>> > >  include/linux/nfs_fs.h |  1 -
>> > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > 
>> > > I have a baling wire and duct tape solution for testing this with
>> > > xfstests (using iptables REJECT targets and soft mounts). This
>> > > seems to
>> > > make nfs do the right thing.
>> > > 
>> > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > --- a/fs/nfs/file.c
>> > > +++ b/fs/nfs/file.c
>> > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
>> > > loff_t start, loff_t end, int datasync)
>> > >  {
>> > >  	struct nfs_open_context *ctx =
>> > > nfs_file_open_context(file);
>> > >  	struct inode *inode = file_inode(file);
>> > > -	int have_error, do_resend, status;
>> > > -	int ret = 0;
>> > > +	int do_resend, status;
>> > > +	int ret;
>> > >  
>> > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
>> > > datasync);
>> > >  
>> > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > >  	do_resend =
>> > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > &ctx->flags);
>> > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > >flags);
>> > > -	if (have_error) {
>> > > -		ret = xchg(&ctx->error, 0);
>> > > -		if (ret)
>> > > -			goto out;
>> > > -	}
>> > > -	if (status < 0) {
>> > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > +
>> > > +	/* Recheck and advance after the commit */
>> > > +	status = file_check_and_advance_wb_err(file);
>> 
>> This change makes the code inconsistent with the comment above the
>> function, which still references ctx->error.  The intent of the
>> comment
>> is still correct, but the details have changed.
>> 
>
> Good catch. I'll fix that up in a respin.
>
>> Also, there is a call to mapping_set_error() in
>> nfs_pageio_add_request().
>> I wonder if that should be changed to
>>   nfs_context_set_write_error(req->wb_context, desc->pg_error)
>> ??
>> 
>
> Trickier question...
>
> I'm not quite sure what semantics we're looking for with
> NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> synchronous, but I'm not quite sure why it gets cleared the way it
> does. It's set on any error but cleared before issuing a commit.
>
> I added a similar flag to Ceph inodes recently, but only clear it when
> a write succeeds. Wouldn't that make more sense here as well?

It is a bit hard to wrap one's mind around.

In the original code (commit 7b159fc18d417980) it looks like:
 - test-and-clear bit
 - write and sync
 - test-bit

This does, I think, seem safer than "clear on successful write" as the
writes could complete out-of-order and I wouldn't be surprised if the
unsuccessful ones completed with an error before the successful one -
particularly with an error like EDQUOT.

However the current code does the writes before the test-and-clear, and
only does the commit afterwards.  That makes it less clear why the
current sequence is a good idea.

However ... nfs_file_fsync_commit() is only called if
filemap_write_and_wait_range() returned with success, so we only clear
the flag after successful writes(?).

Oh....
This patch from me:

Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")

seems to have been reverted by

Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")

which probably isn't good.  It appears that this code is very fragile
and easily broken.
Maybe we need to work out exactly what is required, and document it - so
we can stop breaking it.
Or maybe we need some unit tests.....

NeilBrown
Jeff Layton Aug. 29, 2017, 10:54 a.m. UTC | #5
On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> On Mon, Aug 28 2017, Jeff Layton wrote:
> 
> > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > 
> > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > 
> > > > > There is some ambiguity in nfs about how writeback errors are
> > > > > tracked.
> > > > > 
> > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > > > the
> > > > > add fails, but we track errors that occur after adding the
> > > > > request
> > > > > with a dedicated int error in the open context.
> > > > > 
> > > > > Now that we have better infrastructure for the vfs layer, this
> > > > > latter int is now unnecessary. Just have
> > > > > nfs_context_set_write_error set
> > > > > the error in the mapping when one occurs.
> > > > > 
> > > > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > > > writeback
> > > > > of the data, and then check again after issuing the commit(s).
> > > > > 
> > > > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > > > flag for reporting, and just clear it to indicate to subsequent
> > > > > writers that they should try to go asynchronous again.
> > > > > 
> > > > > In nfs_page_async_flush, sample the error before locking and
> > > > > joining
> > > > > the requests, and check for errors since that point.
> > > > > 
> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > ---
> > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > >  fs/nfs/inode.c         |  3 +--
> > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > >  include/linux/nfs_fs.h |  1 -
> > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > 
> > > > > I have a baling wire and duct tape solution for testing this with
> > > > > xfstests (using iptables REJECT targets and soft mounts). This
> > > > > seems to
> > > > > make nfs do the right thing.
> > > > > 
> > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > --- a/fs/nfs/file.c
> > > > > +++ b/fs/nfs/file.c
> > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > > > loff_t start, loff_t end, int datasync)
> > > > >  {
> > > > >  	struct nfs_open_context *ctx =
> > > > > nfs_file_open_context(file);
> > > > >  	struct inode *inode = file_inode(file);
> > > > > -	int have_error, do_resend, status;
> > > > > -	int ret = 0;
> > > > > +	int do_resend, status;
> > > > > +	int ret;
> > > > >  
> > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > > > datasync);
> > > > >  
> > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > >  	do_resend =
> > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > &ctx->flags);
> > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > flags);
> > > > > 
> > > > > -	if (have_error) {
> > > > > -		ret = xchg(&ctx->error, 0);
> > > > > -		if (ret)
> > > > > -			goto out;
> > > > > -	}
> > > > > -	if (status < 0) {
> > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > +
> > > > > +	/* Recheck and advance after the commit */
> > > > > +	status = file_check_and_advance_wb_err(file);
> > > 
> > > This change makes the code inconsistent with the comment above the
> > > function, which still references ctx->error.  The intent of the
> > > comment
> > > is still correct, but the details have changed.
> > > 
> > 
> > Good catch. I'll fix that up in a respin.
> > 
> > > Also, there is a call to mapping_set_error() in
> > > nfs_pageio_add_request().
> > > I wonder if that should be changed to
> > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> > > ??
> > > 
> > 
> > Trickier question...
> > 
> > I'm not quite sure what semantics we're looking for with
> > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > synchronous, but I'm not quite sure why it gets cleared the way it
> > does. It's set on any error but cleared before issuing a commit.
> > 
> > I added a similar flag to Ceph inodes recently, but only clear it when
> > a write succeeds. Wouldn't that make more sense here as well?
> 
> It is a bit hard to wrap one's mind around.
> 
> In the original code (commit 7b159fc18d417980) it looks like:
>  - test-and-clear bit
>  - write and sync
>  - test-bit
> 
> This does, I think, seem safer than "clear on successful write" as the
> writes could complete out-of-order and I wouldn't be surprised if the
> unsuccessful ones completed with an error before the successful one -
> particularly with an error like EDQUOT.
> 
> However the current code does the writes before the test-and-clear, and
> only does the commit afterwards.  That makes it less clear why the
> current sequence is a good idea.
> 
> However ... nfs_file_fsync_commit() is only called if
> filemap_write_and_wait_range() returned with success, so we only clear
> the flag after successful writes(?).
> 
> Oh....
> This patch from me:
> 
> Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
> 
> seems to have been reverted by
> 
> Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
> 
> which probably isn't good.  It appears that this code is very fragile
> and easily broken.
> Maybe we need to work out exactly what is required, and document it - so
> we can stop breaking it.
> Or maybe we need some unit tests.....
> 

Yes, laying out what's necessary for this would be very helpful. We
clearly want to set the flag when an error occurs. Under what
circumstances should we be clearing it?

I'm not sure we can really do much better than clearing it on a
successful write. With Ceph, was that this is just a hint to the write
submission mechanism and we generally aren't too concerned if a few slip
past in either direction.
NeilBrown Sept. 7, 2017, 3:37 a.m. UTC | #6
On Tue, Aug 29 2017, Jeff Layton wrote:

> On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
>> On Mon, Aug 28 2017, Jeff Layton wrote:
>> 
>> > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
>> > > On Fri, Aug 25 2017, Jeff Layton wrote:
>> > > 
>> > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
>> > > > > From: Jeff Layton <jlayton@redhat.com>
>> > > > > 
>> > > > > There is some ambiguity in nfs about how writeback errors are
>> > > > > tracked.
>> > > > > 
>> > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
>> > > > > the
>> > > > > add fails, but we track errors that occur after adding the
>> > > > > request
>> > > > > with a dedicated int error in the open context.
>> > > > > 
>> > > > > Now that we have better infrastructure for the vfs layer, this
>> > > > > latter int is now unnecessary. Just have
>> > > > > nfs_context_set_write_error set
>> > > > > the error in the mapping when one occurs.
>> > > > > 
>> > > > > Have NFS use file_write_and_wait_range to initiate and wait on
>> > > > > writeback
>> > > > > of the data, and then check again after issuing the commit(s).
>> > > > > 
>> > > > > With this, we also don't need to pay attention to the ERROR_WRITE
>> > > > > flag for reporting, and just clear it to indicate to subsequent
>> > > > > writers that they should try to go asynchronous again.
>> > > > > 
>> > > > > In nfs_page_async_flush, sample the error before locking and
>> > > > > joining
>> > > > > the requests, and check for errors since that point.
>> > > > > 
>> > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > > > > ---
>> > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
>> > > > >  fs/nfs/inode.c         |  3 +--
>> > > > >  fs/nfs/write.c         |  8 ++++++--
>> > > > >  include/linux/nfs_fs.h |  1 -
>> > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
>> > > > > 
>> > > > > I have a baling wire and duct tape solution for testing this with
>> > > > > xfstests (using iptables REJECT targets and soft mounts). This
>> > > > > seems to
>> > > > > make nfs do the right thing.
>> > > > > 
>> > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
>> > > > > index 5713eb32a45e..15d3c6faafd3 100644
>> > > > > --- a/fs/nfs/file.c
>> > > > > +++ b/fs/nfs/file.c
>> > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
>> > > > > loff_t start, loff_t end, int datasync)
>> > > > >  {
>> > > > >  	struct nfs_open_context *ctx =
>> > > > > nfs_file_open_context(file);
>> > > > >  	struct inode *inode = file_inode(file);
>> > > > > -	int have_error, do_resend, status;
>> > > > > -	int ret = 0;
>> > > > > +	int do_resend, status;
>> > > > > +	int ret;
>> > > > >  
>> > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
>> > > > > datasync);
>> > > > >  
>> > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
>> > > > >  	do_resend =
>> > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
>> > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
>> > > > > &ctx->flags);
>> > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
>> > > > > > flags);
>> > > > > 
>> > > > > -	if (have_error) {
>> > > > > -		ret = xchg(&ctx->error, 0);
>> > > > > -		if (ret)
>> > > > > -			goto out;
>> > > > > -	}
>> > > > > -	if (status < 0) {
>> > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
>> > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
>> > > > > +
>> > > > > +	/* Recheck and advance after the commit */
>> > > > > +	status = file_check_and_advance_wb_err(file);
>> > > 
>> > > This change makes the code inconsistent with the comment above the
>> > > function, which still references ctx->error.  The intent of the
>> > > comment
>> > > is still correct, but the details have changed.
>> > > 
>> > 
>> > Good catch. I'll fix that up in a respin.
>> > 
>> > > Also, there is a call to mapping_set_error() in
>> > > nfs_pageio_add_request().
>> > > I wonder if that should be changed to
>> > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
>> > > ??
>> > > 
>> > 
>> > Trickier question...
>> > 
>> > I'm not quite sure what semantics we're looking for with
>> > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
>> > synchronous, but I'm not quite sure why it gets cleared the way it
>> > does. It's set on any error but cleared before issuing a commit.
>> > 
>> > I added a similar flag to Ceph inodes recently, but only clear it when
>> > a write succeeds. Wouldn't that make more sense here as well?
>> 
>> It is a bit hard to wrap one's mind around.
>> 
>> In the original code (commit 7b159fc18d417980) it looks like:
>>  - test-and-clear bit
>>  - write and sync
>>  - test-bit
>> 
>> This does, I think, seem safer than "clear on successful write" as the
>> writes could complete out-of-order and I wouldn't be surprised if the
>> unsuccessful ones completed with an error before the successful one -
>> particularly with an error like EDQUOT.
>> 
>> However the current code does the writes before the test-and-clear, and
>> only does the commit afterwards.  That makes it less clear why the
>> current sequence is a good idea.
>> 
>> However ... nfs_file_fsync_commit() is only called if
>> filemap_write_and_wait_range() returned with success, so we only clear
>> the flag after successful writes(?).
>> 
>> Oh....
>> This patch from me:
>> 
>> Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
>> 
>> seems to have been reverted by
>> 
>> Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
>> 
>> which probably isn't good.  It appears that this code is very fragile
>> and easily broken.

On further investigation, I think the problem that I fixed and then we
reintroduced will be fixed again - more permanently - by your patch.
The root problem is that nfs keeps error codes in a different way to the
MM core.  By unifying those, the problem goes.
(The specific problem is that writes which hit EDQUOT on the server can
 report EIO on the client).


>> Maybe we need to work out exactly what is required, and document it - so
>> we can stop breaking it.
>> Or maybe we need some unit tests.....
>> 
>
> Yes, laying out what's necessary for this would be very helpful. We
> clearly want to set the flag when an error occurs. Under what
> circumstances should we be clearing it?

Well.... looking back at  7b159fc18d417980f57ae which introduced the
flag, prior to that write errors (ctx->error) were only reported by
nfs_file_flush and nfs_fsync, so only one close() and fsync().

After that commit, setting the flag would mean that errors could be
returned by 'write'.  So clearing as part of returning the error makes
perfect sense.

As long as the error gets recorded, and gets returned when it is
recorded, it doesn't much matter when the flag is cleared.  With your
patches we don't need to flag any more to get errors reliably reported.

Leaving the flag set means that writes go more slowly - we don't get
large queue of background rights building up but destined for failure.
This is the main point made in the comment message when the flag was
introduced.
Of course, by the time we first get an error there could already
by a large queue, so we probably want that to drain completely before
allowing async writes again.

It might make sense to have 2 flags.  One which says "writes should be
synchronous", another that says "There was an error recently".
We clear the error flag before calling nfs_fsync, and if it is still
clear afterwards, we clear the sync-writes flag.  Maybe that is more
complex than needed though.

I'm leaning towards your suggestion that it doesn't matter very much
when it gets cleared, and clearing it on any successful write is
simplest.

So I'm still in favor of using nfs_context_set_write_error() in
nfs_pageio_add_request(), primarily because it is most consistent - we
don't need exceptions.

Thanks,
NeilBrown


>
> I'm not sure we can really do much better than clearing it on a
> successful write. With Ceph, was that this is just a hint to the write
> submission mechanism and we generally aren't too concerned if a few slip
> past in either direction.
> -- 
> Jeff Layton <jlayton@redhat.com>
Jeff Layton Sept. 7, 2017, 11:35 a.m. UTC | #7
On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:
> On Tue, Aug 29 2017, Jeff Layton wrote:
> 
> > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:
> > > On Mon, Aug 28 2017, Jeff Layton wrote:
> > > 
> > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:
> > > > > On Fri, Aug 25 2017, Jeff Layton wrote:
> > > > > 
> > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:
> > > > > > > From: Jeff Layton <jlayton@redhat.com>
> > > > > > > 
> > > > > > > There is some ambiguity in nfs about how writeback errors are
> > > > > > > tracked.
> > > > > > > 
> > > > > > > For instance, nfs_pageio_add_request calls mapping_set_error when
> > > > > > > the
> > > > > > > add fails, but we track errors that occur after adding the
> > > > > > > request
> > > > > > > with a dedicated int error in the open context.
> > > > > > > 
> > > > > > > Now that we have better infrastructure for the vfs layer, this
> > > > > > > latter int is now unnecessary. Just have
> > > > > > > nfs_context_set_write_error set
> > > > > > > the error in the mapping when one occurs.
> > > > > > > 
> > > > > > > Have NFS use file_write_and_wait_range to initiate and wait on
> > > > > > > writeback
> > > > > > > of the data, and then check again after issuing the commit(s).
> > > > > > > 
> > > > > > > With this, we also don't need to pay attention to the ERROR_WRITE
> > > > > > > flag for reporting, and just clear it to indicate to subsequent
> > > > > > > writers that they should try to go asynchronous again.
> > > > > > > 
> > > > > > > In nfs_page_async_flush, sample the error before locking and
> > > > > > > joining
> > > > > > > the requests, and check for errors since that point.
> > > > > > > 
> > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > > > > > > ---
> > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------
> > > > > > >  fs/nfs/inode.c         |  3 +--
> > > > > > >  fs/nfs/write.c         |  8 ++++++--
> > > > > > >  include/linux/nfs_fs.h |  1 -
> > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)
> > > > > > > 
> > > > > > > I have a baling wire and duct tape solution for testing this with
> > > > > > > xfstests (using iptables REJECT targets and soft mounts). This
> > > > > > > seems to
> > > > > > > make nfs do the right thing.
> > > > > > > 
> > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> > > > > > > index 5713eb32a45e..15d3c6faafd3 100644
> > > > > > > --- a/fs/nfs/file.c
> > > > > > > +++ b/fs/nfs/file.c
> > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file *file,
> > > > > > > loff_t start, loff_t end, int datasync)
> > > > > > >  {
> > > > > > >  	struct nfs_open_context *ctx =
> > > > > > > nfs_file_open_context(file);
> > > > > > >  	struct inode *inode = file_inode(file);
> > > > > > > -	int have_error, do_resend, status;
> > > > > > > -	int ret = 0;
> > > > > > > +	int do_resend, status;
> > > > > > > +	int ret;
> > > > > > >  
> > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n", file,
> > > > > > > datasync);
> > > > > > >  
> > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
> > > > > > >  	do_resend =
> > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
> > > > > > > -	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,
> > > > > > > &ctx->flags);
> > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > -	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-
> > > > > > > > flags);
> > > > > > > 
> > > > > > > -	if (have_error) {
> > > > > > > -		ret = xchg(&ctx->error, 0);
> > > > > > > -		if (ret)
> > > > > > > -			goto out;
> > > > > > > -	}
> > > > > > > -	if (status < 0) {
> > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
> > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > > > > > > +
> > > > > > > +	/* Recheck and advance after the commit */
> > > > > > > +	status = file_check_and_advance_wb_err(file);
> > > > > 
> > > > > This change makes the code inconsistent with the comment above the
> > > > > function, which still references ctx->error.  The intent of the
> > > > > comment
> > > > > is still correct, but the details have changed.
> > > > > 
> > > > 
> > > > Good catch. I'll fix that up in a respin.
> > > > 
> > > > > Also, there is a call to mapping_set_error() in
> > > > > nfs_pageio_add_request().
> > > > > I wonder if that should be changed to
> > > > >   nfs_context_set_write_error(req->wb_context, desc->pg_error)
> > > > > ??
> > > > > 
> > > > 
> > > > Trickier question...
> > > > 
> > > > I'm not quite sure what semantics we're looking for with
> > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be
> > > > synchronous, but I'm not quite sure why it gets cleared the way it
> > > > does. It's set on any error but cleared before issuing a commit.
> > > > 
> > > > I added a similar flag to Ceph inodes recently, but only clear it when
> > > > a write succeeds. Wouldn't that make more sense here as well?
> > > 
> > > It is a bit hard to wrap one's mind around.
> > > 
> > > In the original code (commit 7b159fc18d417980) it looks like:
> > >  - test-and-clear bit
> > >  - write and sync
> > >  - test-bit
> > > 
> > > This does, I think, seem safer than "clear on successful write" as the
> > > writes could complete out-of-order and I wouldn't be surprised if the
> > > unsuccessful ones completed with an error before the successful one -
> > > particularly with an error like EDQUOT.
> > > 
> > > However the current code does the writes before the test-and-clear, and
> > > only does the commit afterwards.  That makes it less clear why the
> > > current sequence is a good idea.
> > > 
> > > However ... nfs_file_fsync_commit() is only called if
> > > filemap_write_and_wait_range() returned with success, so we only clear
> > > the flag after successful writes(?).
> > > 
> > > Oh....
> > > This patch from me:
> > > 
> > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error handling.")
> > > 
> > > seems to have been reverted by
> > > 
> > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if page writeback failed")
> > > 
> > > which probably isn't good.  It appears that this code is very fragile
> > > and easily broken.
> 
> On further investigation, I think the problem that I fixed and then we
> reintroduced will be fixed again - more permanently - by your patch.
> The root problem is that nfs keeps error codes in a different way to the
> MM core.  By unifying those, the problem goes.
> (The specific problem is that writes which hit EDQUOT on the server can
>  report EIO on the client).
> 
> 
> > > Maybe we need to work out exactly what is required, and document it - so
> > > we can stop breaking it.
> > > Or maybe we need some unit tests.....
> > > 
> > 
> > Yes, laying out what's necessary for this would be very helpful. We
> > clearly want to set the flag when an error occurs. Under what
> > circumstances should we be clearing it?
> 
> Well.... looking back at  7b159fc18d417980f57ae which introduced the
> flag, prior to that write errors (ctx->error) were only reported by
> nfs_file_flush and nfs_fsync, so only one close() and fsync().
> 
> After that commit, setting the flag would mean that errors could be
> returned by 'write'.  So clearing as part of returning the error makes
> perfect sense.
> 
> As long as the error gets recorded, and gets returned when it is
> recorded, it doesn't much matter when the flag is cleared.  With your
> patches we don't need to flag any more to get errors reliably reported.
> 
> Leaving the flag set means that writes go more slowly - we don't get
> large queue of background rights building up but destined for failure.
> This is the main point made in the comment message when the flag was
> introduced.
> Of course, by the time we first get an error there could already
> by a large queue, so we probably want that to drain completely before
> allowing async writes again.
> 
> It might make sense to have 2 flags.  One which says "writes should be
> synchronous", another that says "There was an error recently".
> We clear the error flag before calling nfs_fsync, and if it is still
> clear afterwards, we clear the sync-writes flag.  Maybe that is more
> complex than needed though.
> 
> I'm leaning towards your suggestion that it doesn't matter very much
> when it gets cleared, and clearing it on any successful write is
> simplest.
> 
> So I'm still in favor of using nfs_context_set_write_error() in
> nfs_pageio_add_request(), primarily because it is most consistent - we
> don't need exceptions.

Thanks for taking a closer look. I can easily make the change above, and
I do think that keeping this mechanism as simple as possible will make
it easier to prevent bitrot.

That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx is a
per open file description object.

Is that the correct way to track this? All of the ctx's will share the
same inode. If we're getting writeback errors for one context, it's
quite likely that we'll be seeing them via others.

I suppose the counterargument is when we have things like expiring krb5
tickets. Write failures via an expiring set of creds may have no effect
on writeback via other creds.

Still, I think a per-inode flag might make more sense here.

Thoughts?
Trond Myklebust Sept. 7, 2017, 2:54 p.m. UTC | #8
On Thu, 2017-09-07 at 07:35 -0400, Jeff Layton wrote:
> On Thu, 2017-09-07 at 13:37 +1000, NeilBrown wrote:

> > On Tue, Aug 29 2017, Jeff Layton wrote:

> > 

> > > On Tue, 2017-08-29 at 11:23 +1000, NeilBrown wrote:

> > > > On Mon, Aug 28 2017, Jeff Layton wrote:

> > > > 

> > > > > On Mon, 2017-08-28 at 09:24 +1000, NeilBrown wrote:

> > > > > > On Fri, Aug 25 2017, Jeff Layton wrote:

> > > > > > 

> > > > > > > On Thu, 2017-07-20 at 15:42 -0400, Jeff Layton wrote:

> > > > > > > > From: Jeff Layton <jlayton@redhat.com>

> > > > > > > > 

> > > > > > > > There is some ambiguity in nfs about how writeback

> > > > > > > > errors are

> > > > > > > > tracked.

> > > > > > > > 

> > > > > > > > For instance, nfs_pageio_add_request calls

> > > > > > > > mapping_set_error when

> > > > > > > > the

> > > > > > > > add fails, but we track errors that occur after adding

> > > > > > > > the

> > > > > > > > request

> > > > > > > > with a dedicated int error in the open context.

> > > > > > > > 

> > > > > > > > Now that we have better infrastructure for the vfs

> > > > > > > > layer, this

> > > > > > > > latter int is now unnecessary. Just have

> > > > > > > > nfs_context_set_write_error set

> > > > > > > > the error in the mapping when one occurs.

> > > > > > > > 

> > > > > > > > Have NFS use file_write_and_wait_range to initiate and

> > > > > > > > wait on

> > > > > > > > writeback

> > > > > > > > of the data, and then check again after issuing the

> > > > > > > > commit(s).

> > > > > > > > 

> > > > > > > > With this, we also don't need to pay attention to the

> > > > > > > > ERROR_WRITE

> > > > > > > > flag for reporting, and just clear it to indicate to

> > > > > > > > subsequent

> > > > > > > > writers that they should try to go asynchronous again.

> > > > > > > > 

> > > > > > > > In nfs_page_async_flush, sample the error before

> > > > > > > > locking and

> > > > > > > > joining

> > > > > > > > the requests, and check for errors since that point.

> > > > > > > > 

> > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com>

> > > > > > > > ---

> > > > > > > >  fs/nfs/file.c          | 24 +++++++++++-------------

> > > > > > > >  fs/nfs/inode.c         |  3 +--

> > > > > > > >  fs/nfs/write.c         |  8 ++++++--

> > > > > > > >  include/linux/nfs_fs.h |  1 -

> > > > > > > >  4 files changed, 18 insertions(+), 18 deletions(-)

> > > > > > > > 

> > > > > > > > I have a baling wire and duct tape solution for testing

> > > > > > > > this with

> > > > > > > > xfstests (using iptables REJECT targets and soft

> > > > > > > > mounts). This

> > > > > > > > seems to

> > > > > > > > make nfs do the right thing.

> > > > > > > > 

> > > > > > > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c

> > > > > > > > index 5713eb32a45e..15d3c6faafd3 100644

> > > > > > > > --- a/fs/nfs/file.c

> > > > > > > > +++ b/fs/nfs/file.c

> > > > > > > > @@ -212,25 +212,23 @@ nfs_file_fsync_commit(struct file

> > > > > > > > *file,

> > > > > > > > loff_t start, loff_t end, int datasync)

> > > > > > > >  {

> > > > > > > >  	struct nfs_open_context *ctx =

> > > > > > > > nfs_file_open_context(file);

> > > > > > > >  	struct inode *inode = file_inode(file);

> > > > > > > > -	int have_error, do_resend, status;

> > > > > > > > -	int ret = 0;

> > > > > > > > +	int do_resend, status;

> > > > > > > > +	int ret;

> > > > > > > >  

> > > > > > > >  	dprintk("NFS: fsync file(%pD2) datasync %d\n",

> > > > > > > > file,

> > > > > > > > datasync);

> > > > > > > >  

> > > > > > > >  	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);

> > > > > > > >  	do_resend =

> > > > > > > > test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx-

> > > > > > > > >flags);

> > > > > > > > -	have_error =

> > > > > > > > test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE,

> > > > > > > > &ctx->flags);

> > > > > > > > -	status = nfs_commit_inode(inode, FLUSH_SYNC);

> > > > > > > > -	have_error |=

> > > > > > > > test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-

> > > > > > > > > flags);

> > > > > > > > 

> > > > > > > > -	if (have_error) {

> > > > > > > > -		ret = xchg(&ctx->error, 0);

> > > > > > > > -		if (ret)

> > > > > > > > -			goto out;

> > > > > > > > -	}

> > > > > > > > -	if (status < 0) {

> > > > > > > > +	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx-

> > > > > > > > >flags);

> > > > > > > > +	ret = nfs_commit_inode(inode, FLUSH_SYNC);

> > > > > > > > +

> > > > > > > > +	/* Recheck and advance after the commit */

> > > > > > > > +	status = file_check_and_advance_wb_err(file);

> > > > > > 

> > > > > > This change makes the code inconsistent with the comment

> > > > > > above the

> > > > > > function, which still references ctx->error.  The intent of

> > > > > > the

> > > > > > comment

> > > > > > is still correct, but the details have changed.

> > > > > > 

> > > > > 

> > > > > Good catch. I'll fix that up in a respin.

> > > > > 

> > > > > > Also, there is a call to mapping_set_error() in

> > > > > > nfs_pageio_add_request().

> > > > > > I wonder if that should be changed to

> > > > > >   nfs_context_set_write_error(req->wb_context, desc-

> > > > > > >pg_error)

> > > > > > ??

> > > > > > 

> > > > > 

> > > > > Trickier question...

> > > > > 

> > > > > I'm not quite sure what semantics we're looking for with

> > > > > NFS_CONTEXT_ERROR_WRITE. I know that it forces writes to be

> > > > > synchronous, but I'm not quite sure why it gets cleared the

> > > > > way it

> > > > > does. It's set on any error but cleared before issuing a

> > > > > commit.

> > > > > 

> > > > > I added a similar flag to Ceph inodes recently, but only

> > > > > clear it when

> > > > > a write succeeds. Wouldn't that make more sense here as well?

> > > > 

> > > > It is a bit hard to wrap one's mind around.

> > > > 

> > > > In the original code (commit 7b159fc18d417980) it looks like:

> > > >  - test-and-clear bit

> > > >  - write and sync

> > > >  - test-bit

> > > > 

> > > > This does, I think, seem safer than "clear on successful write"

> > > > as the

> > > > writes could complete out-of-order and I wouldn't be surprised

> > > > if the

> > > > unsuccessful ones completed with an error before the successful

> > > > one -

> > > > particularly with an error like EDQUOT.

> > > > 

> > > > However the current code does the writes before the test-and-

> > > > clear, and

> > > > only does the commit afterwards.  That makes it less clear why

> > > > the

> > > > current sequence is a good idea.

> > > > 

> > > > However ... nfs_file_fsync_commit() is only called if

> > > > filemap_write_and_wait_range() returned with success, so we

> > > > only clear

> > > > the flag after successful writes(?).

> > > > 

> > > > Oh....

> > > > This patch from me:

> > > > 

> > > > Commit: 2edb6bc3852c ("NFS - fix recent breakage to NFS error

> > > > handling.")

> > > > 

> > > > seems to have been reverted by

> > > > 

> > > > Commit: 7b281ee02655 ("NFS: fsync() must exit with an error if

> > > > page writeback failed")

> > > > 

> > > > which probably isn't good.  It appears that this code is very

> > > > fragile

> > > > and easily broken.

> > 

> > On further investigation, I think the problem that I fixed and then

> > we

> > reintroduced will be fixed again - more permanently - by your

> > patch.

> > The root problem is that nfs keeps error codes in a different way

> > to the

> > MM core.  By unifying those, the problem goes.

> > (The specific problem is that writes which hit EDQUOT on the server

> > can

> >  report EIO on the client).

> > 

> > 

> > > > Maybe we need to work out exactly what is required, and

> > > > document it - so

> > > > we can stop breaking it.

> > > > Or maybe we need some unit tests.....

> > > > 

> > > 

> > > Yes, laying out what's necessary for this would be very helpful.

> > > We

> > > clearly want to set the flag when an error occurs. Under what

> > > circumstances should we be clearing it?

> > 

> > Well.... looking back at  7b159fc18d417980f57ae which introduced

> > the

> > flag, prior to that write errors (ctx->error) were only reported by

> > nfs_file_flush and nfs_fsync, so only one close() and fsync().

> > 

> > After that commit, setting the flag would mean that errors could be

> > returned by 'write'.  So clearing as part of returning the error

> > makes

> > perfect sense.

> > 

> > As long as the error gets recorded, and gets returned when it is

> > recorded, it doesn't much matter when the flag is cleared.  With

> > your

> > patches we don't need to flag any more to get errors reliably

> > reported.

> > 

> > Leaving the flag set means that writes go more slowly - we don't

> > get

> > large queue of background rights building up but destined for

> > failure.

> > This is the main point made in the comment message when the flag

> > was

> > introduced.

> > Of course, by the time we first get an error there could already

> > by a large queue, so we probably want that to drain completely

> > before

> > allowing async writes again.


We already have this functionality implemented in the existing code.

> > 

> > It might make sense to have 2 flags.  One which says "writes should

> > be

> > synchronous", another that says "There was an error recently".

> > We clear the error flag before calling nfs_fsync, and if it is

> > still

> > clear afterwards, we clear the sync-writes flag.  Maybe that is

> > more

> > complex than needed though.

> > 


We also need to preserve the NFS_CONTEXT_RESEND_WRITES flag. I don't
see any global mechanism that will replace that.

> > I'm leaning towards your suggestion that it doesn't matter very

> > much

> > when it gets cleared, and clearing it on any successful write is

> > simplest.

> > 

> > So I'm still in favor of using nfs_context_set_write_error() in

> > nfs_pageio_add_request(), primarily because it is most consistent -

> > we

> > don't need exceptions.

> 

> Thanks for taking a closer look. I can easily make the change above,

> and

> I do think that keeping this mechanism as simple as possible will

> make

> it easier to prevent bitrot.

> 

> That said... NFS_CONTEXT_ERROR_WRITE is a per ctx flag, and the ctx

> is a

> per open file description object.

> 

> Is that the correct way to track this? All of the ctx's will share

> the

> same inode. If we're getting writeback errors for one context, it's

> quite likely that we'll be seeing them via others.

> 

> I suppose the counterargument is when we have things like expiring

> krb5

> tickets. Write failures via an expiring set of creds may have no

> effect

> on writeback via other creds.

> 

> Still, I think a per-inode flag might make more sense here.

> 

> Thoughts?


As far as I'm concerned, that would be a regression. The most common
problem when flushing writeback data to the server aside from ENOSPC
(and possibly ESTALE) is EACCES, which is particular to the file
descriptor that opened the file.

File contexts, and NFS_CONTEXT_ERROR_WRITE solve that problem by being
private to the file descriptor.

-- 
Trond Myklebust
Linux NFS client maintainer, PrimaryData
trond.myklebust@primarydata.com

Patch
diff mbox

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5713eb32a45e..15d3c6faafd3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -212,25 +212,23 @@  nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	struct inode *inode = file_inode(file);
-	int have_error, do_resend, status;
-	int ret = 0;
+	int do_resend, status;
+	int ret;
 
 	dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	do_resend = test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
-	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	status = nfs_commit_inode(inode, FLUSH_SYNC);
-	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
-	if (have_error) {
-		ret = xchg(&ctx->error, 0);
-		if (ret)
-			goto out;
-	}
-	if (status < 0) {
+	clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
+	ret = nfs_commit_inode(inode, FLUSH_SYNC);
+
+	/* Recheck and advance after the commit */
+	status = file_check_and_advance_wb_err(file);
+	if (!ret)
 		ret = status;
+	if (ret)
 		goto out;
-	}
+
 	do_resend |= test_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
 	if (do_resend)
 		ret = -EAGAIN;
@@ -247,7 +245,7 @@  nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_nfs_fsync_enter(inode);
 
 	do {
-		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+		ret = file_write_and_wait_range(file, start, end);
 		if (ret != 0)
 			break;
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 109279d6d91b..c48f673c5cc9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -900,7 +900,6 @@  struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
 	ctx->state = NULL;
 	ctx->mode = f_mode;
 	ctx->flags = 0;
-	ctx->error = 0;
 	ctx->flock_owner = (fl_owner_t)filp;
 	nfs_init_lock_context(&ctx->lock_context);
 	ctx->lock_context.open_context = ctx;
@@ -1009,7 +1008,7 @@  void nfs_file_clear_open_context(struct file *filp)
 		 * We fatal error on write before. Try to writeback
 		 * every page again.
 		 */
-		if (ctx->error < 0)
+		if (filemap_check_wb_err(inode->i_mapping, filp->f_wb_err))
 			invalidate_inode_pages2(inode->i_mapping);
 		filp->private_data = NULL;
 		spin_lock(&inode->i_lock);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..c2fcaf07cd24 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -149,7 +149,9 @@  static void nfs_io_completion_put(struct nfs_io_completion *ioc)
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
 {
-	ctx->error = error;
+	struct inode *inode = d_inode(ctx->dentry);
+
+	mapping_set_error(inode->i_mapping, error);
 	smp_wmb();
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
@@ -628,6 +630,8 @@  static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 {
 	struct nfs_page *req;
 	int ret = 0;
+	struct address_space *mapping = page_file_mapping(page);
+	errseq_t since = filemap_sample_wb_err(mapping);
 
 	req = nfs_lock_and_join_requests(page, nonblock);
 	if (!req)
@@ -641,7 +645,7 @@  static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 
 	ret = 0;
 	/* If there is a fatal error that covers this write, just exit */
-	if (nfs_error_is_fatal_on_server(req->wb_context->error))
+	if (nfs_error_is_fatal_on_server(filemap_check_wb_err(mapping, since)))
 		goto out_launder;
 
 	if (!nfs_pageio_add_request(pgio, req)) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index e52cc55ac300..a96b0bd52b32 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -77,7 +77,6 @@  struct nfs_open_context {
 #define NFS_CONTEXT_RESEND_WRITES	(1)
 #define NFS_CONTEXT_BAD			(2)
 #define NFS_CONTEXT_UNLOCK	(3)
-	int error;
 
 	struct list_head list;
 	struct nfs4_threshold	*mdsthreshold;