[1/5] pNFS: recoalesce when ld write pagelist fails

Message ID 1312685635-1593-1-git-send-email-bergwolf@gmail.com (mailing list archive)
State New, archived

Commit Message

Peng Tao Aug. 7, 2011, 2:53 a.m. UTC
For a pNFS pagelist write failure, we need to pg_recoalesce and resend the
IO to the MDS.

Signed-off-by: Peng Tao <peng_tao@emc.com>
---
 fs/nfs/internal.h |    4 ++++
 fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
 fs/nfs/write.c    |   21 ++++++++++++++-------
 3 files changed, 50 insertions(+), 10 deletions(-)

Comments

Boaz Harrosh Aug. 10, 2011, 5:52 p.m. UTC | #1
On 08/06/2011 07:53 PM, Peng Tao wrote:
> For pnfs pagelist write failure, we need to pg_recoalesce and resend
> IO to mds.
> 

I have not given this subject any thought or investigation, so I don't
know what we should do, but my gut feeling is that I have seen all this
code elsewhere and we could be reusing more of the existing code.

What if we dig into:
	data->mds_ops->rpc_call_done(&data->task, data);
	data->mds_ops->rpc_release(data);

And do all the page tear-down and unlocks there, but if there is an error,
do not mark the pages clean; that is, keep them dirty. Then mark the layout
as errored and let the normal code choose an MDS write-out. (Just a wild
thought)
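
Something along these lines, very roughly (a purely hypothetical sketch;
NFS_LAYOUT_ERROR is a made-up flag, and the helpers are only the ones
this patch already uses):

	/* hypothetical sketch of the "keep them dirty" idea, not part of the patch */
	while (!list_empty(&data->pages)) {
		struct nfs_page *req = nfs_list_entry(data->pages.next);
		struct page *page = req->wb_page;

		nfs_list_remove_request(req);
		if (data->pnfs_error)
			set_page_dirty(page);	/* leave it dirty for a later MDS flush */
		nfs_clear_page_tag_locked(req);
		end_page_writeback(page);
	}
	if (data->pnfs_error)
		/* made-up flag so a later flush bypasses the layout */
		set_bit(NFS_LAYOUT_ERROR, &NFS_I(data->inode)->layout->plh_flags);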

Trond please look in here, can't it be made simpler?


> Signed-off-by: Peng Tao <peng_tao@emc.com>
> ---
>  fs/nfs/internal.h |    4 ++++
>  fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
>  fs/nfs/write.c    |   21 ++++++++++++++-------
>  3 files changed, 50 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index ab12913..62f183d 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -305,6 +305,10 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
>  /* write.c */
>  extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>  		struct list_head *head);
> +extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
> +		struct nfs_pageio_descriptor *pgio);
> +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
> +		struct inode *inode, int ioflags);
>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
>  extern void nfs_writedata_release(struct nfs_write_data *wdata);
>  extern void nfs_commit_free(struct nfs_write_data *p);
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index e550e88..08aba45 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1172,6 +1172,13 @@ int
>  pnfs_ld_write_done(struct nfs_write_data *data)
>  {
>  	int status;
> +	struct nfs_pageio_descriptor pgio;
> +	struct writeback_control wbc = {
> +		.sync_mode = WB_SYNC_ALL,
> +		.range_start = data->mds_offset,
> +		.nr_to_write = data->npages,
> +		.range_end = LLONG_MAX,
> +	};
>  
>  	if (!data->pnfs_error) {
>  		pnfs_set_layoutcommit(data);
> @@ -1180,11 +1187,33 @@ pnfs_ld_write_done(struct nfs_write_data *data)
>  		return 0;
>  	}
>  
> +	put_lseg(data->lseg);
> +	data->lseg = NULL;
>  	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>  		data->pnfs_error);
> -	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
> -				    data->mds_ops, NFS_FILE_SYNC);
> -	return status ? : -EAGAIN;
> +	nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
> +	pgio.pg_recoalesce = 1;
> +	while (!list_empty(&data->pages)) {
> +		struct nfs_page *req = nfs_list_entry(data->pages.next);
> +		struct page *page = req->wb_page;
> +
> +		nfs_list_remove_request(req);
> +		nfs_clear_page_tag_locked(req);
> +
> +		end_page_writeback(page);
> +
> +		lock_page(page);
> +		status = do_nfs_writepage(page, &wbc, &pgio);
> +		if (status) {
> +			/* FIXME: is this enough?? */
> +			set_page_dirty(page);
> +		}
> +		unlock_page(page);
> +	}
> +	nfs_pageio_complete(&pgio);
> +	nfs_writedata_release(data);
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
>  
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index b39b37f..0ccdf98 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -285,14 +285,9 @@ out:
>  	return ret;
>  }
>  
> -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
> +int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>  {
> -	struct inode *inode = page->mapping->host;
>  	int ret;
> -
> -	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
> -	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
> -
>  	nfs_pageio_cond_complete(pgio, page->index);
>  	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
>  	if (ret == -EAGAIN) {
> @@ -301,6 +296,17 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
>  	}
>  	return ret;
>  }
> +EXPORT_SYMBOL_GPL(do_nfs_writepage);
> +
> +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)

This is a terrible name; please invent something more appropriate. You can't have
nfs_do_writepage call do_nfs_writepage, it's effectively the same name. Please use a
different name that describes the point of this function. (nfs_writepage_stats ???)

> +{
> +	struct inode *inode = page->mapping->host;
> +
> +	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
> +	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
> +
> +	return do_nfs_writepage(page, wbc, pgio);
> +}
>  
>  /*
>   * Write an mmapped page to the server.
> @@ -1051,12 +1057,13 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
>  	.pg_doio = nfs_generic_pg_writepages,
>  };
>  
> -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
> +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>  				  struct inode *inode, int ioflags)
>  {
>  	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
>  				NFS_SERVER(inode)->wsize, ioflags);
>  }
> +EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);

Why is this EXPORT? If you are going to use it from an LD driver later,
in a patch we have not yet seen, please only add the export in the
patch that has a user for it. (You don't need EXPORT_SYMBOL if it is
only used by a different file inside nfs.ko; just remove the static.)

>  
>  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
>  {

Thanks for looking into this issue. Actually, looking back, we have always
had a problem here; I was never able to pass my error injection tests.

Boaz
Peng Tao Aug. 11, 2011, 12:03 a.m. UTC | #2
On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/06/2011 07:53 PM, Peng Tao wrote:
>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>> IO to mds.
>>
>
> I have not given this subject any thought or investigation, so I don't
> know what we should do, but the gut feeling is that I have seen all this
> code else where and we could be having a bigger re-use of existing code.
>
> What if we dig into:
>        data->mds_ops->rpc_call_done(&data->task, data);
>        data->mds_ops->rpc_release(data);
>
> And do all the pages tear-down and unlocks but if there is an error
> not set them as clean. That is keep them dirty. Then mark the layout
> as error and let the normal code choose an MDS write_out. (Just a wild
> thought)
This may work only for write failures. But for read, we will have to
recoalesce and resend to the MDS. So I prefer to let read and write have
similar retry code paths, like this.

>
> Trond please look in here, can't it be made simpler?
>
>
>> Signed-off-by: Peng Tao <peng_tao@emc.com>
>> ---
>>  fs/nfs/internal.h |    4 ++++
>>  fs/nfs/pnfs.c     |   35 ++++++++++++++++++++++++++++++++---
>>  fs/nfs/write.c    |   21 ++++++++++++++-------
>>  3 files changed, 50 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
>> index ab12913..62f183d 100644
>> --- a/fs/nfs/internal.h
>> +++ b/fs/nfs/internal.h
>> @@ -305,6 +305,10 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
>>  /* write.c */
>>  extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
>>               struct list_head *head);
>> +extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
>> +             struct nfs_pageio_descriptor *pgio);
>> +extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>> +             struct inode *inode, int ioflags);
>>  extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
>>  extern void nfs_writedata_release(struct nfs_write_data *wdata);
>>  extern void nfs_commit_free(struct nfs_write_data *p);
>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>> index e550e88..08aba45 100644
>> --- a/fs/nfs/pnfs.c
>> +++ b/fs/nfs/pnfs.c
>> @@ -1172,6 +1172,13 @@ int
>>  pnfs_ld_write_done(struct nfs_write_data *data)
>>  {
>>       int status;
>> +     struct nfs_pageio_descriptor pgio;
>> +     struct writeback_control wbc = {
>> +             .sync_mode = WB_SYNC_ALL,
>> +             .range_start = data->mds_offset,
>> +             .nr_to_write = data->npages,
>> +             .range_end = LLONG_MAX,
>> +     };
>>
>>       if (!data->pnfs_error) {
>>               pnfs_set_layoutcommit(data);
>> @@ -1180,11 +1187,33 @@ pnfs_ld_write_done(struct nfs_write_data *data)
>>               return 0;
>>       }
>>
>> +     put_lseg(data->lseg);
>> +     data->lseg = NULL;
>>       dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
>>               data->pnfs_error);
>> -     status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
>> -                                 data->mds_ops, NFS_FILE_SYNC);
>> -     return status ? : -EAGAIN;
>> +     nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
>> +     pgio.pg_recoalesce = 1;
>> +     while (!list_empty(&data->pages)) {
>> +             struct nfs_page *req = nfs_list_entry(data->pages.next);
>> +             struct page *page = req->wb_page;
>> +
>> +             nfs_list_remove_request(req);
>> +             nfs_clear_page_tag_locked(req);
>> +
>> +             end_page_writeback(page);
>> +
>> +             lock_page(page);
>> +             status = do_nfs_writepage(page, &wbc, &pgio);
>> +             if (status) {
>> +                     /* FIXME: is this enough?? */
>> +                     set_page_dirty(page);
>> +             }
>> +             unlock_page(page);
>> +     }
>> +     nfs_pageio_complete(&pgio);
>> +     nfs_writedata_release(data);
>> +
>> +     return 0;
>>  }
>>  EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
>>
>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>> index b39b37f..0ccdf98 100644
>> --- a/fs/nfs/write.c
>> +++ b/fs/nfs/write.c
>> @@ -285,14 +285,9 @@ out:
>>       return ret;
>>  }
>>
>> -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>> +int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>>  {
>> -     struct inode *inode = page->mapping->host;
>>       int ret;
>> -
>> -     nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
>> -     nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>> -
>>       nfs_pageio_cond_complete(pgio, page->index);
>>       ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
>>       if (ret == -EAGAIN) {
>> @@ -301,6 +296,17 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
>>       }
>>       return ret;
>>  }
>> +EXPORT_SYMBOL_GPL(do_nfs_writepage);
>> +
>> +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
>
> This is a terrible name please invent something more appropriate. You can't have an
> nfs_do_writepage call a do_nfs_writepage it's the same name. Please use a different name
> that describes what is the point of this function. (nfs_writepage_stats ???)
Agreed. Will change it.

>
>> +{
>> +     struct inode *inode = page->mapping->host;
>> +
>> +     nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
>> +     nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>> +
>> +     return do_nfs_writepage(page, wbc, pgio);
>> +}
>>
>>  /*
>>   * Write an mmapped page to the server.
>> @@ -1051,12 +1057,13 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
>>       .pg_doio = nfs_generic_pg_writepages,
>>  };
>>
>> -static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>> +void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
>>                                 struct inode *inode, int ioflags)
>>  {
>>       nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
>>                               NFS_SERVER(inode)->wsize, ioflags);
>>  }
>> +EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);
>
> Why is this EXPORT? if you are going to use it from LD driver later
> in a patch that we did not yet see, please only make it export in the
> patch that has a user for it. (You don't need EXPORT_X if it is used
> by a different file inside nfs.ko, just only remove the static)
Will change it. Thanks.

>
>>
>>  void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
>>  {
>
> Thanks for looking into this issue. Actually looking back we always had
> a problem here. I never was able to pass my error injection tests.
Thanks for reviewing. I will wait for Trond's comments before sending
the next version.
Boaz Harrosh Aug. 11, 2011, 6:53 p.m. UTC | #3
On 08/10/2011 05:03 PM, Peng Tao wrote:
> On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>> On 08/06/2011 07:53 PM, Peng Tao wrote:
>>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>>> IO to mds.
>>>
>>
>> I have not given this subject any thought or investigation, so I don't
>> know what we should do, but the gut feeling is that I have seen all this
>> code else where and we could be having a bigger re-use of existing code.
>>
>> What if we dig into:
>>        data->mds_ops->rpc_call_done(&data->task, data);
>>        data->mds_ops->rpc_release(data);
>>
>> And do all the pages tear-down and unlocks but if there is an error
>> not set them as clean. That is keep them dirty. Then mark the layout
>> as error and let the normal code choose an MDS write_out. (Just a wild
>> thought)
> This may work only for write failures. But for read, we will have to
> recoalesce and send to MDS. So I prefer to let read and write have
> similar retry code path like this.
> 

I disagree. Look, even now the read path is very different from the write
path. (See your two patches: the write patch is three times bigger than the read patch.)

You should see if what I say is possible for write. And then maybe
something will come up for read as well. They do not necessarily need to be the
same. (I think)

>>
>> Trond please look in here, can't it be made simpler?

Thanks
Boaz
Peng Tao Aug. 11, 2011, 11:53 p.m. UTC | #4
On Fri, Aug 12, 2011 at 2:53 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/10/2011 05:03 PM, Peng Tao wrote:
>> On Thu, Aug 11, 2011 at 1:52 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>>> On 08/06/2011 07:53 PM, Peng Tao wrote:
>>>> For pnfs pagelist write failure, we need to pg_recoalesce and resend
>>>> IO to mds.
>>>>
>>>
>>> I have not given this subject any thought or investigation, so I don't
>>> know what we should do, but the gut feeling is that I have seen all this
>>> code else where and we could be having a bigger re-use of existing code.
>>>
>>> What if we dig into:
>>>        data->mds_ops->rpc_call_done(&data->task, data);
>>>        data->mds_ops->rpc_release(data);
>>>
>>> And do all the pages tear-down and unlocks but if there is an error
>>> not set them as clean. That is keep them dirty. Then mark the layout
>>> as error and let the normal code choose an MDS write_out. (Just a wild
>>> thought)
>> This may work only for write failures. But for read, we will have to
>> recoalesce and send to MDS. So I prefer to let read and write have
>> similar retry code path like this.
>>
>
> I disagree. Look even now the read path is very different then the write
> path. (See your two patches: write-patch is 3 times bigger the read-patch)
I mean their logic is the same: if pnfs_error is set, recoalesce the
pages and re-send to MDS :)

>
> You should see if what I say is possible for write. And then maybe some
> thing will come up also for read. They do not necessarily need to be the
> same. (I think)
I agree that it is possible for write. We can re-dirty the pages and
rely on the next flush to write them out to the MDS; Trond mentioned this
before. However, that method won't work for read failures. I don't see
how we can queue failed read pages and let someone else resend them
later.
Boaz Harrosh Aug. 12, 2011, 12:10 a.m. UTC | #5
On 08/11/2011 04:53 PM, Peng Tao wrote:
>>
>> You should see if what I say is possible for write. And then maybe some
>> thing will come up also for read. They do not necessarily need to be the
>> same. (I think)
> I agree that it is possible for write. We can re-dirty the pages and
> rely on next flush to write it out to MDS. This is mentioned by Trond
> before. However, the method won't work for read failures. I don't see
> how we can queue failed read pages and let someone else re-send it
> later.
> 

Let's leave the read patch as-is for now. Let's try to do it only for
writes.

It is OK to have write done one way and read done another, I think.

Maybe later we can find a better solution for reads as well.

Thanks
Boaz
Boaz Harrosh Aug. 16, 2011, 8:19 p.m. UTC | #6
On 08/16/2011 12:20 AM, tao.peng@emc.com wrote:
> 
> I tried to rewrite the write patch to handle failures inside
> mds_ops->rpc_release. However, I get a problem w.r.t. "redirty and
> rely on next flush". If the failed write is the *last flush*, we end
> up with relying no one and the dirty pages are simply dropped. Do you
> have any suggestions how to handle it?
> 
> Thanks, Tao
> 

Hi Tao,

OK, I see what you mean. That would be a problem.

I had a totally different idea. You know how today we just do:
	nfs_initiate_write()

which is bad because it can actually deadlock: we are taking up the only
thread that needs to service that call. Well, I thought, with your
thread-for-pnfs patch, can it not work now? I think it is worth a try.

See if you can bring your thread-for-blocks-objects
patch up to the current code and inject some errors. I think
it will work this time. What do you think?
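
Roughly what I am picturing (just a hypothetical sketch: the pnfsiod
workqueue and the retry struct are made up, error handling is omitted,
and nfs_initiate_write() is called exactly as the current code does):

	/* hypothetical: resend to the MDS from a dedicated context instead of
	 * the completion thread, so we do not tie up the thread that has to
	 * service the call */
	struct pnfs_mds_retry {
		struct work_struct	work;
		struct nfs_write_data	*data;
	};

	static void pnfs_mds_retry_fn(struct work_struct *work)
	{
		struct pnfs_mds_retry *r =
			container_of(work, struct pnfs_mds_retry, work);

		nfs_initiate_write(r->data, NFS_CLIENT(r->data->inode),
				   r->data->mds_ops, NFS_FILE_SYNC);
		kfree(r);
	}

	/* in pnfs_ld_write_done(), instead of calling nfs_initiate_write() inline */
	struct pnfs_mds_retry *r = kmalloc(sizeof(*r), GFP_NOFS);

	r->data = data;
	INIT_WORK(&r->work, pnfs_mds_retry_fn);
	queue_work(pnfsiod_workqueue, &r->work);	/* made-up dedicated workqueue */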

Thanks
Boaz
Peng Tao Aug. 17, 2011, 9:44 a.m. UTC | #7
Hi, Boaz,

On Wed, Aug 17, 2011 at 4:19 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 08/16/2011 12:20 AM, tao.peng@emc.com wrote:
>>
>> I tried to rewrite the write patch to handle failures inside
>> mds_ops->rpc_release. However, I get a problem w.r.t. "redirty and
>> rely on next flush". If the failed write is the *last flush*, we end
>> up with relying no one and the dirty pages are simply dropped. Do you
>> have any suggestions how to handle it?
>>
>> Thanks, Tao
>>
>
> Tao Hi.
>
> OK, I see what you mean. That would be a problem
>
> I had a totally different idea You know how today we just do:
>        nfs_initiate_write()
>
> Which is bad because it can actually dead lock because
> we are taking up the only thread that needs to service
> that call. Well I thought, with you thread-for-pnfs patch,
> can it not now work? I think it is worth a try?
The problem with directly calling nfs_initiate_write() is that the
pagelist may be longer than the server's rsize/wsize. So if the client
sends all the pages in a single READ/WRITE RPC, the MDS will reject the
operation. Therefore we need to recoalesce them before resending to the
MDS.
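
To put numbers on it (hypothetical values; the fields are just the
nfs_write_data ones the patch already uses):

	/* hypothetical illustration of the size mismatch, not part of the patch */
	u32 wsize = NFS_SERVER(data->inode)->wsize;		  /* say 64 KiB */
	size_t bytes = (size_t)data->npages << PAGE_CACHE_SHIFT; /* say 256 pages = 1 MiB */
	int rpcs = DIV_ROUND_UP(bytes, wsize);			  /* 16 wsize-bounded WRITEs, not 1 */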

>
> See if you can advance your thread-for-blocks-objects
> patch to current code and inject some errors. I think
> it will work this time. What do you think?
The thread-for-block-objects patch solves the default-workqueue
deadlock problem, but it can't solve the problem of IO sizes that are
too large for the MDS. So I think we need all three patches to handle
LD IO failures.

Thanks,
Tao
Boaz Harrosh Aug. 22, 2011, 11:28 p.m. UTC | #8
On 08/17/2011 02:44 AM, Peng Tao wrote:
<snip>

> The problem w/ directly calling nfs_initiate_write() is that we may
> have pagelist length larger than server's rsize/wsize. So if client
> sends all the pages in single READ/WRITE rpc, MDS will reject the
> READ/WRITE operation. Therefore we need to recoalesce them before
> re-sending to MDS.
> 

Rrrr, you are absolutely right. I needed a smack on the head to
get back on track.

<snip>

Thanks
Boaz
Patch

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ab12913..62f183d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -305,6 +305,10 @@  extern void nfs_readdata_release(struct nfs_read_data *rdata);
 /* write.c */
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 		struct list_head *head);
+extern int do_nfs_writepage(struct page *page, struct writeback_control *wbc,
+		struct nfs_pageio_descriptor *pgio);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+		struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_write_data *p);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e550e88..08aba45 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1172,6 +1172,13 @@  int
 pnfs_ld_write_done(struct nfs_write_data *data)
 {
 	int status;
+	struct nfs_pageio_descriptor pgio;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = data->mds_offset,
+		.nr_to_write = data->npages,
+		.range_end = LLONG_MAX,
+	};
 
 	if (!data->pnfs_error) {
 		pnfs_set_layoutcommit(data);
@@ -1180,11 +1187,33 @@  pnfs_ld_write_done(struct nfs_write_data *data)
 		return 0;
 	}
 
+	put_lseg(data->lseg);
+	data->lseg = NULL;
 	dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
 		data->pnfs_error);
-	status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
-				    data->mds_ops, NFS_FILE_SYNC);
-	return status ? : -EAGAIN;
+	nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
+	pgio.pg_recoalesce = 1;
+	while (!list_empty(&data->pages)) {
+		struct nfs_page *req = nfs_list_entry(data->pages.next);
+		struct page *page = req->wb_page;
+
+		nfs_list_remove_request(req);
+		nfs_clear_page_tag_locked(req);
+
+		end_page_writeback(page);
+
+		lock_page(page);
+		status = do_nfs_writepage(page, &wbc, &pgio);
+		if (status) {
+			/* FIXME: is this enough?? */
+			set_page_dirty(page);
+		}
+		unlock_page(page);
+	}
+	nfs_pageio_complete(&pgio);
+	nfs_writedata_release(data);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f..0ccdf98 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -285,14 +285,9 @@  out:
 	return ret;
 }
 
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+int do_nfs_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct inode *inode = page->mapping->host;
 	int ret;
-
-	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
-
 	nfs_pageio_cond_complete(pgio, page->index);
 	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
@@ -301,6 +296,17 @@  static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(do_nfs_writepage);
+
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+{
+	struct inode *inode = page->mapping->host;
+
+	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
+
+	return do_nfs_writepage(page, wbc, pgio);
+}
 
 /*
  * Write an mmapped page to the server.
@@ -1051,12 +1057,13 @@  static const struct nfs_pageio_ops nfs_pageio_write_ops = {
 	.pg_doio = nfs_generic_pg_writepages,
 };
 
-static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags)
 {
 	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
 				NFS_SERVER(inode)->wsize, ioflags);
 }
+EXPORT_SYMBOL_GPL(nfs_pageio_init_write_mds);
 
 void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
 {