diff mbox

[v3,1/1] PNFS fix dangling DS mount

Message ID 20170630195201.95597-1-kolga@netapp.com (mailing list archive)
State New, archived
Headers show

Commit Message

Olga Kornievskaia June 30, 2017, 7:52 p.m. UTC
There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
DS mount dangling.

Previously, filelayout_alloc_sec() would call filelayout_check_layout()
which would call nfs4_find_get_deviceid which ups the count on the
device_id. It's only called once and it's matched by the
filelayout_free_lseg() that calls nfs4_fl_put_deviceid().

After that patch, each read/write ends up calling nfs4_find_get_deviceid
and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
in the filelayout's .pg_cleanup and remove it from filelayout_free_lseg.

But we still need a reference to hold over the lifetime of the segment.
For every new lseg that's created we need to take a reference on deviceid
that uses it. It will be released in the "free_lseg" routine.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
---
 fs/nfs/filelayout/filelayout.c         | 21 +++++++++++++++++----
 fs/nfs/flexfilelayout/flexfilelayout.c |  9 ++++++---
 fs/nfs/pnfs.c                          | 13 ++++++++++---
 fs/nfs/pnfs.h                          |  3 ++-
 4 files changed, 35 insertions(+), 11 deletions(-)

Comments

Steve Dickson July 5, 2017, 1:52 p.m. UTC | #1
On 06/30/2017 03:52 PM, Olga Kornievskaia wrote:
> There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
> GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
> DS mount dangling.
> 
> Previously, filelayout_alloc_sec() would call filelayout_check_layout()
> which would call nfs4_find_get_deviceid which ups the count on the
> device_id. It's only called once and it's matched by the
> filelayout_free_lseg() that calls nfs4_fl_put_deviceid().
> 
> After that patch, each read/write ends up calling nfs4_find_get_deviceid
> and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
> in the filelayout's .pg_cleanup and remove it from filelayout_free_lseg.
> 
> But we still need a reference to hold over the lifetime of the segment.
> For every new lseg that's created we need to take a reference on deviceid
> that uses it. It will be released in the "free_lseg" routine.
> 
> Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Tested-by: Steve Dickson <steved@redhat.com>

steved.

> ---
>  fs/nfs/filelayout/filelayout.c         | 21 +++++++++++++++++----
>  fs/nfs/flexfilelayout/flexfilelayout.c |  9 ++++++---
>  fs/nfs/pnfs.c                          | 13 ++++++++++---
>  fs/nfs/pnfs.h                          |  3 ++-
>  4 files changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
> index 1cf85d6..86d694e 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -909,9 +909,10 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
>  	struct pnfs_layout_hdr *lo;
>  	struct nfs4_filelayout_segment *fl;
>  	int status;
> +	bool new_layout = false;
>  
>  	lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
> -				  gfp_flags);
> +				  gfp_flags, &new_layout);
>  	if (!lseg)
>  		lseg = ERR_PTR(-ENOMEM);
>  	if (IS_ERR(lseg))
> @@ -924,7 +925,8 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
>  	if (status) {
>  		pnfs_put_lseg(lseg);
>  		lseg = ERR_PTR(status);
> -	}
> +	} else if (new_layout)
> +		nfs4_get_deviceid(&fl->dsaddr->id_node);
>  out:
>  	return lseg;
>  }
> @@ -991,18 +993,29 @@ static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
>  	nfs_pageio_reset_write_mds(pgio);
>  }
>  
> +static void filelayout_pg_cleanup(struct nfs_pageio_descriptor *desc)
> +{
> +	if (desc->pg_lseg) {
> +		struct nfs4_filelayout_segment *fl =
> +			FILELAYOUT_LSEG(desc->pg_lseg);
> +
> +		nfs4_fl_put_deviceid(fl->dsaddr);
> +	}
> +	pnfs_generic_pg_cleanup(desc);
> +}
> +
>  static const struct nfs_pageio_ops filelayout_pg_read_ops = {
>  	.pg_init = filelayout_pg_init_read,
>  	.pg_test = filelayout_pg_test,
>  	.pg_doio = pnfs_generic_pg_readpages,
> -	.pg_cleanup = pnfs_generic_pg_cleanup,
> +	.pg_cleanup = filelayout_pg_cleanup,
>  };
>  
>  static const struct nfs_pageio_ops filelayout_pg_write_ops = {
>  	.pg_init = filelayout_pg_init_write,
>  	.pg_test = filelayout_pg_test,
>  	.pg_doio = pnfs_generic_pg_writepages,
> -	.pg_cleanup = pnfs_generic_pg_cleanup,
> +	.pg_cleanup = filelayout_pg_cleanup,
>  };
>  
>  static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
> index 23542dc..53a4a19 100644
> --- a/fs/nfs/flexfilelayout/flexfilelayout.c
> +++ b/fs/nfs/flexfilelayout/flexfilelayout.c
> @@ -820,7 +820,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
>  					   NFS4_MAX_UINT64,
>  					   IOMODE_READ,
>  					   strict_iomode,
> -					   GFP_KERNEL);
> +					   GFP_KERNEL,
> +					   NULL);
>  	if (IS_ERR(pgio->pg_lseg)) {
>  		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
>  		pgio->pg_lseg = NULL;
> @@ -904,7 +905,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
>  						   NFS4_MAX_UINT64,
>  						   IOMODE_RW,
>  						   false,
> -						   GFP_NOFS);
> +						   GFP_NOFS,
> +						   NULL);
>  		if (IS_ERR(pgio->pg_lseg)) {
>  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
>  			pgio->pg_lseg = NULL;
> @@ -960,7 +962,8 @@ static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
>  						   NFS4_MAX_UINT64,
>  						   IOMODE_RW,
>  						   false,
> -						   GFP_NOFS);
> +						   GFP_NOFS,
> +						   NULL);
>  		if (IS_ERR(pgio->pg_lseg)) {
>  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
>  			pgio->pg_lseg = NULL;
> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
> index c383d09..fb011c6 100644
> --- a/fs/nfs/pnfs.c
> +++ b/fs/nfs/pnfs.c
> @@ -1698,7 +1698,8 @@ struct pnfs_layout_segment *
>  		   u64 count,
>  		   enum pnfs_iomode iomode,
>  		   bool strict_iomode,
> -		   gfp_t gfp_flags)
> +		   gfp_t gfp_flags,
> +		   bool *new)
>  {
>  	struct pnfs_layout_range arg = {
>  		.iomode = iomode,
> @@ -1764,8 +1765,12 @@ struct pnfs_layout_segment *
>  	if (lseg) {
>  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
>  				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
> +		if (new)
> +			*new = false;
>  		goto out_unlock;
>  	}
> +	if (new)
> +		*new = true;
>  
>  	if (!nfs4_valid_open_stateid(ctx->state)) {
>  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
> @@ -2126,7 +2131,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
>  						   rd_size,
>  						   IOMODE_READ,
>  						   false,
> -						   GFP_KERNEL);
> +						   GFP_KERNEL,
> +						   NULL);
>  		if (IS_ERR(pgio->pg_lseg)) {
>  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
>  			pgio->pg_lseg = NULL;
> @@ -2153,7 +2159,8 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
>  						   wb_size,
>  						   IOMODE_RW,
>  						   false,
> -						   GFP_NOFS);
> +						   GFP_NOFS,
> +						   NULL);
>  		if (IS_ERR(pgio->pg_lseg)) {
>  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
>  			pgio->pg_lseg = NULL;
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index 99731e3..978fab0 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -291,7 +291,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
>  					       u64 count,
>  					       enum pnfs_iomode iomode,
>  					       bool strict_iomode,
> -					       gfp_t gfp_flags);
> +					       gfp_t gfp_flags,
> +					       bool *new);
>  void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
>  		const nfs4_stateid *arg_stateid,
>  		const struct pnfs_layout_range *range,
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Trond Myklebust July 20, 2017, 7:56 p.m. UTC | #2
Hi Olga,

Apologies for missing this patch. It was hiding in my 'linux-fsdevel'
mailbox, so I didn't recognise it as a NFS patch.

On Fri, 2017-06-30 at 15:52 -0400, Olga Kornievskaia wrote:
> There is a regression by commit 8d40b0f14846 ("NFS filelayout:call

> GETDEVICEINFO after pnfs_layout_process completes"). It leaves the

> DS mount dangling.

> 

> Previously, filelayout_alloc_sec() would call

> filelayout_check_layout()

> which would call nfs4_find_get_deviceid which ups the count on the

> device_id. It's only called once and it's matched by the

> filelayout_free_lseg() that calls nfs4_fl_put_deviceid().

> 

> After that patch, each read/write ends up calling

> nfs4_find_get_deviceid

> and there is no balance for that. Instead, do nfs4_fl_put_deviceid()

> in the filelayout's .pg_cleanup and remove it from

> filelayout_free_lseg.

> 

> But we still need a reference to hold over the lifetime of the

> segment.

> For every new lseg that's created we need to take a reference on

> deviceid

> that uses it. It will be released in the "free_lseg" routine.


This is what I'm not understanding. If you have a reference in the
layout segment, then why do you need to call nfs4_find_get_deviceid()
in the read/write code?

Isn't it sufficient to change the "pg_init" calls to check whether or
not the struct nfs4_filelayout_segment has set a value for dsaddr (that
needs to be done with care to avoid races - cmpxchg() is your friend),
and then rely on that reference being set for the remainder of the
layout segment lifetime?


-- 
Trond Myklebust
Linux NFS client maintainer, PrimaryData
trond.myklebust@primarydata.com
Olga Kornievskaia July 20, 2017, 8:14 p.m. UTC | #3
> On Jul 20, 2017, at 3:56 PM, Trond Myklebust <trondmy@primarydata.com> wrote:
> 
> Hi Olga,
> 
> Apologies for missing this patch. It was hiding in my 'linux-fsdevel'
> mailbox, so I didn't recognise it as a NFS patch.
> 

Yeah after I mailed it out I realized I cc-ed fsdevel incorrectly.

> On Fri, 2017-06-30 at 15:52 -0400, Olga Kornievskaia wrote:
>> There is a regression by commit 8d40b0f14846 ("NFS filelayout:call
>> GETDEVICEINFO after pnfs_layout_process completes"). It leaves the
>> DS mount dangling.
>> 
>> Previously, filelayout_alloc_sec() would call
>> filelayout_check_layout()
>> which would call nfs4_find_get_deviceid which ups the count on the
>> device_id. It's only called once and it's matched by the
>> filelayout_free_lseg() that calls nfs4_fl_put_deviceid().
>> 
>> After that patch, each read/write ends up calling
>> nfs4_find_get_deviceid
>> and there is no balance for that. Instead, do nfs4_fl_put_deviceid()
>> in the filelayout's .pg_cleanup and remove it from
>> filelayout_free_lseg.
>> 
>> But we still need a reference to hold over the lifetime of the
>> segment.
>> For every new lseg that's created we need to take a reference on
>> deviceid
>> that uses it. It will be released in the "free_lseg" routine.
> 
> This is what I'm not understanding. If you have a reference in the
> layout segment, then why do you need to call nfs4_find_get_deviceid()
> in the read/write code?

I think I’m probably misunderstanding the question. It sounds to me that you asking me for why the commit 8d40b0f14846 was done the way it was done (I’d would say it was done as per your suggestion).

I would say the call to nfs4_find_get_deviceid() has always been in the read/write code. It was a part of the pnfs_update_layout() but the commit 8d40b0f14846 moved it out of it (so that the layoutget was complete) and then the call to the getdeviceinfo would be done. 

> Isn't it sufficient to change the "pg_init" calls to check whether or
> not the struct nfs4_filelayout_segment has set a value for dsaddr (that
> needs to be done with care to avoid races - cmpxchg() is your friend),
> and then rely on that reference being set for the remainder of the
> layout segment lifetime?

Are you suggesting to change when getdeviceinfo is suppose to be called?

> 
> 
> -- 
> Trond Myklebust
> Linux NFS client maintainer, PrimaryData
> trond.myklebust@primarydata.com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1cf85d6..86d694e 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -909,9 +909,10 @@  static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
 	struct pnfs_layout_hdr *lo;
 	struct nfs4_filelayout_segment *fl;
 	int status;
+	bool new_layout = false;
 
 	lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode,
-				  gfp_flags);
+				  gfp_flags, &new_layout);
 	if (!lseg)
 		lseg = ERR_PTR(-ENOMEM);
 	if (IS_ERR(lseg))
@@ -924,7 +925,8 @@  static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
 	if (status) {
 		pnfs_put_lseg(lseg);
 		lseg = ERR_PTR(status);
-	}
+	} else if (new_layout)
+		nfs4_get_deviceid(&fl->dsaddr->id_node);
 out:
 	return lseg;
 }
@@ -991,18 +993,29 @@  static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
 	nfs_pageio_reset_write_mds(pgio);
 }
 
+static void filelayout_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+	if (desc->pg_lseg) {
+		struct nfs4_filelayout_segment *fl =
+			FILELAYOUT_LSEG(desc->pg_lseg);
+
+		nfs4_fl_put_deviceid(fl->dsaddr);
+	}
+	pnfs_generic_pg_cleanup(desc);
+}
+
 static const struct nfs_pageio_ops filelayout_pg_read_ops = {
 	.pg_init = filelayout_pg_init_read,
 	.pg_test = filelayout_pg_test,
 	.pg_doio = pnfs_generic_pg_readpages,
-	.pg_cleanup = pnfs_generic_pg_cleanup,
+	.pg_cleanup = filelayout_pg_cleanup,
 };
 
 static const struct nfs_pageio_ops filelayout_pg_write_ops = {
 	.pg_init = filelayout_pg_init_write,
 	.pg_test = filelayout_pg_test,
 	.pg_doio = pnfs_generic_pg_writepages,
-	.pg_cleanup = pnfs_generic_pg_cleanup,
+	.pg_cleanup = filelayout_pg_cleanup,
 };
 
 static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 23542dc..53a4a19 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -820,7 +820,8 @@  static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
 					   NFS4_MAX_UINT64,
 					   IOMODE_READ,
 					   strict_iomode,
-					   GFP_KERNEL);
+					   GFP_KERNEL,
+					   NULL);
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 		pgio->pg_lseg = NULL;
@@ -904,7 +905,8 @@  static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_NOFS,
+						   NULL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -960,7 +962,8 @@  static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_NOFS,
+						   NULL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c383d09..fb011c6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1698,7 +1698,8 @@  struct pnfs_layout_segment *
 		   u64 count,
 		   enum pnfs_iomode iomode,
 		   bool strict_iomode,
-		   gfp_t gfp_flags)
+		   gfp_t gfp_flags,
+		   bool *new)
 {
 	struct pnfs_layout_range arg = {
 		.iomode = iomode,
@@ -1764,8 +1765,12 @@  struct pnfs_layout_segment *
 	if (lseg) {
 		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
 				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
+		if (new)
+			*new = false;
 		goto out_unlock;
 	}
+	if (new)
+		*new = true;
 
 	if (!nfs4_valid_open_stateid(ctx->state)) {
 		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
@@ -2126,7 +2131,8 @@  void pnfs_error_mark_layout_for_return(struct inode *inode,
 						   rd_size,
 						   IOMODE_READ,
 						   false,
-						   GFP_KERNEL);
+						   GFP_KERNEL,
+						   NULL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -2153,7 +2159,8 @@  void pnfs_error_mark_layout_for_return(struct inode *inode,
 						   wb_size,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_NOFS,
+						   NULL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 99731e3..978fab0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -291,7 +291,8 @@  struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       u64 count,
 					       enum pnfs_iomode iomode,
 					       bool strict_iomode,
-					       gfp_t gfp_flags);
+					       gfp_t gfp_flags,
+					       bool *new);
 void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
 		const nfs4_stateid *arg_stateid,
 		const struct pnfs_layout_range *range,