diff mbox series

[RFC,v3] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind

Message ID 20220615125400.880067-1-ruansy.fnst@fujitsu.com (mailing list archive)
State Superseded
Headers show
Series [RFC,v3] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind | expand

Commit Message

Shiyang Ruan June 15, 2022, 12:54 p.m. UTC
This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1].  With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
(or mapped device) on it to unmap all files in use and notify processes
who are using those files.

Call trace:
trigger unbind
 -> unbind_store()
  -> ... (skip)
   -> devres_release_all()   # was pmem driver ->remove() in v1
    -> kill_dax()
     -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE)
      -> xfs_dax_notify_failure()

Introduce MF_MEM_REMOVE to let filesystem know this is a remove event.
So do not shutdown filesystem directly if something not supported, or if
failure range includes metadata area.  Make sure all files and processes
are handled correctly.

[1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>

==
Changes since v2:
  1. Rebased on next-20220615

Changes since v1:
  1. Drop the needless change of moving {kill,put}_dax()
  2. Rebased on '[PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink'[2]

---
 drivers/dax/super.c         | 2 +-
 fs/xfs/xfs_notify_failure.c | 6 +++++-
 include/linux/mm.h          | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

Comments

Darrick J. Wong June 22, 2022, 4:49 p.m. UTC | #1
On Wed, Jun 15, 2022 at 08:54:00PM +0800, Shiyang Ruan wrote:
> This patch is inspired by Dan's "mm, dax, pmem: Introduce
> dev_pagemap_failure()"[1].  With the help of dax_holder and
> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> (or mapped device) on it to unmap all files in use and notify processes
> who are using those files.
> 
> Call trace:
> trigger unbind
>  -> unbind_store()
>   -> ... (skip)
>    -> devres_release_all()   # was pmem driver ->remove() in v1
>     -> kill_dax()
>      -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE)
>       -> xfs_dax_notify_failure()
> 
> Introduce MF_MEM_REMOVE to let filesystem know this is a remove event.
> So do not shutdown filesystem directly if something not supported, or if
> failure range includes metadata area.  Make sure all files and processes
> are handled correctly.
> 
> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> 
> ==
> Changes since v2:
>   1. Rebased on next-20220615
> 
> Changes since v1:
>   1. Drop the needless change of moving {kill,put}_dax()
>   2. Rebased on '[PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink'[2]
> 
> ---
>  drivers/dax/super.c         | 2 +-
>  fs/xfs/xfs_notify_failure.c | 6 +++++-
>  include/linux/mm.h          | 1 +
>  3 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 9b5e2a5eb0ae..d4bc83159d46 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -323,7 +323,7 @@ void kill_dax(struct dax_device *dax_dev)
>  		return;
>  
>  	if (dax_dev->holder_data != NULL)
> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE);

At the point we're initiating a MEM_REMOVE call, is the pmem already
gone, or is it about to be gone?

>  
>  	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>  	synchronize_srcu(&dax_srcu);
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> index aa8dc27c599c..91d3f05d4241 100644
> --- a/fs/xfs/xfs_notify_failure.c
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -73,7 +73,9 @@ xfs_dax_failure_fn(
>  	struct failure_info		*notify = data;
>  	int				error = 0;
>  
> -	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> +	/* Do not shutdown so early when device is to be removed */
> +	if (!(notify->mf_flags & MF_MEM_REMOVE) ||
> +	    XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>  	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
>  		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>  		return -EFSCORRUPTED;
> @@ -182,6 +184,8 @@ xfs_dax_notify_failure(
>  
>  	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>  	    mp->m_logdev_targp != mp->m_ddev_targp) {
> +		if (mf_flags & MF_MEM_REMOVE)
> +			return -EOPNOTSUPP;

The reason I ask is that if the pmem is *about to be* but not yet
removed from the system, shouldn't we at least try to flush all dirty
files and the log to reduce data loss and minimize recovery time?

If it's already gone, then you might as well shut down immediately,
unless there's a chance the pmem will come back(?)

--D

>  		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>  		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>  		return -EFSCORRUPTED;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 623c2ee8330a..bbeb31883362 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3249,6 +3249,7 @@ enum mf_flags {
>  	MF_SOFT_OFFLINE = 1 << 3,
>  	MF_UNPOISON = 1 << 4,
>  	MF_NO_RETRY = 1 << 5,
> +	MF_MEM_REMOVE = 1 << 6,
>  };
>  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>  		      unsigned long count, int mf_flags);
> -- 
> 2.36.1
> 
> 
>
Shiyang Ruan June 24, 2022, 1:51 a.m. UTC | #2
在 2022/6/23 0:49, Darrick J. Wong 写道:
> On Wed, Jun 15, 2022 at 08:54:00PM +0800, Shiyang Ruan wrote:
>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>> (or mapped device) on it to unmap all files in use and notify processes
>> who are using those files.
>>
>> Call trace:
>> trigger unbind
>>   -> unbind_store()
>>    -> ... (skip)
>>     -> devres_release_all()   # was pmem driver ->remove() in v1
>>      -> kill_dax()
>>       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE)
>>        -> xfs_dax_notify_failure()
>>
>> Introduce MF_MEM_REMOVE to let filesystem know this is a remove event.
>> So do not shutdown filesystem directly if something not supported, or if
>> failure range includes metadata area.  Make sure all files and processes
>> are handled correctly.
>>
>> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>
>> ==
>> Changes since v2:
>>    1. Rebased on next-20220615
>>
>> Changes since v1:
>>    1. Drop the needless change of moving {kill,put}_dax()
>>    2. Rebased on '[PATCHSETS] v14 fsdax-rmap + v11 fsdax-reflink'[2]
>>
>> ---
>>   drivers/dax/super.c         | 2 +-
>>   fs/xfs/xfs_notify_failure.c | 6 +++++-
>>   include/linux/mm.h          | 1 +
>>   3 files changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>> index 9b5e2a5eb0ae..d4bc83159d46 100644
>> --- a/drivers/dax/super.c
>> +++ b/drivers/dax/super.c
>> @@ -323,7 +323,7 @@ void kill_dax(struct dax_device *dax_dev)
>>   		return;
>>   
>>   	if (dax_dev->holder_data != NULL)
>> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE);
> 
> At the point we're initiating a MEM_REMOVE call, is the pmem already
> gone, or is it about to be gone?

It's about to be gone.

I found two cases:
   1. exec `unbind` by user, who wants to unplug the pmem
   2. handle failures during initialization

> 
>>   
>>   	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>>   	synchronize_srcu(&dax_srcu);
>> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
>> index aa8dc27c599c..91d3f05d4241 100644
>> --- a/fs/xfs/xfs_notify_failure.c
>> +++ b/fs/xfs/xfs_notify_failure.c
>> @@ -73,7 +73,9 @@ xfs_dax_failure_fn(
>>   	struct failure_info		*notify = data;
>>   	int				error = 0;
>>   
>> -	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>> +	/* Do not shutdown so early when device is to be removed */
>> +	if (!(notify->mf_flags & MF_MEM_REMOVE) ||
>> +	    XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>>   	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
>>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>   		return -EFSCORRUPTED;
>> @@ -182,6 +184,8 @@ xfs_dax_notify_failure(
>>   
>>   	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>>   	    mp->m_logdev_targp != mp->m_ddev_targp) {
>> +		if (mf_flags & MF_MEM_REMOVE)
>> +			return -EOPNOTSUPP;
> 
> The reason I ask is that if the pmem is *about to be* but not yet
> removed from the system, shouldn't we at least try to flush all dirty
> files and the log to reduce data loss and minimize recovery time?

Yes, they should be flushed.  Will add it.


--
Thanks,
Ruan.

> 
> If it's already gone, then you might as well shut down immediately,
> unless there's a chance the pmem will come back(?)
> 
> --D
> 
>>   		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>   		return -EFSCORRUPTED;
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 623c2ee8330a..bbeb31883362 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3249,6 +3249,7 @@ enum mf_flags {
>>   	MF_SOFT_OFFLINE = 1 << 3,
>>   	MF_UNPOISON = 1 << 4,
>>   	MF_NO_RETRY = 1 << 5,
>> +	MF_MEM_REMOVE = 1 << 6,
>>   };
>>   int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>   		      unsigned long count, int mf_flags);
>> -- 
>> 2.36.1
>>
>>
>>
diff mbox series

Patch

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 9b5e2a5eb0ae..d4bc83159d46 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -323,7 +323,7 @@  void kill_dax(struct dax_device *dax_dev)
 		return;
 
 	if (dax_dev->holder_data != NULL)
-		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+		dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_REMOVE);
 
 	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 	synchronize_srcu(&dax_srcu);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index aa8dc27c599c..91d3f05d4241 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -73,7 +73,9 @@  xfs_dax_failure_fn(
 	struct failure_info		*notify = data;
 	int				error = 0;
 
-	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+	/* Do not shutdown so early when device is to be removed */
+	if (!(notify->mf_flags & MF_MEM_REMOVE) ||
+	    XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
 	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		return -EFSCORRUPTED;
@@ -182,6 +184,8 @@  xfs_dax_notify_failure(
 
 	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
 	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		if (mf_flags & MF_MEM_REMOVE)
+			return -EOPNOTSUPP;
 		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		return -EFSCORRUPTED;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 623c2ee8330a..bbeb31883362 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3249,6 +3249,7 @@  enum mf_flags {
 	MF_SOFT_OFFLINE = 1 << 3,
 	MF_UNPOISON = 1 << 4,
 	MF_NO_RETRY = 1 << 5,
+	MF_MEM_REMOVE = 1 << 6,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);