diff mbox series

[v12,2/2] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind

Message ID 20230629081651.253626-3-ruansy.fnst@fujitsu.com (mailing list archive)
State New
Headers show
Series mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind | expand

Commit Message

Shiyang Ruan June 29, 2023, 8:16 a.m. UTC
This patch is inspired by Dan's "mm, dax, pmem: Introduce
dev_pagemap_failure()"[1].  With the help of dax_holder and
->notify_failure() mechanism, the pmem driver is able to ask filesystem
on it to unmap all files in use, and notify processes who are using
those files.

Call trace:
trigger unbind
 -> unbind_store()
  -> ... (skip)
   -> devres_release_all()
    -> kill_dax()
     -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
      -> xfs_dax_notify_failure()
      `-> freeze_super()             // freeze (kernel call)
      `-> do xfs rmap
      ` -> mf_dax_kill_procs()
      `  -> collect_procs_fsdax()    // all associated processes
      `  -> unmap_and_kill()
      ` -> invalidate_inode_pages2_range() // drop file's cache
      `-> thaw_super()               // thaw (both kernel & user call)

Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
new dax mapping from being created.  Do not shutdown filesystem directly
if configuration is not supported, or if failure range includes metadata
area.  Make sure all files and processes(not only the current progress)
are handled correctly.  Also drop the cache of associated files before
pmem is removed.

[1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
[2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 drivers/dax/super.c         |  3 +-
 fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
 include/linux/mm.h          |  1 +
 mm/memory-failure.c         | 17 ++++++--
 4 files changed, 96 insertions(+), 11 deletions(-)

Comments

kernel test robot June 29, 2023, 12:02 p.m. UTC | #1
Hi Shiyang,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Shiyang-Ruan/xfs-fix-the-calculation-for-end-and-length/20230629-161913
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230629081651.253626-3-ruansy.fnst%40fujitsu.com
patch subject: [PATCH v12 2/2] mm, pmem, xfs: Introduce MF_MEM_REMOVE for unbind
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20230629/202306291954.zqVvCUZ5-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230629/202306291954.zqVvCUZ5-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306291954.zqVvCUZ5-lkp@intel.com/

All errors (new ones prefixed by >>):

   fs/xfs/xfs_notify_failure.c: In function 'xfs_dax_notify_failure_freeze':
>> fs/xfs/xfs_notify_failure.c:127:33: error: 'FREEZE_HOLDER_KERNEL' undeclared (first use in this function)
     127 |         while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
         |                                 ^~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:127:33: note: each undeclared identifier is reported only once for each function it appears in
>> fs/xfs/xfs_notify_failure.c:127:16: error: too many arguments to function 'freeze_super'
     127 |         while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
         |                ^~~~~~~~~~~~
   In file included from include/linux/huge_mm.h:8,
                    from include/linux/mm.h:988,
                    from fs/xfs/kmem.h:11,
                    from fs/xfs/xfs_linux.h:24,
                    from fs/xfs/xfs.h:22,
                    from fs/xfs/xfs_notify_failure.c:6:
   include/linux/fs.h:2289:12: note: declared here
    2289 | extern int freeze_super(struct super_block *super);
         |            ^~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c: In function 'xfs_dax_notify_failure_thaw':
   fs/xfs/xfs_notify_failure.c:140:32: error: 'FREEZE_HOLDER_KERNEL' undeclared (first use in this function)
     140 |         error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
         |                                ^~~~~~~~~~~~~~~~~~~~
>> fs/xfs/xfs_notify_failure.c:140:17: error: too many arguments to function 'thaw_super'
     140 |         error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
         |                 ^~~~~~~~~~
   include/linux/fs.h:2290:12: note: declared here
    2290 | extern int thaw_super(struct super_block *super);
         |            ^~~~~~~~~~
>> fs/xfs/xfs_notify_failure.c:148:24: error: 'FREEZE_HOLDER_USERSPACE' undeclared (first use in this function)
     148 |         thaw_super(sb, FREEZE_HOLDER_USERSPACE);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:148:9: error: too many arguments to function 'thaw_super'
     148 |         thaw_super(sb, FREEZE_HOLDER_USERSPACE);
         |         ^~~~~~~~~~
   include/linux/fs.h:2290:12: note: declared here
    2290 | extern int thaw_super(struct super_block *super);
         |            ^~~~~~~~~~


vim +/FREEZE_HOLDER_KERNEL +127 fs/xfs/xfs_notify_failure.c

   119	
   120	static void
   121	xfs_dax_notify_failure_freeze(
   122		struct xfs_mount	*mp)
   123	{
   124		struct super_block 	*sb = mp->m_super;
   125	
   126		/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
 > 127		while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
   128			// Shall we just wait, or print warning then return -EBUSY?
   129			delay(HZ / 10);
   130		}
   131	}
   132	
   133	static void
   134	xfs_dax_notify_failure_thaw(
   135		struct xfs_mount	*mp)
   136	{
   137		struct super_block	*sb = mp->m_super;
   138		int			error;
   139	
 > 140		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
   141		if (error)
   142			xfs_emerg(mp, "still frozen after notify failure, err=%d",
   143				  error);
   144		/*
   145		 * Also thaw userspace call anyway because the device is about to be
   146		 * removed immediately.
   147		 */
 > 148		thaw_super(sb, FREEZE_HOLDER_USERSPACE);
   149	}
   150
Shiyang Ruan July 14, 2023, 9:07 a.m. UTC | #2
Hi Darrick,

Thanks for applying the 1st patch.

Now, since this patch is based on the new freeze_super()/thaw_super() 
api[1], I'd like to ask what's the plan for this api?  It seems to have 
missed the v6.5-rc1.

[1] 
https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/


--
Thanks,
Ruan.


在 2023/6/29 16:16, Shiyang Ruan 写道:
> This patch is inspired by Dan's "mm, dax, pmem: Introduce
> dev_pagemap_failure()"[1].  With the help of dax_holder and
> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> on it to unmap all files in use, and notify processes who are using
> those files.
> 
> Call trace:
> trigger unbind
>   -> unbind_store()
>    -> ... (skip)
>     -> devres_release_all()
>      -> kill_dax()
>       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>        -> xfs_dax_notify_failure()
>        `-> freeze_super()             // freeze (kernel call)
>        `-> do xfs rmap
>        ` -> mf_dax_kill_procs()
>        `  -> collect_procs_fsdax()    // all associated processes
>        `  -> unmap_and_kill()
>        ` -> invalidate_inode_pages2_range() // drop file's cache
>        `-> thaw_super()               // thaw (both kernel & user call)
> 
> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> new dax mapping from being created.  Do not shutdown filesystem directly
> if configuration is not supported, or if failure range includes metadata
> area.  Make sure all files and processes(not only the current progress)
> are handled correctly.  Also drop the cache of associated files before
> pmem is removed.
> 
> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>   drivers/dax/super.c         |  3 +-
>   fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>   include/linux/mm.h          |  1 +
>   mm/memory-failure.c         | 17 ++++++--
>   4 files changed, 96 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c4c4728a36e4..2e1a35e82fce 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>   		return;
>   
>   	if (dax_dev->holder_data != NULL)
> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> +				MF_MEM_PRE_REMOVE);
>   
>   	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>   	synchronize_srcu(&dax_srcu);
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> index 4a9bbd3fe120..f6ec56b76db6 100644
> --- a/fs/xfs/xfs_notify_failure.c
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -22,6 +22,7 @@
>   
>   #include <linux/mm.h>
>   #include <linux/dax.h>
> +#include <linux/fs.h>
>   
>   struct xfs_failure_info {
>   	xfs_agblock_t		startblock;
> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>   	struct xfs_mount		*mp = cur->bc_mp;
>   	struct xfs_inode		*ip;
>   	struct xfs_failure_info		*notify = data;
> +	struct address_space		*mapping;
> +	pgoff_t				pgoff;
> +	unsigned long			pgcnt;
>   	int				error = 0;
>   
>   	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>   	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +		/* Continue the query because this isn't a failure. */
> +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> +			return 0;
>   		notify->want_shutdown = true;
>   		return 0;
>   	}
> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>   		return 0;
>   	}
>   
> -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> -				  xfs_failure_pgoff(mp, rec, notify),
> -				  xfs_failure_pgcnt(mp, rec, notify),
> -				  notify->mf_flags);
> +	mapping = VFS_I(ip)->i_mapping;
> +	pgoff = xfs_failure_pgoff(mp, rec, notify);
> +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> +
> +	/* Continue the rmap query if the inode isn't a dax file. */
> +	if (dax_mapping(mapping))
> +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> +					  notify->mf_flags);
> +
> +	/* Invalidate the cache in dax pages. */
> +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> +		invalidate_inode_pages2_range(mapping, pgoff,
> +					      pgoff + pgcnt - 1);
> +
>   	xfs_irele(ip);
>   	return error;
>   }
>   
> +static void
> +xfs_dax_notify_failure_freeze(
> +	struct xfs_mount	*mp)
> +{
> +	struct super_block 	*sb = mp->m_super;
> +
> +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
> +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
> +		// Shall we just wait, or print warning then return -EBUSY?
> +		delay(HZ / 10);
> +	}
> +}
> +
> +static void
> +xfs_dax_notify_failure_thaw(
> +	struct xfs_mount	*mp)
> +{
> +	struct super_block	*sb = mp->m_super;
> +	int			error;
> +
> +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
> +	if (error)
> +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
> +			  error);
> +	/*
> +	 * Also thaw userspace call anyway because the device is about to be
> +	 * removed immediately.
> +	 */
> +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
> +}
> +
>   static int
>   xfs_dax_notify_ddev_failure(
>   	struct xfs_mount	*mp,
> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>   
>   	error = xfs_trans_alloc_empty(mp, &tp);
>   	if (error)
> -		return error;
> +		goto out;
>   
>   	for (; agno <= end_agno; agno++) {
>   		struct xfs_rmap_irec	ri_low = { };
> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>   	}
>   
>   	xfs_trans_cancel(tp);
> +
> +	/*
> +	 * Determine how to shutdown the filesystem according to the
> +	 * error code and flags.
> +	 */
>   	if (error || notify.want_shutdown) {
>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>   		if (!error)
>   			error = -EFSCORRUPTED;
> -	}
> +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
> +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
> +
> +out:
> +	/* Thaw the fs if it is freezed before. */
> +	if (mf_flags & MF_MEM_PRE_REMOVE)
> +		xfs_dax_notify_failure_thaw(mp);
> +
>   	return error;
>   }
>   
> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>   
>   	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>   	    mp->m_logdev_targp != mp->m_ddev_targp) {
> +		if (mf_flags & MF_MEM_PRE_REMOVE)
> +			return 0;
>   		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>   		return -EFSCORRUPTED;
> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>   	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>   	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>   
> +	/* Notify failure on the whole device. */
> +	if (offset == 0 && len == U64_MAX) {
> +		offset = ddev_start;
> +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
> +	}
> +
>   	/* Ignore the range out of filesystem area */
>   	if (offset + len - 1 < ddev_start)
>   		return -ENXIO;
> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>   	if (offset + len - 1 > ddev_end)
>   		len = ddev_end - offset + 1;
>   
> +	if (mf_flags & MF_MEM_PRE_REMOVE) {
> +		xfs_info(mp, "device is about to be removed!");
> +		/* Freeze fs to prevent new mappings from being created. */
> +		xfs_dax_notify_failure_freeze(mp);
> +	}
> +
>   	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
>   			mf_flags);
>   }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 27ce77080c79..a80c255b88d2 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3576,6 +3576,7 @@ enum mf_flags {
>   	MF_UNPOISON = 1 << 4,
>   	MF_SW_SIMULATED = 1 << 5,
>   	MF_NO_RETRY = 1 << 6,
> +	MF_MEM_PRE_REMOVE = 1 << 7,
>   };
>   int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>   		      unsigned long count, int mf_flags);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 5b663eca1f29..483b75f2fcfb 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>    */
>   static void collect_procs_fsdax(struct page *page,
>   		struct address_space *mapping, pgoff_t pgoff,
> -		struct list_head *to_kill)
> +		struct list_head *to_kill, bool pre_remove)
>   {
>   	struct vm_area_struct *vma;
>   	struct task_struct *tsk;
> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>   	i_mmap_lock_read(mapping);
>   	read_lock(&tasklist_lock);
>   	for_each_process(tsk) {
> -		struct task_struct *t = task_early_kill(tsk, true);
> +		struct task_struct *t = tsk;
>   
> +		/*
> +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
> +		 * current may not be the one accessing the fsdax page.
> +		 * Otherwise, search for the current task.
> +		 */
> +		if (!pre_remove)
> +			t = task_early_kill(tsk, true);
>   		if (!t)
>   			continue;
>   		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>   	dax_entry_t cookie;
>   	struct page *page;
>   	size_t end = index + count;
> +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>   
>   	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>   
> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>   		if (!page)
>   			goto unlock;
>   
> -		SetPageHWPoison(page);
> +		if (!pre_remove)
> +			SetPageHWPoison(page);
>   
> -		collect_procs_fsdax(page, mapping, index, &to_kill);
> +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
>   		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>   				index, mf_flags);
>   unlock:
Darrick J. Wong July 14, 2023, 2:18 p.m. UTC | #3
On Fri, Jul 14, 2023 at 05:07:58PM +0800, Shiyang Ruan wrote:
> Hi Darrick,
> 
> Thanks for applying the 1st patch.
> 
> Now, since this patch is based on the new freeze_super()/thaw_super()
> api[1], I'd like to ask what's the plan for this api?  It seems to have
> missed the v6.5-rc1.
> 
> [1] https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/

6.6.  I intend to push the XFS UBSAN fixes to the list today for review.
Early next week I'll resend the 6.5 rebase of the kernelfreeze series
and push it to vfs-for-next.  Some time after that will come large folio
writes.

--D

> 
> --
> Thanks,
> Ruan.
> 
> 
> 在 2023/6/29 16:16, Shiyang Ruan 写道:
> > This patch is inspired by Dan's "mm, dax, pmem: Introduce
> > dev_pagemap_failure()"[1].  With the help of dax_holder and
> > ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> > on it to unmap all files in use, and notify processes who are using
> > those files.
> > 
> > Call trace:
> > trigger unbind
> >   -> unbind_store()
> >    -> ... (skip)
> >     -> devres_release_all()
> >      -> kill_dax()
> >       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
> >        -> xfs_dax_notify_failure()
> >        `-> freeze_super()             // freeze (kernel call)
> >        `-> do xfs rmap
> >        ` -> mf_dax_kill_procs()
> >        `  -> collect_procs_fsdax()    // all associated processes
> >        `  -> unmap_and_kill()
> >        ` -> invalidate_inode_pages2_range() // drop file's cache
> >        `-> thaw_super()               // thaw (both kernel & user call)
> > 
> > Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> > event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> > new dax mapping from being created.  Do not shutdown filesystem directly
> > if configuration is not supported, or if failure range includes metadata
> > area.  Make sure all files and processes(not only the current progress)
> > are handled correctly.  Also drop the cache of associated files before
> > pmem is removed.
> > 
> > [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> > [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> > 
> > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> > ---
> >   drivers/dax/super.c         |  3 +-
> >   fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
> >   include/linux/mm.h          |  1 +
> >   mm/memory-failure.c         | 17 ++++++--
> >   4 files changed, 96 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> > index c4c4728a36e4..2e1a35e82fce 100644
> > --- a/drivers/dax/super.c
> > +++ b/drivers/dax/super.c
> > @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
> >   		return;
> >   	if (dax_dev->holder_data != NULL)
> > -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> > +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> > +				MF_MEM_PRE_REMOVE);
> >   	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
> >   	synchronize_srcu(&dax_srcu);
> > diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> > index 4a9bbd3fe120..f6ec56b76db6 100644
> > --- a/fs/xfs/xfs_notify_failure.c
> > +++ b/fs/xfs/xfs_notify_failure.c
> > @@ -22,6 +22,7 @@
> >   #include <linux/mm.h>
> >   #include <linux/dax.h>
> > +#include <linux/fs.h>
> >   struct xfs_failure_info {
> >   	xfs_agblock_t		startblock;
> > @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
> >   	struct xfs_mount		*mp = cur->bc_mp;
> >   	struct xfs_inode		*ip;
> >   	struct xfs_failure_info		*notify = data;
> > +	struct address_space		*mapping;
> > +	pgoff_t				pgoff;
> > +	unsigned long			pgcnt;
> >   	int				error = 0;
> >   	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> >   	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> > +		/* Continue the query because this isn't a failure. */
> > +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > +			return 0;
> >   		notify->want_shutdown = true;
> >   		return 0;
> >   	}
> > @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
> >   		return 0;
> >   	}
> > -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> > -				  xfs_failure_pgoff(mp, rec, notify),
> > -				  xfs_failure_pgcnt(mp, rec, notify),
> > -				  notify->mf_flags);
> > +	mapping = VFS_I(ip)->i_mapping;
> > +	pgoff = xfs_failure_pgoff(mp, rec, notify);
> > +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> > +
> > +	/* Continue the rmap query if the inode isn't a dax file. */
> > +	if (dax_mapping(mapping))
> > +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> > +					  notify->mf_flags);
> > +
> > +	/* Invalidate the cache in dax pages. */
> > +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > +		invalidate_inode_pages2_range(mapping, pgoff,
> > +					      pgoff + pgcnt - 1);
> > +
> >   	xfs_irele(ip);
> >   	return error;
> >   }
> > +static void
> > +xfs_dax_notify_failure_freeze(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct super_block 	*sb = mp->m_super;
> > +
> > +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
> > +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
> > +		// Shall we just wait, or print warning then return -EBUSY?
> > +		delay(HZ / 10);
> > +	}
> > +}
> > +
> > +static void
> > +xfs_dax_notify_failure_thaw(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct super_block	*sb = mp->m_super;
> > +	int			error;
> > +
> > +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
> > +	if (error)
> > +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
> > +			  error);
> > +	/*
> > +	 * Also thaw userspace call anyway because the device is about to be
> > +	 * removed immediately.
> > +	 */
> > +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
> > +}
> > +
> >   static int
> >   xfs_dax_notify_ddev_failure(
> >   	struct xfs_mount	*mp,
> > @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
> >   	error = xfs_trans_alloc_empty(mp, &tp);
> >   	if (error)
> > -		return error;
> > +		goto out;
> >   	for (; agno <= end_agno; agno++) {
> >   		struct xfs_rmap_irec	ri_low = { };
> > @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
> >   	}
> >   	xfs_trans_cancel(tp);
> > +
> > +	/*
> > +	 * Determine how to shutdown the filesystem according to the
> > +	 * error code and flags.
> > +	 */
> >   	if (error || notify.want_shutdown) {
> >   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> >   		if (!error)
> >   			error = -EFSCORRUPTED;
> > -	}
> > +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
> > +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
> > +
> > +out:
> > +	/* Thaw the fs if it is freezed before. */
> > +	if (mf_flags & MF_MEM_PRE_REMOVE)
> > +		xfs_dax_notify_failure_thaw(mp);
> > +
> >   	return error;
> >   }
> > @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
> >   	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> >   	    mp->m_logdev_targp != mp->m_ddev_targp) {
> > +		if (mf_flags & MF_MEM_PRE_REMOVE)
> > +			return 0;
> >   		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> >   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> >   		return -EFSCORRUPTED;
> > @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
> >   	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
> >   	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
> > +	/* Notify failure on the whole device. */
> > +	if (offset == 0 && len == U64_MAX) {
> > +		offset = ddev_start;
> > +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
> > +	}
> > +
> >   	/* Ignore the range out of filesystem area */
> >   	if (offset + len - 1 < ddev_start)
> >   		return -ENXIO;
> > @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
> >   	if (offset + len - 1 > ddev_end)
> >   		len = ddev_end - offset + 1;
> > +	if (mf_flags & MF_MEM_PRE_REMOVE) {
> > +		xfs_info(mp, "device is about to be removed!");
> > +		/* Freeze fs to prevent new mappings from being created. */
> > +		xfs_dax_notify_failure_freeze(mp);
> > +	}
> > +
> >   	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> >   			mf_flags);
> >   }
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 27ce77080c79..a80c255b88d2 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3576,6 +3576,7 @@ enum mf_flags {
> >   	MF_UNPOISON = 1 << 4,
> >   	MF_SW_SIMULATED = 1 << 5,
> >   	MF_NO_RETRY = 1 << 6,
> > +	MF_MEM_PRE_REMOVE = 1 << 7,
> >   };
> >   int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> >   		      unsigned long count, int mf_flags);
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 5b663eca1f29..483b75f2fcfb 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
> >    */
> >   static void collect_procs_fsdax(struct page *page,
> >   		struct address_space *mapping, pgoff_t pgoff,
> > -		struct list_head *to_kill)
> > +		struct list_head *to_kill, bool pre_remove)
> >   {
> >   	struct vm_area_struct *vma;
> >   	struct task_struct *tsk;
> > @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
> >   	i_mmap_lock_read(mapping);
> >   	read_lock(&tasklist_lock);
> >   	for_each_process(tsk) {
> > -		struct task_struct *t = task_early_kill(tsk, true);
> > +		struct task_struct *t = tsk;
> > +		/*
> > +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
> > +		 * current may not be the one accessing the fsdax page.
> > +		 * Otherwise, search for the current task.
> > +		 */
> > +		if (!pre_remove)
> > +			t = task_early_kill(tsk, true);
> >   		if (!t)
> >   			continue;
> >   		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> > @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> >   	dax_entry_t cookie;
> >   	struct page *page;
> >   	size_t end = index + count;
> > +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
> >   	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> > @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> >   		if (!page)
> >   			goto unlock;
> > -		SetPageHWPoison(page);
> > +		if (!pre_remove)
> > +			SetPageHWPoison(page);
> > -		collect_procs_fsdax(page, mapping, index, &to_kill);
> > +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
> >   		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
> >   				index, mf_flags);
> >   unlock:
Shiyang Ruan July 20, 2023, 1:50 a.m. UTC | #4
在 2023/7/14 22:18, Darrick J. Wong 写道:
> On Fri, Jul 14, 2023 at 05:07:58PM +0800, Shiyang Ruan wrote:
>> Hi Darrick,
>>
>> Thanks for applying the 1st patch.
>>
>> Now, since this patch is based on the new freeze_super()/thaw_super()
>> api[1], I'd like to ask what's the plan for this api?  It seems to have
>> missed the v6.5-rc1.
>>
>> [1] https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> 
> 6.6.  I intend to push the XFS UBSAN fixes to the list today for review.
> Early next week I'll resend the 6.5 rebase of the kernelfreeze series
> and push it to vfs-for-next.  Some time after that will come large folio
> writes.

Got it.  Thanks for your information!


--
Ruan.

> 
> --D
> 
>>
>> --
>> Thanks,
>> Ruan.
>>
>>
>> 在 2023/6/29 16:16, Shiyang Ruan 写道:
>>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>>> on it to unmap all files in use, and notify processes who are using
>>> those files.
>>>
>>> Call trace:
>>> trigger unbind
>>>    -> unbind_store()
>>>     -> ... (skip)
>>>      -> devres_release_all()
>>>       -> kill_dax()
>>>        -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>>>         -> xfs_dax_notify_failure()
>>>         `-> freeze_super()             // freeze (kernel call)
>>>         `-> do xfs rmap
>>>         ` -> mf_dax_kill_procs()
>>>         `  -> collect_procs_fsdax()    // all associated processes
>>>         `  -> unmap_and_kill()
>>>         ` -> invalidate_inode_pages2_range() // drop file's cache
>>>         `-> thaw_super()               // thaw (both kernel & user call)
>>>
>>> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
>>> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
>>> new dax mapping from being created.  Do not shutdown filesystem directly
>>> if configuration is not supported, or if failure range includes metadata
>>> area.  Make sure all files and processes(not only the current progress)
>>> are handled correctly.  Also drop the cache of associated files before
>>> pmem is removed.
>>>
>>> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>>> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>>
>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>> ---
>>>    drivers/dax/super.c         |  3 +-
>>>    fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>>>    include/linux/mm.h          |  1 +
>>>    mm/memory-failure.c         | 17 ++++++--
>>>    4 files changed, 96 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>>> index c4c4728a36e4..2e1a35e82fce 100644
>>> --- a/drivers/dax/super.c
>>> +++ b/drivers/dax/super.c
>>> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>>>    		return;
>>>    	if (dax_dev->holder_data != NULL)
>>> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>>> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
>>> +				MF_MEM_PRE_REMOVE);
>>>    	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>>>    	synchronize_srcu(&dax_srcu);
>>> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
>>> index 4a9bbd3fe120..f6ec56b76db6 100644
>>> --- a/fs/xfs/xfs_notify_failure.c
>>> +++ b/fs/xfs/xfs_notify_failure.c
>>> @@ -22,6 +22,7 @@
>>>    #include <linux/mm.h>
>>>    #include <linux/dax.h>
>>> +#include <linux/fs.h>
>>>    struct xfs_failure_info {
>>>    	xfs_agblock_t		startblock;
>>> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>>>    	struct xfs_mount		*mp = cur->bc_mp;
>>>    	struct xfs_inode		*ip;
>>>    	struct xfs_failure_info		*notify = data;
>>> +	struct address_space		*mapping;
>>> +	pgoff_t				pgoff;
>>> +	unsigned long			pgcnt;
>>>    	int				error = 0;
>>>    	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>>>    	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
>>> +		/* Continue the query because this isn't a failure. */
>>> +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>> +			return 0;
>>>    		notify->want_shutdown = true;
>>>    		return 0;
>>>    	}
>>> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>>>    		return 0;
>>>    	}
>>> -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
>>> -				  xfs_failure_pgoff(mp, rec, notify),
>>> -				  xfs_failure_pgcnt(mp, rec, notify),
>>> -				  notify->mf_flags);
>>> +	mapping = VFS_I(ip)->i_mapping;
>>> +	pgoff = xfs_failure_pgoff(mp, rec, notify);
>>> +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
>>> +
>>> +	/* Continue the rmap query if the inode isn't a dax file. */
>>> +	if (dax_mapping(mapping))
>>> +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
>>> +					  notify->mf_flags);
>>> +
>>> +	/* Invalidate the cache in dax pages. */
>>> +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>> +		invalidate_inode_pages2_range(mapping, pgoff,
>>> +					      pgoff + pgcnt - 1);
>>> +
>>>    	xfs_irele(ip);
>>>    	return error;
>>>    }
>>> +static void
>>> +xfs_dax_notify_failure_freeze(
>>> +	struct xfs_mount	*mp)
>>> +{
>>> +	struct super_block 	*sb = mp->m_super;
>>> +
>>> +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
>>> +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
>>> +		// Shall we just wait, or print warning then return -EBUSY?
>>> +		delay(HZ / 10);
>>> +	}
>>> +}
>>> +
>>> +static void
>>> +xfs_dax_notify_failure_thaw(
>>> +	struct xfs_mount	*mp)
>>> +{
>>> +	struct super_block	*sb = mp->m_super;
>>> +	int			error;
>>> +
>>> +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
>>> +	if (error)
>>> +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
>>> +			  error);
>>> +	/*
>>> +	 * Also thaw userspace call anyway because the device is about to be
>>> +	 * removed immediately.
>>> +	 */
>>> +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
>>> +}
>>> +
>>>    static int
>>>    xfs_dax_notify_ddev_failure(
>>>    	struct xfs_mount	*mp,
>>> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>>>    	error = xfs_trans_alloc_empty(mp, &tp);
>>>    	if (error)
>>> -		return error;
>>> +		goto out;
>>>    	for (; agno <= end_agno; agno++) {
>>>    		struct xfs_rmap_irec	ri_low = { };
>>> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>>>    	}
>>>    	xfs_trans_cancel(tp);
>>> +
>>> +	/*
>>> +	 * Determine how to shutdown the filesystem according to the
>>> +	 * error code and flags.
>>> +	 */
>>>    	if (error || notify.want_shutdown) {
>>>    		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>    		if (!error)
>>>    			error = -EFSCORRUPTED;
>>> -	}
>>> +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
>>> +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
>>> +
>>> +out:
>>> +	/* Thaw the fs if it is freezed before. */
>>> +	if (mf_flags & MF_MEM_PRE_REMOVE)
>>> +		xfs_dax_notify_failure_thaw(mp);
>>> +
>>>    	return error;
>>>    }
>>> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>>>    	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>>>    	    mp->m_logdev_targp != mp->m_ddev_targp) {
>>> +		if (mf_flags & MF_MEM_PRE_REMOVE)
>>> +			return 0;
>>>    		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>>>    		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>    		return -EFSCORRUPTED;
>>> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>>>    	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>>>    	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>>> +	/* Notify failure on the whole device. */
>>> +	if (offset == 0 && len == U64_MAX) {
>>> +		offset = ddev_start;
>>> +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
>>> +	}
>>> +
>>>    	/* Ignore the range out of filesystem area */
>>>    	if (offset + len - 1 < ddev_start)
>>>    		return -ENXIO;
>>> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>>>    	if (offset + len - 1 > ddev_end)
>>>    		len = ddev_end - offset + 1;
>>> +	if (mf_flags & MF_MEM_PRE_REMOVE) {
>>> +		xfs_info(mp, "device is about to be removed!");
>>> +		/* Freeze fs to prevent new mappings from being created. */
>>> +		xfs_dax_notify_failure_freeze(mp);
>>> +	}
>>> +
>>>    	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
>>>    			mf_flags);
>>>    }
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 27ce77080c79..a80c255b88d2 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -3576,6 +3576,7 @@ enum mf_flags {
>>>    	MF_UNPOISON = 1 << 4,
>>>    	MF_SW_SIMULATED = 1 << 5,
>>>    	MF_NO_RETRY = 1 << 6,
>>> +	MF_MEM_PRE_REMOVE = 1 << 7,
>>>    };
>>>    int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>    		      unsigned long count, int mf_flags);
>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>> index 5b663eca1f29..483b75f2fcfb 100644
>>> --- a/mm/memory-failure.c
>>> +++ b/mm/memory-failure.c
>>> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>>>     */
>>>    static void collect_procs_fsdax(struct page *page,
>>>    		struct address_space *mapping, pgoff_t pgoff,
>>> -		struct list_head *to_kill)
>>> +		struct list_head *to_kill, bool pre_remove)
>>>    {
>>>    	struct vm_area_struct *vma;
>>>    	struct task_struct *tsk;
>>> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>>>    	i_mmap_lock_read(mapping);
>>>    	read_lock(&tasklist_lock);
>>>    	for_each_process(tsk) {
>>> -		struct task_struct *t = task_early_kill(tsk, true);
>>> +		struct task_struct *t = tsk;
>>> +		/*
>>> +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
>>> +		 * current may not be the one accessing the fsdax page.
>>> +		 * Otherwise, search for the current task.
>>> +		 */
>>> +		if (!pre_remove)
>>> +			t = task_early_kill(tsk, true);
>>>    		if (!t)
>>>    			continue;
>>>    		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>>> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>    	dax_entry_t cookie;
>>>    	struct page *page;
>>>    	size_t end = index + count;
>>> +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>>>    	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>>> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>    		if (!page)
>>>    			goto unlock;
>>> -		SetPageHWPoison(page);
>>> +		if (!pre_remove)
>>> +			SetPageHWPoison(page);
>>> -		collect_procs_fsdax(page, mapping, index, &to_kill);
>>> +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
>>>    		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>>>    				index, mf_flags);
>>>    unlock:
Shiyang Ruan July 29, 2023, 10:01 a.m. UTC | #5
在 2023/7/20 9:50, Shiyang Ruan 写道:
> 
> 
> 在 2023/7/14 22:18, Darrick J. Wong 写道:
>> On Fri, Jul 14, 2023 at 05:07:58PM +0800, Shiyang Ruan wrote:
>>> Hi Darrick,
>>>
>>> Thanks for applying the 1st patch.
>>>
>>> Now, since this patch is based on the new freeze_super()/thaw_super()
>>> api[1], I'd like to ask what's the plan for this api?  It seems to have
>>> missed the v6.5-rc1.
>>>
>>> [1] 
>>> https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>
>> 6.6.  I intend to push the XFS UBSAN fixes to the list today for review.
>> Early next week I'll resend the 6.5 rebase of the kernelfreeze series
>> and push it to vfs-for-next.  Some time after that will come large folio
>> writes.
> 
> Got it.  Thanks for your information!

A small request:  If you have time to give some comments, I would 
appreciate it because I hope we can make the most out of this 
period(before freeze api be merged in 6.6).


--
Thanks,
Ruan.

> 
> 
> -- 
> Ruan.
> 
>>
>> --D
>>
>>>
>>> -- 
>>> Thanks,
>>> Ruan.
>>>
>>>
>>> 在 2023/6/29 16:16, Shiyang Ruan 写道:
>>>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>>>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>>>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>>>> on it to unmap all files in use, and notify processes who are using
>>>> those files.
>>>>
>>>> Call trace:
>>>> trigger unbind
>>>>    -> unbind_store()
>>>>     -> ... (skip)
>>>>      -> devres_release_all()
>>>>       -> kill_dax()
>>>>        -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, 
>>>> MF_MEM_PRE_REMOVE)
>>>>         -> xfs_dax_notify_failure()
>>>>         `-> freeze_super()             // freeze (kernel call)
>>>>         `-> do xfs rmap
>>>>         ` -> mf_dax_kill_procs()
>>>>         `  -> collect_procs_fsdax()    // all associated processes
>>>>         `  -> unmap_and_kill()
>>>>         ` -> invalidate_inode_pages2_range() // drop file's cache
>>>>         `-> thaw_super()               // thaw (both kernel & user 
>>>> call)
>>>>
>>>> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
>>>> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to 
>>>> prevent
>>>> new dax mapping from being created.  Do not shutdown filesystem 
>>>> directly
>>>> if configuration is not supported, or if failure range includes 
>>>> metadata
>>>> area.  Make sure all files and processes(not only the current progress)
>>>> are handled correctly.  Also drop the cache of associated files before
>>>> pmem is removed.
>>>>
>>>> [1]: 
>>>> https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>>>> [2]: 
>>>> https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>>>
>>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>>> ---
>>>>    drivers/dax/super.c         |  3 +-
>>>>    fs/xfs/xfs_notify_failure.c | 86 
>>>> ++++++++++++++++++++++++++++++++++---
>>>>    include/linux/mm.h          |  1 +
>>>>    mm/memory-failure.c         | 17 ++++++--
>>>>    4 files changed, 96 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>>>> index c4c4728a36e4..2e1a35e82fce 100644
>>>> --- a/drivers/dax/super.c
>>>> +++ b/drivers/dax/super.c
>>>> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>>>>            return;
>>>>        if (dax_dev->holder_data != NULL)
>>>> -        dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>>>> +        dax_holder_notify_failure(dax_dev, 0, U64_MAX,
>>>> +                MF_MEM_PRE_REMOVE);
>>>>        clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>>>>        synchronize_srcu(&dax_srcu);
>>>> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
>>>> index 4a9bbd3fe120..f6ec56b76db6 100644
>>>> --- a/fs/xfs/xfs_notify_failure.c
>>>> +++ b/fs/xfs/xfs_notify_failure.c
>>>> @@ -22,6 +22,7 @@
>>>>    #include <linux/mm.h>
>>>>    #include <linux/dax.h>
>>>> +#include <linux/fs.h>
>>>>    struct xfs_failure_info {
>>>>        xfs_agblock_t        startblock;
>>>> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>>>>        struct xfs_mount        *mp = cur->bc_mp;
>>>>        struct xfs_inode        *ip;
>>>>        struct xfs_failure_info        *notify = data;
>>>> +    struct address_space        *mapping;
>>>> +    pgoff_t                pgoff;
>>>> +    unsigned long            pgcnt;
>>>>        int                error = 0;
>>>>        if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>>>>            (rec->rm_flags & (XFS_RMAP_ATTR_FORK | 
>>>> XFS_RMAP_BMBT_BLOCK))) {
>>>> +        /* Continue the query because this isn't a failure. */
>>>> +        if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>>> +            return 0;
>>>>            notify->want_shutdown = true;
>>>>            return 0;
>>>>        }
>>>> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>>>>            return 0;
>>>>        }
>>>> -    error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
>>>> -                  xfs_failure_pgoff(mp, rec, notify),
>>>> -                  xfs_failure_pgcnt(mp, rec, notify),
>>>> -                  notify->mf_flags);
>>>> +    mapping = VFS_I(ip)->i_mapping;
>>>> +    pgoff = xfs_failure_pgoff(mp, rec, notify);
>>>> +    pgcnt = xfs_failure_pgcnt(mp, rec, notify);
>>>> +
>>>> +    /* Continue the rmap query if the inode isn't a dax file. */
>>>> +    if (dax_mapping(mapping))
>>>> +        error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
>>>> +                      notify->mf_flags);
>>>> +
>>>> +    /* Invalidate the cache in dax pages. */
>>>> +    if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>>> +        invalidate_inode_pages2_range(mapping, pgoff,
>>>> +                          pgoff + pgcnt - 1);
>>>> +
>>>>        xfs_irele(ip);
>>>>        return error;
>>>>    }
>>>> +static void
>>>> +xfs_dax_notify_failure_freeze(
>>>> +    struct xfs_mount    *mp)
>>>> +{
>>>> +    struct super_block     *sb = mp->m_super;
>>>> +
>>>> +    /* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
>>>> +    while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
>>>> +        // Shall we just wait, or print warning then return -EBUSY?
>>>> +        delay(HZ / 10);
>>>> +    }
>>>> +}
>>>> +
>>>> +static void
>>>> +xfs_dax_notify_failure_thaw(
>>>> +    struct xfs_mount    *mp)
>>>> +{
>>>> +    struct super_block    *sb = mp->m_super;
>>>> +    int            error;
>>>> +
>>>> +    error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
>>>> +    if (error)
>>>> +        xfs_emerg(mp, "still frozen after notify failure, err=%d",
>>>> +              error);
>>>> +    /*
>>>> +     * Also thaw userspace call anyway because the device is about 
>>>> to be
>>>> +     * removed immediately.
>>>> +     */
>>>> +    thaw_super(sb, FREEZE_HOLDER_USERSPACE);
>>>> +}
>>>> +
>>>>    static int
>>>>    xfs_dax_notify_ddev_failure(
>>>>        struct xfs_mount    *mp,
>>>> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>>>>        error = xfs_trans_alloc_empty(mp, &tp);
>>>>        if (error)
>>>> -        return error;
>>>> +        goto out;
>>>>        for (; agno <= end_agno; agno++) {
>>>>            struct xfs_rmap_irec    ri_low = { };
>>>> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>>>>        }
>>>>        xfs_trans_cancel(tp);
>>>> +
>>>> +    /*
>>>> +     * Determine how to shutdown the filesystem according to the
>>>> +     * error code and flags.
>>>> +     */
>>>>        if (error || notify.want_shutdown) {
>>>>            xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>>            if (!error)
>>>>                error = -EFSCORRUPTED;
>>>> -    }
>>>> +    } else if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +        xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
>>>> +
>>>> +out:
>>>> +    /* Thaw the fs if it is freezed before. */
>>>> +    if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +        xfs_dax_notify_failure_thaw(mp);
>>>> +
>>>>        return error;
>>>>    }
>>>> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>>>>        if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == 
>>>> dax_dev &&
>>>>            mp->m_logdev_targp != mp->m_ddev_targp) {
>>>> +        if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +            return 0;
>>>>            xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>>>>            xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>>            return -EFSCORRUPTED;
>>>> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>>>>        ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>>>>        ddev_end = ddev_start + 
>>>> bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>>>> +    /* Notify failure on the whole device. */
>>>> +    if (offset == 0 && len == U64_MAX) {
>>>> +        offset = ddev_start;
>>>> +        len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
>>>> +    }
>>>> +
>>>>        /* Ignore the range out of filesystem area */
>>>>        if (offset + len - 1 < ddev_start)
>>>>            return -ENXIO;
>>>> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>>>>        if (offset + len - 1 > ddev_end)
>>>>            len = ddev_end - offset + 1;
>>>> +    if (mf_flags & MF_MEM_PRE_REMOVE) {
>>>> +        xfs_info(mp, "device is about to be removed!");
>>>> +        /* Freeze fs to prevent new mappings from being created. */
>>>> +        xfs_dax_notify_failure_freeze(mp);
>>>> +    }
>>>> +
>>>>        return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), 
>>>> BTOBB(len),
>>>>                mf_flags);
>>>>    }
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index 27ce77080c79..a80c255b88d2 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -3576,6 +3576,7 @@ enum mf_flags {
>>>>        MF_UNPOISON = 1 << 4,
>>>>        MF_SW_SIMULATED = 1 << 5,
>>>>        MF_NO_RETRY = 1 << 6,
>>>> +    MF_MEM_PRE_REMOVE = 1 << 7,
>>>>    };
>>>>    int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>>                  unsigned long count, int mf_flags);
>>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>>> index 5b663eca1f29..483b75f2fcfb 100644
>>>> --- a/mm/memory-failure.c
>>>> +++ b/mm/memory-failure.c
>>>> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct 
>>>> *tsk, struct page *p,
>>>>     */
>>>>    static void collect_procs_fsdax(struct page *page,
>>>>            struct address_space *mapping, pgoff_t pgoff,
>>>> -        struct list_head *to_kill)
>>>> +        struct list_head *to_kill, bool pre_remove)
>>>>    {
>>>>        struct vm_area_struct *vma;
>>>>        struct task_struct *tsk;
>>>> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>>>>        i_mmap_lock_read(mapping);
>>>>        read_lock(&tasklist_lock);
>>>>        for_each_process(tsk) {
>>>> -        struct task_struct *t = task_early_kill(tsk, true);
>>>> +        struct task_struct *t = tsk;
>>>> +        /*
>>>> +         * Search for all tasks while MF_MEM_PRE_REMOVE, because the
>>>> +         * current may not be the one accessing the fsdax page.
>>>> +         * Otherwise, search for the current task.
>>>> +         */
>>>> +        if (!pre_remove)
>>>> +            t = task_early_kill(tsk, true);
>>>>            if (!t)
>>>>                continue;
>>>>            vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, 
>>>> pgoff) {
>>>> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space 
>>>> *mapping, pgoff_t index,
>>>>        dax_entry_t cookie;
>>>>        struct page *page;
>>>>        size_t end = index + count;
>>>> +    bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>>>>        mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>>>> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space 
>>>> *mapping, pgoff_t index,
>>>>            if (!page)
>>>>                goto unlock;
>>>> -        SetPageHWPoison(page);
>>>> +        if (!pre_remove)
>>>> +            SetPageHWPoison(page);
>>>> -        collect_procs_fsdax(page, mapping, index, &to_kill);
>>>> +        collect_procs_fsdax(page, mapping, index, &to_kill, 
>>>> pre_remove);
>>>>            unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>>>>                    index, mf_flags);
>>>>    unlock:
Darrick J. Wong July 29, 2023, 3:15 p.m. UTC | #6
On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:
> This patch is inspired by Dan's "mm, dax, pmem: Introduce
> dev_pagemap_failure()"[1].  With the help of dax_holder and
> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> on it to unmap all files in use, and notify processes who are using
> those files.
> 
> Call trace:
> trigger unbind
>  -> unbind_store()
>   -> ... (skip)
>    -> devres_release_all()
>     -> kill_dax()
>      -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>       -> xfs_dax_notify_failure()
>       `-> freeze_super()             // freeze (kernel call)
>       `-> do xfs rmap
>       ` -> mf_dax_kill_procs()
>       `  -> collect_procs_fsdax()    // all associated processes
>       `  -> unmap_and_kill()
>       ` -> invalidate_inode_pages2_range() // drop file's cache
>       `-> thaw_super()               // thaw (both kernel & user call)
> 
> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> new dax mapping from being created.  Do not shutdown filesystem directly
> if configuration is not supported, or if failure range includes metadata
> area.  Make sure all files and processes(not only the current progress)
> are handled correctly.  Also drop the cache of associated files before
> pmem is removed.
> 
> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  drivers/dax/super.c         |  3 +-
>  fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>  include/linux/mm.h          |  1 +
>  mm/memory-failure.c         | 17 ++++++--
>  4 files changed, 96 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c4c4728a36e4..2e1a35e82fce 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>  		return;
>  
>  	if (dax_dev->holder_data != NULL)
> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> +				MF_MEM_PRE_REMOVE);
>  
>  	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>  	synchronize_srcu(&dax_srcu);
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> index 4a9bbd3fe120..f6ec56b76db6 100644
> --- a/fs/xfs/xfs_notify_failure.c
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -22,6 +22,7 @@
>  
>  #include <linux/mm.h>
>  #include <linux/dax.h>
> +#include <linux/fs.h>
>  
>  struct xfs_failure_info {
>  	xfs_agblock_t		startblock;
> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>  	struct xfs_mount		*mp = cur->bc_mp;
>  	struct xfs_inode		*ip;
>  	struct xfs_failure_info		*notify = data;
> +	struct address_space		*mapping;
> +	pgoff_t				pgoff;
> +	unsigned long			pgcnt;
>  	int				error = 0;
>  
>  	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>  	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +		/* Continue the query because this isn't a failure. */
> +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> +			return 0;
>  		notify->want_shutdown = true;
>  		return 0;
>  	}
> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>  		return 0;
>  	}
>  
> -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> -				  xfs_failure_pgoff(mp, rec, notify),
> -				  xfs_failure_pgcnt(mp, rec, notify),
> -				  notify->mf_flags);
> +	mapping = VFS_I(ip)->i_mapping;
> +	pgoff = xfs_failure_pgoff(mp, rec, notify);
> +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> +
> +	/* Continue the rmap query if the inode isn't a dax file. */
> +	if (dax_mapping(mapping))
> +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> +					  notify->mf_flags);
> +
> +	/* Invalidate the cache in dax pages. */
> +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> +		invalidate_inode_pages2_range(mapping, pgoff,
> +					      pgoff + pgcnt - 1);
> +
>  	xfs_irele(ip);
>  	return error;
>  }
>  
> +static void
> +xfs_dax_notify_failure_freeze(
> +	struct xfs_mount	*mp)
> +{
> +	struct super_block 	*sb = mp->m_super;

Nit: extra space right    ^ here.

> +
> +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
> +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
> +		// Shall we just wait, or print warning then return -EBUSY?

Hm.  PRE_REMOVE gets called before the pmem gets unplugged, right?  So
we'll send a second notification after it goes away, right?

If so, then I'd say return the error here instead of looping, and live
with a kernel-frozen fs discarding the PRE_REMOVE message.

> +		delay(HZ / 10);
> +	}
> +}
> +
> +static void
> +xfs_dax_notify_failure_thaw(
> +	struct xfs_mount	*mp)
> +{
> +	struct super_block	*sb = mp->m_super;
> +	int			error;
> +
> +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
> +	if (error)
> +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
> +			  error);
> +	/*
> +	 * Also thaw userspace call anyway because the device is about to be
> +	 * removed immediately.
> +	 */
> +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
> +}
> +
>  static int
>  xfs_dax_notify_ddev_failure(
>  	struct xfs_mount	*mp,
> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>  
>  	error = xfs_trans_alloc_empty(mp, &tp);
>  	if (error)
> -		return error;
> +		goto out;
>  
>  	for (; agno <= end_agno; agno++) {
>  		struct xfs_rmap_irec	ri_low = { };
> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>  	}
>  
>  	xfs_trans_cancel(tp);
> +
> +	/*
> +	 * Determine how to shutdown the filesystem according to the
> +	 * error code and flags.
> +	 */
>  	if (error || notify.want_shutdown) {
>  		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>  		if (!error)
>  			error = -EFSCORRUPTED;
> -	}
> +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
> +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
> +
> +out:
> +	/* Thaw the fs if it is freezed before. */
> +	if (mf_flags & MF_MEM_PRE_REMOVE)
> +		xfs_dax_notify_failure_thaw(mp);

_thaw should be called from the same function that called _freeze.

The rest of the patch seems ok to me.

--D

> +
>  	return error;
>  }
>  
> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>  
>  	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>  	    mp->m_logdev_targp != mp->m_ddev_targp) {
> +		if (mf_flags & MF_MEM_PRE_REMOVE)
> +			return 0;
>  		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>  		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>  		return -EFSCORRUPTED;
> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>  	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>  	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>  
> +	/* Notify failure on the whole device. */
> +	if (offset == 0 && len == U64_MAX) {
> +		offset = ddev_start;
> +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
> +	}
> +
>  	/* Ignore the range out of filesystem area */
>  	if (offset + len - 1 < ddev_start)
>  		return -ENXIO;
> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>  	if (offset + len - 1 > ddev_end)
>  		len = ddev_end - offset + 1;
>  
> +	if (mf_flags & MF_MEM_PRE_REMOVE) {
> +		xfs_info(mp, "device is about to be removed!");
> +		/* Freeze fs to prevent new mappings from being created. */
> +		xfs_dax_notify_failure_freeze(mp);
> +	}
> +
>  	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
>  			mf_flags);
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 27ce77080c79..a80c255b88d2 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3576,6 +3576,7 @@ enum mf_flags {
>  	MF_UNPOISON = 1 << 4,
>  	MF_SW_SIMULATED = 1 << 5,
>  	MF_NO_RETRY = 1 << 6,
> +	MF_MEM_PRE_REMOVE = 1 << 7,
>  };
>  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>  		      unsigned long count, int mf_flags);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 5b663eca1f29..483b75f2fcfb 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>   */
>  static void collect_procs_fsdax(struct page *page,
>  		struct address_space *mapping, pgoff_t pgoff,
> -		struct list_head *to_kill)
> +		struct list_head *to_kill, bool pre_remove)
>  {
>  	struct vm_area_struct *vma;
>  	struct task_struct *tsk;
> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>  	i_mmap_lock_read(mapping);
>  	read_lock(&tasklist_lock);
>  	for_each_process(tsk) {
> -		struct task_struct *t = task_early_kill(tsk, true);
> +		struct task_struct *t = tsk;
>  
> +		/*
> +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
> +		 * current may not be the one accessing the fsdax page.
> +		 * Otherwise, search for the current task.
> +		 */
> +		if (!pre_remove)
> +			t = task_early_kill(tsk, true);
>  		if (!t)
>  			continue;
>  		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>  	dax_entry_t cookie;
>  	struct page *page;
>  	size_t end = index + count;
> +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>  
>  	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>  
> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>  		if (!page)
>  			goto unlock;
>  
> -		SetPageHWPoison(page);
> +		if (!pre_remove)
> +			SetPageHWPoison(page);
>  
> -		collect_procs_fsdax(page, mapping, index, &to_kill);
> +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
>  		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>  				index, mf_flags);
>  unlock:
> -- 
> 2.40.1
>
Darrick J. Wong July 29, 2023, 3:15 p.m. UTC | #7
On Sat, Jul 29, 2023 at 06:01:00PM +0800, Shiyang Ruan wrote:
> 
> 
> 在 2023/7/20 9:50, Shiyang Ruan 写道:
> > 
> > 
> > 在 2023/7/14 22:18, Darrick J. Wong 写道:
> > > On Fri, Jul 14, 2023 at 05:07:58PM +0800, Shiyang Ruan wrote:
> > > > Hi Darrick,
> > > > 
> > > > Thanks for applying the 1st patch.
> > > > 
> > > > Now, since this patch is based on the new freeze_super()/thaw_super()
> > > > api[1], I'd like to ask what's the plan for this api?  It seems to have
> > > > missed the v6.5-rc1.
> > > > 
> > > > [1] https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> > > 
> > > 6.6.  I intend to push the XFS UBSAN fixes to the list today for review.
> > > Early next week I'll resend the 6.5 rebase of the kernelfreeze series
> > > and push it to vfs-for-next.  Some time after that will come large folio
> > > writes.
> > 
> > Got it.  Thanks for your information!
> 
> A small request:  If you have time to give some comments, I would appreciate
> it because I hope we can make the most out of this period(before freeze api
> be merged in 6.6).

Done.

--D


> 
> --
> Thanks,
> Ruan.
> 
> > 
> > 
> > -- 
> > Ruan.
> > 
> > > 
> > > --D
> > > 
> > > > 
> > > > -- 
> > > > Thanks,
> > > > Ruan.
> > > > 
> > > > 
> > > > 在 2023/6/29 16:16, Shiyang Ruan 写道:
> > > > > This patch is inspired by Dan's "mm, dax, pmem: Introduce
> > > > > dev_pagemap_failure()"[1].  With the help of dax_holder and
> > > > > ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> > > > > on it to unmap all files in use, and notify processes who are using
> > > > > those files.
> > > > > 
> > > > > Call trace:
> > > > > trigger unbind
> > > > >    -> unbind_store()
> > > > >     -> ... (skip)
> > > > >      -> devres_release_all()
> > > > >       -> kill_dax()
> > > > >        -> dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> > > > > MF_MEM_PRE_REMOVE)
> > > > >         -> xfs_dax_notify_failure()
> > > > >         `-> freeze_super()             // freeze (kernel call)
> > > > >         `-> do xfs rmap
> > > > >         ` -> mf_dax_kill_procs()
> > > > >         `  -> collect_procs_fsdax()    // all associated processes
> > > > >         `  -> unmap_and_kill()
> > > > >         ` -> invalidate_inode_pages2_range() // drop file's cache
> > > > >         `-> thaw_super()               // thaw (both kernel
> > > > > & user call)
> > > > > 
> > > > > Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> > > > > event.  Use the exclusive freeze/thaw[2] to lock the
> > > > > filesystem to prevent
> > > > > new dax mapping from being created.  Do not shutdown
> > > > > filesystem directly
> > > > > if configuration is not supported, or if failure range
> > > > > includes metadata
> > > > > area.  Make sure all files and processes(not only the current progress)
> > > > > are handled correctly.  Also drop the cache of associated files before
> > > > > pmem is removed.
> > > > > 
> > > > > [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> > > > > [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> > > > > 
> > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> > > > > ---
> > > > >    drivers/dax/super.c         |  3 +-
> > > > >    fs/xfs/xfs_notify_failure.c | 86
> > > > > ++++++++++++++++++++++++++++++++++---
> > > > >    include/linux/mm.h          |  1 +
> > > > >    mm/memory-failure.c         | 17 ++++++--
> > > > >    4 files changed, 96 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> > > > > index c4c4728a36e4..2e1a35e82fce 100644
> > > > > --- a/drivers/dax/super.c
> > > > > +++ b/drivers/dax/super.c
> > > > > @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
> > > > >            return;
> > > > >        if (dax_dev->holder_data != NULL)
> > > > > -        dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> > > > > +        dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> > > > > +                MF_MEM_PRE_REMOVE);
> > > > >        clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
> > > > >        synchronize_srcu(&dax_srcu);
> > > > > diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> > > > > index 4a9bbd3fe120..f6ec56b76db6 100644
> > > > > --- a/fs/xfs/xfs_notify_failure.c
> > > > > +++ b/fs/xfs/xfs_notify_failure.c
> > > > > @@ -22,6 +22,7 @@
> > > > >    #include <linux/mm.h>
> > > > >    #include <linux/dax.h>
> > > > > +#include <linux/fs.h>
> > > > >    struct xfs_failure_info {
> > > > >        xfs_agblock_t        startblock;
> > > > > @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
> > > > >        struct xfs_mount        *mp = cur->bc_mp;
> > > > >        struct xfs_inode        *ip;
> > > > >        struct xfs_failure_info        *notify = data;
> > > > > +    struct address_space        *mapping;
> > > > > +    pgoff_t                pgoff;
> > > > > +    unsigned long            pgcnt;
> > > > >        int                error = 0;
> > > > >        if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> > > > >            (rec->rm_flags & (XFS_RMAP_ATTR_FORK |
> > > > > XFS_RMAP_BMBT_BLOCK))) {
> > > > > +        /* Continue the query because this isn't a failure. */
> > > > > +        if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > > > +            return 0;
> > > > >            notify->want_shutdown = true;
> > > > >            return 0;
> > > > >        }
> > > > > @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
> > > > >            return 0;
> > > > >        }
> > > > > -    error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> > > > > -                  xfs_failure_pgoff(mp, rec, notify),
> > > > > -                  xfs_failure_pgcnt(mp, rec, notify),
> > > > > -                  notify->mf_flags);
> > > > > +    mapping = VFS_I(ip)->i_mapping;
> > > > > +    pgoff = xfs_failure_pgoff(mp, rec, notify);
> > > > > +    pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> > > > > +
> > > > > +    /* Continue the rmap query if the inode isn't a dax file. */
> > > > > +    if (dax_mapping(mapping))
> > > > > +        error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> > > > > +                      notify->mf_flags);
> > > > > +
> > > > > +    /* Invalidate the cache in dax pages. */
> > > > > +    if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > > > +        invalidate_inode_pages2_range(mapping, pgoff,
> > > > > +                          pgoff + pgcnt - 1);
> > > > > +
> > > > >        xfs_irele(ip);
> > > > >        return error;
> > > > >    }
> > > > > +static void
> > > > > +xfs_dax_notify_failure_freeze(
> > > > > +    struct xfs_mount    *mp)
> > > > > +{
> > > > > +    struct super_block     *sb = mp->m_super;
> > > > > +
> > > > > +    /* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
> > > > > +    while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
> > > > > +        // Shall we just wait, or print warning then return -EBUSY?
> > > > > +        delay(HZ / 10);
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +xfs_dax_notify_failure_thaw(
> > > > > +    struct xfs_mount    *mp)
> > > > > +{
> > > > > +    struct super_block    *sb = mp->m_super;
> > > > > +    int            error;
> > > > > +
> > > > > +    error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
> > > > > +    if (error)
> > > > > +        xfs_emerg(mp, "still frozen after notify failure, err=%d",
> > > > > +              error);
> > > > > +    /*
> > > > > +     * Also thaw userspace call anyway because the device
> > > > > is about to be
> > > > > +     * removed immediately.
> > > > > +     */
> > > > > +    thaw_super(sb, FREEZE_HOLDER_USERSPACE);
> > > > > +}
> > > > > +
> > > > >    static int
> > > > >    xfs_dax_notify_ddev_failure(
> > > > >        struct xfs_mount    *mp,
> > > > > @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
> > > > >        error = xfs_trans_alloc_empty(mp, &tp);
> > > > >        if (error)
> > > > > -        return error;
> > > > > +        goto out;
> > > > >        for (; agno <= end_agno; agno++) {
> > > > >            struct xfs_rmap_irec    ri_low = { };
> > > > > @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
> > > > >        }
> > > > >        xfs_trans_cancel(tp);
> > > > > +
> > > > > +    /*
> > > > > +     * Determine how to shutdown the filesystem according to the
> > > > > +     * error code and flags.
> > > > > +     */
> > > > >        if (error || notify.want_shutdown) {
> > > > >            xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> > > > >            if (!error)
> > > > >                error = -EFSCORRUPTED;
> > > > > -    }
> > > > > +    } else if (mf_flags & MF_MEM_PRE_REMOVE)
> > > > > +        xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
> > > > > +
> > > > > +out:
> > > > > +    /* Thaw the fs if it is freezed before. */
> > > > > +    if (mf_flags & MF_MEM_PRE_REMOVE)
> > > > > +        xfs_dax_notify_failure_thaw(mp);
> > > > > +
> > > > >        return error;
> > > > >    }
> > > > > @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
> > > > >        if (mp->m_logdev_targp &&
> > > > > mp->m_logdev_targp->bt_daxdev == dax_dev &&
> > > > >            mp->m_logdev_targp != mp->m_ddev_targp) {
> > > > > +        if (mf_flags & MF_MEM_PRE_REMOVE)
> > > > > +            return 0;
> > > > >            xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> > > > >            xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> > > > >            return -EFSCORRUPTED;
> > > > > @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
> > > > >        ddev_start = mp->m_ddev_targp->bt_dax_part_off;
> > > > >        ddev_end = ddev_start +
> > > > > bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
> > > > > +    /* Notify failure on the whole device. */
> > > > > +    if (offset == 0 && len == U64_MAX) {
> > > > > +        offset = ddev_start;
> > > > > +        len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
> > > > > +    }
> > > > > +
> > > > >        /* Ignore the range out of filesystem area */
> > > > >        if (offset + len - 1 < ddev_start)
> > > > >            return -ENXIO;
> > > > > @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
> > > > >        if (offset + len - 1 > ddev_end)
> > > > >            len = ddev_end - offset + 1;
> > > > > +    if (mf_flags & MF_MEM_PRE_REMOVE) {
> > > > > +        xfs_info(mp, "device is about to be removed!");
> > > > > +        /* Freeze fs to prevent new mappings from being created. */
> > > > > +        xfs_dax_notify_failure_freeze(mp);
> > > > > +    }
> > > > > +
> > > > >        return xfs_dax_notify_ddev_failure(mp, BTOBB(offset),
> > > > > BTOBB(len),
> > > > >                mf_flags);
> > > > >    }
> > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > > index 27ce77080c79..a80c255b88d2 100644
> > > > > --- a/include/linux/mm.h
> > > > > +++ b/include/linux/mm.h
> > > > > @@ -3576,6 +3576,7 @@ enum mf_flags {
> > > > >        MF_UNPOISON = 1 << 4,
> > > > >        MF_SW_SIMULATED = 1 << 5,
> > > > >        MF_NO_RETRY = 1 << 6,
> > > > > +    MF_MEM_PRE_REMOVE = 1 << 7,
> > > > >    };
> > > > >    int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> > > > >                  unsigned long count, int mf_flags);
> > > > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > > > > index 5b663eca1f29..483b75f2fcfb 100644
> > > > > --- a/mm/memory-failure.c
> > > > > +++ b/mm/memory-failure.c
> > > > > @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct
> > > > > task_struct *tsk, struct page *p,
> > > > >     */
> > > > >    static void collect_procs_fsdax(struct page *page,
> > > > >            struct address_space *mapping, pgoff_t pgoff,
> > > > > -        struct list_head *to_kill)
> > > > > +        struct list_head *to_kill, bool pre_remove)
> > > > >    {
> > > > >        struct vm_area_struct *vma;
> > > > >        struct task_struct *tsk;
> > > > > @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
> > > > >        i_mmap_lock_read(mapping);
> > > > >        read_lock(&tasklist_lock);
> > > > >        for_each_process(tsk) {
> > > > > -        struct task_struct *t = task_early_kill(tsk, true);
> > > > > +        struct task_struct *t = tsk;
> > > > > +        /*
> > > > > +         * Search for all tasks while MF_MEM_PRE_REMOVE, because the
> > > > > +         * current may not be the one accessing the fsdax page.
> > > > > +         * Otherwise, search for the current task.
> > > > > +         */
> > > > > +        if (!pre_remove)
> > > > > +            t = task_early_kill(tsk, true);
> > > > >            if (!t)
> > > > >                continue;
> > > > >            vma_interval_tree_foreach(vma, &mapping->i_mmap,
> > > > > pgoff, pgoff) {
> > > > > @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct
> > > > > address_space *mapping, pgoff_t index,
> > > > >        dax_entry_t cookie;
> > > > >        struct page *page;
> > > > >        size_t end = index + count;
> > > > > +    bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
> > > > >        mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> > > > > @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct
> > > > > address_space *mapping, pgoff_t index,
> > > > >            if (!page)
> > > > >                goto unlock;
> > > > > -        SetPageHWPoison(page);
> > > > > +        if (!pre_remove)
> > > > > +            SetPageHWPoison(page);
> > > > > -        collect_procs_fsdax(page, mapping, index, &to_kill);
> > > > > +        collect_procs_fsdax(page, mapping, index, &to_kill,
> > > > > pre_remove);
> > > > >            unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
> > > > >                    index, mf_flags);
> > > > >    unlock:
Shiyang Ruan July 31, 2023, 9:36 a.m. UTC | #8
在 2023/7/29 23:15, Darrick J. Wong 写道:
> On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:
>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>> on it to unmap all files in use, and notify processes who are using
>> those files.
>>
>> Call trace:
>> trigger unbind
>>   -> unbind_store()
>>    -> ... (skip)
>>     -> devres_release_all()
>>      -> kill_dax()
>>       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>>        -> xfs_dax_notify_failure()
>>        `-> freeze_super()             // freeze (kernel call)
>>        `-> do xfs rmap
>>        ` -> mf_dax_kill_procs()
>>        `  -> collect_procs_fsdax()    // all associated processes
>>        `  -> unmap_and_kill()
>>        ` -> invalidate_inode_pages2_range() // drop file's cache
>>        `-> thaw_super()               // thaw (both kernel & user call)
>>
>> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
>> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
>> new dax mapping from being created.  Do not shutdown filesystem directly
>> if configuration is not supported, or if failure range includes metadata
>> area.  Make sure all files and processes(not only the current progress)
>> are handled correctly.  Also drop the cache of associated files before
>> pmem is removed.
>>
>> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>> ---
>>   drivers/dax/super.c         |  3 +-
>>   fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>>   include/linux/mm.h          |  1 +
>>   mm/memory-failure.c         | 17 ++++++--
>>   4 files changed, 96 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>> index c4c4728a36e4..2e1a35e82fce 100644
>> --- a/drivers/dax/super.c
>> +++ b/drivers/dax/super.c
>> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>>   		return;
>>   
>>   	if (dax_dev->holder_data != NULL)
>> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
>> +				MF_MEM_PRE_REMOVE);
>>   
>>   	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>>   	synchronize_srcu(&dax_srcu);
>> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
>> index 4a9bbd3fe120..f6ec56b76db6 100644
>> --- a/fs/xfs/xfs_notify_failure.c
>> +++ b/fs/xfs/xfs_notify_failure.c
>> @@ -22,6 +22,7 @@
>>   
>>   #include <linux/mm.h>
>>   #include <linux/dax.h>
>> +#include <linux/fs.h>
>>   
>>   struct xfs_failure_info {
>>   	xfs_agblock_t		startblock;
>> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>>   	struct xfs_mount		*mp = cur->bc_mp;
>>   	struct xfs_inode		*ip;
>>   	struct xfs_failure_info		*notify = data;
>> +	struct address_space		*mapping;
>> +	pgoff_t				pgoff;
>> +	unsigned long			pgcnt;
>>   	int				error = 0;
>>   
>>   	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>>   	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
>> +		/* Continue the query because this isn't a failure. */
>> +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>> +			return 0;
>>   		notify->want_shutdown = true;
>>   		return 0;
>>   	}
>> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>>   		return 0;
>>   	}
>>   
>> -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
>> -				  xfs_failure_pgoff(mp, rec, notify),
>> -				  xfs_failure_pgcnt(mp, rec, notify),
>> -				  notify->mf_flags);
>> +	mapping = VFS_I(ip)->i_mapping;
>> +	pgoff = xfs_failure_pgoff(mp, rec, notify);
>> +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
>> +
>> +	/* Continue the rmap query if the inode isn't a dax file. */
>> +	if (dax_mapping(mapping))
>> +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
>> +					  notify->mf_flags);
>> +
>> +	/* Invalidate the cache in dax pages. */
>> +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>> +		invalidate_inode_pages2_range(mapping, pgoff,
>> +					      pgoff + pgcnt - 1);
>> +
>>   	xfs_irele(ip);
>>   	return error;
>>   }
>>   
>> +static void
>> +xfs_dax_notify_failure_freeze(
>> +	struct xfs_mount	*mp)
>> +{
>> +	struct super_block 	*sb = mp->m_super;
> 
> Nit: extra space right    ^ here.
> 
>> +
>> +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
>> +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
>> +		// Shall we just wait, or print warning then return -EBUSY?
> 
> Hm.  PRE_REMOVE gets called before the pmem gets unplugged, right?  So
> we'll send a second notification after it goes away, right?

For the first question, yes.

But I'm not sure about the second one.  Do you mean: we'll send this 
notification again if unbind didn't success because freeze_super() 
returns -EBUSY?  In other words, if the previous unbind operation did 
not work, we could unbind the device again.

> 
> If so, then I'd say return the error here instead of looping, and live
> with a kernel-frozen fs discarding the PRE_REMOVE message.
> 
>> +		delay(HZ / 10);
>> +	}
>> +}
>> +
>> +static void
>> +xfs_dax_notify_failure_thaw(
>> +	struct xfs_mount	*mp)
>> +{
>> +	struct super_block	*sb = mp->m_super;
>> +	int			error;
>> +
>> +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
>> +	if (error)
>> +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
>> +			  error);
>> +	/*
>> +	 * Also thaw userspace call anyway because the device is about to be
>> +	 * removed immediately.
>> +	 */
>> +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
>> +}
>> +
>>   static int
>>   xfs_dax_notify_ddev_failure(
>>   	struct xfs_mount	*mp,
>> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>>   
>>   	error = xfs_trans_alloc_empty(mp, &tp);
>>   	if (error)
>> -		return error;
>> +		goto out;
>>   
>>   	for (; agno <= end_agno; agno++) {
>>   		struct xfs_rmap_irec	ri_low = { };
>> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>>   	}
>>   
>>   	xfs_trans_cancel(tp);
>> +
>> +	/*
>> +	 * Determine how to shutdown the filesystem according to the
>> +	 * error code and flags.
>> +	 */
>>   	if (error || notify.want_shutdown) {
>>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>   		if (!error)
>>   			error = -EFSCORRUPTED;
>> -	}
>> +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
>> +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
>> +
>> +out:
>> +	/* Thaw the fs if it is freezed before. */
>> +	if (mf_flags & MF_MEM_PRE_REMOVE)
>> +		xfs_dax_notify_failure_thaw(mp);
> 
> _thaw should be called from the same function that called _freeze.

Will fix this.

> 
> The rest of the patch seems ok to me.

Thank you!


--
Ruan.

> 
> --D
> 
>> +
>>   	return error;
>>   }
>>   
>> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>>   
>>   	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>>   	    mp->m_logdev_targp != mp->m_ddev_targp) {
>> +		if (mf_flags & MF_MEM_PRE_REMOVE)
>> +			return 0;
>>   		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>>   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>   		return -EFSCORRUPTED;
>> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>>   	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>>   	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>>   
>> +	/* Notify failure on the whole device. */
>> +	if (offset == 0 && len == U64_MAX) {
>> +		offset = ddev_start;
>> +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
>> +	}
>> +
>>   	/* Ignore the range out of filesystem area */
>>   	if (offset + len - 1 < ddev_start)
>>   		return -ENXIO;
>> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>>   	if (offset + len - 1 > ddev_end)
>>   		len = ddev_end - offset + 1;
>>   
>> +	if (mf_flags & MF_MEM_PRE_REMOVE) {
>> +		xfs_info(mp, "device is about to be removed!");
>> +		/* Freeze fs to prevent new mappings from being created. */
>> +		xfs_dax_notify_failure_freeze(mp);
>> +	}
>> +
>>   	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
>>   			mf_flags);
>>   }
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 27ce77080c79..a80c255b88d2 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3576,6 +3576,7 @@ enum mf_flags {
>>   	MF_UNPOISON = 1 << 4,
>>   	MF_SW_SIMULATED = 1 << 5,
>>   	MF_NO_RETRY = 1 << 6,
>> +	MF_MEM_PRE_REMOVE = 1 << 7,
>>   };
>>   int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>   		      unsigned long count, int mf_flags);
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 5b663eca1f29..483b75f2fcfb 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>>    */
>>   static void collect_procs_fsdax(struct page *page,
>>   		struct address_space *mapping, pgoff_t pgoff,
>> -		struct list_head *to_kill)
>> +		struct list_head *to_kill, bool pre_remove)
>>   {
>>   	struct vm_area_struct *vma;
>>   	struct task_struct *tsk;
>> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>>   	i_mmap_lock_read(mapping);
>>   	read_lock(&tasklist_lock);
>>   	for_each_process(tsk) {
>> -		struct task_struct *t = task_early_kill(tsk, true);
>> +		struct task_struct *t = tsk;
>>   
>> +		/*
>> +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
>> +		 * current may not be the one accessing the fsdax page.
>> +		 * Otherwise, search for the current task.
>> +		 */
>> +		if (!pre_remove)
>> +			t = task_early_kill(tsk, true);
>>   		if (!t)
>>   			continue;
>>   		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>   	dax_entry_t cookie;
>>   	struct page *page;
>>   	size_t end = index + count;
>> +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>>   
>>   	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>>   
>> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>   		if (!page)
>>   			goto unlock;
>>   
>> -		SetPageHWPoison(page);
>> +		if (!pre_remove)
>> +			SetPageHWPoison(page);
>>   
>> -		collect_procs_fsdax(page, mapping, index, &to_kill);
>> +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
>>   		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>>   				index, mf_flags);
>>   unlock:
>> -- 
>> 2.40.1
>>
Darrick J. Wong Aug. 1, 2023, 3:25 a.m. UTC | #9
On Mon, Jul 31, 2023 at 05:36:36PM +0800, Shiyang Ruan wrote:
> 
> 
> 在 2023/7/29 23:15, Darrick J. Wong 写道:
> > On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:
> > > This patch is inspired by Dan's "mm, dax, pmem: Introduce
> > > dev_pagemap_failure()"[1].  With the help of dax_holder and
> > > ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> > > on it to unmap all files in use, and notify processes who are using
> > > those files.
> > > 
> > > Call trace:
> > > trigger unbind
> > >   -> unbind_store()
> > >    -> ... (skip)
> > >     -> devres_release_all()
> > >      -> kill_dax()
> > >       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
> > >        -> xfs_dax_notify_failure()
> > >        `-> freeze_super()             // freeze (kernel call)
> > >        `-> do xfs rmap
> > >        ` -> mf_dax_kill_procs()
> > >        `  -> collect_procs_fsdax()    // all associated processes
> > >        `  -> unmap_and_kill()
> > >        ` -> invalidate_inode_pages2_range() // drop file's cache
> > >        `-> thaw_super()               // thaw (both kernel & user call)
> > > 
> > > Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> > > event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> > > new dax mapping from being created.  Do not shutdown filesystem directly
> > > if configuration is not supported, or if failure range includes metadata
> > > area.  Make sure all files and processes(not only the current progress)
> > > are handled correctly.  Also drop the cache of associated files before
> > > pmem is removed.
> > > 
> > > [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> > > [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> > > 
> > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> > > ---
> > >   drivers/dax/super.c         |  3 +-
> > >   fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
> > >   include/linux/mm.h          |  1 +
> > >   mm/memory-failure.c         | 17 ++++++--
> > >   4 files changed, 96 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> > > index c4c4728a36e4..2e1a35e82fce 100644
> > > --- a/drivers/dax/super.c
> > > +++ b/drivers/dax/super.c
> > > @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
> > >   		return;
> > >   	if (dax_dev->holder_data != NULL)
> > > -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> > > +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> > > +				MF_MEM_PRE_REMOVE);
> > >   	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
> > >   	synchronize_srcu(&dax_srcu);
> > > diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> > > index 4a9bbd3fe120..f6ec56b76db6 100644
> > > --- a/fs/xfs/xfs_notify_failure.c
> > > +++ b/fs/xfs/xfs_notify_failure.c
> > > @@ -22,6 +22,7 @@
> > >   #include <linux/mm.h>
> > >   #include <linux/dax.h>
> > > +#include <linux/fs.h>
> > >   struct xfs_failure_info {
> > >   	xfs_agblock_t		startblock;
> > > @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
> > >   	struct xfs_mount		*mp = cur->bc_mp;
> > >   	struct xfs_inode		*ip;
> > >   	struct xfs_failure_info		*notify = data;
> > > +	struct address_space		*mapping;
> > > +	pgoff_t				pgoff;
> > > +	unsigned long			pgcnt;
> > >   	int				error = 0;
> > >   	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> > >   	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> > > +		/* Continue the query because this isn't a failure. */
> > > +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > +			return 0;
> > >   		notify->want_shutdown = true;
> > >   		return 0;
> > >   	}
> > > @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
> > >   		return 0;
> > >   	}
> > > -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> > > -				  xfs_failure_pgoff(mp, rec, notify),
> > > -				  xfs_failure_pgcnt(mp, rec, notify),
> > > -				  notify->mf_flags);
> > > +	mapping = VFS_I(ip)->i_mapping;
> > > +	pgoff = xfs_failure_pgoff(mp, rec, notify);
> > > +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
> > > +
> > > +	/* Continue the rmap query if the inode isn't a dax file. */
> > > +	if (dax_mapping(mapping))
> > > +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
> > > +					  notify->mf_flags);
> > > +
> > > +	/* Invalidate the cache in dax pages. */
> > > +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
> > > +		invalidate_inode_pages2_range(mapping, pgoff,
> > > +					      pgoff + pgcnt - 1);
> > > +
> > >   	xfs_irele(ip);
> > >   	return error;
> > >   }
> > > +static void
> > > +xfs_dax_notify_failure_freeze(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	struct super_block 	*sb = mp->m_super;
> > 
> > Nit: extra space right    ^ here.
> > 
> > > +
> > > +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
> > > +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
> > > +		// Shall we just wait, or print warning then return -EBUSY?
> > 
> > Hm.  PRE_REMOVE gets called before the pmem gets unplugged, right?  So
> > we'll send a second notification after it goes away, right?
> 
> For the first question, yes.
> 
> But I'm not sure about the second one.  Do you mean: we'll send this
> notification again if unbind didn't success because freeze_super() returns
> -EBUSY?  In other words, if the previous unbind operation did not work, we
> could unbind the device again.

Yeah.  If the MF_MEM_PRE_REMOVE fails with EBUSY, then call it again
without PRE_REMOVE and let it kill processes.

--D

> > 
> > If so, then I'd say return the error here instead of looping, and live
> > with a kernel-frozen fs discarding the PRE_REMOVE message.
> > 
> > > +		delay(HZ / 10);
> > > +	}
> > > +}
> > > +
> > > +static void
> > > +xfs_dax_notify_failure_thaw(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	struct super_block	*sb = mp->m_super;
> > > +	int			error;
> > > +
> > > +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
> > > +	if (error)
> > > +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
> > > +			  error);
> > > +	/*
> > > +	 * Also thaw userspace call anyway because the device is about to be
> > > +	 * removed immediately.
> > > +	 */
> > > +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
> > > +}
> > > +
> > >   static int
> > >   xfs_dax_notify_ddev_failure(
> > >   	struct xfs_mount	*mp,
> > > @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
> > >   	error = xfs_trans_alloc_empty(mp, &tp);
> > >   	if (error)
> > > -		return error;
> > > +		goto out;
> > >   	for (; agno <= end_agno; agno++) {
> > >   		struct xfs_rmap_irec	ri_low = { };
> > > @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
> > >   	}
> > >   	xfs_trans_cancel(tp);
> > > +
> > > +	/*
> > > +	 * Determine how to shutdown the filesystem according to the
> > > +	 * error code and flags.
> > > +	 */
> > >   	if (error || notify.want_shutdown) {
> > >   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> > >   		if (!error)
> > >   			error = -EFSCORRUPTED;
> > > -	}
> > > +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
> > > +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
> > > +
> > > +out:
> > > +	/* Thaw the fs if it is freezed before. */
> > > +	if (mf_flags & MF_MEM_PRE_REMOVE)
> > > +		xfs_dax_notify_failure_thaw(mp);
> > 
> > _thaw should be called from the same function that called _freeze.
> 
> Will fix this.
> 
> > 
> > The rest of the patch seems ok to me.
> 
> Thank you!
> 
> 
> --
> Ruan.
> 
> > 
> > --D
> > 
> > > +
> > >   	return error;
> > >   }
> > > @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
> > >   	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> > >   	    mp->m_logdev_targp != mp->m_ddev_targp) {
> > > +		if (mf_flags & MF_MEM_PRE_REMOVE)
> > > +			return 0;
> > >   		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> > >   		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> > >   		return -EFSCORRUPTED;
> > > @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
> > >   	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
> > >   	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
> > > +	/* Notify failure on the whole device. */
> > > +	if (offset == 0 && len == U64_MAX) {
> > > +		offset = ddev_start;
> > > +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
> > > +	}
> > > +
> > >   	/* Ignore the range out of filesystem area */
> > >   	if (offset + len - 1 < ddev_start)
> > >   		return -ENXIO;
> > > @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
> > >   	if (offset + len - 1 > ddev_end)
> > >   		len = ddev_end - offset + 1;
> > > +	if (mf_flags & MF_MEM_PRE_REMOVE) {
> > > +		xfs_info(mp, "device is about to be removed!");
> > > +		/* Freeze fs to prevent new mappings from being created. */
> > > +		xfs_dax_notify_failure_freeze(mp);
> > > +	}
> > > +
> > >   	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> > >   			mf_flags);
> > >   }
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 27ce77080c79..a80c255b88d2 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -3576,6 +3576,7 @@ enum mf_flags {
> > >   	MF_UNPOISON = 1 << 4,
> > >   	MF_SW_SIMULATED = 1 << 5,
> > >   	MF_NO_RETRY = 1 << 6,
> > > +	MF_MEM_PRE_REMOVE = 1 << 7,
> > >   };
> > >   int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> > >   		      unsigned long count, int mf_flags);
> > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > > index 5b663eca1f29..483b75f2fcfb 100644
> > > --- a/mm/memory-failure.c
> > > +++ b/mm/memory-failure.c
> > > @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
> > >    */
> > >   static void collect_procs_fsdax(struct page *page,
> > >   		struct address_space *mapping, pgoff_t pgoff,
> > > -		struct list_head *to_kill)
> > > +		struct list_head *to_kill, bool pre_remove)
> > >   {
> > >   	struct vm_area_struct *vma;
> > >   	struct task_struct *tsk;
> > > @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
> > >   	i_mmap_lock_read(mapping);
> > >   	read_lock(&tasklist_lock);
> > >   	for_each_process(tsk) {
> > > -		struct task_struct *t = task_early_kill(tsk, true);
> > > +		struct task_struct *t = tsk;
> > > +		/*
> > > +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
> > > +		 * current may not be the one accessing the fsdax page.
> > > +		 * Otherwise, search for the current task.
> > > +		 */
> > > +		if (!pre_remove)
> > > +			t = task_early_kill(tsk, true);
> > >   		if (!t)
> > >   			continue;
> > >   		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
> > > @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> > >   	dax_entry_t cookie;
> > >   	struct page *page;
> > >   	size_t end = index + count;
> > > +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
> > >   	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
> > > @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
> > >   		if (!page)
> > >   			goto unlock;
> > > -		SetPageHWPoison(page);
> > > +		if (!pre_remove)
> > > +			SetPageHWPoison(page);
> > > -		collect_procs_fsdax(page, mapping, index, &to_kill);
> > > +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
> > >   		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
> > >   				index, mf_flags);
> > >   unlock:
> > > -- 
> > > 2.40.1
> > >
Shiyang Ruan Aug. 3, 2023, 10:44 a.m. UTC | #10
在 2023/8/1 11:25, Darrick J. Wong 写道:
> On Mon, Jul 31, 2023 at 05:36:36PM +0800, Shiyang Ruan wrote:
>>
>>
>> 在 2023/7/29 23:15, Darrick J. Wong 写道:
>>> On Thu, Jun 29, 2023 at 04:16:51PM +0800, Shiyang Ruan wrote:
>>>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>>>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>>>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>>>> on it to unmap all files in use, and notify processes who are using
>>>> those files.
>>>>
>>>> Call trace:
>>>> trigger unbind
>>>>    -> unbind_store()
>>>>     -> ... (skip)
>>>>      -> devres_release_all()
>>>>       -> kill_dax()
>>>>        -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>>>>         -> xfs_dax_notify_failure()
>>>>         `-> freeze_super()             // freeze (kernel call)
>>>>         `-> do xfs rmap
>>>>         ` -> mf_dax_kill_procs()
>>>>         `  -> collect_procs_fsdax()    // all associated processes
>>>>         `  -> unmap_and_kill()
>>>>         ` -> invalidate_inode_pages2_range() // drop file's cache
>>>>         `-> thaw_super()               // thaw (both kernel & user call)
>>>>
>>>> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
>>>> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
>>>> new dax mapping from being created.  Do not shutdown filesystem directly
>>>> if configuration is not supported, or if failure range includes metadata
>>>> area.  Make sure all files and processes(not only the current progress)
>>>> are handled correctly.  Also drop the cache of associated files before
>>>> pmem is removed.
>>>>
>>>> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>>>> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>>>
>>>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>>>> ---
>>>>    drivers/dax/super.c         |  3 +-
>>>>    fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>>>>    include/linux/mm.h          |  1 +
>>>>    mm/memory-failure.c         | 17 ++++++--
>>>>    4 files changed, 96 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>>>> index c4c4728a36e4..2e1a35e82fce 100644
>>>> --- a/drivers/dax/super.c
>>>> +++ b/drivers/dax/super.c
>>>> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>>>>    		return;
>>>>    	if (dax_dev->holder_data != NULL)
>>>> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>>>> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
>>>> +				MF_MEM_PRE_REMOVE);
>>>>    	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
>>>>    	synchronize_srcu(&dax_srcu);
>>>> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
>>>> index 4a9bbd3fe120..f6ec56b76db6 100644
>>>> --- a/fs/xfs/xfs_notify_failure.c
>>>> +++ b/fs/xfs/xfs_notify_failure.c
>>>> @@ -22,6 +22,7 @@
>>>>    #include <linux/mm.h>
>>>>    #include <linux/dax.h>
>>>> +#include <linux/fs.h>
>>>>    struct xfs_failure_info {
>>>>    	xfs_agblock_t		startblock;
>>>> @@ -73,10 +74,16 @@ xfs_dax_failure_fn(
>>>>    	struct xfs_mount		*mp = cur->bc_mp;
>>>>    	struct xfs_inode		*ip;
>>>>    	struct xfs_failure_info		*notify = data;
>>>> +	struct address_space		*mapping;
>>>> +	pgoff_t				pgoff;
>>>> +	unsigned long			pgcnt;
>>>>    	int				error = 0;
>>>>    	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
>>>>    	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
>>>> +		/* Continue the query because this isn't a failure. */
>>>> +		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>>> +			return 0;
>>>>    		notify->want_shutdown = true;
>>>>    		return 0;
>>>>    	}
>>>> @@ -92,14 +99,55 @@ xfs_dax_failure_fn(
>>>>    		return 0;
>>>>    	}
>>>> -	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
>>>> -				  xfs_failure_pgoff(mp, rec, notify),
>>>> -				  xfs_failure_pgcnt(mp, rec, notify),
>>>> -				  notify->mf_flags);
>>>> +	mapping = VFS_I(ip)->i_mapping;
>>>> +	pgoff = xfs_failure_pgoff(mp, rec, notify);
>>>> +	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
>>>> +
>>>> +	/* Continue the rmap query if the inode isn't a dax file. */
>>>> +	if (dax_mapping(mapping))
>>>> +		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
>>>> +					  notify->mf_flags);
>>>> +
>>>> +	/* Invalidate the cache in dax pages. */
>>>> +	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
>>>> +		invalidate_inode_pages2_range(mapping, pgoff,
>>>> +					      pgoff + pgcnt - 1);
>>>> +
>>>>    	xfs_irele(ip);
>>>>    	return error;
>>>>    }
>>>> +static void
>>>> +xfs_dax_notify_failure_freeze(
>>>> +	struct xfs_mount	*mp)
>>>> +{
>>>> +	struct super_block 	*sb = mp->m_super;
>>>
>>> Nit: extra space right    ^ here.
>>>
>>>> +
>>>> +	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
>>>> +	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
>>>> +		// Shall we just wait, or print warning then return -EBUSY?
>>>
>>> Hm.  PRE_REMOVE gets called before the pmem gets unplugged, right?  So
>>> we'll send a second notification after it goes away, right?
>>
>> For the first question, yes.
>>
>> But I'm not sure about the second one.  Do you mean: we'll send this
>> notification again if unbind didn't success because freeze_super() returns
>> -EBUSY?  In other words, if the previous unbind operation did not work, we
>> could unbind the device again.
> 
> Yeah.  If the MF_MEM_PRE_REMOVE fails with EBUSY, then call it again
> without PRE_REMOVE and let it kill processes.

Ok.  But I have to pass the flag (MF_MEM_PRE_REMOVE) to 
mf_dax_kill_procs() so that it can search for all processes who are 
holding dax pages rather than the only the current process.

Then, my thought is, if filesystem is currently frozen by kernel during 
unbind, just allow the -EBUSY and keep on the RMAP & killing processes. 
After RMAP is done, ignore the kernel thaw as well.  In this way, there 
is no need to send a second notification.

```
     bool frozen_by_kernel = false;

     // skip... other definitions

     if (mf_flags & MF_MEM_PRE_REMOVE) {
         xfs_info(mp, "Device is about to be removed!");
         /* Freeze fs to prevent new mappings from being created. */
         error = xfs_dax_notify_failure_freeze(mp);
         if (error) {
             /* Keep on if filesystem is frozen by kernel */
             if (error == -EBUSY)
                 frozen_by_kernel = true;
             else
                 return error;
         }
     }

     // skip... RMAP

out:
     /* Thaw the filesystem. */
     if (mf_flags & MF_MEM_PRE_REMOVE)
         /* don't thaw kernel frozen if already frozen by kernel */
         xfs_dax_notify_failure_thaw(mp, frozen_by_kernel);

     return error;
```


--
Thanks,
Ruan.

> 
> --D
> 
>>>
>>> If so, then I'd say return the error here instead of looping, and live
>>> with a kernel-frozen fs discarding the PRE_REMOVE message.
>>>
>>>> +		delay(HZ / 10);
>>>> +	}
>>>> +}
>>>> +
>>>> +static void
>>>> +xfs_dax_notify_failure_thaw(
>>>> +	struct xfs_mount	*mp)
>>>> +{
>>>> +	struct super_block	*sb = mp->m_super;
>>>> +	int			error;
>>>> +
>>>> +	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
>>>> +	if (error)
>>>> +		xfs_emerg(mp, "still frozen after notify failure, err=%d",
>>>> +			  error);
>>>> +	/*
>>>> +	 * Also thaw userspace call anyway because the device is about to be
>>>> +	 * removed immediately.
>>>> +	 */
>>>> +	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
>>>> +}
>>>> +
>>>>    static int
>>>>    xfs_dax_notify_ddev_failure(
>>>>    	struct xfs_mount	*mp,
>>>> @@ -120,7 +168,7 @@ xfs_dax_notify_ddev_failure(
>>>>    	error = xfs_trans_alloc_empty(mp, &tp);
>>>>    	if (error)
>>>> -		return error;
>>>> +		goto out;
>>>>    	for (; agno <= end_agno; agno++) {
>>>>    		struct xfs_rmap_irec	ri_low = { };
>>>> @@ -165,11 +213,23 @@ xfs_dax_notify_ddev_failure(
>>>>    	}
>>>>    	xfs_trans_cancel(tp);
>>>> +
>>>> +	/*
>>>> +	 * Determine how to shutdown the filesystem according to the
>>>> +	 * error code and flags.
>>>> +	 */
>>>>    	if (error || notify.want_shutdown) {
>>>>    		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>>    		if (!error)
>>>>    			error = -EFSCORRUPTED;
>>>> -	}
>>>> +	} else if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
>>>> +
>>>> +out:
>>>> +	/* Thaw the fs if it is freezed before. */
>>>> +	if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +		xfs_dax_notify_failure_thaw(mp);
>>>
>>> _thaw should be called from the same function that called _freeze.
>>
>> Will fix this.
>>
>>>
>>> The rest of the patch seems ok to me.
>>
>> Thank you!
>>
>>
>> --
>> Ruan.
>>
>>>
>>> --D
>>>
>>>> +
>>>>    	return error;
>>>>    }
>>>> @@ -197,6 +257,8 @@ xfs_dax_notify_failure(
>>>>    	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
>>>>    	    mp->m_logdev_targp != mp->m_ddev_targp) {
>>>> +		if (mf_flags & MF_MEM_PRE_REMOVE)
>>>> +			return 0;
>>>>    		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
>>>>    		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
>>>>    		return -EFSCORRUPTED;
>>>> @@ -210,6 +272,12 @@ xfs_dax_notify_failure(
>>>>    	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
>>>>    	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
>>>> +	/* Notify failure on the whole device. */
>>>> +	if (offset == 0 && len == U64_MAX) {
>>>> +		offset = ddev_start;
>>>> +		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
>>>> +	}
>>>> +
>>>>    	/* Ignore the range out of filesystem area */
>>>>    	if (offset + len - 1 < ddev_start)
>>>>    		return -ENXIO;
>>>> @@ -226,6 +294,12 @@ xfs_dax_notify_failure(
>>>>    	if (offset + len - 1 > ddev_end)
>>>>    		len = ddev_end - offset + 1;
>>>> +	if (mf_flags & MF_MEM_PRE_REMOVE) {
>>>> +		xfs_info(mp, "device is about to be removed!");
>>>> +		/* Freeze fs to prevent new mappings from being created. */
>>>> +		xfs_dax_notify_failure_freeze(mp);
>>>> +	}
>>>> +
>>>>    	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
>>>>    			mf_flags);
>>>>    }
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index 27ce77080c79..a80c255b88d2 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -3576,6 +3576,7 @@ enum mf_flags {
>>>>    	MF_UNPOISON = 1 << 4,
>>>>    	MF_SW_SIMULATED = 1 << 5,
>>>>    	MF_NO_RETRY = 1 << 6,
>>>> +	MF_MEM_PRE_REMOVE = 1 << 7,
>>>>    };
>>>>    int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>>    		      unsigned long count, int mf_flags);
>>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>>> index 5b663eca1f29..483b75f2fcfb 100644
>>>> --- a/mm/memory-failure.c
>>>> +++ b/mm/memory-failure.c
>>>> @@ -688,7 +688,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
>>>>     */
>>>>    static void collect_procs_fsdax(struct page *page,
>>>>    		struct address_space *mapping, pgoff_t pgoff,
>>>> -		struct list_head *to_kill)
>>>> +		struct list_head *to_kill, bool pre_remove)
>>>>    {
>>>>    	struct vm_area_struct *vma;
>>>>    	struct task_struct *tsk;
>>>> @@ -696,8 +696,15 @@ static void collect_procs_fsdax(struct page *page,
>>>>    	i_mmap_lock_read(mapping);
>>>>    	read_lock(&tasklist_lock);
>>>>    	for_each_process(tsk) {
>>>> -		struct task_struct *t = task_early_kill(tsk, true);
>>>> +		struct task_struct *t = tsk;
>>>> +		/*
>>>> +		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
>>>> +		 * current may not be the one accessing the fsdax page.
>>>> +		 * Otherwise, search for the current task.
>>>> +		 */
>>>> +		if (!pre_remove)
>>>> +			t = task_early_kill(tsk, true);
>>>>    		if (!t)
>>>>    			continue;
>>>>    		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>>>> @@ -1793,6 +1800,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>>    	dax_entry_t cookie;
>>>>    	struct page *page;
>>>>    	size_t end = index + count;
>>>> +	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
>>>>    	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
>>>> @@ -1804,9 +1812,10 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
>>>>    		if (!page)
>>>>    			goto unlock;
>>>> -		SetPageHWPoison(page);
>>>> +		if (!pre_remove)
>>>> +			SetPageHWPoison(page);
>>>> -		collect_procs_fsdax(page, mapping, index, &to_kill);
>>>> +		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
>>>>    		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
>>>>    				index, mf_flags);
>>>>    unlock:
>>>> -- 
>>>> 2.40.1
>>>>
Dan Williams Aug. 8, 2023, 12:31 a.m. UTC | #11
Shiyang Ruan wrote:
> This patch is inspired by Dan's "mm, dax, pmem: Introduce
> dev_pagemap_failure()"[1].  With the help of dax_holder and
> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
> on it to unmap all files in use, and notify processes who are using
> those files.
> 
> Call trace:
> trigger unbind
>  -> unbind_store()
>   -> ... (skip)
>    -> devres_release_all()
>     -> kill_dax()
>      -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>       -> xfs_dax_notify_failure()
>       `-> freeze_super()             // freeze (kernel call)
>       `-> do xfs rmap
>       ` -> mf_dax_kill_procs()
>       `  -> collect_procs_fsdax()    // all associated processes
>       `  -> unmap_and_kill()
>       ` -> invalidate_inode_pages2_range() // drop file's cache
>       `-> thaw_super()               // thaw (both kernel & user call)
> 
> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
> new dax mapping from being created.  Do not shutdown filesystem directly
> if configuration is not supported, or if failure range includes metadata
> area.  Make sure all files and processes(not only the current progress)
> are handled correctly.  Also drop the cache of associated files before
> pmem is removed.

I would say more about why this is important for DAX users. Yes, the
devm_memremap_pages() vs get_user_pages() infrastructure can be improved
if it has a mechanism to revoke all pages that it has handed out for a
given device, but that's not an end user visible effect.

The end user impact needs to be clear. Is this for existing deployed
pmem where a user accidentally removes a device and wants failures and
process killing instead of hangs?

The reason Linux has got along without this for so long is because pmem
is difficult to remove (and with the sunset of Optane, difficult to
acquire). One motivation to pursue this is CXL where hotplug is better
defined and use cases like dynamic capacity devices where making forward
progress to kill processes is better than hanging.

It would help to have an example of what happens without this patch.

> 
> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  drivers/dax/super.c         |  3 +-
>  fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>  include/linux/mm.h          |  1 +
>  mm/memory-failure.c         | 17 ++++++--
>  4 files changed, 96 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c4c4728a36e4..2e1a35e82fce 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>  		return;
>  
>  	if (dax_dev->holder_data != NULL)
> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
> +				MF_MEM_PRE_REMOVE);

The motivation in the original proposal was to convey the death of
large extents to memory_failure(). However, that proposal predated your
mf_dax_kill_procs() approach. With mf_dax_kill_procs() the need for a
new bulk memory_failure() API is gone.

This is where the end user impact needs to be clear. It seems that
without this patch the filesystem may assume failure while the device is
already present, but that seems ok. The goal is forward progress after a
mistake not necessarily minimizing damage after a mistake. The fact that
the current code is not as gentle could be considered a feature because
graceful shutdown should always unmount before unplug, and if one
unplugs before unmount it is already understood that they get to keep
the pieces.

Because the driver ->remove() callback can not enforce that the device
is still present it seems unnecessary to optimize for the case where the
filesystem is the device is being removed from an actively mounted
filesystem, but the device is still present.

The dax_holder_notify_failure(dax_dev, 0, U64_MAX) is sufficient to say
"userspace failed to umount before hardware eject, stop trying to access
this range", rather than "try to finish up in this range, but it might
already be too late".
Shiyang Ruan Aug. 23, 2023, 8:36 a.m. UTC | #12
在 2023/8/8 8:31, Dan Williams 写道:
> Shiyang Ruan wrote:
>> This patch is inspired by Dan's "mm, dax, pmem: Introduce
>> dev_pagemap_failure()"[1].  With the help of dax_holder and
>> ->notify_failure() mechanism, the pmem driver is able to ask filesystem
>> on it to unmap all files in use, and notify processes who are using
>> those files.
>>
>> Call trace:
>> trigger unbind
>>   -> unbind_store()
>>    -> ... (skip)
>>     -> devres_release_all()
>>      -> kill_dax()
>>       -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE)
>>        -> xfs_dax_notify_failure()
>>        `-> freeze_super()             // freeze (kernel call)
>>        `-> do xfs rmap
>>        ` -> mf_dax_kill_procs()
>>        `  -> collect_procs_fsdax()    // all associated processes
>>        `  -> unmap_and_kill()
>>        ` -> invalidate_inode_pages2_range() // drop file's cache
>>        `-> thaw_super()               // thaw (both kernel & user call)
>>
>> Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove
>> event.  Use the exclusive freeze/thaw[2] to lock the filesystem to prevent
>> new dax mapping from being created.  Do not shutdown filesystem directly
>> if configuration is not supported, or if failure range includes metadata
>> area.  Make sure all files and processes(not only the current progress)
>> are handled correctly.  Also drop the cache of associated files before
>> pmem is removed.
> 
> I would say more about why this is important for DAX users. Yes, the
> devm_memremap_pages() vs get_user_pages() infrastructure can be improved
> if it has a mechanism to revoke all pages that it has handed out for a
> given device, but that's not an end user visible effect.
> 
> The end user impact needs to be clear. Is this for existing deployed
> pmem where a user accidentally removes a device and wants failures and
> process killing instead of hangs?
> 
> The reason Linux has got along without this for so long is because pmem
> is difficult to remove (and with the sunset of Optane, difficult to
> acquire). One motivation to pursue this is CXL where hotplug is better
> defined and use cases like dynamic capacity devices where making forward
> progress to kill processes is better than hanging.
> 
> It would help to have an example of what happens without this patch.
> 
>>
>> [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/
>> [2]: https://lore.kernel.org/linux-xfs/168688010689.860947.1788875898367401950.stgit@frogsfrogsfrogs/
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
>> ---
>>   drivers/dax/super.c         |  3 +-
>>   fs/xfs/xfs_notify_failure.c | 86 ++++++++++++++++++++++++++++++++++---
>>   include/linux/mm.h          |  1 +
>>   mm/memory-failure.c         | 17 ++++++--
>>   4 files changed, 96 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
>> index c4c4728a36e4..2e1a35e82fce 100644
>> --- a/drivers/dax/super.c
>> +++ b/drivers/dax/super.c
>> @@ -323,7 +323,8 @@ void kill_dax(struct dax_device *dax_dev)
>>   		return;
>>   
>>   	if (dax_dev->holder_data != NULL)
>> -		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
>> +		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
>> +				MF_MEM_PRE_REMOVE);
> 
> The motivation in the original proposal was to convey the death of
> large extents to memory_failure(). However, that proposal predated your
> mf_dax_kill_procs() approach. With mf_dax_kill_procs() the need for a
> new bulk memory_failure() API is gone.
> 
> This is where the end user impact needs to be clear. It seems that
> without this patch the filesystem may assume failure while the device is
> already present, but that seems ok. The goal is forward progress after a
> mistake not necessarily minimizing damage after a mistake. The fact that
> the current code is not as gentle could be considered a feature because
> graceful shutdown should always unmount before unplug, and if one
> unplugs before unmount it is already understood that they get to keep
> the pieces.
> 
> Because the driver ->remove() callback can not enforce that the device
> is still present it seems unnecessary to optimize for the case where the
> filesystem is the device is being removed from an actively mounted
> filesystem, but the device is still present.
> 
> The dax_holder_notify_failure(dax_dev, 0, U64_MAX) is sufficient to say
> "userspace failed to umount before hardware eject, stop trying to access
> this range", rather than "try to finish up in this range, but it might
> already be too late".

Hi Dan,

I added an simple example of "accidentally remove pmem device" and its 
consequences of not having this patch in the latest version.  Please review.


--
Thanks,
Ruan.
diff mbox series

Patch

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c4c4728a36e4..2e1a35e82fce 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -323,7 +323,8 @@  void kill_dax(struct dax_device *dax_dev)
 		return;
 
 	if (dax_dev->holder_data != NULL)
-		dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0);
+		dax_holder_notify_failure(dax_dev, 0, U64_MAX,
+				MF_MEM_PRE_REMOVE);
 
 	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
 	synchronize_srcu(&dax_srcu);
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 4a9bbd3fe120..f6ec56b76db6 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@ 
 
 #include <linux/mm.h>
 #include <linux/dax.h>
+#include <linux/fs.h>
 
 struct xfs_failure_info {
 	xfs_agblock_t		startblock;
@@ -73,10 +74,16 @@  xfs_dax_failure_fn(
 	struct xfs_mount		*mp = cur->bc_mp;
 	struct xfs_inode		*ip;
 	struct xfs_failure_info		*notify = data;
+	struct address_space		*mapping;
+	pgoff_t				pgoff;
+	unsigned long			pgcnt;
 	int				error = 0;
 
 	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
 	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+		/* Continue the query because this isn't a failure. */
+		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+			return 0;
 		notify->want_shutdown = true;
 		return 0;
 	}
@@ -92,14 +99,55 @@  xfs_dax_failure_fn(
 		return 0;
 	}
 
-	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
-				  xfs_failure_pgoff(mp, rec, notify),
-				  xfs_failure_pgcnt(mp, rec, notify),
-				  notify->mf_flags);
+	mapping = VFS_I(ip)->i_mapping;
+	pgoff = xfs_failure_pgoff(mp, rec, notify);
+	pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+	/* Continue the rmap query if the inode isn't a dax file. */
+	if (dax_mapping(mapping))
+		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+					  notify->mf_flags);
+
+	/* Invalidate the cache in dax pages. */
+	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+		invalidate_inode_pages2_range(mapping, pgoff,
+					      pgoff + pgcnt - 1);
+
 	xfs_irele(ip);
 	return error;
 }
 
+static void
+xfs_dax_notify_failure_freeze(
+	struct xfs_mount	*mp)
+{
+	struct super_block 	*sb = mp->m_super;
+
+	/* Wait until no one is holding the FREEZE_HOLDER_KERNEL. */
+	while (freeze_super(sb, FREEZE_HOLDER_KERNEL) != 0) {
+		// Shall we just wait, or print warning then return -EBUSY?
+		delay(HZ / 10);
+	}
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+	int			error;
+
+	error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+	if (error)
+		xfs_emerg(mp, "still frozen after notify failure, err=%d",
+			  error);
+	/*
+	 * Also thaw userspace call anyway because the device is about to be
+	 * removed immediately.
+	 */
+	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
 static int
 xfs_dax_notify_ddev_failure(
 	struct xfs_mount	*mp,
@@ -120,7 +168,7 @@  xfs_dax_notify_ddev_failure(
 
 	error = xfs_trans_alloc_empty(mp, &tp);
 	if (error)
-		return error;
+		goto out;
 
 	for (; agno <= end_agno; agno++) {
 		struct xfs_rmap_irec	ri_low = { };
@@ -165,11 +213,23 @@  xfs_dax_notify_ddev_failure(
 	}
 
 	xfs_trans_cancel(tp);
+
+	/*
+	 * Determine how to shutdown the filesystem according to the
+	 * error code and flags.
+	 */
 	if (error || notify.want_shutdown) {
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		if (!error)
 			error = -EFSCORRUPTED;
-	}
+	} else if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+
+out:
+	/* Thaw the fs if it is freezed before. */
+	if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_dax_notify_failure_thaw(mp);
+
 	return error;
 }
 
@@ -197,6 +257,8 @@  xfs_dax_notify_failure(
 
 	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
 	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		if (mf_flags & MF_MEM_PRE_REMOVE)
+			return 0;
 		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		return -EFSCORRUPTED;
@@ -210,6 +272,12 @@  xfs_dax_notify_failure(
 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
+	/* Notify failure on the whole device. */
+	if (offset == 0 && len == U64_MAX) {
+		offset = ddev_start;
+		len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+	}
+
 	/* Ignore the range out of filesystem area */
 	if (offset + len - 1 < ddev_start)
 		return -ENXIO;
@@ -226,6 +294,12 @@  xfs_dax_notify_failure(
 	if (offset + len - 1 > ddev_end)
 		len = ddev_end - offset + 1;
 
+	if (mf_flags & MF_MEM_PRE_REMOVE) {
+		xfs_info(mp, "device is about to be removed!");
+		/* Freeze fs to prevent new mappings from being created. */
+		xfs_dax_notify_failure_freeze(mp);
+	}
+
 	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
 			mf_flags);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27ce77080c79..a80c255b88d2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3576,6 +3576,7 @@  enum mf_flags {
 	MF_UNPOISON = 1 << 4,
 	MF_SW_SIMULATED = 1 << 5,
 	MF_NO_RETRY = 1 << 6,
+	MF_MEM_PRE_REMOVE = 1 << 7,
 };
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5b663eca1f29..483b75f2fcfb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -688,7 +688,7 @@  static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
  */
 static void collect_procs_fsdax(struct page *page,
 		struct address_space *mapping, pgoff_t pgoff,
-		struct list_head *to_kill)
+		struct list_head *to_kill, bool pre_remove)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
@@ -696,8 +696,15 @@  static void collect_procs_fsdax(struct page *page,
 	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		struct task_struct *t = task_early_kill(tsk, true);
+		struct task_struct *t = tsk;
 
+		/*
+		 * Search for all tasks while MF_MEM_PRE_REMOVE, because the
+		 * current may not be the one accessing the fsdax page.
+		 * Otherwise, search for the current task.
+		 */
+		if (!pre_remove)
+			t = task_early_kill(tsk, true);
 		if (!t)
 			continue;
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1793,6 +1800,7 @@  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 	dax_entry_t cookie;
 	struct page *page;
 	size_t end = index + count;
+	bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
 
 	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
 
@@ -1804,9 +1812,10 @@  int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		if (!page)
 			goto unlock;
 
-		SetPageHWPoison(page);
+		if (!pre_remove)
+			SetPageHWPoison(page);
 
-		collect_procs_fsdax(page, mapping, index, &to_kill);
+		collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
 		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
 				index, mf_flags);
 unlock: