diff mbox series

[v10,8/9] xfs: Implement ->notify_failure() for XFS

Message ID 20220127124058.1172422-9-ruansy.fnst@fujitsu.com (mailing list archive)
State New, archived
Headers show
Series fsdax: introduce fs query to support reflink | expand

Commit Message

Shiyang Ruan Jan. 27, 2022, 12:40 p.m. UTC
Introduce xfs_notify_failure.c to handle failure related works, such as
implement ->notify_failure(), register/unregister dax holder in xfs, and
so on.

If the rmap feature of XFS enabled, we can query it to find files and
metadata which are associated with the corrupt data.  For now all we do
is kill processes with that file mapped into their address spaces, but
future patches could actually do something about corrupt metadata.

After that, the memory failure needs to notify the processes who are
using those files.

Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
---
 fs/xfs/Makefile             |   1 +
 fs/xfs/xfs_buf.c            |  12 ++
 fs/xfs/xfs_fsops.c          |   3 +
 fs/xfs/xfs_mount.h          |   1 +
 fs/xfs/xfs_notify_failure.c | 222 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_notify_failure.h |  10 ++
 6 files changed, 249 insertions(+)
 create mode 100644 fs/xfs/xfs_notify_failure.c
 create mode 100644 fs/xfs/xfs_notify_failure.h

Comments

kernel test robot Jan. 27, 2022, 5:56 p.m. UTC | #1
Hi Shiyang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.17-rc1 next-20220127]
[cannot apply to xfs-linux/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20220127-204239
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 2c271fe77d52a0555161926c232cd5bc07178b39
config: ia64-defconfig (https://download.01.org/0day-ci/archive/20220128/202201280101.4ECasWmD-lkp@intel.com/config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/cb7650562991fc273fbf4c53b6e3db4bb9bb0b5e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20220127-204239
        git checkout cb7650562991fc273fbf4c53b6e3db4bb9bb0b5e
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=ia64 SHELL=/bin/bash fs/xfs/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from fs/xfs/xfs_buf.h:14,
                    from fs/xfs/xfs_linux.h:80,
                    from fs/xfs/xfs.h:22,
                    from fs/xfs/xfs_notify_failure.c:6:
   include/linux/dax.h:73:30: warning: 'struct dax_holder_operations' declared inside parameter list will not be visible outside of this definition or declaration
      73 |                 const struct dax_holder_operations *ops)
         |                              ^~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:220:14: error: variable 'xfs_dax_holder_operations' has initializer but incomplete type
     220 | const struct dax_holder_operations xfs_dax_holder_operations = {
         |              ^~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:221:10: error: 'const struct dax_holder_operations' has no member named 'notify_failure'
     221 |         .notify_failure         = xfs_dax_notify_failure,
         |          ^~~~~~~~~~~~~~
>> fs/xfs/xfs_notify_failure.c:221:35: warning: excess elements in struct initializer
     221 |         .notify_failure         = xfs_dax_notify_failure,
         |                                   ^~~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:221:35: note: (near initialization for 'xfs_dax_holder_operations')
   fs/xfs/xfs_notify_failure.c:220:36: error: storage size of 'xfs_dax_holder_operations' isn't known
     220 | const struct dax_holder_operations xfs_dax_holder_operations = {
         |                                    ^~~~~~~~~~~~~~~~~~~~~~~~~


vim +221 fs/xfs/xfs_notify_failure.c

   219	
   220	const struct dax_holder_operations xfs_dax_holder_operations = {
 > 221		.notify_failure		= xfs_dax_notify_failure,

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Jan. 27, 2022, 7:39 p.m. UTC | #2
Hi Shiyang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.17-rc1 next-20220127]
[cannot apply to xfs-linux/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20220127-204239
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 2c271fe77d52a0555161926c232cd5bc07178b39
config: ia64-defconfig (https://download.01.org/0day-ci/archive/20220128/202201280314.SI8wtlfT-lkp@intel.com/config)
compiler: ia64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/cb7650562991fc273fbf4c53b6e3db4bb9bb0b5e
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Shiyang-Ruan/fsdax-introduce-fs-query-to-support-reflink/20220127-204239
        git checkout cb7650562991fc273fbf4c53b6e3db4bb9bb0b5e
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross O=build_dir ARCH=ia64 SHELL=/bin/bash fs/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from fs/xfs/xfs_buf.h:14,
                    from fs/xfs/xfs_linux.h:80,
                    from fs/xfs/xfs.h:22,
                    from fs/xfs/xfs_buf.c:6:
   include/linux/dax.h:73:30: warning: 'struct dax_holder_operations' declared inside parameter list will not be visible outside of this definition or declaration
      73 |                 const struct dax_holder_operations *ops)
         |                              ^~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_buf.c: In function 'xfs_alloc_buftarg':
>> fs/xfs/xfs_buf.c:1959:33: error: passing argument 3 of 'dax_register_holder' from incompatible pointer type [-Werror=incompatible-pointer-types]
    1959 |                                 &xfs_dax_holder_operations);
         |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~
         |                                 |
         |                                 const struct dax_holder_operations *
   In file included from fs/xfs/xfs_buf.h:14,
                    from fs/xfs/xfs_linux.h:80,
                    from fs/xfs/xfs.h:22,
                    from fs/xfs/xfs_buf.c:6:
   include/linux/dax.h:73:53: note: expected 'const struct dax_holder_operations *' but argument is of type 'const struct dax_holder_operations *'
      73 |                 const struct dax_holder_operations *ops)
         |                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~
   cc1: some warnings being treated as errors
--
   In file included from fs/xfs/xfs_buf.h:14,
                    from fs/xfs/xfs_linux.h:80,
                    from fs/xfs/xfs.h:22,
                    from fs/xfs/xfs_notify_failure.c:6:
   include/linux/dax.h:73:30: warning: 'struct dax_holder_operations' declared inside parameter list will not be visible outside of this definition or declaration
      73 |                 const struct dax_holder_operations *ops)
         |                              ^~~~~~~~~~~~~~~~~~~~~
>> fs/xfs/xfs_notify_failure.c:220:14: error: variable 'xfs_dax_holder_operations' has initializer but incomplete type
     220 | const struct dax_holder_operations xfs_dax_holder_operations = {
         |              ^~~~~~~~~~~~~~~~~~~~~
>> fs/xfs/xfs_notify_failure.c:221:10: error: 'const struct dax_holder_operations' has no member named 'notify_failure'
     221 |         .notify_failure         = xfs_dax_notify_failure,
         |          ^~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:221:35: warning: excess elements in struct initializer
     221 |         .notify_failure         = xfs_dax_notify_failure,
         |                                   ^~~~~~~~~~~~~~~~~~~~~~
   fs/xfs/xfs_notify_failure.c:221:35: note: (near initialization for 'xfs_dax_holder_operations')
>> fs/xfs/xfs_notify_failure.c:220:36: error: storage size of 'xfs_dax_holder_operations' isn't known
     220 | const struct dax_holder_operations xfs_dax_holder_operations = {
         |                                    ^~~~~~~~~~~~~~~~~~~~~~~~~


vim +/dax_register_holder +1959 fs/xfs/xfs_buf.c

  1938	
  1939	struct xfs_buftarg *
  1940	xfs_alloc_buftarg(
  1941		struct xfs_mount	*mp,
  1942		struct block_device	*bdev)
  1943	{
  1944		xfs_buftarg_t		*btp;
  1945	
  1946		btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
  1947	
  1948		btp->bt_mount = mp;
  1949		btp->bt_dev =  bdev->bd_dev;
  1950		btp->bt_bdev = bdev;
  1951		btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
  1952		if (btp->bt_daxdev) {
  1953			if (dax_get_holder(btp->bt_daxdev)) {
  1954				xfs_err(mp, "DAX device already in use?!");
  1955				goto error_free;
  1956			}
  1957	
  1958			dax_register_holder(btp->bt_daxdev, mp,
> 1959					&xfs_dax_holder_operations);
  1960		}
  1961	
  1962		/*
  1963		 * Buffer IO error rate limiting. Limit it to no more than 10 messages
  1964		 * per 30 seconds so as to not spam logs too much on repeated errors.
  1965		 */
  1966		ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
  1967				     DEFAULT_RATELIMIT_BURST);
  1968	
  1969		if (xfs_setsize_buftarg_early(btp, bdev))
  1970			goto error_free;
  1971	
  1972		if (list_lru_init(&btp->bt_lru))
  1973			goto error_free;
  1974	
  1975		if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
  1976			goto error_lru;
  1977	
  1978		btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
  1979		btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
  1980		btp->bt_shrinker.seeks = DEFAULT_SEEKS;
  1981		btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
  1982		if (register_shrinker(&btp->bt_shrinker))
  1983			goto error_pcpu;
  1984		return btp;
  1985	
  1986	error_pcpu:
  1987		percpu_counter_destroy(&btp->bt_io_count);
  1988	error_lru:
  1989		list_lru_destroy(&btp->bt_lru);
  1990	error_free:
  1991		kmem_free(btp);
  1992		return NULL;
  1993	}
  1994	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Darrick J. Wong Feb. 1, 2022, 8:41 p.m. UTC | #3
On Thu, Jan 27, 2022 at 08:40:57PM +0800, Shiyang Ruan wrote:
> Introduce xfs_notify_failure.c to handle failure related works, such as
> implement ->notify_failure(), register/unregister dax holder in xfs, and
> so on.
> 
> If the rmap feature of XFS enabled, we can query it to find files and
> metadata which are associated with the corrupt data.  For now all we do
> is kill processes with that file mapped into their address spaces, but
> future patches could actually do something about corrupt metadata.
> 
> After that, the memory failure needs to notify the processes who are
> using those files.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  fs/xfs/Makefile             |   1 +
>  fs/xfs/xfs_buf.c            |  12 ++
>  fs/xfs/xfs_fsops.c          |   3 +
>  fs/xfs/xfs_mount.h          |   1 +
>  fs/xfs/xfs_notify_failure.c | 222 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_notify_failure.h |  10 ++
>  6 files changed, 249 insertions(+)
>  create mode 100644 fs/xfs/xfs_notify_failure.c
>  create mode 100644 fs/xfs/xfs_notify_failure.h
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 04611a1068b4..389970b3e13b 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
>  				   xfs_message.o \
>  				   xfs_mount.o \
>  				   xfs_mru_cache.o \
> +				   xfs_notify_failure.o \
>  				   xfs_pwork.o \
>  				   xfs_reflink.o \
>  				   xfs_stats.o \
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index b45e0d50a405..017010b3d601 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -19,6 +19,7 @@
>  #include "xfs_errortag.h"
>  #include "xfs_error.h"
>  #include "xfs_ag.h"
> +#include "xfs_notify_failure.h"
>  
>  static struct kmem_cache *xfs_buf_cache;
>  
> @@ -1892,6 +1893,8 @@ xfs_free_buftarg(
>  	list_lru_destroy(&btp->bt_lru);
>  
>  	blkdev_issue_flush(btp->bt_bdev);
> +	if (btp->bt_daxdev)
> +		dax_unregister_holder(btp->bt_daxdev);
>  	fs_put_dax(btp->bt_daxdev);
>  
>  	kmem_free(btp);
> @@ -1946,6 +1949,15 @@ xfs_alloc_buftarg(
>  	btp->bt_dev =  bdev->bd_dev;
>  	btp->bt_bdev = bdev;
>  	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
> +	if (btp->bt_daxdev) {
> +		if (dax_get_holder(btp->bt_daxdev)) {
> +			xfs_err(mp, "DAX device already in use?!");
> +			goto error_free;
> +		}
> +
> +		dax_register_holder(btp->bt_daxdev, mp,
> +				&xfs_dax_holder_operations);

Um... is XFS required to take a lock here?  How do we prevent parallel
mounts of filesystems on two partitions from breaking each other?

> +	}
>  
>  	/*
>  	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 33e26690a8c4..d4d36c5bef11 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -542,6 +542,9 @@ xfs_do_force_shutdown(
>  	} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
>  		tag = XFS_PTAG_SHUTDOWN_CORRUPT;
>  		why = "Corruption of in-memory data";
> +	} else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
> +		tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> +		why = "Corruption of on-disk metadata";
>  	} else {
>  		tag = XFS_PTAG_SHUTDOWN_IOERROR;
>  		why = "Metadata I/O Error";
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 00720a02e761..47ff4ac53c4c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -435,6 +435,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
>  #define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
>  #define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
>  #define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
> +#define SHUTDOWN_CORRUPT_ONDISK	0x0010  /* corrupt metadata on device */
>  
>  #define XFS_SHUTDOWN_STRINGS \
>  	{ SHUTDOWN_META_IO_ERROR,	"metadata_io" }, \
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> new file mode 100644
> index 000000000000..6abaa043f4bc
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -0,0 +1,222 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.

2022?

> + */
> +
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bit.h"
> +#include "xfs_btree.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_rtalloc.h"
> +#include "xfs_trans.h"
> +
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +
> +struct failure_info {
> +	xfs_agblock_t		startblock;
> +	xfs_extlen_t		blockcount;
> +	int			mf_flags;
> +};
> +
> +#if IS_ENABLED(CONFIG_MEMORY_FAILURE) && IS_ENABLED(CONFIG_FS_DAX)
> +static pgoff_t
> +xfs_failure_pgoff(
> +	struct xfs_mount		*mp,
> +	const struct xfs_rmap_irec	*rec,
> +	const struct failure_info	*notify)
> +{
> +	uint64_t			pos = rec->rm_offset;
> +
> +	if (notify->startblock > rec->rm_startblock)
> +		pos += XFS_FSB_TO_B(mp,
> +				notify->startblock - rec->rm_startblock);
> +	return pos >> PAGE_SHIFT;
> +}
> +
> +static unsigned long
> +xfs_failure_pgcnt(
> +	struct xfs_mount		*mp,
> +	const struct xfs_rmap_irec	*rec,
> +	const struct failure_info	*notify)
> +{
> +	xfs_agblock_t			end_rec;
> +	xfs_agblock_t			end_notify;
> +	xfs_agblock_t			start_cross;
> +	xfs_agblock_t			end_cross;
> +
> +	start_cross = max(rec->rm_startblock, notify->startblock);
> +
> +	end_rec = rec->rm_startblock + rec->rm_blockcount;
> +	end_notify = notify->startblock + notify->blockcount;
> +	end_cross = min(end_rec, end_notify);
> +
> +	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
> +}
> +
> +static int
> +xfs_dax_failure_fn(
> +	struct xfs_btree_cur		*cur,
> +	const struct xfs_rmap_irec	*rec,
> +	void				*data)
> +{
> +	struct xfs_mount		*mp = cur->bc_mp;
> +	struct xfs_inode		*ip;
> +	struct failure_info		*notify = data;
> +	int				error = 0;
> +
> +	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> +	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	/* Get files that incore, filter out others that are not in use. */
> +	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> +			 0, &ip);
> +	/* Continue the rmap query if the inode isn't incore */
> +	if (error == -ENODATA)
> +		return 0;
> +	if (error)
> +		return error;
> +
> +	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> +				  xfs_failure_pgoff(mp, rec, notify),
> +				  xfs_failure_pgcnt(mp, rec, notify),
> +				  notify->mf_flags);
> +	xfs_irele(ip);
> +	return error;
> +}
> +#else
> +static int
> +xfs_dax_failure_fn(
> +	struct xfs_btree_cur		*cur,
> +	const struct xfs_rmap_irec	*rec,
> +	void				*data)
> +{
> +	struct xfs_mount		*mp = cur->bc_mp;
> +
> +	/* No other option besides shutting down the fs. */
> +	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +	return -EFSCORRUPTED;
> +}
> +#endif /* CONFIG_MEMORY_FAILURE && CONFIG_FS_DAX */
> +
> +static int
> +xfs_dax_notify_ddev_failure(
> +	struct xfs_mount	*mp,
> +	xfs_daddr_t		daddr,
> +	xfs_daddr_t		bblen,
> +	int			mf_flags)
> +{
> +	struct xfs_trans	*tp = NULL;
> +	struct xfs_btree_cur	*cur = NULL;
> +	struct xfs_buf		*agf_bp = NULL;
> +	struct failure_info	notify;
> +	int			error = 0;
> +	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
> +	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
> +	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
> +
> +	/*
> +	 * Once a file is found by rmap, we take the intersection of two ranges:
> +	 * notification range and file extent range, to make sure we won't go
> +	 * out of scope.
> +	 */
> +	notify.mf_flags = mf_flags;
> +	notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +	notify.blockcount = XFS_BB_TO_FSB(mp, bblen);

This looks even more incorrect than what V9 did, because the notify
structure helps the per-AG rmap record iterator pick the records (within
that AG) that overlap the failed area, but we haven't selected any AGs
yet.

> +
> +	error = xfs_trans_alloc_empty(mp, &tp);
> +	if (error)
> +		return error;
> +
> +	for (; agno <= end_agno; agno++) {
> +		struct xfs_rmap_irec	ri_low = { };
> +		struct xfs_rmap_irec	ri_high;

So declare @notify here:

		struct failure_info	notify;

> +
> +		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +		if (error)
> +			break;
> +
> +		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);
> +
> +		/*
> +		 * Set the rmap range from ri_low to ri_high, which represents
> +		 * a [start, end] where we looking for the files or metadata.
> +		 * The part of range out of a AG will be ignored.  So, it's fine
> +		 * to set ri_low to "startblock" in all loops.  When it reaches
> +		 * the last AG, set the ri_high to "endblock" to make sure we
> +		 * actually end at the end.
> +		 */
> +		memset(&ri_high, 0xFF, sizeof(ri_high));
> +		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +		if (agno == end_agno)
> +			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);

And initialize it here:

		struct xfs_agf		*agf = agf_bp->b_addr;
		xfs_agblock_t		agend;

		agend = min(be32_to_cpu(agf->agf_length),
				ri_high.rm_startblock);
		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = agend - ri_low.rm_startblock;

Before passing @notify to the rmap iteration.

> +
> +		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> +				xfs_dax_failure_fn, &notify);
> +		xfs_btree_del_cursor(cur, error);
> +		xfs_trans_brelse(tp, agf_bp);
> +		if (error)
> +			break;
> +
> +		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
> +	}
> +
> +	xfs_trans_cancel(tp);
> +	return error;
> +}
> +
> +static int
> +xfs_dax_notify_failure(
> +	struct dax_device	*dax_dev,
> +	u64			offset,
> +	u64			len,
> +	int			mf_flags)
> +{
> +	struct xfs_mount	*mp = dax_get_holder(dax_dev);
> +
> +	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
> +		xfs_warn(mp,
> +			 "notify_failure() not supported on realtime device!");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> +	    mp->m_logdev_targp != mp->m_ddev_targp) {
> +		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	if (!xfs_has_rmapbt(mp)) {
> +		xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (offset < mp->m_ddev_targp->bt_dax_part_off ||
> +	    ((offset + len) > mp->m_ddev_targp->bt_bdev->bd_nr_sectors <<
> +				SECTOR_SHIFT)) {

I think this logic is wrong?

If the failed region overlaps the entire fs, offset will be less than
bt_dax_part_off.

We're looking for reasons to ignore the failure event, right?

So if the failed area is before the start of the fs then we can ignore
it:

	if (offset + len < mp->m_ddev_targp->bt_dax_part_off)
		return -ENXIO;

and if the failed area starts after the fs then we can ignore that too:

	bdev_end = mp->m_ddev_targp->bt_dax_part_off +
		   mp->m_ddev_targp->bt_bdev->bd_nr_sectors;
	if (offset > bdev_end)
		return -ENXIO;

Right?

--D

> +		xfs_warn(mp, "notify_failure() goes out of the scope.");
> +		return -ENXIO;
> +	}
> +
> +	offset -= mp->m_ddev_targp->bt_dax_part_off;
> +	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> +			mf_flags);
> +}
> +
> +const struct dax_holder_operations xfs_dax_holder_operations = {
> +	.notify_failure		= xfs_dax_notify_failure,
> +};
> diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
> new file mode 100644
> index 000000000000..f40cb315e7ce
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
> + */
> +#ifndef __XFS_NOTIFY_FAILURE_H__
> +#define __XFS_NOTIFY_FAILURE_H__
> +
> +extern const struct dax_holder_operations xfs_dax_holder_operations;
> +
> +#endif  /* __XFS_NOTIFY_FAILURE_H__ */
> -- 
> 2.34.1
> 
> 
>
Dan Williams Feb. 16, 2022, 1:56 a.m. UTC | #4
On Thu, Jan 27, 2022 at 4:41 AM Shiyang Ruan <ruansy.fnst@fujitsu.com> wrote:
>
> Introduce xfs_notify_failure.c to handle failure related works, such as
> implement ->notify_failure(), register/unregister dax holder in xfs, and
> so on.
>
> If the rmap feature of XFS enabled, we can query it to find files and
> metadata which are associated with the corrupt data.  For now all we do
> is kill processes with that file mapped into their address spaces, but
> future patches could actually do something about corrupt metadata.
>
> After that, the memory failure needs to notify the processes who are
> using those files.
>
> Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> ---
>  fs/xfs/Makefile             |   1 +
>  fs/xfs/xfs_buf.c            |  12 ++
>  fs/xfs/xfs_fsops.c          |   3 +
>  fs/xfs/xfs_mount.h          |   1 +
>  fs/xfs/xfs_notify_failure.c | 222 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_notify_failure.h |  10 ++
>  6 files changed, 249 insertions(+)
>  create mode 100644 fs/xfs/xfs_notify_failure.c
>  create mode 100644 fs/xfs/xfs_notify_failure.h
>
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 04611a1068b4..389970b3e13b 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -84,6 +84,7 @@ xfs-y                         += xfs_aops.o \
>                                    xfs_message.o \
>                                    xfs_mount.o \
>                                    xfs_mru_cache.o \
> +                                  xfs_notify_failure.o \
>                                    xfs_pwork.o \
>                                    xfs_reflink.o \
>                                    xfs_stats.o \
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index b45e0d50a405..017010b3d601 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -19,6 +19,7 @@
>  #include "xfs_errortag.h"
>  #include "xfs_error.h"
>  #include "xfs_ag.h"
> +#include "xfs_notify_failure.h"
>
>  static struct kmem_cache *xfs_buf_cache;
>
> @@ -1892,6 +1893,8 @@ xfs_free_buftarg(
>         list_lru_destroy(&btp->bt_lru);
>
>         blkdev_issue_flush(btp->bt_bdev);
> +       if (btp->bt_daxdev)
> +               dax_unregister_holder(btp->bt_daxdev);
>         fs_put_dax(btp->bt_daxdev);
>
>         kmem_free(btp);
> @@ -1946,6 +1949,15 @@ xfs_alloc_buftarg(
>         btp->bt_dev =  bdev->bd_dev;
>         btp->bt_bdev = bdev;
>         btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
> +       if (btp->bt_daxdev) {
> +               if (dax_get_holder(btp->bt_daxdev)) {
> +                       xfs_err(mp, "DAX device already in use?!");

Per the earlier feedback this can be checked atomically inside of
dax_register_holder() with cmpxchg().

> +                       goto error_free;
> +               }
> +
> +               dax_register_holder(btp->bt_daxdev, mp,
> +                               &xfs_dax_holder_operations);
> +       }
>
>         /*
>          * Buffer IO error rate limiting. Limit it to no more than 10 messages
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 33e26690a8c4..d4d36c5bef11 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -542,6 +542,9 @@ xfs_do_force_shutdown(
>         } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
>                 tag = XFS_PTAG_SHUTDOWN_CORRUPT;
>                 why = "Corruption of in-memory data";
> +       } else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
> +               tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> +               why = "Corruption of on-disk metadata";
>         } else {
>                 tag = XFS_PTAG_SHUTDOWN_IOERROR;
>                 why = "Metadata I/O Error";
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 00720a02e761..47ff4ac53c4c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -435,6 +435,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
>  #define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
>  #define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
>  #define SHUTDOWN_CORRUPT_INCORE        0x0008  /* corrupt in-memory data structures */
> +#define SHUTDOWN_CORRUPT_ONDISK        0x0010  /* corrupt metadata on device */
>
>  #define XFS_SHUTDOWN_STRINGS \
>         { SHUTDOWN_META_IO_ERROR,       "metadata_io" }, \
> diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
> new file mode 100644
> index 000000000000..6abaa043f4bc
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.c
> @@ -0,0 +1,222 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
> + */
> +
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bit.h"
> +#include "xfs_btree.h"
> +#include "xfs_inode.h"
> +#include "xfs_icache.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_rtalloc.h"
> +#include "xfs_trans.h"
> +
> +#include <linux/mm.h>
> +#include <linux/dax.h>
> +
> +struct failure_info {
> +       xfs_agblock_t           startblock;
> +       xfs_extlen_t            blockcount;
> +       int                     mf_flags;
> +};
> +
> +#if IS_ENABLED(CONFIG_MEMORY_FAILURE) && IS_ENABLED(CONFIG_FS_DAX)
> +static pgoff_t
> +xfs_failure_pgoff(
> +       struct xfs_mount                *mp,
> +       const struct xfs_rmap_irec      *rec,
> +       const struct failure_info       *notify)
> +{
> +       uint64_t                        pos = rec->rm_offset;
> +
> +       if (notify->startblock > rec->rm_startblock)
> +               pos += XFS_FSB_TO_B(mp,
> +                               notify->startblock - rec->rm_startblock);
> +       return pos >> PAGE_SHIFT;
> +}
> +
> +static unsigned long
> +xfs_failure_pgcnt(
> +       struct xfs_mount                *mp,
> +       const struct xfs_rmap_irec      *rec,
> +       const struct failure_info       *notify)
> +{
> +       xfs_agblock_t                   end_rec;
> +       xfs_agblock_t                   end_notify;
> +       xfs_agblock_t                   start_cross;
> +       xfs_agblock_t                   end_cross;
> +
> +       start_cross = max(rec->rm_startblock, notify->startblock);
> +
> +       end_rec = rec->rm_startblock + rec->rm_blockcount;
> +       end_notify = notify->startblock + notify->blockcount;
> +       end_cross = min(end_rec, end_notify);
> +
> +       return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
> +}
> +
> +static int
> +xfs_dax_failure_fn(
> +       struct xfs_btree_cur            *cur,
> +       const struct xfs_rmap_irec      *rec,
> +       void                            *data)
> +{
> +       struct xfs_mount                *mp = cur->bc_mp;
> +       struct xfs_inode                *ip;
> +       struct failure_info             *notify = data;
> +       int                             error = 0;
> +
> +       if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> +           (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +               return -EFSCORRUPTED;
> +       }
> +
> +       /* Get files that incore, filter out others that are not in use. */
> +       error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> +                        0, &ip);
> +       /* Continue the rmap query if the inode isn't incore */
> +       if (error == -ENODATA)
> +               return 0;
> +       if (error)
> +               return error;
> +
> +       error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
> +                                 xfs_failure_pgoff(mp, rec, notify),
> +                                 xfs_failure_pgcnt(mp, rec, notify),
> +                                 notify->mf_flags);
> +       xfs_irele(ip);
> +       return error;
> +}
> +#else
> +static int
> +xfs_dax_failure_fn(
> +       struct xfs_btree_cur            *cur,
> +       const struct xfs_rmap_irec      *rec,
> +       void                            *data)
> +{
> +       struct xfs_mount                *mp = cur->bc_mp;
> +
> +       /* No other option besides shutting down the fs. */
> +       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +       return -EFSCORRUPTED;
> +}
> +#endif /* CONFIG_MEMORY_FAILURE && CONFIG_FS_DAX */
> +
> +static int
> +xfs_dax_notify_ddev_failure(
> +       struct xfs_mount        *mp,
> +       xfs_daddr_t             daddr,
> +       xfs_daddr_t             bblen,
> +       int                     mf_flags)
> +{
> +       struct xfs_trans        *tp = NULL;
> +       struct xfs_btree_cur    *cur = NULL;
> +       struct xfs_buf          *agf_bp = NULL;
> +       struct failure_info     notify;
> +       int                     error = 0;
> +       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, daddr);
> +       xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +       xfs_fsblock_t           end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
> +       xfs_agnumber_t          end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
> +
> +       /*
> +        * Once a file is found by rmap, we take the intersection of two ranges:
> +        * notification range and file extent range, to make sure we won't go
> +        * out of scope.
> +        */
> +       notify.mf_flags = mf_flags;
> +       notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +       notify.blockcount = XFS_BB_TO_FSB(mp, bblen);
> +
> +       error = xfs_trans_alloc_empty(mp, &tp);
> +       if (error)
> +               return error;
> +
> +       for (; agno <= end_agno; agno++) {
> +               struct xfs_rmap_irec    ri_low = { };
> +               struct xfs_rmap_irec    ri_high;
> +
> +               error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +               if (error)
> +                       break;
> +
> +               cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);
> +
> +               /*
> +                * Set the rmap range from ri_low to ri_high, which represents
> +                * a [start, end] where we looking for the files or metadata.
> +                * The part of range out of a AG will be ignored.  So, it's fine
> +                * to set ri_low to "startblock" in all loops.  When it reaches
> +                * the last AG, set the ri_high to "endblock" to make sure we
> +                * actually end at the end.
> +                */
> +               memset(&ri_high, 0xFF, sizeof(ri_high));
> +               ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
> +               if (agno == end_agno)
> +                       ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
> +
> +               error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
> +                               xfs_dax_failure_fn, &notify);
> +               xfs_btree_del_cursor(cur, error);
> +               xfs_trans_brelse(tp, agf_bp);
> +               if (error)
> +                       break;
> +
> +               fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
> +       }
> +
> +       xfs_trans_cancel(tp);
> +       return error;
> +}
> +
> +static int
> +xfs_dax_notify_failure(
> +       struct dax_device       *dax_dev,
> +       u64                     offset,
> +       u64                     len,
> +       int                     mf_flags)
> +{
> +       struct xfs_mount        *mp = dax_get_holder(dax_dev);
> +
> +       if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
> +               xfs_warn(mp,
> +                        "notify_failure() not supported on realtime device!");
> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> +           mp->m_logdev_targp != mp->m_ddev_targp) {
> +               xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> +               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
> +               return -EFSCORRUPTED;
> +       }
> +
> +       if (!xfs_has_rmapbt(mp)) {
> +               xfs_warn(mp, "notify_failure() needs rmapbt enabled!");

Doesn't this need to be resolved at mount time?

> +               return -EOPNOTSUPP;
> +       }
> +
> +       if (offset < mp->m_ddev_targp->bt_dax_part_off ||
> +           ((offset + len) > mp->m_ddev_targp->bt_bdev->bd_nr_sectors <<
> +                               SECTOR_SHIFT)) {

With the removal of partition support bt_dax_part_off can never be
non-zero and the offset / len validation should be done against the
boundaries of the dax device in terms of physical page offset and
nr_pages.

> +               xfs_warn(mp, "notify_failure() goes out of the scope.");
> +               return -ENXIO;
> +       }
> +
> +       offset -= mp->m_ddev_targp->bt_dax_part_off;
> +       return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
> +                       mf_flags);

Same here, all offset adjustment code can be dropped because failure
notification should be disabled at mount time if the mount point is
not associated with a whole disk device.

> +}
> +
> +const struct dax_holder_operations xfs_dax_holder_operations = {
> +       .notify_failure         = xfs_dax_notify_failure,
> +};
> diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
> new file mode 100644
> index 000000000000..f40cb315e7ce
> --- /dev/null
> +++ b/fs/xfs/xfs_notify_failure.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
> + */
> +#ifndef __XFS_NOTIFY_FAILURE_H__
> +#define __XFS_NOTIFY_FAILURE_H__
> +
> +extern const struct dax_holder_operations xfs_dax_holder_operations;
> +
> +#endif  /* __XFS_NOTIFY_FAILURE_H__ */
> --
> 2.34.1
>
>
>
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..389970b3e13b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@  xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_notify_failure.o \
 				   xfs_pwork.o \
 				   xfs_reflink.o \
 				   xfs_stats.o \
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b45e0d50a405..017010b3d601 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -19,6 +19,7 @@ 
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_ag.h"
+#include "xfs_notify_failure.h"
 
 static struct kmem_cache *xfs_buf_cache;
 
@@ -1892,6 +1893,8 @@  xfs_free_buftarg(
 	list_lru_destroy(&btp->bt_lru);
 
 	blkdev_issue_flush(btp->bt_bdev);
+	if (btp->bt_daxdev)
+		dax_unregister_holder(btp->bt_daxdev);
 	fs_put_dax(btp->bt_daxdev);
 
 	kmem_free(btp);
@@ -1946,6 +1949,15 @@  xfs_alloc_buftarg(
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
 	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
+	if (btp->bt_daxdev) {
+		if (dax_get_holder(btp->bt_daxdev)) {
+			xfs_err(mp, "DAX device already in use?!");
+			goto error_free;
+		}
+
+		dax_register_holder(btp->bt_daxdev, mp,
+				&xfs_dax_holder_operations);
+	}
 
 	/*
 	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 33e26690a8c4..d4d36c5bef11 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -542,6 +542,9 @@  xfs_do_force_shutdown(
 	} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
 		tag = XFS_PTAG_SHUTDOWN_CORRUPT;
 		why = "Corruption of in-memory data";
+	} else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
+		tag = XFS_PTAG_SHUTDOWN_CORRUPT;
+		why = "Corruption of on-disk metadata";
 	} else {
 		tag = XFS_PTAG_SHUTDOWN_IOERROR;
 		why = "Metadata I/O Error";
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 00720a02e761..47ff4ac53c4c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -435,6 +435,7 @@  void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 #define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
 #define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
 #define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
+#define SHUTDOWN_CORRUPT_ONDISK	0x0010  /* corrupt metadata on device */
 
 #define XFS_SHUTDOWN_STRINGS \
 	{ SHUTDOWN_META_IO_ERROR,	"metadata_io" }, \
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
new file mode 100644
index 000000000000..6abaa043f4bc
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.c
@@ -0,0 +1,222 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
+
+#include <linux/mm.h>
+#include <linux/dax.h>
+
+struct failure_info {
+	xfs_agblock_t		startblock;
+	xfs_extlen_t		blockcount;
+	int			mf_flags;
+};
+
+#if IS_ENABLED(CONFIG_MEMORY_FAILURE) && IS_ENABLED(CONFIG_FS_DAX)
+static pgoff_t
+xfs_failure_pgoff(
+	struct xfs_mount		*mp,
+	const struct xfs_rmap_irec	*rec,
+	const struct failure_info	*notify)
+{
+	uint64_t			pos = rec->rm_offset;
+
+	if (notify->startblock > rec->rm_startblock)
+		pos += XFS_FSB_TO_B(mp,
+				notify->startblock - rec->rm_startblock);
+	return pos >> PAGE_SHIFT;
+}
+
+static unsigned long
+xfs_failure_pgcnt(
+	struct xfs_mount		*mp,
+	const struct xfs_rmap_irec	*rec,
+	const struct failure_info	*notify)
+{
+	xfs_agblock_t			end_rec;
+	xfs_agblock_t			end_notify;
+	xfs_agblock_t			start_cross;
+	xfs_agblock_t			end_cross;
+
+	start_cross = max(rec->rm_startblock, notify->startblock);
+
+	end_rec = rec->rm_startblock + rec->rm_blockcount;
+	end_notify = notify->startblock + notify->blockcount;
+	end_cross = min(end_rec, end_notify);
+
+	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
+}
+
+static int
+xfs_dax_failure_fn(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*data)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_inode		*ip;
+	struct failure_info		*notify = data;
+	int				error = 0;
+
+	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
+	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+		return -EFSCORRUPTED;
+	}
+
+	/* Get files that incore, filter out others that are not in use. */
+	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
+			 0, &ip);
+	/* Continue the rmap query if the inode isn't incore */
+	if (error == -ENODATA)
+		return 0;
+	if (error)
+		return error;
+
+	error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
+				  xfs_failure_pgoff(mp, rec, notify),
+				  xfs_failure_pgcnt(mp, rec, notify),
+				  notify->mf_flags);
+	xfs_irele(ip);
+	return error;
+}
+#else
+static int
+xfs_dax_failure_fn(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*data)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+
+	/* No other option besides shutting down the fs. */
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+	return -EFSCORRUPTED;
+}
+#endif /* CONFIG_MEMORY_FAILURE && CONFIG_FS_DAX */
+
+static int
+xfs_dax_notify_ddev_failure(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		daddr,
+	xfs_daddr_t		bblen,
+	int			mf_flags)
+{
+	struct xfs_trans	*tp = NULL;
+	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_buf		*agf_bp = NULL;
+	struct failure_info	notify;
+	int			error = 0;
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
+	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
+	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+	/*
+	 * Once a file is found by rmap, we take the intersection of two ranges:
+	 * notification range and file extent range, to make sure we won't go
+	 * out of scope.
+	 */
+	notify.mf_flags = mf_flags;
+	notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+	notify.blockcount = XFS_BB_TO_FSB(mp, bblen);
+
+	error = xfs_trans_alloc_empty(mp, &tp);
+	if (error)
+		return error;
+
+	for (; agno <= end_agno; agno++) {
+		struct xfs_rmap_irec	ri_low = { };
+		struct xfs_rmap_irec	ri_high;
+
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
+		if (error)
+			break;
+
+		cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);
+
+		/*
+		 * Set the rmap range from ri_low to ri_high, which represents
+		 * a [start, end] where we looking for the files or metadata.
+		 * The part of range out of a AG will be ignored.  So, it's fine
+		 * to set ri_low to "startblock" in all loops.  When it reaches
+		 * the last AG, set the ri_high to "endblock" to make sure we
+		 * actually end at the end.
+		 */
+		memset(&ri_high, 0xFF, sizeof(ri_high));
+		ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
+		if (agno == end_agno)
+			ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+				xfs_dax_failure_fn, &notify);
+		xfs_btree_del_cursor(cur, error);
+		xfs_trans_brelse(tp, agf_bp);
+		if (error)
+			break;
+
+		fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
+	}
+
+	xfs_trans_cancel(tp);
+	return error;
+}
+
+static int
+xfs_dax_notify_failure(
+	struct dax_device	*dax_dev,
+	u64			offset,
+	u64			len,
+	int			mf_flags)
+{
+	struct xfs_mount	*mp = dax_get_holder(dax_dev);
+
+	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
+		xfs_warn(mp,
+			 "notify_failure() not supported on realtime device!");
+		return -EOPNOTSUPP;
+	}
+
+	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
+	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+		return -EFSCORRUPTED;
+	}
+
+	if (!xfs_has_rmapbt(mp)) {
+		xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
+		return -EOPNOTSUPP;
+	}
+
+	if (offset < mp->m_ddev_targp->bt_dax_part_off ||
+	    ((offset + len) > mp->m_ddev_targp->bt_bdev->bd_nr_sectors <<
+				SECTOR_SHIFT)) {
+		xfs_warn(mp, "notify_failure() goes out of the scope.");
+		return -ENXIO;
+	}
+
+	offset -= mp->m_ddev_targp->bt_dax_part_off;
+	return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+			mf_flags);
+}
+
+const struct dax_holder_operations xfs_dax_holder_operations = {
+	.notify_failure		= xfs_dax_notify_failure,
+};
diff --git a/fs/xfs/xfs_notify_failure.h b/fs/xfs/xfs_notify_failure.h
new file mode 100644
index 000000000000..f40cb315e7ce
--- /dev/null
+++ b/fs/xfs/xfs_notify_failure.h
@@ -0,0 +1,10 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2021 Fujitsu.  All Rights Reserved.
+ */
+#ifndef __XFS_NOTIFY_FAILURE_H__
+#define __XFS_NOTIFY_FAILURE_H__
+
+extern const struct dax_holder_operations xfs_dax_holder_operations;
+
+#endif  /* __XFS_NOTIFY_FAILURE_H__ */