diff mbox series

[04/15] xfs: introduce a file mapping exchange log intent item

Message ID 171150380732.3216674.5999741892452595331.stgit@frogsfrogsfrogs (mailing list archive)
State Superseded
Headers show
Series [01/15] vfs: export remap and write check helpers | expand

Commit Message

Darrick J. Wong March 27, 2024, 1:53 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Introduce a new intent log item to handle exchanging mappings between
the forks of two files.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile                 |    1 
 fs/xfs/libxfs/xfs_log_format.h  |   42 ++++++-
 fs/xfs/libxfs/xfs_log_recover.h |    2 
 fs/xfs/xfs_exchmaps_item.c      |  235 +++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_exchmaps_item.h      |   59 ++++++++++
 fs/xfs/xfs_log_recover.c        |    2 
 fs/xfs/xfs_super.c              |   19 +++
 7 files changed, 357 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/xfs_exchmaps_item.c
 create mode 100644 fs/xfs/xfs_exchmaps_item.h

Comments

Dave Chinner April 7, 2024, 11:51 p.m. UTC | #1
On Tue, Mar 26, 2024 at 06:53:52PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Introduce a new intent log item to handle exchanging mappings between
> the forks of two files.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/Makefile                 |    1 
>  fs/xfs/libxfs/xfs_log_format.h  |   42 ++++++-
>  fs/xfs/libxfs/xfs_log_recover.h |    2 
>  fs/xfs/xfs_exchmaps_item.c      |  235 +++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_exchmaps_item.h      |   59 ++++++++++
>  fs/xfs/xfs_log_recover.c        |    2 
>  fs/xfs/xfs_super.c              |   19 +++
>  7 files changed, 357 insertions(+), 3 deletions(-)
>  create mode 100644 fs/xfs/xfs_exchmaps_item.c
>  create mode 100644 fs/xfs/xfs_exchmaps_item.h
> 
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 2474242f5a05f..68ca9726e7b7d 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -102,6 +102,7 @@ xfs-y				+= xfs_log.o \
>  				   xfs_buf_item.o \
>  				   xfs_buf_item_recover.o \
>  				   xfs_dquot_item_recover.o \
> +				   xfs_exchmaps_item.o \
>  				   xfs_extfree_item.o \
>  				   xfs_attr_item.o \
>  				   xfs_icreate_item.o \
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index 16872972e1e97..09024431cae9a 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -117,8 +117,9 @@ struct xfs_unmount_log_format {
>  #define XLOG_REG_TYPE_ATTRD_FORMAT	28
>  #define XLOG_REG_TYPE_ATTR_NAME	29
>  #define XLOG_REG_TYPE_ATTR_VALUE	30
> -#define XLOG_REG_TYPE_MAX		30
> -
> +#define XLOG_REG_TYPE_XMI_FORMAT	31
> +#define XLOG_REG_TYPE_XMD_FORMAT	32
> +#define XLOG_REG_TYPE_MAX		32
>  
>  /*
>   * Flags to log operation header
> @@ -243,6 +244,8 @@ typedef struct xfs_trans_header {
>  #define	XFS_LI_BUD		0x1245
>  #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
>  #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
> +#define	XFS_LI_XMI		0x1248  /* mapping exchange intent */
> +#define	XFS_LI_XMD		0x1249  /* mapping exchange done */
>  
>  #define XFS_LI_TYPE_DESC \
>  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> @@ -260,7 +263,9 @@ typedef struct xfs_trans_header {
>  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
>  	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
>  	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
> -	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
> +	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
> +	{ XFS_LI_XMI,		"XFS_LI_XMI" }, \
> +	{ XFS_LI_XMD,		"XFS_LI_XMD" }
>  
>  /*
>   * Inode Log Item Format definitions.
> @@ -878,6 +883,37 @@ struct xfs_bud_log_format {
>  	uint64_t		bud_bui_id;	/* id of corresponding bui */
>  };
>  
> +/*
> + * XMI/XMD (file mapping exchange) log format definitions
> + */
> +
> +/* This is the structure used to lay out an mapping exchange log item. */
> +struct xfs_xmi_log_format {
> +	uint16_t		xmi_type;	/* xmi log item type */
> +	uint16_t		xmi_size;	/* size of this item */
> +	uint32_t		__pad;		/* must be zero */
> +	uint64_t		xmi_id;		/* xmi identifier */

Why does this ID need to be a 64 bit ID?  If it is 32 bit, then
there's no need for any padding, and it doesn't seem likely to me
that we'd have millions of exchanges in flight at once.

(Edit: I see why later - I address it there....)

> +
> +	uint64_t		xmi_inode1;	/* inumber of first file */
> +	uint64_t		xmi_inode2;	/* inumber of second file */

Inode numbers are not unique identifiers. Intents get replayed after
everything else has been replayed, including inode unlink and
reallocation.

Without a generation number, there is no way to determine if the
inode number in the intent is actually pointing at the correct inode
when we go to replay the intent.

Yes, I know it's unlikely that this might occur, but I'd much prefer
that we fully identify inodes in the on-disk metadata so we can
check it at recovery time than leaving it out and just hoping we are
operating on the correct inode life-cycle...


> +	uint64_t		xmi_startoff1;	/* block offset into file1 */
> +	uint64_t		xmi_startoff2;	/* block offset into file2 */
> +	uint64_t		xmi_blockcount;	/* number of blocks */
> +	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* */
> +	uint64_t		xmi_isize1;	/* intended file1 size */
> +	uint64_t		xmi_isize2;	/* intended file2 size */

How do these inode sizes differ from xmi_startoff{1,2} +
xmi_blockcount?

> +/* Allocate and initialize an xmi item. */
> +STATIC struct xfs_xmi_log_item *
> +xfs_xmi_init(
> +	struct xfs_mount	*mp)
> +
> +{
> +	struct xfs_xmi_log_item	*xmi_lip;
> +
> +	xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
> +
> +	xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
> +	xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;

OK. Encoding the pointer as the ID is a mistake we made with EFIs
and we should stop repeating it in all new intents. There is no
guarantee that the pointer is a unique identifier because these are
allocated out of a slab cache. Hence we can allocate an xmi, log it,
finish the xmds, free the xmi, then run another exchange and get
exactly the same XMI pointer returned to us for the new exchange
intent. Now we potentially have multiple exchange items in the
journal with the same ID.

Can we use a u32 and get_random_u32() for the ID here, please? We
already do this for checkpoint discrimination in the log (i.e. to
identify what checkpoint an ophdr belongs to), so we really should
be doing the same for all ids we use in the journal for matching
items.

Longer term, We probably should move all the intent/done identifiers
to a psuedo random identifier mechanism, but that's outside the
scope of this change....

> +/*
> + * This routine is called to create an in-core file mapping exchange item from
> + * the xmi format structure which was logged on disk.  It allocates an in-core
> + * xmi, copies the exchange information from the format structure into it, and
> + * adds the xmi to the AIL with the given LSN.
> + */
> +STATIC int
> +xlog_recover_xmi_commit_pass2(
> +	struct xlog			*log,
> +	struct list_head		*buffer_list,
> +	struct xlog_recover_item	*item,
> +	xfs_lsn_t			lsn)
> +{
> +	struct xfs_mount		*mp = log->l_mp;
> +	struct xfs_xmi_log_item		*xmi_lip;
> +	struct xfs_xmi_log_format	*xmi_formatp;
> +	size_t				len;
> +
> +	len = sizeof(struct xfs_xmi_log_format);
> +	if (item->ri_buf[0].i_len != len) {
> +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	xmi_formatp = item->ri_buf[0].i_addr;
> +	if (xmi_formatp->__pad != 0) {
> +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	xmi_lip = xfs_xmi_init(mp);
> +	memcpy(&xmi_lip->xmi_format, xmi_formatp, len);

Should this be validating that the structure contents are within
valid ranges?

-Dave.
Darrick J. Wong April 9, 2024, 1:18 a.m. UTC | #2
On Mon, Apr 08, 2024 at 09:51:01AM +1000, Dave Chinner wrote:
> On Tue, Mar 26, 2024 at 06:53:52PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <djwong@kernel.org>
> > 
> > Introduce a new intent log item to handle exchanging mappings between
> > the forks of two files.
> > 
> > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/Makefile                 |    1 
> >  fs/xfs/libxfs/xfs_log_format.h  |   42 ++++++-
> >  fs/xfs/libxfs/xfs_log_recover.h |    2 
> >  fs/xfs/xfs_exchmaps_item.c      |  235 +++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_exchmaps_item.h      |   59 ++++++++++
> >  fs/xfs/xfs_log_recover.c        |    2 
> >  fs/xfs/xfs_super.c              |   19 +++
> >  7 files changed, 357 insertions(+), 3 deletions(-)
> >  create mode 100644 fs/xfs/xfs_exchmaps_item.c
> >  create mode 100644 fs/xfs/xfs_exchmaps_item.h
> > 
> > 
> > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > index 2474242f5a05f..68ca9726e7b7d 100644
> > --- a/fs/xfs/Makefile
> > +++ b/fs/xfs/Makefile
> > @@ -102,6 +102,7 @@ xfs-y				+= xfs_log.o \
> >  				   xfs_buf_item.o \
> >  				   xfs_buf_item_recover.o \
> >  				   xfs_dquot_item_recover.o \
> > +				   xfs_exchmaps_item.o \
> >  				   xfs_extfree_item.o \
> >  				   xfs_attr_item.o \
> >  				   xfs_icreate_item.o \
> > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> > index 16872972e1e97..09024431cae9a 100644
> > --- a/fs/xfs/libxfs/xfs_log_format.h
> > +++ b/fs/xfs/libxfs/xfs_log_format.h
> > @@ -117,8 +117,9 @@ struct xfs_unmount_log_format {
> >  #define XLOG_REG_TYPE_ATTRD_FORMAT	28
> >  #define XLOG_REG_TYPE_ATTR_NAME	29
> >  #define XLOG_REG_TYPE_ATTR_VALUE	30
> > -#define XLOG_REG_TYPE_MAX		30
> > -
> > +#define XLOG_REG_TYPE_XMI_FORMAT	31
> > +#define XLOG_REG_TYPE_XMD_FORMAT	32
> > +#define XLOG_REG_TYPE_MAX		32
> >  
> >  /*
> >   * Flags to log operation header
> > @@ -243,6 +244,8 @@ typedef struct xfs_trans_header {
> >  #define	XFS_LI_BUD		0x1245
> >  #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
> >  #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
> > +#define	XFS_LI_XMI		0x1248  /* mapping exchange intent */
> > +#define	XFS_LI_XMD		0x1249  /* mapping exchange done */
> >  
> >  #define XFS_LI_TYPE_DESC \
> >  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> > @@ -260,7 +263,9 @@ typedef struct xfs_trans_header {
> >  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
> >  	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
> >  	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
> > -	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
> > +	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
> > +	{ XFS_LI_XMI,		"XFS_LI_XMI" }, \
> > +	{ XFS_LI_XMD,		"XFS_LI_XMD" }
> >  
> >  /*
> >   * Inode Log Item Format definitions.
> > @@ -878,6 +883,37 @@ struct xfs_bud_log_format {
> >  	uint64_t		bud_bui_id;	/* id of corresponding bui */
> >  };
> >  
> > +/*
> > + * XMI/XMD (file mapping exchange) log format definitions
> > + */
> > +
> > +/* This is the structure used to lay out an mapping exchange log item. */
> > +struct xfs_xmi_log_format {
> > +	uint16_t		xmi_type;	/* xmi log item type */
> > +	uint16_t		xmi_size;	/* size of this item */
> > +	uint32_t		__pad;		/* must be zero */
> > +	uint64_t		xmi_id;		/* xmi identifier */
> 
> Why does this ID need to be a 64 bit ID?  If it is 32 bit, then
> there's no need for any padding, and it doesn't seem likely to me
> that we'd have millions of exchanges in flight at once.
> 
> (Edit: I see why later - I address it there....)
> 
> > +
> > +	uint64_t		xmi_inode1;	/* inumber of first file */
> > +	uint64_t		xmi_inode2;	/* inumber of second file */
> 
> Inode numbers are not unique identifiers. Intents get replayed after
> everything else has been replayed, including inode unlink and
> reallocation.
> 
> Without a generation number, there is no way to determine if the
> inode number in the intent is actually pointing at the correct inode
> when we go to replay the intent.
> 
> Yes, I know it's unlikely that this might occur, but I'd much prefer
> that we fully identify inodes in the on-disk metadata so we can
> check it at recovery time than leaving it out and just hoping we are
> operating on the correct inode life-cycle...

I suppose I don't mind encoding the generation number, but I'm not
thrilled that you're suggesting this after I finally got someone to
complete the review of this patchset after 3 years.

I probably ought to use the u32 pad in the attri log format to do the
same for parent pointers, seeing as parent pointers have their own attri
log opcodes now anyway.

> 
> > +	uint64_t		xmi_startoff1;	/* block offset into file1 */
> > +	uint64_t		xmi_startoff2;	/* block offset into file2 */
> > +	uint64_t		xmi_blockcount;	/* number of blocks */
> > +	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* */
> > +	uint64_t		xmi_isize1;	/* intended file1 size */
> > +	uint64_t		xmi_isize2;	/* intended file2 size */
> 
> How do these inode sizes differ from xmi_startoff{1,2} +
> xmi_blockcount?

If the caller specifies XFS_EXCHANGE_RANGE_TO_EOF then we need to
exchange the file sizes too so that we preserve a mid-fsblock EOF.

> 
> > +/* Allocate and initialize an xmi item. */
> > +STATIC struct xfs_xmi_log_item *
> > +xfs_xmi_init(
> > +	struct xfs_mount	*mp)
> > +
> > +{
> > +	struct xfs_xmi_log_item	*xmi_lip;
> > +
> > +	xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
> > +
> > +	xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
> > +	xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;
> 
> OK. Encoding the pointer as the ID is a mistake we made with EFIs
> and we should stop repeating it in all new intents. There is no
> guarantee that the pointer is a unique identifier because these are
> allocated out of a slab cache. Hence we can allocate an xmi, log it,
> finish the xmds, free the xmi, then run another exchange and get
> exactly the same XMI pointer returned to us for the new exchange
> intent. Now we potentially have multiple exchange items in the
> journal with the same ID.
> 
> Can we use a u32 and get_random_u32() for the ID here, please? We
> already do this for checkpoint discrimination in the log (i.e. to
> identify what checkpoint an ophdr belongs to), so we really should
> be doing the same for all ids we use in the journal for matching
> items.
> 
> Longer term, We probably should move all the intent/done identifiers
> to a psuedo random identifier mechanism, but that's outside the
> scope of this change....

Agreed.  I don't think it's appropriate to gate acceptance of this
patchset on this kind of general change.

> 
> > +/*
> > + * This routine is called to create an in-core file mapping exchange item from
> > + * the xmi format structure which was logged on disk.  It allocates an in-core
> > + * xmi, copies the exchange information from the format structure into it, and
> > + * adds the xmi to the AIL with the given LSN.
> > + */
> > +STATIC int
> > +xlog_recover_xmi_commit_pass2(
> > +	struct xlog			*log,
> > +	struct list_head		*buffer_list,
> > +	struct xlog_recover_item	*item,
> > +	xfs_lsn_t			lsn)
> > +{
> > +	struct xfs_mount		*mp = log->l_mp;
> > +	struct xfs_xmi_log_item		*xmi_lip;
> > +	struct xfs_xmi_log_format	*xmi_formatp;
> > +	size_t				len;
> > +
> > +	len = sizeof(struct xfs_xmi_log_format);
> > +	if (item->ri_buf[0].i_len != len) {
> > +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> > +		return -EFSCORRUPTED;
> > +	}
> > +
> > +	xmi_formatp = item->ri_buf[0].i_addr;
> > +	if (xmi_formatp->__pad != 0) {
> > +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> > +		return -EFSCORRUPTED;
> > +	}
> > +
> > +	xmi_lip = xfs_xmi_init(mp);
> > +	memcpy(&xmi_lip->xmi_format, xmi_formatp, len);
> 
> Should this be validating that the structure contents are within
> valid ranges?

That's done within _recover_work, like all the other intent items.  If
you want to clean that up then I'll review those patches, but I am not
going to prepend more generic cleanups.

--D

> -Dave.
> 
> -- 
> Dave Chinner
> david@fromorbit.com
Darrick J. Wong April 9, 2024, 3:06 a.m. UTC | #3
On Mon, Apr 08, 2024 at 06:18:48PM -0700, Darrick J. Wong wrote:
> On Mon, Apr 08, 2024 at 09:51:01AM +1000, Dave Chinner wrote:
> > On Tue, Mar 26, 2024 at 06:53:52PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <djwong@kernel.org>
> > > 
> > > Introduce a new intent log item to handle exchanging mappings between
> > > the forks of two files.
> > > 
> > > Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > > ---
> > >  fs/xfs/Makefile                 |    1 
> > >  fs/xfs/libxfs/xfs_log_format.h  |   42 ++++++-
> > >  fs/xfs/libxfs/xfs_log_recover.h |    2 
> > >  fs/xfs/xfs_exchmaps_item.c      |  235 +++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_exchmaps_item.h      |   59 ++++++++++
> > >  fs/xfs/xfs_log_recover.c        |    2 
> > >  fs/xfs/xfs_super.c              |   19 +++
> > >  7 files changed, 357 insertions(+), 3 deletions(-)
> > >  create mode 100644 fs/xfs/xfs_exchmaps_item.c
> > >  create mode 100644 fs/xfs/xfs_exchmaps_item.h
> > > 
> > > 
> > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > > index 2474242f5a05f..68ca9726e7b7d 100644
> > > --- a/fs/xfs/Makefile
> > > +++ b/fs/xfs/Makefile
> > > @@ -102,6 +102,7 @@ xfs-y				+= xfs_log.o \
> > >  				   xfs_buf_item.o \
> > >  				   xfs_buf_item_recover.o \
> > >  				   xfs_dquot_item_recover.o \
> > > +				   xfs_exchmaps_item.o \
> > >  				   xfs_extfree_item.o \
> > >  				   xfs_attr_item.o \
> > >  				   xfs_icreate_item.o \
> > > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> > > index 16872972e1e97..09024431cae9a 100644
> > > --- a/fs/xfs/libxfs/xfs_log_format.h
> > > +++ b/fs/xfs/libxfs/xfs_log_format.h
> > > @@ -117,8 +117,9 @@ struct xfs_unmount_log_format {
> > >  #define XLOG_REG_TYPE_ATTRD_FORMAT	28
> > >  #define XLOG_REG_TYPE_ATTR_NAME	29
> > >  #define XLOG_REG_TYPE_ATTR_VALUE	30
> > > -#define XLOG_REG_TYPE_MAX		30
> > > -
> > > +#define XLOG_REG_TYPE_XMI_FORMAT	31
> > > +#define XLOG_REG_TYPE_XMD_FORMAT	32
> > > +#define XLOG_REG_TYPE_MAX		32
> > >  
> > >  /*
> > >   * Flags to log operation header
> > > @@ -243,6 +244,8 @@ typedef struct xfs_trans_header {
> > >  #define	XFS_LI_BUD		0x1245
> > >  #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
> > >  #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
> > > +#define	XFS_LI_XMI		0x1248  /* mapping exchange intent */
> > > +#define	XFS_LI_XMD		0x1249  /* mapping exchange done */
> > >  
> > >  #define XFS_LI_TYPE_DESC \
> > >  	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
> > > @@ -260,7 +263,9 @@ typedef struct xfs_trans_header {
> > >  	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
> > >  	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
> > >  	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
> > > -	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
> > > +	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
> > > +	{ XFS_LI_XMI,		"XFS_LI_XMI" }, \
> > > +	{ XFS_LI_XMD,		"XFS_LI_XMD" }
> > >  
> > >  /*
> > >   * Inode Log Item Format definitions.
> > > @@ -878,6 +883,37 @@ struct xfs_bud_log_format {
> > >  	uint64_t		bud_bui_id;	/* id of corresponding bui */
> > >  };
> > >  
> > > +/*
> > > + * XMI/XMD (file mapping exchange) log format definitions
> > > + */
> > > +
> > > +/* This is the structure used to lay out an mapping exchange log item. */
> > > +struct xfs_xmi_log_format {
> > > +	uint16_t		xmi_type;	/* xmi log item type */
> > > +	uint16_t		xmi_size;	/* size of this item */
> > > +	uint32_t		__pad;		/* must be zero */
> > > +	uint64_t		xmi_id;		/* xmi identifier */
> > 
> > Why does this ID need to be a 64 bit ID?  If it is 32 bit, then
> > there's no need for any padding, and it doesn't seem likely to me
> > that we'd have millions of exchanges in flight at once.
> > 
> > (Edit: I see why later - I address it there....)
> > 
> > > +
> > > +	uint64_t		xmi_inode1;	/* inumber of first file */
> > > +	uint64_t		xmi_inode2;	/* inumber of second file */
> > 
> > Inode numbers are not unique identifiers. Intents get replayed after
> > everything else has been replayed, including inode unlink and
> > reallocation.
> > 
> > Without a generation number, there is no way to determine if the
> > inode number in the intent is actually pointing at the correct inode
> > when we go to replay the intent.
> > 
> > Yes, I know it's unlikely that this might occur, but I'd much prefer
> > that we fully identify inodes in the on-disk metadata so we can
> > check it at recovery time than leaving it out and just hoping we are
> > operating on the correct inode life-cycle...
> 
> I suppose I don't mind encoding the generation number, but I'm not
> thrilled that you're suggesting this after I finally got someone to
> complete the review of this patchset after 3 years.
> 
> I probably ought to use the u32 pad in the attri log format to do the
> same for parent pointers, seeing as parent pointers have their own attri
> log opcodes now anyway.

Actually, no.  This is the wrong thing to do.

Adding i_generation to log items is something that we should apply
systematically to all the log items that target files -- BUIs, ATTRIs,
and XMIs.  I don't know why I indulged this, but I looked into how to
squeeze in i_generation: I can't do BUIs without revving the
xfs_map_extents ondisk format to include a new u32 field.  I can't do
ATTRIs without expanding the size of those by u32 because there's no
space left after parent pointers used up the padding field.

Dave, I've spent years working on this code.  I've gotten it to work.
I'm not interested in yet another refactoring of this now old code, and
I'm tired of not being able to merge patches just because you had some
random idea "let's expand the scope to fix these other code smells too!"
I want to move on with merging my dev branch, because there are 200 more
patches to go to finish online repair.

Andrey wants to get fsverity moving, and I can't help him do that
because I'm stuck on this.  Christoph wants to use some of realtime
modernization patches, and I can't help him do that because I'm stuck on
this.  John wants to get forcealign and untorn writes moving and I
can't help him do that because I'm stuck on this.  Catherine wants to
learn more about the codebase and I can't help her do that because I'm
stuck on this.

As I keep saying, I want to get online repair to the point where I can
send it out for wider testing with real users to find out how it really
works.  The customers who want the feature also want this, but I can't
help them do that because I'm stuck on review.

Do you get the picture?  Many projects are stalled on ME, and I don't
like that.  I don't get the sense that everyone else is enjoying this
stat of affairs either.

I'm very worn down by this XFS mentality of "we have to clean up all
the smells as a gatekeeping condition of merging someone else's work".
No, we don't have to do that.  I don't want to have to think about how
to engineer some unrelated ondisk format cleanup into a giant pile of
code and make /that/ work seamlessly too.

That is not a reasonable thing to ask for, and I'm not going to do it.

--D

> > 
> > > +	uint64_t		xmi_startoff1;	/* block offset into file1 */
> > > +	uint64_t		xmi_startoff2;	/* block offset into file2 */
> > > +	uint64_t		xmi_blockcount;	/* number of blocks */
> > > +	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* */
> > > +	uint64_t		xmi_isize1;	/* intended file1 size */
> > > +	uint64_t		xmi_isize2;	/* intended file2 size */
> > 
> > How do these inode sizes differ from xmi_startoff{1,2} +
> > xmi_blockcount?
> 
> If the caller specifies XFS_EXCHANGE_RANGE_TO_EOF then we need to
> exchange the file sizes too so that we preserve a mid-fsblock EOF.
> 
> > 
> > > +/* Allocate and initialize an xmi item. */
> > > +STATIC struct xfs_xmi_log_item *
> > > +xfs_xmi_init(
> > > +	struct xfs_mount	*mp)
> > > +
> > > +{
> > > +	struct xfs_xmi_log_item	*xmi_lip;
> > > +
> > > +	xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
> > > +
> > > +	xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
> > > +	xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;
> > 
> > OK. Encoding the pointer as the ID is a mistake we made with EFIs
> > and we should stop repeating it in all new intents. There is no
> > guarantee that the pointer is a unique identifier because these are
> > allocated out of a slab cache. Hence we can allocate an xmi, log it,
> > finish the xmds, free the xmi, then run another exchange and get
> > exactly the same XMI pointer returned to us for the new exchange
> > intent. Now we potentially have multiple exchange items in the
> > journal with the same ID.
> > 
> > Can we use a u32 and get_random_u32() for the ID here, please? We
> > already do this for checkpoint discrimination in the log (i.e. to
> > identify what checkpoint an ophdr belongs to), so we really should
> > be doing the same for all ids we use in the journal for matching
> > items.
> > 
> > Longer term, We probably should move all the intent/done identifiers
> > to a psuedo random identifier mechanism, but that's outside the
> > scope of this change....
> 
> Agreed.  I don't think it's appropriate to gate acceptance of this
> patchset on this kind of general change.
> 
> > 
> > > +/*
> > > + * This routine is called to create an in-core file mapping exchange item from
> > > + * the xmi format structure which was logged on disk.  It allocates an in-core
> > > + * xmi, copies the exchange information from the format structure into it, and
> > > + * adds the xmi to the AIL with the given LSN.
> > > + */
> > > +STATIC int
> > > +xlog_recover_xmi_commit_pass2(
> > > +	struct xlog			*log,
> > > +	struct list_head		*buffer_list,
> > > +	struct xlog_recover_item	*item,
> > > +	xfs_lsn_t			lsn)
> > > +{
> > > +	struct xfs_mount		*mp = log->l_mp;
> > > +	struct xfs_xmi_log_item		*xmi_lip;
> > > +	struct xfs_xmi_log_format	*xmi_formatp;
> > > +	size_t				len;
> > > +
> > > +	len = sizeof(struct xfs_xmi_log_format);
> > > +	if (item->ri_buf[0].i_len != len) {
> > > +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> > > +		return -EFSCORRUPTED;
> > > +	}
> > > +
> > > +	xmi_formatp = item->ri_buf[0].i_addr;
> > > +	if (xmi_formatp->__pad != 0) {
> > > +		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
> > > +		return -EFSCORRUPTED;
> > > +	}
> > > +
> > > +	xmi_lip = xfs_xmi_init(mp);
> > > +	memcpy(&xmi_lip->xmi_format, xmi_formatp, len);
> > 
> > Should this be validating that the structure contents are within
> > valid ranges?
> 
> That's done within _recover_work, like all the other intent items.  If
> you want to clean that up then I'll review those patches, but I am not
> going to prepend more generic cleanups.
> 
> --D
> 
> > -Dave.
> > 
> > -- 
> > Dave Chinner
> > david@fromorbit.com
>
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 2474242f5a05f..68ca9726e7b7d 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,6 +102,7 @@  xfs-y				+= xfs_log.o \
 				   xfs_buf_item.o \
 				   xfs_buf_item_recover.o \
 				   xfs_dquot_item_recover.o \
+				   xfs_exchmaps_item.o \
 				   xfs_extfree_item.o \
 				   xfs_attr_item.o \
 				   xfs_icreate_item.o \
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 16872972e1e97..09024431cae9a 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -117,8 +117,9 @@  struct xfs_unmount_log_format {
 #define XLOG_REG_TYPE_ATTRD_FORMAT	28
 #define XLOG_REG_TYPE_ATTR_NAME	29
 #define XLOG_REG_TYPE_ATTR_VALUE	30
-#define XLOG_REG_TYPE_MAX		30
-
+#define XLOG_REG_TYPE_XMI_FORMAT	31
+#define XLOG_REG_TYPE_XMD_FORMAT	32
+#define XLOG_REG_TYPE_MAX		32
 
 /*
  * Flags to log operation header
@@ -243,6 +244,8 @@  typedef struct xfs_trans_header {
 #define	XFS_LI_BUD		0x1245
 #define	XFS_LI_ATTRI		0x1246  /* attr set/remove intent*/
 #define	XFS_LI_ATTRD		0x1247  /* attr set/remove done */
+#define	XFS_LI_XMI		0x1248  /* mapping exchange intent */
+#define	XFS_LI_XMD		0x1249  /* mapping exchange done */
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -260,7 +263,9 @@  typedef struct xfs_trans_header {
 	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
 	{ XFS_LI_BUD,		"XFS_LI_BUD" }, \
 	{ XFS_LI_ATTRI,		"XFS_LI_ATTRI" }, \
-	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }
+	{ XFS_LI_ATTRD,		"XFS_LI_ATTRD" }, \
+	{ XFS_LI_XMI,		"XFS_LI_XMI" }, \
+	{ XFS_LI_XMD,		"XFS_LI_XMD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -878,6 +883,37 @@  struct xfs_bud_log_format {
 	uint64_t		bud_bui_id;	/* id of corresponding bui */
 };
 
+/*
+ * XMI/XMD (file mapping exchange) log format definitions
+ */
+
+/* This is the structure used to lay out an mapping exchange log item. */
+struct xfs_xmi_log_format {
+	uint16_t		xmi_type;	/* xmi log item type */
+	uint16_t		xmi_size;	/* size of this item */
+	uint32_t		__pad;		/* must be zero */
+	uint64_t		xmi_id;		/* xmi identifier */
+
+	uint64_t		xmi_inode1;	/* inumber of first file */
+	uint64_t		xmi_inode2;	/* inumber of second file */
+	uint64_t		xmi_startoff1;	/* block offset into file1 */
+	uint64_t		xmi_startoff2;	/* block offset into file2 */
+	uint64_t		xmi_blockcount;	/* number of blocks */
+	uint64_t		xmi_flags;	/* XFS_EXCHMAPS_* */
+	uint64_t		xmi_isize1;	/* intended file1 size */
+	uint64_t		xmi_isize2;	/* intended file2 size */
+};
+
+#define XFS_EXCHMAPS_LOGGED_FLAGS		(0)
+
+/* This is the structure used to lay out an mapping exchange done log item. */
+struct xfs_xmd_log_format {
+	uint16_t		xmd_type;	/* xmd log item type */
+	uint16_t		xmd_size;	/* size of this item */
+	uint32_t		__pad;
+	uint64_t		xmd_xmi_id;	/* id of corresponding xmi */
+};
+
 /*
  * Dquot Log format definitions.
  *
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 9fe7a9564bca9..47b758b49cb35 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -75,6 +75,8 @@  extern const struct xlog_recover_item_ops xlog_cui_item_ops;
 extern const struct xlog_recover_item_ops xlog_cud_item_ops;
 extern const struct xlog_recover_item_ops xlog_attri_item_ops;
 extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
+extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
 
 /*
  * Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
new file mode 100644
index 0000000000000..65b0ade41b3d6
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -0,0 +1,235 @@ 
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_exchmaps_item.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_trans_space.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+struct kmem_cache	*xfs_xmi_cache;
+struct kmem_cache	*xfs_xmd_cache;
+
+static const struct xfs_item_ops xfs_xmi_item_ops;
+
+static inline struct xfs_xmi_log_item *XMI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_xmi_log_item, xmi_item);
+}
+
+STATIC void
+xfs_xmi_item_free(
+	struct xfs_xmi_log_item	*xmi_lip)
+{
+	kvfree(xmi_lip->xmi_item.li_lv_shadow);
+	kmem_cache_free(xfs_xmi_cache, xmi_lip);
+}
+
+/*
+ * Freeing the XMI requires that we remove it from the AIL if it has already
+ * been placed there. However, the XMI may not yet have been placed in the AIL
+ * when called by xfs_xmi_release() from XMD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the XMI.
+ */
+STATIC void
+xfs_xmi_release(
+	struct xfs_xmi_log_item	*xmi_lip)
+{
+	ASSERT(atomic_read(&xmi_lip->xmi_refcount) > 0);
+	if (atomic_dec_and_test(&xmi_lip->xmi_refcount)) {
+		xfs_trans_ail_delete(&xmi_lip->xmi_item, 0);
+		xfs_xmi_item_free(xmi_lip);
+	}
+}
+
+
+STATIC void
+xfs_xmi_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_xmi_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the given xmi log
+ * item. We use only 1 iovec, and we point that at the xmi_log_format structure
+ * embedded in the xmi item.
+ */
+STATIC void
+xfs_xmi_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_xmi_log_item	*xmi_lip = XMI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	xmi_lip->xmi_format.xmi_type = XFS_LI_XMI;
+	xmi_lip->xmi_format.xmi_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_XMI_FORMAT,
+			&xmi_lip->xmi_format,
+			sizeof(struct xfs_xmi_log_format));
+}
+
+/*
+ * The unpin operation is the last place an XMI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the XMI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the XMI to either construct
+ * and commit the XMD or drop the XMD's reference in the event of error. Simply
+ * drop the log's XMI reference now that the log is done with it.
+ */
+STATIC void
+xfs_xmi_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_xmi_log_item	*xmi_lip = XMI_ITEM(lip);
+
+	xfs_xmi_release(xmi_lip);
+}
+
+/*
+ * The XMI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an XMD isn't going to be
+ * constructed and thus we free the XMI here directly.
+ */
+STATIC void
+xfs_xmi_item_release(
+	struct xfs_log_item	*lip)
+{
+	xfs_xmi_release(XMI_ITEM(lip));
+}
+
+/* Allocate and initialize an xmi item. */
+STATIC struct xfs_xmi_log_item *
+xfs_xmi_init(
+	struct xfs_mount	*mp)
+
+{
+	struct xfs_xmi_log_item	*xmi_lip;
+
+	xmi_lip = kmem_cache_zalloc(xfs_xmi_cache, GFP_KERNEL | __GFP_NOFAIL);
+
+	xfs_log_item_init(mp, &xmi_lip->xmi_item, XFS_LI_XMI, &xfs_xmi_item_ops);
+	xmi_lip->xmi_format.xmi_id = (uintptr_t)(void *)xmi_lip;
+	atomic_set(&xmi_lip->xmi_refcount, 2);
+
+	return xmi_lip;
+}
+
+static inline struct xfs_xmd_log_item *XMD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_xmd_log_item, xmd_item);
+}
+
+STATIC bool
+xfs_xmi_item_match(
+	struct xfs_log_item	*lip,
+	uint64_t		intent_id)
+{
+	return XMI_ITEM(lip)->xmi_format.xmi_id == intent_id;
+}
+
+static const struct xfs_item_ops xfs_xmi_item_ops = {
+	.flags		= XFS_ITEM_INTENT,
+	.iop_size	= xfs_xmi_item_size,
+	.iop_format	= xfs_xmi_item_format,
+	.iop_unpin	= xfs_xmi_item_unpin,
+	.iop_release	= xfs_xmi_item_release,
+	.iop_match	= xfs_xmi_item_match,
+};
+
+/*
+ * This routine is called to create an in-core file mapping exchange item from
+ * the xmi format structure which was logged on disk.  It allocates an in-core
+ * xmi, copies the exchange information from the format structure into it, and
+ * adds the xmi to the AIL with the given LSN.
+ */
+STATIC int
+xlog_recover_xmi_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_xmi_log_item		*xmi_lip;
+	struct xfs_xmi_log_format	*xmi_formatp;
+	size_t				len;
+
+	len = sizeof(struct xfs_xmi_log_format);
+	if (item->ri_buf[0].i_len != len) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	xmi_formatp = item->ri_buf[0].i_addr;
+	if (xmi_formatp->__pad != 0) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	xmi_lip = xfs_xmi_init(mp);
+	memcpy(&xmi_lip->xmi_format, xmi_formatp, len);
+
+	/* not implemented yet */
+	return -EIO;
+}
+
+const struct xlog_recover_item_ops xlog_xmi_item_ops = {
+	.item_type		= XFS_LI_XMI,
+	.commit_pass2		= xlog_recover_xmi_commit_pass2,
+};
+
+/*
+ * This routine is called when an XMD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding XMI if it
+ * was still in the log. To do this it searches the AIL for the XMI with an id
+ * equal to that in the XMD format structure. If we find it we drop the XMD
+ * reference, which removes the XMI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_xmd_commit_pass2(
+	struct xlog			*log,
+	struct list_head		*buffer_list,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	struct xfs_xmd_log_format	*xmd_formatp;
+
+	xmd_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+		return -EFSCORRUPTED;
+	}
+
+	xlog_recover_release_intent(log, XFS_LI_XMI, xmd_formatp->xmd_xmi_id);
+	return 0;
+}
+
+const struct xlog_recover_item_ops xlog_xmd_item_ops = {
+	.item_type		= XFS_LI_XMD,
+	.commit_pass2		= xlog_recover_xmd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_exchmaps_item.h b/fs/xfs/xfs_exchmaps_item.h
new file mode 100644
index 0000000000000..ada1eb314e658
--- /dev/null
+++ b/fs/xfs/xfs_exchmaps_item.h
@@ -0,0 +1,59 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef	__XFS_EXCHMAPS_ITEM_H__
+#define	__XFS_EXCHMAPS_ITEM_H__
+
+/*
+ * The file mapping exchange intent item helps us exchange multiple file
+ * mappings between two inode forks.  It does this by tracking the range of
+ * file block offsets that still need to be exchanged, and relogs as progress
+ * happens.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * rest of the mapping exchanges.
+ */
+
+/* kernel only XMI/XMD definitions */
+
+struct xfs_mount;
+struct kmem_cache;
+
+/*
+ * This is the incore file mapping exchange intent log item.  It is used to log
+ * the fact that we are exchanging mappings between two files.  It is used in
+ * conjunction with the incore file mapping exchange done log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_xmi_log_item {
+	struct xfs_log_item		xmi_item;
+	atomic_t			xmi_refcount;
+	struct xfs_xmi_log_format	xmi_format;
+};
+
+/*
+ * This is the incore file mapping exchange done log item.  It is used to log
+ * the fact that an exchange mentioned in an earlier xmi item have been
+ * performed.
+ */
+struct xfs_xmd_log_item {
+	struct xfs_log_item		xmd_item;
+	struct xfs_xmi_log_item		*xmd_intent_log_item;
+	struct xfs_xmd_log_format	xmd_format;
+};
+
+extern struct kmem_cache	*xfs_xmi_cache;
+extern struct kmem_cache	*xfs_xmd_cache;
+
+#endif	/* __XFS_EXCHMAPS_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 346ad9fceca73..042b105c42dc2 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1789,6 +1789,8 @@  static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
 	&xlog_bud_item_ops,
 	&xlog_attri_item_ops,
 	&xlog_attrd_item_ops,
+	&xlog_xmi_item_ops,
+	&xlog_xmd_item_ops,
 };
 
 static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a2dcbac6effe9..7bb867fd31041 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -43,6 +43,7 @@ 
 #include "xfs_iunlink_item.h"
 #include "xfs_dahash_test.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_exchmaps_item.h"
 #include "scrub/stats.h"
 #include "scrub/rcbag_btree.h"
 
@@ -2195,8 +2196,24 @@  xfs_init_caches(void)
 	if (!xfs_iunlink_cache)
 		goto out_destroy_attri_cache;
 
+	xfs_xmd_cache = kmem_cache_create("xfs_xmd_item",
+					 sizeof(struct xfs_xmd_log_item),
+					 0, 0, NULL);
+	if (!xfs_xmd_cache)
+		goto out_destroy_iul_cache;
+
+	xfs_xmi_cache = kmem_cache_create("xfs_xmi_item",
+					 sizeof(struct xfs_xmi_log_item),
+					 0, 0, NULL);
+	if (!xfs_xmi_cache)
+		goto out_destroy_xmd_cache;
+
 	return 0;
 
+ out_destroy_xmd_cache:
+	kmem_cache_destroy(xfs_xmd_cache);
+ out_destroy_iul_cache:
+	kmem_cache_destroy(xfs_iunlink_cache);
  out_destroy_attri_cache:
 	kmem_cache_destroy(xfs_attri_cache);
  out_destroy_attrd_cache:
@@ -2253,6 +2270,8 @@  xfs_destroy_caches(void)
 	 * destroy caches.
 	 */
 	rcu_barrier();
+	kmem_cache_destroy(xfs_xmd_cache);
+	kmem_cache_destroy(xfs_xmi_cache);
 	kmem_cache_destroy(xfs_iunlink_cache);
 	kmem_cache_destroy(xfs_attri_cache);
 	kmem_cache_destroy(xfs_attrd_cache);