[22/43] xfs: add the zoned space allocator

Message ID 20250206064511.2323878-23-hch@lst.de (mailing list archive)
State Not Applicable, archived
Series [01/43] xfs: factor out a xfs_rt_check_size helper

Commit Message

hch Feb. 6, 2025, 6:44 a.m. UTC
For zoned RT devices space is always allocated at the write pointer, that
is right after the last written block, and is only recorded on I/O
completion.

The actual allocation algorithm is very simple and just involves picking
a good zone - preferably the one used for the last write to the inode.
As the number of zones that can be written at the same time is usually
limited by the hardware, selecting a zone is done as late as possible,
from the iomap dio and buffered writeback bio submission helpers just
before submitting the bio.

Given that the writers already took a reservation before acquiring the
iolock, space will always be readily available if an open zone slot is
available.  A new structure is used to track these open zones, and is
pointed to by the xfs_rtgroup.  Because zoned file systems don't have
an rsum cache, the space for that pointer can be reused.

Allocations are only recorded at I/O completion time.  The scheme used
for that is very similar to the reflink COW end I/O path.

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile             |   3 +-
 fs/xfs/libxfs/xfs_rtgroup.h |  22 +-
 fs/xfs/libxfs/xfs_types.h   |  13 +-
 fs/xfs/xfs_log.c            |   4 +
 fs/xfs/xfs_mount.c          |  17 +-
 fs/xfs/xfs_mount.h          |   2 +
 fs/xfs/xfs_rtalloc.c        |   6 +-
 fs/xfs/xfs_trace.c          |   2 +
 fs/xfs/xfs_trace.h          |  97 ++++
 fs/xfs/xfs_zone_alloc.c     | 959 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_zone_alloc.h     |  34 ++
 fs/xfs/xfs_zone_priv.h      |  89 ++++
 12 files changed, 1238 insertions(+), 10 deletions(-)
 create mode 100644 fs/xfs/xfs_zone_alloc.c
 create mode 100644 fs/xfs/xfs_zone_alloc.h
 create mode 100644 fs/xfs/xfs_zone_priv.h
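
As a rough illustration of the scheme described in the commit message, here
is a toy userspace model with made-up names - it is not code from this
series, but mirrors how xfs_zone_alloc_blocks() hands out space at the
write pointer while xfs_zone_record_blocks() only accounts for it at I/O
completion:

/*
 * Toy model only: space is handed out at a per-zone write pointer, and a
 * separately tracked "written" count, advanced at I/O completion, records
 * what has actually been used.
 */
#include <stdio.h>

struct toy_open_zone {
        unsigned int    zone_blocks;    /* capacity of the zone */
        unsigned int    write_pointer;  /* next block to hand out */
        unsigned int    written;        /* blocks confirmed at I/O completion */
};

/* Hand out up to @count blocks at the write pointer, return how many. */
static unsigned int
toy_zone_alloc(struct toy_open_zone *oz, unsigned int count, unsigned int *start)
{
        unsigned int space = oz->zone_blocks - oz->write_pointer;

        if (count > space)
                count = space;
        *start = oz->write_pointer;
        oz->write_pointer += count;
        return count;
}

/* Record an allocation at I/O completion, like xfs_zone_record_blocks(). */
static void
toy_zone_record(struct toy_open_zone *oz, unsigned int count)
{
        oz->written += count;
        if (oz->written == oz->zone_blocks)
                printf("zone full, drop it from the open zone list\n");
}

int main(void)
{
        struct toy_open_zone oz = { .zone_blocks = 8 };
        unsigned int start, granted;

        granted = toy_zone_alloc(&oz, 5, &start);       /* blocks 0..4 */
        toy_zone_record(&oz, granted);                  /* after I/O completes */
        granted = toy_zone_alloc(&oz, 5, &start);       /* capped to the 3 left */
        toy_zone_record(&oz, granted);                  /* zone is now full */
        printf("write pointer %u, written %u\n", oz.write_pointer, oz.written);
        return 0;
}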

Comments

Darrick J. Wong Feb. 7, 2025, 5:39 p.m. UTC | #1
On Thu, Feb 06, 2025 at 07:44:38AM +0100, Christoph Hellwig wrote:
> For zoned RT devices space is always allocated at the write pointer, that
> is right after the last written block and only recorded on I/O completion.
> 
> Because the actual allocation algorithm is very simple and just involves
> picking a good zone - preferably the one used for the last write to the
> inode.  As the number of zones that can written at the same time is
> usually limited by the hardware, selecting a zone is done as late as
> possible from the iomap dio and buffered writeback bio submissions
> helpers just before submitting the bio.
> 
> Given that the writers already took a reservation before acquiring the
> iolock, space will always be readily available if an open zone slot is
> available.  A new structure is used to track these open zones, and
> pointed to by the xfs_rtgroup.  Because zoned file systems don't have
> a rsum cache the space for that pointer can be reused.
> 
> Allocations are only recorded at I/O completion time.  The scheme used
> for that is very similar to the reflink COW end I/O path.
> 
> Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/Makefile             |   3 +-
>  fs/xfs/libxfs/xfs_rtgroup.h |  22 +-
>  fs/xfs/libxfs/xfs_types.h   |  13 +-
>  fs/xfs/xfs_log.c            |   4 +
>  fs/xfs/xfs_mount.c          |  17 +-
>  fs/xfs/xfs_mount.h          |   2 +
>  fs/xfs/xfs_rtalloc.c        |   6 +-
>  fs/xfs/xfs_trace.c          |   2 +
>  fs/xfs/xfs_trace.h          |  97 ++++
>  fs/xfs/xfs_zone_alloc.c     | 959 ++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_zone_alloc.h     |  34 ++
>  fs/xfs/xfs_zone_priv.h      |  89 ++++
>  12 files changed, 1238 insertions(+), 10 deletions(-)
>  create mode 100644 fs/xfs/xfs_zone_alloc.c
>  create mode 100644 fs/xfs/xfs_zone_alloc.h
>  create mode 100644 fs/xfs/xfs_zone_priv.h
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index ea8e66c1e969..28bd2627e9ef 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -137,7 +137,8 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
>  				   xfs_quotaops.o
>  
>  # xfs_rtbitmap is shared with libxfs
> -xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
> +xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
> +				   xfs_zone_alloc.o
>  
>  xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
>  xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
> diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
> index e35d1d798327..5d8777f819f4 100644
> --- a/fs/xfs/libxfs/xfs_rtgroup.h
> +++ b/fs/xfs/libxfs/xfs_rtgroup.h
> @@ -37,15 +37,27 @@ struct xfs_rtgroup {
>  	xfs_rtxnum_t		rtg_extents;
>  
>  	/*
> -	 * Cache of rt summary level per bitmap block with the invariant that
> -	 * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
> -	 * or 0 if rsum[i][bbno] == 0 for all i.
> -	 *
> +	 * For bitmap based RT devices this points to a cache of rt summary
> +	 * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
> +	 * > the maximum i for which rsum[i][bbno] != 0, or 0 if
> +	 * rsum[i][bbno] == 0 for all i.
>  	 * Reads and writes are serialized by the rsumip inode lock.
> +	 *
> +	 * For zoned RT devices this points to the open zone structure for
> +	 * a group that is open for writers, or is NULL.
>  	 */
> -	uint8_t			*rtg_rsum_cache;
> +	union {
> +		uint8_t			*rtg_rsum_cache;
> +		struct xfs_open_zone	*rtg_open_zone;
> +	};
>  };
>  
> +/*
> + * For zoned RT devices this is set on groups that have no written blocks
> + * and can be picked by the allocator for opening.
> + */
> +#define XFS_RTG_FREE			XA_MARK_0
> +
>  static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
>  {
>  	return container_of(xg, struct xfs_rtgroup, rtg_group);
> diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
> index 76f3c31573ec..f6f4f2d4b5db 100644
> --- a/fs/xfs/libxfs/xfs_types.h
> +++ b/fs/xfs/libxfs/xfs_types.h
> @@ -243,12 +243,23 @@ enum xfs_free_counter {
>  	 * Number of free RT extents on the RT device.
>  	 */
>  	XC_FREE_RTEXTENTS,
> +
> +	/*
> +	 * Number of available for use RT extents.
> +	 *
> +	 * This counter only exists for zoned RT device and indicates the number
> +	 * of RT extents that can be directly used by writes.  XC_FREE_RTEXTENTS
> +	 * also includes blocks that have been written previously and freed, but
> +	 * sit in a rtgroup that still needs a zone reset.
> +	 */
> +	XC_FREE_RTAVAILABLE,
>  	XC_FREE_NR,
>  };
>  
>  #define XFS_FREECOUNTER_STR \
>  	{ XC_FREE_BLOCKS,		"blocks" }, \
> -	{ XC_FREE_RTEXTENTS,		"rtextents" }
> +	{ XC_FREE_RTEXTENTS,		"rtextents" }, \
> +	{ XC_FREE_RTAVAILABLE,		"rtavailable" }
>  
>  /*
>   * Type verifier functions
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index f8851ff835de..6493bdb57351 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -20,6 +20,7 @@
>  #include "xfs_sysfs.h"
>  #include "xfs_sb.h"
>  #include "xfs_health.h"
> +#include "xfs_zone_alloc.h"
>  
>  struct kmem_cache	*xfs_log_ticket_cache;
>  
> @@ -3540,6 +3541,9 @@ xlog_force_shutdown(
>  	spin_unlock(&log->l_icloglock);
>  
>  	wake_up_var(&log->l_opstate);
> +	if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
> +		xfs_zoned_wake_all(log->l_mp);
> +
>  	return log_error;
>  }
>  
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index b6004e456ed3..06e34c43cc94 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -40,6 +40,7 @@
>  #include "xfs_rtrmap_btree.h"
>  #include "xfs_rtrefcount_btree.h"
>  #include "scrub/stats.h"
> +#include "xfs_zone_alloc.h"
>  
>  static DEFINE_MUTEX(xfs_uuid_table_mutex);
>  static int xfs_uuid_table_size;
> @@ -467,6 +468,7 @@ xfs_mount_reset_sbqflags(
>  static const char *const xfs_free_pool_name[] = {
>  	[XC_FREE_BLOCKS]	= "free blocks",
>  	[XC_FREE_RTEXTENTS]	= "free rt extents",
> +	[XC_FREE_RTAVAILABLE]	= "available rt extents",
>  };
>  
>  uint64_t
> @@ -1045,6 +1047,12 @@ xfs_mountfs(
>  	if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
>  		xfs_log_clean(mp);
>  
> +	if (xfs_has_zoned(mp)) {
> +		error = xfs_mount_zones(mp);
> +		if (error)
> +			goto out_rtunmount;
> +	}
> +
>  	/*
>  	 * Complete the quota initialisation, post-log-replay component.
>  	 */
> @@ -1087,6 +1095,8 @@ xfs_mountfs(
>   out_agresv:
>  	xfs_fs_unreserve_ag_blocks(mp);
>  	xfs_qm_unmount_quotas(mp);
> +	if (xfs_has_zoned(mp))
> +		xfs_unmount_zones(mp);
>   out_rtunmount:
>  	xfs_rtunmount_inodes(mp);
>   out_rele_rip:
> @@ -1168,6 +1178,8 @@ xfs_unmountfs(
>  	xfs_blockgc_stop(mp);
>  	xfs_fs_unreserve_ag_blocks(mp);
>  	xfs_qm_unmount_quotas(mp);
> +	if (xfs_has_zoned(mp))
> +		xfs_unmount_zones(mp);
>  	xfs_rtunmount_inodes(mp);
>  	xfs_irele(mp->m_rootip);
>  	if (mp->m_metadirip)
> @@ -1251,7 +1263,7 @@ xfs_freecounter_unavailable(
>  	struct xfs_mount	*mp,
>  	enum xfs_free_counter	ctr)
>  {
> -	if (ctr == XC_FREE_RTEXTENTS)
> +	if (ctr == XC_FREE_RTEXTENTS || ctr == XC_FREE_RTAVAILABLE)
>  		return 0;
>  	return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
>  }
> @@ -1341,7 +1353,8 @@ xfs_dec_freecounter(
>  
>  		lcounter = (long long)mp->m_resblks[ctr].avail - delta;
>  		if (lcounter < 0) {
> -			xfs_warn_once(mp,
> +			if (ctr == XC_FREE_BLOCKS)
> +				xfs_warn_once(mp,

That probably should go in the patch enabling xfs_dec_freecounter to
take XC_FREE_RTEXTENTS.

>  "Reserve blocks depleted! Consider increasing reserve pool size.");
>  			goto fdblocks_enospc;
>  		}
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 7c29f3f9ba78..a0e51ce5b84c 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -205,6 +205,7 @@ typedef struct xfs_mount {
>  	bool			m_fail_unmount;
>  	bool			m_finobt_nores; /* no per-AG finobt resv. */
>  	bool			m_update_sb;	/* sb needs update in mount */
> +	unsigned int		m_max_open_zones;
>  
>  	/*
>  	 * Bitsets of per-fs metadata that have been checked and/or are sick.
> @@ -257,6 +258,7 @@ typedef struct xfs_mount {
>  		uint64_t	save;		/* reserved blks @ remount,ro */
>  	} m_resblks[XC_FREE_NR];
>  	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
> +	struct xfs_zone_info	*m_zone_info;	/* zone allocator information */
>  	struct dentry		*m_debugfs;	/* debugfs parent */
>  	struct xfs_kobj		m_kobj;
>  	struct xfs_kobj		m_error_kobj;
> diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
> index a0fd1dc5d362..6a45749daf57 100644
> --- a/fs/xfs/xfs_rtalloc.c
> +++ b/fs/xfs/xfs_rtalloc.c
> @@ -33,6 +33,7 @@
>  #include "xfs_trace.h"
>  #include "xfs_rtrefcount_btree.h"
>  #include "xfs_reflink.h"
> +#include "xfs_zone_alloc.h"
>  
>  /*
>   * Return whether there are any free extents in the size range given
> @@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
>  
>  	for (i = 0; i < XFS_RTGI_MAX; i++)
>  		xfs_rtginode_irele(&rtg->rtg_inodes[i]);
> -	kvfree(rtg->rtg_rsum_cache);
> +	if (!xfs_has_zoned(rtg_mount(rtg)))
> +		kvfree(rtg->rtg_rsum_cache);
>  }
>  
>  static int
> @@ -1573,6 +1575,8 @@ xfs_rtmount_rtg(
>  		}
>  	}
>  
> +	if (xfs_has_zoned(mp))
> +		return 0;
>  	return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
>  }
>  
> diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
> index 8f530e69c18a..a60556dbd172 100644
> --- a/fs/xfs/xfs_trace.c
> +++ b/fs/xfs/xfs_trace.c
> @@ -49,6 +49,8 @@
>  #include "xfs_metafile.h"
>  #include "xfs_metadir.h"
>  #include "xfs_rtgroup.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
>  
>  /*
>   * We include this last to have the helpers above available for the trace
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index a02129c202b2..d4f012e41564 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -102,6 +102,7 @@ struct xfs_rmap_intent;
>  struct xfs_refcount_intent;
>  struct xfs_metadir_update;
>  struct xfs_rtgroup;
> +struct xfs_open_zone;
>  
>  #define XFS_ATTR_FILTER_FLAGS \
>  	{ XFS_ATTR_ROOT,	"ROOT" }, \
> @@ -265,6 +266,100 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
>  DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
>  DEFINE_GROUP_REF_EVENT(xfs_group_rele);
>  
> +#ifdef CONFIG_XFS_RT
> +DECLARE_EVENT_CLASS(xfs_zone_class,
> +	TP_PROTO(struct xfs_rtgroup *rtg),
> +	TP_ARGS(rtg),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_rgnumber_t, rgno)
> +		__field(xfs_rgblock_t, used)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = rtg_mount(rtg)->m_super->s_dev;
> +		__entry->rgno = rtg_rgno(rtg);
> +		__entry->used = rtg_rmap(rtg)->i_used_blocks;
> +	),
> +	TP_printk("dev %d:%d rgno 0x%x used 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->rgno,
> +		  __entry->used)
> +);
> +
> +#define DEFINE_ZONE_EVENT(name)				\
> +DEFINE_EVENT(xfs_zone_class, name,			\
> +	TP_PROTO(struct xfs_rtgroup *rtg),		\
> +	TP_ARGS(rtg))
> +DEFINE_ZONE_EVENT(xfs_zone_full);
> +DEFINE_ZONE_EVENT(xfs_zone_activate);
> +
> +TRACE_EVENT(xfs_zone_free_blocks,
> +	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
> +		 xfs_extlen_t len),
> +	TP_ARGS(rtg, rgbno, len),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_rgnumber_t, rgno)
> +		__field(xfs_rgblock_t, used)
> +		__field(xfs_rgblock_t, rgbno)
> +		__field(xfs_extlen_t, len)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = rtg_mount(rtg)->m_super->s_dev;
> +		__entry->rgno = rtg_rgno(rtg);
> +		__entry->used = rtg_rmap(rtg)->i_used_blocks;
> +		__entry->rgbno = rgbno;
> +		__entry->len = len;
> +	),
> +	TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->rgno,
> +		  __entry->used,
> +		  __entry->rgbno,
> +		  __entry->len)
> +);
> +
> +DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
> +	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
> +		 xfs_extlen_t len),
> +	TP_ARGS(oz, rgbno, len),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_rgnumber_t, rgno)
> +		__field(xfs_rgblock_t, used)
> +		__field(xfs_rgblock_t, written)
> +		__field(xfs_rgblock_t, write_pointer)
> +		__field(xfs_rgblock_t, rgbno)
> +		__field(xfs_extlen_t, len)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
> +		__entry->rgno = rtg_rgno(oz->oz_rtg);
> +		__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
> +		__entry->written = oz->oz_written;
> +		__entry->write_pointer = oz->oz_write_pointer;
> +		__entry->rgbno = rgbno;
> +		__entry->len = len;
> +	),
> +	TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->rgno,
> +		  __entry->used,
> +		  __entry->written,
> +		  __entry->write_pointer,
> +		  __entry->rgbno,
> +		  __entry->len)
> +);
> +
> +#define DEFINE_ZONE_ALLOC_EVENT(name)				\
> +DEFINE_EVENT(xfs_zone_alloc_class, name,			\
> +	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,	\
> +		 xfs_extlen_t len),				\
> +	TP_ARGS(oz, rgbno, len))
> +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
> +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
> +#endif /* CONFIG_XFS_RT */
> +
>  TRACE_EVENT(xfs_inodegc_worker,
>  	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
>  	TP_ARGS(mp, shrinker_hits),
> @@ -3982,6 +4077,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
>  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
>  DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
> +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
>  
>  DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
>  DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
> @@ -5665,6 +5761,7 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
>  
>  TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
>  TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
> +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
>  
>  DECLARE_EVENT_CLASS(xfs_freeblocks_class,
>  	TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
> diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
> new file mode 100644
> index 000000000000..f41a2cc84382
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_alloc.c
> @@ -0,0 +1,959 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2023-2025 Christoph Hellwig.
> + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
> + */
> +#include "xfs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_error.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_inode.h"
> +#include "xfs_iomap.h"
> +#include "xfs_trans.h"
> +#include "xfs_alloc.h"
> +#include "xfs_bmap.h"
> +#include "xfs_bmap_btree.h"
> +#include "xfs_trans_space.h"
> +#include "xfs_refcount.h"
> +#include "xfs_rtbitmap.h"
> +#include "xfs_rtrmap_btree.h"
> +#include "xfs_zone_alloc.h"
> +#include "xfs_zone_priv.h"
> +#include "xfs_zones.h"
> +#include "xfs_trace.h"
> +
> +void
> +xfs_open_zone_put(
> +	struct xfs_open_zone	*oz)
> +{
> +	if (atomic_dec_and_test(&oz->oz_ref)) {
> +		xfs_rtgroup_rele(oz->oz_rtg);
> +		kfree(oz);
> +	}
> +}
> +
> +static void
> +xfs_open_zone_mark_full(
> +	struct xfs_open_zone	*oz)
> +{
> +	struct xfs_rtgroup	*rtg = oz->oz_rtg;
> +	struct xfs_mount	*mp = rtg_mount(rtg);
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +
> +	trace_xfs_zone_full(rtg);
> +
> +	WRITE_ONCE(rtg->rtg_open_zone, NULL);
> +
> +	spin_lock(&zi->zi_open_zones_lock);
> +	if (oz->oz_is_gc) {
> +		ASSERT(current == zi->zi_gc_thread);
> +		zi->zi_open_gc_zone = NULL;
> +	} else {
> +		zi->zi_nr_open_zones--;
> +		list_del_init(&oz->oz_entry);
> +	}
> +	spin_unlock(&zi->zi_open_zones_lock);
> +	xfs_open_zone_put(oz);
> +
> +	wake_up_all(&zi->zi_zone_wait);
> +}
> +
> +static void
> +xfs_zone_record_blocks(
> +	struct xfs_trans	*tp,
> +	xfs_fsblock_t		fsbno,
> +	xfs_filblks_t		len,
> +	struct xfs_open_zone	*oz,
> +	bool			used)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_rtgroup	*rtg = oz->oz_rtg;
> +	struct xfs_inode	*rmapip = rtg_rmap(rtg);
> +
> +	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
> +
> +	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> +	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
> +	if (used) {
> +		rmapip->i_used_blocks += len;
> +		ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
> +	} else {
> +		xfs_add_frextents(mp, len);
> +	}
> +	oz->oz_written += len;
> +	if (oz->oz_written == rtg_blocks(rtg))
> +		xfs_open_zone_mark_full(oz);
> +	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
> +}
> +
> +static int
> +xfs_zoned_map_extent(
> +	struct xfs_trans	*tp,
> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*new,
> +	struct xfs_open_zone	*oz,
> +	xfs_fsblock_t		old_startblock)
> +{
> +	struct xfs_bmbt_irec	data;
> +	int			nmaps = 1;
> +	int			error;
> +
> +	/* Grab the corresponding mapping in the data fork. */
> +	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
> +			       &nmaps, 0);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Cap the update to the existing extent in the data fork because we can
> +	 * only overwrite one extent at a time.
> +	 */
> +	ASSERT(new->br_blockcount >= data.br_blockcount);
> +	new->br_blockcount = data.br_blockcount;
> +
> +	/*
> +	 * If a data write raced with this GC write, keep the existing data in
> +	 * the data fork, mark our newly written GC extent as reclaimable, then
> +	 * move on to the next extent.
> +	 */
> +	if (old_startblock != NULLFSBLOCK &&
> +	    old_startblock != data.br_startblock)
> +		goto skip;
> +
> +	trace_xfs_reflink_cow_remap_from(ip, new);
> +	trace_xfs_reflink_cow_remap_to(ip, &data);
> +
> +	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
> +			XFS_IEXT_REFLINK_END_COW_CNT);
> +	if (error)
> +		return error;
> +
> +	if (data.br_startblock != HOLESTARTBLOCK) {
> +		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
> +		ASSERT(!isnullstartblock(data.br_startblock));
> +
> +		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
> +		if (xfs_is_reflink_inode(ip)) {
> +			xfs_refcount_decrease_extent(tp, true, &data);
> +		} else {
> +			error = xfs_free_extent_later(tp, data.br_startblock,
> +					data.br_blockcount, NULL,
> +					XFS_AG_RESV_NONE,
> +					XFS_FREE_EXTENT_REALTIME);
> +			if (error)
> +				return error;
> +		}
> +	}
> +
> +	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
> +			true);
> +
> +	/* Map the new blocks into the data fork. */
> +	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
> +	return 0;
> +
> +skip:
> +	trace_xfs_reflink_cow_remap_skip(ip, new);
> +	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
> +			false);
> +	return 0;
> +}
> +
> +int
> +xfs_zoned_end_io(
> +	struct xfs_inode	*ip,
> +	xfs_off_t		offset,
> +	xfs_off_t		count,
> +	xfs_daddr_t		daddr,
> +	struct xfs_open_zone	*oz,
> +	xfs_fsblock_t		old_startblock)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
> +	struct xfs_bmbt_irec	new = {
> +		.br_startoff	= XFS_B_TO_FSBT(mp, offset),
> +		.br_startblock	= xfs_daddr_to_rtb(mp, daddr),
> +		.br_state	= XFS_EXT_NORM,
> +	};
> +	unsigned int		resblks =
> +		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
> +	struct xfs_trans	*tp;
> +	int			error;
> +
> +	if (xfs_is_shutdown(mp))
> +		return -EIO;
> +
> +	while (new.br_startoff < end_fsb) {
> +		new.br_blockcount = end_fsb - new.br_startoff;
> +
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
> +				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
> +		if (error)
> +			return error;
> +		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		xfs_trans_ijoin(tp, ip, 0);
> +
> +		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
> +		if (error)
> +			xfs_trans_cancel(tp);
> +		else
> +			error = xfs_trans_commit(tp);
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +		if (error)
> +			return error;
> +
> +		new.br_startoff += new.br_blockcount;
> +		new.br_startblock += new.br_blockcount;
> +		if (old_startblock != NULLFSBLOCK)
> +			old_startblock += new.br_blockcount;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * "Free" blocks allocated in a zone.
> + *
> + * Just decrement the used blocks counter and report the space as freed.
> + */
> +int
> +xfs_zone_free_blocks(
> +	struct xfs_trans	*tp,
> +	struct xfs_rtgroup	*rtg,
> +	xfs_fsblock_t		fsbno,
> +	xfs_filblks_t		len)
> +{
> +	struct xfs_mount	*mp = tp->t_mountp;
> +	struct xfs_inode	*rmapip = rtg_rmap(rtg);
> +
> +	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
> +
> +	if (len > rmapip->i_used_blocks) {
> +		xfs_err(mp,
> +"trying to free more blocks (%lld) than used counter (%u).",
> +			len, rmapip->i_used_blocks);
> +		ASSERT(len <= rmapip->i_used_blocks);
> +		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
> +
> +	rmapip->i_used_blocks -= len;
> +	xfs_add_frextents(mp, len);
> +	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
> +	return 0;
> +}
> +
> +/*
> + * Check if the zone containing the data just before the offset we are
> + * writing to is still open and has space.
> + */
> +static struct xfs_open_zone *
> +xfs_last_used_zone(
> +	struct iomap_ioend	*ioend)
> +{
> +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
> +	struct xfs_rtgroup	*rtg = NULL;
> +	struct xfs_open_zone	*oz = NULL;
> +	struct xfs_iext_cursor	icur;
> +	struct xfs_bmbt_irec	got;
> +
> +	xfs_ilock(ip, XFS_ILOCK_SHARED);
> +	if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
> +				&icur, &got)) {
> +		xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +		return NULL;
> +	}
> +	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +
> +	rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
> +	if (!rtg)
> +		return NULL;
> +
> +	xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
> +	oz = READ_ONCE(rtg->rtg_open_zone);
> +	if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
> +		oz = NULL;
> +	xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
> +
> +	xfs_rtgroup_rele(rtg);
> +	return oz;
> +}
> +
> +static struct xfs_group *
> +xfs_find_free_zone(
> +	struct xfs_mount	*mp,
> +	unsigned long		start,
> +	unsigned long		end)
> +{
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
> +	struct xfs_group	*xg;
> +
> +	xas_lock(&xas);
> +	xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
> +		if (atomic_inc_not_zero(&xg->xg_active_ref))
> +			goto found;
> +	xas_unlock(&xas);
> +	return NULL;
> +
> +found:
> +	xas_clear_mark(&xas, XFS_RTG_FREE);
> +	atomic_dec(&zi->zi_nr_free_zones);
> +	zi->zi_free_zone_cursor = xg->xg_gno;
> +	xas_unlock(&xas);
> +	return xg;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_init_open_zone(
> +	struct xfs_rtgroup	*rtg,
> +	xfs_rgblock_t		write_pointer,
> +	bool			is_gc)
> +{
> +	struct xfs_open_zone	*oz;
> +
> +	oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
> +	spin_lock_init(&oz->oz_alloc_lock);
> +	atomic_set(&oz->oz_ref, 1);
> +	oz->oz_rtg = rtg;
> +	oz->oz_write_pointer = write_pointer;
> +	oz->oz_written = write_pointer;
> +	oz->oz_is_gc = is_gc;
> +
> +	/*
> +	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
> +	 * inode, but we don't really want to take that here because we are
> +	 * under the zone_list_lock.  Ensure the pointer is only set for a fully
> +	 * initialized open zone structure so that a racy lookup finding it is
> +	 * fine.
> +	 */
> +	WRITE_ONCE(rtg->rtg_open_zone, oz);
> +	return oz;
> +}
> +
> +/*
> + * Find a completely free zone, open it, and return a reference.
> + */
> +struct xfs_open_zone *
> +xfs_open_zone(
> +	struct xfs_mount	*mp,
> +	bool			is_gc)
> +{
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	struct xfs_group	*xg;
> +
> +	xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
> +	if (!xg)
> +		xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
> +	if (!xg)
> +		return NULL;
> +
> +	set_current_state(TASK_RUNNING);
> +	return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
> +}
> +
> +static struct xfs_open_zone *
> +xfs_try_open_zone(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	struct xfs_open_zone	*oz;
> +
> +	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
> +		return NULL;
> +	if (atomic_read(&zi->zi_nr_free_zones) <
> +	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
> +		return NULL;
> +
> +	/*
> +	 * Increment the open zone count to reserve our slot before dropping
> +	 * zi_open_zones_lock.
> +	 */
> +	zi->zi_nr_open_zones++;
> +	spin_unlock(&zi->zi_open_zones_lock);
> +	oz = xfs_open_zone(mp, false);
> +	spin_lock(&zi->zi_open_zones_lock);
> +	if (!oz) {
> +		zi->zi_nr_open_zones--;
> +		return NULL;
> +	}
> +
> +	atomic_inc(&oz->oz_ref);
> +	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> +
> +	/*
> +	 * If this was the last free zone, other waiters might be waiting
> +	 * on us to write to it as well.
> +	 */
> +	wake_up_all(&zi->zi_zone_wait);
> +
> +	/* XXX: this is a little verbose, but let's keep it for now */
> +	xfs_info(mp, "using zone %u (%u)",
> +		 rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);

Should this XXX become a tracepoint?

> +	trace_xfs_zone_activate(oz->oz_rtg);
> +	return oz;
> +}
> +
> +static bool
> +xfs_try_use_zone(
> +	struct xfs_zone_info	*zi,
> +	struct xfs_open_zone	*oz)
> +{
> +	if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
> +		return false;
> +	if (!atomic_inc_not_zero(&oz->oz_ref))
> +		return false;
> +
> +	/*
> +	 * If we couldn't match by inode or life time we just pick the first
> +	 * zone with enough space above.  For that we want the least busy zone
> +	 * for some definition of "least" busy.  For now this simple LRU
> +	 * algorithm that rotates every zone to the end of the list will do it,
> +	 * even if it isn't exactly cache friendly.
> +	 */
> +	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
> +		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
> +	return true;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_open_zone_lru(
> +	struct xfs_zone_info	*zi)
> +{
> +	struct xfs_open_zone	*oz;
> +
> +	lockdep_assert_held(&zi->zi_open_zones_lock);
> +
> +	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
> +		if (xfs_try_use_zone(zi, oz))
> +			return oz;
> +
> +	cond_resched_lock(&zi->zi_open_zones_lock);
> +	return NULL;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_open_zone_mru(
> +	struct xfs_zone_info	*zi)
> +{
> +	struct xfs_open_zone	*oz;
> +
> +	lockdep_assert_held(&zi->zi_open_zones_lock);
> +
> +	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
> +		if (xfs_try_use_zone(zi, oz))
> +			return oz;
> +
> +	cond_resched_lock(&zi->zi_open_zones_lock);
> +	return NULL;
> +}
> +
> +/*
> + * Try to pack inodes that are written back after they were closed tight instead
> + * of trying to open new zones for them or spread them to the least recently
> + * used zone.  This optimizes the data layout for workloads that untar or copy
> + * a lot of small files.  Right now this does not separate multiple such
> + * streams.
> + */
> +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
> +{
> +	return !inode_is_open_for_write(VFS_I(ip)) &&
> +		!(ip->i_diflags & XFS_DIFLAG_APPEND);
> +}
> +
> +/*
> + * Pick a new zone for writes.
> + *
> + * If we aren't using up our budget of open zones just open a new one from the
> + * freelist.  Else try to find one that matches the expected data lifetime.  If
> + * we don't find one that is good pick any zone that is available.
> + */
> +static struct xfs_open_zone *
> +xfs_select_zone_nowait(
> +	struct xfs_mount	*mp,
> +	bool			pack_tight)
> +{
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	struct xfs_open_zone	*oz = NULL;
> +
> +	if (xfs_is_shutdown(mp))
> +		return NULL;
> +
> +	spin_lock(&zi->zi_open_zones_lock);
> +	if (pack_tight)
> +		oz = xfs_select_open_zone_mru(zi);
> +	if (oz)
> +		goto out_unlock;
> +
> +	/*
> +	 * See if we can open a new zone and use that.
> +	 */
> +	oz = xfs_try_open_zone(mp);
> +	if (oz)
> +		goto out_unlock;
> +
> +	oz = xfs_select_open_zone_lru(zi);
> +out_unlock:
> +	spin_unlock(&zi->zi_open_zones_lock);
> +	return oz;
> +}
> +
> +static struct xfs_open_zone *
> +xfs_select_zone(
> +	struct xfs_mount	*mp,
> +	bool			pack_tight)
> +{
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	DEFINE_WAIT		(wait);
> +	struct xfs_open_zone	*oz;
> +
> +	oz = xfs_select_zone_nowait(mp, pack_tight);
> +	if (oz)
> +		return oz;
> +
> +	for (;;) {
> +		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
> +		oz = xfs_select_zone_nowait(mp, pack_tight);
> +		if (oz)
> +			break;
> +		schedule();
> +	}
> +	finish_wait(&zi->zi_zone_wait, &wait);
> +	return oz;
> +}
> +
> +static unsigned int
> +xfs_zone_alloc_blocks(
> +	struct xfs_open_zone	*oz,
> +	xfs_filblks_t		count_fsb,
> +	sector_t		*sector,
> +	bool			*is_seq)
> +{
> +	struct xfs_rtgroup	*rtg = oz->oz_rtg;
> +	struct xfs_mount	*mp = rtg_mount(rtg);
> +	xfs_rgblock_t		rgbno;
> +
> +	spin_lock(&oz->oz_alloc_lock);
> +	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
> +		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
> +	if (!count_fsb) {
> +		spin_unlock(&oz->oz_alloc_lock);
> +		return 0;
> +	}
> +	rgbno = oz->oz_write_pointer;
> +	oz->oz_write_pointer += count_fsb;
> +	spin_unlock(&oz->oz_alloc_lock);
> +
> +	trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
> +
> +	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
> +	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
> +	if (!*is_seq)
> +		*sector += XFS_FSB_TO_BB(mp, rgbno);
> +	return XFS_FSB_TO_B(mp, count_fsb);
> +}
> +
> +void
> +xfs_mark_rtg_boundary(
> +	struct iomap_ioend	*ioend)
> +{
> +	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
> +	sector_t		sector = ioend->io_bio.bi_iter.bi_sector;
> +
> +	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
> +		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
> +}
> +
> +static void
> +xfs_submit_zoned_bio(
> +	struct iomap_ioend	*ioend,
> +	struct xfs_open_zone	*oz,
> +	bool			is_seq)
> +{
> +	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
> +	ioend->io_private = oz;
> +	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
> +
> +	if (is_seq) {
> +		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
> +		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
> +	} else {
> +		xfs_mark_rtg_boundary(ioend);
> +	}
> +
> +	submit_bio(&ioend->io_bio);
> +}
> +
> +void
> +xfs_zone_alloc_and_submit(
> +	struct iomap_ioend	*ioend,
> +	struct xfs_open_zone	**oz)
> +{
> +	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	bool			pack_tight = xfs_zoned_pack_tight(ip);
> +	unsigned int		alloc_len;
> +	struct iomap_ioend	*split;
> +	bool			is_seq;
> +
> +	if (xfs_is_shutdown(mp))
> +		goto out_error;
> +
> +	/*
> +	 * If we don't have a cached zone in this write context, see if the
> +	 * last extent before the one we are writing points of an active zone.

"...writing points *to the end* of an active zone" ?

> +	 * If so, just continue writing to it.
> +	 */
> +	if (!*oz && ioend->io_offset)
> +		*oz = xfs_last_used_zone(ioend);

Also, why not return oz instead of passing it out via double pointer?

> +	if (!*oz) {
> +select_zone:
> +		*oz = xfs_select_zone(mp, pack_tight);
> +		if (!*oz)
> +			goto out_error;
> +	}
> +
> +	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
> +			&ioend->io_sector, &is_seq);
> +	if (!alloc_len) {
> +		xfs_open_zone_put(*oz);
> +		goto select_zone;
> +	}
> +
> +	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
> +		if (IS_ERR(split))
> +			goto out_split_error;
> +		alloc_len -= split->io_bio.bi_iter.bi_size;
> +		xfs_submit_zoned_bio(split, *oz, is_seq);
> +		if (!alloc_len) {
> +			xfs_open_zone_put(*oz);
> +			goto select_zone;
> +		}
> +	}
> +
> +	xfs_submit_zoned_bio(ioend, *oz, is_seq);
> +	return;
> +
> +out_split_error:
> +	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
> +out_error:
> +	bio_io_error(&ioend->io_bio);
> +}
> +
> +void
> +xfs_zoned_wake_all(
> +	struct xfs_mount	*mp)
> +{
> +	if (!(mp->m_super->s_flags & SB_ACTIVE))
> +		return; /* can happen during log recovery */
> +	wake_up_all(&mp->m_zone_info->zi_zone_wait);
> +}
> +
> +/*
> + * Check if @rgbno in @rgb is a potentially valid block.  It might still be
> + * unused, but that information is only found in the rmap.
> + */
> +bool
> +xfs_zone_rgbno_is_valid(
> +	struct xfs_rtgroup	*rtg,
> +	xfs_rgnumber_t		rgbno)
> +{
> +	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
> +
> +	if (rtg->rtg_open_zone)
> +		return rgbno < rtg->rtg_open_zone->oz_write_pointer;
> +	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
> +			rtg_rgno(rtg), XFS_RTG_FREE);
> +}
> +
> +static void
> +xfs_free_open_zones(
> +	struct xfs_zone_info	*zi)
> +{
> +	struct xfs_open_zone	*oz;
> +
> +	spin_lock(&zi->zi_open_zones_lock);
> +	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
> +			struct xfs_open_zone, oz_entry))) {
> +		list_del(&oz->oz_entry);
> +		xfs_open_zone_put(oz);
> +	}
> +	spin_unlock(&zi->zi_open_zones_lock);
> +}
> +
> +struct xfs_init_zones {
> +	struct xfs_mount	*mp;
> +	uint64_t		available;
> +	uint64_t		reclaimable;
> +};
> +
> +static int
> +xfs_init_zone(
> +	struct xfs_init_zones	*iz,
> +	struct xfs_rtgroup	*rtg,
> +	struct blk_zone		*zone)
> +{
> +	struct xfs_mount	*mp = rtg_mount(rtg);
> +	struct xfs_zone_info	*zi = mp->m_zone_info;
> +	uint64_t		used = rtg_rmap(rtg)->i_used_blocks;
> +	xfs_rgblock_t		write_pointer, highest_rgbno;
> +
> +	if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
> +		return -EFSCORRUPTED;
> +
> +	/*
> +	 * For sequential write required zones we retrieved the hardware write
> +	 * pointer above.
> +	 *
> +	 * For conventional zones or conventional devices we don't have that
> +	 * luxury.  Instead query the rmap to find the highest recorded block
> +	 * and set the write pointer to the block after that.  In case of a
> +	 * power loss this misses blocks where the data I/O has completed but
> +	 * not recorded in the rmap yet, and it also rewrites blocks if the most
> +	 * recently written ones got deleted again before unmount, but this is
> +	 * the best we can do without hardware support.
> +	 */
> +	if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
> +		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
> +		highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
> +		if (highest_rgbno == NULLRGBLOCK)
> +			write_pointer = 0;
> +		else
> +			write_pointer = highest_rgbno + 1;
> +		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
> +	}
> +
> +	if (write_pointer == 0) {
> +		/* zone is empty */
> +		atomic_inc(&zi->zi_nr_free_zones);
> +		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
> +		iz->available += rtg_blocks(rtg);
> +	} else if (write_pointer < rtg_blocks(rtg)) {
> +		/* zone is open */
> +		struct xfs_open_zone *oz;
> +
> +		atomic_inc(&rtg_group(rtg)->xg_active_ref);
> +		oz = xfs_init_open_zone(rtg, write_pointer, false);
> +		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
> +		zi->zi_nr_open_zones++;
> +
> +		iz->available += (rtg_blocks(rtg) - write_pointer);
> +		iz->reclaimable += write_pointer - used;
> +	} else if (used < rtg_blocks(rtg)) {
> +		/* zone fully written, but has freed blocks */
> +		iz->reclaimable += (rtg_blocks(rtg) - used);
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +xfs_get_zone_info_cb(
> +	struct blk_zone		*zone,
> +	unsigned int		idx,
> +	void			*data)
> +{
> +	struct xfs_init_zones	*iz = data;
> +	struct xfs_mount	*mp = iz->mp;
> +	xfs_fsblock_t		zsbno = xfs_daddr_to_rtb(mp, zone->start);
> +	xfs_rgnumber_t		rgno;
> +	struct xfs_rtgroup	*rtg;
> +	int			error;
> +
> +	if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
> +		xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	rgno = xfs_rtb_to_rgno(mp, zsbno);
> +	rtg = xfs_rtgroup_grab(mp, rgno);
> +	if (!rtg) {
> +		xfs_warn(mp, "realtime group not found for zone %u.", rgno);
> +		return -EFSCORRUPTED;
> +	}
> +	error = xfs_init_zone(iz, rtg, zone);
> +	xfs_rtgroup_rele(rtg);
> +	return error;
> +}
> +
> +/*
> + * Calculate the max open zone limit based on the of number of
> + * backing zones available
> + */
> +static inline uint32_t
> +xfs_max_open_zones(
> +	struct xfs_mount	*mp)
> +{
> +	unsigned int		max_open, max_open_data_zones;
> +	/*
> +	 * We need two zones for every open data zone,
> +	 * one in reserve as we don't reclaim open zones. One data zone
> +	 * and its spare is included in XFS_MIN_ZONES.
> +	 */
> +	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
> +	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
> +
> +	/*
> +	 * Cap the max open limit to 1/4 of available space
> +	 */
> +	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
> +
> +	return max(XFS_MIN_OPEN_ZONES, max_open);
> +}
> +
> +/*
> + * Normally we use the open zone limit that the device reports.  If there is
> + * none let the user pick one from the command line.
> + *
> + * If the device doesn't report an open zone limit and there is no override,
> + * allow to hold about a quarter of the zones open.  In theory we could allow
> + * all to be open, but at that point we run into GC deadlocks because we can't
> + * reclaim open zones.
> + *
> + * When used on conventional SSDs a lower open limit is advisable as we'll
> + * otherwise overwhelm the FTL just as much as a conventional block allocator.
> + *
> + * Note: To debug the open zone management code, force max_open to 1 here.
> + */
> +static int
> +xfs_calc_open_zones(
> +	struct xfs_mount	*mp)
> +{
> +	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
> +	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev);
> +
> +	if (!mp->m_max_open_zones) {
> +		if (bdev_open_zones)
> +			mp->m_max_open_zones = bdev_open_zones;
> +		else
> +			mp->m_max_open_zones = xfs_max_open_zones(mp);
> +	}
> +
> +	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
> +		xfs_notice(mp, "need at least %u open zones.",
> +			XFS_MIN_OPEN_ZONES);
> +		return -EIO;
> +	}
> +
> +	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
> +		mp->m_max_open_zones = bdev_open_zones;
> +		xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
> +			bdev_open_zones);
> +	}
> +
> +	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
> +		mp->m_max_open_zones = xfs_max_open_zones(mp);
> +		xfs_info(mp,
> +"limiting open zones to %u due to total zone count (%u)",
> +			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
> +	}
> +
> +	return 0;
> +}
> +
> +static struct xfs_zone_info *
> +xfs_alloc_zone_info(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_zone_info	*zi;
> +
> +	zi = kzalloc(sizeof(*zi), GFP_KERNEL);
> +	if (!zi)
> +		return NULL;
> +	INIT_LIST_HEAD(&zi->zi_open_zones);
> +	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
> +	spin_lock_init(&zi->zi_reset_list_lock);
> +	spin_lock_init(&zi->zi_open_zones_lock);
> +	spin_lock_init(&zi->zi_reservation_lock);
> +	init_waitqueue_head(&zi->zi_zone_wait);
> +	return zi;
> +}
> +
> +static void
> +xfs_free_zone_info(
> +	struct xfs_zone_info	*zi)
> +{
> +	xfs_free_open_zones(zi);
> +	kfree(zi);
> +}
> +
> +int
> +xfs_mount_zones(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_init_zones	iz = {
> +		.mp		= mp,
> +	};
> +	struct xfs_buftarg	*bt = mp->m_rtdev_targp;
> +	int			error;
> +
> +	if (!bt) {
> +		xfs_notice(mp, "RT device missing.");
> +		return -EINVAL;
> +	}
> +
> +	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
> +		xfs_notice(mp, "invalid flag combination.");
> +		return -EFSCORRUPTED;
> +	}
> +	if (mp->m_sb.sb_rextsize != 1) {
> +		xfs_notice(mp, "zoned file systems do not support rextsize.");
> +		return -EFSCORRUPTED;
> +	}
> +	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
> +		xfs_notice(mp,
> +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
> +		return -EFSCORRUPTED;
> +	}
> +
> +	error = xfs_calc_open_zones(mp);
> +	if (error)
> +		return error;
> +
> +	mp->m_zone_info = xfs_alloc_zone_info(mp);
> +	if (!mp->m_zone_info)
> +		return -ENOMEM;
> +
> +	xfs_info(mp, "%u zones of %u blocks size (%u max open)",
> +		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
> +		 mp->m_max_open_zones);

Tracepoint?

> +	if (bdev_is_zoned(bt->bt_bdev)) {
> +		error = blkdev_report_zones(bt->bt_bdev,
> +				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
> +				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
> +		if (error < 0)
> +			goto out_free_zone_info;
> +	} else {
> +		struct xfs_rtgroup	*rtg = NULL;
> +
> +		while ((rtg = xfs_rtgroup_next(mp, rtg))) {
> +			error = xfs_init_zone(&iz, rtg, NULL);
> +			if (error)
> +				goto out_free_zone_info;
> +		}
> +	}
> +
> +	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
> +	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
> +		iz.available + iz.reclaimable);

More indenting needed here.

> +	return 0;
> +
> +out_free_zone_info:
> +	xfs_free_zone_info(mp->m_zone_info);
> +	return error;
> +}
> +
> +void
> +xfs_unmount_zones(
> +	struct xfs_mount	*mp)
> +{
> +	xfs_free_zone_info(mp->m_zone_info);
> +}
> diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
> new file mode 100644
> index 000000000000..78cd7bfc6ac8
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_alloc.h
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _XFS_ZONE_ALLOC_H
> +#define _XFS_ZONE_ALLOC_H
> +
> +struct iomap_ioend;
> +struct xfs_open_zone;
> +
> +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
> +		struct xfs_open_zone **oz);
> +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
> +		xfs_fsblock_t fsbno, xfs_filblks_t len);
> +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
> +		xfs_daddr_t daddr, struct xfs_open_zone *oz,
> +		xfs_fsblock_t old_startblock);
> +void xfs_open_zone_put(struct xfs_open_zone *oz);
> +
> +void xfs_zoned_wake_all(struct xfs_mount *mp);
> +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
> +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
> +
> +#ifdef CONFIG_XFS_RT
> +int xfs_mount_zones(struct xfs_mount *mp);
> +void xfs_unmount_zones(struct xfs_mount *mp);
> +#else
> +static inline int xfs_mount_zones(struct xfs_mount *mp)
> +{
> +	return -EIO;
> +}
> +static inline void xfs_unmount_zones(struct xfs_mount *mp)
> +{
> +}
> +#endif /* CONFIG_XFS_RT */
> +
> +#endif /* _XFS_ZONE_ALLOC_H */
> diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
> new file mode 100644
> index 000000000000..23d2fd6088ae
> --- /dev/null
> +++ b/fs/xfs/xfs_zone_priv.h
> @@ -0,0 +1,89 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _XFS_ZONE_PRIV_H
> +#define _XFS_ZONE_PRIV_H
> +
> +struct xfs_open_zone {
> +	/*
> +	 * Entry in the open zone list and refcount.  Protected by
> +	 * zi_open_zones_lock in struct xfs_zone_info.
> +	 */
> +	struct list_head	oz_entry;
> +	atomic_t		oz_ref;
> +
> +	/*
> +	 * oz_write_pointer is the write pointer at which space is handed out
> +	 * for conventional zones, or simple the count of blocks handed out
> +	 * so far for sequential write required zones and is protected by
> +	 * oz_alloc_lock/
> +	 */
> +	spinlock_t		oz_alloc_lock;
> +	xfs_rgblock_t		oz_write_pointer;
> +
> +	/*
> +	 * oz_written is the number of blocks for which we've received a
> +	 * write completion.  oz_written must always be <= oz_write_pointer
> +	 * and is protected by the ILOCK of the rmap inode.
> +	 */
> +	xfs_rgblock_t		oz_written;
> +
> +	/*
> +	 * Is this open zone used for garbage collection?  There can only be a
> +	 * single open GC zone, which is pointed to by zi_open_gc_zone in
> +	 * struct xfs_zone_info.  Constant over the life time of an open zone.
> +	 */
> +	bool			oz_is_gc;
> +
> +	/*
> +	 * Pointer to the RT groups structure for this open zone.  Constant over
> +	 * the life time of an open zone.
> +	 */
> +	struct xfs_rtgroup	*oz_rtg;
> +};
> +
> +struct xfs_zone_info {
> +	/*
> +	 * List of pending space reservations:
> +	 */
> +	spinlock_t		zi_reservation_lock;
> +	struct list_head	zi_reclaim_reservations;
> +
> +	/*
> +	 * List and number of open zones:
> +	 */
> +	spinlock_t		zi_open_zones_lock;
> +	struct list_head	zi_open_zones;
> +	unsigned int		zi_nr_open_zones;
> +
> +	/*
> +	 * Free zone search cursor and number of free zones:
> +	 */
> +	unsigned long		zi_free_zone_cursor;
> +	atomic_t		zi_nr_free_zones;
> +
> +	/*
> +	 * Wait queue to wait for free zones or open zone resources to become
> +	 * available:
> +	 */
> +	wait_queue_head_t	zi_zone_wait;
> +
> +	/*
> +	 * Pointer to the GC thread, and the current open zone used by GC
> +	 * (if any).
> +	 *
> +	 * zi_open_gc_zone is mostly private to the GC thread, but can be read
> +	 * for debugging from other threads, in which case zi_open_zones_lock
> +	 * must be taken to access it.
> +	 */
> +	struct task_struct      *zi_gc_thread;
> +	struct xfs_open_zone	*zi_open_gc_zone;
> +
> +	/*
> +	 * List of zones that need a reset:
> +	 */
> +	spinlock_t		zi_reset_list_lock;
> +	struct xfs_group	*zi_reset_list;
> +};
> +
> +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
> +
> +#endif /* _XFS_ZONE_PRIV_H */
> -- 
> 2.45.2
> 
>
hch Feb. 13, 2025, 5:14 a.m. UTC | #2
On Fri, Feb 07, 2025 at 09:39:42AM -0800, Darrick J. Wong wrote:
> > +	/* XXX: this is a little verbose, but let's keep it for now */
> > +	xfs_info(mp, "using zone %u (%u)",
> > +		 rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);
> 
> Should this XXX become a tracepoint?
> 
> > +	trace_xfs_zone_activate(oz->oz_rtg);

The tracepoint is just below, but yes - this obviously was left as a
canary in the coalmine to check if anyone actually reviews the code :)

> > +	if (xfs_is_shutdown(mp))
> > +		goto out_error;
> > +
> > +	/*
> > +	 * If we don't have a cached zone in this write context, see if the
> > +	 * last extent before the one we are writing points of an active zone.
> 
> "...writing points *to the end* of an active zone" ?

We only really care about the same zone.  Even if that doesn't create a
contiguous extent, it means the next GC cycle will make it contiguous.
But even before that, the locality is kinda useful, at least on HDD.

There are still grammar issues, though, which I've fixed up.

> 
> > +	 * If so, just continue writing to it.
> > +	 */
> > +	if (!*oz && ioend->io_offset)
> > +		*oz = xfs_last_used_zone(ioend);
> 
> Also, why not return oz instead of passing it out via double pointer?

I remember going back and forth a few times.  Let me give it a try to
see how it works out this time.

> > +	mp->m_zone_info = xfs_alloc_zone_info(mp);
> > +	if (!mp->m_zone_info)
> > +		return -ENOMEM;
> > +
> > +	xfs_info(mp, "%u zones of %u blocks size (%u max open)",
> > +		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
> > +		 mp->m_max_open_zones);
> 
> Tracepoint?

I think this is actually pretty useful mount time information in the
kernel log.  But if you mean a trace point on top of the message and
not instead of it, I can look into it.
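
For reference, a tracepoint emitted alongside (not instead of) the message
could be modeled on the event classes this patch already adds to
xfs_trace.h; the sketch below is hypothetical, with an invented event name
and field selection:

/* Hypothetical sketch, not part of the series. */
TRACE_EVENT(xfs_zones_mount,
        TP_PROTO(struct xfs_mount *mp),
        TP_ARGS(mp),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_rgnumber_t, rgcount)
                __field(uint32_t, zone_blocks)
                __field(unsigned int, max_open_zones)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->rgcount = mp->m_sb.sb_rgcount;
                __entry->zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
                __entry->max_open_zones = mp->m_max_open_zones;
        ),
        TP_printk("dev %d:%d zones %u of %u blocks, %u max open",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rgcount,
                  __entry->zone_blocks,
                  __entry->max_open_zones)
);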
hch Feb. 13, 2025, 8:35 a.m. UTC | #3
On Thu, Feb 13, 2025 at 06:14:48AM +0100, Christoph Hellwig wrote:
> > > +	if (!*oz && ioend->io_offset)
> > > +		*oz = xfs_last_used_zone(ioend);
> > 
> > Also, why not return oz instead of passing it out via double pointer?
> 
> I remember going back and forth a few times.  Let me give it a try to
> see how it works out this time.

Looking into this, the callers treat the open_zone structure as a
context, and passing in the address makes this much nicer.  If you don't
like all the double pointer dereference operators we could add a new
context structure that just has a pointer to the open zone as syntactic
sugar.  And writing this down I think I like the idea, as that makes it
much clearer what is done here, even if it generates the same code.
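
A minimal sketch of that syntactic-sugar idea, with invented names and no
claim to match what was actually tried (the series ultimately keeps the
struct xfs_open_zone ** in/out parameter, see the follow-up below):

/* Forward declarations so the sketch stands on its own. */
struct iomap_ioend;
struct xfs_open_zone;

/*
 * Hypothetical write context wrapping the cached open zone; not part of
 * the posted series.
 */
struct xfs_zone_write_ctx {
        struct xfs_open_zone    *oz;    /* NULL until a zone is selected */
};

/*
 * The submission helper would then take the context; name and signature
 * are altered here purely for illustration.
 */
void xfs_zone_alloc_and_submit_ctx(struct iomap_ioend *ioend,
                struct xfs_zone_write_ctx *ctx);

As noted above, this would generate the same code; it only trades the
double-pointer dereferences for a named context.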
Darrick J. Wong Feb. 13, 2025, 10:08 p.m. UTC | #4
On Thu, Feb 13, 2025 at 06:14:48AM +0100, Christoph Hellwig wrote:
> On Fri, Feb 07, 2025 at 09:39:42AM -0800, Darrick J. Wong wrote:
> > > +	/* XXX: this is a little verbose, but let's keep it for now */
> > > +	xfs_info(mp, "using zone %u (%u)",
> > > +		 rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);
> > 
> > Should this XXX become a tracepoint?
> > 
> > > +	trace_xfs_zone_activate(oz->oz_rtg);
> 
> The tracepoint is just below, but yes - this obviously was left as a
> canary in the coalmine to check if anyone actually reviews the code :)
> 
> > > +	if (xfs_is_shutdown(mp))
> > > +		goto out_error;
> > > +
> > > +	/*
> > > +	 * If we don't have a cached zone in this write context, see if the
> > > +	 * last extent before the one we are writing points of an active zone.
> > 
> > "...writing points *to the end* of an active zone" ?
> 
> We only really care about the same zone.  Even if that doesn't create a
> contiguous extent, it means the next GC cycle will make it contiguous.
> But even before that the locality is kinda useful at least on HDD.
> 
> There's still grammar issues, though which I've fixed up.
> 
> > 
> > > +	 * If so, just continue writing to it.
> > > +	 */
> > > +	if (!*oz && ioend->io_offset)
> > > +		*oz = xfs_last_used_zone(ioend);
> > 
> > Also, why not return oz instead of passing it out via double pointer?
> 
> I remember going back and forth a few times.  Let me give it a try to
> see how it works out this time.
> 
> > > +	mp->m_zone_info = xfs_alloc_zone_info(mp);
> > > +	if (!mp->m_zone_info)
> > > +		return -ENOMEM;
> > > +
> > > +	xfs_info(mp, "%u zones of %u blocks size (%u max open)",
> > > +		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
> > > +		 mp->m_max_open_zones);
> > 
> > Tracepoint?
> 
> I think this actually pretty usueful mount time information in the
> kernel log.  But if you mean a trace point on top of the message and
> not instead I can look into it.

Yeah, I like to just set up ftrace for 'xfs_zone*' and see what falls
out of a test run, instead of pulling in printk and then having to
filter out a bunch of other stuff. :)

--D
hch Feb. 17, 2025, 9:20 a.m. UTC | #5
On Thu, Feb 13, 2025 at 09:35:40AM +0100, Christoph Hellwig wrote:
> On Thu, Feb 13, 2025 at 06:14:48AM +0100, Christoph Hellwig wrote:
> > > > +	if (!*oz && ioend->io_offset)
> > > > +		*oz = xfs_last_used_zone(ioend);
> > > 
> > > Also, why not return oz instead of passing it out via double pointer?
> > 
> > I remember going back and forth a few times.  Let me give it a try to
> > see how it works out this time.
> 
> Looking into this, the callers treat the open_zone structure as a
> context, an passing in the address makes this much nicer.  If you don't
> like all the double pointer dereference operators we could add a new
> context structure that just has a pointer to the open zone as syntactic
> sugar.  And writing this down I think I like the idea, that makes it much
> more clear what is done here, even if it generates the same code.

That ended up being messier than I thought, so I'll stick to the
in/out pointer for now.

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index ea8e66c1e969..28bd2627e9ef 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -137,7 +137,8 @@  xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
 				   xfs_quotaops.o
 
 # xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
+				   xfs_zone_alloc.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index e35d1d798327..5d8777f819f4 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -37,15 +37,27 @@  struct xfs_rtgroup {
 	xfs_rtxnum_t		rtg_extents;
 
 	/*
-	 * Cache of rt summary level per bitmap block with the invariant that
-	 * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
-	 * or 0 if rsum[i][bbno] == 0 for all i.
-	 *
+	 * For bitmap based RT devices this points to a cache of rt summary
+	 * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
+	 * > the maximum i for which rsum[i][bbno] != 0, or 0 if
+	 * rsum[i][bbno] == 0 for all i.
 	 * Reads and writes are serialized by the rsumip inode lock.
+	 *
+	 * For zoned RT devices this points to the open zone structure for
+	 * a group that is open for writers, or is NULL.
 	 */
-	uint8_t			*rtg_rsum_cache;
+	union {
+		uint8_t			*rtg_rsum_cache;
+		struct xfs_open_zone	*rtg_open_zone;
+	};
 };
 
+/*
+ * For zoned RT devices this is set on groups that have no written blocks
+ * and can be picked by the allocator for opening.
+ */
+#define XFS_RTG_FREE			XA_MARK_0
+
 static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
 {
 	return container_of(xg, struct xfs_rtgroup, rtg_group);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 76f3c31573ec..f6f4f2d4b5db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -243,12 +243,23 @@  enum xfs_free_counter {
 	 * Number of free RT extents on the RT device.
 	 */
 	XC_FREE_RTEXTENTS,
+
+	/*
+	 * Number of available for use RT extents.
+	 *
+	 * This counter only exists for zoned RT device and indicates the number
+	 * of RT extents that can be directly used by writes.  XC_FREE_RTEXTENTS
+	 * also includes blocks that have been written previously and freed, but
+	 * sit in a rtgroup that still needs a zone reset.
+	 */
+	XC_FREE_RTAVAILABLE,
 	XC_FREE_NR,
 };
 
 #define XFS_FREECOUNTER_STR \
 	{ XC_FREE_BLOCKS,		"blocks" }, \
-	{ XC_FREE_RTEXTENTS,		"rtextents" }
+	{ XC_FREE_RTEXTENTS,		"rtextents" }, \
+	{ XC_FREE_RTAVAILABLE,		"rtavailable" }
 
 /*
  * Type verifier functions
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f8851ff835de..6493bdb57351 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -20,6 +20,7 @@ 
 #include "xfs_sysfs.h"
 #include "xfs_sb.h"
 #include "xfs_health.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache	*xfs_log_ticket_cache;
 
@@ -3540,6 +3541,9 @@  xlog_force_shutdown(
 	spin_unlock(&log->l_icloglock);
 
 	wake_up_var(&log->l_opstate);
+	if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
+		xfs_zoned_wake_all(log->l_mp);
+
 	return log_error;
 }
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b6004e456ed3..06e34c43cc94 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -40,6 +40,7 @@ 
 #include "xfs_rtrmap_btree.h"
 #include "xfs_rtrefcount_btree.h"
 #include "scrub/stats.h"
+#include "xfs_zone_alloc.h"
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
@@ -467,6 +468,7 @@  xfs_mount_reset_sbqflags(
 static const char *const xfs_free_pool_name[] = {
 	[XC_FREE_BLOCKS]	= "free blocks",
 	[XC_FREE_RTEXTENTS]	= "free rt extents",
+	[XC_FREE_RTAVAILABLE]	= "available rt extents",
 };
 
 uint64_t
@@ -1045,6 +1047,12 @@  xfs_mountfs(
 	if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
 		xfs_log_clean(mp);
 
+	if (xfs_has_zoned(mp)) {
+		error = xfs_mount_zones(mp);
+		if (error)
+			goto out_rtunmount;
+	}
+
 	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
@@ -1087,6 +1095,8 @@  xfs_mountfs(
  out_agresv:
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
+	if (xfs_has_zoned(mp))
+		xfs_unmount_zones(mp);
  out_rtunmount:
 	xfs_rtunmount_inodes(mp);
  out_rele_rip:
@@ -1168,6 +1178,8 @@  xfs_unmountfs(
 	xfs_blockgc_stop(mp);
 	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
+	if (xfs_has_zoned(mp))
+		xfs_unmount_zones(mp);
 	xfs_rtunmount_inodes(mp);
 	xfs_irele(mp->m_rootip);
 	if (mp->m_metadirip)
@@ -1251,7 +1263,7 @@  xfs_freecounter_unavailable(
 	struct xfs_mount	*mp,
 	enum xfs_free_counter	ctr)
 {
-	if (ctr == XC_FREE_RTEXTENTS)
+	if (ctr == XC_FREE_RTEXTENTS || ctr == XC_FREE_RTAVAILABLE)
 		return 0;
 	return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
@@ -1341,7 +1353,8 @@  xfs_dec_freecounter(
 
 		lcounter = (long long)mp->m_resblks[ctr].avail - delta;
 		if (lcounter < 0) {
-			xfs_warn_once(mp,
+			if (ctr == XC_FREE_BLOCKS)
+				xfs_warn_once(mp,
 "Reserve blocks depleted! Consider increasing reserve pool size.");
 			goto fdblocks_enospc;
 		}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7c29f3f9ba78..a0e51ce5b84c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -205,6 +205,7 @@  typedef struct xfs_mount {
 	bool			m_fail_unmount;
 	bool			m_finobt_nores; /* no per-AG finobt resv. */
 	bool			m_update_sb;	/* sb needs update in mount */
+	unsigned int		m_max_open_zones;
 
 	/*
 	 * Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -257,6 +258,7 @@  typedef struct xfs_mount {
 		uint64_t	save;		/* reserved blks @ remount,ro */
 	} m_resblks[XC_FREE_NR];
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
+	struct xfs_zone_info	*m_zone_info;	/* zone allocator information */
 	struct dentry		*m_debugfs;	/* debugfs parent */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a0fd1dc5d362..6a45749daf57 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -33,6 +33,7 @@ 
 #include "xfs_trace.h"
 #include "xfs_rtrefcount_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
 
 /*
  * Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@  xfs_rtunmount_rtg(
 
 	for (i = 0; i < XFS_RTGI_MAX; i++)
 		xfs_rtginode_irele(&rtg->rtg_inodes[i]);
-	kvfree(rtg->rtg_rsum_cache);
+	if (!xfs_has_zoned(rtg_mount(rtg)))
+		kvfree(rtg->rtg_rsum_cache);
 }
 
 static int
@@ -1573,6 +1575,8 @@  xfs_rtmount_rtg(
 		}
 	}
 
+	if (xfs_has_zoned(mp))
+		return 0;
 	return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
 }
 
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8f530e69c18a..a60556dbd172 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -49,6 +49,8 @@ 
 #include "xfs_metafile.h"
 #include "xfs_metadir.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a02129c202b2..d4f012e41564 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -102,6 +102,7 @@  struct xfs_rmap_intent;
 struct xfs_refcount_intent;
 struct xfs_metadir_update;
 struct xfs_rtgroup;
+struct xfs_open_zone;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -265,6 +266,100 @@  DEFINE_GROUP_REF_EVENT(xfs_group_grab);
 DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
 DEFINE_GROUP_REF_EVENT(xfs_group_rele);
 
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+	TP_PROTO(struct xfs_rtgroup *rtg),
+	TP_ARGS(rtg),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, used)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg_mount(rtg)->m_super->s_dev;
+		__entry->rgno = rtg_rgno(rtg);
+		__entry->used = rtg_rmap(rtg)->i_used_blocks;
+	),
+	TP_printk("dev %d:%d rgno 0x%x used 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->used)
+);
+
+#define DEFINE_ZONE_EVENT(name)				\
+DEFINE_EVENT(xfs_zone_class, name,			\
+	TP_PROTO(struct xfs_rtgroup *rtg),		\
+	TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_activate);
+
+TRACE_EVENT(xfs_zone_free_blocks,
+	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+		 xfs_extlen_t len),
+	TP_ARGS(rtg, rgbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, used)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg_mount(rtg)->m_super->s_dev;
+		__entry->rgno = rtg_rgno(rtg);
+		__entry->used = rtg_rmap(rtg)->i_used_blocks;
+		__entry->rgbno = rgbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->used,
+		  __entry->rgbno,
+		  __entry->len)
+);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
+		 xfs_extlen_t len),
+	TP_ARGS(oz, rgbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_rgnumber_t, rgno)
+		__field(xfs_rgblock_t, used)
+		__field(xfs_rgblock_t, written)
+		__field(xfs_rgblock_t, write_pointer)
+		__field(xfs_rgblock_t, rgbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
+		__entry->rgno = rtg_rgno(oz->oz_rtg);
+		__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+		__entry->written = oz->oz_written;
+		__entry->write_pointer = oz->oz_write_pointer;
+		__entry->rgbno = rgbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rgno,
+		  __entry->used,
+		  __entry->written,
+		  __entry->write_pointer,
+		  __entry->rgbno,
+		  __entry->len)
+);
+
+#define DEFINE_ZONE_ALLOC_EVENT(name)				\
+DEFINE_EVENT(xfs_zone_alloc_class, name,			\
+	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,	\
+		 xfs_extlen_t len),				\
+	TP_ARGS(oz, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+#endif /* CONFIG_XFS_RT */
+
 TRACE_EVENT(xfs_inodegc_worker,
 	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
 	TP_ARGS(mp, shrinker_hits),
@@ -3982,6 +4077,7 @@  DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
 
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -5665,6 +5761,7 @@  TRACE_EVENT(xfs_growfs_check_rtgeom,
 
 TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
 TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
 
 DECLARE_EVENT_CLASS(xfs_freeblocks_class,
 	TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644
index 000000000000..f41a2cc84382
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -0,0 +1,959 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_error.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_refcount.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+void
+xfs_open_zone_put(
+	struct xfs_open_zone	*oz)
+{
+	if (atomic_dec_and_test(&oz->oz_ref)) {
+		xfs_rtgroup_rele(oz->oz_rtg);
+		kfree(oz);
+	}
+}
+
+static void
+xfs_open_zone_mark_full(
+	struct xfs_open_zone	*oz)
+{
+	struct xfs_rtgroup	*rtg = oz->oz_rtg;
+	struct xfs_mount	*mp = rtg_mount(rtg);
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+
+	trace_xfs_zone_full(rtg);
+
+	WRITE_ONCE(rtg->rtg_open_zone, NULL);
+
+	spin_lock(&zi->zi_open_zones_lock);
+	if (oz->oz_is_gc) {
+		ASSERT(current == zi->zi_gc_thread);
+		zi->zi_open_gc_zone = NULL;
+	} else {
+		zi->zi_nr_open_zones--;
+		list_del_init(&oz->oz_entry);
+	}
+	spin_unlock(&zi->zi_open_zones_lock);
+	xfs_open_zone_put(oz);
+
+	wake_up_all(&zi->zi_zone_wait);
+}
+
+static void
+xfs_zone_record_blocks(
+	struct xfs_trans	*tp,
+	xfs_fsblock_t		fsbno,
+	xfs_filblks_t		len,
+	struct xfs_open_zone	*oz,
+	bool			used)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_rtgroup	*rtg = oz->oz_rtg;
+	struct xfs_inode	*rmapip = rtg_rmap(rtg);
+
+	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+	if (used) {
+		rmapip->i_used_blocks += len;
+		ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
+	} else {
+		xfs_add_frextents(mp, len);
+	}
+	oz->oz_written += len;
+	if (oz->oz_written == rtg_blocks(rtg))
+		xfs_open_zone_mark_full(oz);
+	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+}
+
+static int
+xfs_zoned_map_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*new,
+	struct xfs_open_zone	*oz,
+	xfs_fsblock_t		old_startblock)
+{
+	struct xfs_bmbt_irec	data;
+	int			nmaps = 1;
+	int			error;
+
+	/* Grab the corresponding mapping in the data fork. */
+	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+			       &nmaps, 0);
+	if (error)
+		return error;
+
+	/*
+	 * Cap the update to the existing extent in the data fork because we can
+	 * only overwrite one extent at a time.
+	 */
+	ASSERT(new->br_blockcount >= data.br_blockcount);
+	new->br_blockcount = data.br_blockcount;
+
+	/*
+	 * If a data write raced with this GC write, keep the existing data in
+	 * the data fork, mark our newly written GC extent as reclaimable, then
+	 * move on to the next extent.
+	 */
+	if (old_startblock != NULLFSBLOCK &&
+	    old_startblock != data.br_startblock)
+		goto skip;
+
+	trace_xfs_reflink_cow_remap_from(ip, new);
+	trace_xfs_reflink_cow_remap_to(ip, &data);
+
+	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+			XFS_IEXT_REFLINK_END_COW_CNT);
+	if (error)
+		return error;
+
+	if (data.br_startblock != HOLESTARTBLOCK) {
+		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+		ASSERT(!isnullstartblock(data.br_startblock));
+
+		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+		if (xfs_is_reflink_inode(ip)) {
+			xfs_refcount_decrease_extent(tp, true, &data);
+		} else {
+			error = xfs_free_extent_later(tp, data.br_startblock,
+					data.br_blockcount, NULL,
+					XFS_AG_RESV_NONE,
+					XFS_FREE_EXTENT_REALTIME);
+			if (error)
+				return error;
+		}
+	}
+
+	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
+			true);
+
+	/* Map the new blocks into the data fork. */
+	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+	return 0;
+
+skip:
+	trace_xfs_reflink_cow_remap_skip(ip, new);
+	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
+			false);
+	return 0;
+}
+
+int
+xfs_zoned_end_io(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count,
+	xfs_daddr_t		daddr,
+	struct xfs_open_zone	*oz,
+	xfs_fsblock_t		old_startblock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
+	struct xfs_bmbt_irec	new = {
+		.br_startoff	= XFS_B_TO_FSBT(mp, offset),
+		.br_startblock	= xfs_daddr_to_rtb(mp, daddr),
+		.br_state	= XFS_EXT_NORM,
+	};
+	unsigned int		resblks =
+		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	while (new.br_startoff < end_fsb) {
+		new.br_blockcount = end_fsb - new.br_startoff;
+
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+		if (error)
+			return error;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, 0);
+
+		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
+		if (error)
+			xfs_trans_cancel(tp);
+		else
+			error = xfs_trans_commit(tp);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error)
+			return error;
+
+		new.br_startoff += new.br_blockcount;
+		new.br_startblock += new.br_blockcount;
+		if (old_startblock != NULLFSBLOCK)
+			old_startblock += new.br_blockcount;
+	}
+
+	return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+	struct xfs_trans	*tp,
+	struct xfs_rtgroup	*rtg,
+	xfs_fsblock_t		fsbno,
+	xfs_filblks_t		len)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_inode	*rmapip = rtg_rmap(rtg);
+
+	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
+
+	if (len > rmapip->i_used_blocks) {
+		xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%u).",
+			len, rmapip->i_used_blocks);
+		ASSERT(len <= rmapip->i_used_blocks);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+		return -EFSCORRUPTED;
+	}
+
+	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+	rmapip->i_used_blocks -= len;
+	xfs_add_frextents(mp, len);
+	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_open_zone *
+xfs_last_used_zone(
+	struct iomap_ioend	*ioend)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+	struct xfs_rtgroup	*rtg = NULL;
+	struct xfs_open_zone	*oz = NULL;
+	struct xfs_iext_cursor	icur;
+	struct xfs_bmbt_irec	got;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+				&icur, &got)) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return NULL;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+	if (!rtg)
+		return NULL;
+
+	xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+	oz = READ_ONCE(rtg->rtg_open_zone);
+	if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
+		oz = NULL;
+	xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+
+	xfs_rtgroup_rele(rtg);
+	return oz;
+}
+
+static struct xfs_group *
+xfs_find_free_zone(
+	struct xfs_mount	*mp,
+	unsigned long		start,
+	unsigned long		end)
+{
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
+	struct xfs_group	*xg;
+
+	xas_lock(&xas);
+	xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
+		if (atomic_inc_not_zero(&xg->xg_active_ref))
+			goto found;
+	xas_unlock(&xas);
+	return NULL;
+
+found:
+	xas_clear_mark(&xas, XFS_RTG_FREE);
+	atomic_dec(&zi->zi_nr_free_zones);
+	zi->zi_free_zone_cursor = xg->xg_gno;
+	xas_unlock(&xas);
+	return xg;
+}
+
+static struct xfs_open_zone *
+xfs_init_open_zone(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		write_pointer,
+	bool			is_gc)
+{
+	struct xfs_open_zone	*oz;
+
+	oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
+	spin_lock_init(&oz->oz_alloc_lock);
+	atomic_set(&oz->oz_ref, 1);
+	oz->oz_rtg = rtg;
+	oz->oz_write_pointer = write_pointer;
+	oz->oz_written = write_pointer;
+	oz->oz_is_gc = is_gc;
+
+	/*
+	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
+	 * inode, but we don't really want to take that here because we are
+	 * under the zone_list_lock.  Ensure the pointer is only set for a fully
+	 * initialized open zone structure so that a racy lookup finding it is
+	 * fine.
+	 */
+	WRITE_ONCE(rtg->rtg_open_zone, oz);
+	return oz;
+}
+
+/*
+ * Find a completely free zone, open it, and return a reference.
+ */
+struct xfs_open_zone *
+xfs_open_zone(
+	struct xfs_mount	*mp,
+	bool			is_gc)
+{
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	struct xfs_group	*xg;
+
+	xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
+	if (!xg)
+		xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
+	if (!xg)
+		return NULL;
+
+	set_current_state(TASK_RUNNING);
+	return xfs_init_open_zone(to_rtg(xg), 0, is_gc);
+}
+
+static struct xfs_open_zone *
+xfs_try_open_zone(
+	struct xfs_mount	*mp)
+{
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	struct xfs_open_zone	*oz;
+
+	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
+		return NULL;
+	if (atomic_read(&zi->zi_nr_free_zones) <
+	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+		return NULL;
+
+	/*
+	 * Increment the open zone count to reserve our slot before dropping
+	 * zi_open_zones_lock.
+	 */
+	zi->zi_nr_open_zones++;
+	spin_unlock(&zi->zi_open_zones_lock);
+	oz = xfs_open_zone(mp, false);
+	spin_lock(&zi->zi_open_zones_lock);
+	if (!oz) {
+		zi->zi_nr_open_zones--;
+		return NULL;
+	}
+
+	atomic_inc(&oz->oz_ref);
+	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+
+	/*
+	 * If this was the last free zone, other waiters might be waiting
+	 * on us to write to it as well.
+	 */
+	wake_up_all(&zi->zi_zone_wait);
+
+	/* XXX: this is a little verbose, but let's keep it for now */
+	xfs_info(mp, "using zone %u (%u)",
+		 rtg_rgno(oz->oz_rtg), zi->zi_nr_open_zones);
+	trace_xfs_zone_activate(oz->oz_rtg);
+	return oz;
+}
+
+static bool
+xfs_try_use_zone(
+	struct xfs_zone_info	*zi,
+	struct xfs_open_zone	*oz)
+{
+	if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+		return false;
+	if (!atomic_inc_not_zero(&oz->oz_ref))
+		return false;
+
+	/*
+	 * If we couldn't match by inode or lifetime we just pick the first
+	 * zone with enough space (checked above).  For that we want the least
+	 * busy zone for some definition of "least" busy.  For now this simple
+	 * LRU algorithm that rotates every zone to the end of the list will
+	 * do it, even if it isn't exactly cache friendly.
+	 */
+	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
+		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
+	return true;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_lru(
+	struct xfs_zone_info	*zi)
+{
+	struct xfs_open_zone	*oz;
+
+	lockdep_assert_held(&zi->zi_open_zones_lock);
+
+	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+		if (xfs_try_use_zone(zi, oz))
+			return oz;
+
+	cond_resched_lock(&zi->zi_open_zones_lock);
+	return NULL;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_mru(
+	struct xfs_zone_info	*zi)
+{
+	struct xfs_open_zone	*oz;
+
+	lockdep_assert_held(&zi->zi_open_zones_lock);
+
+	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
+		if (xfs_try_use_zone(zi, oz))
+			return oz;
+
+	cond_resched_lock(&zi->zi_open_zones_lock);
+	return NULL;
+}
+
+/*
+ * Try to tightly pack inodes that are written back after they were closed,
+ * instead of opening new zones for them or spreading them to the least
+ * recently used zone.  This optimizes the data layout for workloads that
+ * untar or copy a lot of small files.  Right now this does not separate
+ * multiple such streams.
+ */
+static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
+{
+	return !inode_is_open_for_write(VFS_I(ip)) &&
+		!(ip->i_diflags & XFS_DIFLAG_APPEND);
+}
+
+/*
+ * Pick a new zone for writes.
+ *
+ * If we aren't using up our budget of open zones just open a new one from the
+ * freelist.  Else try to find one that matches the expected data lifetime.  If
+ * we don't find a good one, pick any zone that is available.
+ */
+static struct xfs_open_zone *
+xfs_select_zone_nowait(
+	struct xfs_mount	*mp,
+	bool			pack_tight)
+{
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	struct xfs_open_zone	*oz = NULL;
+
+	if (xfs_is_shutdown(mp))
+		return NULL;
+
+	spin_lock(&zi->zi_open_zones_lock);
+	if (pack_tight)
+		oz = xfs_select_open_zone_mru(zi);
+	if (oz)
+		goto out_unlock;
+
+	/*
+	 * See if we can open a new zone and use that.
+	 */
+	oz = xfs_try_open_zone(mp);
+	if (oz)
+		goto out_unlock;
+
+	oz = xfs_select_open_zone_lru(zi);
+out_unlock:
+	spin_unlock(&zi->zi_open_zones_lock);
+	return oz;
+}
+
+static struct xfs_open_zone *
+xfs_select_zone(
+	struct xfs_mount	*mp,
+	bool			pack_tight)
+{
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	DEFINE_WAIT		(wait);
+	struct xfs_open_zone	*oz;
+
+	oz = xfs_select_zone_nowait(mp, pack_tight);
+	if (oz)
+		return oz;
+
+	for (;;) {
+		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+		oz = xfs_select_zone_nowait(mp, pack_tight);
+		if (oz)
+			break;
+		schedule();
+	}
+	finish_wait(&zi->zi_zone_wait, &wait);
+	return oz;
+}
+
+static unsigned int
+xfs_zone_alloc_blocks(
+	struct xfs_open_zone	*oz,
+	xfs_filblks_t		count_fsb,
+	sector_t		*sector,
+	bool			*is_seq)
+{
+	struct xfs_rtgroup	*rtg = oz->oz_rtg;
+	struct xfs_mount	*mp = rtg_mount(rtg);
+	xfs_rgblock_t		rgbno;
+
+	spin_lock(&oz->oz_alloc_lock);
+	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+	if (!count_fsb) {
+		spin_unlock(&oz->oz_alloc_lock);
+		return 0;
+	}
+	rgbno = oz->oz_write_pointer;
+	oz->oz_write_pointer += count_fsb;
+	spin_unlock(&oz->oz_alloc_lock);
+
+	trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+
+	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
+	if (!*is_seq)
+		*sector += XFS_FSB_TO_BB(mp, rgbno);
+	return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+void
+xfs_mark_rtg_boundary(
+	struct iomap_ioend	*ioend)
+{
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	sector_t		sector = ioend->io_bio.bi_iter.bi_sector;
+
+	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+}
+
+static void
+xfs_submit_zoned_bio(
+	struct iomap_ioend	*ioend,
+	struct xfs_open_zone	*oz,
+	bool			is_seq)
+{
+	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+	ioend->io_private = oz;
+	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
+
+	if (is_seq) {
+		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+	} else {
+		xfs_mark_rtg_boundary(ioend);
+	}
+
+	submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+	struct iomap_ioend	*ioend,
+	struct xfs_open_zone	**oz)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	bool			pack_tight = xfs_zoned_pack_tight(ip);
+	unsigned int		alloc_len;
+	struct iomap_ioend	*split;
+	bool			is_seq;
+
+	if (xfs_is_shutdown(mp))
+		goto out_error;
+
+	/*
+	 * If we don't have a cached zone in this write context, see if the
+	 * last extent before the one we are writing points to an active zone.
+	 * If so, just continue writing to it.
+	 */
+	if (!*oz && ioend->io_offset)
+		*oz = xfs_last_used_zone(ioend);
+	if (!*oz) {
+select_zone:
+		*oz = xfs_select_zone(mp, pack_tight);
+		if (!*oz)
+			goto out_error;
+	}
+
+	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
+			&ioend->io_sector, &is_seq);
+	if (!alloc_len) {
+		xfs_open_zone_put(*oz);
+		goto select_zone;
+	}
+
+	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
+		if (IS_ERR(split))
+			goto out_split_error;
+		alloc_len -= split->io_bio.bi_iter.bi_size;
+		xfs_submit_zoned_bio(split, *oz, is_seq);
+		if (!alloc_len) {
+			xfs_open_zone_put(*oz);
+			goto select_zone;
+		}
+	}
+
+	xfs_submit_zoned_bio(ioend, *oz, is_seq);
+	return;
+
+out_split_error:
+	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
+out_error:
+	bio_io_error(&ioend->io_bio);
+}
+
+void
+xfs_zoned_wake_all(
+	struct xfs_mount	*mp)
+{
+	if (!(mp->m_super->s_flags & SB_ACTIVE))
+		return; /* can happen during log recovery */
+	wake_up_all(&mp->m_zone_info->zi_zone_wait);
+}
+
+/*
+ * Check if @rgbno in @rtg is a potentially valid block.  It might still be
+ * unused, but that information is only found in the rmap.
+ */
+bool
+xfs_zone_rgbno_is_valid(
+	struct xfs_rtgroup	*rtg,
+	xfs_rgblock_t		rgbno)
+{
+	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
+
+	if (rtg->rtg_open_zone)
+		return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
+			rtg_rgno(rtg), XFS_RTG_FREE);
+}
+
+static void
+xfs_free_open_zones(
+	struct xfs_zone_info	*zi)
+{
+	struct xfs_open_zone	*oz;
+
+	spin_lock(&zi->zi_open_zones_lock);
+	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
+			struct xfs_open_zone, oz_entry))) {
+		list_del(&oz->oz_entry);
+		xfs_open_zone_put(oz);
+	}
+	spin_unlock(&zi->zi_open_zones_lock);
+}
+
+struct xfs_init_zones {
+	struct xfs_mount	*mp;
+	uint64_t		available;
+	uint64_t		reclaimable;
+};
+
+static int
+xfs_init_zone(
+	struct xfs_init_zones	*iz,
+	struct xfs_rtgroup	*rtg,
+	struct blk_zone		*zone)
+{
+	struct xfs_mount	*mp = rtg_mount(rtg);
+	struct xfs_zone_info	*zi = mp->m_zone_info;
+	uint64_t		used = rtg_rmap(rtg)->i_used_blocks;
+	xfs_rgblock_t		write_pointer, highest_rgbno;
+
+	if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
+		return -EFSCORRUPTED;
+
+	/*
+	 * For sequential write required zones we retrieved the hardware write
+	 * pointer above.
+	 *
+	 * For conventional zones or conventional devices we don't have that
+	 * luxury.  Instead query the rmap to find the highest recorded block
+	 * and set the write pointer to the block after that.  In case of a
+	 * power loss this misses blocks where the data I/O has completed but
+	 * was not recorded in the rmap yet, and it also rewrites blocks if
+	 * the most recently written ones got deleted again before unmount,
+	 * but this is the best we can do without hardware support.
+	 */
+	if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
+		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+		highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
+		if (highest_rgbno == NULLRGBLOCK)
+			write_pointer = 0;
+		else
+			write_pointer = highest_rgbno + 1;
+		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+	}
+
+	if (write_pointer == 0) {
+		/* zone is empty */
+		atomic_inc(&zi->zi_nr_free_zones);
+		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+		iz->available += rtg_blocks(rtg);
+	} else if (write_pointer < rtg_blocks(rtg)) {
+		/* zone is open */
+		struct xfs_open_zone *oz;
+
+		atomic_inc(&rtg_group(rtg)->xg_active_ref);
+		oz = xfs_init_open_zone(rtg, write_pointer, false);
+		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+		zi->zi_nr_open_zones++;
+
+		iz->available += (rtg_blocks(rtg) - write_pointer);
+		iz->reclaimable += write_pointer - used;
+	} else if (used < rtg_blocks(rtg)) {
+		/* zone fully written, but has freed blocks */
+		iz->reclaimable += (rtg_blocks(rtg) - used);
+	}
+
+	return 0;
+}
+
+static int
+xfs_get_zone_info_cb(
+	struct blk_zone		*zone,
+	unsigned int		idx,
+	void			*data)
+{
+	struct xfs_init_zones	*iz = data;
+	struct xfs_mount	*mp = iz->mp;
+	xfs_fsblock_t		zsbno = xfs_daddr_to_rtb(mp, zone->start);
+	xfs_rgnumber_t		rgno;
+	struct xfs_rtgroup	*rtg;
+	int			error;
+
+	if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+		xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+		return -EFSCORRUPTED;
+	}
+
+	rgno = xfs_rtb_to_rgno(mp, zsbno);
+	rtg = xfs_rtgroup_grab(mp, rgno);
+	if (!rtg) {
+		xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+		return -EFSCORRUPTED;
+	}
+	error = xfs_init_zone(iz, rtg, zone);
+	xfs_rtgroup_rele(rtg);
+	return error;
+}
+
+/*
+ * Calculate the max open zone limit based on the number of backing zones
+ * available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+	struct xfs_mount	*mp)
+{
+	unsigned int		max_open, max_open_data_zones;
+	/*
+	 * We need two zones for every open data zone: one to write to and
+	 * one in reserve, as we don't reclaim open zones.  One data zone
+	 * and its spare is included in XFS_MIN_ZONES.
+	 */
+	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+	/*
+	 * Cap the max open limit to 1/4 of available space
+	 */
+	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+	return max(XFS_MIN_OPEN_ZONES, max_open);
+}
+
+/*
+ * Normally we use the open zone limit that the device reports.  If there is
+ * none let the user pick one from the command line.
+ *
+ * If the device doesn't report an open zone limit and there is no override,
+ * allow holding about a quarter of the zones open.  In theory we could allow
+ * all to be open, but at that point we run into GC deadlocks because we can't
+ * reclaim open zones.
+ *
+ * When used on conventional SSDs a lower open limit is advisable as we'll
+ * otherwise overwhelm the FTL just as much as a conventional block allocator.
+ *
+ * Note: To debug the open zone management code, force max_open to 1 here.
+ */
+static int
+xfs_calc_open_zones(
+	struct xfs_mount	*mp)
+{
+	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
+	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev);
+
+	if (!mp->m_max_open_zones) {
+		if (bdev_open_zones)
+			mp->m_max_open_zones = bdev_open_zones;
+		else
+			mp->m_max_open_zones = xfs_max_open_zones(mp);
+	}
+
+	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+		xfs_notice(mp, "need at least %u open zones.",
+			XFS_MIN_OPEN_ZONES);
+		return -EIO;
+	}
+
+	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+		mp->m_max_open_zones = bdev_open_zones;
+		xfs_info(mp, "limiting open zones to %u due to hardware limit.",
+			bdev_open_zones);
+	}
+
+	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+		mp->m_max_open_zones = xfs_max_open_zones(mp);
+		xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+	}
+
+	return 0;
+}
+
+static struct xfs_zone_info *
+xfs_alloc_zone_info(
+	struct xfs_mount	*mp)
+{
+	struct xfs_zone_info	*zi;
+
+	zi = kzalloc(sizeof(*zi), GFP_KERNEL);
+	if (!zi)
+		return NULL;
+	INIT_LIST_HEAD(&zi->zi_open_zones);
+	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
+	spin_lock_init(&zi->zi_reset_list_lock);
+	spin_lock_init(&zi->zi_open_zones_lock);
+	spin_lock_init(&zi->zi_reservation_lock);
+	init_waitqueue_head(&zi->zi_zone_wait);
+	return zi;
+}
+
+static void
+xfs_free_zone_info(
+	struct xfs_zone_info	*zi)
+{
+	xfs_free_open_zones(zi);
+	kfree(zi);
+}
+
+int
+xfs_mount_zones(
+	struct xfs_mount	*mp)
+{
+	struct xfs_init_zones	iz = {
+		.mp		= mp,
+	};
+	struct xfs_buftarg	*bt = mp->m_rtdev_targp;
+	int			error;
+
+	if (!bt) {
+		xfs_notice(mp, "RT device missing.");
+		return -EINVAL;
+	}
+
+	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+		xfs_notice(mp, "invalid flag combination.");
+		return -EFSCORRUPTED;
+	}
+	if (mp->m_sb.sb_rextsize != 1) {
+		xfs_notice(mp, "zoned file systems do not support rextsize.");
+		return -EFSCORRUPTED;
+	}
+	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+		xfs_notice(mp,
+"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
+		return -EFSCORRUPTED;
+	}
+
+	error = xfs_calc_open_zones(mp);
+	if (error)
+		return error;
+
+	mp->m_zone_info = xfs_alloc_zone_info(mp);
+	if (!mp->m_zone_info)
+		return -ENOMEM;
+
+	xfs_info(mp, "%u zones of %u blocks each (%u max open)",
+		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+		 mp->m_max_open_zones);
+
+	if (bdev_is_zoned(bt->bt_bdev)) {
+		error = blkdev_report_zones(bt->bt_bdev,
+				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
+				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
+		if (error < 0)
+			goto out_free_zone_info;
+	} else {
+		struct xfs_rtgroup	*rtg = NULL;
+
+		while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+			error = xfs_init_zone(&iz, rtg, NULL);
+			if (error)
+				goto out_free_zone_info;
+		}
+	}
+
+	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
+	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+		iz.available + iz.reclaimable);
+	return 0;
+
+out_free_zone_info:
+	xfs_free_zone_info(mp->m_zone_info);
+	return error;
+}
+
+void
+xfs_unmount_zones(
+	struct xfs_mount	*mp)
+{
+	xfs_free_zone_info(mp->m_zone_info);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644
index 000000000000..78cd7bfc6ac8
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -0,0 +1,34 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+struct iomap_ioend;
+struct xfs_open_zone;
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+		struct xfs_open_zone **oz);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+		xfs_fsblock_t fsbno, xfs_filblks_t len);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+		xfs_daddr_t daddr, struct xfs_open_zone *oz,
+		xfs_fsblock_t old_startblock);
+void xfs_open_zone_put(struct xfs_open_zone *oz);
+
+void xfs_zoned_wake_all(struct xfs_mount *mp);
+bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno);
+void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
+
+#ifdef CONFIG_XFS_RT
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+#else
+static inline int xfs_mount_zones(struct xfs_mount *mp)
+{
+	return -EIO;
+}
+static inline void xfs_unmount_zones(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
new file mode 100644
index 000000000000..23d2fd6088ae
--- /dev/null
+++ b/fs/xfs/xfs_zone_priv.h
@@ -0,0 +1,89 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_PRIV_H
+#define _XFS_ZONE_PRIV_H
+
+struct xfs_open_zone {
+	/*
+	 * Entry in the open zone list and refcount.  Protected by
+	 * zi_open_zones_lock in struct xfs_zone_info.
+	 */
+	struct list_head	oz_entry;
+	atomic_t		oz_ref;
+
+	/*
+	 * oz_write_pointer is the write pointer at which space is handed out
+	 * for conventional zones, or simply the count of blocks handed out
+	 * so far for sequential write required zones.  It is protected by
+	 * oz_alloc_lock.
+	 */
+	spinlock_t		oz_alloc_lock;
+	xfs_rgblock_t		oz_write_pointer;
+
+	/*
+	 * oz_written is the number of blocks for which we've received a
+	 * write completion.  oz_written must always be <= oz_write_pointer
+	 * and is protected by the ILOCK of the rmap inode.
+	 */
+	xfs_rgblock_t		oz_written;
+
+	/*
+	 * Is this open zone used for garbage collection?  There can only be a
+	 * single open GC zone, which is pointed to by zi_open_gc_zone in
+	 * struct xfs_zone_info.  Constant over the lifetime of an open zone.
+	 */
+	bool			oz_is_gc;
+
+	/*
+	 * Pointer to the RT group structure for this open zone.  Constant over
+	 * the lifetime of an open zone.
+	 */
+	struct xfs_rtgroup	*oz_rtg;
+};
+
+struct xfs_zone_info {
+	/*
+	 * List of pending space reservations:
+	 */
+	spinlock_t		zi_reservation_lock;
+	struct list_head	zi_reclaim_reservations;
+
+	/*
+	 * List and number of open zones:
+	 */
+	spinlock_t		zi_open_zones_lock;
+	struct list_head	zi_open_zones;
+	unsigned int		zi_nr_open_zones;
+
+	/*
+	 * Free zone search cursor and number of free zones:
+	 */
+	unsigned long		zi_free_zone_cursor;
+	atomic_t		zi_nr_free_zones;
+
+	/*
+	 * Wait queue to wait for free zones or open zone resources to become
+	 * available:
+	 */
+	wait_queue_head_t	zi_zone_wait;
+
+	/*
+	 * Pointer to the GC thread, and the current open zone used by GC
+	 * (if any).
+	 *
+	 * zi_open_gc_zone is mostly private to the GC thread, but can be read
+	 * for debugging from other threads, in which case zi_open_zones_lock
+	 * must be taken to access it.
+	 */
+	struct task_struct      *zi_gc_thread;
+	struct xfs_open_zone	*zi_open_gc_zone;
+
+	/*
+	 * List of zones that need a reset:
+	 */
+	spinlock_t		zi_reset_list_lock;
+	struct xfs_group	*zi_reset_list;
+};
+
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
+
+#endif /* _XFS_ZONE_PRIV_H */
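
The iomap writeback and direct I/O hooks that drive this allocator are added by
later patches in the series, so only a rough sketch of the expected calling
convention can be given here.  The fragment below is illustrative and not part
of the patch: the example_* names and the example_wb_ctx structure are invented
for the sketch, while the xfs_zone_alloc_and_submit(), xfs_zoned_end_io() and
xfs_open_zone_put() signatures are those declared in xfs_zone_alloc.h above.

/*
 * Illustrative sketch only -- not part of this patch.  Assumes the usual
 * XFS includes plus "xfs_zone_alloc.h".
 */
struct example_wb_ctx {
	struct xfs_open_zone	*open_zone;	/* zone cached across ioends */
};

/* Submission side: pick a zone, allocate at its write pointer, submit. */
static void
example_submit_ioend(
	struct example_wb_ctx	*ctx,
	struct iomap_ioend	*ioend)
{
	xfs_zone_alloc_and_submit(ioend, &ctx->open_zone);
}

/* Completion side: record the allocation in the data fork, drop the ref. */
static int
example_end_ioend(
	struct iomap_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_open_zone	*oz = ioend->io_private;
	int			error;

	error = xfs_zoned_end_io(ip, ioend->io_offset, ioend->io_size,
			ioend->io_sector, oz, NULLFSBLOCK);
	xfs_open_zone_put(oz);	/* reference taken at bio submission */
	return error;
}

The sketch is meant to make the split in this patch visible: space is handed
out from an open zone's write pointer at bio submission time via
xfs_zone_alloc_blocks(), while the data fork and the rmap inode's used-block
counter are only updated from xfs_zoned_end_io() once the I/O has completed.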