@@ -216,7 +216,10 @@ typedef struct xfs_inode {
struct xfs_ifork i_df; /* data fork */
struct xfs_ifork i_af; /* attribute fork */
struct xfs_inode_log_item *i_itemp; /* logging information */
- unsigned int i_delayed_blks; /* count of delay alloc blks */
+ uint64_t i_delayed_blks; /* count of delay alloc blks */
+ /* Space that has been set aside to root a btree in this file. */
+ uint64_t i_meta_resv_asked;
+
xfs_fsize_t i_disk_size; /* number of bytes in file */
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
@@ -99,6 +99,7 @@ typedef struct xfs_mount {
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refc btree levels */
unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */
+ unsigned int m_rtbtree_maxlevels; /* max level of all rt btrees */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
@@ -389,4 +389,11 @@
#define trace_xfs_iunlink_remove(...) ((void) 0)
#define trace_xfs_iunlink_map_prev_fallback(...) ((void) 0)
+#define trace_xfs_imeta_resv_alloc_extent(...) ((void) 0)
+#define trace_xfs_imeta_resv_critical(...) ((void) 0)
+#define trace_xfs_imeta_resv_free(...) ((void) 0)
+#define trace_xfs_imeta_resv_free_extent(...) ((void) 0)
+#define trace_xfs_imeta_resv_init(...) ((void) 0)
+#define trace_xfs_imeta_resv_init_error(...) ((void) 0)
+
#endif /* __TRACE_H__ */
@@ -64,6 +64,7 @@ error_tag(char *name)
{ XFS_ERRTAG_WB_DELAY_MS, "wb_delay_ms" },
{ XFS_ERRTAG_WRITE_DELAY_MS, "write_delay_ms" },
{ XFS_ERRTAG_SWAPEXT_FINISH_ONE, "swapext_finish_one" },
+ { XFS_ERRTAG_IMETA_RESV_CRITICAL, "imeta_resv_critical" },
{ XFS_ERRTAG_MAX, NULL }
};
int count;
@@ -651,6 +651,15 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+/* Set m_rtbtree_maxlevels, the max height of any realtime btree in this fs. */
+static inline void
+xfs_rtbtree_compute_maxlevels(
+ struct xfs_mount *mp)
+{
+ /* Placeholder value; the real computation will be filled in later. */
+ mp->m_rtbtree_maxlevels = 0;
+}
+
/* Compute maximum possible height of all btrees. */
void
libxfs_compute_all_maxlevels(
@@ -667,7 +676,7 @@ libxfs_compute_all_maxlevels(
xfs_refcountbt_compute_maxlevels(mp);
xfs_agbtree_compute_maxlevels(mp);
-
+ xfs_rtbtree_compute_maxlevels(mp);
}
/* Mount the metadata files under the metadata directory tree. */
@@ -221,6 +221,17 @@ uint32_t get_random_u32(void);
#define get_random_u32() (0)
#endif
+static inline int
+__percpu_counter_compare(uint64_t *count, int64_t rhs, int32_t batch)
+{
+	/* Negative rhs is below any count; don't let it wrap to unsigned. */
+	if (rhs < 0 || *count > (uint64_t)rhs)
+		return 1;
+	if (*count < (uint64_t)rhs)
+		return -1;
+	return 0;
+}
+
#define PAGE_SIZE getpagesize()
#define inode_peek_iversion(inode) (inode)->i_version
@@ -112,6 +112,7 @@ xfs_ag_resv_needed(
case XFS_AG_RESV_RMAPBT:
len -= xfs_perag_resv(pag, type)->ar_reserved;
break;
+ case XFS_AG_RESV_IMETA:
case XFS_AG_RESV_NONE:
/* empty */
break;
@@ -346,6 +347,7 @@ xfs_ag_resv_alloc_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_IMETA:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
@@ -388,6 +390,7 @@ xfs_ag_resv_free_extent(
switch (type) {
case XFS_AG_RESV_AGFL:
+ case XFS_AG_RESV_IMETA:
return;
case XFS_AG_RESV_METADATA:
case XFS_AG_RESV_RMAPBT:
@@ -64,7 +64,8 @@
#define XFS_ERRTAG_WB_DELAY_MS 42
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_SWAPEXT_FINISH_ONE 44
-#define XFS_ERRTAG_MAX 45
+#define XFS_ERRTAG_IMETA_RESV_CRITICAL 45
+#define XFS_ERRTAG_MAX 46
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -113,5 +114,6 @@
#define XFS_RANDOM_WB_DELAY_MS 3000
#define XFS_RANDOM_WRITE_DELAY_MS 3000
#define XFS_RANDOM_SWAPEXT_FINISH_ONE 1
+#define XFS_RANDOM_IMETA_RESV_CRITICAL 4
#endif /* __XFS_ERRORTAG_H_ */
@@ -26,6 +26,9 @@
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_health.h"
+#include "xfs_errortag.h"
+#include "xfs_btree.h"
+#include "xfs_alloc.h"
/*
* Metadata File Management
@@ -1074,3 +1077,190 @@ xfs_imeta_free_path(
kfree(path->im_path);
kfree(path);
}
+
+/*
+ * Can the space available to this metadata file (its remaining reservation
+ * plus the global free block count) cover at least rhs blocks?
+ */
+static inline bool
+xfs_imeta_resv_can_cover(
+ struct xfs_inode *ip,
+ int64_t rhs)
+{
+ /*
+ * The space that can be allocated to this metadata file is whatever
+ * is left of its own reservation (i_delayed_blks) plus the global
+ * free block count. Check the reservation by itself first so that
+ * we can avoid touching the per-cpu counter.
+ */
+ if (ip->i_delayed_blks >= rhs)
+ return true;
+
+ /*
+ * The reservation alone falls short, so ask whether the free block
+ * count can make up the shortfall (rhs - i_delayed_blks).
+ */
+ return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+ rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks? It is if the space it can
+ * draw on cannot cover m_rtbtree_maxlevels blocks or 10% of what was asked;
+ * otherwise XFS_TEST_ERROR decides. A NULL inode is never critical.
+ */
+bool
+xfs_imeta_resv_critical(
+ struct xfs_inode *ip)
+{
+ uint64_t asked_low_water;
+
+ if (!ip)
+ return false;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_imeta_resv_critical(ip, 0);
+
+ if (!xfs_imeta_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+ return true;
+
+ asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+ if (!xfs_imeta_resv_can_cover(ip, asked_low_water))
+ return true;
+
+ return XFS_TEST_ERROR(false, ip->i_mount,
+ XFS_ERRTAG_IMETA_RESV_CRITICAL);
+}
+
+/* Charge a newly allocated extent of args->len blocks to a metadata file. */
+void
+xfs_imeta_resv_alloc_extent(
+ struct xfs_inode *ip,
+ struct xfs_alloc_arg *args)
+{
+ int64_t len = args->len;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+ ASSERT(args->resv == XFS_AG_RESV_IMETA);
+
+ trace_xfs_imeta_resv_alloc_extent(ip, args->len);
+
+ /*
+ * Satisfy as much of the request as possible from the inode's own
+ * reservation (i_delayed_blks) and update the ondisk sb counter.
+ */
+ if (ip->i_delayed_blks > 0) {
+ int64_t from_resv;
+
+ from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+ ip->i_delayed_blks -= from_resv;
+ xfs_mod_delalloc(ip->i_mount, -from_resv);
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+ -from_resv);
+ len -= from_resv;
+ }
+
+ /*
+ * Whatever the reservation could not cover comes out of free space,
+ * which requires in-core and on-disk fdblocks updates.
+ */
+ if (len)
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, -len);
+
+ ip->i_nblocks += args->len;
+ xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Return len freed blocks to a metadata file's reservation or free space. */
+void
+xfs_imeta_resv_free_extent(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ xfs_filblks_t len)
+{
+ int64_t to_resv;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(XFS_IS_DQDETACHED(ip->i_mount, ip));
+ trace_xfs_imeta_resv_free_extent(ip, len);
+
+ ip->i_nblocks -= len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ /*
+ * Refill the reservation until i_nblocks + i_delayed_blks reaches
+ * i_meta_resv_asked. Update the ondisk fdblocks only.
+ */
+ to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ if (to_resv > 0) {
+ to_resv = min_t(int64_t, to_resv, len);
+ ip->i_delayed_blks += to_resv;
+ xfs_mod_delalloc(ip->i_mount, to_resv);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+ len -= to_resv;
+ }
+
+ /*
+ * Any remaining blocks go back to the filesystem's free space, so
+ * update both the in-core and on-disk counters.
+ */
+ if (len)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Hand a metadata file's unused reservation back to fdblocks; NULL is ok. */
+void
+xfs_imeta_resv_free_inode(
+ struct xfs_inode *ip)
+{
+ if (!ip)
+ return;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_imeta_resv_free(ip, 0);
+
+ xfs_mod_delalloc(ip->i_mount, -ip->i_delayed_blks);
+ xfs_mod_fdblocks(ip->i_mount, ip->i_delayed_blks, true);
+ ip->i_delayed_blks = 0;
+ ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation; no-op if one already exists. */
+int
+xfs_imeta_resv_init_inode(
+ struct xfs_inode *ip,
+ xfs_filblks_t ask)
+{
+ xfs_filblks_t hidden_space;
+ xfs_filblks_t used;
+ int error;
+
+ if (!ip || ip->i_meta_resv_asked > 0)
+ return 0;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ /*
+ * Blocks the file already owns (i_nblocks) are accounted on-disk as
+ * used space, so only hide the part of the reservation that is not
+ * yet in use. If the file already exceeds ask, nothing is hidden.
+ */
+ used = ip->i_nblocks;
+ if (used > ask)
+ ask = used;
+ hidden_space = ask - used;
+
+ error = xfs_mod_fdblocks(ip->i_mount, -(int64_t)hidden_space, true);
+ if (error) {
+ trace_xfs_imeta_resv_init_error(ip, error, _RET_IP_);
+ return error;
+ }
+
+ xfs_mod_delalloc(ip->i_mount, hidden_space);
+ ip->i_delayed_blks = hidden_space;
+ ip->i_meta_resv_asked = ask;
+
+ trace_xfs_imeta_resv_init(ip, ask);
+ return 0;
+}
@@ -102,6 +102,17 @@ unsigned int xfs_imeta_create_space_res(struct xfs_mount *mp);
unsigned int xfs_imeta_link_space_res(struct xfs_mount *mp);
unsigned int xfs_imeta_unlink_space_res(struct xfs_mount *mp);
+/* Space reservations for metadata inodes. */
+struct xfs_alloc_arg;
+
+bool xfs_imeta_resv_critical(struct xfs_inode *ip);
+void xfs_imeta_resv_alloc_extent(struct xfs_inode *ip,
+ struct xfs_alloc_arg *args);
+void xfs_imeta_resv_free_extent(struct xfs_inode *ip, struct xfs_trans *tp,
+ xfs_filblks_t len);
+void xfs_imeta_resv_free_inode(struct xfs_inode *ip);
+int xfs_imeta_resv_init_inode(struct xfs_inode *ip, xfs_filblks_t ask);
+
/* Must be implemented by the libxfs client */
int xfs_imeta_iget(struct xfs_trans *tp, xfs_ino_t ino, unsigned char ftype,
struct xfs_inode **ipp);
@@ -221,6 +221,13 @@ enum xfs_ag_resv_type {
* altering fdblocks. If you think you need this you're wrong.
*/
XFS_AG_RESV_IGNORE,
+
+ /*
+ * This allocation activity is being done on behalf of a metadata file.
+ * These files maintain their own permanent space reservations and are
+ * required to adjust fdblocks using the xfs_imeta_resv_* helpers.
+ */
+ XFS_AG_RESV_IMETA,
};
/* Results of scanning a btree keyspace to check occupancy. */