@@ -2039,6 +2039,46 @@ xfs_iunlink_insert_inode(
return xfs_iunlink_update_bucket(tp, agno, agibp, 0, agino);
}
+/*
+ * Lock the perag and take AGI lock if agi_unlinked is touched as well for
+ * xfs_iunlink_remove_inode().
+ *
+ * Inode allocation in the O_TMPFILE path defines the AGI/unlinked list lock
+ * order as being AGI->perag unlinked list lock. We are inverting it here as
+ * the fast path tail addition does not need to modify the AGI at all. Hence
+ * we only need the AGI lock if the tail is empty, correct lock order.
+ */
+static struct xfs_perag *
+xfs_iunlink_remove_lock(
+ xfs_agino_t agno,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_buf **agibpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ int error;
+
+ pag = xfs_perag_get(mp, agno);
+ mutex_lock(&pag->pag_unlinked_mutex);
+
+ if (ip->i_prev_unlinked == NULLAGINO) {
+ mutex_unlock(&pag->pag_unlinked_mutex);
+ /*
+ * some paths (e.g. xfs_create_tmpfile) could take AGI lock
+ * in this transaction in advance and the only locking order
+ * AGI buf lock -> pag_unlinked_mutex is safe.
+ */
+ error = xfs_read_agi(mp, tp, agno, agibpp);
+ if (error) {
+ xfs_perag_put(pag);
+ return ERR_PTR(error);
+ }
+ mutex_lock(&pag->pag_unlinked_mutex);
+ }
+ return pag;
+}
+
/*
* Remove can be from anywhere in the list, so we have to do two adjacent inode
* lookups here so we can update list pointers. We may be at the head or the
@@ -2046,12 +2086,12 @@ xfs_iunlink_insert_inode(
*/
static int
xfs_iunlink_remove_inode(
+ struct xfs_perag *pag,
struct xfs_trans *tp,
struct xfs_buf *agibp,
struct xfs_inode *ip)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino = ip->i_next_unlinked;
@@ -2059,6 +2099,7 @@ xfs_iunlink_remove_inode(
if (ip->i_prev_unlinked == NULLAGINO) {
struct xlog *log = mp->m_log;
+ struct xfs_agi *agi = agibp->b_addr;
short bucket_index = 0;
/* During recovery, the old multiple bucket can be applied */
@@ -2083,7 +2124,7 @@ xfs_iunlink_remove_inode(
/* lookup previous inode and update to point at next */
struct xfs_inode *pip;
- pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip);
+ pip = xfs_iunlink_lookup_prev(pag, ip);
if (IS_ERR_OR_NULL(pip))
return -EFSCORRUPTED;
@@ -2102,7 +2143,7 @@ xfs_iunlink_remove_inode(
if (ip->i_next_unlinked != NULLAGINO) {
struct xfs_inode *nip;
- nip = xfs_iunlink_lookup_next(agibp->b_pag, ip);
+ nip = xfs_iunlink_lookup_next(pag, ip);
if (IS_ERR_OR_NULL(nip))
return -EFSCORRUPTED;
@@ -2126,6 +2167,15 @@ xfs_iunlink_remove_inode(
return 0;
}
+static void
+xfs_iunlink_unlock(
+ struct xfs_perag *pag)
+{
+ /* Does not unlock AGI, ever. xfs_trans_commit() does that. */
+ mutex_unlock(&pag->pag_unlinked_mutex);
+ xfs_perag_put(pag);
+}
+
/*
* This is called when the inode's link count has gone to 0 or we are creating
* a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
@@ -2143,7 +2193,6 @@ xfs_iunlink(
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
int error;
- ASSERT(ip->i_next_unlinked == NULLAGINO);
ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
trace_xfs_iunlink(ip);
@@ -2153,7 +2202,10 @@ xfs_iunlink(
if (error)
return error;
- return xfs_iunlink_insert_inode(tp, agibp, ip);
+ mutex_lock(&agibp->b_pag->pag_unlinked_mutex);
+ error = xfs_iunlink_insert_inode(tp, agibp, ip);
+ mutex_unlock(&agibp->b_pag->pag_unlinked_mutex);
+ return error;
}
/*
@@ -2164,19 +2216,20 @@ xfs_iunlink_remove(
struct xfs_trans *tp,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *agibp;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ struct xfs_buf *agibp = NULL;
+ struct xfs_perag *pag;
int error;
trace_xfs_iunlink_remove(ip);
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(mp, tp, agno, &agibp);
- if (error)
- return error;
+ pag = xfs_iunlink_remove_lock(XFS_INO_TO_AGNO(tp->t_mountp, ip->i_ino),
+ tp, ip, &agibp);
+ if (IS_ERR(pag))
+ return PTR_ERR(pag);
- return xfs_iunlink_remove_inode(tp, agibp, ip);
+ error = xfs_iunlink_remove_inode(pag, tp, agibp, ip);
+ xfs_iunlink_unlock(pag);
+ return error;
}
/*
@@ -223,6 +223,7 @@ xfs_initialize_perag(
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
spin_lock_init(&pag->pag_state_lock);
+ mutex_init(&pag->pag_unlinked_mutex);
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -372,6 +372,9 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+ /* lock to protect unlinked inode list */
+ struct mutex pag_unlinked_mutex;
+
/*
* Unlinked inode information. This incore information reflects
* data stored in the AGI, so callers must hold the AGI buffer lock
Currently, we use AGI buffer lock to protect in-memory linked list for unlinked inodes but since it's not necessary to modify AGI unless the head of the unlinked list is modified. Therefore, let's add another per-AG dedicated lock to protect the whole list, and removing AGI buffer lock in xfs_iunlink_remove() if the head is untouched. Note that some paths (e.g. xfs_create_tmpfile()) take AGI lock in its transaction in advance, so the proper locking order is only AGI lock -> unlinked list lock. Signed-off-by: Gao Xiang <hsiangkao@redhat.com> --- fs/xfs/xfs_inode.c | 79 ++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 3 ++ 3 files changed, 70 insertions(+), 13 deletions(-)