@@ -452,10 +452,18 @@ xfs_buf_item_format(
* This is called to pin the buffer associated with the buf log item in memory
* so it cannot be written out.
*
- * We also always take a reference to the buffer log item here so that the bli
- * is held while the item is pinned in memory. This means that we can
- * unconditionally drop the reference count a transaction holds when the
- * transaction is completed.
+ * We take a reference to the buffer log item here so that the BLI life cycle
+ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
+ * inserted into the AIL.
+ *
+ * We also need to take a reference to the buffer itself as the BLI unpin
+ * processing requires accessing the buffer after the BLI has dropped the final
+ * BLI reference. See xfs_buf_item_unpin() for an explanation.
+ * If unpins race to drop the final BLI reference and only the
+ * BLI owns a reference to the buffer, then the loser of the race can have the
+ * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per
+ * pin count ensures the life cycle of the buffer extends for as
+ * long as we hold the buffer pin reference in xfs_buf_item_unpin().
*/
STATIC void
xfs_buf_item_pin(
@@ -470,13 +478,30 @@ xfs_buf_item_pin(
trace_xfs_buf_item_pin(bip);
+ xfs_buf_hold(bip->bli_buf);
atomic_inc(&bip->bli_refcount);
atomic_inc(&bip->bli_buf->b_pin_count);
}
/*
- * This is called to unpin the buffer associated with the buf log item which
- * was previously pinned with a call to xfs_buf_item_pin().
+ * This is called to unpin the buffer associated with the buf log item which was
+ * previously pinned with a call to xfs_buf_item_pin(). We enter this function
+ * with a buffer pin count, a buffer reference and a BLI reference.
+ *
+ * We must drop the BLI reference before we unpin the buffer because the AIL
+ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
+ * refcount drops to zero, the bli could still be AIL resident and the buffer
+ * submitted for I/O at any point before we return. This can result in IO
+ * completion freeing the buffer while we are still trying to access it here.
+ * This race condition can also occur in shutdown situations where we abort and
+ * unpin buffers from contexts other that journal IO completion.
+ *
+ * Hence we have to hold a buffer reference per pin count to ensure that the
+ * buffer cannot be freed until we have finished processing the unpin operation.
+ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
+ * are done processing the buffer state. In the case of an abort (remove =
+ * true) then we re-use the current pin reference as the IO reference we hand
+ * off to IO failure handling.
*/
STATIC void
xfs_buf_item_unpin(
@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin(bip);
- /*
- * Drop the bli ref associated with the pin and grab the hold required
- * for the I/O simulation failure in the abort case. We have to do this
- * before the pin count drops because the AIL doesn't acquire a bli
- * reference. Therefore if the refcount drops to zero, the bli could
- * still be AIL resident and the buffer submitted for I/O (and freed on
- * completion) at any point before we return. This can be removed once
- * the AIL properly holds a reference on the bli.
- */
freed = atomic_dec_and_test(&bip->bli_refcount);
- if (freed && !stale && remove)
- xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- /* nothing to do but drop the pin count if the bli is active */
- if (!freed)
+ /*
+ * Nothing to do but drop the buffer pin reference if the BLI is
+ * still active.
+ */
+ if (!freed) {
+ xfs_buf_rele(bp);
return;
+ }
if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -522,6 +541,15 @@ xfs_buf_item_unpin(
trace_xfs_buf_item_unpin_stale(bip);
+ /*
+ * The buffer has been locked and referenced since it was marked
+ * stale so we own both lock and reference exclusively here. We
+ * do not need the pin reference any more, so drop it now so
+ * that we only have one reference to drop once item completion
+ * processing is complete.
+ */
+ xfs_buf_rele(bp);
+
/*
* If we get called here because of an IO error, we may or may
* not have the item on the AIL. xfs_trans_ail_delete() will
@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
- } else if (remove) {
+ return;
+ }
+
+ if (remove) {
/*
- * The buffer must be locked and held by the caller to simulate
- * an async I/O failure. We acquired the hold for this case
- * before the buffer was unpinned.
+ * We need to simulate an async IO failures here to ensure that
+ * the correct error completion is run on this buffer. This
+ * requires a reference to the buffer and for the buffer to be
+ * locked. We can safely pass ownership of the pin reference to
+ * the IO to ensure that nothing can free the buffer while we
+ * wait for the lock and then run the IO failure completion.
*/
xfs_buf_lock(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
+ return;
}
+
+ /*
+ * BLI has no more active references - it will be moved to the AIL to
+ * manage the remaining BLI/buffer life cycle. There is nothing left for
+ * us to do here so drop the pin reference to the buffer.
+ */
+ xfs_buf_rele(bp);
}
STATIC uint