diff mbox

[4/9] xfs: don't assert fail with AIL lock held

Message ID 20180508034202.10136-5-david@fromorbit.com (mailing list archive)
State Accepted
Headers show

Commit Message

Dave Chinner May 8, 2018, 3:41 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Been hitting AIL ordering assert failures recently, but been unable
to trace them down because the system immediately hangs up onteh
spinlock that was held when this assert fires:

XFS: Assertion failed: XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0, file: fs/xfs/xfs_trans_ail.c, line: 52

Move the assertions outside of the spinlock so the corpse can
be dissected. Thanks to Brian Foster for supplying a clean
way of doing this.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_trans_ail.c | 43 +++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

Comments

Brian Foster May 8, 2018, 2:18 p.m. UTC | #1
On Tue, May 08, 2018 at 01:41:57PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Been hitting AIL ordering assert failures recently, but been unable
> to trace them down because the system immediately hangs up onteh
> spinlock that was held when this assert fires:
> 
> XFS: Assertion failed: XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0, file: fs/xfs/xfs_trans_ail.c, line: 52
> 
> Move the assertions outside of the spinlock so the corpse can
> be dissected. Thanks to Brian Foster for supplying a clean
> way of doing this.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_trans_ail.c | 43 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 32 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
> index 50611d2bcbc2..41e280ef1483 100644
> --- a/fs/xfs/xfs_trans_ail.c
> +++ b/fs/xfs/xfs_trans_ail.c
> @@ -32,30 +32,51 @@
>  #ifdef DEBUG
>  /*
>   * Check that the list is sorted as it should be.
> + *
> + * Called with the ail lock held, but we don't want to assert fail with it
> + * held otherwise we'll lock everything up and won't be able to debug the
> + * cause. Hence we sample and check the state under the AIL lock and return if
> + * everything is fine, otherwise we drop the lock and run the ASSERT checks.
> + * Asserts may not be fatal, so pick the lock back up and continue onwards.
>   */
>  STATIC void
>  xfs_ail_check(
> -	struct xfs_ail	*ailp,
> -	xfs_log_item_t	*lip)
> +	struct xfs_ail		*ailp,
> +	struct xfs_log_item	*lip)
>  {
> -	xfs_log_item_t	*prev_lip;
> +	struct xfs_log_item	*prev_lip;
> +	struct xfs_log_item	*next_lip;
> +	xfs_lsn_t		prev_lsn = NULLCOMMITLSN;
> +	xfs_lsn_t		next_lsn = NULLCOMMITLSN;
> +	xfs_lsn_t		lsn;
> +	bool			in_ail;
> +
>  
>  	if (list_empty(&ailp->ail_head))
>  		return;
>  
>  	/*
> -	 * Check the next and previous entries are valid.
> +	 * Sample then check the next and previous entries are valid.
>  	 */
> -	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
> -	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
> +	in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
> +	prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
>  	if (&prev_lip->li_ail != &ailp->ail_head)
> -		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
> -
> -	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
> -	if (&prev_lip->li_ail != &ailp->ail_head)
> -		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
> +		prev_lsn = prev_lip->li_lsn;
> +	next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
> +	if (&next_lip->li_ail != &ailp->ail_head)
> +		next_lsn = next_lip->li_lsn;
> +	lsn = lip->li_lsn;
>  
> +	if (in_ail &&
> +	    (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
> +	    (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
> +		return;
>  
> +	spin_unlock(&ailp->ail_lock);
> +	ASSERT(in_ail);
> +	ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
> +	ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
> +	spin_lock(&ailp->ail_lock);
>  }
>  #else /* !DEBUG */
>  #define	xfs_ail_check(a,l)
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig May 9, 2018, 6:13 a.m. UTC | #2
Not really excited about this, but if it helps you debugging the AIL
code:

Reviewed-by: Christoph Hellwig <hch@lst.de>
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong May 9, 2018, 2:52 p.m. UTC | #3
On Tue, May 08, 2018 at 01:41:57PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Been hitting AIL ordering assert failures recently, but been unable
> to trace them down because the system immediately hangs up onteh
> spinlock that was held when this assert fires:
> 
> XFS: Assertion failed: XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0, file: fs/xfs/xfs_trans_ail.c, line: 52
> 
> Move the assertions outside of the spinlock so the corpse can
> be dissected. Thanks to Brian Foster for supplying a clean
> way of doing this.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  fs/xfs/xfs_trans_ail.c | 43 +++++++++++++++++++++++++++++++-----------
>  1 file changed, 32 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
> index 50611d2bcbc2..41e280ef1483 100644
> --- a/fs/xfs/xfs_trans_ail.c
> +++ b/fs/xfs/xfs_trans_ail.c
> @@ -32,30 +32,51 @@
>  #ifdef DEBUG
>  /*
>   * Check that the list is sorted as it should be.
> + *
> + * Called with the ail lock held, but we don't want to assert fail with it
> + * held otherwise we'll lock everything up and won't be able to debug the
> + * cause. Hence we sample and check the state under the AIL lock and return if
> + * everything is fine, otherwise we drop the lock and run the ASSERT checks.
> + * Asserts may not be fatal, so pick the lock back up and continue onwards.
>   */
>  STATIC void
>  xfs_ail_check(
> -	struct xfs_ail	*ailp,
> -	xfs_log_item_t	*lip)
> +	struct xfs_ail		*ailp,
> +	struct xfs_log_item	*lip)
>  {
> -	xfs_log_item_t	*prev_lip;
> +	struct xfs_log_item	*prev_lip;
> +	struct xfs_log_item	*next_lip;
> +	xfs_lsn_t		prev_lsn = NULLCOMMITLSN;
> +	xfs_lsn_t		next_lsn = NULLCOMMITLSN;
> +	xfs_lsn_t		lsn;
> +	bool			in_ail;
> +
>  
>  	if (list_empty(&ailp->ail_head))
>  		return;
>  
>  	/*
> -	 * Check the next and previous entries are valid.
> +	 * Sample then check the next and previous entries are valid.
>  	 */
> -	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
> -	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
> +	in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
> +	prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
>  	if (&prev_lip->li_ail != &ailp->ail_head)
> -		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
> -
> -	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
> -	if (&prev_lip->li_ail != &ailp->ail_head)
> -		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
> +		prev_lsn = prev_lip->li_lsn;
> +	next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
> +	if (&next_lip->li_ail != &ailp->ail_head)
> +		next_lsn = next_lip->li_lsn;
> +	lsn = lip->li_lsn;
>  
> +	if (in_ail &&
> +	    (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
> +	    (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
> +		return;
>  
> +	spin_unlock(&ailp->ail_lock);
> +	ASSERT(in_ail);
> +	ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
> +	ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
> +	spin_lock(&ailp->ail_lock);
>  }
>  #else /* !DEBUG */
>  #define	xfs_ail_check(a,l)
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 50611d2bcbc2..41e280ef1483 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,30 +32,51 @@ 
 #ifdef DEBUG
 /*
  * Check that the list is sorted as it should be.
+ *
+ * Called with the ail lock held, but we don't want to assert fail with it
+ * held otherwise we'll lock everything up and won't be able to debug the
+ * cause. Hence we sample and check the state under the AIL lock and return if
+ * everything is fine, otherwise we drop the lock and run the ASSERT checks.
+ * Asserts may not be fatal, so pick the lock back up and continue onwards.
  */
 STATIC void
 xfs_ail_check(
-	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip)
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip)
 {
-	xfs_log_item_t	*prev_lip;
+	struct xfs_log_item	*prev_lip;
+	struct xfs_log_item	*next_lip;
+	xfs_lsn_t		prev_lsn = NULLCOMMITLSN;
+	xfs_lsn_t		next_lsn = NULLCOMMITLSN;
+	xfs_lsn_t		lsn;
+	bool			in_ail;
+
 
 	if (list_empty(&ailp->ail_head))
 		return;
 
 	/*
-	 * Check the next and previous entries are valid.
+	 * Sample then check the next and previous entries are valid.
 	 */
-	ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
-	prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+	in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+	prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
 	if (&prev_lip->li_ail != &ailp->ail_head)
-		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-
-	prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
-	if (&prev_lip->li_ail != &ailp->ail_head)
-		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+		prev_lsn = prev_lip->li_lsn;
+	next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
+	if (&next_lip->li_ail != &ailp->ail_head)
+		next_lsn = next_lip->li_lsn;
+	lsn = lip->li_lsn;
 
+	if (in_ail &&
+	    (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
+	    (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
+		return;
 
+	spin_unlock(&ailp->ail_lock);
+	ASSERT(in_ail);
+	ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
+	ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
+	spin_lock(&ailp->ail_lock);
 }
 #else /* !DEBUG */
 #define	xfs_ail_check(a,l)