diff mbox series

[5/8] btrfs: don't enospc all tickets on flush failure

Message ID 20181121190313.24575-6-josef@toxicpanda.com (mailing list archive)
State New, archived
Headers show
Series Enospc cleanups and fixes | expand

Commit Message

Josef Bacik Nov. 21, 2018, 7:03 p.m. UTC
With the introduction of the per-inode block_rsv it became possible to
have really really large reservation requests made because of data
fragmentation.  Since the ticket stuff assumed that we'd always have
relatively small reservation requests it just killed all tickets if we
were unable to satisfy the current request.  However this is generally
not the case anymore.  So fix this logic to instead see if we had a
ticket that we were able to give some reservation to, and if we were,
continue the flushing loop again.  Likewise we make the tickets use the
space_info_add_old_bytes() method of returning what reservation they did
receive in hopes that it could satisfy reservations down the line.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
 fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

Comments

Nikolay Borisov Nov. 26, 2018, 12:25 p.m. UTC | #1
On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> With the introduction of the per-inode block_rsv it became possible to
> have really really large reservation requests made because of data
> fragmentation.  Since the ticket stuff assumed that we'd always have
> relatively small reservation requests it just killed all tickets if we
> were unable to satisfy the current request.  However this is generally
> not the case anymore.  So fix this logic to instead see if we had a
> ticket that we were able to give some reservation to, and if we were
> continue the flushing loop again.  Likewise we make the tickets use the
> space_info_add_old_bytes() method of returning what reservation they did
> receive in hopes that it could satisfy reservations down the line.


The logic of the patch can be summarised as follows:

If no progress is made for a ticket, then fail all tickets up to and
including the first one that has had progress made on its reservation.
In this case that first ticket will still be failed, but at least its
space will be reused via space_info_add_old_bytes.

Frankly this seems really arbitrary.

> 
> Signed-off-by: Josef Bacik <josef@toxicpanda.com>
> ---
>  fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++++--------------------
>  1 file changed, 25 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index e6bb6ce23c84..983d086fa768 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4791,6 +4791,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
>  }
>  
>  struct reserve_ticket {
> +	u64 orig_bytes;
>  	u64 bytes;
>  	int error;
>  	struct list_head list;
> @@ -5012,7 +5013,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
>  		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
>  }
>  
> -static void wake_all_tickets(struct list_head *head)
> +static bool wake_all_tickets(struct list_head *head)
>  {
>  	struct reserve_ticket *ticket;
>  
> @@ -5021,7 +5022,10 @@ static void wake_all_tickets(struct list_head *head)
>  		list_del_init(&ticket->list);
>  		ticket->error = -ENOSPC;
>  		wake_up(&ticket->wait);
> +		if (ticket->bytes != ticket->orig_bytes)
> +			return true;
>  	}
> +	return false;
>  }
>  
>  /*
> @@ -5089,8 +5093,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  		if (flush_state > COMMIT_TRANS) {
>  			commit_cycles++;
>  			if (commit_cycles > 2) {
> -				wake_all_tickets(&space_info->tickets);
> -				space_info->flush = 0;
> +				if (wake_all_tickets(&space_info->tickets)) {
> +					flush_state = FLUSH_DELAYED_ITEMS_NR;
> +					commit_cycles--;
> +				} else {
> +					space_info->flush = 0;
> +				}
>  			} else {
>  				flush_state = FLUSH_DELAYED_ITEMS_NR;
>  			}
> @@ -5142,10 +5150,11 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  
>  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
>  			       struct btrfs_space_info *space_info,
> -			       struct reserve_ticket *ticket, u64 orig_bytes)
> +			       struct reserve_ticket *ticket)
>  
>  {
>  	DEFINE_WAIT(wait);
> +	u64 reclaim_bytes = 0;
>  	int ret = 0;
>  
>  	spin_lock(&space_info->lock);
> @@ -5166,14 +5175,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
>  		ret = ticket->error;
>  	if (!list_empty(&ticket->list))
>  		list_del_init(&ticket->list);
> -	if (ticket->bytes && ticket->bytes < orig_bytes) {
> -		u64 num_bytes = orig_bytes - ticket->bytes;
> -		update_bytes_may_use(space_info, -num_bytes);
> -		trace_btrfs_space_reservation(fs_info, "space_info",
> -					      space_info->flags, num_bytes, 0);
> -	}
> +	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
> +		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
>  	spin_unlock(&space_info->lock);
>  
> +	if (reclaim_bytes)
> +		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
>  	return ret;
>  }
>  
> @@ -5199,6 +5206,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  {
>  	struct reserve_ticket ticket;
>  	u64 used;
> +	u64 reclaim_bytes = 0;
>  	int ret = 0;
>  
>  	ASSERT(orig_bytes);
> @@ -5234,6 +5242,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  	 * the list and we will do our own flushing further down.
>  	 */
>  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
> +		ticket.orig_bytes = orig_bytes;
>  		ticket.bytes = orig_bytes;
>  		ticket.error = 0;
>  		init_waitqueue_head(&ticket.wait);
> @@ -5274,25 +5283,21 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
>  		return ret;
>  
>  	if (flush == BTRFS_RESERVE_FLUSH_ALL)
> -		return wait_reserve_ticket(fs_info, space_info, &ticket,
> -					   orig_bytes);
> +		return wait_reserve_ticket(fs_info, space_info, &ticket);
>  
>  	ret = 0;
>  	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
>  	spin_lock(&space_info->lock);
>  	if (ticket.bytes) {
> -		if (ticket.bytes < orig_bytes) {
> -			u64 num_bytes = orig_bytes - ticket.bytes;
> -			update_bytes_may_use(space_info, -num_bytes);
> -			trace_btrfs_space_reservation(fs_info, "space_info",
> -						      space_info->flags,
> -						      num_bytes, 0);
> -
> -		}
> +		if (ticket.bytes < orig_bytes)
> +			reclaim_bytes = orig_bytes - ticket.bytes;
>  		list_del_init(&ticket.list);
>  		ret = -ENOSPC;
>  	}
>  	spin_unlock(&space_info->lock);
> +
> +	if (reclaim_bytes)
> +		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
>  	ASSERT(list_empty(&ticket.list));
>  	return ret;
>  }
>
Josef Bacik Nov. 27, 2018, 7:46 p.m. UTC | #2
On Mon, Nov 26, 2018 at 02:25:52PM +0200, Nikolay Borisov wrote:
> 
> 
> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
> > With the introduction of the per-inode block_rsv it became possible to
> > have really really large reservation requests made because of data
> > fragmentation.  Since the ticket stuff assumed that we'd always have
> > relatively small reservation requests it just killed all tickets if we
> > were unable to satisfy the current request.  However this is generally
> > not the case anymore.  So fix this logic to instead see if we had a
> > ticket that we were able to give some reservation to, and if we were
> > continue the flushing loop again.  Likewise we make the tickets use the
> > space_info_add_old_bytes() method of returning what reservation they did
> > receive in hopes that it could satisfy reservations down the line.
> 
> 
> The logic of the patch can be summarised as follows:
> 
> If no progress is made for a ticket, then start fail all tickets until
> the first one that has progress made on its reservation (inclusive). In
> this case this first ticket will be failed but at least it's space will
> be reused via space_info_add_old_bytes.
> 
> Frankly this seem really arbitrary.

It's not though.  The tickets are in order of who requested the reservation.
Because we will backfill reservations for things like hugely fragmented files or
large amounts of delayed refs we can have spikes where we're trying to reserve
100mb's of metadata space.  We may fill 50mb of that before we run out of space.
Well so we can't satisfy that reservation, but the small 100k reservations that
are waiting to be serviced can be satisfied and they can run.  The alternative
is you get ENOSPC and then you can turn around and touch a file no problem
because it's a small reservation and there was room for it.  This patch enables
better behavior for the user.  Thanks,

Josef
Nikolay Borisov Nov. 28, 2018, 8:11 a.m. UTC | #3
On 27.11.18 г. 21:46 ч., Josef Bacik wrote:
> On Mon, Nov 26, 2018 at 02:25:52PM +0200, Nikolay Borisov wrote:
>>
>>
>> On 21.11.18 г. 21:03 ч., Josef Bacik wrote:
>>> With the introduction of the per-inode block_rsv it became possible to
>>> have really really large reservation requests made because of data
>>> fragmentation.  Since the ticket stuff assumed that we'd always have
>>> relatively small reservation requests it just killed all tickets if we
>>> were unable to satisfy the current request.  However this is generally
>>> not the case anymore.  So fix this logic to instead see if we had a
>>> ticket that we were able to give some reservation to, and if we were
>>> continue the flushing loop again.  Likewise we make the tickets use the
>>> space_info_add_old_bytes() method of returning what reservation they did
>>> receive in hopes that it could satisfy reservations down the line.
>>
>>
>> The logic of the patch can be summarised as follows:
>>
>> If no progress is made for a ticket, then start fail all tickets until
>> the first one that has progress made on its reservation (inclusive). In
>> this case this first ticket will be failed but at least it's space will
>> be reused via space_info_add_old_bytes.
>>
>> Frankly this seem really arbitrary.
> 
> It's not though.  The tickets are in order of who requested the reservation.
> Because we will backfill reservations for things like hugely fragmented files or
> large amounts of delayed refs we can have spikes where we're trying to reserve
> 100mb's of metadata space.  We may fill 50mb of that before we run out of space.
> Well so we can't satisfy that reservation, but the small 100k reservations that
> are waiting to be serviced can be satisfied and they can run.  The alternative
> is you get ENOSPC and then you can turn around and touch a file no problem
> because it's a small reservation and there was room for it.  This patch enables
> better behavior for the user.  Thanks,

Well this information needs to be in the changelog since it describes the
situation where this patch is useful.

> 
> Josef
>
diff mbox series

Patch

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6bb6ce23c84..983d086fa768 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4791,6 +4791,7 @@  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 }
 
 struct reserve_ticket {
+	u64 orig_bytes;
 	u64 bytes;
 	int error;
 	struct list_head list;
@@ -5012,7 +5013,7 @@  static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
 		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 
-static void wake_all_tickets(struct list_head *head)
+static bool wake_all_tickets(struct list_head *head)
 {
 	struct reserve_ticket *ticket;
 
@@ -5021,7 +5022,10 @@  static void wake_all_tickets(struct list_head *head)
 		list_del_init(&ticket->list);
 		ticket->error = -ENOSPC;
 		wake_up(&ticket->wait);
+		if (ticket->bytes != ticket->orig_bytes)
+			return true;
 	}
+	return false;
 }
 
 /*
@@ -5089,8 +5093,12 @@  static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		if (flush_state > COMMIT_TRANS) {
 			commit_cycles++;
 			if (commit_cycles > 2) {
-				wake_all_tickets(&space_info->tickets);
-				space_info->flush = 0;
+				if (wake_all_tickets(&space_info->tickets)) {
+					flush_state = FLUSH_DELAYED_ITEMS_NR;
+					commit_cycles--;
+				} else {
+					space_info->flush = 0;
+				}
 			} else {
 				flush_state = FLUSH_DELAYED_ITEMS_NR;
 			}
@@ -5142,10 +5150,11 @@  static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
 
 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 			       struct btrfs_space_info *space_info,
-			       struct reserve_ticket *ticket, u64 orig_bytes)
+			       struct reserve_ticket *ticket)
 
 {
 	DEFINE_WAIT(wait);
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	spin_lock(&space_info->lock);
@@ -5166,14 +5175,12 @@  static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
 		ret = ticket->error;
 	if (!list_empty(&ticket->list))
 		list_del_init(&ticket->list);
-	if (ticket->bytes && ticket->bytes < orig_bytes) {
-		u64 num_bytes = orig_bytes - ticket->bytes;
-		update_bytes_may_use(space_info, -num_bytes);
-		trace_btrfs_space_reservation(fs_info, "space_info",
-					      space_info->flags, num_bytes, 0);
-	}
+	if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
+		reclaim_bytes = ticket->orig_bytes - ticket->bytes;
 	spin_unlock(&space_info->lock);
 
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	return ret;
 }
 
@@ -5199,6 +5206,7 @@  static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 {
 	struct reserve_ticket ticket;
 	u64 used;
+	u64 reclaim_bytes = 0;
 	int ret = 0;
 
 	ASSERT(orig_bytes);
@@ -5234,6 +5242,7 @@  static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 	 * the list and we will do our own flushing further down.
 	 */
 	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
+		ticket.orig_bytes = orig_bytes;
 		ticket.bytes = orig_bytes;
 		ticket.error = 0;
 		init_waitqueue_head(&ticket.wait);
@@ -5274,25 +5283,21 @@  static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		return wait_reserve_ticket(fs_info, space_info, &ticket,
-					   orig_bytes);
+		return wait_reserve_ticket(fs_info, space_info, &ticket);
 
 	ret = 0;
 	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
 	spin_lock(&space_info->lock);
 	if (ticket.bytes) {
-		if (ticket.bytes < orig_bytes) {
-			u64 num_bytes = orig_bytes - ticket.bytes;
-			update_bytes_may_use(space_info, -num_bytes);
-			trace_btrfs_space_reservation(fs_info, "space_info",
-						      space_info->flags,
-						      num_bytes, 0);
-
-		}
+		if (ticket.bytes < orig_bytes)
+			reclaim_bytes = orig_bytes - ticket.bytes;
 		list_del_init(&ticket.list);
 		ret = -ENOSPC;
 	}
 	spin_unlock(&space_info->lock);
+
+	if (reclaim_bytes)
+		space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
 	ASSERT(list_empty(&ticket.list));
 	return ret;
 }