[v2] dm: respect REQ_NOWAIT flag in bios issued to DM

Message ID ZTlt0HPbVZf0gYcI@redhat.com (mailing list archive)
State New, archived
Series [v2] dm: respect REQ_NOWAIT flag in bios issued to DM

Commit Message

Mike Snitzer Oct. 25, 2023, 7:34 p.m. UTC
From: Mikulas Patocka <mpatocka@redhat.com>

Update DM core's IO submission to allocate required memory using
GFP_NOWAIT if REQ_NOWAIT is set.  The lone exception is in the error
path, where DM's requeue support needs to allocate a clone bio in
dm_io_rewind() to allow the IO to be resubmitted: GFP_NOWAIT is tried
first, but if it fails GFP_NOIO is used as a last resort.

Tested with the simple test provided in commit a9ce385344f916 ("dm: don't
attempt to queue IO under RCU protection"), enhanced to check error
codes.  Also tested using fio's pvsync2 ioengine with nowait=1.
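
An invocation along these lines exercises that path (an illustrative
reconstruction, not the exact command used; the device path is a
placeholder):

  fio --name=nowait-test --filename=/dev/mapper/test-dev --direct=1 \
      --ioengine=pvsync2 --nowait=1 --rw=randread --bs=4k \
      --runtime=30 --time_based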

Testing with induced GFP_NOWAIT allocation failures was not performed,
however.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Co-developed-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-io-rewind.c | 13 ++++++--
 drivers/md/dm.c           | 66 ++++++++++++++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 14 deletions(-)
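
In outline, every allocation on the submission path picks its gfp mask
from the bio's REQ_NOWAIT flag and, on allocation failure, the bio is
ended with BLK_STS_AGAIN.  A simplified sketch of that pattern (an
editorial illustration, not code from the patch; clone_for_submit() and
submit_nowait_aware() are hypothetical helpers):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Pick the gfp mask from the bio's REQ_NOWAIT flag. */
static struct bio *clone_for_submit(struct bio *bio, struct bio_set *bs)
{
	/* Non-blocking submitters must not sleep waiting for memory... */
	if (bio->bi_opf & REQ_NOWAIT)
		return bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT, bs);

	/* ...otherwise GFP_NOIO may sleep without recursing into the IO path. */
	return bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, bs);
}

static void submit_nowait_aware(struct bio *bio, struct bio_set *bs)
{
	struct bio *clone = clone_for_submit(bio, bs);

	if (unlikely(!clone)) {
		/* Only reachable with REQ_NOWAIT: end the bio with BLK_STS_AGAIN. */
		bio_wouldblock_error(bio);
		return;
	}
	submit_bio_noacct(clone);
}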

Comments

Mike Snitzer Oct. 25, 2023, 7:55 p.m. UTC | #1
On Wed, Oct 25 2023 at  3:34P -0400,
Mike Snitzer <snitzer@kernel.org> wrote:

> From: Mikulas Patocka <mpatocka@redhat.com>
> 
> Update DM core's IO submission to allocate required memory using
> GFP_NOWAIT if REQ_NOWAIT is set.  The lone exception is in the error
> path, where DM's requeue support needs to allocate a clone bio in
> dm_io_rewind() to allow the IO to be resubmitted: GFP_NOWAIT is tried
> first, but if it fails GFP_NOIO is used as a last resort.
> 
> Tested with the simple test provided in commit a9ce385344f916 ("dm: don't
> attempt to queue IO under RCU protection"), enhanced to check error
> codes.  Also tested using fio's pvsync2 ioengine with nowait=1.
> 
> Testing with induced GFP_NOWAIT allocation failures was not performed,
> however.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Co-developed-by: Mike Snitzer <snitzer@kernel.org>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> ---
>  drivers/md/dm-io-rewind.c | 13 ++++++--
>  drivers/md/dm.c           | 66 ++++++++++++++++++++++++++++++++-------
>  2 files changed, 65 insertions(+), 14 deletions(-)

Backfilling some comments on this v2:

1) I'm not completely clear on whether REQ_NOWAIT's promise of never
   sleeping (waiting on IO) is a hint vs a hard requirement
   -- so the dm_io_rewind() change to try GFP_NOWAIT and fall back to
      GFP_NOIO might be bogus...

2) What is a good way to reliably simulate low-memory conditions such
   that mempool-backed GFP_NOWAIT allocations fail?  (One candidate is
   sketched below.)
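
For what it's worth, the failslab fault-injection facility looks like
one candidate (untested here, and assuming a kernel built with
CONFIG_FAILSLAB=y).  Per the fault-injection docs, setting
ignore-gfp-wait restricts injection to allocations that cannot sleep,
which covers GFP_NOWAIT:

  # Fail ~10% of non-sleeping slab allocations (GFP_NOWAIT/GFP_ATOMIC);
  # sleepable GFP_NOIO/GFP_KERNEL allocations are left alone.
  echo Y  > /sys/kernel/debug/failslab/ignore-gfp-wait
  echo 1  > /sys/kernel/debug/failslab/interval
  echo 10 > /sys/kernel/debug/failslab/probability
  echo -1 > /sys/kernel/debug/failslab/times

Note this only fails the slab side of bio_alloc_clone(); a bio_set's
mempool reserve can still satisfy the allocation, so the reserve would
also need to be drained (e.g. by enough concurrent IO) before a
GFP_NOWAIT allocation actually returns NULL.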

Thanks for any review/feedback.

Mike

Ming Lei Oct. 25, 2023, 11:44 p.m. UTC | #2
On Wed, Oct 25, 2023 at 03:34:40PM -0400, Mike Snitzer wrote:
> From: Mikulas Patocka <mpatocka@redhat.com>
> 
> Update DM core's IO submission to allocate required memory using
> GFP_NOWAIT if REQ_NOWAIT is set.  The lone exception is in the error
> path, where DM's requeue support needs to allocate a clone bio in
> dm_io_rewind() to allow the IO to be resubmitted: GFP_NOWAIT is tried
> first, but if it fails GFP_NOIO is used as a last resort.
> 
> Tested with the simple test provided in commit a9ce385344f916 ("dm: don't
> attempt to queue IO under RCU protection"), enhanced to check error
> codes.  Also tested using fio's pvsync2 ioengine with nowait=1.
> 
> Testing with induced GFP_NOWAIT allocation failures was not performed,
> however.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Co-developed-by: Mike Snitzer <snitzer@kernel.org>
> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
> ---
>  drivers/md/dm-io-rewind.c | 13 ++++++--
>  drivers/md/dm.c           | 66 ++++++++++++++++++++++++++++++++-------
>  2 files changed, 65 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/dm-io-rewind.c b/drivers/md/dm-io-rewind.c
> index 6155b0117c9d..bde5a53e2d88 100644
> --- a/drivers/md/dm-io-rewind.c
> +++ b/drivers/md/dm-io-rewind.c
> @@ -143,8 +143,17 @@ static void dm_bio_rewind(struct bio *bio, unsigned int bytes)
>  void dm_io_rewind(struct dm_io *io, struct bio_set *bs)
>  {
>  	struct bio *orig = io->orig_bio;
> -	struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig,
> -					       GFP_NOIO, bs);
> +	struct bio *new_orig;
> +
> +	new_orig = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOWAIT, bs);
> +	if (unlikely(!new_orig)) {
> +		/*
> +		 * Returning early and failing rewind isn't an option, even
> +		 * if orig has REQ_NOWAIT set, so retry alloc with GFP_NOIO.
> +		 */
> +		new_orig = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOIO, bs);
> +	}

dm_bio_rewind() is only called from the requeue work function, so there
shouldn't be an issue with NOWAIT, because the upper IO submission code
path won't be blocked by the requeue.


Thanks, 
Ming
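
For context on what the patch buys userspace: once DM honors REQ_NOWAIT
end to end, a preadv2() with RWF_NOWAIT against a dm device should fail
fast with EAGAIN instead of blocking.  A minimal sketch (the device path
is a placeholder):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096] __attribute__((aligned(4096)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	/* O_DIRECT so the read is issued to the block layer as a bio. */
	int fd = open("/dev/mapper/test-dev", O_RDONLY | O_DIRECT);

	if (fd < 0)
		return 1;

	/* RWF_NOWAIT: fail with EAGAIN rather than wait on IO or memory. */
	ssize_t n = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (n < 0 && errno == EAGAIN)
		printf("would block\n");
	else if (n >= 0)
		printf("read %zd bytes without blocking\n", n);

	close(fd);
	return 0;
}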

Patch

diff --git a/drivers/md/dm-io-rewind.c b/drivers/md/dm-io-rewind.c
index 6155b0117c9d..bde5a53e2d88 100644
--- a/drivers/md/dm-io-rewind.c
+++ b/drivers/md/dm-io-rewind.c
@@ -143,8 +143,17 @@  static void dm_bio_rewind(struct bio *bio, unsigned int bytes)
 void dm_io_rewind(struct dm_io *io, struct bio_set *bs)
 {
 	struct bio *orig = io->orig_bio;
-	struct bio *new_orig = bio_alloc_clone(orig->bi_bdev, orig,
-					       GFP_NOIO, bs);
+	struct bio *new_orig;
+
+	new_orig = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOWAIT, bs);
+	if (unlikely(!new_orig)) {
+		/*
+		 * Returning early and failing rewind isn't an option, even
+		 * if orig has REQ_NOWAIT set, so retry alloc with GFP_NOIO.
+		 */
+		new_orig = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOIO, bs);
+	}
+
 	/*
 	 * dm_bio_rewind can restore to previous position since the
 	 * end sector is fixed for original bio, but we still need
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1113a8da3c47..b0e1425af475 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -87,6 +87,8 @@  struct clone_info {
 	unsigned int sector_count;
 	bool is_abnormal_io:1;
 	bool submit_as_polled:1;
+	bool is_nowait:1;
+	bool nowait_failed:1;
 };
 
 static inline struct dm_target_io *clone_to_tio(struct bio *clone)
@@ -576,7 +578,13 @@  static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	struct dm_target_io *tio;
 	struct bio *clone;
 
-	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
+	if (unlikely(bio->bi_opf & REQ_NOWAIT)) {
+		clone = bio_alloc_clone(NULL, bio, GFP_NOWAIT, &md->mempools->io_bs);
+		if (!clone)
+			return NULL;
+	} else {
+		clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
+	}
 	tio = clone_to_tio(clone);
 	tio->flags = 0;
 	dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
@@ -1500,9 +1508,17 @@  static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
 			mutex_unlock(&ci->io->md->table_devices_lock);
 		if (bio_nr == num_bios)
 			return;
-
+		/*
+		 * Failed to allocate all bios, all or nothing, so rewind partial
+		 * allocations and short-circuit retry with GFP_NOIO if 'is_nowait'
+		 */
 		while ((bio = bio_list_pop(blist)))
 			free_tio(bio);
+
+		if (ci->is_nowait) {
+			ci->nowait_failed = true;
+			return;
+		}
 	}
 }
 
@@ -1519,7 +1535,15 @@  static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
 	case 1:
 		if (len)
 			setup_split_accounting(ci, *len);
-		clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
+		if (unlikely(ci->is_nowait)) {
+			clone = alloc_tio(ci, ti, 0, len, GFP_NOWAIT);
+			if (!clone) {
+				ci->nowait_failed = true;
+				return 0;
+			}
+		} else {
+			clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
+		}
 		__map_bio(clone);
 		ret = 1;
 		break;
@@ -1539,7 +1563,7 @@  static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
 	return ret;
 }
 
-static void __send_empty_flush(struct clone_info *ci)
+static blk_status_t __send_empty_flush(struct clone_info *ci)
 {
 	struct dm_table *t = ci->map;
 	struct bio flush_bio;
@@ -1572,6 +1596,8 @@  static void __send_empty_flush(struct clone_info *ci)
 	atomic_sub(1, &ci->io->io_count);
 
 	bio_uninit(ci->bio);
+
+	return likely(!ci->nowait_failed) ? BLK_STS_OK : BLK_STS_AGAIN;
 }
 
 static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
@@ -1656,7 +1682,7 @@  static blk_status_t __process_abnormal_io(struct clone_info *ci,
 
 	__send_changing_extent_only(ci, ti, num_bios,
 				    max_granularity, max_sectors);
-	return BLK_STS_OK;
+	return likely(!ci->nowait_failed) ? BLK_STS_OK : BLK_STS_AGAIN;
 }
 
 /*
@@ -1714,7 +1740,7 @@  static blk_status_t __split_and_process_bio(struct clone_info *ci)
 	if (unlikely(!ti))
 		return BLK_STS_IOERR;
 
-	if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
+	if (unlikely(ci->is_nowait) &&
 	    unlikely(!dm_target_supports_nowait(ti->type)))
 		return BLK_STS_NOTSUPP;
 
@@ -1729,7 +1755,15 @@  static blk_status_t __split_and_process_bio(struct clone_info *ci)
 
 	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
 	setup_split_accounting(ci, len);
-	clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
+	if (unlikely(ci->is_nowait)) {
+		clone = alloc_tio(ci, ti, 0, &len, GFP_NOWAIT);
+		if (unlikely(!clone)) {
+			ci->nowait_failed = true; /* unchecked, set for consistency */
+			return BLK_STS_AGAIN;
+		}
+	} else {
+		clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
+	}
 	__map_bio(clone);
 
 	ci->sector += len;
@@ -1738,11 +1772,13 @@  static blk_status_t __split_and_process_bio(struct clone_info *ci)
 	return BLK_STS_OK;
 }
 
-static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
+static void init_clone_info(struct clone_info *ci, struct dm_io *io,
 			    struct dm_table *map, struct bio *bio, bool is_abnormal)
 {
+	ci->is_nowait = !!(bio->bi_opf & REQ_NOWAIT);
+	ci->nowait_failed = false;
 	ci->map = map;
-	ci->io = alloc_io(md, bio);
+	ci->io = io;
 	ci->bio = bio;
 	ci->is_abnormal_io = is_abnormal;
 	ci->submit_as_polled = false;
@@ -1777,11 +1813,17 @@  static void dm_split_and_process_bio(struct mapped_device *md,
 			return;
 	}
 
-	init_clone_info(&ci, md, map, bio, is_abnormal);
-	io = ci.io;
+	io = alloc_io(md, bio);
+	if (!io) {
+		/* Unable to do anything without dm_io. */
+		bio_wouldblock_error(bio);
+		return;
+	}
+
+	init_clone_info(&ci, io, map, bio, is_abnormal);
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
-		__send_empty_flush(&ci);
+		error = __send_empty_flush(&ci);
 		/* dm_io_complete submits any data associated with flush */
 		goto out;
 	}