diff mbox

bio: skip atomic inc/dec of ->bi_remaining for non-chains

Message ID 20150417145933.GA4770@kernel.dk (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe April 17, 2015, 2:59 p.m. UTC
Hi,

Struct bio has an atomic ref count for chained bio's, and we use this
to know when to end IO on the bio. However, most bio's are not chained,
so we don't need to always introduce this atomic operation as part of
ending IO.

Add a helper to elevate the bi_remaining count, and flag the bio as
now actually needing the decrement at end_io time. Rename the field
to __bi_remaining to catch any current users of this doing the
incrementing manually.

For high IOPS workloads, this reduces the overhead of bio_endio()
substantially.

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---

 block/bio.c                  |   38 +++++++++++++++++++++++++++++---------
 drivers/md/dm-cache-target.c |    2 +-
 drivers/md/dm-raid1.c        |    2 +-
 drivers/md/dm-snap.c         |    2 +-
 drivers/md/dm-thin.c         |    4 ++--
 include/linux/bio.h          |   11 +++++++++++
 include/linux/blk_types.h    |    3 ++-
 7 files changed, 47 insertions(+), 15 deletions(-)

Comments

Jan Kara April 20, 2015, 4:04 p.m. UTC | #1
On Fri 17-04-15 08:59:33, Jens Axboe wrote:
> @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
>  	__bio_free(bio);
>  
>  	memset(bio, 0, BIO_RESET_BYTES);
> -	bio->bi_flags = flags|(1 << BIO_UPTODATE);
> -	atomic_set(&bio->bi_remaining, 1);
> +	bio->bi_flags = flags | 1 << BIO_UPTODATE;
  Although this is correct, I'd prefer to keep parenthesis around 1 <<
BIO_UPTODATE. Maybe I'm not a real C hacker but I had to lookup the
operator precedence of | vs << :).

Otherwise the patch looks good to me (but I'm not an expert in this area, I
just looked into the patch by accident because I thought it's another
respin of your direct IO patch ;).

								Honza
Jens Axboe April 20, 2015, 4:42 p.m. UTC | #2
On 04/20/2015 10:04 AM, Jan Kara wrote:
> On Fri 17-04-15 08:59:33, Jens Axboe wrote:
>> @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
>>   	__bio_free(bio);
>>
>>   	memset(bio, 0, BIO_RESET_BYTES);
>> -	bio->bi_flags = flags|(1 << BIO_UPTODATE);
>> -	atomic_set(&bio->bi_remaining, 1);
>> +	bio->bi_flags = flags | 1 << BIO_UPTODATE;
>    Although this is correct, I'd prefer to keep parenthesis around 1 <<
> BIO_UPTODATE. Maybe I'm not a real C hacker but I had to lookup the
> operator precedence of | vs << :).

Heh, I think we can safely say you are a real C hacker, lets just keep 
the parenthesis if that makes it easier to read/verify.

> Otherwise the patch looks good to me (but I'm not an expert in this area, I
> just looked into the patch by accident because I thought it's another
> respin of your direct IO patch ;).

It's a parallel effort, getting rid of the atomics where we can... 
Thanks for looking at it, can I add your reviewed-by tag with the 
parenthesis change?
Jan Kara April 21, 2015, 9:04 a.m. UTC | #3
On Mon 20-04-15 10:42:13, Jens Axboe wrote:
> On 04/20/2015 10:04 AM, Jan Kara wrote:
> >On Fri 17-04-15 08:59:33, Jens Axboe wrote:
> >>@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
> >>  	__bio_free(bio);
> >>
> >>  	memset(bio, 0, BIO_RESET_BYTES);
> >>-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
> >>-	atomic_set(&bio->bi_remaining, 1);
> >>+	bio->bi_flags = flags | 1 << BIO_UPTODATE;
> >   Although this is correct, I'd prefer to keep parenthesis around 1 <<
> >BIO_UPTODATE. Maybe I'm not a real C hacker but I had to lookup the
> >operator precedence of | vs << :).
> 
> Heh, I think we can safely say you are a real C hacker, lets just
> keep the parenthesis if that makes it easier to read/verify.
> 
> >Otherwise the patch looks good to me (but I'm not an expert in this area, I
> >just looked into the patch by accident because I thought it's another
> >respin of your direct IO patch ;).
> 
> It's a parallel effort, getting rid of the atomics where we can...
> Thanks for looking at it, can I add your reviewed-by tag with the
> parenthesis change?
  Yes, you can.

								Honza
diff mbox

Patch

diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee..8fa4641f9e65 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,7 +270,7 @@  void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
+	atomic_set(&bio->__bi_remaining, 1);
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -292,8 +292,8 @@  void bio_reset(struct bio *bio)
 	__bio_free(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
+	bio->bi_flags = flags | 1 << BIO_UPTODATE;
+	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
@@ -320,7 +320,7 @@  void bio_chain(struct bio *bio, struct bio *parent)
 
 	bio->bi_private = parent;
 	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
+	bio_inc_remaining(parent);
 }
 EXPORT_SYMBOL(bio_chain);
 
@@ -1741,6 +1741,23 @@  void bio_flush_dcache_pages(struct bio *bi)
 EXPORT_SYMBOL(bio_flush_dcache_pages);
 #endif
 
+static inline bool bio_remaining_done(struct bio *bio)
+{
+	/*
+	 * If we're not chaining, then ->__bi_remaining is always 1 and
+	 * we always end io on the first invocation.
+	 */
+	if (!bio_flagged(bio, BIO_CHAIN))
+		return true;
+
+	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+	if (atomic_dec_and_test(&bio->__bi_remaining))
+		return true;
+
+	return false;
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1758,15 +1775,13 @@  EXPORT_SYMBOL(bio_flush_dcache_pages);
 void bio_endio(struct bio *bio, int error)
 {
 	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			error = -EIO;
 
-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
+		if (unlikely(!bio_remaining_done(bio)))
+			break;
 
 		/*
 		 * Need to have a real endio function for chained bios,
@@ -1799,7 +1814,12 @@  EXPORT_SYMBOL(bio_endio);
  **/
 void bio_endio_nodec(struct bio *bio, int error)
 {
-	atomic_inc(&bio->bi_remaining);
+	/*
+	 * If it's not flagged as a chain, we are not going to dec the count
+	 */
+	if (bio_flagged(bio, BIO_CHAIN))
+		bio_inc_remaining(bio);
+
 	bio_endio(bio, error);
 }
 EXPORT_SYMBOL(bio_endio_nodec);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7755af351867..705eb7b99d69 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -91,7 +91,7 @@  static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 	 * Must bump bi_remaining to allow bio to complete with
 	 * restored bi_end_io.
 	 */
-	atomic_inc(&bio->bi_remaining);
+	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089d62751f7f..d6a1c096b777 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,7 +1254,7 @@  static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			atomic_inc(&bio->bi_remaining);
+			bio_inc_remaining(bio);
 
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f83a0f3fc365..8bfeae218531 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,7 @@  out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		atomic_inc(&full_bio->bi_remaining);
+		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 921aafd12aee..342dbdad6131 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -795,7 +795,7 @@  static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	if (m->bio) {
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&m->bio->bi_remaining);
+		bio_inc_remaining(m->bio);
 	}
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
@@ -812,7 +812,7 @@  static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	bio = m->bio;
 	if (bio) {
 		bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&bio->bi_remaining);
+		bio_inc_remaining(bio);
 	}
 
 	if (m->err) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index da3a127c9958..8bfe9eee6d1a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -645,6 +645,17 @@  static inline struct bio *bio_list_get(struct bio_list *bl)
 }
 
 /*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
+/*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
  * These memory pools in turn all allocate from the bio_slab
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1b25e35ea5f..8b07e0603887 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -65,7 +65,7 @@  struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;
 
-	atomic_t		bi_remaining;
+	atomic_t		__bi_remaining;
 
 	bio_end_io_t		*bi_end_io;
 
@@ -122,6 +122,7 @@  struct bio {
 #define BIO_NULL_MAPPED 8	/* contains invalid user pages */
 #define BIO_QUIET	9	/* Make BIO Quiet */
 #define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
+#define BIO_CHAIN	11	/* chained bio, ->bi_remaining in effect */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes