diff mbox series

[V11,14/19] block: handle non-cluster bio out of blk_bio_segment_split

Message ID 20181121032327.8434-15-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series block: support multi-page bvec | expand

Commit Message

Ming Lei Nov. 21, 2018, 3:23 a.m. UTC
We will enable multi-page bvec soon, but non-cluster queue can't
handle the multi-page bvec at all. This patch borrows bounce's
idea to clone new single-page bio for non-cluster queue, and moves
its handling out of blk_bio_segment_split().

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/Makefile      |  3 ++-
 block/blk-merge.c   |  6 ++++-
 block/blk.h         |  2 ++
 block/non-cluster.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 block/non-cluster.c

Comments

Christoph Hellwig Nov. 21, 2018, 2:33 p.m. UTC | #1
> +			non-cluster.o

Do we really need a new source file for these few functions?

>  	default:
> +		if (!blk_queue_cluster(q)) {
> +			blk_queue_non_cluster_bio(q, bio);
> +			return;

I'd name this blk_bio_segment_split_singlepage or similar.

> +static __init int init_non_cluster_bioset(void)
> +{
> +	WARN_ON(bioset_init(&non_cluster_bio_set, BIO_POOL_SIZE, 0,
> +			   BIOSET_NEED_BVECS));
> +	WARN_ON(bioset_integrity_create(&non_cluster_bio_set, BIO_POOL_SIZE));
> +	WARN_ON(bioset_init(&non_cluster_bio_split, BIO_POOL_SIZE, 0, 0));

Please only allocate the resources once a queue without the cluster
flag is registered, there are only very few modern drivers that do that.

> +static void non_cluster_end_io(struct bio *bio)
> +{
> +	struct bio *bio_orig = bio->bi_private;
> +
> +	bio_orig->bi_status = bio->bi_status;
> +	bio_endio(bio_orig);
> +	bio_put(bio);
> +}

Why can't we use bio_chain for the split bios?

> +	bio_for_each_segment(from, *bio_orig, iter) {
> +		if (i++ < max_segs)
> +			sectors += from.bv_len >> 9;
> +		else
> +			break;
> +	}

The easy to read way would be:

	bio_for_each_segment(from, *bio_orig, iter) {
		if (i++ == max_segs)
			break;
		sectors += from.bv_len >> 9;
	}

> +	if (sectors < bio_sectors(*bio_orig)) {
> +		bio = bio_split(*bio_orig, sectors, GFP_NOIO,
> +				&non_cluster_bio_split);
> +		bio_chain(bio, *bio_orig);
> +		generic_make_request(*bio_orig);
> +		*bio_orig = bio;

I don't think this is very efficient, as this means we now
clone the bio twice, first to split it at the sector boundary,
and then again when converting it to single-page bio_vec.

I think this could be something like this (totally untested):

diff --git a/block/non-cluster.c b/block/non-cluster.c
index 9c2910be9404..60389f275c43 100644
--- a/block/non-cluster.c
+++ b/block/non-cluster.c
@@ -13,58 +13,59 @@
 
 #include "blk.h"
 
-static struct bio_set non_cluster_bio_set, non_cluster_bio_split;
+static struct bio_set non_cluster_bio_set;
 
 static __init int init_non_cluster_bioset(void)
 {
 	WARN_ON(bioset_init(&non_cluster_bio_set, BIO_POOL_SIZE, 0,
 			   BIOSET_NEED_BVECS));
 	WARN_ON(bioset_integrity_create(&non_cluster_bio_set, BIO_POOL_SIZE));
-	WARN_ON(bioset_init(&non_cluster_bio_split, BIO_POOL_SIZE, 0, 0));
 
 	return 0;
 }
 __initcall(init_non_cluster_bioset);
 
-static void non_cluster_end_io(struct bio *bio)
-{
-	struct bio *bio_orig = bio->bi_private;
-
-	bio_orig->bi_status = bio->bi_status;
-	bio_endio(bio_orig);
-	bio_put(bio);
-}
-
 void blk_queue_non_cluster_bio(struct request_queue *q, struct bio **bio_orig)
 {
-	struct bio *bio;
 	struct bvec_iter iter;
-	struct bio_vec from;
-	unsigned i = 0;
-	unsigned sectors = 0;
-	unsigned short max_segs = min_t(unsigned short, BIO_MAX_PAGES,
-					queue_max_segments(q));
+	struct bio *bio;
+	struct bio_vec bv;
+	unsigned short max_segs, segs = 0;
+
+	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(*bio_orig),
+			&non_cluster_bio_set);
+	bio->bi_disk		= (*bio_orig)->bi_disk;
+	bio->bi_partno		= (*bio_orig)->bi_partno;
+	bio_set_flag(bio, BIO_CLONED);
+	if (bio_flagged(*bio_orig, BIO_THROTTLED))
+		bio_set_flag(bio, BIO_THROTTLED);
+	bio->bi_opf		= (*bio_orig)->bi_opf;
+	bio->bi_ioprio		= (*bio_orig)->bi_ioprio;
+	bio->bi_write_hint	= (*bio_orig)->bi_write_hint;
+	bio->bi_iter.bi_sector	= (*bio_orig)->bi_iter.bi_sector;
+	bio->bi_iter.bi_size	= (*bio_orig)->bi_iter.bi_size;
+
+	if (bio_integrity(*bio_orig))
+		bio_integrity_clone(bio, *bio_orig, GFP_NOIO);
 
-	bio_for_each_segment(from, *bio_orig, iter) {
-		if (i++ < max_segs)
-			sectors += from.bv_len >> 9;
-		else
+	bio_clone_blkcg_association(bio, *bio_orig);
+
+	max_segs = min_t(unsigned short, queue_max_segments(q), BIO_MAX_PAGES);
+	bio_for_each_segment(bv, *bio_orig, iter) {
+		bio->bi_io_vec[segs++] = bv;
+		if (segs++ == max_segs)
 			break;
 	}
 
-	if (sectors < bio_sectors(*bio_orig)) {
-		bio = bio_split(*bio_orig, sectors, GFP_NOIO,
-				&non_cluster_bio_split);
-		bio_chain(bio, *bio_orig);
-		generic_make_request(*bio_orig);
-		*bio_orig = bio;
-	}
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, &non_cluster_bio_set);
+	bio->bi_vcnt = segs;
+	bio->bi_phys_segments = segs;
+	bio_set_flag(bio, BIO_SEG_VALID);
+	bio_chain(bio, *bio_orig);
 
-	bio->bi_phys_segments = bio_segments(bio);
-        bio_set_flag(bio, BIO_SEG_VALID);
-	bio->bi_end_io = non_cluster_end_io;
+	if (bio_integrity(bio))
+		bio_integrity_trim(bio);
+	bio_advance(bio, (*bio_orig)->bi_iter.bi_size);
 
-	bio->bi_private = *bio_orig;
+	generic_make_request(*bio_orig);
 	*bio_orig = bio;
 }
Ming Lei Nov. 21, 2018, 3:37 p.m. UTC | #2
On Wed, Nov 21, 2018 at 03:33:55PM +0100, Christoph Hellwig wrote:
> > +			non-cluster.o
> 
> Do we really need a new source file for these few functions?
> 
> >  	default:
> > +		if (!blk_queue_cluster(q)) {
> > +			blk_queue_non_cluster_bio(q, bio);
> > +			return;
> 
> I'd name this blk_bio_segment_split_singlepage or similar.

OK.

> 
> > +static __init int init_non_cluster_bioset(void)
> > +{
> > +	WARN_ON(bioset_init(&non_cluster_bio_set, BIO_POOL_SIZE, 0,
> > +			   BIOSET_NEED_BVECS));
> > +	WARN_ON(bioset_integrity_create(&non_cluster_bio_set, BIO_POOL_SIZE));
> > +	WARN_ON(bioset_init(&non_cluster_bio_split, BIO_POOL_SIZE, 0, 0));
> 
> Please only allocate the resources once a queue without the cluster
> flag is registered, there are only very few modern drivers that do that.

OK.

> 
> > +static void non_cluster_end_io(struct bio *bio)
> > +{
> > +	struct bio *bio_orig = bio->bi_private;
> > +
> > +	bio_orig->bi_status = bio->bi_status;
> > +	bio_endio(bio_orig);
> > +	bio_put(bio);
> > +}
> 
> Why can't we use bio_chain for the split bios?

The parent bio is multi-page bvec, we can't submit it for non-cluster.

> 
> > +	bio_for_each_segment(from, *bio_orig, iter) {
> > +		if (i++ < max_segs)
> > +			sectors += from.bv_len >> 9;
> > +		else
> > +			break;
> > +	}
> 
> The easy to read way would be:
> 
> 	bio_for_each_segment(from, *bio_orig, iter) {
> 		if (i++ == max_segs)
> 			break;
> 		sectors += from.bv_len >> 9;
> 	}

OK.

> 
> > +	if (sectors < bio_sectors(*bio_orig)) {
> > +		bio = bio_split(*bio_orig, sectors, GFP_NOIO,
> > +				&non_cluster_bio_split);
> > +		bio_chain(bio, *bio_orig);
> > +		generic_make_request(*bio_orig);
> > +		*bio_orig = bio;
> 
> I don't think this is very efficient, as this means we now
> clone the bio twice, first to split it at the sector boundary,
> and then again when converting it to single-page bio_vec.

That is exactly what bounce code does. The problem for both bounce
and non-cluster is same actually because the bvec table itself has
to be changed.

> 
> I think this could be something like this (totally untested):
> 
> diff --git a/block/non-cluster.c b/block/non-cluster.c
> index 9c2910be9404..60389f275c43 100644
> --- a/block/non-cluster.c
> +++ b/block/non-cluster.c
> @@ -13,58 +13,59 @@
>  
>  #include "blk.h"
>  
> -static struct bio_set non_cluster_bio_set, non_cluster_bio_split;
> +static struct bio_set non_cluster_bio_set;
>  
>  static __init int init_non_cluster_bioset(void)
>  {
>  	WARN_ON(bioset_init(&non_cluster_bio_set, BIO_POOL_SIZE, 0,
>  			   BIOSET_NEED_BVECS));
>  	WARN_ON(bioset_integrity_create(&non_cluster_bio_set, BIO_POOL_SIZE));
> -	WARN_ON(bioset_init(&non_cluster_bio_split, BIO_POOL_SIZE, 0, 0));
>  
>  	return 0;
>  }
>  __initcall(init_non_cluster_bioset);
>  
> -static void non_cluster_end_io(struct bio *bio)
> -{
> -	struct bio *bio_orig = bio->bi_private;
> -
> -	bio_orig->bi_status = bio->bi_status;
> -	bio_endio(bio_orig);
> -	bio_put(bio);
> -}
> -
>  void blk_queue_non_cluster_bio(struct request_queue *q, struct bio **bio_orig)
>  {
> -	struct bio *bio;
>  	struct bvec_iter iter;
> -	struct bio_vec from;
> -	unsigned i = 0;
> -	unsigned sectors = 0;
> -	unsigned short max_segs = min_t(unsigned short, BIO_MAX_PAGES,
> -					queue_max_segments(q));
> +	struct bio *bio;
> +	struct bio_vec bv;
> +	unsigned short max_segs, segs = 0;
> +
> +	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(*bio_orig),
> +			&non_cluster_bio_set);

bio_segments(*bio_orig) may be > 256, so bio_alloc_bioset() may fail.

Thanks,
Ming
Christoph Hellwig Nov. 21, 2018, 4:11 p.m. UTC | #3
On Wed, Nov 21, 2018 at 11:37:27PM +0800, Ming Lei wrote:
> > +	bio = bio_alloc_bioset(GFP_NOIO, bio_segments(*bio_orig),
> > +			&non_cluster_bio_set);
> 
> bio_segments(*bio_orig) may be > 256, so bio_alloc_bioset() may fail.

Nothing a little min with BIO_MAX_PAGES couldn't fix.  This patch
was just intended as an idea how I think this code could work.
Christoph Hellwig Nov. 21, 2018, 5:46 p.m. UTC | #4
Actually..

I think we can kill this code entirely.  If we look at what the
clustering setting is really about it is to avoid ever merging a
segement that spans a page boundary.  And we should be able to do
that with something like this before your series:

---
From 0d46fa76c376493a74ea0dbe77305bd5fa2cf011 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 21 Nov 2018 18:39:47 +0100
Subject: block: remove the "cluster" flag

The cluster flag implements some very old SCSI behavior.  As far as I
can tell the original intent was to enable or disable any kind of
segment merging.  But the actually visible effect to the LLDD is that
it limits each segments to be inside a single page, which we can
also affect by setting the maximum segment size and the virt
boundary.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-merge.c       | 20 ++++++++------------
 block/blk-settings.c    |  3 ---
 block/blk-sysfs.c       |  5 +----
 drivers/scsi/scsi_lib.c | 16 +++++++++++++---
 include/linux/blkdev.h  |  6 ------
 5 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6be04ef8da5b..e69d8f8ba819 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -195,7 +195,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			goto split;
 		}
 
-		if (bvprvp && blk_queue_cluster(q)) {
+		if (bvprvp) {
 			if (seg_size + bv.bv_len > queue_max_segment_size(q))
 				goto new_segment;
 			if (!biovec_phys_mergeable(q, bvprvp, &bv))
@@ -295,10 +295,10 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     bool no_sg_merge)
 {
 	struct bio_vec bv, bvprv = { NULL };
-	int cluster, prev = 0;
 	unsigned int seg_size, nr_phys_segs;
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
+	bool prev = false;
 
 	if (!bio)
 		return 0;
@@ -313,7 +313,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	}
 
 	fbio = bio;
-	cluster = blk_queue_cluster(q);
 	seg_size = 0;
 	nr_phys_segs = 0;
 	for_each_bio(bio) {
@@ -325,7 +324,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			if (no_sg_merge)
 				goto new_segment;
 
-			if (prev && cluster) {
+			if (prev) {
 				if (seg_size + bv.bv_len
 				    > queue_max_segment_size(q))
 					goto new_segment;
@@ -343,7 +342,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 
 			nr_phys_segs++;
 			bvprv = bv;
-			prev = 1;
+			prev = true;
 			seg_size = bv.bv_len;
 		}
 		bbio = bio;
@@ -396,9 +395,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 {
 	struct bio_vec end_bv = { NULL }, nxt_bv;
 
-	if (!blk_queue_cluster(q))
-		return 0;
-
 	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
 	    queue_max_segment_size(q))
 		return 0;
@@ -415,12 +411,12 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 static inline void
 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 		     struct scatterlist *sglist, struct bio_vec *bvprv,
-		     struct scatterlist **sg, int *nsegs, int *cluster)
+		     struct scatterlist **sg, int *nsegs)
 {
 
 	int nbytes = bvec->bv_len;
 
-	if (*sg && *cluster) {
+	if (*sg) {
 		if ((*sg)->length + nbytes > queue_max_segment_size(q))
 			goto new_segment;
 		if (!biovec_phys_mergeable(q, bvprv, bvec))
@@ -466,12 +462,12 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
 {
 	struct bio_vec bvec, bvprv = { NULL };
 	struct bvec_iter iter;
-	int cluster = blk_queue_cluster(q), nsegs = 0;
+	int nsegs = 0;
 
 	for_each_bio(bio)
 		bio_for_each_segment(bvec, bio, iter)
 			__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
-					     &nsegs, &cluster);
+					     &nsegs);
 
 	return nsegs;
 }
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3abe831e92c8..3e7038e475ee 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -56,7 +56,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->alignment_offset = 0;
 	lim->io_opt = 0;
 	lim->misaligned = 0;
-	lim->cluster = 1;
 	lim->zoned = BLK_ZONED_NONE;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
@@ -547,8 +546,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_min = max(t->io_min, b->io_min);
 	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
-	t->cluster &= b->cluster;
-
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
 		t->physical_block_size = t->logical_block_size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..ef7b844a3e00 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -132,10 +132,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *
 
 static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
 {
-	if (blk_queue_cluster(q))
-		return queue_var_show(queue_max_segment_size(q), (page));
-
-	return queue_var_show(PAGE_SIZE, (page));
+	return queue_var_show(queue_max_segment_size(q), page);
 }
 
 static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0df15cb738d2..c1ea50962286 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1810,6 +1810,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
 void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 {
 	struct device *dev = shost->dma_dev;
+	unsigned max_segment_size = dma_get_max_seg_size(dev);
 
 	/*
 	 * this limit is imposed by hardware restrictions
@@ -1831,10 +1832,19 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 	blk_queue_segment_boundary(q, shost->dma_boundary);
 	dma_set_seg_boundary(dev, shost->dma_boundary);
 
-	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
+	/*
+	 * Clustering is a really old concept from the stone age of Linux
+	 * SCSI support.  But the basic idea is that we never give the
+	 * driver a segment that spans multiple pages.  For that we need
+	 * to limit the segment size, and set the virt boundary so that
+	 * we never merge a second segment which is no page aligned.
+	 */
+	if (!shost->use_clustering) {
+		blk_queue_virt_boundary(q, PAGE_SIZE - 1);
+		max_segment_size = min_t(unsigned, max_segment_size, PAGE_SIZE);
+	}
 
-	if (!shost->use_clustering)
-		q->limits.cluster = 0;
+	blk_queue_max_segment_size(q, max_segment_size);
 
 	/*
 	 * Set a reasonable default alignment:  The larger of 32-byte (dword),
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9b53db06ad08..399a7a415609 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -341,7 +341,6 @@ struct queue_limits {
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
-	unsigned char		cluster;
 	unsigned char		raid_partial_stripes_expensive;
 	enum blk_zoned_model	zoned;
 };
@@ -660,11 +659,6 @@ static inline bool queue_is_mq(struct request_queue *q)
 	return q->mq_ops;
 }
 
-static inline unsigned int blk_queue_cluster(struct request_queue *q)
-{
-	return q->limits.cluster;
-}
-
 static inline enum blk_zoned_model
 blk_queue_zoned_model(struct request_queue *q)
 {
Ming Lei Nov. 22, 2018, 9:33 a.m. UTC | #5
On Wed, Nov 21, 2018 at 06:46:21PM +0100, Christoph Hellwig wrote:
> Actually..
> 
> I think we can kill this code entirely.  If we look at what the
> clustering setting is really about it is to avoid ever merging a
> segement that spans a page boundary.  And we should be able to do
> that with something like this before your series:
> 
> ---
> From 0d46fa76c376493a74ea0dbe77305bd5fa2cf011 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Wed, 21 Nov 2018 18:39:47 +0100
> Subject: block: remove the "cluster" flag
> 
> The cluster flag implements some very old SCSI behavior.  As far as I
> can tell the original intent was to enable or disable any kind of
> segment merging.  But the actually visible effect to the LLDD is that
> it limits each segments to be inside a single page, which we can
> also affect by setting the maximum segment size and the virt
> boundary.

This approach is pretty good given we can do post-split during mapping
sg.

However, using virt boundary limit on non-cluster seems over-kill,
because the bio will be over-split(each small bvec may be split as one bio)
if it includes lots of small segment.

What we want to do is just to avoid to merge bvecs to segment, which
should have been done by NO_SG_MERGE simply. However, after multi-page
is enabled, two adjacent bvecs won't be merged any more, I just forget
to remove the bvec merge code in V11.

So seems we can simply avoid to use virt boundary limit for non-cluster
after multipage bvec is enabled?


thanks,
Ming
Christoph Hellwig Nov. 22, 2018, 10:04 a.m. UTC | #6
On Thu, Nov 22, 2018 at 05:33:00PM +0800, Ming Lei wrote:
> However, using virt boundary limit on non-cluster seems over-kill,
> because the bio will be over-split(each small bvec may be split as one bio)
> if it includes lots of small segment.

The combination of the virt boundary of PAGE_SIZE - 1 and a
max_segment_size of PAGE_SIZE will only split if the to me merged
segment is in a different page than the previous one, which is exactly
what we need here.  Multiple small bvec inside the same page (e.g.
512 byte buffer_heads) will still be merged.

> What we want to do is just to avoid to merge bvecs to segment, which
> should have been done by NO_SG_MERGE simply. However, after multi-page
> is enabled, two adjacent bvecs won't be merged any more, I just forget
> to remove the bvec merge code in V11.
> 
> So seems we can simply avoid to use virt boundary limit for non-cluster
> after multipage bvec is enabled?

No, we can't just remove it.  As explained in the patch there is one very
visible difference of setting the flag amd that is no segment will span a
page boundary, and at least the iSCSI code seems to rely on that.
Ming Lei Nov. 22, 2018, 10:26 a.m. UTC | #7
On Thu, Nov 22, 2018 at 11:04:28AM +0100, Christoph Hellwig wrote:
> On Thu, Nov 22, 2018 at 05:33:00PM +0800, Ming Lei wrote:
> > However, using virt boundary limit on non-cluster seems over-kill,
> > because the bio will be over-split(each small bvec may be split as one bio)
> > if it includes lots of small segment.
> 
> The combination of the virt boundary of PAGE_SIZE - 1 and a
> max_segment_size of PAGE_SIZE will only split if the to me merged
> segment is in a different page than the previous one, which is exactly
> what we need here.  Multiple small bvec inside the same page (e.g.
> 512 byte buffer_heads) will still be merged.

Suppose one bio includes (pg0, 0, 512) and (pg1, 512, 512):

The split is introduced by the following code in blk_bio_segment_split():

      if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
		  	goto split;

Without this patch, for non-cluster, the two bvecs are just in different
segment, but still handled by one same bio. Now you convert into two bios.

Thanks,
Ming
Ming Lei Nov. 22, 2018, 10:32 a.m. UTC | #8
On Thu, Nov 22, 2018 at 11:04:28AM +0100, Christoph Hellwig wrote:
> On Thu, Nov 22, 2018 at 05:33:00PM +0800, Ming Lei wrote:
> > However, using virt boundary limit on non-cluster seems over-kill,
> > because the bio will be over-split(each small bvec may be split as one bio)
> > if it includes lots of small segment.
> 
> The combination of the virt boundary of PAGE_SIZE - 1 and a
> max_segment_size of PAGE_SIZE will only split if the to me merged
> segment is in a different page than the previous one, which is exactly
> what we need here.  Multiple small bvec inside the same page (e.g.
> 512 byte buffer_heads) will still be merged.
> 
> > What we want to do is just to avoid to merge bvecs to segment, which
> > should have been done by NO_SG_MERGE simply. However, after multi-page
> > is enabled, two adjacent bvecs won't be merged any more, I just forget
> > to remove the bvec merge code in V11.
> > 
> > So seems we can simply avoid to use virt boundary limit for non-cluster
> > after multipage bvec is enabled?
> 
> No, we can't just remove it.  As explained in the patch there is one very
> visible difference of setting the flag amd that is no segment will span a
> page boundary, and at least the iSCSI code seems to rely on that.

IMO, we should use queue_segment_boundary() to enhance the rule during splitting
segment after multi-page bvec is enabled.

Seems we miss the segment boundary limit in bvec_split_segs().

Thanks,
Ming
Christoph Hellwig Nov. 22, 2018, 10:40 a.m. UTC | #9
On Thu, Nov 22, 2018 at 06:26:17PM +0800, Ming Lei wrote:
> Suppose one bio includes (pg0, 0, 512) and (pg1, 512, 512):
> 
> The split is introduced by the following code in blk_bio_segment_split():
> 
>       if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
> 		  	goto split;
> 
> Without this patch, for non-cluster, the two bvecs are just in different
> segment, but still handled by one same bio. Now you convert into two bios.

Oh, true - we create new bios instead of new segments.  So I guess we
need to do something special here if we don't want to pay that overhead.

Or just look into killing the cluster setting..
Christoph Hellwig Nov. 22, 2018, 10:41 a.m. UTC | #10
On Thu, Nov 22, 2018 at 06:32:09PM +0800, Ming Lei wrote:
> On Thu, Nov 22, 2018 at 11:04:28AM +0100, Christoph Hellwig wrote:
> > On Thu, Nov 22, 2018 at 05:33:00PM +0800, Ming Lei wrote:
> > > However, using virt boundary limit on non-cluster seems over-kill,
> > > because the bio will be over-split(each small bvec may be split as one bio)
> > > if it includes lots of small segment.
> > 
> > The combination of the virt boundary of PAGE_SIZE - 1 and a
> > max_segment_size of PAGE_SIZE will only split if the to me merged
> > segment is in a different page than the previous one, which is exactly
> > what we need here.  Multiple small bvec inside the same page (e.g.
> > 512 byte buffer_heads) will still be merged.
> > 
> > > What we want to do is just to avoid to merge bvecs to segment, which
> > > should have been done by NO_SG_MERGE simply. However, after multi-page
> > > is enabled, two adjacent bvecs won't be merged any more, I just forget
> > > to remove the bvec merge code in V11.
> > > 
> > > So seems we can simply avoid to use virt boundary limit for non-cluster
> > > after multipage bvec is enabled?
> > 
> > No, we can't just remove it.  As explained in the patch there is one very
> > visible difference of setting the flag amd that is no segment will span a
> > page boundary, and at least the iSCSI code seems to rely on that.
> 
> IMO, we should use queue_segment_boundary() to enhance the rule during splitting
> segment after multi-page bvec is enabled.
> 
> Seems we miss the segment boundary limit in bvec_split_segs().

Yes, that looks like the right fix!
Ming Lei Nov. 22, 2018, 10:46 a.m. UTC | #11
On Thu, Nov 22, 2018 at 11:41:50AM +0100, Christoph Hellwig wrote:
> On Thu, Nov 22, 2018 at 06:32:09PM +0800, Ming Lei wrote:
> > On Thu, Nov 22, 2018 at 11:04:28AM +0100, Christoph Hellwig wrote:
> > > On Thu, Nov 22, 2018 at 05:33:00PM +0800, Ming Lei wrote:
> > > > However, using virt boundary limit on non-cluster seems over-kill,
> > > > because the bio will be over-split(each small bvec may be split as one bio)
> > > > if it includes lots of small segment.
> > > 
> > > The combination of the virt boundary of PAGE_SIZE - 1 and a
> > > max_segment_size of PAGE_SIZE will only split if the to me merged
> > > segment is in a different page than the previous one, which is exactly
> > > what we need here.  Multiple small bvec inside the same page (e.g.
> > > 512 byte buffer_heads) will still be merged.
> > > 
> > > > What we want to do is just to avoid to merge bvecs to segment, which
> > > > should have been done by NO_SG_MERGE simply. However, after multi-page
> > > > is enabled, two adjacent bvecs won't be merged any more, I just forget
> > > > to remove the bvec merge code in V11.
> > > > 
> > > > So seems we can simply avoid to use virt boundary limit for non-cluster
> > > > after multipage bvec is enabled?
> > > 
> > > No, we can't just remove it.  As explained in the patch there is one very
> > > visible difference of setting the flag amd that is no segment will span a
> > > page boundary, and at least the iSCSI code seems to rely on that.
> > 
> > IMO, we should use queue_segment_boundary() to enhance the rule during splitting
> > segment after multi-page bvec is enabled.
> > 
> > Seems we miss the segment boundary limit in bvec_split_segs().
> 
> Yes, that looks like the right fix!

Then your patch should work by just replacing virt boundary with segment
boudary limit. I will do that change in V12 if you don't object.


Thanks,
Ming
Christoph Hellwig Nov. 22, 2018, 10:47 a.m. UTC | #12
On Thu, Nov 22, 2018 at 06:46:05PM +0800, Ming Lei wrote:
> Then your patch should work by just replacing virt boundary with segment
> boudary limit. I will do that change in V12 if you don't object.

Please do, thanks a lot.
diff mbox series

Patch

diff --git a/block/Makefile b/block/Makefile
index eee1b4ceecf9..e07d59438c4b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,8 @@  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o partition-generic.o ioprio.o \
-			badblocks.o partitions/ blk-rq-qos.o
+			badblocks.o partitions/ blk-rq-qos.o \
+			non-cluster.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_ioctl.o
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8829c51b4e75..7c44216c1b58 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -247,7 +247,7 @@  static struct bio *blk_bio_segment_split(struct request_queue *q,
 			goto split;
 		}
 
-		if (bvprvp && blk_queue_cluster(q)) {
+		if (bvprvp) {
 			if (seg_size + bv.bv_len > queue_max_segment_size(q))
 				goto new_segment;
 			if (!biovec_phys_mergeable(q, bvprvp, &bv))
@@ -307,6 +307,10 @@  void blk_queue_split(struct request_queue *q, struct bio **bio)
 		split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	default:
+		if (!blk_queue_cluster(q)) {
+			blk_queue_non_cluster_bio(q, bio);
+			return;
+		}
 		split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	}
diff --git a/block/blk.h b/block/blk.h
index 31c0e45aba3a..6fc5821ced55 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -338,6 +338,8 @@  struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
 
 struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs);
 
+void blk_queue_non_cluster_bio(struct request_queue *q, struct bio **bio_orig);
+
 #ifdef CONFIG_BLK_DEV_ZONED
 void blk_queue_free_zone_bitmaps(struct request_queue *q);
 #else
diff --git a/block/non-cluster.c b/block/non-cluster.c
new file mode 100644
index 000000000000..9c2910be9404
--- /dev/null
+++ b/block/non-cluster.c
@@ -0,0 +1,70 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* non-cluster handling for block devices */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/init.h>
+#include <linux/printk.h>
+
+#include "blk.h"
+
+static struct bio_set non_cluster_bio_set, non_cluster_bio_split;
+
+static __init int init_non_cluster_bioset(void)
+{
+	WARN_ON(bioset_init(&non_cluster_bio_set, BIO_POOL_SIZE, 0,
+			   BIOSET_NEED_BVECS));
+	WARN_ON(bioset_integrity_create(&non_cluster_bio_set, BIO_POOL_SIZE));
+	WARN_ON(bioset_init(&non_cluster_bio_split, BIO_POOL_SIZE, 0, 0));
+
+	return 0;
+}
+__initcall(init_non_cluster_bioset);
+
+static void non_cluster_end_io(struct bio *bio)
+{
+	struct bio *bio_orig = bio->bi_private;
+
+	bio_orig->bi_status = bio->bi_status;
+	bio_endio(bio_orig);
+	bio_put(bio);
+}
+
+void blk_queue_non_cluster_bio(struct request_queue *q, struct bio **bio_orig)
+{
+	struct bio *bio;
+	struct bvec_iter iter;
+	struct bio_vec from;
+	unsigned i = 0;
+	unsigned sectors = 0;
+	unsigned short max_segs = min_t(unsigned short, BIO_MAX_PAGES,
+					queue_max_segments(q));
+
+	bio_for_each_segment(from, *bio_orig, iter) {
+		if (i++ < max_segs)
+			sectors += from.bv_len >> 9;
+		else
+			break;
+	}
+
+	if (sectors < bio_sectors(*bio_orig)) {
+		bio = bio_split(*bio_orig, sectors, GFP_NOIO,
+				&non_cluster_bio_split);
+		bio_chain(bio, *bio_orig);
+		generic_make_request(*bio_orig);
+		*bio_orig = bio;
+	}
+	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, &non_cluster_bio_set);
+
+	bio->bi_phys_segments = bio_segments(bio);
+        bio_set_flag(bio, BIO_SEG_VALID);
+	bio->bi_end_io = non_cluster_end_io;
+
+	bio->bi_private = *bio_orig;
+	*bio_orig = bio;
+}