diff mbox series

[v3,3/3] md: Use optimal I/O size for last bitmap page

Message ID 20230222215828.225-4-jonathan.derrick@linux.dev (mailing list archive)
State Superseded, archived
Headers show
Series md/bitmap: Optimal last page size | expand

Commit Message

Jonathan Derrick Feb. 22, 2023, 9:58 p.m. UTC
From: Jon Derrick <jonathan.derrick@linux.dev>

If the bitmap space has enough room, size the I/O for the last bitmap
page write to the optimal I/O size for the storage device. The expanded
write is checked to ensure that it won't overrun the data or metadata.

The drive this was tested against has higher latencies when there are
sub-4k writes due to device-side read-mod-writes of its atomic 4k write
unit. This change helps increase performance by sizing the last bitmap
page I/O for the device's preferred write unit, if it is given.

Example Intel/Solidigm P5520
Raid10, Chunk-size 64M, bitmap-size 57228 bits

$ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1
        --assume-clean --bitmap=internal --bitmap-chunk=64M
$ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60

Without patch:
  write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets

With patch:
  write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets

Biosnoop:
Without patch:
Time        Process        PID     Device      LBA        Size      Lat
1.410377    md0_raid10     6900    nvme0n1   W 16         4096      0.02
1.410387    md0_raid10     6900    nvme2n1   W 16         4096      0.02
1.410374    md0_raid10     6900    nvme3n1   W 16         4096      0.01
1.410381    md0_raid10     6900    nvme1n1   W 16         4096      0.02
1.410411    md0_raid10     6900    nvme1n1   W 115346512  4096      0.01
1.410418    md0_raid10     6900    nvme0n1   W 115346512  4096      0.02
1.410915    md0_raid10     6900    nvme2n1   W 24         3584      0.43 <--
1.410935    md0_raid10     6900    nvme3n1   W 24         3584      0.45 <--
1.411124    md0_raid10     6900    nvme1n1   W 24         3584      0.64 <--
1.411147    md0_raid10     6900    nvme0n1   W 24         3584      0.66 <--
1.411176    md0_raid10     6900    nvme3n1   W 2019022184 4096      0.01
1.411189    md0_raid10     6900    nvme2n1   W 2019022184 4096      0.02

With patch:
Time        Process        PID     Device      LBA        Size      Lat
5.747193    md0_raid10     727     nvme0n1   W 16         4096      0.01
5.747192    md0_raid10     727     nvme1n1   W 16         4096      0.02
5.747195    md0_raid10     727     nvme3n1   W 16         4096      0.01
5.747202    md0_raid10     727     nvme2n1   W 16         4096      0.02
5.747229    md0_raid10     727     nvme3n1   W 1196223704 4096      0.02
5.747224    md0_raid10     727     nvme0n1   W 1196223704 4096      0.01
5.747279    md0_raid10     727     nvme0n1   W 24         4096      0.01 <--
5.747279    md0_raid10     727     nvme1n1   W 24         4096      0.02 <--
5.747284    md0_raid10     727     nvme3n1   W 24         4096      0.02 <--
5.747291    md0_raid10     727     nvme2n1   W 24         4096      0.02 <--
5.747314    md0_raid10     727     nvme2n1   W 2234636712 4096      0.01
5.747317    md0_raid10     727     nvme1n1   W 2234636712 4096      0.02

Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
---
 drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

Comments

Christoph Hellwig Feb. 22, 2023, 11:42 p.m. UTC | #1
On Wed, Feb 22, 2023 at 02:58:28PM -0700, Jonathan Derrick wrote:
> +	if (io_size != opt_size &&
> +	    start + opt_size / SECTOR_SIZE <= boundary)
> +		return opt_size;
> +	else if (start + io_size / SECTOR_SIZE <= boundary)

No need for an else after a return.

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reindl Harald Feb. 22, 2023, 11:48 p.m. UTC | #2
Am 23.02.23 um 00:42 schrieb Christoph Hellwig:
> On Wed, Feb 22, 2023 at 02:58:28PM -0700, Jonathan Derrick wrote:
>> +	if (io_size != opt_size &&
>> +	    start + opt_size / SECTOR_SIZE <= boundary)
>> +		return opt_size;
>> +	else if (start + io_size / SECTOR_SIZE <= boundary)
> 
> No need for an else after a return.
> 
> Otherwise looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

the "return" is within the if-condition and has nothing to do with the 
else - with {} it would be clearly visible
Christoph Hellwig Feb. 22, 2023, 11:53 p.m. UTC | #3
On Thu, Feb 23, 2023 at 12:48:58AM +0100, Reindl Harald wrote:
> > > +	if (io_size != opt_size &&
> > > +	    start + opt_size / SECTOR_SIZE <= boundary)
> > > +		return opt_size;
> > > +	else if (start + io_size / SECTOR_SIZE <= boundary)
> > 
> > No need for an else after a return.
> > 
> > Otherwise looks good:
> > 
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> the "return" is within the if-condition and has nothing to do with the else
> - with {} it would be clearly visible

That doesn't change the fact that it's not actually needed.
diff mbox series

Patch

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 11f4453775ee..196984fc3776 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -209,6 +209,28 @@  static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	return NULL;
 }
 
+/* Last-page write size: rounded up to io_opt when io_opt > logical block. */
+static unsigned int optimal_io_size(struct block_device *bdev,
+				    unsigned int last_page_size,
+				    unsigned int io_size)
+{
+	return bdev_io_opt(bdev) > bdev_logical_block_size(bdev) ?
+		roundup(last_page_size, bdev_io_opt(bdev)) : io_size;
+}
+
+/* Prefer opt_size, else io_size; 0 if neither ends at or before boundary. */
+static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
+				   loff_t start, loff_t boundary)
+{
+	/* signed math: start is negative in the DATA BITMAP METADATA layout */
+	if (io_size != opt_size &&
+	    start + opt_size / SECTOR_SIZE <= boundary)
+		return opt_size;
+	if (start + io_size / SECTOR_SIZE <= boundary)
+		return io_size;
+	return 0;	/* neither size fits before boundary */
+}
+
 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			   struct page *page)
 {
@@ -218,14 +240,15 @@  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	sector_t offset = mddev->bitmap_info.offset;
 	sector_t ps, sboff, doff;
 	unsigned int size = PAGE_SIZE;
+	unsigned int opt_size = PAGE_SIZE;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 	if (page->index == store->file_pages - 1) {
 		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
 		if (last_page_size == 0)
 			last_page_size = PAGE_SIZE;
-		size = roundup(last_page_size,
-			       bdev_logical_block_size(bdev));
+		size = roundup(last_page_size, bdev_logical_block_size(bdev));
+		opt_size = optimal_io_size(bdev, last_page_size, size);
 	}
 
 	ps = page->index * (PAGE_SIZE / SECTOR_SIZE);
@@ -240,7 +263,8 @@  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (offset < 0) {
 		/* DATA  BITMAP METADATA  */
-		if (offset + ps + size / SECTOR_SIZE > 0)
+		size = bitmap_io_size(size, opt_size, offset + ps, 0);
+		if (size == 0)
 			/* bitmap runs in to metadata */
 			return -EINVAL;
 
@@ -249,7 +273,8 @@  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (rdev->sb_start < rdev->data_offset) {
 		/* METADATA BITMAP DATA */
-		if (sboff + ps + size / SECTOR_SIZE > doff)
+		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
+		if (size == 0)
 			/* bitmap runs in to data */
 			return -EINVAL;
 	} else {