Message ID | 20230224183323.638-4-jonathan.derrick@linux.dev
---|---
State | Accepted, archived
Series | md/bitmap: Optimal last page size
Hi Jonathan

I did a test in my environment, but I didn't see such a big
performance difference.

The first environment:
All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
I used your way to rebuild the kernel
/sys/block/nvme0n1/queue/physical_block_size 512
/sys/block/nvme0n1/queue/optimal_io_size 4096
cat /sys/block/nvme0n1/queue/logical_block_size 512

without the patch set
write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets

The second environment:
The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
/sys/block/nvme0n1/queue/logical_block_size
/sys/block/nvme0n1/queue/physical_block_size
/sys/block/nvme0n1/queue/optimal_io_size

without the patch set
write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets

Best Regards
Xiao

On Sat, Feb 25, 2023 at 2:34 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> From: Jon Derrick <jonathan.derrick@linux.dev>
>
> If the bitmap space has enough room, size the I/O for the last bitmap
> page write to the optimal I/O size for the storage device. The expanded
> write is checked to ensure it won't overrun the data or metadata.
>
> The drive this was tested against has higher latencies when there are
> sub-4k writes, due to device-side read-modify-writes of its atomic 4k
> write unit. This change helps increase performance by sizing the last
> bitmap page I/O for the device's preferred write unit, if one is given.
>
> Example: Intel/Solidigm P5520
> Raid10, Chunk-size 64M, bitmap-size 57228 bits
>
> $ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1
>     --assume-clean --bitmap=internal --bitmap-chunk=64M
> $ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60
>
> Without patch:
> write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets
>
> With patch:
> write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets
>
> Biosnoop:
> Without patch:
> Time      Process     PID   Device      LBA         Size  Lat
> 1.410377  md0_raid10  6900  nvme0n1  W  16          4096  0.02
> 1.410387  md0_raid10  6900  nvme2n1  W  16          4096  0.02
> 1.410374  md0_raid10  6900  nvme3n1  W  16          4096  0.01
> 1.410381  md0_raid10  6900  nvme1n1  W  16          4096  0.02
> 1.410411  md0_raid10  6900  nvme1n1  W  115346512   4096  0.01
> 1.410418  md0_raid10  6900  nvme0n1  W  115346512   4096  0.02
> 1.410915  md0_raid10  6900  nvme2n1  W  24          3584  0.43 <--
> 1.410935  md0_raid10  6900  nvme3n1  W  24          3584  0.45 <--
> 1.411124  md0_raid10  6900  nvme1n1  W  24          3584  0.64 <--
> 1.411147  md0_raid10  6900  nvme0n1  W  24          3584  0.66 <--
> 1.411176  md0_raid10  6900  nvme3n1  W  2019022184  4096  0.01
> 1.411189  md0_raid10  6900  nvme2n1  W  2019022184  4096  0.02
>
> With patch:
> Time      Process     PID   Device      LBA         Size  Lat
> 5.747193  md0_raid10  727   nvme0n1  W  16          4096  0.01
> 5.747192  md0_raid10  727   nvme1n1  W  16          4096  0.02
> 5.747195  md0_raid10  727   nvme3n1  W  16          4096  0.01
> 5.747202  md0_raid10  727   nvme2n1  W  16          4096  0.02
> 5.747229  md0_raid10  727   nvme3n1  W  1196223704  4096  0.02
> 5.747224  md0_raid10  727   nvme0n1  W  1196223704  4096  0.01
> 5.747279  md0_raid10  727   nvme0n1  W  24          4096  0.01 <--
> 5.747279  md0_raid10  727   nvme1n1  W  24          4096  0.02 <--
> 5.747284  md0_raid10  727   nvme3n1  W  24          4096  0.02 <--
> 5.747291  md0_raid10  727   nvme2n1  W  24          4096  0.02 <--
> 5.747314  md0_raid10  727   nvme2n1  W  2234636712  4096  0.01
> 5.747317  md0_raid10  727   nvme1n1  W  2234636712  4096  0.02
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
> ---
>  drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
>
> [...]
> --
> 2.27.0
Hi Xiao

On 2/26/2023 6:56 PM, Xiao Ni wrote:
> Hi Jonathan
>
> I did a test in my environment, but I didn't see such a big
> performance difference.
>
> The first environment:
> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
> I used your way to rebuild the kernel
> /sys/block/nvme0n1/queue/physical_block_size 512
> /sys/block/nvme0n1/queue/optimal_io_size 4096
> cat /sys/block/nvme0n1/queue/logical_block_size 512
>
> without the patch set
> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
> with the patch set
> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
>
> The second environment:
> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
> /sys/block/nvme0n1/queue/logical_block_size
> /sys/block/nvme0n1/queue/physical_block_size
> /sys/block/nvme0n1/queue/optimal_io_size
>
> without the patch set
> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
> with the patch set
> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
>
Sounds like your devices may not have latency issues at sub-optimal sizes.
Can you provide biosnoop traces with and without patches?

Still, 'works fine for me' is generally not a reason to reject the patches.

> Best Regards
> Xiao
>
> On Sat, Feb 25, 2023 at 2:34 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>> [...]
On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> Hi Xiao
>
> On 2/26/2023 6:56 PM, Xiao Ni wrote:
> > [...]
>
> Sounds like your devices may not have latency issues at sub-optimal sizes.
> Can you provide biosnoop traces with and without patches?
>
> Still, 'works fine for me' is generally not a reason to reject the patches.

Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
These are the rpm packages I've installed:
bcc-tools-0.25.0-1.fc38.x86_64
bcc-0.25.0-1.fc38.x86_64
python3-bcc-0.25.0-1.fc38.noarch

Are there other packages that I need to install?

Regards
Xiao
On 3/1/2023 5:36 AM, Xiao Ni wrote:
> On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>> [...]
>>
>> Sounds like your devices may not have latency issues at sub-optimal sizes.
>> Can you provide biosnoop traces with and without patches?
>>
>> Still, 'works fine for me' is generally not a reason to reject the patches.
>
> Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
> These are the rpm packages I've installed:
> bcc-tools-0.25.0-1.fc38.x86_64
> bcc-0.25.0-1.fc38.x86_64
> python3-bcc-0.25.0-1.fc38.noarch
>
> Are there other packages that I need to install?
>
I've had issues with the packaged versions as well

Best to install from source:
https://github.com/iovisor/bcc/
https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source

> Regards
> Xiao
>
On Thu, Mar 2, 2023 at 1:57 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> On 3/1/2023 5:36 AM, Xiao Ni wrote:
> > On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
> > <jonathan.derrick@linux.dev> wrote:
> >> [...]
> >
> > Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
> > These are the rpm packages I've installed:
> > bcc-tools-0.25.0-1.fc38.x86_64
> > bcc-0.25.0-1.fc38.x86_64
> > python3-bcc-0.25.0-1.fc38.noarch
> >
> > Are there other packages that I need to install?
> >
> I've had issues with the packaged versions as well
>
> Best to install from source:
> https://github.com/iovisor/bcc/
> https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source
>

Hi Jonathan

I did a test without modifying phys_size and opt_size. And I picked up a part
of the result:

0.172142 md0_raid10 2094 nvme1n1 W 1225496264 4096 0.01
0.172145 md0_raid10 2094 nvme0n1 W 1225496264 4096 0.01
0.172161 md0_raid10 2094 nvme3n1 W 16         4096 0.01
0.172164 md0_raid10 2094 nvme2n1 W 16         4096 0.01
0.172166 md0_raid10 2094 nvme1n1 W 16         4096 0.01
0.172168 md0_raid10 2094 nvme0n1 W 16         4096 0.01
0.172178 md0_raid10 2094 nvme3n1 W 633254624  4096 0.01
0.172180 md0_raid10 2094 nvme2n1 W 633254624  4096 0.01
0.172196 md0_raid10 2094 nvme3n1 W 16         4096 0.01
0.172199 md0_raid10 2094 nvme2n1 W 16         4096 0.01
0.172201 md0_raid10 2094 nvme1n1 W 16         4096 0.01
0.172203 md0_raid10 2094 nvme0n1 W 16         4096 0.01
0.172213 md0_raid10 2094 nvme3n1 W 1060251672 4096 0.01
0.172215 md0_raid10 2094 nvme2n1 W 1060251672 4096 0.01

The last column always shows 0.01. Is that the reason I can't see the
performance improvement? What do you think if I use ssd or hdds?

Best Regards
Xiao
On 3/2/2023 2:05 AM, Xiao Ni wrote:
> On Thu, Mar 2, 2023 at 1:57 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>> [...]
>
> Hi Jonathan
>
> I did a test without modifying phys_size and opt_size. And I picked up a part
> of the result:
>
> 0.172142 md0_raid10 2094 nvme1n1 W 1225496264 4096 0.01
> 0.172145 md0_raid10 2094 nvme0n1 W 1225496264 4096 0.01
> 0.172161 md0_raid10 2094 nvme3n1 W 16         4096 0.01
> [...]
> 0.172213 md0_raid10 2094 nvme3n1 W 1060251672 4096 0.01
> 0.172215 md0_raid10 2094 nvme2n1 W 1060251672 4096 0.01
>
> The last column always shows 0.01. Is that the reason I can't see the
> performance improvement? What do you think if I use ssd or hdds?

Try reducing your mdadm's --bitmap-chunk first. In above logs, only LBA 16 is
being used, and that's the first bitmap page (and seemingly also the last). You
want to configure it such that you have more bitmap pages. Reducing the
--bitmap-chunk parameter should create more bitmap pages, and you may run into
the scenario predicted by the patch.

> Best Regards
> Xiao
>
On Fri, Mar 3, 2023 at 1:18 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> On 3/2/2023 2:05 AM, Xiao Ni wrote:
> > [...]
> >
> > The last column always shows 0.01. Is that the reason I can't see the
> > performance improvement? What do you think if I use ssd or hdds?
>
> Try reducing your mdadm's --bitmap-chunk first. In above logs, only LBA 16 is
> being used, and that's the first bitmap page (and seemingly also the last). You
> want to configure it such that you have more bitmap pages. Reducing the
> --bitmap-chunk parameter should create more bitmap pages, and you may run into
> the scenario predicted by the patch.
>

Finally, I can see the performance improvement. It's cool. Thanks for this.

The raid with a 64MB bitmap chunk:
write: IOPS=58.4k, BW=228MiB/s (239MB/s)(13.4GiB/60001msec); 0 zone resets (without patch)
write: IOPS=69.3k, BW=271MiB/s (284MB/s)(252MiB/931msec); 0 zone resets (with patch)

The raid with an 8MB bitmap chunk:
write: IOPS=11.6k, BW=45.4MiB/s (47.6MB/s)(2724MiB/60002msec); 0 zone resets (without patch)
write: IOPS=43.7k, BW=171MiB/s (179MB/s)(10.0GiB/60001msec); 0 zone resets (with patch)

Best Regards
Xiao
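The page arithmetic behind the --bitmap-chunk advice can be sketched in a few lines of standalone C. This is an illustrative sketch rather than code from the thread: it assumes an internal-bitmap layout of a 256-byte bitmap superblock followed by one bit per bitmap chunk, packed into 4 KiB pages, and it reuses the 57228-bit figure quoted in the commit message for the 64M bitmap chunk.

#include <stdio.h>

#define PAGE_SIZE        4096u
#define BITMAP_SB_BYTES   256u  /* assumed on-disk bitmap superblock size */

static unsigned int div_round_up(unsigned int x, unsigned int y)
{
        return (x + y - 1) / y;
}

static unsigned int round_up_to(unsigned int x, unsigned int y)
{
        return div_round_up(x, y) * y;
}

int main(void)
{
        /* 57228 bits is the bitmap size reported for the 64M --bitmap-chunk
         * example in the commit message; a chunk 8x smaller would give
         * roughly 8x as many bits for the same array. */
        unsigned int bits = 57228;
        unsigned int bytes = BITMAP_SB_BYTES + div_round_up(bits, 8);
        unsigned int pages = div_round_up(bytes, PAGE_SIZE);
        unsigned int last_page = bytes - (pages - 1) * PAGE_SIZE;

        printf("bitmap pages: %u, last page holds %u bytes\n", pages, last_page);
        printf("last-page write: %u bytes (512B rounding) vs %u bytes (4K optimal I/O)\n",
               round_up_to(last_page, 512), round_up_to(last_page, 4096));
        return 0;
}

With these inputs it reports 2 bitmap pages with a 3314-byte tail, which rounds up to the 3584-byte writes seen in the "without patch" biosnoop trace and to a single 4096-byte write once the optimal I/O size is used; changing --bitmap-chunk changes both the page count and the size of that short last page.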
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index bf250a5e3a86..920bb68156d2 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	return NULL;
 }
 
+static unsigned int optimal_io_size(struct block_device *bdev,
+				    unsigned int last_page_size,
+				    unsigned int io_size)
+{
+	if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
+		return roundup(last_page_size, bdev_io_opt(bdev));
+	return io_size;
+}
+
+static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
+				   sector_t start, sector_t boundary)
+{
+	if (io_size != opt_size &&
+	    start + opt_size / SECTOR_SIZE <= boundary)
+		return opt_size;
+	if (start + io_size / SECTOR_SIZE <= boundary)
+		return io_size;
+
+	/* Overflows boundary */
+	return 0;
+}
+
 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			   struct page *page)
 {
@@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	sector_t offset = mddev->bitmap_info.offset;
 	sector_t ps, sboff, doff;
 	unsigned int size = PAGE_SIZE;
+	unsigned int opt_size = PAGE_SIZE;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 	if (page->index == store->file_pages - 1) {
@@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 
 		if (last_page_size == 0)
 			last_page_size = PAGE_SIZE;
-		size = roundup(last_page_size,
-			       bdev_logical_block_size(bdev));
+		size = roundup(last_page_size, bdev_logical_block_size(bdev));
+		opt_size = optimal_io_size(bdev, last_page_size, size);
 	}
 
 	ps = page->index * PAGE_SIZE / SECTOR_SIZE;
@@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (offset < 0) {
 		/* DATA BITMAP METADATA */
-		if (offset + ps + size / SECTOR_SIZE > 0)
+		size = bitmap_io_size(size, opt_size, offset + ps, 0);
+		if (size == 0)
 			/* bitmap runs in to metadata */
 			return -EINVAL;
 
@@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (rdev->sb_start < rdev->data_offset) {
 		/* METADATA BITMAP DATA */
-		if (sboff + ps + size / SECTOR_SIZE > doff)
+		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
+		if (size == 0)
 			/* bitmap runs in to data */
 			return -EINVAL;
 	} else {
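To see how the two helpers in the diff combine, here is a minimal userspace sketch that mirrors their logic with plain integers in place of the kernel's block_device accessors. The 512-byte logical block and 4096-byte optimal I/O size match the discussion above, but the last-page size and the sector offsets are illustrative assumptions, not the tested array's actual layout.

#include <stdio.h>

#define SECTOR_SIZE 512u

static unsigned int round_up_to(unsigned int x, unsigned int y)
{
        return ((x + y - 1) / y) * y;
}

/* Mirrors optimal_io_size(): expand the last page to the optimal I/O size
 * when the device reports one larger than its logical block size. */
static unsigned int optimal_io_size(unsigned int io_opt, unsigned int lbs,
                                    unsigned int last_page_size,
                                    unsigned int io_size)
{
        if (io_opt > lbs)
                return round_up_to(last_page_size, io_opt);
        return io_size;
}

/* Mirrors bitmap_io_size(): prefer the expanded size, but only if the write,
 * starting at sector 'start', still ends at or before 'boundary'. */
static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
                                   long long start, long long boundary)
{
        if (io_size != opt_size && start + opt_size / SECTOR_SIZE <= boundary)
                return opt_size;
        if (start + io_size / SECTOR_SIZE <= boundary)
                return io_size;
        return 0;       /* would overflow the boundary */
}

int main(void)
{
        unsigned int lbs = 512, io_opt = 4096;
        unsigned int last_page_size = 3314;     /* illustrative short last page */

        unsigned int size = round_up_to(last_page_size, lbs);                  /* 3584 */
        unsigned int opt  = optimal_io_size(io_opt, lbs, last_page_size, size); /* 4096 */

        /* Illustrative METADATA BITMAP DATA layout: the last bitmap page
         * starts at sector 24 and the data area is assumed to start at
         * sector 2048. */
        printf("512B-rounded: %u, opt-rounded: %u, issued: %u\n",
               size, opt, bitmap_io_size(size, opt, 24, 2048));
        return 0;
}

With these values the 3584-byte tail write is expanded to a single 4096-byte write, which is the difference visible between the two biosnoop traces; with a tighter boundary the helper falls back to the smaller 512-byte-rounded size, and it returns 0 (turned into -EINVAL by the caller) only when even that would cross into the data or metadata.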