Message ID | 20200910144833.742260-7-hch@lst.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/12] fs: remove the unused SB_I_MULTIROOT flag | expand |
On Thu 10-09-20 16:48:26, Christoph Hellwig wrote: > Drivers shouldn't really mess with the readahead size, as that is a VM > concept. Instead set it based on the optimal I/O size by lifting the > algorithm from the md driver when registering the disk. Also set > bdi->io_pages there as well by applying the same scheme based on > max_sectors. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > block/blk-settings.c | 5 ++--- > block/blk-sysfs.c | 10 +++++++++- > block/genhd.c | 5 +++-- > drivers/block/aoe/aoeblk.c | 2 -- > drivers/block/drbd/drbd_nl.c | 12 +----------- > drivers/md/bcache/super.c | 4 ---- > drivers/md/dm-table.c | 3 --- > drivers/md/raid0.c | 16 ---------------- > drivers/md/raid10.c | 24 +----------------------- > drivers/md/raid5.c | 13 +------------ > 10 files changed, 17 insertions(+), 77 deletions(-) > > diff --git a/block/blk-settings.c b/block/blk-settings.c > index 76a7e03bcd6cac..01049e9b998f1d 100644 > --- a/block/blk-settings.c > +++ b/block/blk-settings.c > @@ -452,6 +452,8 @@ EXPORT_SYMBOL(blk_limits_io_opt); > void blk_queue_io_opt(struct request_queue *q, unsigned int opt) > { > blk_limits_io_opt(&q->limits, opt); > + q->backing_dev_info->ra_pages = > + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); > } > EXPORT_SYMBOL(blk_queue_io_opt); > > @@ -628,9 +630,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, > printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", > top, bottom); > } > - > - t->backing_dev_info->io_pages = > - t->limits.max_sectors >> (PAGE_SHIFT - 9); > } > EXPORT_SYMBOL(disk_stack_limits); > > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > index 81722cdcf0cb21..95eb35324e1a61 100644 > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -245,7 +245,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) > > spin_lock_irq(&q->queue_lock); > q->limits.max_sectors = max_sectors_kb << 1; > - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); > spin_unlock_irq(&q->queue_lock); So do I get it right that readahead won't now be limited if you store lower value to max_sectors? Why? I'd consider io_pages a "cached value" of max_sectors and thus expect it to change together with max_sectors... > @@ -854,6 +853,15 @@ int blk_register_queue(struct gendisk *disk) > percpu_ref_switch_to_percpu(&q->q_usage_counter); > } > > + /* > + * For read-ahead of large files to be effective, we need to read ahead > + * at least twice the optimal I/O size. > + */ > + q->backing_dev_info->ra_pages = > + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); > + q->backing_dev_info->io_pages = > + queue_max_sectors(q) >> (PAGE_SHIFT - 9); > + > ret = blk_trace_init_sysfs(dev); > if (ret) > return ret; > diff --git a/block/genhd.c b/block/genhd.c > index 081f1039d9367f..db311a14ddc71a 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -772,6 +772,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, > const struct attribute_group **groups, > bool register_queue) > { > + struct request_queue *q = disk->queue; > dev_t devt; > int retval; > > @@ -782,7 +783,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, > * registration. > */ > if (register_queue) > - elevator_init_mq(disk->queue); > + elevator_init_mq(q); > > /* minors == 0 indicates to use ext devt from part0 and should > * be accompanied with EXT_DEVT flag. Make sure all > @@ -812,7 +813,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, > disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; > disk->flags |= GENHD_FL_NO_PART_SCAN; > } else { > - struct backing_dev_info *bdi = disk->queue->backing_dev_info; > + struct backing_dev_info *bdi = q->backing_dev_info; > struct device *dev = disk_to_dev(disk); > int ret; Not sure how/why these changes got here... Not that I care too much :) > > diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c > index 5ca7216e9e01f3..89b33b402b4e52 100644 > --- a/drivers/block/aoe/aoeblk.c > +++ b/drivers/block/aoe/aoeblk.c > @@ -347,7 +347,6 @@ aoeblk_gdalloc(void *vp) > mempool_t *mp; > struct request_queue *q; > struct blk_mq_tag_set *set; > - enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; > ulong flags; > int late = 0; > int err; > @@ -407,7 +406,6 @@ aoeblk_gdalloc(void *vp) > WARN_ON(d->gd); > WARN_ON(d->flags & DEVFL_UP); > blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); > - q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE; > d->bufpool = mp; > d->blkq = gd->queue = q; > q->queuedata = d; Shouldn't AOE set 2MB optimal IO size so that readahead is equivalent to previous behavior? > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c > index 1bbdc410ee3c51..ff2101d56cd7f1 100644 > --- a/drivers/md/bcache/super.c > +++ b/drivers/md/bcache/super.c > @@ -1427,10 +1427,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) > if (ret) > return ret; > > - dc->disk.disk->queue->backing_dev_info->ra_pages = > - max(dc->disk.disk->queue->backing_dev_info->ra_pages, > - q->backing_dev_info->ra_pages); > - So bcache is basically stacking readahead here on top of underlying cache device. I don't see this being replicated by your patch so it is lost now? Probably this should be replaced by properly inheriting optimal IO size? Honza
On Thu, Sep 17, 2020 at 12:35:40PM +0200, Jan Kara wrote: > > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > > index 81722cdcf0cb21..95eb35324e1a61 100644 > > --- a/block/blk-sysfs.c > > +++ b/block/blk-sysfs.c > > @@ -245,7 +245,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) > > > > spin_lock_irq(&q->queue_lock); > > q->limits.max_sectors = max_sectors_kb << 1; > > - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); > > spin_unlock_irq(&q->queue_lock); > > So do I get it right that readahead won't now be limited if you store lower > value to max_sectors? Why? I'd consider io_pages a "cached value" of > max_sectors and thus expect it to change together with max_sectors... Most to start untangling the bdi from the queue. But I had to peddle back on that in the follow on series anyway, so I can add this back. > > @@ -812,7 +813,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, > > disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; > > disk->flags |= GENHD_FL_NO_PART_SCAN; > > } else { > > - struct backing_dev_info *bdi = disk->queue->backing_dev_info; > > + struct backing_dev_info *bdi = q->backing_dev_info; > > struct device *dev = disk_to_dev(disk); > > int ret; > > Not sure how/why these changes got here... Not that I care too much :) Because more changes in this area in earlier versions of the patches. But yes, this shouldn't be here, so I'll drop it. > > @@ -407,7 +406,6 @@ aoeblk_gdalloc(void *vp) > > WARN_ON(d->gd); > > WARN_ON(d->flags & DEVFL_UP); > > blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); > > - q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE; > > d->bufpool = mp; > > d->blkq = gd->queue = q; > > q->queuedata = d; > > Shouldn't AOE set 2MB optimal IO size so that readahead is equivalent to > previous behavior? Sure, I'll add a separate patch just for that. > > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c > > index 1bbdc410ee3c51..ff2101d56cd7f1 100644 > > --- a/drivers/md/bcache/super.c > > +++ b/drivers/md/bcache/super.c > > @@ -1427,10 +1427,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) > > if (ret) > > return ret; > > > > - dc->disk.disk->queue->backing_dev_info->ra_pages = > > - max(dc->disk.disk->queue->backing_dev_info->ra_pages, > > - q->backing_dev_info->ra_pages); > > - > > So bcache is basically stacking readahead here on top of underlying cache > device. I don't see this being replicated by your patch so it is lost now? > Probably this should be replaced by properly inheriting optimal IO size? Yes, I'll add another patch.
diff --git a/block/blk-settings.c b/block/blk-settings.c index 76a7e03bcd6cac..01049e9b998f1d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -452,6 +452,8 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); @@ -628,9 +630,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } - - t->backing_dev_info->io_pages = - t->limits.max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 81722cdcf0cb21..95eb35324e1a61 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -245,7 +245,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -854,6 +853,15 @@ int blk_register_queue(struct gendisk *disk) percpu_ref_switch_to_percpu(&q->q_usage_counter); } + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + q->backing_dev_info->io_pages = + queue_max_sectors(q) >> (PAGE_SHIFT - 9); + ret = blk_trace_init_sysfs(dev); if (ret) return ret; diff --git a/block/genhd.c b/block/genhd.c index 081f1039d9367f..db311a14ddc71a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -772,6 +772,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, bool register_queue) { + struct request_queue *q = disk->queue; dev_t devt; int retval; @@ -782,7 +783,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * registration. */ if (register_queue) - elevator_init_mq(disk->queue); + elevator_init_mq(q); /* minors == 0 indicates to use ext devt from part0 and should * be accompanied with EXT_DEVT flag. Make sure all @@ -812,7 +813,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; + struct backing_dev_info *bdi = q->backing_dev_info; struct device *dev = disk_to_dev(disk); int ret; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 5ca7216e9e01f3..89b33b402b4e52 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -347,7 +347,6 @@ aoeblk_gdalloc(void *vp) mempool_t *mp; struct request_queue *q; struct blk_mq_tag_set *set; - enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; ulong flags; int late = 0; int err; @@ -407,7 +406,6 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); - q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_SIZE; d->bufpool = mp; d->blkq = gd->queue = q; q->queuedata = d; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index aaff5bde391506..f8fb1c9b1bb6c1 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1360,18 +1360,8 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); decide_on_write_same_support(device, q, b, o, disable_write_same); - if (b) { + if (b) blk_stack_limits(&q->limits, &b->limits, 0); - - if (q->backing_dev_info->ra_pages != - b->backing_dev_info->ra_pages) { - drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", - q->backing_dev_info->ra_pages, - b->backing_dev_info->ra_pages); - q->backing_dev_info->ra_pages = - b->backing_dev_info->ra_pages; - } - } fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1bbdc410ee3c51..ff2101d56cd7f1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1427,10 +1427,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) if (ret) return ret; - dc->disk.disk->queue->backing_dev_info->ra_pages = - max(dc->disk.disk->queue->backing_dev_info->ra_pages, - q->backing_dev_info->ra_pages); - atomic_set(&dc->io_errors, 0); dc->io_disable = false; dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5edc3079e7c199..e1be7697214bd7 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1924,9 +1924,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, q->nr_zones = blkdev_nr_zones(t->md->disk); } #endif - - /* Allow reads to exceed readahead limits */ - q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f54a449f97aa79..aa2d7279176880 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -410,22 +410,6 @@ static int raid0_run(struct mddev *mddev) mdname(mddev), (unsigned long long)mddev->array_sectors); - if (mddev->queue) { - /* calculate the max read-ahead size. - * For read-ahead of large files to be effective, we need to - * readahead at least twice a whole stripe. i.e. number of devices - * multiplied by chunk size times 2. - * If an individual device has an ra_pages greater than the - * chunk size, then we will not drive that device as hard as it - * wants. We consider this a configuration error: a larger - * chunksize should be used in that case. - */ - int stripe = mddev->raid_disks * - (mddev->chunk_sectors << 9) / PAGE_SIZE; - if (mddev->queue->backing_dev_info->ra_pages < 2* stripe) - mddev->queue->backing_dev_info->ra_pages = 2* stripe; - } - dump_zones(mddev); ret = md_integrity_register(mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9956a04ac13bd6..5d1bdee313ec33 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3873,19 +3873,6 @@ static int raid10_run(struct mddev *mddev) mddev->resync_max_sectors = size; set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); - if (mddev->queue) { - int stripe = conf->geo.raid_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - - /* Calculate max read-ahead size. - * We need to readahead at least twice a whole stripe.... - * maybe... - */ - stripe /= conf->geo.near_copies; - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; - } - if (md_integrity_register(mddev)) goto out_free_conf; @@ -4723,17 +4710,8 @@ static void end_reshape(struct r10conf *conf) conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - if (conf->mddev->queue) { - int stripe = conf->geo.raid_disks * - ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); - stripe /= conf->geo.near_copies; - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; + if (conf->mddev->queue) raid10_set_io_opt(conf); - } conf->fullsync = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9a7d1250894ef1..7ace1f76b14736 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7522,8 +7522,6 @@ static int raid5_run(struct mddev *mddev) int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -8111,17 +8109,8 @@ static void end_reshape(struct r5conf *conf) spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - if (conf->mddev->queue) { - int data_disks = conf->raid_disks - conf->max_degraded; - int stripe = data_disks * ((conf->chunk_sectors << 9) - / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; + if (conf->mddev->queue) raid5_set_io_opt(conf); - } } }
Drivers shouldn't really mess with the readahead size, as that is a VM concept. Instead set it based on the optimal I/O size by lifting the algorithm from the md driver when registering the disk. Also set bdi->io_pages there as well by applying the same scheme based on max_sectors. Signed-off-by: Christoph Hellwig <hch@lst.de> --- block/blk-settings.c | 5 ++--- block/blk-sysfs.c | 10 +++++++++- block/genhd.c | 5 +++-- drivers/block/aoe/aoeblk.c | 2 -- drivers/block/drbd/drbd_nl.c | 12 +----------- drivers/md/bcache/super.c | 4 ---- drivers/md/dm-table.c | 3 --- drivers/md/raid0.c | 16 ---------------- drivers/md/raid10.c | 24 +----------------------- drivers/md/raid5.c | 13 +------------ 10 files changed, 17 insertions(+), 77 deletions(-)