Message ID | 1469589685-31630-1-git-send-email-bob.liu@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Hello, Thanks for the fixes. On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: > The current VBD layer reserves buffer space for each attached device based on > three statically configured settings which are read at boot time. > * max_indirect_segs: Maximum number of segments. > * max_ring_page_order: Maximum order of pages to be used for the shared ring. > * max_queues: Maximum number of queues (rings) to be used. > > But the storage backend, workload, and guest memory result in very different > tuning requirements. It's impossible to centrally predict application > characteristics, so it's best to allow the settings to be dynamically > adjusted based on the workload inside the guest. > > Usage: > Show current values: > cat /sys/devices/vbd-xxx/max_indirect_segs > cat /sys/devices/vbd-xxx/max_ring_page_order > cat /sys/devices/vbd-xxx/max_queues > > Write new values: > echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs > echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order > echo <new value> > /sys/devices/vbd-xxx/max_queues > > Signed-off-by: Bob Liu <bob.liu@oracle.com> > -- > v3: > * Remove new_max_indirect_segments. > * Fix BUG_ON(). > v2: > * Rename to max_ring_page_order. > * Remove the waiting code suggested by Roger. > --- > drivers/block/xen-blkfront.c | 277 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 269 insertions(+), 8 deletions(-) > > diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c > index 1b4c380..57baa54 100644 > --- a/drivers/block/xen-blkfront.c > +++ b/drivers/block/xen-blkfront.c > @@ -212,6 +212,10 @@ struct blkfront_info > /* Save uncomplete reqs and bios for migration. */ > struct list_head requests; > struct bio_list bio_list; > + /* For dynamic configuration. */ > + unsigned int reconfiguring:1; > + unsigned int max_ring_page_order; > + unsigned int max_queues; max_{ring_page_order/queues} should be after max_indirect_segments, and reconfiguring should be of type bool (moreover, below you assign 'true' or 'false' to it). > }; > > static unsigned int nr_minors; > @@ -1350,6 +1354,31 @@ static void blkif_free(struct blkfront_info *info, int suspend) > for (i = 0; i < info->nr_rings; i++) > blkif_free_ring(&info->rinfo[i]); > > + /* Remove old xenstore nodes. */
> + if (info->nr_ring_pages > 1) > + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order"); > + > + if (info->nr_rings == 1) { > + if (info->nr_ring_pages == 1) { > + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref"); > + } else { > + for (i = 0; i < info->nr_ring_pages; i++) { > + char ring_ref_name[RINGREF_NAME_LEN]; > + > + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); > + xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name); > + } > + } > + } else { > + xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues"); > + > + for (i = 0; i < info->nr_rings; i++) { > + char queuename[QUEUE_NAME_LEN]; > + > + snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i); > + xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename); > + } > + } kfree(info->rinfo); > info->rinfo = NULL; > info->nr_rings = 0; > @@ -1763,15 +1792,20 @@ static int talk_to_blkback(struct xenbus_device *dev, > const char *message = NULL; > struct xenbus_transaction xbt; > int err; > - unsigned int i, max_page_order = 0; > + unsigned int i, backend_max_order = 0; > unsigned int ring_page_order = 0; > > err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, > - "max-ring-page-order", "%u", &max_page_order); > + "max-ring-page-order", "%u", &backend_max_order); > if (err != 1) > info->nr_ring_pages = 1; > else { > - ring_page_order = min(xen_blkif_max_ring_order, max_page_order); > + if (info->max_ring_page_order) > + /* Dynamic configured through /sys. */ > + ring_page_order = min(info->max_ring_page_order, backend_max_order); > + else > + /* Default. */ > + ring_page_order = min(xen_blkif_max_ring_order, backend_max_order); > info->nr_ring_pages = 1 << ring_page_order; In order to avoid this 'if' here (and below), what do you think of assigning the global values to the blkfront_info structure in blkfront_probe? Then you would only have to do min(info->max_ring_page_order, backend_max_order) in order to get the min value. > } > > @@ -1894,7 +1928,13 @@ static int negotiate_mq(struct blkfront_info *info) > if (err < 0) > backend_max_queues = 1; > > - info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); > + if (info->max_queues) > + /* Dynamic configured through /sys */ > + info->nr_rings = min(backend_max_queues, info->max_queues); > + else > + /* Default. */ > + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); > + > /* We need at least one ring. */
> if (!info->nr_rings) > info->nr_rings = 1; > @@ -2352,11 +2392,198 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) > NULL); > if (err) > info->max_indirect_segments = 0; > - else > - info->max_indirect_segments = min(indirect_segments, > - xen_blkif_max_segments); > + else { > + if (info->max_indirect_segments) > + /* Dynamic configured through /sys */ > + info->max_indirect_segments = min(indirect_segments, > + info->max_indirect_segments); > + else > + info->max_indirect_segments = min(indirect_segments, > + xen_blkif_max_segments); > + } > +} > + > +static ssize_t max_ring_page_order_show(struct device *dev, > + struct device_attribute *attr, char *page) > +{ > + struct blkfront_info *info = dev_get_drvdata(dev); > + > + return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); > +} > + > +static ssize_t max_indirect_segs_show(struct device *dev, > + struct device_attribute *attr, char *page) > +{ > + struct blkfront_info *info = dev_get_drvdata(dev); > + > + return sprintf(page, "%u\n", info->max_indirect_segments); > +} > + > +static ssize_t max_queues_show(struct device *dev, > + struct device_attribute *attr, char *page) > +{ > + struct blkfront_info *info = dev_get_drvdata(dev); > + > + return sprintf(page, "%u\n", info->nr_rings); > +} > + > +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) > +{ > + /* > + * Prevent new requests even to software request queue. > + */ > + blk_mq_freeze_queue(info->rq); > + > + /* > + * Guarantee no uncompleted reqs. > + */ > + if (part_in_flight(&info->gd->part0) || info->reconfiguring) { > + blk_mq_unfreeze_queue(info->rq); > + pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev)); > + return -EBUSY; > + } > + > + /* > + * Frontend: Backend: > + * Freeze_queue() > + * Switch to XenbusStateClosed > + * frontend_changed(StateClosed) > + * > xen_blkif_disconnect() > + * > Switch to XenbusStateClosed > + * blkback_changed(StateClosed) > + * > blkfront_resume() > + * > Switch to StateInitialised > + * frontend_changed(StateInitialised): > + * > reconnect > + * > Switch to StateConnected > + * blkback_changed(StateConnected) > + * > blkif_recover() > + * > Also switch to StateConnected > + * > Unfreeze_queue() > + */ > + info->reconfiguring = true; This is racy AFAICT. You should use an atomic test_and_set in the condition above or lock the blkfront_info mutex before and while setting reconfiguring. Without this, two (or more) concurrent calls to dynamic_reconfig_device might succeed.
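[For illustration, the atomic variant Roger suggests might look like the sketch below. This is not the posted patch: reconfig_flags and RECONFIG_IN_PROGRESS are hypothetical names, assuming the reconfiguring bitfield is replaced by an unsigned long member.]

/* Sketch only: test_and_set_bit() atomically returns the old bit value,
 * so only one concurrent writer can observe it clear and proceed. The
 * bit would be cleared again where the patch currently resets
 * 'reconfiguring' after reconnecting. */
#define RECONFIG_IN_PROGRESS	0	/* bit in hypothetical info->reconfig_flags */

static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
{
	if (test_and_set_bit(RECONFIG_IN_PROGRESS, &info->reconfig_flags))
		return -EBUSY;

	/* Prevent new requests even to the software request queue. */
	blk_mq_freeze_queue(info->rq);

	if (part_in_flight(&info->gd->part0)) {
		blk_mq_unfreeze_queue(info->rq);
		clear_bit(RECONFIG_IN_PROGRESS, &info->reconfig_flags);
		pr_err("Dev:%s busy, please retry later.\n",
		       dev_name(&info->xbdev->dev));
		return -EBUSY;
	}

	xenbus_switch_state(info->xbdev, XenbusStateClosed);
	return count;
}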
> + xenbus_switch_state(info->xbdev, XenbusStateClosed); > + > + return count; > +} > + > +static ssize_t max_indirect_segs_store(struct device *dev, > + struct device_attribute *attr, const char *buf, size_t count) > +{ > + ssize_t ret; > + unsigned int max_segs = 0, backend_max_segs = 0; > + struct blkfront_info *info = dev_get_drvdata(dev); > + int err; > + > + ret = kstrtouint(buf, 10, &max_segs); > + if (ret < 0) > + return ret; > + > + if (max_segs == info->max_indirect_segments) > + return count; > + > + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, > + "feature-max-indirect-segments", "%u", &backend_max_segs, > + NULL); > + if (err) { > + pr_err("Backend %s doesn't support feature-indirect-segments.\n", > + info->xbdev->otherend); > + return -EOPNOTSUPP; > + } > + > + if (max_segs > backend_max_segs) { > + pr_err("Invalid max indirect segment (%u), backend-max: %u.\n", > + max_segs, backend_max_segs); > + return -EINVAL; > + } > + > + info->max_indirect_segments = max_segs; > + > + return dynamic_reconfig_device(info, count); > +} > + > +static ssize_t max_ring_page_order_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + ssize_t ret; > + unsigned int max_order = 0, backend_max_order = 0; > + struct blkfront_info *info = dev_get_drvdata(dev); > + int err; > + > + ret = kstrtouint(buf, 10, &max_order); > + if (ret < 0) > + return ret; > + > + if ((1 << max_order) == info->nr_ring_pages) > + return count; > + > + if (max_order > XENBUS_MAX_RING_GRANT_ORDER) { > + pr_err("Invalid max_ring_page_order (%u), max: %u.\n", > + max_order, XENBUS_MAX_RING_GRANT_ORDER); > + return -EINVAL; > + } > + > + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, > + "max-ring-page-order", "%u", &backend_max_order); > + if (err != 1) { > + pr_err("Backend %s doesn't support feature multi-page-ring.\n", > + info->xbdev->otherend); > + return -EOPNOTSUPP; > + } > + if (max_order > backend_max_order) { > + pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n", > + max_order, backend_max_order); > + return -EINVAL; > + } > + info->max_ring_page_order = max_order; > + > + return dynamic_reconfig_device(info, count); > +} > + > +static ssize_t max_queues_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + ssize_t ret; > + unsigned int max_queues = 0, backend_max_queues = 0; > + struct blkfront_info *info = dev_get_drvdata(dev); > + int err; > + > + ret = kstrtouint(buf, 10, &max_queues); > + if (ret < 0) > + return ret; > + > + if (max_queues == info->nr_rings) > + return count; > + > + if (max_queues > num_online_cpus()) { > + pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n", > + max_queues, num_online_cpus()); > + return -EINVAL; > + } > + > + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, > + "multi-queue-max-queues", "%u", &backend_max_queues); > + if (err != 1) { > + pr_err("Backend %s doesn't support block multi queue.\n", > + info->xbdev->otherend); > + return -EOPNOTSUPP; > + } > + if (max_queues > backend_max_queues) { > + pr_err("Invalid max_queues (%u), backend supports max: %u.\n", > + max_queues, backend_max_queues); > + return -EINVAL; > + } > + info->max_queues = max_queues; > + > + return dynamic_reconfig_device(info, count); > } > > +static DEVICE_ATTR_RW(max_queues); > +static DEVICE_ATTR_RW(max_ring_page_order); > +static DEVICE_ATTR_RW(max_indirect_segs); > + > /* > * Invoked when the backend is finally 'ready' (and has told produced > 
* the details about the physical device - #sectors, size, etc). > @@ -2393,6 +2620,10 @@ static void blkfront_connect(struct blkfront_info *info) > * supports indirect descriptors, and how many. > */ > blkif_recover(info); > + if (info->reconfiguring) { > + blk_mq_unfreeze_queue(info->rq); > + info->reconfiguring = false; > + } > return; > > default: > @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info) > return; > } > > + err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > + if (err) > + goto fail; > + > + err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); > + if (err) { > + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > + goto fail; > + } > + > + err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues); > + if (err) { > + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > + device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); > + goto fail; > + } > xenbus_switch_state(info->xbdev, XenbusStateConnected); > > /* Kick pending requests. */ > @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info) > add_disk(info->gd); > > info->is_ready = 1; > + return; > + > +fail: > + blkif_free(info, 0); > + xlvbd_release_gendisk(info); > + return; Hm, I'm not sure whether this chunk should be in a separate patch, it seems like blkfront_connect doesn't properly cleanup on error (if xlvbd_alloc_gendisk fails blkif_free will not be called). Do you think you could send the addition of the 'fail' label as a separate patch and fix the error path of xlvbd_alloc_gendisk? > } > > /** > @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev, > break; > > case XenbusStateClosed: > - if (dev->state == XenbusStateClosed) > + if (dev->state == XenbusStateClosed) { > + if (info->reconfiguring) > + if (blkfront_resume(info->xbdev)) { Could you please join those two conditions: if (info->reconfiguring && blkfront_resume(info->xbdev)) { ... Also, I'm not sure this is correct, if blkfront sees the "Closing" state on blkback it will try to close the frontend and destroy the block device (see blkfront_closing), and this should be avoided. You should call blkfront_resume as soon as you see the backend move to the Closed or Closing states, without calling blkfront_closing. > + /* Resume failed. */ > + info->reconfiguring = false; > + xenbus_switch_state(info->xbdev, XenbusStateClosed); > + pr_err("Resume from dynamic configuration failed\n"); > + } > break; > + } > /* Missed the backend's Closing state -- fallthrough */ > case XenbusStateClosing: > if (info) > -- > 1.7.10.4
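[As a concrete reading of Roger's blkfront_probe suggestion above, the per-device limits could be seeded once at probe time. A minimal sketch, assuming a hypothetical helper name:]

/* Called from blkfront_probe() after the blkfront_info is allocated, so
 * every device starts from the module-wide defaults and the negotiation
 * paths can unconditionally take the min(). */
static void blkfront_set_default_limits(struct blkfront_info *info)
{
	info->max_ring_page_order = xen_blkif_max_ring_order;
	info->max_queues = xen_blkif_max_queues;
	info->max_indirect_segments = xen_blkif_max_segments;
}

[talk_to_blkback() and negotiate_mq() would then drop their if/else branches and reduce to, e.g., ring_page_order = min(info->max_ring_page_order, backend_max_order); and info->nr_rings = min(backend_max_queues, info->max_queues);]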
On 07/27/2016 04:07 PM, Roger Pau Monné wrote: ..[snip].. >> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info) >> return; >> } >> >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); >> + if (err) >> + goto fail; >> + >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); >> + if (err) { >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); >> + goto fail; >> + } >> + >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues); >> + if (err) { >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); >> + goto fail; >> + } >> xenbus_switch_state(info->xbdev, XenbusStateConnected); >> >> /* Kick pending requests. */ >> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info) >> add_disk(info->gd); >> >> info->is_ready = 1; >> + return; >> + >> +fail: >> + blkif_free(info, 0); >> + xlvbd_release_gendisk(info); >> + return; > > Hm, I'm not sure whether this chunk should be in a separate patch, it seems > like blkfront_connect doesn't properly cleanup on error (if > xlvbd_alloc_gendisk fails blkif_free will not be called). Do you think you > could send the addition of the 'fail' label as a separate patch and fix the > error path of xlvbd_alloc_gendisk? > Sure, will fix all of your comments above. >> } >> >> /** >> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev, >> break; >> >> case XenbusStateClosed: >> - if (dev->state == XenbusStateClosed) >> + if (dev->state == XenbusStateClosed) { >> + if (info->reconfiguring) >> + if (blkfront_resume(info->xbdev)) { > > Could you please join those two conditions: > > if (info->reconfiguring && blkfront_resume(info->xbdev)) { ... > > Also, I'm not sure this is correct, if blkfront sees the "Closing" state on > blkback it will try to close the frontend and destroy the block device (see > blkfront_closing), and this should be avoided. You should call > blkfront_resume as soon as you see the backend move to the Closed or Closing > states, without calling blkfront_closing. > I didn't get how this can happen, backend state won't be changed to 'Closing' before blkfront_closing() is called. So I think current logic is fine. Btw: could you please ack [PATCH v2 2/3] xen-blkfront: introduce blkif_set_queue_limits()? Thank you! Bob
> -----Original Message----- [snip] > > > > Also, I'm not sure this is correct, if blkfront sees the "Closing" state on > > blkback it will try to close the frontend and destroy the block device (see > > blkfront_closing), and this should be avoided. You should call > > blkfront_resume as soon as you see the backend move to the Closed or > Closing > > states, without calling blkfront_closing. > > > > I didn't get how this can happen, backend state won't be changed to 'Closing' > before blkfront_closing() is called. > So I think current logic is fine. > Backends can go to closing before frontends, when the PV device is being detached. Paul
On Wed, Jul 27, 2016 at 05:12:22PM +0800, Bob Liu wrote: > > On 07/27/2016 04:07 PM, Roger Pau Monné wrote: > ..[snip].. > >> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info) > >> return; > >> } > >> > >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > >> + if (err) > >> + goto fail; > >> + > >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); > >> + if (err) { > >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > >> + goto fail; > >> + } > >> + > >> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues); > >> + if (err) { > >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); > >> + device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); > >> + goto fail; > >> + } > >> xenbus_switch_state(info->xbdev, XenbusStateConnected); > >> > >> /* Kick pending requests. */ > >> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info) > >> add_disk(info->gd); > >> > >> info->is_ready = 1; > >> + return; > >> + > >> +fail: > >> + blkif_free(info, 0); > >> + xlvbd_release_gendisk(info); > >> + return; > > > > Hm, I'm not sure whether this chunk should be in a separate patch, it seems > > like blkfront_connect doesn't properly cleanup on error (if > > xlvbd_alloc_gendisk fails blkif_free will not be called). Do you think you > > could send the addition of the 'fail' label as a separate patch and fix the > > error path of xlvbd_alloc_gendisk? > > > > Sure, will fix all of your comments above. > > >> } > >> > >> /** > >> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev, > >> break; > >> > >> case XenbusStateClosed: > >> - if (dev->state == XenbusStateClosed) > >> + if (dev->state == XenbusStateClosed) { > >> + if (info->reconfiguring) > >> + if (blkfront_resume(info->xbdev)) { > > > > Could you please join those two conditions: > > > > if (info->reconfiguring && blkfront_resume(info->xbdev)) { ... > > > > Also, I'm not sure this is correct, if blkfront sees the "Closing" state on > > blkback it will try to close the frontend and destroy the block device (see > > blkfront_closing), and this should be avoided. You should call > > blkfront_resume as soon as you see the backend move to the Closed or Closing > > states, without calling blkfront_closing. > > > > I didn't get how this can happen, backend state won't be changed to 'Closing' before blkfront_closing() is called. > So I think current logic is fine. I see, in dynamic_reconfig_device you change the frontend state to "Closed". Then if the backend switches to state "Closing" blkfront will call blkfront_closing, but that won't do anything because the frontend state is already at the "Closed" state, at which point you just wait for the backend to finally reach state "Closed" in order to perform the reconnection. I stand corrected, I think the current logic is indeed fine. Roger.
On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: [...] > +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) > +{ > + /* > + * Prevent new requests even to software request queue. > + */ > + blk_mq_freeze_queue(info->rq); > + > + /* > + * Guarantee no uncompleted reqs. > + */ I'm also wondering, why do you need to guarantee that there are no uncompleted requests? The resume procedure is going to call blkif_recover that will take care of requeuing any unfinished requests that are on the ring. > + if (part_in_flight(&info->gd->part0) || info->reconfiguring) { > + blk_mq_unfreeze_queue(info->rq); > + pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev)); > + return -EBUSY; > + } Roger.
On 07/27/2016 06:59 PM, Roger Pau Monné wrote: > On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: > [...] >> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) >> +{ >> + /* >> + * Prevent new requests even to software request queue. >> + */ >> + blk_mq_freeze_queue(info->rq); >> + >> + /* >> + * Guarantee no uncompleted reqs. >> + */ > > I'm also wondering, why do you need to guarantee that there are no > uncompleted requests? The resume procedure is going to call blkif_recover > that will take care of requeuing any unfinished requests that are on the > ring. > Because there may be requests in the software request queue with more segments than we can handle (if info->max_indirect_segments is reduced). blkif_recover() can't handle this since blk-mq was introduced, because there is no way to iterate the sw-request queues (blk_fetch_request() can't be used with blk-mq). So there is a bug in blkif_recover(); I was thinking of implementing the suspend function of blkfront_driver like: +static int blkfront_suspend(struct xenbus_device *dev) +{ + blk_mq_freeze_queue(info->rq); + .. +} static struct xenbus_driver blkfront_driver = { .ids = blkfront_ids, .probe = blkfront_probe, .remove = blkfront_remove, + .suspend = blkfront_suspend, .resume = blkfront_resume,
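[Fleshed out slightly, the suspend hook sketched above might look like this; a sketch under the same idea, with the matching unfreeze on resume/recover omitted:]

static int blkfront_suspend(struct xenbus_device *dev)
{
	struct blkfront_info *info = dev_get_drvdata(&dev->dev);

	/* Prevent new requests from entering even the software queues
	 * while the connection is torn down, so no request built against
	 * the old limits is left queued across the migration. */
	blk_mq_freeze_queue(info->rq);
	return 0;
}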
On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote: > > On 07/27/2016 06:59 PM, Roger Pau Monné wrote: > > On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: > > [...] > >> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) > >> +{ > >> + /* > >> + * Prevent new requests even to software request queue. > >> + */ > >> + blk_mq_freeze_queue(info->rq); > >> + > >> + /* > >> + * Guarantee no uncompleted reqs. > >> + */ > > > > I'm also wondering, why do you need to guarantee that there are no > > uncompleted requests? The resume procedure is going to call blkif_recover > > that will take care of requeuing any unfinished requests that are on the > > ring. > > > > Because there may have requests in the software request queue with more segments than > we can handle(if info->max_indirect_segments is reduced). > > The blkif_recover() can't handle this since blk-mq was introduced, > because there is no way to iterate the sw-request queues(blk_fetch_request() can't be used by blk-mq). > > So there is a bug in blkif_recover(), I was thinking implement the suspend function of blkfront_driver like: Hm, this is a regression and should be fixed ASAP. I'm still not sure I follow, don't blk_queue_max_segments change the number of segments the requests on the queue are going to have? So that you will only have to re-queue the requests already on the ring? Waiting for the whole queue to be flushed before suspending is IMHO not acceptable, it introduces an unbounded delay during migration if the backend is slow for some reason. Roger.
On 07/27/2016 10:24 PM, Roger Pau Monné wrote: > On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote: >> >> On 07/27/2016 06:59 PM, Roger Pau Monné wrote: >>> On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: >>> [...] >>>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) >>>> +{ >>>> + /* >>>> + * Prevent new requests even to software request queue. >>>> + */ >>>> + blk_mq_freeze_queue(info->rq); >>>> + >>>> + /* >>>> + * Guarantee no uncompleted reqs. >>>> + */ >>> >>> I'm also wondering, why do you need to guarantee that there are no >>> uncompleted requests? The resume procedure is going to call blkif_recover >>> that will take care of requeuing any unfinished requests that are on the >>> ring. >>> >> >> Because there may have requests in the software request queue with more segments than >> we can handle(if info->max_indirect_segments is reduced). >> >> The blkif_recover() can't handle this since blk-mq was introduced, >> because there is no way to iterate the sw-request queues(blk_fetch_request() can't be used by blk-mq). >> >> So there is a bug in blkif_recover(), I was thinking implement the suspend function of blkfront_driver like: > > Hm, this is a regression and should be fixed ASAP. I'm still not sure I > follow, don't blk_queue_max_segments change the number of segments the > requests on the queue are going to have? So that you will only have to > re-queue the requests already on the ring? > That's not enough; request queues have been split into software queues and hardware queues since blk-mq was introduced. We need to consider two more things: * Stop new requests from being added to the software queues before blk_queue_max_segments() is called (they would still be using the old 'max-indirect-segments'). I didn't see any other way except calling blk_mq_freeze_queue(). * Requests already in the software queues with the old 'max-indirect-segments' also have to be re-queued based on the new 'max-indirect-segments'. No blk-mq API can do this. > Waiting for the whole queue to be flushed before suspending is IMHO not acceptable, it introduces an unbounded delay during migration if the backend is slow for some reason. > Right, I also hope there is a better solution.
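[In code form, the two constraints bracket the limit update roughly as below. This is an illustrative sequence only, and step 3 is exactly the part blk-mq offered no API for:]

/* 1. Stop new requests from entering even the software queues. */
blk_mq_freeze_queue(info->rq);

/* 2. Apply the renegotiated limit so future requests are built
 *    against the new 'max-indirect-segments'. */
blk_queue_max_segments(info->rq, info->max_indirect_segments);

/* 3. Requests already staged under the old, larger limit would have
 *    to be re-queued/re-split here, but blk-mq offers no
 *    blk_fetch_request()-style iteration over its software queues. */

blk_mq_unfreeze_queue(info->rq);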
On Thu, Jul 28, 2016 at 07:05:05AM +0800, Bob Liu wrote: > > On 07/27/2016 10:24 PM, Roger Pau Monné wrote: > > On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote: > >> > >> On 07/27/2016 06:59 PM, Roger Pau Monné wrote: > >>> On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote: > >>> [...] > >>>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) > >>>> +{ > >>>> + /* > >>>> + * Prevent new requests even to software request queue. > >>>> + */ > >>>> + blk_mq_freeze_queue(info->rq); > >>>> + > >>>> + /* > >>>> + * Guarantee no uncompleted reqs. > >>>> + */ > >>> > >>> I'm also wondering, why do you need to guarantee that there are no > >>> uncompleted requests? The resume procedure is going to call blkif_recover > >>> that will take care of requeuing any unfinished requests that are on the > >>> ring. > >>> > >> > >> Because there may have requests in the software request queue with more segments than > >> we can handle(if info->max_indirect_segments is reduced). > >> > >> The blkif_recover() can't handle this since blk-mq was introduced, > >> because there is no way to iterate the sw-request queues(blk_fetch_request() can't be used by blk-mq). > >> > >> So there is a bug in blkif_recover(), I was thinking implement the suspend function of blkfront_driver like: > > > > Hm, this is a regression and should be fixed ASAP. I'm still not sure I > > follow, don't blk_queue_max_segments change the number of segments the > > requests on the queue are going to have? So that you will only have to > > re-queue the requests already on the ring? > > > > That's not enough, request queues were split to software queues and hardware queues since blk-mq was introduced. > We need to consider two more things: > * Stop new requests be added to software queues before blk_queue_max_segments() is called(still using old 'max-indirect-segments'). > I didn't see other way except call blk_mq_freeze_queue(). Right, stopping the queues doesn't seem to be an issue? > * Requests already in software queues but with old 'max-indirect-segments' also have to be re-queued based on new 'max-indirect-segments'. > Neither blk-mq API can do this. I'm afraid I don't know much about the new blk-mq API, you will have to ask the blk-mq maintainers for solutions, since this was possible with the previous API (and I would consider a regression that the new blk-mq API doesn't allow it). Roger.
On 27/07/16 04:21, Bob Liu wrote: > The current VBD layer reserves buffer space for each attached device based on > three statically configured settings which are read at boot time. > * max_indirect_segs: Maximum amount of segments. > * max_ring_page_order: Maximum order of pages to be used for the shared ring. > * max_queues: Maximum of queues(rings) to be used. I think you want a single knob for "maximum number of pages in flight". This is much easier for an admin to relate to resource usage. You could apply this limit without having to reconfigure, which would simplify this patch quite a bit. David
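[For a rough sense of what such a knob would cap, the page footprint implied by the three current settings can be estimated as below. The helper and its accounting are an illustration, not code from the driver:]

/* Hypothetical estimate of the pages a device may have granted at
 * once; a single "max pages in flight" knob would bound this value
 * directly instead of via three separate settings. */
static unsigned int blkfront_pages_in_flight(struct blkfront_info *info)
{
	unsigned int ring_pages = info->nr_rings * info->nr_ring_pages;
	/* Per request: up to max_indirect_segments data pages plus the
	 * indirect grant pages that describe them. */
	unsigned int per_req = info->max_indirect_segments +
			       INDIRECT_GREFS(info->max_indirect_segments);

	return ring_pages + info->nr_rings * BLK_RING_SIZE(info) * per_req;
}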
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 1b4c380..57baa54 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -212,6 +212,10 @@ struct blkfront_info /* Save uncomplete reqs and bios for migration. */ struct list_head requests; struct bio_list bio_list; + /* For dynamic configuration. */ + unsigned int reconfiguring:1; + unsigned int max_ring_page_order; + unsigned int max_queues; }; static unsigned int nr_minors; @@ -1350,6 +1354,31 @@ static void blkif_free(struct blkfront_info *info, int suspend) for (i = 0; i < info->nr_rings; i++) blkif_free_ring(&info->rinfo[i]); + /* Remove old xenstore nodes. */ + if (info->nr_ring_pages > 1) + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order"); + + if (info->nr_rings == 1) { + if (info->nr_ring_pages == 1) { + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref"); + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name); + } + } + } else { + xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues"); + + for (i = 0; i < info->nr_rings; i++) { + char queuename[QUEUE_NAME_LEN]; + + snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i); + xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename); + } + } kfree(info->rinfo); info->rinfo = NULL; info->nr_rings = 0; @@ -1763,15 +1792,20 @@ static int talk_to_blkback(struct xenbus_device *dev, const char *message = NULL; struct xenbus_transaction xbt; int err; - unsigned int i, max_page_order = 0; + unsigned int i, backend_max_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, - "max-ring-page-order", "%u", &max_page_order); + "max-ring-page-order", "%u", &backend_max_order); if (err != 1) info->nr_ring_pages = 1; else { - ring_page_order = min(xen_blkif_max_ring_order, max_page_order); + if (info->max_ring_page_order) + /* Dynamic configured through /sys. */ + ring_page_order = min(info->max_ring_page_order, backend_max_order); + else + /* Default. */ + ring_page_order = min(xen_blkif_max_ring_order, backend_max_order); info->nr_ring_pages = 1 << ring_page_order; } @@ -1894,7 +1928,13 @@ static int negotiate_mq(struct blkfront_info *info) if (err < 0) backend_max_queues = 1; - info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + if (info->max_queues) + /* Dynamic configured through /sys */ + info->nr_rings = min(backend_max_queues, info->max_queues); + else + /* Default. */ + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */
if (!info->nr_rings) info->nr_rings = 1; @@ -2352,11 +2392,198 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) NULL); if (err) info->max_indirect_segments = 0; - else - info->max_indirect_segments = min(indirect_segments, - xen_blkif_max_segments); + else { + if (info->max_indirect_segments) + /* Dynamic configured through /sys */ + info->max_indirect_segments = min(indirect_segments, + info->max_indirect_segments); + else + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + } +} + +static ssize_t max_ring_page_order_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct blkfront_info *info = dev_get_drvdata(dev); + + return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE)); +} + +static ssize_t max_indirect_segs_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct blkfront_info *info = dev_get_drvdata(dev); + + return sprintf(page, "%u\n", info->max_indirect_segments); +} + +static ssize_t max_queues_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct blkfront_info *info = dev_get_drvdata(dev); + + return sprintf(page, "%u\n", info->nr_rings); +} + +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count) +{ + /* + * Prevent new requests even to software request queue. + */ + blk_mq_freeze_queue(info->rq); + + /* + * Guarantee no uncompleted reqs. + */ + if (part_in_flight(&info->gd->part0) || info->reconfiguring) { + blk_mq_unfreeze_queue(info->rq); + pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev)); + return -EBUSY; + } + + /* + * Frontend: Backend: + * Freeze_queue() + * Switch to XenbusStateClosed + * frontend_changed(StateClosed) + * > xen_blkif_disconnect() + * > Switch to XenbusStateClosed + * blkback_changed(StateClosed) + * > blkfront_resume() + * > Switch to StateInitialised + * frontend_changed(StateInitialised): + * > reconnect + * > Switch to StateConnected + * blkback_changed(StateConnected) + * > blkif_recover() + * > Also switch to StateConnected + * > Unfreeze_queue() + */ + info->reconfiguring = true; + xenbus_switch_state(info->xbdev, XenbusStateClosed); + + return count; +} + +static ssize_t max_indirect_segs_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + ssize_t ret; + unsigned int max_segs = 0, backend_max_segs = 0; + struct blkfront_info *info = dev_get_drvdata(dev); + int err; + + ret = kstrtouint(buf, 10, &max_segs); + if (ret < 0) + return ret; + + if (max_segs == info->max_indirect_segments) + return count; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &backend_max_segs, + NULL); + if (err) { + pr_err("Backend %s doesn't support feature-indirect-segments.\n", + info->xbdev->otherend); + return -EOPNOTSUPP; + } + + if (max_segs > backend_max_segs) { + pr_err("Invalid max indirect segment (%u), backend-max: %u.\n", + max_segs, backend_max_segs); + return -EINVAL; + } + + info->max_indirect_segments = max_segs; + + return dynamic_reconfig_device(info, count); +} + +static ssize_t max_ring_page_order_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + unsigned int max_order = 0, backend_max_order = 0; + struct blkfront_info *info = dev_get_drvdata(dev); + int err; + + ret = kstrtouint(buf, 10, &max_order); + if (ret < 0) + return ret; + + if ((1 << max_order) == info->nr_ring_pages) + return count;
+ + if (max_order > XENBUS_MAX_RING_GRANT_ORDER) { + pr_err("Invalid max_ring_page_order (%u), max: %u.\n", + max_order, XENBUS_MAX_RING_GRANT_ORDER); + return -EINVAL; + } + + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "max-ring-page-order", "%u", &backend_max_order); + if (err != 1) { + pr_err("Backend %s doesn't support feature multi-page-ring.\n", + info->xbdev->otherend); + return -EOPNOTSUPP; + } + if (max_order > backend_max_order) { + pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n", + max_order, backend_max_order); + return -EINVAL; + } + info->max_ring_page_order = max_order; + + return dynamic_reconfig_device(info, count); +} + +static ssize_t max_queues_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + unsigned int max_queues = 0, backend_max_queues = 0; + struct blkfront_info *info = dev_get_drvdata(dev); + int err; + + ret = kstrtouint(buf, 10, &max_queues); + if (ret < 0) + return ret; + + if (max_queues == info->nr_rings) + return count; + + if (max_queues > num_online_cpus()) { + pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n", + max_queues, num_online_cpus()); + return -EINVAL; + } + + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err != 1) { + pr_err("Backend %s doesn't support block multi queue.\n", + info->xbdev->otherend); + return -EOPNOTSUPP; + } + if (max_queues > backend_max_queues) { + pr_err("Invalid max_queues (%u), backend supports max: %u.\n", + max_queues, backend_max_queues); + return -EINVAL; + } + info->max_queues = max_queues; + + return dynamic_reconfig_device(info, count); } +static DEVICE_ATTR_RW(max_queues); +static DEVICE_ATTR_RW(max_ring_page_order); +static DEVICE_ATTR_RW(max_indirect_segs); + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -2393,6 +2620,10 @@ static void blkfront_connect(struct blkfront_info *info) * supports indirect descriptors, and how many. */ blkif_recover(info); + if (info->reconfiguring) { + blk_mq_unfreeze_queue(info->rq); + info->reconfiguring = false; + } return; default: @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info) return; } + err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); + if (err) + goto fail; + + err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); + if (err) { + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); + goto fail; + } + + err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues); + if (err) { + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order); + device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs); + goto fail; + } xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info) add_disk(info->gd); info->is_ready = 1; + return; + +fail: + blkif_free(info, 0); + xlvbd_release_gendisk(info); + return; } /** @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: - if (dev->state == XenbusStateClosed) + if (dev->state == XenbusStateClosed) { + if (info->reconfiguring) + if (blkfront_resume(info->xbdev)) { + /* Resume failed. */
+ info->reconfiguring = false; + xenbus_switch_state(info->xbdev, XenbusStateClosed); + pr_err("Resume from dynamic configuration failed\n"); + } break; + } /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: if (info)
The current VBD layer reserves buffer space for each attached device based on three statically configured settings which are read at boot time. * max_indirect_segs: Maximum number of segments. * max_ring_page_order: Maximum order of pages to be used for the shared ring. * max_queues: Maximum number of queues (rings) to be used. But the storage backend, workload, and guest memory result in very different tuning requirements. It's impossible to centrally predict application characteristics, so it's best to allow the settings to be dynamically adjusted based on the workload inside the guest. Usage: Show current values: cat /sys/devices/vbd-xxx/max_indirect_segs cat /sys/devices/vbd-xxx/max_ring_page_order cat /sys/devices/vbd-xxx/max_queues Write new values: echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order echo <new value> > /sys/devices/vbd-xxx/max_queues Signed-off-by: Bob Liu <bob.liu@oracle.com> -- v3: * Remove new_max_indirect_segments. * Fix BUG_ON(). v2: * Rename to max_ring_page_order. * Remove the waiting code suggested by Roger. --- drivers/block/xen-blkfront.c | 277 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 269 insertions(+), 8 deletions(-)