[v3] xen-blkfront: dynamic configuration of per-vbd resources

Message ID 1469589685-31630-1-git-send-email-bob.liu@oracle.com (mailing list archive)
State New, archived

Commit Message

Bob Liu July 27, 2016, 3:21 a.m. UTC
The current VBD layer reserves buffer space for each attached device based on
three statically configured settings which are read at boot time.
 * max_indirect_segs: Maximum number of segments.
 * max_ring_page_order: Maximum order of pages to be used for the shared ring.
 * max_queues: Maximum number of queues (rings) to be used.

But the storage backend, workload, and guest memory result in very different
tuning requirements. It's impossible to predict application characteristics
centrally, so it's best to allow these settings to be adjusted dynamically
based on the workload inside the guest.

Usage:
Show current values:
cat /sys/devices/vbd-xxx/max_indirect_segs
cat /sys/devices/vbd-xxx/max_ring_page_order
cat /sys/devices/vbd-xxx/max_queues

Write new values:
echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
echo <new value> > /sys/devices/vbd-xxx/max_queues
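
For example, on a guest where the first disk shows up as vbd-51712 (the
device name here is illustrative; adjust to the actual vbd-* node), a
4-page shared ring and 64 indirect segments would be requested with:

echo 2 > /sys/devices/vbd-51712/max_ring_page_order
echo 64 > /sys/devices/vbd-51712/max_indirect_segs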

Signed-off-by: Bob Liu <bob.liu@oracle.com>
--
v3:
 * Remove new_max_indirect_segments.
 * Fix BUG_ON().
v2:
 * Rename to max_ring_page_order.
 * Remove the waiting code suggested by Roger.
---
 drivers/block/xen-blkfront.c |  277 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 269 insertions(+), 8 deletions(-)

Comments

Roger Pau Monné July 27, 2016, 8:07 a.m. UTC | #1
Hello,

Thanks for the fixes.

On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> The current VBD layer reserves buffer space for each attached device based on
> three statically configured settings which are read at boot time.
>  * max_indirect_segs: Maximum number of segments.
>  * max_ring_page_order: Maximum order of pages to be used for the shared ring.
>  * max_queues: Maximum number of queues (rings) to be used.
> 
> But the storage backend, workload, and guest memory result in very different
> tuning requirements. It's impossible to predict application characteristics
> centrally, so it's best to allow these settings to be adjusted dynamically
> based on the workload inside the guest.
> 
> Usage:
> Show current values:
> cat /sys/devices/vbd-xxx/max_indirect_segs
> cat /sys/devices/vbd-xxx/max_ring_page_order
> cat /sys/devices/vbd-xxx/max_queues
> 
> Write new values:
> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
> echo <new value> > /sys/devices/vbd-xxx/max_queues
> 
> Signed-off-by: Bob Liu <bob.liu@oracle.com>
> --
> v3:
>  * Remove new_max_indirect_segments.
>  * Fix BUG_ON().
> v2:
>  * Rename to max_ring_page_order.
>  * Remove the waiting code suggested by Roger.
> ---
>  drivers/block/xen-blkfront.c |  277 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 269 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 1b4c380..57baa54 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -212,6 +212,10 @@ struct blkfront_info
>  	/* Save uncomplete reqs and bios for migration. */
>  	struct list_head requests;
>  	struct bio_list bio_list;
> +	/* For dynamic configuration. */
> +	unsigned int reconfiguring:1;
> +	unsigned int max_ring_page_order;
> +	unsigned int max_queues;

max_{ring_page_order/queues} should be placed after max_indirect_segments, and 
reconfiguring should be of type bool (especially since below you assign 'true' 
or 'false' to it).
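
Something along these lines (untested sketch, keeping the names used in
the patch):

	unsigned int max_indirect_segments;	/* existing member */
	/* For dynamic configuration. */
	unsigned int max_ring_page_order;
	unsigned int max_queues;
	bool reconfiguring;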

>  };
>  
>  static unsigned int nr_minors;
> @@ -1350,6 +1354,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>  	for (i = 0; i < info->nr_rings; i++)
>  		blkif_free_ring(&info->rinfo[i]);
>  
> +	/* Remove old xenstore nodes. */
> +	if (info->nr_ring_pages > 1)
> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
> +
> +	if (info->nr_rings == 1) {
> +		if (info->nr_ring_pages == 1) {
> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
> +		} else {
> +			for (i = 0; i < info->nr_ring_pages; i++) {
> +				char ring_ref_name[RINGREF_NAME_LEN];
> +
> +				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> +				xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
> +			}
> +		}
> +	} else {
> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
> +
> +		for (i = 0; i < info->nr_rings; i++) {
> +			char queuename[QUEUE_NAME_LEN];
> +
> +			snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
> +		}
> +	}
>  	kfree(info->rinfo);
>  	info->rinfo = NULL;
>  	info->nr_rings = 0;
> @@ -1763,15 +1792,20 @@ static int talk_to_blkback(struct xenbus_device *dev,
>  	const char *message = NULL;
>  	struct xenbus_transaction xbt;
>  	int err;
> -	unsigned int i, max_page_order = 0;
> +	unsigned int i, backend_max_order = 0;
>  	unsigned int ring_page_order = 0;
>  
>  	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> -			   "max-ring-page-order", "%u", &max_page_order);
> +			   "max-ring-page-order", "%u", &backend_max_order);
>  	if (err != 1)
>  		info->nr_ring_pages = 1;
>  	else {
> -		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
> +		if (info->max_ring_page_order)
> +			/* Dynamic configured through /sys. */
> +			ring_page_order = min(info->max_ring_page_order, backend_max_order);
> +		else
> +			/* Default. */
> +			ring_page_order = min(xen_blkif_max_ring_order, backend_max_order);
>  		info->nr_ring_pages = 1 << ring_page_order;

In order to avoid this 'if' here (and below), what do you think of assigning 
the global values to the blkfront_info structure in blkfront_probe? Then you 
would only have to do min(info->max_ring_page_order, backend_max_order) in 
order to get the min value.
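
I.e. something like this in blkfront_probe (untested sketch, reusing the
existing module parameters):

	/* Seed the per-device limits from the module parameters so the
	 * negotiation code can use info->* unconditionally. */
	info->max_ring_page_order = xen_blkif_max_ring_order;
	info->max_queues = xen_blkif_max_queues;
	info->max_indirect_segments = xen_blkif_max_segments;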

>  	}
>  
> @@ -1894,7 +1928,13 @@ static int negotiate_mq(struct blkfront_info *info)
>  	if (err < 0)
>  		backend_max_queues = 1;
>  
> -	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> +	if (info->max_queues)
> +		/* Dynamic configured through /sys */
> +		info->nr_rings = min(backend_max_queues, info->max_queues);
> +	else
> +		/* Default. */
> +		info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> +
>  	/* We need at least one ring. */
>  	if (!info->nr_rings)
>  		info->nr_rings = 1;
> @@ -2352,11 +2392,198 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
>  			    NULL);
>  	if (err)
>  		info->max_indirect_segments = 0;
> -	else
> -		info->max_indirect_segments = min(indirect_segments,
> -						  xen_blkif_max_segments);
> +	else {
> +		if (info->max_indirect_segments)
> +			/* Dynamic configured through /sys */
> +			info->max_indirect_segments = min(indirect_segments,
> +							  info->max_indirect_segments);
> +		else
> +			info->max_indirect_segments = min(indirect_segments,
> +							  xen_blkif_max_segments);
> +	}
> +}
> +
> +static ssize_t max_ring_page_order_show(struct device *dev,
> +					struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
> +}
> +
> +static ssize_t max_indirect_segs_show(struct device *dev,
> +				      struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", info->max_indirect_segments);
> +}
> +
> +static ssize_t max_queues_show(struct device *dev,
> +			       struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", info->nr_rings);
> +}
> +
> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> +{
> +	/*
> +	 * Prevent new requests even to software request queue.
> +	 */
> +	blk_mq_freeze_queue(info->rq);
> +
> +	/*
> +	 * Guarantee no uncompleted reqs.
> +	 */
> +	if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
> +		blk_mq_unfreeze_queue(info->rq);
> +		pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev));
> +		return -EBUSY;
> +	}
> +
> +	/*
> +	 * Frontend: 				Backend:
> +	 * Freeze_queue()
> +	 * Switch to XenbusStateClosed
> +	 *					frontend_changed(StateClosed)
> +	 *						> xen_blkif_disconnect()
> +	 *						> Switch to XenbusStateClosed
> +	 * blkback_changed(StateClosed)
> +	 *	> blkfront_resume()
> +	 *	> Switch to StateInitialised
> +	 *					frontend_changed(StateInitialised):
> +	 *						> reconnect
> +	 *						> Switch to StateConnected
> +	 * blkback_changed(StateConnected)
> +	 *	> blkif_recover()
> +	 *		> Also switch to StateConnected
> +	 *		> Unfreeze_queue()
> +	 */
> +	info->reconfiguring = true;

This is racy AFAICT. You should use an atomic test_and_set in the condition 
above, or lock the blkfront_info mutex before and while setting 
reconfiguring. Without this, two (or more) concurrent calls to 
dynamic_reconfig_device might succeed.
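
For example (sketch only, assuming the existing per-device mutex in
blkfront_info is reused for this):

	/* Serialize writers of the sysfs nodes and claim the
	 * reconfiguration atomically with the in-flight check. */
	mutex_lock(&info->mutex);
	if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
		mutex_unlock(&info->mutex);
		blk_mq_unfreeze_queue(info->rq);
		return -EBUSY;
	}
	info->reconfiguring = true;
	mutex_unlock(&info->mutex);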

> +	xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +
> +	return count;
> +}
> +
> +static ssize_t max_indirect_segs_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_segs = 0, backend_max_segs = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_segs);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (max_segs == info->max_indirect_segments)
> +		return count;
> +
> +	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
> +			    "feature-max-indirect-segments", "%u", &backend_max_segs,
> +			    NULL);
> +	if (err) {
> +		pr_err("Backend %s doesn't support feature-indirect-segments.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (max_segs > backend_max_segs) {
> +		pr_err("Invalid max indirect segment (%u), backend-max: %u.\n",
> +			max_segs, backend_max_segs);
> +		return -EINVAL;
> +	}
> +
> +	info->max_indirect_segments = max_segs;
> +
> +	return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_ring_page_order_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_order = 0, backend_max_order = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_order);
> +	if (ret < 0)
> +		return ret;
> +
> +	if ((1 << max_order) == info->nr_ring_pages)
> +		return count;
> +
> +	if (max_order > XENBUS_MAX_RING_GRANT_ORDER) {
> +		pr_err("Invalid max_ring_page_order (%u), max: %u.\n",
> +				max_order, XENBUS_MAX_RING_GRANT_ORDER);
> +		return -EINVAL;
> +	}
> +
> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> +			   "max-ring-page-order", "%u", &backend_max_order);
> +	if (err != 1) {
> +		pr_err("Backend %s doesn't support feature multi-page-ring.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +	if (max_order > backend_max_order) {
> +		pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n",
> +			max_order, backend_max_order);
> +		return -EINVAL;
> +	}
> +	info->max_ring_page_order = max_order;
> +
> +	return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_queues_store(struct device *dev,
> +				struct device_attribute *attr,
> +				const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_queues = 0, backend_max_queues = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_queues);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (max_queues == info->nr_rings)
> +		return count;
> +
> +	if (max_queues > num_online_cpus()) {
> +		pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n",
> +			max_queues, num_online_cpus());
> +		return -EINVAL;
> +	}
> +
> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> +			   "multi-queue-max-queues", "%u", &backend_max_queues);
> +	if (err != 1) {
> +		pr_err("Backend %s doesn't support block multi queue.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +	if (max_queues > backend_max_queues) {
> +		pr_err("Invalid max_queues (%u), backend supports max: %u.\n",
> +			max_queues, backend_max_queues);
> +		return -EINVAL;
> +	}
> +	info->max_queues = max_queues;
> +
> +	return dynamic_reconfig_device(info, count);
>  }
>  
> +static DEVICE_ATTR_RW(max_queues);
> +static DEVICE_ATTR_RW(max_ring_page_order);
> +static DEVICE_ATTR_RW(max_indirect_segs);
> +
>  /*
>   * Invoked when the backend is finally 'ready' (and has told produced
>   * the details about the physical device - #sectors, size, etc).
> @@ -2393,6 +2620,10 @@ static void blkfront_connect(struct blkfront_info *info)
>  		 * supports indirect descriptors, and how many.
>  		 */
>  		blkif_recover(info);
> +		if (info->reconfiguring) {
> +			blk_mq_unfreeze_queue(info->rq);
> +			info->reconfiguring = false;
> +		}
>  		return;
>  
>  	default:
> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info)
>  		return;
>  	}
>  
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +	if (err)
> +		goto fail;
> +
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> +	if (err) {
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +		goto fail;
> +	}
> +
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
> +	if (err) {
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> +		goto fail;
> +	}
>  	xenbus_switch_state(info->xbdev, XenbusStateConnected);
>  
>  	/* Kick pending requests. */
> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info)
>  	add_disk(info->gd);
>  
>  	info->is_ready = 1;
> +	return;
> +
> +fail:
> +	blkif_free(info, 0);
> +	xlvbd_release_gendisk(info);
> +	return;

Hm, I'm not sure whether this chunk should be in a separate patch; it seems 
like blkfront_connect doesn't properly clean up on error (if 
xlvbd_alloc_gendisk fails, blkif_free will not be called). Do you think you 
could send the addition of the 'fail' label as a separate patch and fix the 
error path of xlvbd_alloc_gendisk?
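
I.e. something like this at the xlvbd_alloc_gendisk call site (sketch):

	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
				  physical_sector_size);
	if (err) {
		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
				 info->xbdev->otherend);
		blkif_free(info, 0);	/* the missing cleanup */
		return;
	}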

>  }
>  
>  /**
> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev,
>  		break;
>  
>  	case XenbusStateClosed:
> -		if (dev->state == XenbusStateClosed)
> +		if (dev->state == XenbusStateClosed) {
> +			if (info->reconfiguring)
> +				if (blkfront_resume(info->xbdev)) {

Could you please join those two conditions:

if (info->reconfiguring && blkfront_resume(info->xbdev)) { ...

Also, I'm not sure this is correct, if blkfront sees the "Closing" state on 
blkback it will try to close the frontend and destroy the block device (see 
blkfront_closing), and this should be avoided. You should call 
blkfront_resume as soon as you see the backend move to the Closed or Closing 
states, without calling blkfront_closing.

> +					/* Resume failed. */
> +					info->reconfiguring = false;
> +					xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +					pr_err("Resume from dynamic configuration failed\n");
> +				}
>  			break;
> +		}
>  		/* Missed the backend's Closing state -- fallthrough */
>  	case XenbusStateClosing:
>  		if (info)
> -- 
> 1.7.10.4
Bob Liu July 27, 2016, 9:12 a.m. UTC | #2
On 07/27/2016 04:07 PM, Roger Pau Monné wrote:
..[snip]..
>> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info)
>>  		return;
>>  	}
>>  
>> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
>> +	if (err)
>> +		goto fail;
>> +
>> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
>> +	if (err) {
>> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
>> +		goto fail;
>> +	}
>> +
>> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
>> +	if (err) {
>> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
>> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
>> +		goto fail;
>> +	}
>>  	xenbus_switch_state(info->xbdev, XenbusStateConnected);
>>  
>>  	/* Kick pending requests. */
>> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info)
>>  	add_disk(info->gd);
>>  
>>  	info->is_ready = 1;
>> +	return;
>> +
>> +fail:
>> +	blkif_free(info, 0);
>> +	xlvbd_release_gendisk(info);
>> +	return;
> 
> Hm, I'm not sure whether this chunk should be in a separate patch; it seems 
> like blkfront_connect doesn't properly clean up on error (if 
> xlvbd_alloc_gendisk fails, blkif_free will not be called). Do you think you 
> could send the addition of the 'fail' label as a separate patch and fix the 
> error path of xlvbd_alloc_gendisk?
> 

Sure, will fix all of your comments above.

>>  }
>>  
>>  /**
>> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev,
>>  		break;
>>  
>>  	case XenbusStateClosed:
>> -		if (dev->state == XenbusStateClosed)
>> +		if (dev->state == XenbusStateClosed) {
>> +			if (info->reconfiguring)
>> +				if (blkfront_resume(info->xbdev)) {
> 
> Could you please join those two conditions:
> 
> if (info->reconfiguring && blkfront_resume(info->xbdev)) { ...
> 
> Also, I'm not sure this is correct, if blkfront sees the "Closing" state on 
> blkback it will try to close the frontend and destroy the block device (see 
> blkfront_closing), and this should be avoided. You should call 
> blkfront_resume as soon as you see the backend move to the Closed or Closing 
> states, without calling blkfront_closing.
> 

I didn't get how this can happen: the backend state won't be changed to 'Closing' before blkfront_closing() is called.
So I think the current logic is fine.

Btw: could you please ack [PATCH v2 2/3] xen-blkfront: introduce blkif_set_queue_limits()?

Thank you!
Bob
Paul Durrant July 27, 2016, 9:29 a.m. UTC | #3
> -----Original Message-----
[snip]
> >
> > Also, I'm not sure this is correct, if blkfront sees the "Closing" state on
> > blkback it will try to close the frontend and destroy the block device (see
> > blkfront_closing), and this should be avoided. You should call
> > blkfront_resume as soon as you see the backend move to the Closed or
> > Closing states, without calling blkfront_closing.
> >
> 
> I didn't get how this can happen: the backend state won't be changed to
> 'Closing' before blkfront_closing() is called.
> So I think the current logic is fine.
> 

Backends can go to Closing before frontends when the PV device is being detached.

  Paul
Roger Pau Monné July 27, 2016, 10:09 a.m. UTC | #4
On Wed, Jul 27, 2016 at 05:12:22PM +0800, Bob Liu wrote:
> 
> On 07/27/2016 04:07 PM, Roger Pau Monné wrote:
> ..[snip]..
> >> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info)
> >>  		return;
> >>  	}
> >>  
> >> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> >> +	if (err)
> >> +		goto fail;
> >> +
> >> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> >> +	if (err) {
> >> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> >> +		goto fail;
> >> +	}
> >> +
> >> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
> >> +	if (err) {
> >> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> >> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> >> +		goto fail;
> >> +	}
> >>  	xenbus_switch_state(info->xbdev, XenbusStateConnected);
> >>  
> >>  	/* Kick pending requests. */
> >> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info)
> >>  	add_disk(info->gd);
> >>  
> >>  	info->is_ready = 1;
> >> +	return;
> >> +
> >> +fail:
> >> +	blkif_free(info, 0);
> >> +	xlvbd_release_gendisk(info);
> >> +	return;
> > 
> > Hm, I'm not sure whether this chunk should be in a separate patch; it seems 
> > like blkfront_connect doesn't properly clean up on error (if 
> > xlvbd_alloc_gendisk fails, blkif_free will not be called). Do you think you 
> > could send the addition of the 'fail' label as a separate patch and fix the 
> > error path of xlvbd_alloc_gendisk?
> > 
> 
> Sure, will fix all of your comments above.
> 
> >>  }
> >>  
> >>  /**
> >> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev,
> >>  		break;
> >>  
> >>  	case XenbusStateClosed:
> >> -		if (dev->state == XenbusStateClosed)
> >> +		if (dev->state == XenbusStateClosed) {
> >> +			if (info->reconfiguring)
> >> +				if (blkfront_resume(info->xbdev)) {
> > 
> > Could you please join those two conditions:
> > 
> > if (info->reconfiguring && blkfront_resume(info->xbdev)) { ...
> > 
> > Also, I'm not sure this is correct, if blkfront sees the "Closing" state on 
> > blkback it will try to close the frontend and destroy the block device (see 
> > blkfront_closing), and this should be avoided. You should call 
> > blkfront_resume as soon as you see the backend move to the Closed or Closing 
> > states, without calling blkfront_closing.
> > 
> 
> I didn't get how this can happen: the backend state won't be changed to 'Closing' before blkfront_closing() is called.
> So I think the current logic is fine.

I see, in dynamic_reconfig_device you change the frontend state to "Closed". 
Then if the backend switches to state "Closing" blkfront will call 
blkfront_closing, but that won't do anything because the frontend state is 
already at the "Closed" state, at which point you just wait for the backend 
to finally reach state "Closed" in order to perform the reconnection. I 
stand corrected, I think the current logic is indeed fine.
 
Roger.
Roger Pau Monné July 27, 2016, 10:59 a.m. UTC | #5
On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
[...]
> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> +{
> +	/*
> +	 * Prevent new requests even to software request queue.
> +	 */
> +	blk_mq_freeze_queue(info->rq);
> +
> +	/*
> +	 * Guarantee no uncompleted reqs.
> +	 */

I'm also wondering, why do you need to guarantee that there are no 
uncompleted requests? The resume procedure is going to call blkif_recover 
that will take care of requeuing any unfinished requests that are on the 
ring.

> +	if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
> +		blk_mq_unfreeze_queue(info->rq);
> +		pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev));
> +		return -EBUSY;
> +	}

Roger.
Bob Liu July 27, 2016, 11:21 a.m. UTC | #6
On 07/27/2016 06:59 PM, Roger Pau Monné wrote:
> On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> [...]
>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
>> +{
>> +	/*
>> +	 * Prevent new requests even to software request queue.
>> +	 */
>> +	blk_mq_freeze_queue(info->rq);
>> +
>> +	/*
>> +	 * Guarantee no uncompleted reqs.
>> +	 */
> 
> I'm also wondering, why do you need to guarantee that there are no 
> uncompleted requests? The resume procedure is going to call blkif_recover 
> that will take care of requeuing any unfinished requests that are on the 
> ring.
> 

Because there may be requests in the software request queue with more segments than
we can handle (if info->max_indirect_segments is reduced).

blkif_recover() can't handle this since blk-mq was introduced,
because there is no way to iterate the sw-request queues (blk_fetch_request() can't be used with blk-mq).

So there is a bug in blkif_recover(). I was thinking of implementing the suspend function of blkfront_driver like:

+static int blkfront_suspend(struct xenbus_device *dev)
+{
+       blk_mq_freeze_queue(info->rq);
+       ..
+}
 static struct xenbus_driver blkfront_driver = {
        .ids  = blkfront_ids,
        .probe = blkfront_probe,
        .remove = blkfront_remove,
+       .suspend = blkfront_suspend,
        .resume = blkfront_resume,
Roger Pau Monné July 27, 2016, 2:24 p.m. UTC | #7
On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote:
> 
> On 07/27/2016 06:59 PM, Roger Pau Monné wrote:
> > On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> > [...]
> >> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> >> +{
> >> +	/*
> >> +	 * Prevent new requests even to software request queue.
> >> +	 */
> >> +	blk_mq_freeze_queue(info->rq);
> >> +
> >> +	/*
> >> +	 * Guarantee no uncompleted reqs.
> >> +	 */
> > 
> > I'm also wondering, why do you need to guarantee that there are no 
> > uncompleted requests? The resume procedure is going to call blkif_recover 
> > that will take care of requeuing any unfinished requests that are on the 
> > ring.
> > 
> 
> Because there may be requests in the software request queue with more segments than
> we can handle (if info->max_indirect_segments is reduced).
> 
> blkif_recover() can't handle this since blk-mq was introduced,
> because there is no way to iterate the sw-request queues (blk_fetch_request() can't be used with blk-mq).
> 
> So there is a bug in blkif_recover(). I was thinking of implementing the suspend function of blkfront_driver like:

Hm, this is a regression and should be fixed ASAP. I'm still not sure I 
follow: doesn't blk_queue_max_segments change the number of segments the 
requests on the queue are going to have? So that you will only have to 
re-queue the requests already on the ring?

Waiting for the whole queue to be flushed before suspending is IMHO not 
acceptable; it introduces an unbounded delay during migration if the backend 
is slow for some reason.

Roger.
Bob Liu July 27, 2016, 11:05 p.m. UTC | #8
On 07/27/2016 10:24 PM, Roger Pau Monné wrote:
> On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote:
>>
>> On 07/27/2016 06:59 PM, Roger Pau Monné wrote:
>>> On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
>>> [...]
>>>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
>>>> +{
>>>> +	/*
>>>> +	 * Prevent new requests even to software request queue.
>>>> +	 */
>>>> +	blk_mq_freeze_queue(info->rq);
>>>> +
>>>> +	/*
>>>> +	 * Guarantee no uncompleted reqs.
>>>> +	 */
>>>
>>> I'm also wondering, why do you need to guarantee that there are no 
>>> uncompleted requests? The resume procedure is going to call blkif_recover 
>>> that will take care of requeuing any unfinished requests that are on the 
>>> ring.
>>>
>>
>> Because there may be requests in the software request queue with more segments than
>> we can handle (if info->max_indirect_segments is reduced).
>>
>> blkif_recover() can't handle this since blk-mq was introduced,
>> because there is no way to iterate the sw-request queues (blk_fetch_request() can't be used with blk-mq).
>>
>> So there is a bug in blkif_recover(). I was thinking of implementing the suspend function of blkfront_driver like:
> 
> Hm, this is a regression and should be fixed ASAP. I'm still not sure I 
> follow: doesn't blk_queue_max_segments change the number of segments the 
> requests on the queue are going to have? So that you will only have to 
> re-queue the requests already on the ring?
> 

That's not enough; request queues were split into software queues and hardware queues when blk-mq was introduced.
We need to consider two more things:
 * Stop new requests from being added to the software queues before blk_queue_max_segments() is called (they would still use the old 'max-indirect-segments').
   I didn't see any other way except calling blk_mq_freeze_queue().

 * Requests already in the software queues but built with the old 'max-indirect-segments' also have to be re-queued based on the new value.
   No blk-mq API can do this.

> Waiting for the whole queue to be flushed before suspending is IMHO not 
> acceptable; it introduces an unbounded delay during migration if the backend 
> is slow for some reason.
> 

Right, I also hope there is a better solution.
Roger Pau Monné July 28, 2016, 7:49 a.m. UTC | #9
On Thu, Jul 28, 2016 at 07:05:05AM +0800, Bob Liu wrote:
> 
> On 07/27/2016 10:24 PM, Roger Pau Monné wrote:
> > On Wed, Jul 27, 2016 at 07:21:05PM +0800, Bob Liu wrote:
> >>
> >> On 07/27/2016 06:59 PM, Roger Pau Monné wrote:
> >>> On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> >>> [...]
> >>>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> >>>> +{
> >>>> +	/*
> >>>> +	 * Prevent new requests even to software request queue.
> >>>> +	 */
> >>>> +	blk_mq_freeze_queue(info->rq);
> >>>> +
> >>>> +	/*
> >>>> +	 * Guarantee no uncompleted reqs.
> >>>> +	 */
> >>>
> >>> I'm also wondering, why do you need to guarantee that there are no 
> >>> uncompleted requests? The resume procedure is going to call blkif_recover 
> >>> that will take care of requeuing any unfinished requests that are on the 
> >>> ring.
> >>>
> >>
> >> Because there may be requests in the software request queue with more segments than
> >> we can handle (if info->max_indirect_segments is reduced).
> >>
> >> blkif_recover() can't handle this since blk-mq was introduced,
> >> because there is no way to iterate the sw-request queues (blk_fetch_request() can't be used with blk-mq).
> >>
> >> So there is a bug in blkif_recover(). I was thinking of implementing the suspend function of blkfront_driver like:
> > 
> > Hm, this is a regression and should be fixed ASAP. I'm still not sure I 
> > follow: doesn't blk_queue_max_segments change the number of segments the 
> > requests on the queue are going to have? So that you will only have to 
> > re-queue the requests already on the ring?
> > 
> 
> That's not enough; request queues were split into software queues and hardware queues when blk-mq was introduced.
> We need to consider two more things:
>  * Stop new requests from being added to the software queues before blk_queue_max_segments() is called (they would still use the old 'max-indirect-segments').
>    I didn't see any other way except calling blk_mq_freeze_queue().

Right, stopping the queues doesn't seem to be an issue?

>  * Requests already in the software queues but built with the old 'max-indirect-segments' also have to be re-queued based on the new value.
>    No blk-mq API can do this.

I'm afraid I don't know much about the new blk-mq API; you will have to ask 
the blk-mq maintainers for solutions, since this was possible with the 
previous API (and I would consider it a regression that the new blk-mq API 
doesn't allow it).

Roger.
David Vrabel July 28, 2016, 9:31 a.m. UTC | #10
On 27/07/16 04:21, Bob Liu wrote:
> The current VBD layer reserves buffer space for each attached device based on
> three statically configured settings which are read at boot time.
>  * max_indirect_segs: Maximum number of segments.
>  * max_ring_page_order: Maximum order of pages to be used for the shared ring.
>  * max_queues: Maximum number of queues (rings) to be used.

I think you want a single knob for "maximum number of pages in flight".
 This is much easier for an admin to relate to resource usage.

You could apply this limit without having to reconfigure, which would
simplify this patch quite a bit.
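
Roughly (illustrative sketch only; the max_pages_in_flight knob and field
are made-up names, not part of this patch, and the accounting against the
limit is left out):

static ssize_t max_pages_in_flight_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct blkfront_info *info = dev_get_drvdata(dev);
	unsigned int max_pages;
	int ret;

	ret = kstrtouint(buf, 10, &max_pages);
	if (ret < 0)
		return ret;

	/* Hypothetical limit: the submission path would throttle once the
	 * number of granted pages in flight reaches this value, so it can
	 * change at runtime without reconnecting to the backend. */
	info->max_pages_in_flight = max_pages;
	return count;
}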

David

Patch

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 1b4c380..57baa54 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -212,6 +212,10 @@  struct blkfront_info
 	/* Save uncomplete reqs and bios for migration. */
 	struct list_head requests;
 	struct bio_list bio_list;
+	/* For dynamic configuration. */
+	unsigned int reconfiguring:1;
+	unsigned int max_ring_page_order;
+	unsigned int max_queues;
 };
 
 static unsigned int nr_minors;
@@ -1350,6 +1354,31 @@  static void blkif_free(struct blkfront_info *info, int suspend)
 	for (i = 0; i < info->nr_rings; i++)
 		blkif_free_ring(&info->rinfo[i]);
 
+	/* Remove old xenstore nodes. */
+	if (info->nr_ring_pages > 1)
+		xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
+
+	if (info->nr_rings == 1) {
+		if (info->nr_ring_pages == 1) {
+			xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
+		} else {
+			for (i = 0; i < info->nr_ring_pages; i++) {
+				char ring_ref_name[RINGREF_NAME_LEN];
+
+				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+				xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
+			}
+		}
+	} else {
+		xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
+
+		for (i = 0; i < info->nr_rings; i++) {
+			char queuename[QUEUE_NAME_LEN];
+
+			snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
+			xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
+		}
+	}
 	kfree(info->rinfo);
 	info->rinfo = NULL;
 	info->nr_rings = 0;
@@ -1763,15 +1792,20 @@  static int talk_to_blkback(struct xenbus_device *dev,
 	const char *message = NULL;
 	struct xenbus_transaction xbt;
 	int err;
-	unsigned int i, max_page_order = 0;
+	unsigned int i, backend_max_order = 0;
 	unsigned int ring_page_order = 0;
 
 	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
-			   "max-ring-page-order", "%u", &max_page_order);
+			   "max-ring-page-order", "%u", &backend_max_order);
 	if (err != 1)
 		info->nr_ring_pages = 1;
 	else {
-		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+		if (info->max_ring_page_order)
+			/* Dynamic configured through /sys. */
+			ring_page_order = min(info->max_ring_page_order, backend_max_order);
+		else
+			/* Default. */
+			ring_page_order = min(xen_blkif_max_ring_order, backend_max_order);
 		info->nr_ring_pages = 1 << ring_page_order;
 	}
 
@@ -1894,7 +1928,13 @@  static int negotiate_mq(struct blkfront_info *info)
 	if (err < 0)
 		backend_max_queues = 1;
 
-	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+	if (info->max_queues)
+		/* Dynamic configured through /sys */
+		info->nr_rings = min(backend_max_queues, info->max_queues);
+	else
+		/* Default. */
+		info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+
 	/* We need at least one ring. */
 	if (!info->nr_rings)
 		info->nr_rings = 1;
@@ -2352,11 +2392,198 @@  static void blkfront_gather_backend_features(struct blkfront_info *info)
 			    NULL);
 	if (err)
 		info->max_indirect_segments = 0;
-	else
-		info->max_indirect_segments = min(indirect_segments,
-						  xen_blkif_max_segments);
+	else {
+		if (info->max_indirect_segments)
+			/* Dynamic configured through /sys */
+			info->max_indirect_segments = min(indirect_segments,
+							  info->max_indirect_segments);
+		else
+			info->max_indirect_segments = min(indirect_segments,
+							  xen_blkif_max_segments);
+	}
+}
+
+static ssize_t max_ring_page_order_show(struct device *dev,
+					struct device_attribute *attr, char *page)
+{
+	struct blkfront_info *info = dev_get_drvdata(dev);
+
+	return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
+}
+
+static ssize_t max_indirect_segs_show(struct device *dev,
+				      struct device_attribute *attr, char *page)
+{
+	struct blkfront_info *info = dev_get_drvdata(dev);
+
+	return sprintf(page, "%u\n", info->max_indirect_segments);
+}
+
+static ssize_t max_queues_show(struct device *dev,
+			       struct device_attribute *attr, char *page)
+{
+	struct blkfront_info *info = dev_get_drvdata(dev);
+
+	return sprintf(page, "%u\n", info->nr_rings);
+}
+
+static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
+{
+	/*
+	 * Prevent new requests even to software request queue.
+	 */
+	blk_mq_freeze_queue(info->rq);
+
+	/*
+	 * Guarantee no uncompleted reqs.
+	 */
+	if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
+		blk_mq_unfreeze_queue(info->rq);
+		pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev));
+		return -EBUSY;
+	}
+
+	/*
+	 * Frontend: 				Backend:
+	 * Freeze_queue()
+	 * Switch to XenbusStateClosed
+	 *					frontend_changed(StateClosed)
+	 *						> xen_blkif_disconnect()
+	 *						> Switch to XenbusStateClosed
+	 * blkback_changed(StateClosed)
+	 *	> blkfront_resume()
+	 *	> Switch to StateInitialised
+	 *					frontend_changed(StateInitialised):
+	 *						> reconnect
+	 *						> Switch to StateConnected
+	 * blkback_changed(StateConnected)
+	 *	> blkif_recover()
+	 *		> Also switch to StateConnected
+	 *		> Unfreeze_queue()
+	 */
+	info->reconfiguring = true;
+	xenbus_switch_state(info->xbdev, XenbusStateClosed);
+
+	return count;
+}
+
+static ssize_t max_indirect_segs_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	ssize_t ret;
+	unsigned int max_segs = 0, backend_max_segs = 0;
+	struct blkfront_info *info = dev_get_drvdata(dev);
+	int err;
+
+	ret = kstrtouint(buf, 10, &max_segs);
+	if (ret < 0)
+		return ret;
+
+	if (max_segs == info->max_indirect_segments)
+		return count;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-max-indirect-segments", "%u", &backend_max_segs,
+			    NULL);
+	if (err) {
+		pr_err("Backend %s doesn't support feature-indirect-segments.\n",
+			info->xbdev->otherend);
+		return -EOPNOTSUPP;
+	}
+
+	if (max_segs > backend_max_segs) {
+		pr_err("Invalid max indirect segment (%u), backend-max: %u.\n",
+			max_segs, backend_max_segs);
+		return -EINVAL;
+	}
+
+	info->max_indirect_segments = max_segs;
+
+	return dynamic_reconfig_device(info, count);
+}
+
+static ssize_t max_ring_page_order_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	ssize_t ret;
+	unsigned int max_order = 0, backend_max_order = 0;
+	struct blkfront_info *info = dev_get_drvdata(dev);
+	int err;
+
+	ret = kstrtouint(buf, 10, &max_order);
+	if (ret < 0)
+		return ret;
+
+	if ((1 << max_order) == info->nr_ring_pages)
+		return count;
+
+	if (max_order > XENBUS_MAX_RING_GRANT_ORDER) {
+		pr_err("Invalid max_ring_page_order (%u), max: %u.\n",
+				max_order, XENBUS_MAX_RING_GRANT_ORDER);
+		return -EINVAL;
+	}
+
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "max-ring-page-order", "%u", &backend_max_order);
+	if (err != 1) {
+		pr_err("Backend %s doesn't support feature multi-page-ring.\n",
+			info->xbdev->otherend);
+		return -EOPNOTSUPP;
+	}
+	if (max_order > backend_max_order) {
+		pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n",
+			max_order, backend_max_order);
+		return -EINVAL;
+	}
+	info->max_ring_page_order = max_order;
+
+	return dynamic_reconfig_device(info, count);
+}
+
+static ssize_t max_queues_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	ssize_t ret;
+	unsigned int max_queues = 0, backend_max_queues = 0;
+	struct blkfront_info *info = dev_get_drvdata(dev);
+	int err;
+
+	ret = kstrtouint(buf, 10, &max_queues);
+	if (ret < 0)
+		return ret;
+
+	if (max_queues == info->nr_rings)
+		return count;
+
+	if (max_queues > num_online_cpus()) {
+		pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n",
+			max_queues, num_online_cpus());
+		return -EINVAL;
+	}
+
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "multi-queue-max-queues", "%u", &backend_max_queues);
+	if (err != 1) {
+		pr_err("Backend %s doesn't support block multi queue.\n",
+			info->xbdev->otherend);
+		return -EOPNOTSUPP;
+	}
+	if (max_queues > backend_max_queues) {
+		pr_err("Invalid max_queues (%u), backend supports max: %u.\n",
+			max_queues, backend_max_queues);
+		return -EINVAL;
+	}
+	info->max_queues = max_queues;
+
+	return dynamic_reconfig_device(info, count);
 }
 
+static DEVICE_ATTR_RW(max_queues);
+static DEVICE_ATTR_RW(max_ring_page_order);
+static DEVICE_ATTR_RW(max_indirect_segs);
+
 /*
  * Invoked when the backend is finally 'ready' (and has told produced
  * the details about the physical device - #sectors, size, etc).
@@ -2393,6 +2620,10 @@  static void blkfront_connect(struct blkfront_info *info)
 		 * supports indirect descriptors, and how many.
 		 */
 		blkif_recover(info);
+		if (info->reconfiguring) {
+			blk_mq_unfreeze_queue(info->rq);
+			info->reconfiguring = false;
+		}
 		return;
 
 	default:
@@ -2443,6 +2674,22 @@  static void blkfront_connect(struct blkfront_info *info)
 		return;
 	}
 
+	err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
+	if (err)
+		goto fail;
+
+	err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
+	if (err) {
+		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
+		goto fail;
+	}
+
+	err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
+	if (err) {
+		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
+		device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
+		goto fail;
+	}
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
@@ -2453,6 +2700,12 @@  static void blkfront_connect(struct blkfront_info *info)
 	add_disk(info->gd);
 
 	info->is_ready = 1;
+	return;
+
+fail:
+	blkif_free(info, 0);
+	xlvbd_release_gendisk(info);
+	return;
 }
 
 /**
@@ -2500,8 +2753,16 @@  static void blkback_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateClosed:
-		if (dev->state == XenbusStateClosed)
+		if (dev->state == XenbusStateClosed) {
+			if (info->reconfiguring)
+				if (blkfront_resume(info->xbdev)) {
+					/* Resume failed. */
+					info->reconfiguring = false;
+					xenbus_switch_state(info->xbdev, XenbusStateClosed);
+					pr_err("Resume from dynamic configuration failed\n");
+				}
 			break;
+		}
 		/* Missed the backend's Closing state -- fallthrough */
 	case XenbusStateClosing:
 		if (info)