diff mbox

[v3] vfio/mdev: Check globally for duplicate devices

Message ID 20180517031746.32242.17768.stgit@gimli.home (mailing list archive)
State New, archived
Headers show

Commit Message

Alex Williamson May 17, 2018, 3:30 a.m. UTC
When we create an mdev device, we check for duplicates against the
parent device and return -EEXIST if found, but the mdev device
namespace is global since we'll link all devices from the bus.  We do
catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
with it comes a kernel warning and stack trace for trying to create
duplicate sysfs links, which makes it an undesirable response.

Therefore we should really be looking for duplicates across all mdev
parent devices, or as implemented here, against our mdev device list.
Using mdev_list to prevent duplicates means that we can remove
mdev_parent.lock, but in order not to serialize mdev device creation
and removal globally, we add mdev_device.active which allows UUIDs to
be reserved such that we can drop the mdev_list_lock before the mdev
device is fully in place.

NB. there was never intended to be any serialization guarantee
provided by the mdev core with respect to creation and removal of mdev
devices, mdev_parent.lock provided this only as a side-effect of the
implementation for locking the namespace per parent.  That
serialization is now removed.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v3: Rework locking and add a field to mdev_device so we can track
    completed instances vs those added to reserve the namespace.

 drivers/vfio/mdev/mdev_core.c    |   94 +++++++++++++-------------------------
 drivers/vfio/mdev/mdev_private.h |    2 -
 2 files changed, 34 insertions(+), 62 deletions(-)

Comments

Cornelia Huck May 17, 2018, 8:09 a.m. UTC | #1
On Wed, 16 May 2018 21:30:19 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> When we create an mdev device, we check for duplicates against the
> parent device and return -EEXIST if found, but the mdev device
> namespace is global since we'll link all devices from the bus.  We do
> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> with it comes a kernel warning and stack trace for trying to create
> duplicate sysfs links, which makes it an undesirable response.
> 
> Therefore we should really be looking for duplicates across all mdev
> parent devices, or as implemented here, against our mdev device list.
> Using mdev_list to prevent duplicates means that we can remove
> mdev_parent.lock, but in order not to serialize mdev device creation
> and removal globally, we add mdev_device.active which allows UUIDs to
> be reserved such that we can drop the mdev_list_lock before the mdev
> device is fully in place.
> 
> NB. there was never intended to be any serialization guarantee
> provided by the mdev core with respect to creation and removal of mdev
> devices, mdev_parent.lock provided this only as a side-effect of the
> implementation for locking the namespace per parent.  That
> serialization is now removed.

This is probably fine; but I noted that documentation on the locking
conventions and serialization guarantees for mdev is a bit sparse, and
this topic also came up during the vfio-ap review.

We probably want to add some more concrete documentation; would the
kernel doc for the _ops or vfio-mediated-device.txt be a better place
for that?

[Dong Jia, Halil: Can you please take a look whether vfio-ccw is really
ok? I don't think we open up any new races, but I'd appreciate a second
or third opinion.]

> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> v3: Rework locking and add a field to mdev_device so we can track
>     completed instances vs those added to reserve the namespace.
> 
>  drivers/vfio/mdev/mdev_core.c    |   94 +++++++++++++-------------------------
>  drivers/vfio/mdev/mdev_private.h |    2 -
>  2 files changed, 34 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> index 126991046eb7..55ea9d34ec69 100644
> --- a/drivers/vfio/mdev/mdev_core.c
> +++ b/drivers/vfio/mdev/mdev_core.c
> @@ -66,34 +66,6 @@ uuid_le mdev_uuid(struct mdev_device *mdev)
>  }
>  EXPORT_SYMBOL(mdev_uuid);
>  
> -static int _find_mdev_device(struct device *dev, void *data)
> -{
> -	struct mdev_device *mdev;
> -
> -	if (!dev_is_mdev(dev))
> -		return 0;
> -
> -	mdev = to_mdev_device(dev);
> -
> -	if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
> -		return 1;
> -
> -	return 0;
> -}
> -
> -static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid)
> -{
> -	struct device *dev;
> -
> -	dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
> -	if (dev) {
> -		put_device(dev);
> -		return true;
> -	}
> -
> -	return false;
> -}
> -
>  /* Should be called holding parent_list_lock */
>  static struct mdev_parent *__find_parent_device(struct device *dev)
>  {
> @@ -221,7 +193,6 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
>  	}
>  
>  	kref_init(&parent->ref);
> -	mutex_init(&parent->lock);
>  
>  	parent->dev = dev;
>  	parent->ops = ops;
> @@ -304,7 +275,7 @@ static void mdev_device_release(struct device *dev)
>  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>  {
>  	int ret;
> -	struct mdev_device *mdev;
> +	struct mdev_device *mdev, *tmp;
>  	struct mdev_parent *parent;
>  	struct mdev_type *type = to_mdev_type(kobj);
>  
> @@ -312,21 +283,26 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>  	if (!parent)
>  		return -EINVAL;
>  
> -	mutex_lock(&parent->lock);
> +	mutex_lock(&mdev_list_lock);
>  
>  	/* Check for duplicate */
> -	if (mdev_device_exist(parent, uuid)) {
> -		ret = -EEXIST;
> -		goto create_err;
> +	list_for_each_entry(tmp, &mdev_list, next) {
> +		if (!uuid_le_cmp(tmp->uuid, uuid)) {
> +			mutex_unlock(&mdev_list_lock);
> +			return -EEXIST;
> +		}
>  	}
>  
>  	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
>  	if (!mdev) {
> -		ret = -ENOMEM;
> -		goto create_err;
> +		mutex_unlock(&mdev_list_lock);
> +		return -ENOMEM;
>  	}
>  
>  	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
> +	list_add(&mdev->next, &mdev_list);
> +	mutex_unlock(&mdev_list_lock);
> +
>  	mdev->parent = parent;
>  	kref_init(&mdev->ref);
>  
> @@ -352,21 +328,18 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>  	}
>  
>  	mdev->type_kobj = kobj;
> +	mdev->active = true;
>  	dev_dbg(&mdev->dev, "MDEV: created\n");
>  
> -	mutex_unlock(&parent->lock);
> -
> -	mutex_lock(&mdev_list_lock);
> -	list_add(&mdev->next, &mdev_list);
> -	mutex_unlock(&mdev_list_lock);
> -
> -	return ret;
> +	return 0;
>  
>  create_failed:
>  	device_unregister(&mdev->dev);
>  
>  create_err:
> -	mutex_unlock(&parent->lock);
> +	mutex_lock(&mdev_list_lock);
> +	list_del(&mdev->next);
> +	mutex_unlock(&mdev_list_lock);
>  	mdev_put_parent(parent);
>  	return ret;
>  }
> @@ -377,44 +350,43 @@ int mdev_device_remove(struct device *dev, bool force_remove)
>  	struct mdev_parent *parent;
>  	struct mdev_type *type;
>  	int ret;
> -	bool found = false;
>  
>  	mdev = to_mdev_device(dev);
>  
>  	mutex_lock(&mdev_list_lock);
>  	list_for_each_entry(tmp, &mdev_list, next) {
> -		if (tmp == mdev) {
> -			found = true;
> +		if (tmp == mdev)
>  			break;
> -		}
>  	}
>  
> -	if (found)
> -		list_del(&mdev->next);
> +	if (tmp != mdev) {
> +		mutex_unlock(&mdev_list_lock);
> +		return -ENODEV;
> +	}
>  
> -	mutex_unlock(&mdev_list_lock);
> +	if (!mdev->active) {
> +		mutex_unlock(&mdev_list_lock);
> +		return -EAGAIN;
> +	}

I'm not sure whether this is 100% watertight. Consider:

- device gets registered, we have added it to the list, made it visible
  in sysfs and have added the remove attribute, but not yet the symlinks
- userspace can access the remove attribute and trigger removal
- we do an early exit here because not yet active
- ???

(If there's any problem, it's a very pathological case, and I don't
think anything really bad can happen. I just want to make sure we don't
miss anything.)

>  
> -	if (!found)
> -		return -ENODEV;
> +	mdev->active = false;
> +	mutex_unlock(&mdev_list_lock);
>  
>  	type = to_mdev_type(mdev->type_kobj);
>  	parent = mdev->parent;
> -	mutex_lock(&parent->lock);
>  
>  	ret = mdev_device_remove_ops(mdev, force_remove);
>  	if (ret) {
> -		mutex_unlock(&parent->lock);
> -
> -		mutex_lock(&mdev_list_lock);
> -		list_add(&mdev->next, &mdev_list);
> -		mutex_unlock(&mdev_list_lock);
> -
> +		mdev->active = true;
>  		return ret;
>  	}
>  
> +	mutex_lock(&mdev_list_lock);
> +	list_del(&mdev->next);
> +	mutex_unlock(&mdev_list_lock);
> +
>  	mdev_remove_sysfs_files(dev, type);
>  	device_unregister(dev);
> -	mutex_unlock(&parent->lock);
>  	mdev_put_parent(parent);
>  
>  	return 0;
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> index a9cefd70a705..b5819b7d7ef7 100644
> --- a/drivers/vfio/mdev/mdev_private.h
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -20,7 +20,6 @@ struct mdev_parent {
>  	struct device *dev;
>  	const struct mdev_parent_ops *ops;
>  	struct kref ref;
> -	struct mutex lock;
>  	struct list_head next;
>  	struct kset *mdev_types_kset;
>  	struct list_head type_list;
> @@ -34,6 +33,7 @@ struct mdev_device {
>  	struct kref ref;
>  	struct list_head next;
>  	struct kobject *type_kobj;
> +	bool active;
>  };
>  
>  #define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)
>
Kirti Wankhede May 17, 2018, 3:55 p.m. UTC | #2
On 5/17/2018 1:39 PM, Cornelia Huck wrote:
> On Wed, 16 May 2018 21:30:19 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
>> When we create an mdev device, we check for duplicates against the
>> parent device and return -EEXIST if found, but the mdev device
>> namespace is global since we'll link all devices from the bus.  We do
>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
>> with it comes a kernel warning and stack trace for trying to create
>> duplicate sysfs links, which makes it an undesirable response.
>>
>> Therefore we should really be looking for duplicates across all mdev
>> parent devices, or as implemented here, against our mdev device list.
>> Using mdev_list to prevent duplicates means that we can remove
>> mdev_parent.lock, but in order not to serialize mdev device creation
>> and removal globally, we add mdev_device.active which allows UUIDs to
>> be reserved such that we can drop the mdev_list_lock before the mdev
>> device is fully in place.
>>
>> NB. there was never intended to be any serialization guarantee
>> provided by the mdev core with respect to creation and removal of mdev
>> devices, mdev_parent.lock provided this only as a side-effect of the
>> implementation for locking the namespace per parent.  That
>> serialization is now removed.
> 

mdev_parent.lock is to serialize create and remove of that mdev device,
that handles race condition that Cornelia mentioned below.

> This is probably fine; but I noted that documentation on the locking
> conventions and serialization guarantees for mdev is a bit sparse, and
> this topic also came up during the vfio-ap review.
> 
> We probably want to add some more concrete documentation; would the
> kernel doc for the _ops or vfio-mediated-device.txt be a better place
> for that?
> 
> [Dong Jia, Halil: Can you please take a look whether vfio-ccw is really
> ok? I don't think we open up any new races, but I'd appreciate a second
> or third opinion.]
> 
>>
>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>> ---
>>
>> v3: Rework locking and add a field to mdev_device so we can track
>>     completed instances vs those added to reserve the namespace.
>>
>>  drivers/vfio/mdev/mdev_core.c    |   94 +++++++++++++-------------------------
>>  drivers/vfio/mdev/mdev_private.h |    2 -
>>  2 files changed, 34 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
>> index 126991046eb7..55ea9d34ec69 100644
>> --- a/drivers/vfio/mdev/mdev_core.c
>> +++ b/drivers/vfio/mdev/mdev_core.c
>> @@ -66,34 +66,6 @@ uuid_le mdev_uuid(struct mdev_device *mdev)
>>  }
>>  EXPORT_SYMBOL(mdev_uuid);
>>  
>> -static int _find_mdev_device(struct device *dev, void *data)
>> -{
>> -	struct mdev_device *mdev;
>> -
>> -	if (!dev_is_mdev(dev))
>> -		return 0;
>> -
>> -	mdev = to_mdev_device(dev);
>> -
>> -	if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
>> -		return 1;
>> -
>> -	return 0;
>> -}
>> -
>> -static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid)
>> -{
>> -	struct device *dev;
>> -
>> -	dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
>> -	if (dev) {
>> -		put_device(dev);
>> -		return true;
>> -	}
>> -
>> -	return false;
>> -}
>> -
>>  /* Should be called holding parent_list_lock */
>>  static struct mdev_parent *__find_parent_device(struct device *dev)
>>  {
>> @@ -221,7 +193,6 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
>>  	}
>>  
>>  	kref_init(&parent->ref);
>> -	mutex_init(&parent->lock);
>>  
>>  	parent->dev = dev;
>>  	parent->ops = ops;
>> @@ -304,7 +275,7 @@ static void mdev_device_release(struct device *dev)
>>  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>  {
>>  	int ret;
>> -	struct mdev_device *mdev;
>> +	struct mdev_device *mdev, *tmp;
>>  	struct mdev_parent *parent;
>>  	struct mdev_type *type = to_mdev_type(kobj);
>>  
>> @@ -312,21 +283,26 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>  	if (!parent)
>>  		return -EINVAL;
>>  
>> -	mutex_lock(&parent->lock);
>> +	mutex_lock(&mdev_list_lock);
>>  
>>  	/* Check for duplicate */
>> -	if (mdev_device_exist(parent, uuid)) {
>> -		ret = -EEXIST;
>> -		goto create_err;
>> +	list_for_each_entry(tmp, &mdev_list, next) {
>> +		if (!uuid_le_cmp(tmp->uuid, uuid)) {
>> +			mutex_unlock(&mdev_list_lock);
>> +			return -EEXIST;
>> +		}
>>  	}
>>  

mdev_put_parent(parent) missing before return.


>>  	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
>>  	if (!mdev) {
>> -		ret = -ENOMEM;
>> -		goto create_err;
>> +		mutex_unlock(&mdev_list_lock);
>> +		return -ENOMEM;
>>  	}
>>

mdev_put_parent(parent) missing here again.

Thanks,
Kirti

>>  	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
>> +	list_add(&mdev->next, &mdev_list);
>> +	mutex_unlock(&mdev_list_lock);
>> +
>>  	mdev->parent = parent;
>>  	kref_init(&mdev->ref);
>>  
>> @@ -352,21 +328,18 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>  	}
>>  
>>  	mdev->type_kobj = kobj;
>> +	mdev->active = true;
>>  	dev_dbg(&mdev->dev, "MDEV: created\n");
>>  
>> -	mutex_unlock(&parent->lock);
>> -
>> -	mutex_lock(&mdev_list_lock);
>> -	list_add(&mdev->next, &mdev_list);
>> -	mutex_unlock(&mdev_list_lock);
>> -
>> -	return ret;
>> +	return 0;
>>  
>>  create_failed:
>>  	device_unregister(&mdev->dev);
>>  
>>  create_err:
>> -	mutex_unlock(&parent->lock);
>> +	mutex_lock(&mdev_list_lock);
>> +	list_del(&mdev->next);
>> +	mutex_unlock(&mdev_list_lock);
>>  	mdev_put_parent(parent);
>>  	return ret;
>>  }
>> @@ -377,44 +350,43 @@ int mdev_device_remove(struct device *dev, bool force_remove)
>>  	struct mdev_parent *parent;
>>  	struct mdev_type *type;
>>  	int ret;
>> -	bool found = false;
>>  
>>  	mdev = to_mdev_device(dev);
>>  
>>  	mutex_lock(&mdev_list_lock);
>>  	list_for_each_entry(tmp, &mdev_list, next) {
>> -		if (tmp == mdev) {
>> -			found = true;
>> +		if (tmp == mdev)
>>  			break;
>> -		}
>>  	}
>>  
>> -	if (found)
>> -		list_del(&mdev->next);
>> +	if (tmp != mdev) {
>> +		mutex_unlock(&mdev_list_lock);
>> +		return -ENODEV;
>> +	}
>>  
>> -	mutex_unlock(&mdev_list_lock);
>> +	if (!mdev->active) {
>> +		mutex_unlock(&mdev_list_lock);
>> +		return -EAGAIN;
>> +	}
> 
> I'm not sure whether this is 100% watertight. Consider:
> 
> - device gets registered, we have added it to the list, made it visible
>   in sysfs and have added the remove attribute, but not yet the symlinks
> - userspace can access the remove attribute and trigger removal
> - we do an early exit here because not yet active
> - ???
> 
> (If there's any problem, it's a very pathological case, and I don't
> think anything really bad can happen. I just want to make sure we don't
> miss anything.)
> 
>>  
>> -	if (!found)
>> -		return -ENODEV;
>> +	mdev->active = false;
>> +	mutex_unlock(&mdev_list_lock);
>>  
>>  	type = to_mdev_type(mdev->type_kobj);
>>  	parent = mdev->parent;
>> -	mutex_lock(&parent->lock);
>>  
>>  	ret = mdev_device_remove_ops(mdev, force_remove);
>>  	if (ret) {
>> -		mutex_unlock(&parent->lock);
>> -
>> -		mutex_lock(&mdev_list_lock);
>> -		list_add(&mdev->next, &mdev_list);
>> -		mutex_unlock(&mdev_list_lock);
>> -
>> +		mdev->active = true;
>>  		return ret;
>>  	}
>>  
>> +	mutex_lock(&mdev_list_lock);
>> +	list_del(&mdev->next);
>> +	mutex_unlock(&mdev_list_lock);
>> +
>>  	mdev_remove_sysfs_files(dev, type);
>>  	device_unregister(dev);
>> -	mutex_unlock(&parent->lock);
>>  	mdev_put_parent(parent);
>>  
>>  	return 0;
>> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
>> index a9cefd70a705..b5819b7d7ef7 100644
>> --- a/drivers/vfio/mdev/mdev_private.h
>> +++ b/drivers/vfio/mdev/mdev_private.h
>> @@ -20,7 +20,6 @@ struct mdev_parent {
>>  	struct device *dev;
>>  	const struct mdev_parent_ops *ops;
>>  	struct kref ref;
>> -	struct mutex lock;
>>  	struct list_head next;
>>  	struct kset *mdev_types_kset;
>>  	struct list_head type_list;
>> @@ -34,6 +33,7 @@ struct mdev_device {
>>  	struct kref ref;
>>  	struct list_head next;
>>  	struct kobject *type_kobj;
>> +	bool active;
>>  };
>>  
>>  #define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)
>>
>
Alex Williamson May 17, 2018, 4:20 p.m. UTC | #3
On Thu, 17 May 2018 21:25:22 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/17/2018 1:39 PM, Cornelia Huck wrote:
> > On Wed, 16 May 2018 21:30:19 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> >   
> >> When we create an mdev device, we check for duplicates against the
> >> parent device and return -EEXIST if found, but the mdev device
> >> namespace is global since we'll link all devices from the bus.  We do
> >> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> >> with it comes a kernel warning and stack trace for trying to create
> >> duplicate sysfs links, which makes it an undesirable response.
> >>
> >> Therefore we should really be looking for duplicates across all mdev
> >> parent devices, or as implemented here, against our mdev device list.
> >> Using mdev_list to prevent duplicates means that we can remove
> >> mdev_parent.lock, but in order not to serialize mdev device creation
> >> and removal globally, we add mdev_device.active which allows UUIDs to
> >> be reserved such that we can drop the mdev_list_lock before the mdev
> >> device is fully in place.
> >>
> >> NB. there was never intended to be any serialization guarantee
> >> provided by the mdev core with respect to creation and removal of mdev
> >> devices, mdev_parent.lock provided this only as a side-effect of the
> >> implementation for locking the namespace per parent.  That
> >> serialization is now removed.  
> >   
> 
> mdev_parent.lock is to serialize create and remove of that mdev device,
> that handles race condition that Cornelia mentioned below.

Previously it was stated:

On Thu, 17 May 2018 01:01:40 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:
> Here lock is not for create/remove routines of vendor driver, its about
> mdev device creation and device registration, which is a common code
> path, and so is part of mdev core module.

So the race condition was handled previously, but as a side-effect of
protecting the namespace, aiui.  I'm trying to state above that the
serialization of create/remove was never intended as a guarantee
provided to mdev vendor drivers.  I don't see that there's a need to
protect "mdev device creation and device registration" beyond conflicts
in the UUID namespace, which is done here.  Are there others?

> > This is probably fine; but I noted that documentation on the locking
> > conventions and serialization guarantees for mdev is a bit sparse, and
> > this topic also came up during the vfio-ap review.
> > 
> > We probably want to add some more concrete documentation; would the
> > kernel doc for the _ops or vfio-mediated-device.txt be a better place
> > for that?

I'll look to see where we can add a note withing that file, I suspect
that's the right place to put it.

> > [Dong Jia, Halil: Can you please take a look whether vfio-ccw is really
> > ok? I don't think we open up any new races, but I'd appreciate a second
> > or third opinion.]
> >   
> >>
> >> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> >> ---
> >>
> >> v3: Rework locking and add a field to mdev_device so we can track
> >>     completed instances vs those added to reserve the namespace.
> >>
> >>  drivers/vfio/mdev/mdev_core.c    |   94 +++++++++++++-------------------------
> >>  drivers/vfio/mdev/mdev_private.h |    2 -
> >>  2 files changed, 34 insertions(+), 62 deletions(-)
> >>
> >> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
> >> index 126991046eb7..55ea9d34ec69 100644
> >> --- a/drivers/vfio/mdev/mdev_core.c
> >> +++ b/drivers/vfio/mdev/mdev_core.c
> >> @@ -66,34 +66,6 @@ uuid_le mdev_uuid(struct mdev_device *mdev)
> >>  }
> >>  EXPORT_SYMBOL(mdev_uuid);
> >>  
> >> -static int _find_mdev_device(struct device *dev, void *data)
> >> -{
> >> -	struct mdev_device *mdev;
> >> -
> >> -	if (!dev_is_mdev(dev))
> >> -		return 0;
> >> -
> >> -	mdev = to_mdev_device(dev);
> >> -
> >> -	if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
> >> -		return 1;
> >> -
> >> -	return 0;
> >> -}
> >> -
> >> -static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid)
> >> -{
> >> -	struct device *dev;
> >> -
> >> -	dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
> >> -	if (dev) {
> >> -		put_device(dev);
> >> -		return true;
> >> -	}
> >> -
> >> -	return false;
> >> -}
> >> -
> >>  /* Should be called holding parent_list_lock */
> >>  static struct mdev_parent *__find_parent_device(struct device *dev)
> >>  {
> >> @@ -221,7 +193,6 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
> >>  	}
> >>  
> >>  	kref_init(&parent->ref);
> >> -	mutex_init(&parent->lock);
> >>  
> >>  	parent->dev = dev;
> >>  	parent->ops = ops;
> >> @@ -304,7 +275,7 @@ static void mdev_device_release(struct device *dev)
> >>  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
> >>  {
> >>  	int ret;
> >> -	struct mdev_device *mdev;
> >> +	struct mdev_device *mdev, *tmp;
> >>  	struct mdev_parent *parent;
> >>  	struct mdev_type *type = to_mdev_type(kobj);
> >>  
> >> @@ -312,21 +283,26 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
> >>  	if (!parent)
> >>  		return -EINVAL;
> >>  
> >> -	mutex_lock(&parent->lock);
> >> +	mutex_lock(&mdev_list_lock);
> >>  
> >>  	/* Check for duplicate */
> >> -	if (mdev_device_exist(parent, uuid)) {
> >> -		ret = -EEXIST;
> >> -		goto create_err;
> >> +	list_for_each_entry(tmp, &mdev_list, next) {
> >> +		if (!uuid_le_cmp(tmp->uuid, uuid)) {
> >> +			mutex_unlock(&mdev_list_lock);
> >> +			return -EEXIST;
> >> +		}
> >>  	}
> >>    
> 
> mdev_put_parent(parent) missing before return.
> 
> 
> >>  	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
> >>  	if (!mdev) {
> >> -		ret = -ENOMEM;
> >> -		goto create_err;
> >> +		mutex_unlock(&mdev_list_lock);
> >> +		return -ENOMEM;
> >>  	}
> >>  
> 
> mdev_put_parent(parent) missing here again.


Oops, will fix both.


> >>  	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
> >> +	list_add(&mdev->next, &mdev_list);
> >> +	mutex_unlock(&mdev_list_lock);
> >> +
> >>  	mdev->parent = parent;
> >>  	kref_init(&mdev->ref);
> >>  
> >> @@ -352,21 +328,18 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
> >>  	}
> >>  
> >>  	mdev->type_kobj = kobj;
> >> +	mdev->active = true;
> >>  	dev_dbg(&mdev->dev, "MDEV: created\n");
> >>  
> >> -	mutex_unlock(&parent->lock);
> >> -
> >> -	mutex_lock(&mdev_list_lock);
> >> -	list_add(&mdev->next, &mdev_list);
> >> -	mutex_unlock(&mdev_list_lock);
> >> -
> >> -	return ret;
> >> +	return 0;
> >>  
> >>  create_failed:
> >>  	device_unregister(&mdev->dev);
> >>  
> >>  create_err:
> >> -	mutex_unlock(&parent->lock);
> >> +	mutex_lock(&mdev_list_lock);
> >> +	list_del(&mdev->next);
> >> +	mutex_unlock(&mdev_list_lock);
> >>  	mdev_put_parent(parent);
> >>  	return ret;
> >>  }
> >> @@ -377,44 +350,43 @@ int mdev_device_remove(struct device *dev, bool force_remove)
> >>  	struct mdev_parent *parent;
> >>  	struct mdev_type *type;
> >>  	int ret;
> >> -	bool found = false;
> >>  
> >>  	mdev = to_mdev_device(dev);
> >>  
> >>  	mutex_lock(&mdev_list_lock);
> >>  	list_for_each_entry(tmp, &mdev_list, next) {
> >> -		if (tmp == mdev) {
> >> -			found = true;
> >> +		if (tmp == mdev)
> >>  			break;
> >> -		}
> >>  	}
> >>  
> >> -	if (found)
> >> -		list_del(&mdev->next);
> >> +	if (tmp != mdev) {
> >> +		mutex_unlock(&mdev_list_lock);
> >> +		return -ENODEV;
> >> +	}
> >>  
> >> -	mutex_unlock(&mdev_list_lock);
> >> +	if (!mdev->active) {
> >> +		mutex_unlock(&mdev_list_lock);
> >> +		return -EAGAIN;
> >> +	}  
> > 
> > I'm not sure whether this is 100% watertight. Consider:
> > 
> > - device gets registered, we have added it to the list, made it visible
> >   in sysfs and have added the remove attribute, but not yet the symlinks
> > - userspace can access the remove attribute and trigger removal
> > - we do an early exit here because not yet active
> > - ???
> > 
> > (If there's any problem, it's a very pathological case, and I don't
> > think anything really bad can happen. I just want to make sure we don't
> > miss anything.)

The presented scenario is exactly the use case that the -EAGAIN return
is intended to handle.  I can't put a mutex on the mdev_device to block
this path as the mdev creation might ultimately fail and the device
released (perhaps not possible in our code flow, but that would be a
very subtle detail to rely on).  So any sort of blocking approach would
require releasing mdev_list_lock and re-scanning in a busy loop.  Why
bother to do that when we can indicate the same to the user through
-EAGAIN.  AIUI, this is the purpose of -EAGAIN and it's up to userspace
to decide how they'd like to handle it, try again or abort.  Are there
suggestions for alternatives?  Thanks,

Alex
Kirti Wankhede May 17, 2018, 8:26 p.m. UTC | #4
On 5/17/2018 9:50 PM, Alex Williamson wrote:
> On Thu, 17 May 2018 21:25:22 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 5/17/2018 1:39 PM, Cornelia Huck wrote:
>>> On Wed, 16 May 2018 21:30:19 -0600
>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>   
>>>> When we create an mdev device, we check for duplicates against the
>>>> parent device and return -EEXIST if found, but the mdev device
>>>> namespace is global since we'll link all devices from the bus.  We do
>>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
>>>> with it comes a kernel warning and stack trace for trying to create
>>>> duplicate sysfs links, which makes it an undesirable response.
>>>>
>>>> Therefore we should really be looking for duplicates across all mdev
>>>> parent devices, or as implemented here, against our mdev device list.
>>>> Using mdev_list to prevent duplicates means that we can remove
>>>> mdev_parent.lock, but in order not to serialize mdev device creation
>>>> and removal globally, we add mdev_device.active which allows UUIDs to
>>>> be reserved such that we can drop the mdev_list_lock before the mdev
>>>> device is fully in place.
>>>>
>>>> NB. there was never intended to be any serialization guarantee
>>>> provided by the mdev core with respect to creation and removal of mdev
>>>> devices, mdev_parent.lock provided this only as a side-effect of the
>>>> implementation for locking the namespace per parent.  That
>>>> serialization is now removed.  
>>>   
>>
>> mdev_parent.lock is to serialize create and remove of that mdev device,
>> that handles race condition that Cornelia mentioned below.
> 
> Previously it was stated:
> 
> On Thu, 17 May 2018 01:01:40 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>> Here lock is not for create/remove routines of vendor driver, its about
>> mdev device creation and device registration, which is a common code
>> path, and so is part of mdev core module.
> 
> So the race condition was handled previously, but as a side-effect of
> protecting the namespace, aiui.  I'm trying to state above that the
> serialization of create/remove was never intended as a guarantee
> provided to mdev vendor drivers.  I don't see that there's a need to
> protect "mdev device creation and device registration" beyond conflicts
> in the UUID namespace, which is done here.  Are there others?
> 

Sorry not being elaborative in my earlier response to

> If we can
> show that vendor drivers handle the create/remove paths themselves,
> perhaps we can refine the locking granularity.

mdev_device_create() function does :
- create mdev device
- register device
- call vendor driver->create
- create sysfs files.

mdev_device_remove() removes sysfs files, unregister device and delete
device.

There is common code in mdev_device_create() and mdev_device_remove()
independent of what vendor driver does in its create()/remove()
callback. Moving this code to each vendor driver to handle create/remove
themselves doesn't make sense to me.

mdev_parent.lock here does take care of race conditions that could occur
during mdev device creation and deletion in this common code path.

What is the urge to remove mdev_parent.lock if that handles all race
conditions without bothering user to handle -EAGAIN?

Thanks,
Kirti


>>> This is probably fine; but I noted that documentation on the locking
>>> conventions and serialization guarantees for mdev is a bit sparse, and
>>> this topic also came up during the vfio-ap review.
>>>
>>> We probably want to add some more concrete documentation; would the
>>> kernel doc for the _ops or vfio-mediated-device.txt be a better place
>>> for that?
> 
> I'll look to see where we can add a note withing that file, I suspect
> that's the right place to put it.
> 
>>> [Dong Jia, Halil: Can you please take a look whether vfio-ccw is really
>>> ok? I don't think we open up any new races, but I'd appreciate a second
>>> or third opinion.]
>>>   
>>>>
>>>> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>>>> ---
>>>>
>>>> v3: Rework locking and add a field to mdev_device so we can track
>>>>     completed instances vs those added to reserve the namespace.
>>>>
>>>>  drivers/vfio/mdev/mdev_core.c    |   94 +++++++++++++-------------------------
>>>>  drivers/vfio/mdev/mdev_private.h |    2 -
>>>>  2 files changed, 34 insertions(+), 62 deletions(-)
>>>>
>>>> diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
>>>> index 126991046eb7..55ea9d34ec69 100644
>>>> --- a/drivers/vfio/mdev/mdev_core.c
>>>> +++ b/drivers/vfio/mdev/mdev_core.c
>>>> @@ -66,34 +66,6 @@ uuid_le mdev_uuid(struct mdev_device *mdev)
>>>>  }
>>>>  EXPORT_SYMBOL(mdev_uuid);
>>>>  
>>>> -static int _find_mdev_device(struct device *dev, void *data)
>>>> -{
>>>> -	struct mdev_device *mdev;
>>>> -
>>>> -	if (!dev_is_mdev(dev))
>>>> -		return 0;
>>>> -
>>>> -	mdev = to_mdev_device(dev);
>>>> -
>>>> -	if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
>>>> -		return 1;
>>>> -
>>>> -	return 0;
>>>> -}
>>>> -
>>>> -static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid)
>>>> -{
>>>> -	struct device *dev;
>>>> -
>>>> -	dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
>>>> -	if (dev) {
>>>> -		put_device(dev);
>>>> -		return true;
>>>> -	}
>>>> -
>>>> -	return false;
>>>> -}
>>>> -
>>>>  /* Should be called holding parent_list_lock */
>>>>  static struct mdev_parent *__find_parent_device(struct device *dev)
>>>>  {
>>>> @@ -221,7 +193,6 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
>>>>  	}
>>>>  
>>>>  	kref_init(&parent->ref);
>>>> -	mutex_init(&parent->lock);
>>>>  
>>>>  	parent->dev = dev;
>>>>  	parent->ops = ops;
>>>> @@ -304,7 +275,7 @@ static void mdev_device_release(struct device *dev)
>>>>  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>>>  {
>>>>  	int ret;
>>>> -	struct mdev_device *mdev;
>>>> +	struct mdev_device *mdev, *tmp;
>>>>  	struct mdev_parent *parent;
>>>>  	struct mdev_type *type = to_mdev_type(kobj);
>>>>  
>>>> @@ -312,21 +283,26 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>>>  	if (!parent)
>>>>  		return -EINVAL;
>>>>  
>>>> -	mutex_lock(&parent->lock);
>>>> +	mutex_lock(&mdev_list_lock);
>>>>  
>>>>  	/* Check for duplicate */
>>>> -	if (mdev_device_exist(parent, uuid)) {
>>>> -		ret = -EEXIST;
>>>> -		goto create_err;
>>>> +	list_for_each_entry(tmp, &mdev_list, next) {
>>>> +		if (!uuid_le_cmp(tmp->uuid, uuid)) {
>>>> +			mutex_unlock(&mdev_list_lock);
>>>> +			return -EEXIST;
>>>> +		}
>>>>  	}
>>>>    
>>
>> mdev_put_parent(parent) missing before return.
>>
>>
>>>>  	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
>>>>  	if (!mdev) {
>>>> -		ret = -ENOMEM;
>>>> -		goto create_err;
>>>> +		mutex_unlock(&mdev_list_lock);
>>>> +		return -ENOMEM;
>>>>  	}
>>>>  
>>
>> mdev_put_parent(parent) missing here again.
> 
> 
> Oops, will fix both.
> 
> 
>>>>  	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
>>>> +	list_add(&mdev->next, &mdev_list);
>>>> +	mutex_unlock(&mdev_list_lock);
>>>> +
>>>>  	mdev->parent = parent;
>>>>  	kref_init(&mdev->ref);
>>>>  
>>>> @@ -352,21 +328,18 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
>>>>  	}
>>>>  
>>>>  	mdev->type_kobj = kobj;
>>>> +	mdev->active = true;
>>>>  	dev_dbg(&mdev->dev, "MDEV: created\n");
>>>>  
>>>> -	mutex_unlock(&parent->lock);
>>>> -
>>>> -	mutex_lock(&mdev_list_lock);
>>>> -	list_add(&mdev->next, &mdev_list);
>>>> -	mutex_unlock(&mdev_list_lock);
>>>> -
>>>> -	return ret;
>>>> +	return 0;
>>>>  
>>>>  create_failed:
>>>>  	device_unregister(&mdev->dev);
>>>>  
>>>>  create_err:
>>>> -	mutex_unlock(&parent->lock);
>>>> +	mutex_lock(&mdev_list_lock);
>>>> +	list_del(&mdev->next);
>>>> +	mutex_unlock(&mdev_list_lock);
>>>>  	mdev_put_parent(parent);
>>>>  	return ret;
>>>>  }
>>>> @@ -377,44 +350,43 @@ int mdev_device_remove(struct device *dev, bool force_remove)
>>>>  	struct mdev_parent *parent;
>>>>  	struct mdev_type *type;
>>>>  	int ret;
>>>> -	bool found = false;
>>>>  
>>>>  	mdev = to_mdev_device(dev);
>>>>  
>>>>  	mutex_lock(&mdev_list_lock);
>>>>  	list_for_each_entry(tmp, &mdev_list, next) {
>>>> -		if (tmp == mdev) {
>>>> -			found = true;
>>>> +		if (tmp == mdev)
>>>>  			break;
>>>> -		}
>>>>  	}
>>>>  
>>>> -	if (found)
>>>> -		list_del(&mdev->next);
>>>> +	if (tmp != mdev) {
>>>> +		mutex_unlock(&mdev_list_lock);
>>>> +		return -ENODEV;
>>>> +	}
>>>>  
>>>> -	mutex_unlock(&mdev_list_lock);
>>>> +	if (!mdev->active) {
>>>> +		mutex_unlock(&mdev_list_lock);
>>>> +		return -EAGAIN;
>>>> +	}  
>>>
>>> I'm not sure whether this is 100% watertight. Consider:
>>>
>>> - device gets registered, we have added it to the list, made it visible
>>>   in sysfs and have added the remove attribute, but not yet the symlinks
>>> - userspace can access the remove attribute and trigger removal
>>> - we do an early exit here because not yet active
>>> - ???
>>>
>>> (If there's any problem, it's a very pathological case, and I don't
>>> think anything really bad can happen. I just want to make sure we don't
>>> miss anything.)
> 
> The presented scenario is exactly the use case that the -EAGAIN return
> is intended to handle.  I can't put a mutex on the mdev_device to block
> this path as the mdev creation might ultimately fail and the device
> released (perhaps not possible in our code flow, but that would be a
> very subtle detail to rely on).  So any sort of blocking approach would
> require releasing mdev_list_lock and re-scanning in a busy loop.  Why
> bother to do that when we can indicate the same to the user through
> -EAGAIN.  AIUI, this is the purpose of -EAGAIN and it's up to userspace
> to decide how they'd like to handle it, try again or abort.  Are there
> suggestions for alternatives?  Thanks,
>
Alex Williamson May 17, 2018, 9:37 p.m. UTC | #5
On Fri, 18 May 2018 01:56:50 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/17/2018 9:50 PM, Alex Williamson wrote:
> > On Thu, 17 May 2018 21:25:22 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 5/17/2018 1:39 PM, Cornelia Huck wrote:  
> >>> On Wed, 16 May 2018 21:30:19 -0600
> >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>     
> >>>> When we create an mdev device, we check for duplicates against the
> >>>> parent device and return -EEXIST if found, but the mdev device
> >>>> namespace is global since we'll link all devices from the bus.  We do
> >>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> >>>> with it comes a kernel warning and stack trace for trying to create
> >>>> duplicate sysfs links, which makes it an undesirable response.
> >>>>
> >>>> Therefore we should really be looking for duplicates across all mdev
> >>>> parent devices, or as implemented here, against our mdev device list.
> >>>> Using mdev_list to prevent duplicates means that we can remove
> >>>> mdev_parent.lock, but in order not to serialize mdev device creation
> >>>> and removal globally, we add mdev_device.active which allows UUIDs to
> >>>> be reserved such that we can drop the mdev_list_lock before the mdev
> >>>> device is fully in place.
> >>>>
> >>>> NB. there was never intended to be any serialization guarantee
> >>>> provided by the mdev core with respect to creation and removal of mdev
> >>>> devices, mdev_parent.lock provided this only as a side-effect of the
> >>>> implementation for locking the namespace per parent.  That
> >>>> serialization is now removed.    
> >>>     
> >>
> >> mdev_parent.lock is to serialize create and remove of that mdev device,
> >> that handles race condition that Cornelia mentioned below.  
> > 
> > Previously it was stated:
> > 
> > On Thu, 17 May 2018 01:01:40 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:  
> >> Here lock is not for create/remove routines of vendor driver, its about
> >> mdev device creation and device registration, which is a common code
> >> path, and so is part of mdev core module.  
> > 
> > So the race condition was handled previously, but as a side-effect of
> > protecting the namespace, aiui.  I'm trying to state above that the
> > serialization of create/remove was never intended as a guarantee
> > provided to mdev vendor drivers.  I don't see that there's a need to
> > protect "mdev device creation and device registration" beyond conflicts
> > in the UUID namespace, which is done here.  Are there others?
> >   
> 
> Sorry not being elaborative in my earlier response to
> 
> > If we can
> > show that vendor drivers handle the create/remove paths themselves,
> > perhaps we can refine the locking granularity.  
> 
> mdev_device_create() function does :
> - create mdev device
> - register device
> - call vendor driver->create
> - create sysfs files.
> 
> mdev_device_remove() removes sysfs files, unregister device and delete
> device.
> 
> There is common code in mdev_device_create() and mdev_device_remove()
> independent of what vendor driver does in its create()/remove()
> callback. Moving this code to each vendor driver to handle create/remove
> themselves doesn't make sense to me.

I don't see where anyone is suggesting that, I'm not.
 
> mdev_parent.lock here does take care of race conditions that could occur
> during mdev device creation and deletion in this common code path.

Exactly what races in the common code path is mdev_parent.lock
preventing?  mdev_device_create() calls:

device_register()
mdev_device_create_ops()
  parent->ops->create()
  sysfs_create_groups()
mdev_create_sysfs_files()
  sysfs_create_files()
  sysfs_create_link()
  sysfs_create_link()

mdev_parent.lock is certainly not serializing all calls across the
entire kernel to device_register and sysfs_create_{groups,files,link}
so what is it protecting other than serializing parent->ops->create()?
Locks protect data, not code.  The data we're protecting is the shared
mdev_list, there is no shared data once mdev_device_create() has its
mdev_device with uuid reservation placed into that list.

> What is the urge to remove mdev_parent.lock if that handles all race
> conditions without bothering user to handle -EAGAIN?

Can you say why -EAGAIN is undesirable?  Note that the user is only
going to see this error if they attempt to remove the device in the
minuscule gap between the sysfs remove file being created and the
completion of the write to the create sysfs file.  It seems like you're
asking that I decrease the locking granularity, but not too much
because mdev_parent.lock protects "things".  If the -EAGAIN is really
so terrible, we can avoid it by spinning until the mdev_device is
either not found in the list or becomes active, we don't need
mdev_parent.lock to solve that, but I don't think that's the best
solution and there's no concrete statement to back -EAGAIN being a
problem.  Thanks,

Alex
Alex Williamson May 17, 2018, 10:17 p.m. UTC | #6
On Thu, 17 May 2018 15:37:37 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Fri, 18 May 2018 01:56:50 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 5/17/2018 9:50 PM, Alex Williamson wrote:  
> > > On Thu, 17 May 2018 21:25:22 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >     
> > >> On 5/17/2018 1:39 PM, Cornelia Huck wrote:    
> > >>> On Wed, 16 May 2018 21:30:19 -0600
> > >>> Alex Williamson <alex.williamson@redhat.com> wrote:
> > >>>       
> > >>>> When we create an mdev device, we check for duplicates against the
> > >>>> parent device and return -EEXIST if found, but the mdev device
> > >>>> namespace is global since we'll link all devices from the bus.  We do
> > >>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> > >>>> with it comes a kernel warning and stack trace for trying to create
> > >>>> duplicate sysfs links, which makes it an undesirable response.
> > >>>>
> > >>>> Therefore we should really be looking for duplicates across all mdev
> > >>>> parent devices, or as implemented here, against our mdev device list.
> > >>>> Using mdev_list to prevent duplicates means that we can remove
> > >>>> mdev_parent.lock, but in order not to serialize mdev device creation
> > >>>> and removal globally, we add mdev_device.active which allows UUIDs to
> > >>>> be reserved such that we can drop the mdev_list_lock before the mdev
> > >>>> device is fully in place.
> > >>>>
> > >>>> NB. there was never intended to be any serialization guarantee
> > >>>> provided by the mdev core with respect to creation and removal of mdev
> > >>>> devices, mdev_parent.lock provided this only as a side-effect of the
> > >>>> implementation for locking the namespace per parent.  That
> > >>>> serialization is now removed.      
> > >>>       
> > >>
> > >> mdev_parent.lock is to serialize create and remove of that mdev device,
> > >> that handles race condition that Cornelia mentioned below.    
> > > 
> > > Previously it was stated:
> > > 
> > > On Thu, 17 May 2018 01:01:40 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:    
> > >> Here lock is not for create/remove routines of vendor driver, its about
> > >> mdev device creation and device registration, which is a common code
> > >> path, and so is part of mdev core module.    
> > > 
> > > So the race condition was handled previously, but as a side-effect of
> > > protecting the namespace, aiui.  I'm trying to state above that the
> > > serialization of create/remove was never intended as a guarantee
> > > provided to mdev vendor drivers.  I don't see that there's a need to
> > > protect "mdev device creation and device registration" beyond conflicts
> > > in the UUID namespace, which is done here.  Are there others?
> > >     
> > 
> > Sorry not being elaborative in my earlier response to
> >   
> > > If we can
> > > show that vendor drivers handle the create/remove paths themselves,
> > > perhaps we can refine the locking granularity.    
> > 
> > mdev_device_create() function does :
> > - create mdev device
> > - register device
> > - call vendor driver->create
> > - create sysfs files.
> > 
> > mdev_device_remove() removes sysfs files, unregister device and delete
> > device.
> > 
> > There is common code in mdev_device_create() and mdev_device_remove()
> > independent of what vendor driver does in its create()/remove()
> > callback. Moving this code to each vendor driver to handle create/remove
> > themselves doesn't make sense to me.  
> 
> I don't see where anyone is suggesting that, I'm not.
>  
> > mdev_parent.lock here does take care of race conditions that could occur
> > during mdev device creation and deletion in this common code path.  
> 
> Exactly what races in the common code path is mdev_parent.lock
> preventing?  mdev_device_create() calls:
> 
> device_register()
> mdev_device_create_ops()
>   parent->ops->create()
>   sysfs_create_groups()
> mdev_create_sysfs_files()
>   sysfs_create_files()
>   sysfs_create_link()
>   sysfs_create_link()
> 
> mdev_parent.lock is certainly not serializing all calls across the
> entire kernel to device_register and sysfs_create_{groups,files,link}
> so what is it protecting other than serializing parent->ops->create()?
> Locks protect data, not code.  The data we're protecting is the shared
> mdev_list, there is no shared data once mdev_device_create() has its
> mdev_device with uuid reservation placed into that list.
> 
> > What is the urge to remove mdev_parent.lock if that handles all race
> > conditions without bothering user to handle -EAGAIN?  
> 
> Can you say why -EAGAIN is undesirable?  Note that the user is only
> going to see this error if they attempt to remove the device in the
> minuscule gap between the sysfs remove file being created and the
> completion of the write to the create sysfs file.  It seems like you're
> asking that I decrease the locking granularity, but not too much
> because mdev_parent.lock protects "things".  If the -EAGAIN is really
> so terrible, we can avoid it by spinning until the mdev_device is
> either not found in the list or becomes active, we don't need
> mdev_parent.lock to solve that, but I don't think that's the best
> solution and there's no concrete statement to back -EAGAIN being a
> problem.  Thanks,

Testing whether -EAGAIN is bad, I race the following scripts:

# cat remove-it.sh 
#!/bin/sh

UUID=802d71a3-560d-4316-999a-4c2033b7fbb2
DEV=/sys/devices/virtual/mtty/mtty/$UUID
REMOVE=$DEV/remove
CNT=0
ERR=0

while true; do
	if [ -e $REMOVE ]; then
		echo 1 > $REMOVE
		RET=$?
		if [ $RET -ne 0 ]; then
			echo "Remove returned $RET"
			ERR=$(( $ERR + 1 ))
		fi
		CNT=$(( $CNT + 1 ))
		if [ $(( $CNT % 10000 )) -eq 0 ]; then
			echo "$CNT/$ERR"
		fi
	fi
done

# cat create-it.sh 
#!/bin/sh

UUID=802d71a3-560d-4316-999a-4c2033b7fbb2
DEV=/sys/devices/virtual/mtty/mtty/$UUID
CREATE=/sys/class/mdev_bus/mtty/mdev_supported_types/mtty-1/create
CNT=0

while true; do	
	if [ ! -d $DEV ]; then
		echo $UUID > $CREATE
		RET=$?
		if [ $RET -ne 0 ]; then
			echo "Create returned $RET"
		fi
		CNT=$(( $CNT + 1 ))
		if [ $(( $CNT % 10000 )) -eq 0 ]; then
			echo $CNT
		fi		
	fi
done

After 3 million iterations, this targeted testing is only able to hit
the gap 0.0032% of the time and the sum badness is:

./remove-it.sh: line 11: echo: write error: Resource temporarily unavailable

Of course if I have one thread of execution that does create then
remove, it's impossible to hit this case, so the only code that can hit
this is unsynchronized user code stepping on each other and -EAGAIN
seems like a completely reasonable and appropriate error, imo.  Thanks,

Alex
Kirti Wankhede May 18, 2018, 7:04 a.m. UTC | #7
On 5/18/2018 3:07 AM, Alex Williamson wrote:
> On Fri, 18 May 2018 01:56:50 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 5/17/2018 9:50 PM, Alex Williamson wrote:
>>> On Thu, 17 May 2018 21:25:22 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 5/17/2018 1:39 PM, Cornelia Huck wrote:  
>>>>> On Wed, 16 May 2018 21:30:19 -0600
>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>     
>>>>>> When we create an mdev device, we check for duplicates against the
>>>>>> parent device and return -EEXIST if found, but the mdev device
>>>>>> namespace is global since we'll link all devices from the bus.  We do
>>>>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
>>>>>> with it comes a kernel warning and stack trace for trying to create
>>>>>> duplicate sysfs links, which makes it an undesirable response.
>>>>>>
>>>>>> Therefore we should really be looking for duplicates across all mdev
>>>>>> parent devices, or as implemented here, against our mdev device list.
>>>>>> Using mdev_list to prevent duplicates means that we can remove
>>>>>> mdev_parent.lock, but in order not to serialize mdev device creation
>>>>>> and removal globally, we add mdev_device.active which allows UUIDs to
>>>>>> be reserved such that we can drop the mdev_list_lock before the mdev
>>>>>> device is fully in place.
>>>>>>
>>>>>> NB. there was never intended to be any serialization guarantee
>>>>>> provided by the mdev core with respect to creation and removal of mdev
>>>>>> devices, mdev_parent.lock provided this only as a side-effect of the
>>>>>> implementation for locking the namespace per parent.  That
>>>>>> serialization is now removed.    
>>>>>     
>>>>
>>>> mdev_parent.lock is to serialize create and remove of that mdev device,
>>>> that handles race condition that Cornelia mentioned below.  
>>>
>>> Previously it was stated:
>>>
>>> On Thu, 17 May 2018 01:01:40 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:  
>>>> Here lock is not for create/remove routines of vendor driver, its about
>>>> mdev device creation and device registration, which is a common code
>>>> path, and so is part of mdev core module.  
>>>
>>> So the race condition was handled previously, but as a side-effect of
>>> protecting the namespace, aiui.  I'm trying to state above that the
>>> serialization of create/remove was never intended as a guarantee
>>> provided to mdev vendor drivers.  I don't see that there's a need to
>>> protect "mdev device creation and device registration" beyond conflicts
>>> in the UUID namespace, which is done here.  Are there others?
>>>   
>>
>> Sorry not being elaborative in my earlier response to
>>
>>> If we can
>>> show that vendor drivers handle the create/remove paths themselves,
>>> perhaps we can refine the locking granularity.  
>>
>> mdev_device_create() function does :
>> - create mdev device
>> - register device
>> - call vendor driver->create
>> - create sysfs files.
>>
>> mdev_device_remove() removes sysfs files, unregister device and delete
>> device.
>>
>> There is common code in mdev_device_create() and mdev_device_remove()
>> independent of what vendor driver does in its create()/remove()
>> callback. Moving this code to each vendor driver to handle create/remove
>> themselves doesn't make sense to me.
> 
> I don't see where anyone is suggesting that, I'm not.
>  
>> mdev_parent.lock here does take care of race conditions that could occur
>> during mdev device creation and deletion in this common code path.
> 
> Exactly what races in the common code path is mdev_parent.lock
> preventing?  mdev_device_create() calls:
> 
> device_register()
> mdev_device_create_ops()
>   parent->ops->create()
>   sysfs_create_groups()
> mdev_create_sysfs_files()
>   sysfs_create_files()
>   sysfs_create_link()
>   sysfs_create_link()
> 
> mdev_parent.lock is certainly not serializing all calls across the
> entire kernel to device_register and sysfs_create_{groups,files,link}
> so what is it protecting other than serializing parent->ops->create()?
> Locks protect data, not code.  The data we're protecting is the shared
> mdev_list, there is no shared data once mdev_device_create() has its
> mdev_device with uuid reservation placed into that list.
> 

This lock prevents race condition that could occur due to sysfs entries
1. between write on 'create' and 'remove' sysfs file of mdev device
  As per current code without lock, mdev_create_sysfs_files() creates
'remove' sysfs, but before adding this mdev device to mdev_list, if
'remove' is called, that would return -ENODEV even if the device is seen
in sysfs

2. between write on 'remove' and 'create' sysfs file
  If 'remove' of a device is in progress (device is removed from
mdev_list but sysfs entries are not yet removed) and again 'create' of
same device with same parent is called, will hit duplicate entries error
for sysfs.

3. between multiple writes on 'create' with same uuid
 current code doesn't handle the case you are fixing here, if same uuid
is used to create mdev device on different parents.

Your change handles #1 and #3, but still there is a small gap for #2.
Even its a very small gap, but if such conditions are it in production
environment, it becomes difficult to debug.

>> What is the urge to remove mdev_parent.lock if that handles all race
>> conditions without bothering user to handle -EAGAIN?
> 
> Can you say why -EAGAIN is undesirable?  Note that the user is only
> going to see this error if they attempt to remove the device in the
> minuscule gap between the sysfs remove file being created and the
> completion of the write to the create sysfs file.  It seems like you're
> asking that I decrease the locking granularity, but not too much
> because mdev_parent.lock protects "things".  If the -EAGAIN is really
> so terrible, we can avoid it by spinning until the mdev_device is
> either not found in the list or becomes active, we don't need
> mdev_parent.lock to solve that, but I don't think that's the best
> solution and there's no concrete statement to back -EAGAIN being a
> problem. 

Does libvirt handles -EAGAIN error case? I don't know, may be someone
from libvirt can comment.

Thanks,
Kirti
Halil Pasic May 18, 2018, 1:28 p.m. UTC | #8
On 05/17/2018 10:09 AM, Cornelia Huck wrote:
> [Dong Jia, Halil: Can you please take a look whether vfio-ccw is really
> ok? I don't think we open up any new races, but I'd appreciate a second
> or third opinion.]

I will wait for things to settle a bit before I start reviewing the synchronization
for mdev and vfio-ccw. I suspect there will be a v4. I would appreciate a cc,
so I don't miss it.

Halil
Cornelia Huck May 18, 2018, 2:05 p.m. UTC | #9
On Fri, 18 May 2018 12:34:03 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/18/2018 3:07 AM, Alex Williamson wrote:
> > On Fri, 18 May 2018 01:56:50 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 5/17/2018 9:50 PM, Alex Williamson wrote:  
> >>> On Thu, 17 May 2018 21:25:22 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>>> On 5/17/2018 1:39 PM, Cornelia Huck wrote:    
> >>>>> On Wed, 16 May 2018 21:30:19 -0600
> >>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>>       
> >>>>>> When we create an mdev device, we check for duplicates against the
> >>>>>> parent device and return -EEXIST if found, but the mdev device
> >>>>>> namespace is global since we'll link all devices from the bus.  We do
> >>>>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> >>>>>> with it comes a kernel warning and stack trace for trying to create
> >>>>>> duplicate sysfs links, which makes it an undesirable response.
> >>>>>>
> >>>>>> Therefore we should really be looking for duplicates across all mdev
> >>>>>> parent devices, or as implemented here, against our mdev device list.
> >>>>>> Using mdev_list to prevent duplicates means that we can remove
> >>>>>> mdev_parent.lock, but in order not to serialize mdev device creation
> >>>>>> and removal globally, we add mdev_device.active which allows UUIDs to
> >>>>>> be reserved such that we can drop the mdev_list_lock before the mdev
> >>>>>> device is fully in place.
> >>>>>>
> >>>>>> NB. there was never intended to be any serialization guarantee
> >>>>>> provided by the mdev core with respect to creation and removal of mdev
> >>>>>> devices, mdev_parent.lock provided this only as a side-effect of the
> >>>>>> implementation for locking the namespace per parent.  That
> >>>>>> serialization is now removed.      
> >>>>>       
> >>>>
> >>>> mdev_parent.lock is to serialize create and remove of that mdev device,
> >>>> that handles race condition that Cornelia mentioned below.    
> >>>
> >>> Previously it was stated:
> >>>
> >>> On Thu, 17 May 2018 01:01:40 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:    
> >>>> Here lock is not for create/remove routines of vendor driver, its about
> >>>> mdev device creation and device registration, which is a common code
> >>>> path, and so is part of mdev core module.    
> >>>
> >>> So the race condition was handled previously, but as a side-effect of
> >>> protecting the namespace, aiui.  I'm trying to state above that the
> >>> serialization of create/remove was never intended as a guarantee
> >>> provided to mdev vendor drivers.  I don't see that there's a need to
> >>> protect "mdev device creation and device registration" beyond conflicts
> >>> in the UUID namespace, which is done here.  Are there others?
> >>>     
> >>
> >> Sorry not being elaborative in my earlier response to
> >>  
> >>> If we can
> >>> show that vendor drivers handle the create/remove paths themselves,
> >>> perhaps we can refine the locking granularity.    
> >>
> >> mdev_device_create() function does :
> >> - create mdev device
> >> - register device
> >> - call vendor driver->create
> >> - create sysfs files.
> >>
> >> mdev_device_remove() removes sysfs files, unregister device and delete
> >> device.
> >>
> >> There is common code in mdev_device_create() and mdev_device_remove()
> >> independent of what vendor driver does in its create()/remove()
> >> callback. Moving this code to each vendor driver to handle create/remove
> >> themselves doesn't make sense to me.  
> > 
> > I don't see where anyone is suggesting that, I'm not.
> >    
> >> mdev_parent.lock here does take care of race conditions that could occur
> >> during mdev device creation and deletion in this common code path.  
> > 
> > Exactly what races in the common code path is mdev_parent.lock
> > preventing?  mdev_device_create() calls:
> > 
> > device_register()
> > mdev_device_create_ops()
> >   parent->ops->create()
> >   sysfs_create_groups()
> > mdev_create_sysfs_files()
> >   sysfs_create_files()
> >   sysfs_create_link()
> >   sysfs_create_link()

We might consider creating the 'remove' attribute only after the sysfs
links have been created (IOW, when nothing else can fail anymore).
Drawback: We'd need to clean up the sysfs links. Benefit: the -EAGAIN
window should be closed.

> > 
> > mdev_parent.lock is certainly not serializing all calls across the
> > entire kernel to device_register and sysfs_create_{groups,files,link}
> > so what is it protecting other than serializing parent->ops->create()?
> > Locks protect data, not code.  The data we're protecting is the shared
> > mdev_list, there is no shared data once mdev_device_create() has its
> > mdev_device with uuid reservation placed into that list.
> >   
> 
> This lock prevents race condition that could occur due to sysfs entries
> 1. between write on 'create' and 'remove' sysfs file of mdev device
>   As per current code without lock, mdev_create_sysfs_files() creates
> 'remove' sysfs, but before adding this mdev device to mdev_list, if
> 'remove' is called, that would return -ENODEV even if the device is seen
> in sysfs
> 
> 2. between write on 'remove' and 'create' sysfs file
>   If 'remove' of a device is in progress (device is removed from
> mdev_list but sysfs entries are not yet removed) and again 'create' of
> same device with same parent is called, will hit duplicate entries error
> for sysfs.
> 
> 3. between multiple writes on 'create' with same uuid
>  current code doesn't handle the case you are fixing here, if same uuid
> is used to create mdev device on different parents.
> 
> Your change handles #1 and #3, but still there is a small gap for #2.
> Even its a very small gap, but if such conditions are it in production
> environment, it becomes difficult to debug.
> 
> >> What is the urge to remove mdev_parent.lock if that handles all race
> >> conditions without bothering user to handle -EAGAIN?  
> > 
> > Can you say why -EAGAIN is undesirable?  Note that the user is only
> > going to see this error if they attempt to remove the device in the
> > minuscule gap between the sysfs remove file being created and the
> > completion of the write to the create sysfs file.  It seems like you're
> > asking that I decrease the locking granularity, but not too much
> > because mdev_parent.lock protects "things".  If the -EAGAIN is really
> > so terrible, we can avoid it by spinning until the mdev_device is
> > either not found in the list or becomes active, we don't need
> > mdev_parent.lock to solve that, but I don't think that's the best
> > solution and there's no concrete statement to back -EAGAIN being a
> > problem.   

Moving the creation of the sysfs attribute would make -EAGAIN
impossible, AFAICS.

> Does libvirt handles -EAGAIN error case? I don't know, may be someone
> from libvirt can comment.

I would expect callers to be able to deal with failures, but I'm not
sure whether they can be expected to retry on their own.

Another point: I'm still not 100% sure that no vendor driver ever
assumed some kind of serialization that is now gone. The first patch
had the benefit that it does not change these semantics, making it less
likely to introduce subtle regressions (and thus less likely to be
problematic if backported into stable). We could maybe start with the
original patch as a bug fix and then clean up resp. rework the locking
on top of that?
Alex Williamson May 18, 2018, 5:30 p.m. UTC | #10
On Fri, 18 May 2018 12:34:03 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/18/2018 3:07 AM, Alex Williamson wrote:
> > On Fri, 18 May 2018 01:56:50 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 5/17/2018 9:50 PM, Alex Williamson wrote:  
> >>> On Thu, 17 May 2018 21:25:22 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>>> On 5/17/2018 1:39 PM, Cornelia Huck wrote:    
> >>>>> On Wed, 16 May 2018 21:30:19 -0600
> >>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
> >>>>>       
> >>>>>> When we create an mdev device, we check for duplicates against the
> >>>>>> parent device and return -EEXIST if found, but the mdev device
> >>>>>> namespace is global since we'll link all devices from the bus.  We do
> >>>>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
> >>>>>> with it comes a kernel warning and stack trace for trying to create
> >>>>>> duplicate sysfs links, which makes it an undesirable response.
> >>>>>>
> >>>>>> Therefore we should really be looking for duplicates across all mdev
> >>>>>> parent devices, or as implemented here, against our mdev device list.
> >>>>>> Using mdev_list to prevent duplicates means that we can remove
> >>>>>> mdev_parent.lock, but in order not to serialize mdev device creation
> >>>>>> and removal globally, we add mdev_device.active which allows UUIDs to
> >>>>>> be reserved such that we can drop the mdev_list_lock before the mdev
> >>>>>> device is fully in place.
> >>>>>>
> >>>>>> NB. there was never intended to be any serialization guarantee
> >>>>>> provided by the mdev core with respect to creation and removal of mdev
> >>>>>> devices, mdev_parent.lock provided this only as a side-effect of the
> >>>>>> implementation for locking the namespace per parent.  That
> >>>>>> serialization is now removed.      
> >>>>>       
> >>>>
> >>>> mdev_parent.lock is to serialize create and remove of that mdev device,
> >>>> that handles race condition that Cornelia mentioned below.    
> >>>
> >>> Previously it was stated:
> >>>
> >>> On Thu, 17 May 2018 01:01:40 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:    
> >>>> Here lock is not for create/remove routines of vendor driver, its about
> >>>> mdev device creation and device registration, which is a common code
> >>>> path, and so is part of mdev core module.    
> >>>
> >>> So the race condition was handled previously, but as a side-effect of
> >>> protecting the namespace, aiui.  I'm trying to state above that the
> >>> serialization of create/remove was never intended as a guarantee
> >>> provided to mdev vendor drivers.  I don't see that there's a need to
> >>> protect "mdev device creation and device registration" beyond conflicts
> >>> in the UUID namespace, which is done here.  Are there others?
> >>>     
> >>
> >> Sorry not being elaborative in my earlier response to
> >>  
> >>> If we can
> >>> show that vendor drivers handle the create/remove paths themselves,
> >>> perhaps we can refine the locking granularity.    
> >>
> >> mdev_device_create() function does :
> >> - create mdev device
> >> - register device
> >> - call vendor driver->create
> >> - create sysfs files.
> >>
> >> mdev_device_remove() removes sysfs files, unregister device and delete
> >> device.
> >>
> >> There is common code in mdev_device_create() and mdev_device_remove()
> >> independent of what vendor driver does in its create()/remove()
> >> callback. Moving this code to each vendor driver to handle create/remove
> >> themselves doesn't make sense to me.  
> > 
> > I don't see where anyone is suggesting that, I'm not.
> >    
> >> mdev_parent.lock here does take care of race conditions that could occur
> >> during mdev device creation and deletion in this common code path.  
> > 
> > Exactly what races in the common code path is mdev_parent.lock
> > preventing?  mdev_device_create() calls:
> > 
> > device_register()
> > mdev_device_create_ops()
> >   parent->ops->create()
> >   sysfs_create_groups()
> > mdev_create_sysfs_files()
> >   sysfs_create_files()
> >   sysfs_create_link()
> >   sysfs_create_link()
> > 
> > mdev_parent.lock is certainly not serializing all calls across the
> > entire kernel to device_register and sysfs_create_{groups,files,link}
> > so what is it protecting other than serializing parent->ops->create()?
> > Locks protect data, not code.  The data we're protecting is the shared
> > mdev_list, there is no shared data once mdev_device_create() has its
> > mdev_device with uuid reservation placed into that list.
> >   

Thank you for enumerating these points below.

> This lock prevents race condition that could occur due to sysfs entries
> 1. between write on 'create' and 'remove' sysfs file of mdev device
>   As per current code without lock, mdev_create_sysfs_files() creates
> 'remove' sysfs, but before adding this mdev device to mdev_list, if
> 'remove' is called, that would return -ENODEV even if the device is seen
> in sysfs

mdev_parent.lock doesn't play a factor here.  As it exists today, the
sysfs remove attribute is added during mdev_device_create() while
holding mdev_parent.lock, but only after releasing that lock is the
mdev_device added to mdev_list.  mdev_device_remove() only uses the
mdev_list, so there exists a gap where the remove attribute is visible
to userspace, but a write to it will return -ENODEV.  Let's not fool
ourselves that the current code is bulletproof here, we're debating
whether it's more reasonable to return -ENODEV or -EAGAIN in the gap
between the sysfs remove attribute being created and the completion of
mdev_device_create().  Personally I think -EAGAIN makes more sense,
which is why I chose to separate it from the -ENODEV return.

Additionally, we can consider the write on 'remove' and write on
'remove' case.  A second writer to the remove attribute would today see
-ENODEV, which is incorrect as the device again clearly does exist.
Furthermore if mdev_device_remove_ops() fails, the mdev_device can be
re-added to the mdev_list, so a subsequent remove could work.
Effectively the device disappears and comes back according to the
remove attribute while in my proposal the user would see a much more
consistent device status, -EAGAIN if the user is racing another
remove, allowing the device to return to "found" should
mdev_device_remove_ops() fail.  Again, I this makes more sense to me
than the existing behavior.
 
> 2. between write on 'remove' and 'create' sysfs file
>   If 'remove' of a device is in progress (device is removed from
> mdev_list but sysfs entries are not yet removed) and again 'create' of
> same device with same parent is called, will hit duplicate entries
> error for sysfs.

This is a good point, but the fix is trivial, move the list_del to
mdev_device_release().
 
> 3. between multiple writes on 'create' with same uuid
>  current code doesn't handle the case you are fixing here, if same uuid
> is used to create mdev device on different parents.
> 
> Your change handles #1 and #3, but still there is a small gap for #2.
> Even its a very small gap, but if such conditions are it in production
> environment, it becomes difficult to debug.

Fixed trivially and arguably better than existing code as the namespace
is protected through the very end of the lifecycle of the mdev_device.
 
> >> What is the urge to remove mdev_parent.lock if that handles all race
> >> conditions without bothering user to handle -EAGAIN?  
> > 
> > Can you say why -EAGAIN is undesirable?  Note that the user is only
> > going to see this error if they attempt to remove the device in the
> > minuscule gap between the sysfs remove file being created and the
> > completion of the write to the create sysfs file.  It seems like you're
> > asking that I decrease the locking granularity, but not too much
> > because mdev_parent.lock protects "things".  If the -EAGAIN is really
> > so terrible, we can avoid it by spinning until the mdev_device is
> > either not found in the list or becomes active, we don't need
> > mdev_parent.lock to solve that, but I don't think that's the best
> > solution and there's no concrete statement to back -EAGAIN being a
> > problem.   
> 
> Does libvirt handles -EAGAIN error case? I don't know, may be someone
> from libvirt can comment.

Is this even a valid question given the revelation above?  Current code
has a gap where the user can access remove and see -ENODEV.  The
proposed code has a gap where the user can access remove and see
-EAGAIN.  Either case requires that the user is calling remove prior to
the device creation being completed, which suggests that userspace
already has multiple processing stepping on each other.  I don't think
libvirt does this, nor do I think we need to be particularly amenable
such user code, though I do think the -EAGAIN error is more consistent
with the actual device state.  Thanks,

Alex
Kirti Wankhede May 18, 2018, 7:03 p.m. UTC | #11
On 5/18/2018 11:00 PM, Alex Williamson wrote:
> On Fri, 18 May 2018 12:34:03 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 5/18/2018 3:07 AM, Alex Williamson wrote:
>>> On Fri, 18 May 2018 01:56:50 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 5/17/2018 9:50 PM, Alex Williamson wrote:  
>>>>> On Thu, 17 May 2018 21:25:22 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>     
>>>>>> On 5/17/2018 1:39 PM, Cornelia Huck wrote:    
>>>>>>> On Wed, 16 May 2018 21:30:19 -0600
>>>>>>> Alex Williamson <alex.williamson@redhat.com> wrote:
>>>>>>>       
>>>>>>>> When we create an mdev device, we check for duplicates against the
>>>>>>>> parent device and return -EEXIST if found, but the mdev device
>>>>>>>> namespace is global since we'll link all devices from the bus.  We do
>>>>>>>> catch this later in sysfs_do_create_link_sd() to return -EEXIST, but
>>>>>>>> with it comes a kernel warning and stack trace for trying to create
>>>>>>>> duplicate sysfs links, which makes it an undesirable response.
>>>>>>>>
>>>>>>>> Therefore we should really be looking for duplicates across all mdev
>>>>>>>> parent devices, or as implemented here, against our mdev device list.
>>>>>>>> Using mdev_list to prevent duplicates means that we can remove
>>>>>>>> mdev_parent.lock, but in order not to serialize mdev device creation
>>>>>>>> and removal globally, we add mdev_device.active which allows UUIDs to
>>>>>>>> be reserved such that we can drop the mdev_list_lock before the mdev
>>>>>>>> device is fully in place.
>>>>>>>>
>>>>>>>> NB. there was never intended to be any serialization guarantee
>>>>>>>> provided by the mdev core with respect to creation and removal of mdev
>>>>>>>> devices, mdev_parent.lock provided this only as a side-effect of the
>>>>>>>> implementation for locking the namespace per parent.  That
>>>>>>>> serialization is now removed.      
>>>>>>>       
>>>>>>
>>>>>> mdev_parent.lock is to serialize create and remove of that mdev device,
>>>>>> that handles race condition that Cornelia mentioned below.    
>>>>>
>>>>> Previously it was stated:
>>>>>
>>>>> On Thu, 17 May 2018 01:01:40 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:    
>>>>>> Here lock is not for create/remove routines of vendor driver, its about
>>>>>> mdev device creation and device registration, which is a common code
>>>>>> path, and so is part of mdev core module.    
>>>>>
>>>>> So the race condition was handled previously, but as a side-effect of
>>>>> protecting the namespace, aiui.  I'm trying to state above that the
>>>>> serialization of create/remove was never intended as a guarantee
>>>>> provided to mdev vendor drivers.  I don't see that there's a need to
>>>>> protect "mdev device creation and device registration" beyond conflicts
>>>>> in the UUID namespace, which is done here.  Are there others?
>>>>>     
>>>>
>>>> Sorry not being elaborative in my earlier response to
>>>>  
>>>>> If we can
>>>>> show that vendor drivers handle the create/remove paths themselves,
>>>>> perhaps we can refine the locking granularity.    
>>>>
>>>> mdev_device_create() function does :
>>>> - create mdev device
>>>> - register device
>>>> - call vendor driver->create
>>>> - create sysfs files.
>>>>
>>>> mdev_device_remove() removes sysfs files, unregister device and delete
>>>> device.
>>>>
>>>> There is common code in mdev_device_create() and mdev_device_remove()
>>>> independent of what vendor driver does in its create()/remove()
>>>> callback. Moving this code to each vendor driver to handle create/remove
>>>> themselves doesn't make sense to me.  
>>>
>>> I don't see where anyone is suggesting that, I'm not.
>>>    
>>>> mdev_parent.lock here does take care of race conditions that could occur
>>>> during mdev device creation and deletion in this common code path.  
>>>
>>> Exactly what races in the common code path is mdev_parent.lock
>>> preventing?  mdev_device_create() calls:
>>>
>>> device_register()
>>> mdev_device_create_ops()
>>>   parent->ops->create()
>>>   sysfs_create_groups()
>>> mdev_create_sysfs_files()
>>>   sysfs_create_files()
>>>   sysfs_create_link()
>>>   sysfs_create_link()
>>>
>>> mdev_parent.lock is certainly not serializing all calls across the
>>> entire kernel to device_register and sysfs_create_{groups,files,link}
>>> so what is it protecting other than serializing parent->ops->create()?
>>> Locks protect data, not code.  The data we're protecting is the shared
>>> mdev_list, there is no shared data once mdev_device_create() has its
>>> mdev_device with uuid reservation placed into that list.
>>>   
> 
> Thank you for enumerating these points below.
> 
>> This lock prevents race condition that could occur due to sysfs entries
>> 1. between write on 'create' and 'remove' sysfs file of mdev device
>>   As per current code without lock, mdev_create_sysfs_files() creates
>> 'remove' sysfs, but before adding this mdev device to mdev_list, if
>> 'remove' is called, that would return -ENODEV even if the device is seen
>> in sysfs
> 
> mdev_parent.lock doesn't play a factor here.  As it exists today, the
> sysfs remove attribute is added during mdev_device_create() while
> holding mdev_parent.lock, but only after releasing that lock is the
> mdev_device added to mdev_list.  mdev_device_remove() only uses the
> mdev_list, so there exists a gap where the remove attribute is visible
> to userspace, but a write to it will return -ENODEV.  

Ah, that's right.

> Let's not fool
> ourselves that the current code is bulletproof here, we're debating
> whether it's more reasonable to return -ENODEV or -EAGAIN in the gap
> between the sysfs remove attribute being created and the completion of
> mdev_device_create().  Personally I think -EAGAIN makes more sense,
> which is why I chose to separate it from the -ENODEV return.
> 
> Additionally, we can consider the write on 'remove' and write on
> 'remove' case.  A second writer to the remove attribute would today see
> -ENODEV, which is incorrect as the device again clearly does exist.
> Furthermore if mdev_device_remove_ops() fails, the mdev_device can be
> re-added to the mdev_list, so a subsequent remove could work.
> Effectively the device disappears and comes back according to the
> remove attribute while in my proposal the user would see a much more
> consistent device status, -EAGAIN if the user is racing another
> remove, allowing the device to return to "found" should
> mdev_device_remove_ops() fail.  Again, I this makes more sense to me
> than the existing behavior.
>  

Ok. This makes sense to me.

>> 2. between write on 'remove' and 'create' sysfs file
>>   If 'remove' of a device is in progress (device is removed from
>> mdev_list but sysfs entries are not yet removed) and again 'create' of
>> same device with same parent is called, will hit duplicate entries
>> error for sysfs.
> 
> This is a good point, but the fix is trivial, move the list_del to
> mdev_device_release().
>  

Sounds good.

>> 3. between multiple writes on 'create' with same uuid
>>  current code doesn't handle the case you are fixing here, if same uuid
>> is used to create mdev device on different parents.
>>
>> Your change handles #1 and #3, but still there is a small gap for #2.
>> Even its a very small gap, but if such conditions are it in production
>> environment, it becomes difficult to debug.
> 
> Fixed trivially and arguably better than existing code as the namespace
> is protected through the very end of the lifecycle of the mdev_device.
>  
>>>> What is the urge to remove mdev_parent.lock if that handles all race
>>>> conditions without bothering user to handle -EAGAIN?  
>>>
>>> Can you say why -EAGAIN is undesirable?  Note that the user is only
>>> going to see this error if they attempt to remove the device in the
>>> minuscule gap between the sysfs remove file being created and the
>>> completion of the write to the create sysfs file.  It seems like you're
>>> asking that I decrease the locking granularity, but not too much
>>> because mdev_parent.lock protects "things".  If the -EAGAIN is really
>>> so terrible, we can avoid it by spinning until the mdev_device is
>>> either not found in the list or becomes active, we don't need
>>> mdev_parent.lock to solve that, but I don't think that's the best
>>> solution and there's no concrete statement to back -EAGAIN being a
>>> problem.   
>>
>> Does libvirt handles -EAGAIN error case? I don't know, may be someone
>> from libvirt can comment.
> 
> Is this even a valid question given the revelation above?  Current code
> has a gap where the user can access remove and see -ENODEV.  The
> proposed code has a gap where the user can access remove and see
> -EAGAIN.  Either case requires that the user is calling remove prior to
> the device creation being completed, which suggests that userspace
> already has multiple processing stepping on each other.  I don't think
> libvirt does this, nor do I think we need to be particularly amenable
> such user code, though I do think the -EAGAIN error is more consistent
> with the actual device state.  Thanks,
> 

Ok. Make sense.

Thanks,
Kirti
diff mbox

Patch

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 126991046eb7..55ea9d34ec69 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -66,34 +66,6 @@  uuid_le mdev_uuid(struct mdev_device *mdev)
 }
 EXPORT_SYMBOL(mdev_uuid);
 
-static int _find_mdev_device(struct device *dev, void *data)
-{
-	struct mdev_device *mdev;
-
-	if (!dev_is_mdev(dev))
-		return 0;
-
-	mdev = to_mdev_device(dev);
-
-	if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0)
-		return 1;
-
-	return 0;
-}
-
-static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid)
-{
-	struct device *dev;
-
-	dev = device_find_child(parent->dev, &uuid, _find_mdev_device);
-	if (dev) {
-		put_device(dev);
-		return true;
-	}
-
-	return false;
-}
-
 /* Should be called holding parent_list_lock */
 static struct mdev_parent *__find_parent_device(struct device *dev)
 {
@@ -221,7 +193,6 @@  int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
 	}
 
 	kref_init(&parent->ref);
-	mutex_init(&parent->lock);
 
 	parent->dev = dev;
 	parent->ops = ops;
@@ -304,7 +275,7 @@  static void mdev_device_release(struct device *dev)
 int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
 {
 	int ret;
-	struct mdev_device *mdev;
+	struct mdev_device *mdev, *tmp;
 	struct mdev_parent *parent;
 	struct mdev_type *type = to_mdev_type(kobj);
 
@@ -312,21 +283,26 @@  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
 	if (!parent)
 		return -EINVAL;
 
-	mutex_lock(&parent->lock);
+	mutex_lock(&mdev_list_lock);
 
 	/* Check for duplicate */
-	if (mdev_device_exist(parent, uuid)) {
-		ret = -EEXIST;
-		goto create_err;
+	list_for_each_entry(tmp, &mdev_list, next) {
+		if (!uuid_le_cmp(tmp->uuid, uuid)) {
+			mutex_unlock(&mdev_list_lock);
+			return -EEXIST;
+		}
 	}
 
 	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
 	if (!mdev) {
-		ret = -ENOMEM;
-		goto create_err;
+		mutex_unlock(&mdev_list_lock);
+		return -ENOMEM;
 	}
 
 	memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
+	list_add(&mdev->next, &mdev_list);
+	mutex_unlock(&mdev_list_lock);
+
 	mdev->parent = parent;
 	kref_init(&mdev->ref);
 
@@ -352,21 +328,18 @@  int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
 	}
 
 	mdev->type_kobj = kobj;
+	mdev->active = true;
 	dev_dbg(&mdev->dev, "MDEV: created\n");
 
-	mutex_unlock(&parent->lock);
-
-	mutex_lock(&mdev_list_lock);
-	list_add(&mdev->next, &mdev_list);
-	mutex_unlock(&mdev_list_lock);
-
-	return ret;
+	return 0;
 
 create_failed:
 	device_unregister(&mdev->dev);
 
 create_err:
-	mutex_unlock(&parent->lock);
+	mutex_lock(&mdev_list_lock);
+	list_del(&mdev->next);
+	mutex_unlock(&mdev_list_lock);
 	mdev_put_parent(parent);
 	return ret;
 }
@@ -377,44 +350,43 @@  int mdev_device_remove(struct device *dev, bool force_remove)
 	struct mdev_parent *parent;
 	struct mdev_type *type;
 	int ret;
-	bool found = false;
 
 	mdev = to_mdev_device(dev);
 
 	mutex_lock(&mdev_list_lock);
 	list_for_each_entry(tmp, &mdev_list, next) {
-		if (tmp == mdev) {
-			found = true;
+		if (tmp == mdev)
 			break;
-		}
 	}
 
-	if (found)
-		list_del(&mdev->next);
+	if (tmp != mdev) {
+		mutex_unlock(&mdev_list_lock);
+		return -ENODEV;
+	}
 
-	mutex_unlock(&mdev_list_lock);
+	if (!mdev->active) {
+		mutex_unlock(&mdev_list_lock);
+		return -EAGAIN;
+	}
 
-	if (!found)
-		return -ENODEV;
+	mdev->active = false;
+	mutex_unlock(&mdev_list_lock);
 
 	type = to_mdev_type(mdev->type_kobj);
 	parent = mdev->parent;
-	mutex_lock(&parent->lock);
 
 	ret = mdev_device_remove_ops(mdev, force_remove);
 	if (ret) {
-		mutex_unlock(&parent->lock);
-
-		mutex_lock(&mdev_list_lock);
-		list_add(&mdev->next, &mdev_list);
-		mutex_unlock(&mdev_list_lock);
-
+		mdev->active = true;
 		return ret;
 	}
 
+	mutex_lock(&mdev_list_lock);
+	list_del(&mdev->next);
+	mutex_unlock(&mdev_list_lock);
+
 	mdev_remove_sysfs_files(dev, type);
 	device_unregister(dev);
-	mutex_unlock(&parent->lock);
 	mdev_put_parent(parent);
 
 	return 0;
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index a9cefd70a705..b5819b7d7ef7 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -20,7 +20,6 @@  struct mdev_parent {
 	struct device *dev;
 	const struct mdev_parent_ops *ops;
 	struct kref ref;
-	struct mutex lock;
 	struct list_head next;
 	struct kset *mdev_types_kset;
 	struct list_head type_list;
@@ -34,6 +33,7 @@  struct mdev_device {
 	struct kref ref;
 	struct list_head next;
 	struct kobject *type_kobj;
+	bool active;
 };
 
 #define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)