diff mbox series

vhost/vsock: don't check owner in vhost_vsock_stop() while releasing

Message ID 20220221114916.107045-1-sgarzare@redhat.com (mailing list archive)
State New, archived
Headers show
Series vhost/vsock: don't check owner in vhost_vsock_stop() while releasing | expand

Commit Message

Stefano Garzarella Feb. 21, 2022, 11:49 a.m. UTC
vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
ownership. It expects current->mm to be valid.

vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
the user has not called close() explicitly, i.e. when the file is
released from do_exit(). In this case current->mm is invalid and we're
releasing the device, so we should clean it up anyway.

Let's check the owner only when vhost_vsock_stop() is called
by an ioctl.

Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
Cc: stable@vger.kernel.org
Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vhost/vsock.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

Comments

Stefano Garzarella Feb. 21, 2022, 1:59 p.m. UTC | #1
On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
>
> vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
> ownership. It expects current->mm to be valid.
>
> vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
> the user has not done close(), so when we are in do_exit(). In this
> case current->mm is invalid and we're releasing the device, so we
> should clean it anyway.
>
> Let's check the owner only when vhost_vsock_stop() is called
> by an ioctl.
>
> Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> Cc: stable@vger.kernel.org
> Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  drivers/vhost/vsock.c | 14 ++++++++------
>  1 file changed, 8 insertions(+), 6 deletions(-)

Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
Reported-and-tested-by: syzbot+3140b17cb44a7b174008@syzkaller.appspotmail.com
Michael S. Tsirkin Feb. 21, 2022, 3:03 p.m. UTC | #2
On Mon, Feb 21, 2022 at 12:49:16PM +0100, Stefano Garzarella wrote:
> vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
> ownership. It expects current->mm to be valid.
> 
> vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
> the user has not done close(), so when we are in do_exit(). In this
> case current->mm is invalid and we're releasing the device, so we
> should clean it anyway.
> 
> Let's check the owner only when vhost_vsock_stop() is called
> by an ioctl.




> Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> Cc: stable@vger.kernel.org
> Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> ---
>  drivers/vhost/vsock.c | 14 ++++++++------
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index d6ca1c7ad513..f00d2dfd72b7 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -629,16 +629,18 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
>  	return ret;
>  }
>  
> -static int vhost_vsock_stop(struct vhost_vsock *vsock)
> +static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)

>  {
>  	size_t i;
>  	int ret;
>  
>  	mutex_lock(&vsock->dev.mutex);
>  
> -	ret = vhost_dev_check_owner(&vsock->dev);
> -	if (ret)
> -		goto err;
> +	if (check_owner) {
> +		ret = vhost_dev_check_owner(&vsock->dev);
> +		if (ret)
> +			goto err;
> +	}
>  
>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>  		struct vhost_virtqueue *vq = &vsock->vqs[i];
> @@ -753,7 +755,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>  	 * inefficient.  Room for improvement here. */
>  	vsock_for_each_connected_socket(vhost_vsock_reset_orphans);
>  
> -	vhost_vsock_stop(vsock);

Let's add an explanation:

When invoked from release we can not fail so we don't
check return code of vhost_vsock_stop.
We need to stop vsock even if it's not the owner.

> +	vhost_vsock_stop(vsock, false);
>  	vhost_vsock_flush(vsock);
>  	vhost_dev_stop(&vsock->dev);
>  
> @@ -868,7 +870,7 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
>  		if (start)
>  			return vhost_vsock_start(vsock);
>  		else
> -			return vhost_vsock_stop(vsock);
> +			return vhost_vsock_stop(vsock, true);
>  	case VHOST_GET_FEATURES:
>  		features = VHOST_VSOCK_FEATURES;
>  		if (copy_to_user(argp, &features, sizeof(features)))
> -- 
> 2.35.1
Stefano Garzarella Feb. 21, 2022, 3:22 p.m. UTC | #3
On Mon, Feb 21, 2022 at 10:03:39AM -0500, Michael S. Tsirkin wrote:
>On Mon, Feb 21, 2022 at 12:49:16PM +0100, Stefano Garzarella wrote:
>> vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
>> ownership. It expects current->mm to be valid.
>>
>> vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
>> the user has not done close(), so when we are in do_exit(). In this
>> case current->mm is invalid and we're releasing the device, so we
>> should clean it anyway.
>>
>> Let's check the owner only when vhost_vsock_stop() is called
>> by an ioctl.
>
>
>
>
>> Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
>> Cc: stable@vger.kernel.org
>> Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
>> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> ---
>>  drivers/vhost/vsock.c | 14 ++++++++------
>>  1 file changed, 8 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index d6ca1c7ad513..f00d2dfd72b7 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -629,16 +629,18 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
>>  	return ret;
>>  }
>>
>> -static int vhost_vsock_stop(struct vhost_vsock *vsock)
>> +static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
>
>>  {
>>  	size_t i;
>>  	int ret;
>>
>>  	mutex_lock(&vsock->dev.mutex);
>>
>> -	ret = vhost_dev_check_owner(&vsock->dev);
>> -	if (ret)
>> -		goto err;
>> +	if (check_owner) {
>> +		ret = vhost_dev_check_owner(&vsock->dev);
>> +		if (ret)
>> +			goto err;
>> +	}
>>
>>  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>>  		struct vhost_virtqueue *vq = &vsock->vqs[i];
>> @@ -753,7 +755,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
>>  	 * inefficient.  Room for improvement here. */
>>  	vsock_for_each_connected_socket(vhost_vsock_reset_orphans);
>>
>> -	vhost_vsock_stop(vsock);
>
>Let's add an explanation:
>
>When invoked from release we can not fail so we don't
>check return code of vhost_vsock_stop.
>We need to stop vsock even if it's not the owner.

Do you want me to send a v2 by adding this as a comment in the code?

Thanks,
Stefano
Anirudh Rayabharam Feb. 21, 2022, 4:14 p.m. UTC | #4
On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
> On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
> >
> > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
> > ownership. It expects current->mm to be valid.
> >
> > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
> > the user has not done close(), so when we are in do_exit(). In this
> > case current->mm is invalid and we're releasing the device, so we
> > should clean it anyway.
> >
> > Let's check the owner only when vhost_vsock_stop() is called
> > by an ioctl.
> >
> > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> > Cc: stable@vger.kernel.org
> > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > ---
> >  drivers/vhost/vsock.c | 14 ++++++++------
> >  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com

I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
even though syzbot says so. I am able to reproduce the issue locally
even with this patch applied.

Thanks,

	- Anirudh.

> Reported-and-tested-by: syzbot+3140b17cb44a7b174008@syzkaller.appspotmail.com
>
Stefano Garzarella Feb. 21, 2022, 4:44 p.m. UTC | #5
On Mon, Feb 21, 2022 at 09:44:39PM +0530, Anirudh Rayabharam wrote:
>On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
>> On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
>> >
>> > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
>> > ownership. It expects current->mm to be valid.
>> >
>> > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
>> > the user has not done close(), so when we are in do_exit(). In this
>> > case current->mm is invalid and we're releasing the device, so we
>> > should clean it anyway.
>> >
>> > Let's check the owner only when vhost_vsock_stop() is called
>> > by an ioctl.
>> >
>> > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
>> > Cc: stable@vger.kernel.org
>> > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
>> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> > ---
>> >  drivers/vhost/vsock.c | 14 ++++++++------
>> >  1 file changed, 8 insertions(+), 6 deletions(-)
>>
>> Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
>
>I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
>even though syzbot says so. I am able to reproduce the issue locally
>even with this patch applied.

Are you using the syzbot reproducer or another test?
In that case, can you share it?

From the stack trace it seemed to me that the worker accesses a zone
that has been cleaned (iotlb), so it is invalid and fails.
That's why I had this patch tested, which should stop the worker before
cleaning.

Thanks,
Stefano
Anirudh Rayabharam Feb. 21, 2022, 6:03 p.m. UTC | #6
On Mon, Feb 21, 2022 at 05:44:20PM +0100, Stefano Garzarella wrote:
> On Mon, Feb 21, 2022 at 09:44:39PM +0530, Anirudh Rayabharam wrote:
> > On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
> > > On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
> > > >
> > > > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
> > > > ownership. It expects current->mm to be valid.
> > > >
> > > > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
> > > > the user has not done close(), so when we are in do_exit(). In this
> > > > case current->mm is invalid and we're releasing the device, so we
> > > > should clean it anyway.
> > > >
> > > > Let's check the owner only when vhost_vsock_stop() is called
> > > > by an ioctl.
> > > >
> > > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> > > > Cc: stable@vger.kernel.org
> > > > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > ---
> > > >  drivers/vhost/vsock.c | 14 ++++++++------
> > > >  1 file changed, 8 insertions(+), 6 deletions(-)
> > > 
> > > Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
> > 
> > I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
> > even though syzbot says so. I am able to reproduce the issue locally
> > even with this patch applied.
> 
> Are you using the sysbot reproducer or another test?
> In that case, can you share it?

I am using the syzbot reproducer.

> 
> From the stack trace it seemed to me that the worker accesses a zone that
> has been cleaned (iotlb), so it is invalid and fails.

Would the thread hang in that case? How?

Thanks,

	- Anirudh.

> That's why I had this patch tested which should stop the worker before
> cleaning.
> 
> Thanks,
> Stefano
>
Stefano Garzarella Feb. 21, 2022, 6:26 p.m. UTC | #7
On Mon, Feb 21, 2022 at 11:33:11PM +0530, Anirudh Rayabharam wrote:
>On Mon, Feb 21, 2022 at 05:44:20PM +0100, Stefano Garzarella wrote:
>> On Mon, Feb 21, 2022 at 09:44:39PM +0530, Anirudh Rayabharam wrote:
>> > On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
>> > > On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
>> > > >
>> > > > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
>> > > > ownership. It expects current->mm to be valid.
>> > > >
>> > > > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
>> > > > the user has not done close(), so when we are in do_exit(). In this
>> > > > case current->mm is invalid and we're releasing the device, so we
>> > > > should clean it anyway.
>> > > >
>> > > > Let's check the owner only when vhost_vsock_stop() is called
>> > > > by an ioctl.
>> > > >
>> > > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
>> > > > Cc: stable@vger.kernel.org
>> > > > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
>> > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> > > > ---
>> > > >  drivers/vhost/vsock.c | 14 ++++++++------
>> > > >  1 file changed, 8 insertions(+), 6 deletions(-)
>> > >
>> > > Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
>> >
>> > I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
>> > even though syzbot says so. I am able to reproduce the issue locally
>> > even with this patch applied.
>>
>> Are you using the sysbot reproducer or another test?
>> In that case, can you share it?
>
>I am using the syzbot reproducer.
>
>>
>> From the stack trace it seemed to me that the worker accesses a zone that
>> has been cleaned (iotlb), so it is invalid and fails.
>
>Would the thread hang in that case? How?

Looking at this log [1] it seems that the process is blocked on the 
wait_for_completion() in vhost_work_dev_flush().

Since we're not setting the backend to NULL to stop the worker, it's 
likely that the worker will keep running, preventing the flush work from 
completing.

[1] https://syzkaller.appspot.com/text?tag=CrashLog&x=153f0852700000
Anirudh Rayabharam Feb. 21, 2022, 7:36 p.m. UTC | #8
On Mon, Feb 21, 2022 at 07:26:28PM +0100, Stefano Garzarella wrote:
> On Mon, Feb 21, 2022 at 11:33:11PM +0530, Anirudh Rayabharam wrote:
> > On Mon, Feb 21, 2022 at 05:44:20PM +0100, Stefano Garzarella wrote:
> > > On Mon, Feb 21, 2022 at 09:44:39PM +0530, Anirudh Rayabharam wrote:
> > > > On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
> > > > > On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
> > > > > >
> > > > > > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
> > > > > > ownership. It expects current->mm to be valid.
> > > > > >
> > > > > > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
> > > > > > the user has not done close(), so when we are in do_exit(). In this
> > > > > > case current->mm is invalid and we're releasing the device, so we
> > > > > > should clean it anyway.
> > > > > >
> > > > > > Let's check the owner only when vhost_vsock_stop() is called
> > > > > > by an ioctl.
> > > > > >
> > > > > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> > > > > > Cc: stable@vger.kernel.org
> > > > > > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
> > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > > > > > ---
> > > > > >  drivers/vhost/vsock.c | 14 ++++++++------
> > > > > >  1 file changed, 8 insertions(+), 6 deletions(-)
> > > > >
> > > > > Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
> > > >
> > > > I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
> > > > even though syzbot says so. I am able to reproduce the issue locally
> > > > even with this patch applied.
> > > 
> > > Are you using the sysbot reproducer or another test?
> > > In that case, can you share it?
> > 
> > I am using the syzbot reproducer.
> > 
> > > 
> > > From the stack trace it seemed to me that the worker accesses a zone that
> > > has been cleaned (iotlb), so it is invalid and fails.
> > 
> > Would the thread hang in that case? How?
> 
> Looking at this log [1] it seems that the process is blocked on the
> wait_for_completion() in vhost_work_dev_flush().
> 
> Since we're not setting the backend to NULL to stop the worker, it's likely
> that the worker will keep running, preventing the flush work from
> completing.

The log shows that the worker thread is stuck in iotlb_access_ok(). How
will setting the backend to NULL stop it? During my debugging I found
that the worker is stuck in this while loop:

1361         while (len > s) {                                                                     
1362                 map = vhost_iotlb_itree_first(umem, addr, last);                        
1363                 if (map == NULL || map->start > addr) {                                 
1364                         vhost_iotlb_miss(vq, addr, access);     
1365                         return false;                      
1366                 } else if (!(map->perm & access)) {        
1367                         /* Report the possible access violation by
1368                          * request another translation from userspace.    
1369                          */                                           
1370                         return false;                                 
1371                 }                      
1372                                          
1373                 pr_info("iotlb_access_ok: after msize=%llu, mstart=%llu\n",
1374                                 map->size, map->start);                    
1375                 size = map->size - addr + map->start;                      
1376                                                             
1377                 if (orig_addr == addr && size >= len)       
1378                         vhost_vq_meta_update(vq, map, type);                      
1379                                                                                   
1380                 s += size;                                                        
1381                 addr += size;                                                     
1382         }

> 
> [1] https://syzkaller.appspot.com/text?tag=CrashLog&x=153f0852700000
>
Dan Carpenter Feb. 22, 2022, 5:30 a.m. UTC | #9
Hi Stefano,

url:    https://github.com/0day-ci/linux/commits/Stefano-Garzarella/vhost-vsock-don-t-check-owner-in-vhost_vsock_stop-while-releasing/20220221-195038
base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
config: x86_64-randconfig-m031-20220221 (https://download.01.org/0day-ci/archive/20220222/202202220707.AM3rKUcP-lkp@intel.com/config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

smatch warnings:
drivers/vhost/vsock.c:655 vhost_vsock_stop() error: uninitialized symbol 'ret'.

vim +/ret +655 drivers/vhost/vsock.c

3ace84c91bfcde Stefano Garzarella 2022-02-21  632  static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
433fc58e6bf2c8 Asias He           2016-07-28  633  {
433fc58e6bf2c8 Asias He           2016-07-28  634  	size_t i;
433fc58e6bf2c8 Asias He           2016-07-28  635  	int ret;
433fc58e6bf2c8 Asias He           2016-07-28  636  
433fc58e6bf2c8 Asias He           2016-07-28  637  	mutex_lock(&vsock->dev.mutex);
433fc58e6bf2c8 Asias He           2016-07-28  638  
3ace84c91bfcde Stefano Garzarella 2022-02-21  639  	if (check_owner) {
433fc58e6bf2c8 Asias He           2016-07-28  640  		ret = vhost_dev_check_owner(&vsock->dev);
433fc58e6bf2c8 Asias He           2016-07-28  641  		if (ret)
433fc58e6bf2c8 Asias He           2016-07-28  642  			goto err;
3ace84c91bfcde Stefano Garzarella 2022-02-21  643  	}

"ret" not initialized on else path.

433fc58e6bf2c8 Asias He           2016-07-28  644  
433fc58e6bf2c8 Asias He           2016-07-28  645  	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
433fc58e6bf2c8 Asias He           2016-07-28  646  		struct vhost_virtqueue *vq = &vsock->vqs[i];
433fc58e6bf2c8 Asias He           2016-07-28  647  
433fc58e6bf2c8 Asias He           2016-07-28  648  		mutex_lock(&vq->mutex);
247643f85782fc Eugenio Pérez      2020-03-31  649  		vhost_vq_set_backend(vq, NULL);
433fc58e6bf2c8 Asias He           2016-07-28  650  		mutex_unlock(&vq->mutex);
433fc58e6bf2c8 Asias He           2016-07-28  651  	}
433fc58e6bf2c8 Asias He           2016-07-28  652  
433fc58e6bf2c8 Asias He           2016-07-28  653  err:
433fc58e6bf2c8 Asias He           2016-07-28  654  	mutex_unlock(&vsock->dev.mutex);
433fc58e6bf2c8 Asias He           2016-07-28 @655  	return ret;
433fc58e6bf2c8 Asias He           2016-07-28  656  }

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Stefano Garzarella Feb. 22, 2022, 8:11 a.m. UTC | #10
On Tue, Feb 22, 2022 at 08:30:17AM +0300, Dan Carpenter wrote:
>Hi Stefano,
>
>url:    https://github.com/0day-ci/linux/commits/Stefano-Garzarella/vhost-vsock-don-t-check-owner-in-vhost_vsock_stop-while-releasing/20220221-195038
>base:   https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next
>config: x86_64-randconfig-m031-20220221 (https://download.01.org/0day-ci/archive/20220222/202202220707.AM3rKUcP-lkp@intel.com/config)
>compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
>
>If you fix the issue, kindly add following tag as appropriate
>Reported-by: kernel test robot <lkp@intel.com>
>Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
>
>smatch warnings:
>drivers/vhost/vsock.c:655 vhost_vsock_stop() error: uninitialized symbol 'ret'.
>
>vim +/ret +655 drivers/vhost/vsock.c
>
>3ace84c91bfcde Stefano Garzarella 2022-02-21  632  static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
>433fc58e6bf2c8 Asias He           2016-07-28  633  {
>433fc58e6bf2c8 Asias He           2016-07-28  634  	size_t i;
>433fc58e6bf2c8 Asias He           2016-07-28  635  	int ret;
>433fc58e6bf2c8 Asias He           2016-07-28  636
>433fc58e6bf2c8 Asias He           2016-07-28  637  	mutex_lock(&vsock->dev.mutex);
>433fc58e6bf2c8 Asias He           2016-07-28  638
>3ace84c91bfcde Stefano Garzarella 2022-02-21  639  	if (check_owner) {
>433fc58e6bf2c8 Asias He           2016-07-28  640  		ret = vhost_dev_check_owner(&vsock->dev);
>433fc58e6bf2c8 Asias He           2016-07-28  641  		if (ret)
>433fc58e6bf2c8 Asias He           2016-07-28  642  			goto err;
>3ace84c91bfcde Stefano Garzarella 2022-02-21  643  	}
>
>"ret" not initialized on else path.

Oooops, I was testing with vhost_vsock_dev_release() where we don't 
check the ret value, but of course we need to initialize it to 0 for the 
vhost_vsock_dev_ioctl() use case.

I'll fix in the v2.

Thanks for the report,
Stefano
Stefano Garzarella Feb. 22, 2022, 9:05 a.m. UTC | #11
On Tue, Feb 22, 2022 at 01:06:12AM +0530, Anirudh Rayabharam wrote:
>On Mon, Feb 21, 2022 at 07:26:28PM +0100, Stefano Garzarella wrote:
>> On Mon, Feb 21, 2022 at 11:33:11PM +0530, Anirudh Rayabharam wrote:
>> > On Mon, Feb 21, 2022 at 05:44:20PM +0100, Stefano Garzarella wrote:
>> > > On Mon, Feb 21, 2022 at 09:44:39PM +0530, Anirudh Rayabharam wrote:
>> > > > On Mon, Feb 21, 2022 at 02:59:30PM +0100, Stefano Garzarella wrote:
>> > > > > On Mon, Feb 21, 2022 at 12:49 PM Stefano Garzarella <sgarzare@redhat.com> wrote:
>> > > > > >
>> > > > > > vhost_vsock_stop() calls vhost_dev_check_owner() to check the device
>> > > > > > ownership. It expects current->mm to be valid.
>> > > > > >
>> > > > > > vhost_vsock_stop() is also called by vhost_vsock_dev_release() when
>> > > > > > the user has not done close(), so when we are in do_exit(). In this
>> > > > > > case current->mm is invalid and we're releasing the device, so we
>> > > > > > should clean it anyway.
>> > > > > >
>> > > > > > Let's check the owner only when vhost_vsock_stop() is called
>> > > > > > by an ioctl.
>> > > > > >
>> > > > > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
>> > > > > > Cc: stable@vger.kernel.org
>> > > > > > Reported-by: syzbot+1e3ea63db39f2b4440e0@syzkaller.appspotmail.com
>> > > > > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> > > > > > ---
>> > > > > >  drivers/vhost/vsock.c | 14 ++++++++------
>> > > > > >  1 file changed, 8 insertions(+), 6 deletions(-)
>> > > > >
>> > > > > Reported-and-tested-by: syzbot+0abd373e2e50d704db87@syzkaller.appspotmail.com
>> > > >
>> > > > I don't think this patch fixes "INFO: task hung in vhost_work_dev_flush"
>> > > > even though syzbot says so. I am able to reproduce the issue locally
>> > > > even with this patch applied.
>> > >
>> > > Are you using the sysbot reproducer or another test?
>> > > In that case, can you share it?
>> >
>> > I am using the syzbot reproducer.
>> >
>> > >
>> > > From the stack trace it seemed to me that the worker accesses a zone that
>> > > has been cleaned (iotlb), so it is invalid and fails.
>> >
>> > Would the thread hang in that case? How?
>>
>> Looking at this log [1] it seems that the process is blocked on the
>> wait_for_completion() in vhost_work_dev_flush().
>>
>> Since we're not setting the backend to NULL to stop the worker, it's likely
>> that the worker will keep running, preventing the flush work from
>> completing.
>
>The log shows that the worker thread is stuck in iotlb_access_ok(). How
>will setting the backend to NULL stop it? During my debugging I found
>that the worker is stuck in this while loop:

Okay, looking at your new patch, now I see. If we enter this loop
before setting the backend to NULL and we have start = 0 and
end = (u64)-1, we will be stuck there forever.

I'll remove that tag in v2, but the test might fail without this patch 
applied, because for now we don't stop workers correctly.

Thanks,
Stefano
diff mbox series

Patch

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index d6ca1c7ad513..f00d2dfd72b7 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -629,16 +629,18 @@  static int vhost_vsock_start(struct vhost_vsock *vsock)
 	return ret;
 }
 
-static int vhost_vsock_stop(struct vhost_vsock *vsock)
+static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
 {
 	size_t i;
 	int ret;
 
 	mutex_lock(&vsock->dev.mutex);
 
-	ret = vhost_dev_check_owner(&vsock->dev);
-	if (ret)
-		goto err;
+	if (check_owner) {
+		ret = vhost_dev_check_owner(&vsock->dev);
+		if (ret)
+			goto err;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
 		struct vhost_virtqueue *vq = &vsock->vqs[i];
@@ -753,7 +755,7 @@  static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
 	 * inefficient.  Room for improvement here. */
 	vsock_for_each_connected_socket(vhost_vsock_reset_orphans);
 
-	vhost_vsock_stop(vsock);
+	vhost_vsock_stop(vsock, false);
 	vhost_vsock_flush(vsock);
 	vhost_dev_stop(&vsock->dev);
 
@@ -868,7 +870,7 @@  static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
 		if (start)
 			return vhost_vsock_start(vsock);
 		else
-			return vhost_vsock_stop(vsock);
+			return vhost_vsock_stop(vsock, true);
 	case VHOST_GET_FEATURES:
 		features = VHOST_VSOCK_FEATURES;
 		if (copy_to_user(argp, &features, sizeof(features)))