
[V5,3/4] blk-mq: clear stale request in tags->rq[] before freeing one request pool

Message ID 20210505145855.174127-4-ming.lei@redhat.com (mailing list archive)
State New, archived
Series blk-mq: fix request UAF related with iterating over tagset requests

Commit Message

Ming Lei May 5, 2021, 2:58 p.m. UTC
refcount_inc_not_zero() in bt_tags_iter() may still read a freed
request.

Fix the issue with the following approach:

1) hold a per-tags spinlock when reading ->rqs[tag] and calling
refcount_inc_not_zero() in bt_tags_iter()

2) clear any stale request referenced via ->rqs[tag] before freeing
the request pool, holding the same per-tags spinlock while clearing
->rqs[tag]

After the stale requests have been cleared, bt_tags_iter() can no
longer observe a freed request, and the clearing also waits for any
pending request reference.

The idea of clearing ->rqs[] is borrowed from John Garry's earlier
patch and a recent patch from David Jeffery.
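
Condensed from the diff below (a sketch of the two paired hunks, not
the literal patch), the two sides the per-tags lock synchronizes are:

	/* lookup side: blk_mq_find_and_get_req(), used by bt_tags_iter() */
	spin_lock_irqsave(&tags->lock, flags);
	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
		spin_unlock_irqrestore(&tags->lock, flags);
		return NULL;
	}
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;

	/* free side: blk_mq_clear_rq_mapping(), called from
	 * blk_mq_free_rqs() before the request pages are freed; only
	 * slots pointing into the pool being freed are cleared
	 */
	spin_lock_irqsave(&drv_tags->lock, flags);
	cmpxchg(&drv_tags->rqs[i], rq, NULL);
	spin_unlock_irqrestore(&drv_tags->lock, flags);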

Tested-by: John Garry <john.garry@huawei.com>
Reviewed-by: David Jeffery <djeffery@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c |  8 +++++++-
 block/blk-mq-tag.h |  3 +++
 block/blk-mq.c     | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 44 insertions(+), 6 deletions(-)

Comments

Christoph Hellwig May 6, 2021, 7:12 a.m. UTC | #1
On Wed, May 05, 2021 at 10:58:54PM +0800, Ming Lei wrote:
> refcount_inc_not_zero() in bt_tags_iter() still may read one freed
> request.
> 
> Fix the issue by the following approach:
> 
> 1) hold a per-tags spinlock when reading ->rqs[tag] and calling
> refcount_inc_not_zero in bt_tags_iter()
> 
> 2) clearing stale request referred via ->rqs[tag] before freeing
> request pool, the per-tags spinlock is held for clearing stale
> ->rq[tag]
> 
> So after we cleared stale requests, bt_tags_iter() won't observe
> freed request any more, also the clearing will wait for pending
> request reference.
> 
> The idea of clearing ->rqs[] is borrowed from John Garry's previous
> patch and one recent David's patch.
> 
> Tested-by: John Garry <john.garry@huawei.com>
> Reviewed-by: David Jeffery <djeffery@redhat.com>
> Reviewed-by: Bart Van Assche <bvanassche@acm.org>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-tag.c |  8 +++++++-
>  block/blk-mq-tag.h |  3 +++
>  block/blk-mq.c     | 39 ++++++++++++++++++++++++++++++++++-----
>  3 files changed, 44 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 4a40d409f5dd..8b239dcce85f 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
>  		unsigned int bitnr)
>  {
>  	struct request *rq = tags->rqs[bitnr];
> +	unsigned long flags;
>  
> -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> +	spin_lock_irqsave(&tags->lock, flags);
> +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> +		spin_unlock_irqrestore(&tags->lock, flags);
>  		return NULL;
> +	}
> +	spin_unlock_irqrestore(&tags->lock, flags);
>  	return rq;
>  }
>  
> @@ -538,6 +543,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
>  
>  	tags->nr_tags = total_tags;
>  	tags->nr_reserved_tags = reserved_tags;
> +	spin_lock_init(&tags->lock);
>  
>  	if (blk_mq_is_sbitmap_shared(flags))
>  		return tags;
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 7d3e6b333a4a..f942a601b5ef 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -20,6 +20,9 @@ struct blk_mq_tags {
>  	struct request **rqs;
>  	struct request **static_rqs;
>  	struct list_head page_list;
> +
> +	/* used to clear rqs[] before one request pool is freed */
> +	spinlock_t lock;
>  };
>  
>  extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index d053e80fa6d7..c1b28e09a27e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2306,6 +2306,38 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>  	return BLK_QC_T_NONE;
>  }
>  
> +static size_t order_to_size(unsigned int order)
> +{
> +	return (size_t)PAGE_SIZE << order;
> +}
> +
> +/* called before freeing request pool in @tags */
> +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
> +		struct blk_mq_tags *tags, unsigned int hctx_idx)
> +{
> +	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> +	struct page *page;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&drv_tags->lock, flags);
> +	list_for_each_entry(page, &tags->page_list, lru) {
> +		unsigned long start = (unsigned long)page_address(page);
> +		unsigned long end = start + order_to_size(page->private);
> +		int i;
> +
> +		for (i = 0; i < set->queue_depth; i++) {
> +			struct request *rq = drv_tags->rqs[i];
> +			unsigned long rq_addr = (unsigned long)rq;
> +
> +			if (rq_addr >= start && rq_addr < end) {
> +				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> +				cmpxchg(&drv_tags->rqs[i], rq, NULL);
> +			}
> +		}
> +	}
> +	spin_unlock_irqrestore(&drv_tags->lock, flags);

This looks really strange.  Why do we need the cmpxchg while under
the lock anyway?  Why iterate the allocated pages and not just clear
the drv_tags values directly?

static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
		struct blk_mq_tags *tags, unsigned int hctx_idx)
{
	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
	unsigned int i = 0;
	unsigned long flags;

	spin_lock_irqsave(&drv_tags->lock, flags);
	for (i = 0; i < set->queue_depth; i++) {
		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
		drv_tags->rqs[i] = NULL;
	}
	spin_unlock_irqrestore(&drv_tags->lock, flags);
}
Ming Lei May 6, 2021, 7:34 a.m. UTC | #2
On Thu, May 06, 2021 at 08:12:56AM +0100, Christoph Hellwig wrote:
> On Wed, May 05, 2021 at 10:58:54PM +0800, Ming Lei wrote:
> > refcount_inc_not_zero() in bt_tags_iter() still may read one freed
> > request.
> > 
> > Fix the issue by the following approach:
> > 
> > 1) hold a per-tags spinlock when reading ->rqs[tag] and calling
> > refcount_inc_not_zero in bt_tags_iter()
> > 
> > 2) clearing stale request referred via ->rqs[tag] before freeing
> > request pool, the per-tags spinlock is held for clearing stale
> > ->rq[tag]
> > 
> > So after we cleared stale requests, bt_tags_iter() won't observe
> > freed request any more, also the clearing will wait for pending
> > request reference.
> > 
> > The idea of clearing ->rqs[] is borrowed from John Garry's previous
> > patch and one recent David's patch.
> > 
> > Tested-by: John Garry <john.garry@huawei.com>
> > Reviewed-by: David Jeffery <djeffery@redhat.com>
> > Reviewed-by: Bart Van Assche <bvanassche@acm.org>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-tag.c |  8 +++++++-
> >  block/blk-mq-tag.h |  3 +++
> >  block/blk-mq.c     | 39 ++++++++++++++++++++++++++++++++++-----
> >  3 files changed, 44 insertions(+), 6 deletions(-)
> > 
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 4a40d409f5dd..8b239dcce85f 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
> >  		unsigned int bitnr)
> >  {
> >  	struct request *rq = tags->rqs[bitnr];
> > +	unsigned long flags;
> >  
> > -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> > +	spin_lock_irqsave(&tags->lock, flags);
> > +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> > +		spin_unlock_irqrestore(&tags->lock, flags);
> >  		return NULL;
> > +	}
> > +	spin_unlock_irqrestore(&tags->lock, flags);
> >  	return rq;
> >  }
> >  
> > @@ -538,6 +543,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
> >  
> >  	tags->nr_tags = total_tags;
> >  	tags->nr_reserved_tags = reserved_tags;
> > +	spin_lock_init(&tags->lock);
> >  
> >  	if (blk_mq_is_sbitmap_shared(flags))
> >  		return tags;
> > diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> > index 7d3e6b333a4a..f942a601b5ef 100644
> > --- a/block/blk-mq-tag.h
> > +++ b/block/blk-mq-tag.h
> > @@ -20,6 +20,9 @@ struct blk_mq_tags {
> >  	struct request **rqs;
> >  	struct request **static_rqs;
> >  	struct list_head page_list;
> > +
> > +	/* used to clear rqs[] before one request pool is freed */
> > +	spinlock_t lock;
> >  };
> >  
> >  extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index d053e80fa6d7..c1b28e09a27e 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2306,6 +2306,38 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
> >  	return BLK_QC_T_NONE;
> >  }
> >  
> > +static size_t order_to_size(unsigned int order)
> > +{
> > +	return (size_t)PAGE_SIZE << order;
> > +}
> > +
> > +/* called before freeing request pool in @tags */
> > +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
> > +		struct blk_mq_tags *tags, unsigned int hctx_idx)
> > +{
> > +	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> > +	struct page *page;
> > +	unsigned long flags;
> > +
> > +	spin_lock_irqsave(&drv_tags->lock, flags);
> > +	list_for_each_entry(page, &tags->page_list, lru) {
> > +		unsigned long start = (unsigned long)page_address(page);
> > +		unsigned long end = start + order_to_size(page->private);
> > +		int i;
> > +
> > +		for (i = 0; i < set->queue_depth; i++) {
> > +			struct request *rq = drv_tags->rqs[i];
> > +			unsigned long rq_addr = (unsigned long)rq;
> > +
> > +			if (rq_addr >= start && rq_addr < end) {
> > +				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> > +				cmpxchg(&drv_tags->rqs[i], rq, NULL);
> > +			}
> > +		}
> > +	}
> > +	spin_unlock_irqrestore(&drv_tags->lock, flags);
> 
> This looks really strange.  Why do we need the cmpxchg while under
> the lock anyway?  Why iterate the allocated pages and not just clear

The lock is for protecting freeing vs. iterating.

> the drv_tags valus directly?
> 
> static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
> 		struct blk_mq_tags *tags, unsigned int hctx_idx)
> {
> 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> 	unsigned int i = 0;
> 	unsigned long flags;
> 
> 	spin_lock_irqsave(&drv_tags->lock, flags);
> 	for (i = 0; i < set->queue_depth; i++)
> 		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
> 		drv_tags->rqs[i] = NULL;

drv_tags->rqs[] may be assigned from another LUN at the same time, so
the simple clearing would overwrite that active assignment. We only need
to clear the mapping if the stored rq is about to be freed.
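
As an illustration of that race (a hypothetical interleaving; 'new_rq'
is a made-up name for a request installed by the other LUN):

	/* CPU A: freeing one LUN's sched-tag request pool */
	rq = drv_tags->rqs[i];	/* rq points into the pool being freed */

	/* CPU B: another LUN dispatches and re-assigns the same slot,
	 * i.e. drv_tags->rqs[i] = new_rq (a hypothetical active request)
	 */

	/* a plain drv_tags->rqs[i] = NULL on CPU A would now wipe out
	 * new_rq, while cmpxchg() only clears the slot if it still holds
	 * the stale rq:
	 */
	cmpxchg(&drv_tags->rqs[i], rq, NULL);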


Thanks,
Ming
Christoph Hellwig May 6, 2021, 12:18 p.m. UTC | #3
On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote:
> > {
> > 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> > 	unsigned int i = 0;
> > 	unsigned long flags;
> > 
> > 	spin_lock_irqsave(&drv_tags->lock, flags);
> > 	for (i = 0; i < set->queue_depth; i++)
> > 		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
> > 		drv_tags->rqs[i] = NULL;
> 
> drv_tags->rqs[] may be assigned from another LUN at the same time, then
> the simple clearing will overwrite the active assignment. We just need
> to clear mapping iff the stored rq will be freed.

So.  Even a different LUN shares the same tagset.  So I can see the
need for the cmpxchg (please document it!), but I don't see the need
for the complex iteration.  All the rqs are freed in one single loop,
so we can just iterate through them sequentially.

Btw, I think ->lock might better be named ->clear_lock to document its
purpose better.
Ming Lei May 6, 2021, 1:38 p.m. UTC | #4
On Thu, May 06, 2021 at 01:18:49PM +0100, Christoph Hellwig wrote:
> On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote:
> > > {
> > > 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> > > 	unsigned int i = 0;
> > > 	unsigned long flags;
> > > 
> > > 	spin_lock_irqsave(&drv_tags->lock, flags);
> > > 	for (i = 0; i < set->queue_depth; i++)
> > > 		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
> > > 		drv_tags->rqs[i] = NULL;
> > 
> > drv_tags->rqs[] may be assigned from another LUN at the same time, then
> > the simple clearing will overwrite the active assignment. We just need
> > to clear mapping iff the stored rq will be freed.
> 
> So.  Even a different LUN shares the same tagset.  So I can see the
> need for the cmpxchg (please document it!), but I don't see the need
> for the complex iteration.  All the rqs are freed in one single loop,
> so we can just iterate through them sequentially.

That is exactly what the patch is doing: the requests are stored in the
page list, so we check whether drv_tags->rqs[i] references a request
covered by that page list, and if so we clear the reference.

The code is actually sort of self-documenting: before we free the
requests, clear the references in the host-wide drv_tags->rqs[].

> 
> Btw, I think ->lock might better be named ->clear_lock to document its
> purpose better.

OK.


thanks, 
Ming
Bart Van Assche May 6, 2021, 2:51 p.m. UTC | #5
On 5/6/21 5:18 AM, Christoph Hellwig wrote:
> On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote:
>>> {
>>> 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
>>> 	unsigned int i = 0;
>>> 	unsigned long flags;
>>>
>>> 	spin_lock_irqsave(&drv_tags->lock, flags);
>>> 	for (i = 0; i < set->queue_depth; i++)
>>> 		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
>>> 		drv_tags->rqs[i] = NULL;
>>
>> drv_tags->rqs[] may be assigned from another LUN at the same time, then
>> the simple clearing will overwrite the active assignment. We just need
>> to clear mapping iff the stored rq will be freed.
> 
> So.  Even a different LUN shares the same tagset.  So I can see the
> need for the cmpxchg (please document it!), but I don't see the need
> for the complex iteration.  All the rqs are freed in one single loop,
> so we can just iterate through them sequentially.
> 
> Btw, I think ->lock might better be named ->clear_lock to document its
> purpose better.

I'm not sure that would be a better name since I don't think that it is
necessary to hold that lock around the cmpxchg() calls. How about using
something like the following code in blk_mq_clear_rq_mapping() instead
of the code in v5 of patch 3/4?

	spin_lock_irqsave(&drv_tags->lock, flags);
	spin_unlock_irqrestore(&drv_tags->lock, flags);

	list_for_each_entry(page, &tags->page_list, lru) {
		/* use cmpxchg() to clear request pointer selectively */
	}

Thanks,

Bart.
Bart Van Assche May 6, 2021, 3:02 p.m. UTC | #6
On 5/5/21 7:58 AM, Ming Lei wrote:
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 4a40d409f5dd..8b239dcce85f 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
>  		unsigned int bitnr)
>  {
>  	struct request *rq = tags->rqs[bitnr];
> +	unsigned long flags;
>  
> -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> +	spin_lock_irqsave(&tags->lock, flags);
> +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> +		spin_unlock_irqrestore(&tags->lock, flags);
>  		return NULL;
> +	}
> +	spin_unlock_irqrestore(&tags->lock, flags);
>  	return rq;
>  }
Has it been considered to change the body of the if-statement into the
following?

		rq = NULL;

That change would reduce the number of return statements from two to one.
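
For illustration, the lookup with that change applied could look like
this (a sketch only, not necessarily what V6 will contain):

static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
		unsigned int bitnr)
{
	struct request *rq = tags->rqs[bitnr];
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	/* fail the lookup instead of returning early */
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}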

Thanks,

Bart.
Ming Lei May 7, 2021, 12:11 a.m. UTC | #7
On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote:
> On 5/6/21 5:18 AM, Christoph Hellwig wrote:
> > On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote:
> >>> {
> >>> 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> >>> 	unsigned int i = 0;
> >>> 	unsigned long flags;
> >>>
> >>> 	spin_lock_irqsave(&drv_tags->lock, flags);
> >>> 	for (i = 0; i < set->queue_depth; i++)
> >>> 		WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0);
> >>> 		drv_tags->rqs[i] = NULL;
> >>
> >> drv_tags->rqs[] may be assigned from another LUN at the same time, then
> >> the simple clearing will overwrite the active assignment. We just need
> >> to clear mapping iff the stored rq will be freed.
> > 
> > So.  Even a different LUN shares the same tagset.  So I can see the
> > need for the cmpxchg (please document it!), but I don't see the need
> > for the complex iteration.  All the rqs are freed in one single loop,
> > so we can just iterate through them sequentially.
> > 
> > Btw, I think ->lock might better be named ->clear_lock to document its
> > purpose better.
> 
> I'm not sure that would be a better name since I don't think that it is
> necessary to hold that lock around the cmpxchg() calls. How about using
> something like the following code in blk_mq_clear_rq_mapping() instead
> of the code in v5 of patch 3/4?
> 
> 	spin_lock_irqsave(&drv_tags->lock, flags);
> 	spin_unlock_irqrestore(&drv_tags->lock, flags);
> 
> 	list_for_each_entry(page, &tags->page_list, lru) {
> 		/* use cmpxchg() to clear request pointer selectively */
> 	}

This way won't work as expected because iteration may happen between
releasing drv_tags->lock and the cmpxchg() on ->rqs[], so freed requests
may still be touched during iteration after blk_mq_free_rqs() has freed
them.


Thanks,
Ming
Ming Lei May 7, 2021, 12:13 a.m. UTC | #8
On Thu, May 06, 2021 at 08:02:53AM -0700, Bart Van Assche wrote:
> On 5/5/21 7:58 AM, Ming Lei wrote:
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 4a40d409f5dd..8b239dcce85f 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
> >  		unsigned int bitnr)
> >  {
> >  	struct request *rq = tags->rqs[bitnr];
> > +	unsigned long flags;
> >  
> > -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> > +	spin_lock_irqsave(&tags->lock, flags);
> > +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> > +		spin_unlock_irqrestore(&tags->lock, flags);
> >  		return NULL;
> > +	}
> > +	spin_unlock_irqrestore(&tags->lock, flags);
> >  	return rq;
> >  }
> Has it been considered to change the body of the if-statement into the
> following?
> 
> 		rq = NULL;
> 
> That change would reduce the number of return statements from two to one.

Looks cleaner, will take it in V6.


Thanks,
Ming
Bart Van Assche May 7, 2021, 1:10 a.m. UTC | #9
On 5/6/21 5:11 PM, Ming Lei wrote:
> On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote:
>> I'm not sure that would be a better name since I don't think that it is
>> necessary to hold that lock around the cmpxchg() calls. How about using
>> something like the following code in blk_mq_clear_rq_mapping() instead
>> of the code in v5 of patch 3/4?
>>
>> 	spin_lock_irqsave(&drv_tags->lock, flags);
>> 	spin_unlock_irqrestore(&drv_tags->lock, flags);
>>
>> 	list_for_each_entry(page, &tags->page_list, lru) {
>> 		/* use cmpxchg() to clear request pointer selectively */
>> 	}
> 
> This way won't work as expected because iterating may happen between
> releasing drv_tags->lock and cmpxchg(->rqs[]), then freed requests
> may still be touched during iteration after they are freed in blk_mq_free_rqs().

Right, the unlock should happen after the pointers have been cleared. 
But I think it is safe to move the spin_lock call down such that both 
the spin_lock and spin_unlock call happen after the pointers have been 
cleared. That is sufficient to guarantee that all 
blk_mq_find_and_get_req() calls either happen before or after the spin 
lock / unlock pair. blk_mq_find_and_get_req() calls that happen before
the pair also happen before the requests are freed. Calls that happen
after the spin lock / unlock pair will either read NULL or a pointer to 
a request that is associated with another queue and hence won't trigger 
a use-after-free.
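
Roughly, that ordering would turn the v5 function into something like
this (an illustrative sketch only, not a tested patch):

/* clear stale ->rqs[] first, then use the lock pair purely as a barrier */
static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
		struct blk_mq_tags *tags, unsigned int hctx_idx)
{
	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
	struct page *page;
	unsigned long flags;

	list_for_each_entry(page, &tags->page_list, lru) {
		unsigned long start = (unsigned long)page_address(page);
		unsigned long end = start + order_to_size(page->private);
		int i;

		for (i = 0; i < set->queue_depth; i++) {
			struct request *rq = drv_tags->rqs[i];
			unsigned long rq_addr = (unsigned long)rq;

			if (rq_addr >= start && rq_addr < end) {
				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
				cmpxchg(&drv_tags->rqs[i], rq, NULL);
			}
		}
	}

	/*
	 * Any blk_mq_find_and_get_req() call either finished before this
	 * point or starts afterwards and then only sees NULL or a slot
	 * already reused by another queue.
	 */
	spin_lock_irqsave(&drv_tags->lock, flags);
	spin_unlock_irqrestore(&drv_tags->lock, flags);
}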

Bart.
Bart Van Assche May 7, 2021, 1:11 a.m. UTC | #10
On 5/5/21 7:58 AM, Ming Lei wrote:
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 4a40d409f5dd..8b239dcce85f 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
>   		unsigned int bitnr)
>   {
>   	struct request *rq = tags->rqs[bitnr];
> +	unsigned long flags;
>   
> -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> +	spin_lock_irqsave(&tags->lock, flags);
> +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> +		spin_unlock_irqrestore(&tags->lock, flags);
>   		return NULL;
> +	}
> +	spin_unlock_irqrestore(&tags->lock, flags);
>   	return rq;
>   }

Shouldn't the 'rq = tags->rqs[bitnr]' assignment be protected by
tags->lock too? Otherwise the request pointer could be read before the
clearing of the request pointer happens, while the
refcount_inc_not_zero() call could happen after the request has been
cleared (and potentially already freed).
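
For illustration, with the ->rqs[bitnr] read moved inside the critical
section (and the earlier single-return tweak), the lookup would become
something like this (a sketch only):

static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
		unsigned int bitnr)
{
	struct request *rq;
	unsigned long flags;

	spin_lock_irqsave(&tags->lock, flags);
	/* both the read and the refcount grab now happen under the lock */
	rq = tags->rqs[bitnr];
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}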

Thanks,

Bart.
Ming Lei May 7, 2021, 2:05 a.m. UTC | #11
On Thu, May 06, 2021 at 06:10:09PM -0700, Bart Van Assche wrote:
> On 5/6/21 5:11 PM, Ming Lei wrote:
> > On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote:
> > > I'm not sure that would be a better name since I don't think that it is
> > > necessary to hold that lock around the cmpxchg() calls. How about using
> > > something like the following code in blk_mq_clear_rq_mapping() instead
> > > of the code in v5 of patch 3/4?
> > > 
> > > 	spin_lock_irqsave(&drv_tags->lock, flags);
> > > 	spin_unlock_irqrestore(&drv_tags->lock, flags);
> > > 
> > > 	list_for_each_entry(page, &tags->page_list, lru) {
> > > 		/* use cmpxchg() to clear request pointer selectively */
> > > 	}
> > 
> > This way won't work as expected because iterating may happen between
> > releasing drv_tags->lock and cmpxchg(->rqs[]), then freed requests
> > may still be touched during iteration after they are freed in blk_mq_free_rqs().
> 
> Right, the unlock should happen after the pointers have been cleared. But I
> think it is safe to move the spin_lock call down such that both the
> spin_lock and spin_unlock call happen after the pointers have been cleared.
> That is sufficient to guarantee that all blk_mq_find_and_get_req() calls
> either happen before or after the spin lock / unlock pair.
> blk_mq_find_and_get_req() calls that happen before the pair happen before
> the request pointers are freed. Calls that happen after the spin lock /
> unlock pair will either read NULL or a pointer to a request that is
> associated with another queue and hence won't trigger a use-after-free.

Putting the lock pair after clearing the rq mapping should work, but I
don't see any benefit: it is not very readable, and memory-barrier
knowledge is required to understand its correctness (the cmpxchg has to
complete before the unlock), ..., so is it really a better idea to move
the lock pair after clearing the rq mapping?



Thanks,
Ming
Ming Lei May 7, 2021, 2:06 a.m. UTC | #12
On Thu, May 06, 2021 at 06:11:59PM -0700, Bart Van Assche wrote:
> On 5/5/21 7:58 AM, Ming Lei wrote:
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 4a40d409f5dd..8b239dcce85f 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
> >   		unsigned int bitnr)
> >   {
> >   	struct request *rq = tags->rqs[bitnr];
> > +	unsigned long flags;
> > -	if (!rq || !refcount_inc_not_zero(&rq->ref))
> > +	spin_lock_irqsave(&tags->lock, flags);
> > +	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
> > +		spin_unlock_irqrestore(&tags->lock, flags);
> >   		return NULL;
> > +	}
> > +	spin_unlock_irqrestore(&tags->lock, flags);
> >   	return rq;
> >   }
> 
> Shouldn't the 'rq = tags->rqs[bitnr]' assignment be protected by tags->lock
> too? Otherwise a request pointer could be read before the request pointer
> clearing happens and the refcount_inc_not_zero() call could happen after the
> request clearing.

Right, will fix it.
Bart Van Assche May 7, 2021, 3:16 a.m. UTC | #13
On 5/6/21 7:05 PM, Ming Lei wrote:
> Putting the lock pair after clearing rq mapping should work, but not see
> any benefit: not very readable, and memory barrier knowledge is required for
> understanding its correctness(cmpxchg has to be completed before unlock), ...,
> so is it better idea to move lock pair after clearing rq mapping?

It depends on how much time will be spent inside
blk_mq_clear_rq_mapping(). If the time spent in the nested loop in
blk_mq_clear_rq_mapping() would be significant then the proposed change
will help to reduce interrupt latency in blk_mq_find_and_get_req().

Thanks,

Bart.
Ming Lei May 7, 2021, 6:31 a.m. UTC | #14
On Thu, May 06, 2021 at 08:16:25PM -0700, Bart Van Assche wrote:
> On 5/6/21 7:05 PM, Ming Lei wrote:
> > Putting the lock pair after clearing rq mapping should work, but not see
> > any benefit: not very readable, and memory barrier knowledge is required for
> > understanding its correctness(cmpxchg has to be completed before unlock), ...,
> > so is it better idea to move lock pair after clearing rq mapping?
> 
> It depends on how much time will be spent inside
> blk_mq_clear_rq_mapping(). If the time spent in the nested loop in
> blk_mq_clear_rq_mapping() would be significant then the proposed change
> will help to reduce interrupt latency in blk_mq_find_and_get_req().

Interrupt latency in blk_mq_find_and_get_req() shouldn't increase,
because interrupts won't be disabled while spinning on the lock. But
interrupts may be disabled for a while in blk_mq_clear_rq_mapping() in
case of a big nr_requests and hw queue depth.

Fair enough, I will take this approach so the lock isn't held for too
long.

Thanks, 
Ming
Christoph Hellwig May 7, 2021, 6:52 a.m. UTC | #15
On Fri, May 07, 2021 at 02:31:02PM +0800, Ming Lei wrote:
> > It depends on how much time will be spent inside
> > blk_mq_clear_rq_mapping(). If the time spent in the nested loop in
> > blk_mq_clear_rq_mapping() would be significant then the proposed change
> > will help to reduce interrupt latency in blk_mq_find_and_get_req().
> 
> interrupt latency in blk_mq_find_and_get_req() shouldn't be increased
> because interrupt won't be disabled when spinning on the lock. But interrupt
> may be disabled for a while in blk_mq_clear_rq_mapping() in case of big
> nr_requests and hw queue depth.
> 
> Fair enough, will take this way for not holding lock for too long.

Can we take a step back here?  Once blk_mq_clear_rq_mapping hits we are
deep into tearing the device down and freeing the tag_set.  So if
blk_mq_find_and_get_req is waiting any time on the lock, something is
wrong.  We might as well just trylock in blk_mq_find_and_get_req and
not find a request if the lock is contended, as there is no point in
waiting for the lock.  In fact we might not even need a lock, but just
an atomic bitop in the tagset that marks it as being freed, which
needs to be tested in blk_mq_find_and_get_req with the right memory
barriers.
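
A rough sketch of the trylock variant (illustrative only, untested):

static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
		unsigned int bitnr)
{
	struct request *rq = NULL;
	unsigned long flags;

	/* contention means the tag set is being torn down: just give up */
	if (!spin_trylock_irqsave(&tags->lock, flags))
		return NULL;
	rq = tags->rqs[bitnr];
	if (rq && !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);
	return rq;
}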
Christoph Hellwig May 7, 2021, 6:57 a.m. UTC | #16
On Thu, May 06, 2021 at 09:38:41PM +0800, Ming Lei wrote:
> > So.  Even a different LUN shares the same tagset.  So I can see the
> > need for the cmpxchg (please document it!), but I don't see the need
> > for the complex iteration.  All the rqs are freed in one single loop,
> > so we can just iterate through them sequentially.
> 
> That is exactly what the patch is doing, requests are stored in page
> list, so check if one request(covered in page list) reference in
> drv_tags->rq[i] exists, if yes, we clear the request reference.
> 
> The code is actually sort of self-document: before we free requests,
> clear the reference in hostwide drv->rqs[].

What the patch does is a completely pointless nested loop.
Instead of just looping through all requests which is simple and fast
it loops through each page, and then does another loop inside that,
just increasing complexity and runtime.  We should at least do something
like the incremental patch below instead which is simpler, faster and
easier to understand:


diff --git a/block/blk-mq.c b/block/blk-mq.c
index c1b28e09a27e..598fe82cfbcf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2311,29 +2311,20 @@ static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
-/* called before freeing request pool in @tags */
+/* ensure that blk_mq_find_and_get_req can't find the tags any more */
 static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
 		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
 	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
-	struct page *page;
 	unsigned long flags;
+	int i;
 
 	spin_lock_irqsave(&drv_tags->lock, flags);
-	list_for_each_entry(page, &tags->page_list, lru) {
-		unsigned long start = (unsigned long)page_address(page);
-		unsigned long end = start + order_to_size(page->private);
-		int i;
+	for (i = 0; i < set->queue_depth; i++) {
+		struct request *rq = drv_tags->rqs[i];
 
-		for (i = 0; i < set->queue_depth; i++) {
-			struct request *rq = drv_tags->rqs[i];
-			unsigned long rq_addr = (unsigned long)rq;
-
-			if (rq_addr >= start && rq_addr < end) {
-				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
-				cmpxchg(&drv_tags->rqs[i], rq, NULL);
-			}
-		}
+		WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+		cmpxchg(&drv_tags->rqs[i], rq, NULL);
 	}
 	spin_unlock_irqrestore(&drv_tags->lock, flags);
 }
Ming Lei May 7, 2021, 7:30 a.m. UTC | #17
On Fri, May 07, 2021 at 07:57:23AM +0100, Christoph Hellwig wrote:
> On Thu, May 06, 2021 at 09:38:41PM +0800, Ming Lei wrote:
> > > So.  Even a different LUN shares the same tagset.  So I can see the
> > > need for the cmpxchg (please document it!), but I don't see the need
> > > for the complex iteration.  All the rqs are freed in one single loop,
> > > so we can just iterate through them sequentially.
> > 
> > That is exactly what the patch is doing, requests are stored in page
> > list, so check if one request(covered in page list) reference in
> > drv_tags->rq[i] exists, if yes, we clear the request reference.
> > 
> > The code is actually sort of self-document: before we free requests,
> > clear the reference in hostwide drv->rqs[].
> 
> What the patch does it to do a completely pointless nested loop.
> Instead of just looping through all requests which is simple and fast
> it loops through each page, and then does another loop inside that,
> just increasing complexity and runtime.  We should at least do something
> like the incremental patch below instead which is simpler, faster and
> easier to understand:

The pages to be freed may come from scheduler tags (set->sched_tags),
which belong to one request queue being shut down, but set->tags->rqs[]
is shared by all request queues on the host and can be actively assigned
to from another LUN/request queue.

> 
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c1b28e09a27e..598fe82cfbcf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2311,29 +2311,20 @@ static size_t order_to_size(unsigned int order)
>  	return (size_t)PAGE_SIZE << order;
>  }
>  
> -/* called before freeing request pool in @tags */
> +/* ensure that blk_mq_find_and_get_req can't find the tags any more */
>  static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
>  		struct blk_mq_tags *tags, unsigned int hctx_idx)
>  {
>  	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
> -	struct page *page;
>  	unsigned long flags;
> +	int i;
>  
>  	spin_lock_irqsave(&drv_tags->lock, flags);
> -	list_for_each_entry(page, &tags->page_list, lru) {
> -		unsigned long start = (unsigned long)page_address(page);
> -		unsigned long end = start + order_to_size(page->private);
> -		int i;
> +	for (i = 0; i < set->queue_depth; i++) {
> +		struct request *rq = drv_tags->rqs[i];
>  
> -		for (i = 0; i < set->queue_depth; i++) {
> -			struct request *rq = drv_tags->rqs[i];
> -			unsigned long rq_addr = (unsigned long)rq;
> -
> -			if (rq_addr >= start && rq_addr < end) {
> -				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> -				cmpxchg(&drv_tags->rqs[i], rq, NULL);
> -			}
> -		}
> +		WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> +		cmpxchg(&drv_tags->rqs[i], rq, NULL);

set->tags->rqs[] is just a dynamic mapping between a host-wide driver
tag and a request, and that request may be allocated from sched tags,
which are per-request-queue, while set->tags->rqs[] is host-wide.

What if the request pointed to by 'rq' has just been assigned from
another active LUN's sched tags?

What we need to do is make sure every reference to a request that is
about to be freed gets cleared, that is all.

Thanks,
Ming
Bart Van Assche May 8, 2021, 2:02 a.m. UTC | #18
On 5/6/21 11:52 PM, Christoph Hellwig wrote:
> Can we take a step back here?  Once blk_mq_clear_rq_mapping hits we are
> deep into tearing the device down and freeing the tag_set.  So if
> blk_mq_find_and_get_req is waiting any time on the lock something is
> wrong.  We might as well just trylock in blk_mq_find_and_get_req and
> not find a request if the lock is contented as there is no point in
> waiting for the lock.  In fact we might not even need a lock, but just
> an atomic bitops in the tagset that marks it as beeing freed, that
> needs to be tested in blk_mq_find_and_get_req with the right memory
> barriers.

We need to make sure that blk_mq_find_and_get_req() in its entirety
either happens before scheduler tags are freed or after the cmpxchg()
loop has finished. Maybe I'm overlooking something but I don't see how
to do that without a wait loop, the kind of loop that is embedded in
spin_lock()?

Thanks,

Bart.

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 4a40d409f5dd..8b239dcce85f 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -203,9 +203,14 @@  static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
 		unsigned int bitnr)
 {
 	struct request *rq = tags->rqs[bitnr];
+	unsigned long flags;
 
-	if (!rq || !refcount_inc_not_zero(&rq->ref))
+	spin_lock_irqsave(&tags->lock, flags);
+	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
+		spin_unlock_irqrestore(&tags->lock, flags);
 		return NULL;
+	}
+	spin_unlock_irqrestore(&tags->lock, flags);
 	return rq;
 }
 
@@ -538,6 +543,7 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
+	spin_lock_init(&tags->lock);
 
 	if (blk_mq_is_sbitmap_shared(flags))
 		return tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333a4a..f942a601b5ef 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -20,6 +20,9 @@  struct blk_mq_tags {
 	struct request **rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	/* used to clear rqs[] before one request pool is freed */
+	spinlock_t lock;
 };
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d053e80fa6d7..c1b28e09a27e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2306,6 +2306,38 @@  blk_qc_t blk_mq_submit_bio(struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing request pool in @tags */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
+	struct page *page;
+	unsigned long flags;
+
+	spin_lock_irqsave(&drv_tags->lock, flags);
+	list_for_each_entry(page, &tags->page_list, lru) {
+		unsigned long start = (unsigned long)page_address(page);
+		unsigned long end = start + order_to_size(page->private);
+		int i;
+
+		for (i = 0; i < set->queue_depth; i++) {
+			struct request *rq = drv_tags->rqs[i];
+			unsigned long rq_addr = (unsigned long)rq;
+
+			if (rq_addr >= start && rq_addr < end) {
+				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+				cmpxchg(&drv_tags->rqs[i], rq, NULL);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&drv_tags->lock, flags);
+}
+
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx)
 {
@@ -2324,6 +2356,8 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
+	blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+
 	while (!list_empty(&tags->page_list)) {
 		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
@@ -2383,11 +2417,6 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 }
 
-static size_t order_to_size(unsigned int order)
-{
-	return (size_t)PAGE_SIZE << order;
-}
-
 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			       unsigned int hctx_idx, int node)
 {