Message ID | 20210505145855.174127-4-ming.lei@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | blk-mq: fix request UAF related with iterating over tagset requests | expand |
On Wed, May 05, 2021 at 10:58:54PM +0800, Ming Lei wrote: > refcount_inc_not_zero() in bt_tags_iter() still may read one freed > request. > > Fix the issue by the following approach: > > 1) hold a per-tags spinlock when reading ->rqs[tag] and calling > refcount_inc_not_zero in bt_tags_iter() > > 2) clearing stale request referred via ->rqs[tag] before freeing > request pool, the per-tags spinlock is held for clearing stale > ->rq[tag] > > So after we cleared stale requests, bt_tags_iter() won't observe > freed request any more, also the clearing will wait for pending > request reference. > > The idea of clearing ->rqs[] is borrowed from John Garry's previous > patch and one recent David's patch. > > Tested-by: John Garry <john.garry@huawei.com> > Reviewed-by: David Jeffery <djeffery@redhat.com> > Reviewed-by: Bart Van Assche <bvanassche@acm.org> > Signed-off-by: Ming Lei <ming.lei@redhat.com> > --- > block/blk-mq-tag.c | 8 +++++++- > block/blk-mq-tag.h | 3 +++ > block/blk-mq.c | 39 ++++++++++++++++++++++++++++++++++----- > 3 files changed, 44 insertions(+), 6 deletions(-) > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index 4a40d409f5dd..8b239dcce85f 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > unsigned int bitnr) > { > struct request *rq = tags->rqs[bitnr]; > + unsigned long flags; > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > + spin_lock_irqsave(&tags->lock, flags); > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > + spin_unlock_irqrestore(&tags->lock, flags); > return NULL; > + } > + spin_unlock_irqrestore(&tags->lock, flags); > return rq; > } > > @@ -538,6 +543,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, > > tags->nr_tags = total_tags; > tags->nr_reserved_tags = reserved_tags; > + spin_lock_init(&tags->lock); > > if (blk_mq_is_sbitmap_shared(flags)) > return tags; > diff --git a/block/blk-mq-tag.h 
b/block/blk-mq-tag.h > index 7d3e6b333a4a..f942a601b5ef 100644 > --- a/block/blk-mq-tag.h > +++ b/block/blk-mq-tag.h > @@ -20,6 +20,9 @@ struct blk_mq_tags { > struct request **rqs; > struct request **static_rqs; > struct list_head page_list; > + > + /* used to clear rqs[] before one request pool is freed */ > + spinlock_t lock; > }; > > extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, > diff --git a/block/blk-mq.c b/block/blk-mq.c > index d053e80fa6d7..c1b28e09a27e 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2306,6 +2306,38 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) > return BLK_QC_T_NONE; > } > > +static size_t order_to_size(unsigned int order) > +{ > + return (size_t)PAGE_SIZE << order; > +} > + > +/* called before freeing request pool in @tags */ > +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, > + struct blk_mq_tags *tags, unsigned int hctx_idx) > +{ > + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > + struct page *page; > + unsigned long flags; > + > + spin_lock_irqsave(&drv_tags->lock, flags); > + list_for_each_entry(page, &tags->page_list, lru) { > + unsigned long start = (unsigned long)page_address(page); > + unsigned long end = start + order_to_size(page->private); > + int i; > + > + for (i = 0; i < set->queue_depth; i++) { > + struct request *rq = drv_tags->rqs[i]; > + unsigned long rq_addr = (unsigned long)rq; > + > + if (rq_addr >= start && rq_addr < end) { > + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); > + cmpxchg(&drv_tags->rqs[i], rq, NULL); > + } > + } > + } > + spin_unlock_irqrestore(&drv_tags->lock, flags); This looks really strange. Why do we need the cmpxchg while under the lock anyway? Why iterate the allocated pages and not just clear the drv_tags valus directly? 
static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; unsigned int i = 0; unsigned long flags; spin_lock_irqsave(&drv_tags->lock, flags); for (i = 0; i < set->queue_depth; i++) WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); drv_tags->rqs[i] = NULL; } spin_unlock_irqrestore(&drv_tags->lock, flags); }
On Thu, May 06, 2021 at 08:12:56AM +0100, Christoph Hellwig wrote: > On Wed, May 05, 2021 at 10:58:54PM +0800, Ming Lei wrote: > > refcount_inc_not_zero() in bt_tags_iter() still may read one freed > > request. > > > > Fix the issue by the following approach: > > > > 1) hold a per-tags spinlock when reading ->rqs[tag] and calling > > refcount_inc_not_zero in bt_tags_iter() > > > > 2) clearing stale request referred via ->rqs[tag] before freeing > > request pool, the per-tags spinlock is held for clearing stale > > ->rq[tag] > > > > So after we cleared stale requests, bt_tags_iter() won't observe > > freed request any more, also the clearing will wait for pending > > request reference. > > > > The idea of clearing ->rqs[] is borrowed from John Garry's previous > > patch and one recent David's patch. > > > > Tested-by: John Garry <john.garry@huawei.com> > > Reviewed-by: David Jeffery <djeffery@redhat.com> > > Reviewed-by: Bart Van Assche <bvanassche@acm.org> > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > > --- > > block/blk-mq-tag.c | 8 +++++++- > > block/blk-mq-tag.h | 3 +++ > > block/blk-mq.c | 39 ++++++++++++++++++++++++++++++++++----- > > 3 files changed, 44 insertions(+), 6 deletions(-) > > > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > > index 4a40d409f5dd..8b239dcce85f 100644 > > --- a/block/blk-mq-tag.c > > +++ b/block/blk-mq-tag.c > > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > > unsigned int bitnr) > > { > > struct request *rq = tags->rqs[bitnr]; > > + unsigned long flags; > > > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > > + spin_lock_irqsave(&tags->lock, flags); > > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > > + spin_unlock_irqrestore(&tags->lock, flags); > > return NULL; > > + } > > + spin_unlock_irqrestore(&tags->lock, flags); > > return rq; > > } > > > > @@ -538,6 +543,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, > > > > tags->nr_tags = total_tags; 
> > tags->nr_reserved_tags = reserved_tags; > > + spin_lock_init(&tags->lock); > > > > if (blk_mq_is_sbitmap_shared(flags)) > > return tags; > > diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h > > index 7d3e6b333a4a..f942a601b5ef 100644 > > --- a/block/blk-mq-tag.h > > +++ b/block/blk-mq-tag.h > > @@ -20,6 +20,9 @@ struct blk_mq_tags { > > struct request **rqs; > > struct request **static_rqs; > > struct list_head page_list; > > + > > + /* used to clear rqs[] before one request pool is freed */ > > + spinlock_t lock; > > }; > > > > extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, > > diff --git a/block/blk-mq.c b/block/blk-mq.c > > index d053e80fa6d7..c1b28e09a27e 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -2306,6 +2306,38 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) > > return BLK_QC_T_NONE; > > } > > > > +static size_t order_to_size(unsigned int order) > > +{ > > + return (size_t)PAGE_SIZE << order; > > +} > > + > > +/* called before freeing request pool in @tags */ > > +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, > > + struct blk_mq_tags *tags, unsigned int hctx_idx) > > +{ > > + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > > + struct page *page; > > + unsigned long flags; > > + > > + spin_lock_irqsave(&drv_tags->lock, flags); > > + list_for_each_entry(page, &tags->page_list, lru) { > > + unsigned long start = (unsigned long)page_address(page); > > + unsigned long end = start + order_to_size(page->private); > > + int i; > > + > > + for (i = 0; i < set->queue_depth; i++) { > > + struct request *rq = drv_tags->rqs[i]; > > + unsigned long rq_addr = (unsigned long)rq; > > + > > + if (rq_addr >= start && rq_addr < end) { > > + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); > > + cmpxchg(&drv_tags->rqs[i], rq, NULL); > > + } > > + } > > + } > > + spin_unlock_irqrestore(&drv_tags->lock, flags); > > This looks really strange. Why do we need the cmpxchg while under > the lock anyway? 
> Why iterate the allocated pages and not just clear Lock is for protecting free vs. iterating. > the drv_tags valus directly? > > static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, > struct blk_mq_tags *tags, unsigned int hctx_idx) > { > struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > unsigned int i = 0; > unsigned long flags; > > spin_lock_irqsave(&drv_tags->lock, flags); > for (i = 0; i < set->queue_depth; i++) > WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); > drv_tags->rqs[i] = NULL; drv_tags->rqs[] may be assigned from another LUN at the same time, then the simple clearing will overwrite the active assignment. We just need to clear mapping iff the stored rq will be freed. Thanks, Ming
On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote: > > { > > struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > > unsigned int i = 0; > > unsigned long flags; > > > > spin_lock_irqsave(&drv_tags->lock, flags); > > for (i = 0; i < set->queue_depth; i++) > > WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); > > drv_tags->rqs[i] = NULL; > > drv_tags->rqs[] may be assigned from another LUN at the same time, then > the simple clearing will overwrite the active assignment. We just need > to clear mapping iff the stored rq will be freed. So. Even a different LUN shares the same tagset. So I can see the need for the cmpxchg (please document it!), but I don't see the need for the complex iteration. All the rqs are freed in one single loop, so we can just iterate through them sequentially. Btw, I think ->lock might better be named ->clear_lock to document its purpose better.
On Thu, May 06, 2021 at 01:18:49PM +0100, Christoph Hellwig wrote: > On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote: > > > { > > > struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > > > unsigned int i = 0; > > > unsigned long flags; > > > > > > spin_lock_irqsave(&drv_tags->lock, flags); > > > for (i = 0; i < set->queue_depth; i++) > > > WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); > > > drv_tags->rqs[i] = NULL; > > > > drv_tags->rqs[] may be assigned from another LUN at the same time, then > > the simple clearing will overwrite the active assignment. We just need > > to clear mapping iff the stored rq will be freed. > > So. Even a different LUN shares the same tagset. So I can see the > need for the cmpxchg (please document it!), but I don't see the need > for the complex iteration. All the rqs are freed in one single loop, > so we can just iterate through them sequentially. That is exactly what the patch is doing, requests are stored in page list, so check if one request(covered in page list) reference in drv_tags->rq[i] exists, if yes, we clear the request reference. The code is actually sort of self-document: before we free requests, clear the reference in hostwide drv->rqs[]. > > Btw, I think ->lock might better be named ->clear_lock to document its > purpose better. OK. thanks, Ming
On 5/6/21 5:18 AM, Christoph Hellwig wrote: > On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote: >>> { >>> struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; >>> unsigned int i = 0; >>> unsigned long flags; >>> >>> spin_lock_irqsave(&drv_tags->lock, flags); >>> for (i = 0; i < set->queue_depth; i++) >>> WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); >>> drv_tags->rqs[i] = NULL; >> >> drv_tags->rqs[] may be assigned from another LUN at the same time, then >> the simple clearing will overwrite the active assignment. We just need >> to clear mapping iff the stored rq will be freed. > > So. Even a different LUN shares the same tagset. So I can see the > need for the cmpxchg (please document it!), but I don't see the need > for the complex iteration. All the rqs are freed in one single loop, > so we can just iterate through them sequentially. > > Btw, I think ->lock might better be named ->clear_lock to document its > purpose better. I'm not sure that would be a better name since I don't think that it is necessary to hold that lock around the cmpxchg() calls. How about using something like the following code in blk_mq_clear_rq_mapping() instead of the code in v5 of patch 3/4? spin_lock_irqsave(&drv_tags->lock, flags); spin_unlock_irqrestore(&drv_tags->lock, flags); list_for_each_entry(page, &tags->page_list, lru) { /* use cmpxchg() to clear request pointer selectively */ } Thanks, Bart.
On 5/5/21 7:58 AM, Ming Lei wrote: > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index 4a40d409f5dd..8b239dcce85f 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > unsigned int bitnr) > { > struct request *rq = tags->rqs[bitnr]; > + unsigned long flags; > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > + spin_lock_irqsave(&tags->lock, flags); > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > + spin_unlock_irqrestore(&tags->lock, flags); > return NULL; > + } > + spin_unlock_irqrestore(&tags->lock, flags); > return rq; > } Has it been considered to change the body of the if-statement into the following? rq = NULL; That change would reduce the number of return statements from two to one. Thanks, Bart.
On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote: > On 5/6/21 5:18 AM, Christoph Hellwig wrote: > > On Thu, May 06, 2021 at 03:34:17PM +0800, Ming Lei wrote: > >>> { > >>> struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > >>> unsigned int i = 0; > >>> unsigned long flags; > >>> > >>> spin_lock_irqsave(&drv_tags->lock, flags); > >>> for (i = 0; i < set->queue_depth; i++) > >>> WARN_ON_ONCE(refcount_read(&drv_tags->rqs[i]->ref) != 0); > >>> drv_tags->rqs[i] = NULL; > >> > >> drv_tags->rqs[] may be assigned from another LUN at the same time, then > >> the simple clearing will overwrite the active assignment. We just need > >> to clear mapping iff the stored rq will be freed. > > > > So. Even a different LUN shares the same tagset. So I can see the > > need for the cmpxchg (please document it!), but I don't see the need > > for the complex iteration. All the rqs are freed in one single loop, > > so we can just iterate through them sequentially. > > > > Btw, I think ->lock might better be named ->clear_lock to document its > > purpose better. > > I'm not sure that would be a better name since I don't think that it is > necessary to hold that lock around the cmpxchg() calls. How about using > something like the following code in blk_mq_clear_rq_mapping() instead > of the code in v5 of patch 3/4? > > spin_lock_irqsave(&drv_tags->lock, flags); > spin_unlock_irqrestore(&drv_tags->lock, flags); > > list_for_each_entry(page, &tags->page_list, lru) { > /* use cmpxchg() to clear request pointer selectively */ > } This way won't work as expected because iterating may happen between releasing drv_tags->lock and cmpxchg(->rqs[]), then freed requests may still be touched during iteration after they are freed in blk_mq_free_rqs(). Thanks, Ming
On Thu, May 06, 2021 at 08:02:53AM -0700, Bart Van Assche wrote: > On 5/5/21 7:58 AM, Ming Lei wrote: > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > > index 4a40d409f5dd..8b239dcce85f 100644 > > --- a/block/blk-mq-tag.c > > +++ b/block/blk-mq-tag.c > > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > > unsigned int bitnr) > > { > > struct request *rq = tags->rqs[bitnr]; > > + unsigned long flags; > > > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > > + spin_lock_irqsave(&tags->lock, flags); > > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > > + spin_unlock_irqrestore(&tags->lock, flags); > > return NULL; > > + } > > + spin_unlock_irqrestore(&tags->lock, flags); > > return rq; > > } > Has it been considered to change the body of the if-statement into the > following? > > rq = NULL; > > That change would reduce the number of return statements from two to one. Looks cleaner, will take it in V6. Thanks, Ming
On 5/6/21 5:11 PM, Ming Lei wrote: > On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote: >> I'm not sure that would be a better name since I don't think that it is >> necessary to hold that lock around the cmpxchg() calls. How about using >> something like the following code in blk_mq_clear_rq_mapping() instead >> of the code in v5 of patch 3/4? >> >> spin_lock_irqsave(&drv_tags->lock, flags); >> spin_unlock_irqrestore(&drv_tags->lock, flags); >> >> list_for_each_entry(page, &tags->page_list, lru) { >> /* use cmpxchg() to clear request pointer selectively */ >> } > > This way won't work as expected because iterating may happen between > releasing drv_tags->lock and cmpxchg(->rqs[]), then freed requests > may still be touched during iteration after they are freed in blk_mq_free_rqs(). Right, the unlock should happen after the pointers have been cleared. But I think it is safe to move the spin_lock call down such that both the spin_lock and spin_unlock call happen after the pointers have been cleared. That is sufficient to guarantee that all blk_mq_find_and_get_req() calls either happen before or after the spin lock / unlock pair. blk_mq_find_and_get_req() calls that happen before the pair happen before the request pointers are freed. Calls that happen after the spin lock / unlock pair will either read NULL or a pointer to a request that is associated with another queue and hence won't trigger a use-after-free. Bart.
On 5/5/21 7:58 AM, Ming Lei wrote: > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index 4a40d409f5dd..8b239dcce85f 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > unsigned int bitnr) > { > struct request *rq = tags->rqs[bitnr]; > + unsigned long flags; > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > + spin_lock_irqsave(&tags->lock, flags); > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > + spin_unlock_irqrestore(&tags->lock, flags); > return NULL; > + } > + spin_unlock_irqrestore(&tags->lock, flags); > return rq; > } Shouldn't the 'rq = tags->rqs[bitnr]' assignment be protected by tags->lock too? Otherwise a request pointer could be read before the request pointer clearing happens and the refcount_inc_not_zero() call could happen after the request clearing. Thanks, Bart.
On Thu, May 06, 2021 at 06:10:09PM -0700, Bart Van Assche wrote: > On 5/6/21 5:11 PM, Ming Lei wrote: > > On Thu, May 06, 2021 at 07:51:53AM -0700, Bart Van Assche wrote: > > > I'm not sure that would be a better name since I don't think that it is > > > necessary to hold that lock around the cmpxchg() calls. How about using > > > something like the following code in blk_mq_clear_rq_mapping() instead > > > of the code in v5 of patch 3/4? > > > > > > spin_lock_irqsave(&drv_tags->lock, flags); > > > spin_unlock_irqrestore(&drv_tags->lock, flags); > > > > > > list_for_each_entry(page, &tags->page_list, lru) { > > > /* use cmpxchg() to clear request pointer selectively */ > > > } > > > > This way won't work as expected because iterating may happen between > > releasing drv_tags->lock and cmpxchg(->rqs[]), then freed requests > > may still be touched during iteration after they are freed in blk_mq_free_rqs(). > > Right, the unlock should happen after the pointers have been cleared. But I > think it is safe to move the spin_lock call down such that both the > spin_lock and spin_unlock call happen after the pointers have been cleared. > That is sufficient to guarantee that all blk_mq_find_and_get_req() calls > either happen before or after the spin lock / unlock pair. > blk_mq_find_and_get_req() calls that happen before the pair happen before > the request pointers are freed. Calls that happen after the spin lock / > unlock pair will either read NULL or a pointer to a request that is > associated with another queue and hence won't trigger a use-after-free. Putting the lock pair after clearing rq mapping should work, but not see any benefit: not very readable, and memory barrier knowledge is required for understanding its correctness(cmpxchg has to be completed before unlock), ..., so is it better idea to move lock pair after clearing rq mapping? Thanks, Ming
On Thu, May 06, 2021 at 06:11:59PM -0700, Bart Van Assche wrote: > On 5/5/21 7:58 AM, Ming Lei wrote: > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > > index 4a40d409f5dd..8b239dcce85f 100644 > > --- a/block/blk-mq-tag.c > > +++ b/block/blk-mq-tag.c > > @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, > > unsigned int bitnr) > > { > > struct request *rq = tags->rqs[bitnr]; > > + unsigned long flags; > > - if (!rq || !refcount_inc_not_zero(&rq->ref)) > > + spin_lock_irqsave(&tags->lock, flags); > > + if (!rq || !refcount_inc_not_zero(&rq->ref)) { > > + spin_unlock_irqrestore(&tags->lock, flags); > > return NULL; > > + } > > + spin_unlock_irqrestore(&tags->lock, flags); > > return rq; > > } > > Shouldn't the 'rq = tags->rqs[bitnr]' assignment be protected by tags->lock > too? Otherwise a request pointer could be read before the request pointer > clearing happens and the refcount_inc_not_zero() call could happen after the > request clearing. Right, will fix it.
On 5/6/21 7:05 PM, Ming Lei wrote: > Putting the lock pair after clearing rq mapping should work, but not see > any benefit: not very readable, and memory barrier knowledge is required for > understanding its correctness(cmpxchg has to be completed before unlock), ..., > so is it better idea to move lock pair after clearing rq mapping? It depends on how much time will be spent inside blk_mq_clear_rq_mapping(). If the time spent in the nested loop in blk_mq_clear_rq_mapping() would be significant then the proposed change will help to reduce interrupt latency in blk_mq_find_and_get_req(). Thanks, Bart.
On Thu, May 06, 2021 at 08:16:25PM -0700, Bart Van Assche wrote: > On 5/6/21 7:05 PM, Ming Lei wrote: > > Putting the lock pair after clearing rq mapping should work, but not see > > any benefit: not very readable, and memory barrier knowledge is required for > > understanding its correctness(cmpxchg has to be completed before unlock), ..., > > so is it better idea to move lock pair after clearing rq mapping? > > It depends on how much time will be spent inside > blk_mq_clear_rq_mapping(). If the time spent in the nested loop in > blk_mq_clear_rq_mapping() would be significant then the proposed change > will help to reduce interrupt latency in blk_mq_find_and_get_req(). interrupt latency in blk_mq_find_and_get_req() shouldn't be increased because interrupt won't be disabled when spinning on the lock. But interrupt may be disabled for a while in blk_mq_clear_rq_mapping() in case of big nr_requests and hw queue depth. Fair enough, will take this way for not holding lock for too long. Thanks, Ming
On Fri, May 07, 2021 at 02:31:02PM +0800, Ming Lei wrote: > > It depends on how much time will be spent inside > > blk_mq_clear_rq_mapping(). If the time spent in the nested loop in > > blk_mq_clear_rq_mapping() would be significant then the proposed change > > will help to reduce interrupt latency in blk_mq_find_and_get_req(). > > interrupt latency in blk_mq_find_and_get_req() shouldn't be increased > because interrupt won't be disabled when spinning on the lock. But interrupt > may be disabled for a while in blk_mq_clear_rq_mapping() in case of big > nr_requests and hw queue depth. > > Fair enough, will take this way for not holding lock for too long. Can we take a step back here? Once blk_mq_clear_rq_mapping hits we are deep into tearing the device down and freeing the tag_set. So if blk_mq_find_and_get_req is waiting any time on the lock something is wrong. We might as well just trylock in blk_mq_find_and_get_req and not find a request if the lock is contented as there is no point in waiting for the lock. In fact we might not even need a lock, but just an atomic bitops in the tagset that marks it as beeing freed, that needs to be tested in blk_mq_find_and_get_req with the right memory barriers.
On Thu, May 06, 2021 at 09:38:41PM +0800, Ming Lei wrote: > > So. Even a different LUN shares the same tagset. So I can see the > > need for the cmpxchg (please document it!), but I don't see the need > > for the complex iteration. All the rqs are freed in one single loop, > > so we can just iterate through them sequentially. > > That is exactly what the patch is doing, requests are stored in page > list, so check if one request(covered in page list) reference in > drv_tags->rq[i] exists, if yes, we clear the request reference. > > The code is actually sort of self-document: before we free requests, > clear the reference in hostwide drv->rqs[]. What the patch does it to do a completely pointless nested loop. Instead of just looping through all requests which is simple and fast it loops through each page, and then does another loop inside that, just increasing complexity and runtime. We should at least do something like the incremental patch below instead which is simpler, faster and easier to understand: diff --git a/block/blk-mq.c b/block/blk-mq.c index c1b28e09a27e..598fe82cfbcf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2311,29 +2311,20 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } -/* called before freeing request pool in @tags */ +/* ensure that blk_mq_find_and_get_req can't find the tags any more */ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; - struct page *page; unsigned long flags; + int i; spin_lock_irqsave(&drv_tags->lock, flags); - list_for_each_entry(page, &tags->page_list, lru) { - unsigned long start = (unsigned long)page_address(page); - unsigned long end = start + order_to_size(page->private); - int i; + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; - for (i = 0; i < set->queue_depth; i++) { - struct request *rq = drv_tags->rqs[i]; - unsigned 
long rq_addr = (unsigned long)rq; - - if (rq_addr >= start && rq_addr < end) { - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); - cmpxchg(&drv_tags->rqs[i], rq, NULL); - } - } + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); } spin_unlock_irqrestore(&drv_tags->lock, flags); }
On Fri, May 07, 2021 at 07:57:23AM +0100, Christoph Hellwig wrote: > On Thu, May 06, 2021 at 09:38:41PM +0800, Ming Lei wrote: > > > So. Even a different LUN shares the same tagset. So I can see the > > > need for the cmpxchg (please document it!), but I don't see the need > > > for the complex iteration. All the rqs are freed in one single loop, > > > so we can just iterate through them sequentially. > > > > That is exactly what the patch is doing, requests are stored in page > > list, so check if one request(covered in page list) reference in > > drv_tags->rq[i] exists, if yes, we clear the request reference. > > > > The code is actually sort of self-document: before we free requests, > > clear the reference in hostwide drv->rqs[]. > > What the patch does it to do a completely pointless nested loop. > Instead of just looping through all requests which is simple and fast > it loops through each page, and then does another loop inside that, > just increasing complexity and runtime. We should at least do something > like the incremental patch below instead which is simpler, faster and > easier to understand: The pages to be freed may be from scheduler tags(set->sched_tags), which belongs to one request queue being shutdown, but set->tags->rqs[] is shared by all request queues in the host, and it can be actively assigned from other LUN/request queue. 
> > > diff --git a/block/blk-mq.c b/block/blk-mq.c > index c1b28e09a27e..598fe82cfbcf 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -2311,29 +2311,20 @@ static size_t order_to_size(unsigned int order) > return (size_t)PAGE_SIZE << order; > } > > -/* called before freeing request pool in @tags */ > +/* ensure that blk_mq_find_and_get_req can't find the tags any more */ > static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, > struct blk_mq_tags *tags, unsigned int hctx_idx) > { > struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; > - struct page *page; > unsigned long flags; > + int i; > > spin_lock_irqsave(&drv_tags->lock, flags); > - list_for_each_entry(page, &tags->page_list, lru) { > - unsigned long start = (unsigned long)page_address(page); > - unsigned long end = start + order_to_size(page->private); > - int i; > + for (i = 0; i < set->queue_depth; i++) { > + struct request *rq = drv_tags->rqs[i]; > > - for (i = 0; i < set->queue_depth; i++) { > - struct request *rq = drv_tags->rqs[i]; > - unsigned long rq_addr = (unsigned long)rq; > - > - if (rq_addr >= start && rq_addr < end) { > - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); > - cmpxchg(&drv_tags->rqs[i], rq, NULL); > - } > - } > + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); > + cmpxchg(&drv_tags->rqs[i], rq, NULL); set->tags->rqs[] is just one dynamic mapping between host-wide driver tag and request which may be allocated from sched tags which is per-request-queue, and set->tags->rqs[] is host wide. What if the request pointed by 'rq' is just assigned from another active LUN's sched tags? What we need to do is to make sure every reference to being freed request is cleared, that is all. Thanks, Ming
On 5/6/21 11:52 PM, Christoph Hellwig wrote: > Can we take a step back here? Once blk_mq_clear_rq_mapping hits we are > deep into tearing the device down and freeing the tag_set. So if > blk_mq_find_and_get_req is waiting any time on the lock something is > wrong. We might as well just trylock in blk_mq_find_and_get_req and > not find a request if the lock is contented as there is no point in > waiting for the lock. In fact we might not even need a lock, but just > an atomic bitops in the tagset that marks it as beeing freed, that > needs to be tested in blk_mq_find_and_get_req with the right memory > barriers. We need to make sure that blk_mq_find_and_get_req() in its entirety either happens before scheduler tags are freed or after the cmpxchg() loop has finished. Maybe I'm overlooking something but I don't see how to do that without a wait loop, the kind of loop that is embedded in spin_lock()? Thanks, Bart.
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 4a40d409f5dd..8b239dcce85f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -203,9 +203,14 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq = tags->rqs[bitnr]; + unsigned long flags; - if (!rq || !refcount_inc_not_zero(&rq->ref)) + spin_lock_irqsave(&tags->lock, flags); + if (!rq || !refcount_inc_not_zero(&rq->ref)) { + spin_unlock_irqrestore(&tags->lock, flags); return NULL; + } + spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -538,6 +543,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); if (blk_mq_is_sbitmap_shared(flags)) return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7d3e6b333a4a..f942a601b5ef 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -20,6 +20,9 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; + + /* used to clear rqs[] before one request pool is freed */ + spinlock_t lock; }; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index d053e80fa6d7..c1b28e09a27e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2306,6 +2306,38 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + spin_lock_irqsave(&drv_tags->lock, flags); + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + 
order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -2324,6 +2356,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2383,11 +2417,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return tags; } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) {