diff mbox series

[V2,2/3] blk-mq: complete request locally if the completion is from tagset iterator

Message ID 20210427014540.2747282-3-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series blk-mq: fix request UAF related with iterating over tagset requests | expand

Commit Message

Ming Lei April 27, 2021, 1:45 a.m. UTC
rq->ref is not held when running a remote completion, and iteration
over tagset request pool is possible when another remote completion
is pending, so there is potential request UAF if request is completed
remotely from our tagset iterator helper.

Fix it by completing request locally if the completion is from tagset
iterator.

Suggested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c     | 5 ++++-
 block/blk-mq.c         | 8 ++++++++
 include/linux/blkdev.h | 2 ++
 3 files changed, 14 insertions(+), 1 deletion(-)

Comments

Bart Van Assche April 27, 2021, 2:30 a.m. UTC | #1
On 4/26/21 6:45 PM, Ming Lei wrote:
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 100fa44d52a6..773aea4db90c 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -284,8 +284,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>  	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
>  	    !blk_mq_request_started(rq))
>  		ret = true;
> -	else
> +	else {
> +		rq->rq_flags |= RQF_ITERATING;
>  		ret = iter_data->fn(rq, iter_data->data, reserved);
> +		rq->rq_flags &= ~RQF_ITERATING;
> +	}
>  	if (!iter_static_rqs)
>  		blk_mq_put_rq_ref(rq);
>  	return ret;

All existing rq->rq_flags modifications are serialized. The above change
adds code that may change rq_flags concurrently with regular request
processing. I think that counts as a race condition. Additionally, the
RQF_ITERATING flag won't be set correctly in the (unlikely) case that
two concurrent bt_tags_iter() calls examine the same request at the same
time.

Thanks,

Bart.
Ming Lei April 27, 2021, 7:06 a.m. UTC | #2
On Mon, Apr 26, 2021 at 07:30:51PM -0700, Bart Van Assche wrote:
> On 4/26/21 6:45 PM, Ming Lei wrote:
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 100fa44d52a6..773aea4db90c 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -284,8 +284,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
> >  	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
> >  	    !blk_mq_request_started(rq))
> >  		ret = true;
> > -	else
> > +	else {
> > +		rq->rq_flags |= RQF_ITERATING;
> >  		ret = iter_data->fn(rq, iter_data->data, reserved);
> > +		rq->rq_flags &= ~RQF_ITERATING;
> > +	}
> >  	if (!iter_static_rqs)
> >  		blk_mq_put_rq_ref(rq);
> >  	return ret;
> 
> All existing rq->rq_flags modifications are serialized. The above change
> adds code that may change rq_flags concurrently with regular request
> processing. I think that counts as a race condition.

Good catch, but we still can change .rq_flags via atomic op, such as:

	do {
		old = rq->rq_flags;
		new = old | RQF_ITERATING;
	} while (cmpxchg(&rq->rq_flags, old, new) != old);

> Additionally, the
> RQF_ITERATING flag won't be set correctly in the (unlikely) case that
> two concurrent bt_tags_iter() calls examine the same request at the same
> time.

If the driver completes request from two concurrent bt_tags_iter(), there has
been big trouble of double completion, so I'd rather not to consider this case.


Thanks,
Ming
Ming Lei April 27, 2021, 9 a.m. UTC | #3
On Tue, Apr 27, 2021 at 03:06:51PM +0800, Ming Lei wrote:
> On Mon, Apr 26, 2021 at 07:30:51PM -0700, Bart Van Assche wrote:
> > On 4/26/21 6:45 PM, Ming Lei wrote:
> > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > > index 100fa44d52a6..773aea4db90c 100644
> > > --- a/block/blk-mq-tag.c
> > > +++ b/block/blk-mq-tag.c
> > > @@ -284,8 +284,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
> > >  	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
> > >  	    !blk_mq_request_started(rq))
> > >  		ret = true;
> > > -	else
> > > +	else {
> > > +		rq->rq_flags |= RQF_ITERATING;
> > >  		ret = iter_data->fn(rq, iter_data->data, reserved);
> > > +		rq->rq_flags &= ~RQF_ITERATING;
> > > +	}
> > >  	if (!iter_static_rqs)
> > >  		blk_mq_put_rq_ref(rq);
> > >  	return ret;
> > 
> > All existing rq->rq_flags modifications are serialized. The above change
> > adds code that may change rq_flags concurrently with regular request
> > processing. I think that counts as a race condition.
> 
> Good catch, but we still can change .rq_flags via atomic op, such as:
> 
> 	do {
> 		old = rq->rq_flags;
> 		new = old | RQF_ITERATING;
> 	} while (cmpxchg(&rq->rq_flags, old, new) != old);

oops, this way can't work because other .rq_flags modification still may
clear RQF_ITERATING.

As I mentioned in another thread, blk-mq needn't to consider the double
completion[1] any more, which is covered by driver. blk-mq just needs to
make sure that valid request is passed to ->fn(), and it is driver's
responsibility to avoid double completion.

[1] https://lore.kernel.org/linux-block/YIdWz8C5eklPvEiV@T590/T/#u


Thanks,
Ming
Bart Van Assche April 27, 2021, 2:53 p.m. UTC | #4
On 4/27/21 12:06 AM, Ming Lei wrote:
> On Mon, Apr 26, 2021 at 07:30:51PM -0700, Bart Van Assche wrote:
>> On 4/26/21 6:45 PM, Ming Lei wrote:
>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>> index 100fa44d52a6..773aea4db90c 100644
>>> --- a/block/blk-mq-tag.c
>>> +++ b/block/blk-mq-tag.c
>>> @@ -284,8 +284,11 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>  	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
>>>  	    !blk_mq_request_started(rq))
>>>  		ret = true;
>>> -	else
>>> +	else {
>>> +		rq->rq_flags |= RQF_ITERATING;
>>>  		ret = iter_data->fn(rq, iter_data->data, reserved);
>>> +		rq->rq_flags &= ~RQF_ITERATING;
>>> +	}
>>>  	if (!iter_static_rqs)
>>>  		blk_mq_put_rq_ref(rq);
>>>  	return ret;
>>
>> All existing rq->rq_flags modifications are serialized. The above change
>> adds code that may change rq_flags concurrently with regular request
>> processing. I think that counts as a race condition.
> 
> Good catch, but we still can change .rq_flags via atomic op, such as:
> 
> 	do {
> 		old = rq->rq_flags;
> 		new = old | RQF_ITERATING;
> 	} while (cmpxchg(&rq->rq_flags, old, new) != old);

That's not sufficient because the above would not work correctly in
combination with statements like the following:

	rq->rq_flags &= ~RQF_MQ_INFLIGHT;
	req->rq_flags |= RQF_TIMED_OUT;

How about setting a flag in 'current', just like the memalloc_noio_*()
functions set or clear PF_MEMALLOC_NOIO to indicate whether or not
GFP_NOIO should be used? That should work fine in thread context and
also in interrupt context.

>> Additionally, the
>> RQF_ITERATING flag won't be set correctly in the (unlikely) case that
>> two concurrent bt_tags_iter() calls examine the same request at the same
>> time.
> 
> If the driver completes request from two concurrent bt_tags_iter(), there has
> been big trouble of double completion, so I'd rather not to consider this case.

bt_tags_iter() may be used for other purposes than completing requests.
Here is an example of a blk_mq_tagset_busy_iter() call (from debugfs)
that may run concurrently with other calls of that function:

static int hctx_busy_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct show_busy_params params = { .m = m, .hctx = hctx };

	blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
				&params);

	return 0;
}

Bart.
diff mbox series

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 100fa44d52a6..773aea4db90c 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -284,8 +284,11 @@  static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
 	    !blk_mq_request_started(rq))
 		ret = true;
-	else
+	else {
+		rq->rq_flags |= RQF_ITERATING;
 		ret = iter_data->fn(rq, iter_data->data, reserved);
+		rq->rq_flags &= ~RQF_ITERATING;
+	}
 	if (!iter_static_rqs)
 		blk_mq_put_rq_ref(rq);
 	return ret;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4bd6c11bd8bc..ae06e5b3f215 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -654,6 +654,14 @@  bool blk_mq_complete_request_remote(struct request *rq)
 	if (rq->cmd_flags & REQ_HIPRI)
 		return false;
 
+	/*
+	 * Complete the request locally if it is being completed via tagset
+	 * iterator helper for avoiding UAF because rq->ref isn't held when
+	 * running remote completion via IPI or softirq
+	 */
+	if (rq->rq_flags & RQF_ITERATING)
+		return false;
+
 	if (blk_mq_complete_need_ipi(rq)) {
 		blk_mq_complete_send_ipi(rq);
 		return true;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f2e77ba97550..3b9bc4381dab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -102,6 +102,8 @@  typedef __u32 __bitwise req_flags_t;
 #define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
+/* The request is being iterated by blk-mq iterator API */
+#define RQF_ITERATING		((__force req_flags_t)(1 << 22))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \