
[8/8] blk-mq: clear stale request in tags->rq[] before freeing one request pool

Message ID 20210425085753.2617424-9-ming.lei@redhat.com (mailing list archive)
State Not Applicable
Series blk-mq: fix request UAF related with iterating over tagset requests

Commit Message

Ming Lei April 25, 2021, 8:57 a.m. UTC
refcount_inc_not_zero() in bt_tags_iter() may still read a freed
request.

Fix the issue with the following approach:

1) hold a per-tags spinlock when reading ->rqs[tag] and calling
refcount_inc_not_zero() in bt_tags_iter()

2) clear any stale request referenced via ->rqs[tag] before freeing
the request pool, holding the same per-tags spinlock while clearing
->rqs[tag]

After the stale requests have been cleared, bt_tags_iter() can no
longer observe a freed request, and because both sides hold the
per-tags lock, the clearing also waits for any in-progress reference
grab to finish.
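
The following condensed sketch shows the two sides that the per-tags
lock serializes (the real code is in the diff below;
points_into_freed_pool() is only a placeholder for the page address
range check done in blk_mq_clear_rq_mapping()):

	/* iterator side, bt_tags_iter(): take the reference under the lock */
	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;		/* slot empty or request being freed */
	spin_unlock_irqrestore(&tags->lock, flags);

	/* free side, blk_mq_clear_rq_mapping(): clear stale slots under the lock */
	spin_lock_irqsave(&drv_tags->lock, flags);
	for (i = 0; i < set->queue_depth; i++) {
		struct request *rq = drv_tags->rqs[i];

		/* placeholder for the page address range check */
		if (points_into_freed_pool(rq))
			cmpxchg(&drv_tags->rqs[i], rq, NULL);
	}
	spin_unlock_irqrestore(&drv_tags->lock, flags);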

The idea of clearing ->rqs[] is borrowed from John Garry's previous
patch and a recent patch from David Jeffery.

Cc: John Garry <john.garry@huawei.com>
Cc: David Jeffery <djeffery@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c |  9 ++++++++-
 block/blk-mq-tag.h |  3 +++
 block/blk-mq.c     | 39 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 6 deletions(-)

Comments

Bart Van Assche April 25, 2021, 8:42 p.m. UTC | #1
On 4/25/21 1:57 AM, Ming Lei wrote:
> +	spin_lock_irqsave(&drv_tags->lock, flags);
> +	list_for_each_entry(page, &tags->page_list, lru) {
> +		unsigned long start = (unsigned long)page_address(page);
> +		unsigned long end = start + order_to_size(page->private);
> +		int i;
> +
> +		for (i = 0; i < set->queue_depth; i++) {
> +			struct request *rq = drv_tags->rqs[i];
> +			unsigned long rq_addr = (unsigned long)rq;
> +
> +			if (rq_addr >= start && rq_addr < end) {
> +				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> +				cmpxchg(&drv_tags->rqs[i], rq, NULL);
> +			}
> +		}
> +	}
> +	spin_unlock_irqrestore(&drv_tags->lock, flags);

Using cmpxchg() on set->tags[] is only safe if all other set->tags[]
accesses are changed into WRITE_ONCE() or READ_ONCE().

Bart.
Ming Lei April 26, 2021, 12:49 a.m. UTC | #2
On Sun, Apr 25, 2021 at 01:42:59PM -0700, Bart Van Assche wrote:
> On 4/25/21 1:57 AM, Ming Lei wrote:
> > +	spin_lock_irqsave(&drv_tags->lock, flags);
> > +	list_for_each_entry(page, &tags->page_list, lru) {
> > +		unsigned long start = (unsigned long)page_address(page);
> > +		unsigned long end = start + order_to_size(page->private);
> > +		int i;
> > +
> > +		for (i = 0; i < set->queue_depth; i++) {
> > +			struct request *rq = drv_tags->rqs[i];
> > +			unsigned long rq_addr = (unsigned long)rq;
> > +
> > +			if (rq_addr >= start && rq_addr < end) {
> > +				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
> > +				cmpxchg(&drv_tags->rqs[i], rq, NULL);
> > +			}
> > +		}
> > +	}
> > +	spin_unlock_irqrestore(&drv_tags->lock, flags);
> 
> Using cmpxchg() on set->tags[] is only safe if all other set->tags[]
> accesses are changed into WRITE_ONCE() or READ_ONCE().

Why?

The semantics of cmpxchg() are to modify the value the address points
to only if its old value matches the passed 'rq'. That is exactly what
we need.

Writing a 'void *' is always atomic. If someone has touched
'->rqs[tag]', cmpxchg() won't modify the value.



Thanks, 
Ming
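
As an illustration of the compare-and-swap semantics described above
(plain C11 atomics in userspace, not the kernel cmpxchg() primitive),
the slot is only cleared when it still holds the expected old pointer:

	#include <stdatomic.h>
	#include <stdio.h>

	int main(void)
	{
		int a = 1, b = 2;
		_Atomic(int *) slot = &a;	/* stands in for tags->rqs[i] */
		int *expected = &a;

		/* old value matches: the slot is atomically cleared */
		if (atomic_compare_exchange_strong(&slot, &expected, NULL))
			printf("stale pointer cleared\n");

		atomic_store(&slot, &b);	/* someone re-assigned the slot */
		expected = &a;
		/* old value no longer matches: the slot is left untouched */
		if (!atomic_compare_exchange_strong(&slot, &expected, NULL))
			printf("slot was re-used, not modified\n");
		return 0;
	}
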
Bart Van Assche April 26, 2021, 1:50 a.m. UTC | #3
On 4/25/21 5:49 PM, Ming Lei wrote:
> On Sun, Apr 25, 2021 at 01:42:59PM -0700, Bart Van Assche wrote:
>> Using cmpxchg() on set->tags[] is only safe if all other set->tags[]
>> accesses are changed into WRITE_ONCE() or READ_ONCE().
> 
> Why?
> 
> The semantics of cmpxchg() are to modify the value the address points
> to only if its old value matches the passed 'rq'. That is exactly what
> we need.
> 
> Writing a 'void *' is always atomic. If someone has touched
> '->rqs[tag]', cmpxchg() won't modify the value.

WRITE_ONCE() supports data types that have the same size as char, short,
int, long and long long. That includes void *. If writes to these data
types were always atomic, we wouldn't need the WRITE_ONCE()
macro. The explanation at the top of the rwonce.h header file is as
follows: "Prevent the compiler from merging or refetching reads or
writes. [ ... ] Ensuring that the compiler does not fold, spindle, or
otherwise mutilate accesses that either do not require ordering or that
interact with an explicit memory barrier or atomic instruction that
provides the required ordering."

Bart.
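
To make the suggestion above concrete, here is a sketch (illustration
only, not part of the posted patch) of a lockless ->rqs[] reader
annotated so it pairs with the cmpxchg() writer; READ_ONCE() only
prevents the compiler from refetching, tearing or merging the load:

	/*
	 * Hypothetical annotated reader; the writer side is the cmpxchg()
	 * in blk_mq_clear_rq_mapping().
	 */
	static struct request *blk_mq_tag_to_rq_sketch(struct blk_mq_tags *tags,
						       unsigned int tag)
	{
		return READ_ONCE(tags->rqs[tag]);
	}
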
Ming Lei April 26, 2021, 2:07 a.m. UTC | #4
On Sun, Apr 25, 2021 at 06:50:48PM -0700, Bart Van Assche wrote:
> On 4/25/21 5:49 PM, Ming Lei wrote:
> > On Sun, Apr 25, 2021 at 01:42:59PM -0700, Bart Van Assche wrote:
> >> Using cmpxchg() on set->tags[] is only safe if all other set->tags[]
> >> accesses are changed into WRITE_ONCE() or READ_ONCE().
> > 
> > Why?
> > 
> > The semantics of cmpxchg() are to modify the value the address points
> > to only if its old value matches the passed 'rq'. That is exactly what
> > we need.
> > 
> > Writing a 'void *' is always atomic. If someone has touched
> > '->rqs[tag]', cmpxchg() won't modify the value.
> 
> WRITE_ONCE() supports data types that have the same size as char, short,
> int, long and long long. That includes void *. If writes to these data
> types were always atomic, we wouldn't need the WRITE_ONCE()
> macro.

OK, then we don't need WRITE_ONCE(), since a write to tags->rqs[i] is
always atomic.


Thanks, 
Ming

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 489d2db89856..402a7860f6c9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -265,10 +265,12 @@  static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
 	struct request *rq;
 	bool ret;
+	unsigned long flags;
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
 
+	spin_lock_irqsave(&tags->lock, flags);
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
@@ -277,8 +279,12 @@  static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 		rq = tags->static_rqs[bitnr];
 	else
 		rq = tags->rqs[bitnr];
-	if (!rq || !refcount_inc_not_zero(&rq->ref))
+	if (!rq || !refcount_inc_not_zero(&rq->ref)) {
+		spin_unlock_irqrestore(&tags->lock, flags);
 		return true;
+	}
+	spin_unlock_irqrestore(&tags->lock, flags);
+
 	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
 	    !blk_mq_request_started(rq))
 		ret = true;
@@ -524,6 +530,7 @@  struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
+	spin_lock_init(&tags->lock);
 
 	if (blk_mq_is_sbitmap_shared(flags))
 		return tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333a4a..f942a601b5ef 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -20,6 +20,9 @@  struct blk_mq_tags {
 	struct request **rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	/* used to clear rqs[] before one request pool is freed */
+	spinlock_t lock;
 };
 
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9a4d520740a1..1f913f786c1f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2307,6 +2307,38 @@  blk_qc_t blk_mq_submit_bio(struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing request pool in @tags */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+	struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
+	struct page *page;
+	unsigned long flags;
+
+	spin_lock_irqsave(&drv_tags->lock, flags);
+	list_for_each_entry(page, &tags->page_list, lru) {
+		unsigned long start = (unsigned long)page_address(page);
+		unsigned long end = start + order_to_size(page->private);
+		int i;
+
+		for (i = 0; i < set->queue_depth; i++) {
+			struct request *rq = drv_tags->rqs[i];
+			unsigned long rq_addr = (unsigned long)rq;
+
+			if (rq_addr >= start && rq_addr < end) {
+				WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+				cmpxchg(&drv_tags->rqs[i], rq, NULL);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&drv_tags->lock, flags);
+}
+
 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx)
 {
@@ -2325,6 +2357,8 @@  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
+	blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+
 	while (!list_empty(&tags->page_list)) {
 		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
@@ -2384,11 +2418,6 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	return tags;
 }
 
-static size_t order_to_size(unsigned int order)
-{
-	return (size_t)PAGE_SIZE << order;
-}
-
 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			       unsigned int hctx_idx, int node)
 {