[1/4] block: fix use-after-free on cached last_lookup partition
diff mbox series

Message ID 20200109062109.2313-2-ming.lei@redhat.com
State New
Headers show
Series
  • block: fix partition use-after-free and optimization
Related show

Commit Message

Ming Lei Jan. 9, 2020, 6:21 a.m. UTC
delete_partition() clears the cached last_lookup partition. However
the .last_lookup cache may be overwritten by one IO path after
it is cleared from delete_partition(). Then another IO path may
use the cached deleting partition after __delete_partition() is
called, then use-after-free is triggered on the cached partition.

Fixes the issue by the following approach:

1) always get the partition's refcount via hd_struct_try_get() before
setting .last_lookup

2) move clearing .last_lookup from delete_partition() to
__delete_partition() which is release handle of the partition's
percpu-refcount, so that no IO path can overwrite .last_lookup after it
is cleared in __delete_partition().

It is one candidate approach of Yufen's patch[1] which adds overhead
in fast path by indirect lookup which may introduce one extra cacheline
in IO path. Also this patch relies on percpu-refcount's protection, and
it is easier to understand and verify.

[1] https://lore.kernel.org/linux-block/20200109013551.GB9655@ming.t460p/T/#t

Reported-by: Yufen Yu <yuyufen@huawei.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hou Tao <houtao1@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c          | 12 ------------
 block/genhd.c             |  6 +++++-
 block/partition-generic.c | 10 +++++++++-
 include/linux/genhd.h     |  1 +
 4 files changed, 15 insertions(+), 14 deletions(-)

Comments

Yufen Yu Feb. 21, 2020, 3:05 a.m. UTC | #1
Hi Guys,

Did this patch have been forgotten? So, ping...

Thanks,
Yufen


On 2020/1/9 14:21, Ming Lei wrote:
> delete_partition() clears the cached last_lookup partition. However
> the .last_lookup cache may be overwritten by one IO path after
> it is cleared from delete_partition(). Then another IO path may
> use the cached deleting partition after __delete_partition() is
> called, then use-after-free is triggered on the cached partition.
> 
> Fixes the issue by the following approach:
> 
> 1) always get the partition's refcount via hd_struct_try_get() before
> setting .last_lookup
> 
> 2) move clearing .last_lookup from delete_partition() to
> __delete_partition() which is release handle of the partition's
> percpu-refcount, so that no IO path can overwrite .last_lookup after it
> is cleared in __delete_partition().
> 
> It is one candidate approach of Yufen's patch[1] which adds overhead
> in fast path by indirect lookup which may introduce one extra cacheline
> in IO path. Also this patch relies on percpu-refcount's protection, and
> it is easier to understand and verify.
> 
> [1] https://lore.kernel.org/linux-block/20200109013551.GB9655@ming.t460p/T/#t
> 
> Reported-by: Yufen Yu <yuyufen@huawei.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: Hou Tao <houtao1@huawei.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   block/blk-core.c          | 12 ------------
>   block/genhd.c             |  6 +++++-
>   block/partition-generic.c | 10 +++++++++-
>   include/linux/genhd.h     |  1 +
>   4 files changed, 15 insertions(+), 14 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 089e890ab208..79599f5fd5b7 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1365,18 +1365,6 @@ void blk_account_io_start(struct request *rq, bool new_io)
>   		part_stat_inc(part, merges[rw]);
>   	} else {
>   		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
> -		if (!hd_struct_try_get(part)) {
> -			/*
> -			 * The partition is already being removed,
> -			 * the request will be accounted on the disk only
> -			 *
> -			 * We take a reference on disk->part0 although that
> -			 * partition will never be deleted, so we can treat
> -			 * it as any other partition.
> -			 */
> -			part = &rq->rq_disk->part0;
> -			hd_struct_get(part);
> -		}
>   		part_inc_in_flight(rq->q, part, rw);
>   		rq->part = part;
>   	}
> diff --git a/block/genhd.c b/block/genhd.c
> index ff6268970ddc..6029c94510f0 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -286,17 +286,21 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
>   	ptbl = rcu_dereference(disk->part_tbl);
>   
>   	part = rcu_dereference(ptbl->last_lookup);
> -	if (part && sector_in_part(part, sector))
> +	if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
>   		return part;
>   
>   	for (i = 1; i < ptbl->len; i++) {
>   		part = rcu_dereference(ptbl->part[i]);
>   
>   		if (part && sector_in_part(part, sector)) {
> +			if (!hd_struct_try_get(part))
> +				goto exit;
>   			rcu_assign_pointer(ptbl->last_lookup, part);
>   			return part;
>   		}
>   	}
> + exit:
> +	hd_struct_get(&disk->part0);
>   	return &disk->part0;
>   }
>   EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
> diff --git a/block/partition-generic.c b/block/partition-generic.c
> index 1d20c9cf213f..1739f750dbf2 100644
> --- a/block/partition-generic.c
> +++ b/block/partition-generic.c
> @@ -262,6 +262,12 @@ static void delete_partition_work_fn(struct work_struct *work)
>   void __delete_partition(struct percpu_ref *ref)
>   {
>   	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
> +	struct disk_part_tbl *ptbl =
> +		rcu_dereference_protected(part->disk->part_tbl, 1);
> +
> +	rcu_assign_pointer(ptbl->last_lookup, NULL);
> +	put_device(disk_to_dev(part->disk));
> +
>   	INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
>   	queue_rcu_work(system_wq, &part->rcu_work);
>   }
> @@ -283,8 +289,9 @@ void delete_partition(struct gendisk *disk, int partno)
>   	if (!part)
>   		return;
>   
> +	get_device(disk_to_dev(disk));
>   	rcu_assign_pointer(ptbl->part[partno], NULL);
> -	rcu_assign_pointer(ptbl->last_lookup, NULL);
> +
>   	kobject_put(part->holder_dir);
>   	device_del(part_to_dev(part));
>   
> @@ -349,6 +356,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
>   	p->nr_sects = len;
>   	p->partno = partno;
>   	p->policy = get_disk_ro(disk);
> +	p->disk = disk;
>   
>   	if (info) {
>   		struct partition_meta_info *pinfo = alloc_part_info(disk);
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 8bb63027e4d6..1b09cfe00aa3 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -130,6 +130,7 @@ struct hd_struct {
>   	struct disk_stats dkstats;
>   #endif
>   	struct percpu_ref ref;
> +	struct gendisk *disk;
>   	struct rcu_work rcu_work;
>   };
>   
>
Bart Van Assche Feb. 21, 2020, 4:03 a.m. UTC | #2
On 2020-01-08 22:21, Ming Lei wrote:
> delete_partition() clears the cached last_lookup partition. However
> the .last_lookup cache may be overwritten by one IO path after
> it is cleared from delete_partition(). Then another IO path may
> use the cached deleting partition after __delete_partition() is
> called, then use-after-free is triggered on the cached partition.
> 
> Fixes the issue by the following approach:
> 
> 1) always get the partition's refcount via hd_struct_try_get() before
> setting .last_lookup
> 
> 2) move clearing .last_lookup from delete_partition() to
> __delete_partition() which is release handle of the partition's
> percpu-refcount, so that no IO path can overwrite .last_lookup after it
> is cleared in __delete_partition().
> 
> It is one candidate approach of Yufen's patch[1] which adds overhead
> in fast path by indirect lookup which may introduce one extra cacheline
> in IO path. Also this patch relies on percpu-refcount's protection, and
> it is easier to understand and verify.
> 
> [1] https://lore.kernel.org/linux-block/20200109013551.GB9655@ming.t460p/T/#t

Hi Ming,

disk_map_sector_rcu() is called from the I/O path only and hence with
q->q_usage_counter > 0. Has it been considered to freeze disk->queue
from delete_partition() before deleting a partition and unfreezing
disk->queue after partition deletion has finished? Would that approach
allow to eliminate partition reference counting and thereby improve the
performance of the hot path?

Thanks,

Bart.
Ming Lei Feb. 21, 2020, 7:47 a.m. UTC | #3
On Thu, Feb 20, 2020 at 08:03:52PM -0800, Bart Van Assche wrote:
> On 2020-01-08 22:21, Ming Lei wrote:
> > delete_partition() clears the cached last_lookup partition. However
> > the .last_lookup cache may be overwritten by one IO path after
> > it is cleared from delete_partition(). Then another IO path may
> > use the cached deleting partition after __delete_partition() is
> > called, then use-after-free is triggered on the cached partition.
> > 
> > Fixes the issue by the following approach:
> > 
> > 1) always get the partition's refcount via hd_struct_try_get() before
> > setting .last_lookup
> > 
> > 2) move clearing .last_lookup from delete_partition() to
> > __delete_partition() which is release handle of the partition's
> > percpu-refcount, so that no IO path can overwrite .last_lookup after it
> > is cleared in __delete_partition().
> > 
> > It is one candidate approach of Yufen's patch[1] which adds overhead
> > in fast path by indirect lookup which may introduce one extra cacheline
> > in IO path. Also this patch relies on percpu-refcount's protection, and
> > it is easier to understand and verify.
> > 
> > [1] https://lore.kernel.org/linux-block/20200109013551.GB9655@ming.t460p/T/#t
> 
> Hi Ming,
> 
> disk_map_sector_rcu() is called from the I/O path only and hence with
> q->q_usage_counter > 0. Has it been considered to freeze disk->queue
> from delete_partition() before deleting a partition and unfreezing
> disk->queue after partition deletion has finished? Would that approach
> allow to eliminate partition reference counting and thereby improve the
> performance of the hot path?

Hi Bart,

I did consider that approach, but this way may cause performance
regression, given deleting any partition drops IO performance a lot
on other un-related partitions.



Thanks,
Ming

Patch
diff mbox series

diff --git a/block/blk-core.c b/block/blk-core.c
index 089e890ab208..79599f5fd5b7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1365,18 +1365,6 @@  void blk_account_io_start(struct request *rq, bool new_io)
 		part_stat_inc(part, merges[rw]);
 	} else {
 		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-		if (!hd_struct_try_get(part)) {
-			/*
-			 * The partition is already being removed,
-			 * the request will be accounted on the disk only
-			 *
-			 * We take a reference on disk->part0 although that
-			 * partition will never be deleted, so we can treat
-			 * it as any other partition.
-			 */
-			part = &rq->rq_disk->part0;
-			hd_struct_get(part);
-		}
 		part_inc_in_flight(rq->q, part, rw);
 		rq->part = part;
 	}
diff --git a/block/genhd.c b/block/genhd.c
index ff6268970ddc..6029c94510f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -286,17 +286,21 @@  struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 	ptbl = rcu_dereference(disk->part_tbl);
 
 	part = rcu_dereference(ptbl->last_lookup);
-	if (part && sector_in_part(part, sector))
+	if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
 		return part;
 
 	for (i = 1; i < ptbl->len; i++) {
 		part = rcu_dereference(ptbl->part[i]);
 
 		if (part && sector_in_part(part, sector)) {
+			if (!hd_struct_try_get(part))
+				goto exit;
 			rcu_assign_pointer(ptbl->last_lookup, part);
 			return part;
 		}
 	}
+ exit:
+	hd_struct_get(&disk->part0);
 	return &disk->part0;
 }
 EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 1d20c9cf213f..1739f750dbf2 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -262,6 +262,12 @@  static void delete_partition_work_fn(struct work_struct *work)
 void __delete_partition(struct percpu_ref *ref)
 {
 	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
+	struct disk_part_tbl *ptbl =
+		rcu_dereference_protected(part->disk->part_tbl, 1);
+
+	rcu_assign_pointer(ptbl->last_lookup, NULL);
+	put_device(disk_to_dev(part->disk));
+
 	INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
 	queue_rcu_work(system_wq, &part->rcu_work);
 }
@@ -283,8 +289,9 @@  void delete_partition(struct gendisk *disk, int partno)
 	if (!part)
 		return;
 
+	get_device(disk_to_dev(disk));
 	rcu_assign_pointer(ptbl->part[partno], NULL);
-	rcu_assign_pointer(ptbl->last_lookup, NULL);
+
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
@@ -349,6 +356,7 @@  struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
+	p->disk = disk;
 
 	if (info) {
 		struct partition_meta_info *pinfo = alloc_part_info(disk);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 8bb63027e4d6..1b09cfe00aa3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -130,6 +130,7 @@  struct hd_struct {
 	struct disk_stats dkstats;
 #endif
 	struct percpu_ref ref;
+	struct gendisk *disk;
 	struct rcu_work rcu_work;
 };