blk-cgroup: check blkcg policy is enabled in blkg_create()

Message ID 20211008072720.797814-1-yukuai3@huawei.com (mailing list archive)
State New, archived

Commit Message

Yu Kuai Oct. 8, 2021, 7:27 a.m. UTC
Our test reports a null pointer dereference:

[  168.534653] ==================================================================
[  168.535614] Disabling lock debugging due to kernel taint
[  168.536346] BUG: kernel NULL pointer dereference, address: 0000000000000008
[  168.537274] #PF: supervisor read access in kernel mode
[  168.537964] #PF: error_code(0x0000) - not-present page
[  168.538667] PGD 0 P4D 0
[  168.539025] Oops: 0000 [#1] PREEMPT SMP KASAN
[  168.539656] CPU: 13 PID: 759 Comm: bash Tainted: G    B             5.15.0-rc2-next-202100
[  168.540954] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_0738364
[  168.542736] RIP: 0010:bfq_pd_init+0x88/0x1e0
[  168.543318] Code: 98 00 00 00 e8 c9 e4 5b ff 4c 8b 65 00 49 8d 7c 24 08 e8 bb e4 5b ff 4d0
[  168.545803] RSP: 0018:ffff88817095f9c0 EFLAGS: 00010002
[  168.546497] RAX: 0000000000000001 RBX: ffff888101a1c000 RCX: 0000000000000000
[  168.547438] RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff888106553428
[  168.548402] RBP: ffff888106553400 R08: ffffffff961bcaf4 R09: 0000000000000001
[  168.549365] R10: ffffffffa2e16c27 R11: fffffbfff45c2d84 R12: 0000000000000000
[  168.550291] R13: ffff888101a1c098 R14: ffff88810c7a08c8 R15: ffffffffa55541a0
[  168.551221] FS:  00007fac75227700(0000) GS:ffff88839ba80000(0000) knlGS:0000000000000000
[  168.552278] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  168.553040] CR2: 0000000000000008 CR3: 0000000165ce7000 CR4: 00000000000006e0
[  168.554000] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  168.554929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  168.555888] Call Trace:
[  168.556221]  <TASK>
[  168.556510]  blkg_create+0x1c0/0x8c0
[  168.556989]  blkg_conf_prep+0x574/0x650
[  168.557502]  ? stack_trace_save+0x99/0xd0
[  168.558033]  ? blkcg_conf_open_bdev+0x1b0/0x1b0
[  168.558629]  tg_set_conf.constprop.0+0xb9/0x280
[  168.559231]  ? kasan_set_track+0x29/0x40
[  168.559758]  ? kasan_set_free_info+0x30/0x60
[  168.560344]  ? tg_set_limit+0xae0/0xae0
[  168.560853]  ? do_sys_openat2+0x33b/0x640
[  168.561383]  ? do_sys_open+0xa2/0x100
[  168.561877]  ? __x64_sys_open+0x4e/0x60
[  168.562383]  ? __kasan_check_write+0x20/0x30
[  168.562951]  ? copyin+0x48/0x70
[  168.563390]  ? _copy_from_iter+0x234/0x9e0
[  168.563948]  tg_set_conf_u64+0x17/0x20
[  168.564467]  cgroup_file_write+0x1ad/0x380
[  168.565014]  ? cgroup_file_poll+0x80/0x80
[  168.565568]  ? __mutex_lock_slowpath+0x30/0x30
[  168.566165]  ? pgd_free+0x100/0x160
[  168.566649]  kernfs_fop_write_iter+0x21d/0x340
[  168.567246]  ? cgroup_file_poll+0x80/0x80
[  168.567796]  new_sync_write+0x29f/0x3c0
[  168.568314]  ? new_sync_read+0x410/0x410
[  168.568840]  ? __handle_mm_fault+0x1c97/0x2d80
[  168.569425]  ? copy_page_range+0x2b10/0x2b10
[  168.570007]  ? _raw_read_lock_bh+0xa0/0xa0
[  168.570622]  vfs_write+0x46e/0x630
[  168.571091]  ksys_write+0xcd/0x1e0
[  168.571563]  ? __x64_sys_read+0x60/0x60
[  168.572081]  ? __kasan_check_write+0x20/0x30
[  168.572659]  ? do_user_addr_fault+0x446/0xff0
[  168.573264]  __x64_sys_write+0x46/0x60
[  168.573774]  do_syscall_64+0x35/0x80
[  168.574264]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  168.574960] RIP: 0033:0x7fac74915130
[  168.575456] Code: 73 01 c3 48 8b 0d 58 ed 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 444
[  168.577969] RSP: 002b:00007ffc3080e288 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  168.578986] RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007fac74915130
[  168.579937] RDX: 0000000000000009 RSI: 000056007669f080 RDI: 0000000000000001
[  168.580884] RBP: 000056007669f080 R08: 000000000000000a R09: 00007fac75227700
[  168.581841] R10: 000056007655c8f0 R11: 0000000000000246 R12: 0000000000000009
[  168.582796] R13: 0000000000000001 R14: 00007fac74be55e0 R15: 00007fac74be08c0
[  168.583757]  </TASK>
[  168.584063] Modules linked in:
[  168.584494] CR2: 0000000000000008
[  168.584964] ---[ end trace 2475611ad0f77a1a ]---

This is because blkg_alloc() is called from blkg_conf_prep() without
holding 'q->queue_lock', and the elevator is exited before blkg_create():

thread 1                            thread 2
blkg_conf_prep
 spin_lock_irq(&q->queue_lock);
 blkg_lookup_check -> return NULL
 spin_unlock_irq(&q->queue_lock);

 blkg_alloc
  blkcg_policy_enabled -> true
  pd = ->pd_alloc_fn
  blkg->pd[i] = pd
                                   blk_mq_exit_sched
                                    bfq_exit_queue
                                     blkcg_deactivate_policy
                                      spin_lock_irq(&q->queue_lock);
                                      __clear_bit(pol->plid, q->blkcg_pols);
                                      spin_unlock_irq(&q->queue_lock);
                                    q->elevator = NULL;
  spin_lock_irq(&q->queue_lock);
   blkg_create
    if (blkg->pd[i])
     ->pd_init_fn -> q->elevator is NULL
  spin_unlock_irq(&q->queue_lock);

Fix the problem by checking that policy is still enabled in
blkg_create().
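
The enablement check itself is a plain bit test on q->blkcg_pols
(blkcg_policy_enabled() in block/blk-cgroup.c is essentially):

	static bool blkcg_policy_enabled(struct request_queue *q,
					 const struct blkcg_policy *pol)
	{
		return pol && test_bit(pol->plid, q->blkcg_pols);
	}

Since blkcg_deactivate_policy() clears that bit under q->queue_lock,
re-checking it once the lock has been re-acquired in blkg_create()
closes the window shown above.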

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-cgroup.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

Comments

Michal Koutný Oct. 11, 2021, 3:23 p.m. UTC | #1
Hello.

On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai <yukuai3@huawei.com> wrote:
> This is because blkg_alloc() is called from blkg_conf_prep() without
> holding 'q->queue_lock', and the elevator is exited before blkg_create():
 
IIUC the problematic interleaving is this one (I've added `blkg->pd[i]
= NULL` to the thread 2 call trace):

> thread 1                            thread 2
> blkg_conf_prep
>  spin_lock_irq(&q->queue_lock);
>  blkg_lookup_check -> return NULL
>  spin_unlock_irq(&q->queue_lock);
> 
>  blkg_alloc
>   blkcg_policy_enabled -> true
>   pd = ->pd_alloc_fn
>                                    blk_mq_exit_sched
>                                     bfq_exit_queue
>                                      blkcg_deactivate_policy
>                                       spin_lock_irq(&q->queue_lock);
>                                       __clear_bit(pol->plid, q->blkcg_pols);
>
                                        pol->pd_free_fn(blkg->pd[i]);
                                        blkg->pd[i] = NULL;
>
>                                       spin_unlock_irq(&q->queue_lock);
>                                     q->elevator = NULL;
    blkg->pd[i] = pd
>   spin_lock_irq(&q->queue_lock);
>    blkg_create
>     if (blkg->pd[i])
>      ->pd_init_fn -> q->elevator is NULL
>   spin_unlock_irq(&q->queue_lock);

In high-level terms, is this a race between (blk)io controller attribute
write and a device scheduler (elevator) switch?
If so, I'd add it to the commit message.

> Fix the problem by checking that policy is still enabled in
> blkg_create().

Is this sufficient wrt some other q->elevator users later?

> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>  		goto err_free_blkg;
>  	}
>  

I'd add a comment here like:

> Re-check policies are still enabled, since the caller blkg_conf_prep()
> temporarily drops q->queue_lock and we can race with
> blk_mq_exit_sched() removing policies.

> +	if (new_blkg)
> +		blkg_check_pd(q, new_blkg);
> +
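
Spelled out, the hunk would then read (just the comment above merged
into your change):

	+	/*
	+	 * Re-check policies are still enabled, since the caller
	+	 * blkg_conf_prep() temporarily drops q->queue_lock and we
	+	 * can race with blk_mq_exit_sched() removing policies.
	+	 */
	+	if (new_blkg)
	+		blkg_check_pd(q, new_blkg);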

Thanks,
Michal
Tejun Heo Oct. 11, 2021, 5:16 p.m. UTC | #2
On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index eb48090eefce..00e1d97621ea 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
>  }
>  EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>  
> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
> +{
> +	int i;
> +
> +	for (i = 0; i < BLKCG_MAX_POLS; i++) {
> +		struct blkcg_policy *pol = blkcg_policy[i];
> +
> +		if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
> +			pol->pd_free_fn(blkg->pd[i]);
> +			blkg->pd[i] = NULL;
> +		}
> +	}
> +}
> +
>  /*
>   * If @new_blkg is %NULL, this function tries to allocate a new one as
>   * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>  		goto err_free_blkg;
>  	}
>  
> +	if (new_blkg)
> +		blkg_check_pd(q, new_blkg);
> +

Can't this happen the other way around too? ie. Linking a pd which doesn't
have an entry for a policy which got enabled inbetween? And what if an
existing policy was de-registered and another policy got the policy id
inbetween? I think the correct solution here would be synchronizing alloc -
create blocks against policy deactivation rather than trying to patch an
allocated blkg later. Deactivation being a really slow path, there are
plenty of options. The main challenge would be making it difficult to make
mistakes with, I guess.
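
A minimal sketch of that direction, with a purely hypothetical
per-queue lock (the name and placement are made up to show the shape;
this is not existing kernel code):

	/* hypothetical: serializes blkg alloc+create vs. policy changes */
	struct mutex blkcg_pol_sync;

	/* blkg_conf_prep() side: held across the unlocked window */
	mutex_lock(&q->blkcg_pol_sync);
	new_blkg = blkg_alloc(blkcg, q, GFP_KERNEL);
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(blkcg, q, new_blkg);
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_pol_sync);

	/* blkcg_deactivate_policy() side */
	mutex_lock(&q->blkcg_pol_sync);
	spin_lock_irq(&q->queue_lock);
	__clear_bit(pol->plid, q->blkcg_pols);
	/* ... pd_free_fn() on each blkg's pd ... */
	spin_unlock_irq(&q->queue_lock);
	mutex_unlock(&q->blkcg_pol_sync);

With something like that, q->blkcg_pols can't change between
blkg_alloc() and blkg_create() seeing the result.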

Thanks.
Yu Kuai Oct. 12, 2021, 1:09 a.m. UTC | #3
On 2021/10/11 23:23, Michal Koutný wrote:
> Hello.
> 
> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai <yukuai3@huawei.com> wrote:
>> This is because blkg_alloc() is called from blkg_conf_prep() without
>> holding 'q->queue_lock', and the elevator is exited before blkg_create():
>   
> IIUC the problematic interleaving is this one (I've added `blkg->pd[i]
> = NULL` to the thread 2 call trace):

The new blkg will not be added to the blkg_list until pd_init_fn() is
done in blkg_create(), so blkcg_deactivate_policy() can't access this blkg.
> 
>> thread 1                            thread 2
>> blkg_conf_prep
>>   spin_lock_irq(&q->queue_lock);
>>   blkg_lookup_check -> return NULL
>>   spin_unlock_irq(&q->queue_lock);
>>
>>   blkg_alloc
>>    blkcg_policy_enabled -> true
>>    pd = ->pd_alloc_fn
>>                                     blk_mq_exit_sched
>>                                      bfq_exit_queue
>>                                       blkcg_deactivate_policy
>>                                        spin_lock_irq(&q->queue_lock);
>>                                        __clear_bit(pol->plid, q->blkcg_pols);
>>
>                                          pol->pd_free_fn(blkg->pd[i]);
>                                          blkg->pd[i] = NULL;
>>
>>                                        spin_unlock_irq(&q->queue_lock);
>>                                      q->elevator = NULL;
>      blkg->pd[i] = pd
>>    spin_lock_irq(&q->queue_lock);
>>     blkg_create
>>      if (blkg->pd[i])
>>       ->pd_init_fn -> q->elevator is NULL
>>    spin_unlock_irq(&q->queue_lock);
> 
> In high-level terms, is this a race between (blk)io controller attribute
> write and a device scheduler (elevator) switch?
> If so, I'd add it to the commit message.
> 
>> Fix the problem by checking that policy is still enabled in
>> blkg_create().
> 
> Is this sufficient wrt some other q->elevator users later?
> 
>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>>   		goto err_free_blkg;
>>   	}
>>   
> 
> I'd add a comment here like:
> 
>> Re-check policies are still enabled, since the caller blkg_conf_prep()
>> temporarily drops q->queue_lock and we can race with
>> blk_mq_exit_sched() removing policies.

Thanks for your advice.

Best regards,
Kuai
> 
>> +	if (new_blkg)
>> +		blkg_check_pd(q, new_blkg);
>> +
> 
> Thanks,
> Michal
> .
>
Yu Kuai Oct. 12, 2021, 1:39 a.m. UTC | #4
On 2021/10/12 1:16, Tejun Heo wrote:
> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index eb48090eefce..00e1d97621ea 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
>>   }
>>   EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>>   
>> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < BLKCG_MAX_POLS; i++) {
>> +		struct blkcg_policy *pol = blkcg_policy[i];
>> +
>> +		if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
>> +			pol->pd_free_fn(blkg->pd[i]);
>> +			blkg->pd[i] = NULL;
>> +		}
>> +	}
>> +}
>> +
>>   /*
>>    * If @new_blkg is %NULL, this function tries to allocate a new one as
>>    * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>>   		goto err_free_blkg;
>>   	}
>>   
>> +	if (new_blkg)
>> +		blkg_check_pd(q, new_blkg);
>> +
> 
> Can't this happen the other way around too? ie. Linking a pd which doesn't
> have an entry for a policy which got enabled inbetween? And what if an
> existing policy was de-registered and another policy got the policy id
> inbetween? I think the correct solution here would be synchronizing alloc -
> create blocks against policy deactivation rather than trying to patch an
> allocated blkg later. Deactivation being a really slow path, there are
> plenty of options. The main challenge would be making it difficult to make
> mistakes with, I guess.

For the case where a policy was de-registered, I think there won't be
a problem: pd_init_fn() has not been called yet and the blkg is not on
the blkg_list, so it's fine to use this blkg for the new policy.

For the case where a policy got enabled in between, the problem is
that the blkg still doesn't have a pd entry for that policy; perhaps
we could additionally call pd_alloc_fn() in blkg_create()?

If checking the blkg in blkg_create() is not a good solution and we
decide to synchronize blkg alloc-create against policy deactivation:
since only the bfq policy can be deactivated or activated while the
queue is not dying, and the queue is frozen during activation and
deactivation, can we grab q->q_usage_counter and put it after
blkg_create() is done, to prevent concurrent bfq policy activation and
deactivation?
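
As a rough sketch (error handling elided; blk_queue_enter() and
blk_queue_exit() are the existing helpers that pin q->q_usage_counter):

	ret = blk_queue_enter(q, 0);	/* pins q->q_usage_counter */
	if (ret)
		return ret;
	new_blkg = blkg_alloc(blkcg, q, GFP_KERNEL);
	spin_lock_irq(&q->queue_lock);
	blkg = blkg_create(blkcg, q, new_blkg);
	spin_unlock_irq(&q->queue_lock);
	blk_queue_exit(q);	/* lets blk_mq_freeze_queue() drain */

so that blk_mq_freeze_queue(), which waits for q_usage_counter to
drain, can't complete in the middle of the alloc-create sequence (but
see the follow-up below).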

Thanks,
Kuai
> 
> Thanks.
>
Yu Kuai Oct. 13, 2021, 11:47 a.m. UTC | #5
On 2021/10/12 9:39, yukuai (C) wrote:
> On 2021/10/12 1:16, Tejun Heo wrote:
>> On Fri, Oct 08, 2021 at 03:27:20PM +0800, Yu Kuai wrote:
>>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>>> index eb48090eefce..00e1d97621ea 100644
>>> --- a/block/blk-cgroup.c
>>> +++ b/block/blk-cgroup.c
>>> @@ -226,6 +226,20 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
>>>   }
>>>   EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
>>> +static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
>>> +{
>>> +    int i;
>>> +
>>> +    for (i = 0; i < BLKCG_MAX_POLS; i++) {
>>> +        struct blkcg_policy *pol = blkcg_policy[i];
>>> +
>>> +        if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
>>> +            pol->pd_free_fn(blkg->pd[i]);
>>> +            blkg->pd[i] = NULL;
>>> +        }
>>> +    }
>>> +}
>>> +
>>>   /*
>>>    * If @new_blkg is %NULL, this function tries to allocate a new one as
>>>    * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
>>> @@ -252,6 +266,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
>>>           goto err_free_blkg;
>>>       }
>>> +    if (new_blkg)
>>> +        blkg_check_pd(q, new_blkg);
>>> +
>>
>> Can't this happen the other way around too? ie. Linking a pd which 
>> doesn't
>> have an entry for a policy which got enabled inbetween? And what if an
>> existing policy was de-registered and another policy got the policy id
>> inbetween? I think the correct solution here would be synchronizing 
>> alloc -
>> create blocks against policy deactivation rather than trying to patch an
>> allocated blkg later. Deactivation being a really slow path, there are
>> plenty of options. The main challenge would be making it difficult to make
>> mistakes with, I guess.
> 
> For the case where a policy was de-registered, I think there won't be
> a problem: pd_init_fn() has not been called yet and the blkg is not on
> the blkg_list, so it's fine to use this blkg for the new policy.
> 
> For the case where a policy got enabled in between, the problem is
> that the blkg still doesn't have a pd entry for that policy; perhaps
> we could additionally call pd_alloc_fn() in blkg_create()?
> 
> If checking the blkg in blkg_create() is not a good solution and we
> decide to synchronize blkg alloc-create against policy deactivation:
> since only the bfq policy can be deactivated or activated while the
> queue is not dying, and the queue is frozen during activation and
> deactivation, can we grab q->q_usage_counter and put it after
> blkg_create() is done, to prevent concurrent bfq policy activation and
> deactivation?

Just found that blkcg_deactivate_policy() will call
blk_mq_freeze_queue(), so grabbing q->q_usage_counter is wrong...

Thanks,
Kuai

Patch

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb48090eefce..00e1d97621ea 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -226,6 +226,20 @@  struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
+static void blkg_check_pd(struct request_queue *q, struct blkcg_gq *blkg)
+{
+	int i;
+
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && !blkcg_policy_enabled(q, pol)) {
+			pol->pd_free_fn(blkg->pd[i]);
+			blkg->pd[i] = NULL;
+		}
+	}
+}
+
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
  * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
@@ -252,6 +266,9 @@  static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
+	if (new_blkg)
+		blkg_check_pd(q, new_blkg);
+
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);