diff mbox

[RFC] block/loop: make loop cgroup aware

Message ID dd357e1638021132cd39a11769a745a8318636d3.1503506490.git.shli@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shaohua Li Aug. 23, 2017, 6:15 p.m. UTC
From: Shaohua Li <shli@fb.com>

Not a merge request, for discussion only.

loop block device handles IO in a separate thread. The actual IO
dispatched isn't cloned from the IO loop device received, so the
dispatched IO loses the cgroup context.

I'm ignoring buffer IO case now, which is quite complicated.  Making the
loop thread aware of cgroup context doesn't really help. The loop device
only writes to a single file. In current writeback cgroup
implementation, the file can only belong to one cgroup.

For direct IO case, we could workaround the issue in theory. For
example, say we assign cgroup1 5M/s BW for loop device and cgroup2
10M/s. We can create a special cgroup for loop thread and assign at
least 15M/s for the underlayer disk. In this way, we correctly throttle
the two cgroups. But this is tricky to setup.

This patch tries to address the issue. When loop thread is handling IO,
it declares the IO is on behalf of the original task, then in block IO
we use the original task to find cgroup. The concept probably works for
other scenarios too, but right now I don't make it generic yet.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 block/bio.c                |  5 ++++-
 drivers/block/loop.c       | 14 +++++++++++++-
 drivers/block/loop.h       |  1 +
 include/linux/blk-cgroup.h |  3 ++-
 include/linux/sched.h      |  1 +
 5 files changed, 21 insertions(+), 3 deletions(-)

Comments

Vivek Goyal Aug. 23, 2017, 7:21 p.m. UTC | #1
On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> From: Shaohua Li <shli@fb.com>
> 
> Not a merge request, for discussion only.
> 
> loop block device handles IO in a separate thread. The actual IO
> dispatched isn't cloned from the IO loop device received, so the
> dispatched IO loses the cgroup context.
> 
> I'm ignoring buffer IO case now, which is quite complicated.  Making the
> loop thread aware of cgroup context doesn't really help. The loop device
> only writes to a single file. In current writeback cgroup
> implementation, the file can only belong to one cgroup.
> 
> For direct IO case, we could workaround the issue in theory. For
> example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> 10M/s. We can create a special cgroup for loop thread and assign at
> least 15M/s for the underlayer disk. In this way, we correctly throttle
> the two cgroups. But this is tricky to setup.
> 
> This patch tries to address the issue. When loop thread is handling IO,
> it declares the IO is on behalf of the original task, then in block IO
> we use the original task to find cgroup. The concept probably works for
> other scenarios too, but right now I don't make it generic yet.

Hi Shaohua, 

Can worker thread switch/move to the cgroup bio is in and do the submission
and then switch back. That way IO submitted by worker should be accounted
to the cgroup bio belongs to. Just a thought. I don't even know if that's
feasible.

Vivek

> 
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  block/bio.c                |  5 ++++-
>  drivers/block/loop.c       | 14 +++++++++++++-
>  drivers/block/loop.h       |  1 +
>  include/linux/blk-cgroup.h |  3 ++-
>  include/linux/sched.h      |  1 +
>  5 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index e241bbc..8f0df3c 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
>  
>  	get_io_context_active(ioc);
>  	bio->bi_ioc = ioc;
> -	bio->bi_css = task_get_css(current, io_cgrp_id);
> +	if (current->cgroup_task)
> +		bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> +	else
> +		bio->bi_css = task_get_css(current, io_cgrp_id);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(bio_associate_current);
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index ef83349..fefede3 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -77,7 +77,7 @@
>  #include <linux/falloc.h>
>  #include <linux/uio.h>
>  #include "loop.h"
> -
> +#include <linux/sched/task.h>
>  #include <linux/uaccess.h>
>  
>  static DEFINE_IDR(loop_index_idr);
> @@ -471,6 +471,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
>  {
>  	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
>  
> +	if (cmd->cgroup_task)
> +		put_task_struct(cmd->cgroup_task);
>  	cmd->ret = ret;
>  	blk_mq_complete_request(cmd->rq);
>  }
> @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	cmd->iocb.ki_complete = lo_rw_aio_complete;
>  	cmd->iocb.ki_flags = IOCB_DIRECT;
>  
> +	if (cmd->cgroup_task)
> +		current->cgroup_task = cmd->cgroup_task;
> +
>  	if (rw == WRITE)
>  		ret = call_write_iter(file, &cmd->iocb, &iter);
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> +	current->cgroup_task = NULL;
> +
>  	if (ret != -EIOCBQUEUED)
>  		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
>  	return 0;
> @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		break;
>  	}
>  
> +	if (cmd->use_aio) {
> +		cmd->cgroup_task = current;
> +		get_task_struct(current);
> +	} else
> +		cmd->cgroup_task = NULL;
>  	kthread_queue_work(&lo->worker, &cmd->work);
>  
>  	return BLK_STS_OK;
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index 2c096b9..eb98d4d 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -73,6 +73,7 @@ struct loop_cmd {
>  	bool use_aio;           /* use AIO interface to handle I/O */
>  	long ret;
>  	struct kiocb iocb;
> +	struct task_struct *cgroup_task;
>  };
>  
>  /* Support for loadable transfer modules */
> diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
> index 9d92153..38a5517 100644
> --- a/include/linux/blk-cgroup.h
> +++ b/include/linux/blk-cgroup.h
> @@ -232,7 +232,8 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
>  {
>  	if (bio && bio->bi_css)
>  		return css_to_blkcg(bio->bi_css);
> -	return task_blkcg(current);
> +	return task_blkcg(current->cgroup_task ?
> +			current->cgroup_task : current);
>  }
>  
>  static inline struct cgroup_subsys_state *
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8337e2d..a5958b0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -897,6 +897,7 @@ struct task_struct {
>  	struct css_set __rcu		*cgroups;
>  	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
>  	struct list_head		cg_list;
> +	struct task_struct		*cgroup_task;
>  #endif
>  #ifdef CONFIG_INTEL_RDT_A
>  	int				closid;
> -- 
> 2.9.5
Shaohua Li Aug. 23, 2017, 7:30 p.m. UTC | #2
On Wed, Aug 23, 2017 at 03:21:25PM -0400, Vivek Goyal wrote:
> On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> > From: Shaohua Li <shli@fb.com>
> > 
> > Not a merge request, for discussion only.
> > 
> > loop block device handles IO in a separate thread. The actual IO
> > dispatched isn't cloned from the IO loop device received, so the
> > dispatched IO loses the cgroup context.
> > 
> > I'm ignoring buffer IO case now, which is quite complicated.  Making the
> > loop thread aware of cgroup context doesn't really help. The loop device
> > only writes to a single file. In current writeback cgroup
> > implementation, the file can only belong to one cgroup.
> > 
> > For direct IO case, we could workaround the issue in theory. For
> > example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> > 10M/s. We can create a special cgroup for loop thread and assign at
> > least 15M/s for the underlayer disk. In this way, we correctly throttle
> > the two cgroups. But this is tricky to setup.
> > 
> > This patch tries to address the issue. When loop thread is handling IO,
> > it declares the IO is on behalf of the original task, then in block IO
> > we use the original task to find cgroup. The concept probably works for
> > other scenarios too, but right now I don't make it generic yet.
> 
> Hi Shaohua, 
> 
> Can worker thread switch/move to the cgroup bio is in and do the submision
> and then switch back. That way IO submitted by worker should be accounted
> to the cgroup bio belongs to. Just a thought. I don't even know if that's
> feasible.

That is my first attempt, but it looks like moving a thread to a cgroup is an
expensive operation.

Thanks,
Shaohua

> 
> Vivek
> 
> > 
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > ---
> >  block/bio.c                |  5 ++++-
> >  drivers/block/loop.c       | 14 +++++++++++++-
> >  drivers/block/loop.h       |  1 +
> >  include/linux/blk-cgroup.h |  3 ++-
> >  include/linux/sched.h      |  1 +
> >  5 files changed, 21 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index e241bbc..8f0df3c 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
> >  
> >  	get_io_context_active(ioc);
> >  	bio->bi_ioc = ioc;
> > -	bio->bi_css = task_get_css(current, io_cgrp_id);
> > +	if (current->cgroup_task)
> > +		bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> > +	else
> > +		bio->bi_css = task_get_css(current, io_cgrp_id);
> >  	return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(bio_associate_current);
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index ef83349..fefede3 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -77,7 +77,7 @@
> >  #include <linux/falloc.h>
> >  #include <linux/uio.h>
> >  #include "loop.h"
> > -
> > +#include <linux/sched/task.h>
> >  #include <linux/uaccess.h>
> >  
> >  static DEFINE_IDR(loop_index_idr);
> > @@ -471,6 +471,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
> >  {
> >  	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> >  
> > +	if (cmd->cgroup_task)
> > +		put_task_struct(cmd->cgroup_task);
> >  	cmd->ret = ret;
> >  	blk_mq_complete_request(cmd->rq);
> >  }
> > @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> >  	cmd->iocb.ki_complete = lo_rw_aio_complete;
> >  	cmd->iocb.ki_flags = IOCB_DIRECT;
> >  
> > +	if (cmd->cgroup_task)
> > +		current->cgroup_task = cmd->cgroup_task;
> > +
> >  	if (rw == WRITE)
> >  		ret = call_write_iter(file, &cmd->iocb, &iter);
> >  	else
> >  		ret = call_read_iter(file, &cmd->iocb, &iter);
> >  
> > +	current->cgroup_task = NULL;
> > +
> >  	if (ret != -EIOCBQUEUED)
> >  		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
> >  	return 0;
> > @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  		break;
> >  	}
> >  
> > +	if (cmd->use_aio) {
> > +		cmd->cgroup_task = current;
> > +		get_task_struct(current);
> > +	} else
> > +		cmd->cgroup_task = NULL;
> >  	kthread_queue_work(&lo->worker, &cmd->work);
> >  
> >  	return BLK_STS_OK;
> > diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> > index 2c096b9..eb98d4d 100644
> > --- a/drivers/block/loop.h
> > +++ b/drivers/block/loop.h
> > @@ -73,6 +73,7 @@ struct loop_cmd {
> >  	bool use_aio;           /* use AIO interface to handle I/O */
> >  	long ret;
> >  	struct kiocb iocb;
> > +	struct task_struct *cgroup_task;
> >  };
> >  
> >  /* Support for loadable transfer modules */
> > diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
> > index 9d92153..38a5517 100644
> > --- a/include/linux/blk-cgroup.h
> > +++ b/include/linux/blk-cgroup.h
> > @@ -232,7 +232,8 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
> >  {
> >  	if (bio && bio->bi_css)
> >  		return css_to_blkcg(bio->bi_css);
> > -	return task_blkcg(current);
> > +	return task_blkcg(current->cgroup_task ?
> > +			current->cgroup_task : current);
> >  }
> >  
> >  static inline struct cgroup_subsys_state *
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 8337e2d..a5958b0 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -897,6 +897,7 @@ struct task_struct {
> >  	struct css_set __rcu		*cgroups;
> >  	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
> >  	struct list_head		cg_list;
> > +	struct task_struct		*cgroup_task;
> >  #endif
> >  #ifdef CONFIG_INTEL_RDT_A
> >  	int				closid;
> > -- 
> > 2.9.5
Tejun Heo Aug. 28, 2017, 10:54 p.m. UTC | #3
Hello, Shaohua.

On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> loop block device handles IO in a separate thread. The actual IO
> dispatched isn't cloned from the IO loop device received, so the
> dispatched IO loses the cgroup context.

Ah, right.  Thanks for spotting this.

> I'm ignoring buffer IO case now, which is quite complicated.  Making the
> loop thread aware of cgroup context doesn't really help. The loop device
> only writes to a single file. In current writeback cgroup
> implementation, the file can only belong to one cgroup.

Yeah, writeback tracks the most active cgroup and associates writeback
ios with that cgroup.  For buffered loop devices, I think it'd be fine
to make the loop driver forward the cgroup association and let the
writeback layer determine the matching association as usual.

> For direct IO case, we could workaround the issue in theory. For
> example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> 10M/s. We can create a special cgroup for loop thread and assign at
> least 15M/s for the underlayer disk. In this way, we correctly throttle
> the two cgroups. But this is tricky to setup.
> 
> This patch tries to address the issue. When loop thread is handling IO,
> it declares the IO is on behalf of the original task, then in block IO
> we use the original task to find cgroup. The concept probably works for
> other scenarios too, but right now I don't make it generic yet.

The overall approach looks sound to me.  Some comments below.

> @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
>  
>  	get_io_context_active(ioc);
>  	bio->bi_ioc = ioc;
> -	bio->bi_css = task_get_css(current, io_cgrp_id);
> +	if (current->cgroup_task)
> +		bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> +	else
> +		bio->bi_css = task_get_css(current, io_cgrp_id);

Do we need a full pointer field for this?  I think we should be able
to mark lo writers with a flag (PF or whatever) and then to
kthread_data() to get the lo struct which contains the target css.
Oh, let's do csses instead of tasks for consistency, correctness
(please see below) and performance (csses are cheaper to pin).

> @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	cmd->iocb.ki_complete = lo_rw_aio_complete;
>  	cmd->iocb.ki_flags = IOCB_DIRECT;
>  
> +	if (cmd->cgroup_task)
> +		current->cgroup_task = cmd->cgroup_task;
> +
>  	if (rw == WRITE)
>  		ret = call_write_iter(file, &cmd->iocb, &iter);
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> +	current->cgroup_task = NULL;
> +
>  	if (ret != -EIOCBQUEUED)
>  		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
>  	return 0;

Hmm... why do we need double forwarding (original issuer -> aio cmd ->
ios) in loop?  If we need this, doesn't this mean that we're missing
ownership forwarding in aio paths and should make the forwarding
happen there?

> @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		break;
>  	}
>  
> +	if (cmd->use_aio) {
> +		cmd->cgroup_task = current;
> +		get_task_struct(current);
> +	} else
> +		cmd->cgroup_task = NULL;

What if %current isn't the original issuer of the io?  It could be a
writeback worker trying to flush to a loop device and we don't want to
attribute those ios to the writeback worker.  We should forward the
bi_css not the current task.

Thanks!
Shaohua Li Aug. 29, 2017, 3:22 p.m. UTC | #4
On Mon, Aug 28, 2017 at 03:54:59PM -0700, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> > loop block device handles IO in a separate thread. The actual IO
> > dispatched isn't cloned from the IO loop device received, so the
> > dispatched IO loses the cgroup context.
> 
> Ah, right.  Thanks for spotting this.
> 
> > I'm ignoring buffer IO case now, which is quite complicated.  Making the
> > loop thread aware of cgroup context doesn't really help. The loop device
> > only writes to a single file. In current writeback cgroup
> > implementation, the file can only belong to one cgroup.
> 
> Yeah, writeback tracks the most active cgroup and associates writeback
> ios with that cgroup.  For buffered loop devices, I think it'd be fine
> to make the loop driver forward the cgroup association and let the
> writeback layer determine the matching association as usual.

Doing this means we must forward cgroup info to page, not bio. I need to check
if we can make the mechanism work for memcg.
 
> > For direct IO case, we could workaround the issue in theory. For
> > example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> > 10M/s. We can create a special cgroup for loop thread and assign at
> > least 15M/s for the underlayer disk. In this way, we correctly throttle
> > the two cgroups. But this is tricky to setup.
> > 
> > This patch tries to address the issue. When loop thread is handling IO,
> > it declares the IO is on behalf of the original task, then in block IO
> > we use the original task to find cgroup. The concept probably works for
> > other scenarios too, but right now I don't make it generic yet.
> 
> The overall approach looks sound to me.  Some comments below.
> 
> > @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
> >  
> >  	get_io_context_active(ioc);
> >  	bio->bi_ioc = ioc;
> > -	bio->bi_css = task_get_css(current, io_cgrp_id);
> > +	if (current->cgroup_task)
> > +		bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> > +	else
> > +		bio->bi_css = task_get_css(current, io_cgrp_id);
> 
> Do we need a full pointer field for this?  I think we should be able
> to mark lo writers with a flag (PF or whatever) and then to
> kthread_data() to get the lo struct which contains the target css.
> Oh, let's do csses instead of tasks for consistency, correctness
> (please see below) and performance (csses are cheaper to pin).

Forwarding css sounds better.

> > @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> >  	cmd->iocb.ki_complete = lo_rw_aio_complete;
> >  	cmd->iocb.ki_flags = IOCB_DIRECT;
> >  
> > +	if (cmd->cgroup_task)
> > +		current->cgroup_task = cmd->cgroup_task;
> > +
> >  	if (rw == WRITE)
> >  		ret = call_write_iter(file, &cmd->iocb, &iter);
> >  	else
> >  		ret = call_read_iter(file, &cmd->iocb, &iter);
> >  
> > +	current->cgroup_task = NULL;
> > +
> >  	if (ret != -EIOCBQUEUED)
> >  		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
> >  	return 0;
> 
> Hmm... why do we need double forwarding (original issuer -> aio cmd ->
> ios) in loop?  If we need this, doesn't this mean that we're missing
> ownership forwarding in aio paths and should make the forwarding
> happen there?

what do you mean double forwarding?
> 
> > @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
> >  		break;
> >  	}
> >  
> > +	if (cmd->use_aio) {
> > +		cmd->cgroup_task = current;
> > +		get_task_struct(current);
> > +	} else
> > +		cmd->cgroup_task = NULL;
> 
> What if %current isn't the original issuer of the io?  It could be a
> writeback worker trying to flush to a loop device and we don't want to
> attribute those ios to the writeback worker.  We should forward the
> bi_css not the current task.

Makes sense.

Thanks,
Shaohua
Tejun Heo Aug. 29, 2017, 3:28 p.m. UTC | #5
Hello, Shaohua.

On Tue, Aug 29, 2017 at 08:22:36AM -0700, Shaohua Li wrote:
> > Yeah, writeback tracks the most active cgroup and associates writeback
> > ios with that cgroup.  For buffered loop devices, I think it'd be fine
> > to make the loop driver forward the cgroup association and let the
> > writeback layer determine the matching association as usual.
> 
> Doing this means we must forward cgroup info to page, not bio. I need to check
> if we can make the mechanism work for memcg.

The association is already coming from the page.  We just need to make
sure that going through loop driver doesn't get in the way of the
membership getting propagated to the underlying device.

> > Hmm... why do we need double forwarding (original issuer -> aio cmd ->
> > ios) in loop?  If we need this, doesn't this mean that we're missing
> > ownership forwarding in aio paths and should make the forwarding
> > happen there?
> 
> what do you mean double forwarding?

So, this looks like the loop driver is explicitly forwarding the
association from the original issuer to the aio command and then from
the aio command to the ios to the underlying device.  I'm wondering
whether the right way to do this is making aio forward the association
by default, instead of the loop driver doing it explicitly.  That way,
all aio users can benefit from the forwarding instead of just loop.

Thanks.
Shaohua Li Aug. 30, 2017, 5:07 a.m. UTC | #6
On Tue, Aug 29, 2017 at 08:28:09AM -0700, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Tue, Aug 29, 2017 at 08:22:36AM -0700, Shaohua Li wrote:
> > > Yeah, writeback tracks the most active cgroup and associates writeback
> > > ios with that cgroup.  For buffered loop devices, I think it'd be fine
> > > to make the loop driver forward the cgroup association and let the
> > > writeback layer determine the matching association as usual.
> > 
> > Doing this means we must forward cgroup info to page, not bio. I need to check
> > if we can make the mechanism work for memcg.
> 
> The association is already coming from the page.  We just need to make
> sure that going through loop driver doesn't get in the way of the
> membership getting propagated to the underlying device.

I think there is confusion. App writes files in upper layer fs on loop. memcg
establishes membership for the pages of these files. Then writeback does write,
loop then writes these pages to under layer fs. The write is done in loop
thread. The write will allocate new page cache for under layer fs files. The
issue is we must forward memcg info from upper layer files page cache to under
layer files page cache.

> > > Hmm... why do we need double forwarding (original issuer -> aio cmd ->
> > > ios) in loop?  If we need this, doesn't this mean that we're missing
> > > ownership forwarding in aio paths and should make the forwarding
> > > happen there?
> > 
> > what do you mean double forwarding?
> 
> So, this looks like the loop driver is explicitly forwarding the
> association from the original issuer to the aio command and then from
> the aio command to the ios to the underlying device.  I'm wondering
> whether the right way to do this is making aio forward the association
> by default, instead of the loop driver doing it explicitly.  That way,
> all aio users can benefit from the forwarding instead of just loop.

That's possible. The downside doing this in aio is we must audit all fs to make
sure all bio have association. I'd like to avoid doing this if there is no
other loop-like cgroup usage.

Thanks,
Shaohua
Tejun Heo Aug. 31, 2017, 12:36 a.m. UTC | #7
Hello, Shaohua.

On Tue, Aug 29, 2017 at 10:07:04PM -0700, Shaohua Li wrote:
> > The association is already coming from the page.  We just need to make
> > sure that going through loop driver doesn't get in the way of the
> > membership getting propagated to the underlying device.
> 
> I think there is confusion. App writes files in upper layer fs on loop. memcg

Ah, yes, for some reason, I was still thinking about direct ios.

> estabilish membership for the pages of these files. Then writeback does write,
> loop then write these pages to under layer fs. The write is done in loop

Yes.

> thread. The write will allocate new page cache for under layer fs files. The
> issue is we must forward memcg info from upper layer files page cache to under
> layer files page cache.

Ah, I see.  We need to assign the ownership of the page cache pages to
the original dirtier.  Yeah, that's a different problem.  For now,
let's concentrate on the dio case.

> > So, this looks like the loop driver is explicitly forwarding the
> > association from the original issuer to the aio command and then from
> > the aio command to the ios to the underlying device.  I'm wondering
> > whether the right way to do this is making aio forward the association
> > by default, instead of the loop driver doing it explicitly.  That way,
> > all aio users can benefit from the forwarding instead of just loop.
> 
> That's possible. The downside doing this in aio is we must audit all fs to make
> sure all bio have association. I'd like to avoid doing this if there is no
> other loop-like cgroup usage.

But wouldn't that mean that we break cgroup membership for aios that
users issue?  Can't we do this in the generic aio layer?

Thanks.
diff mbox

Patch

diff --git a/block/bio.c b/block/bio.c
index e241bbc..8f0df3c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2058,7 +2058,10 @@  int bio_associate_current(struct bio *bio)
 
 	get_io_context_active(ioc);
 	bio->bi_ioc = ioc;
-	bio->bi_css = task_get_css(current, io_cgrp_id);
+	if (current->cgroup_task)
+		bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
+	else
+		bio->bi_css = task_get_css(current, io_cgrp_id);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ef83349..fefede3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,7 +77,7 @@ 
 #include <linux/falloc.h>
 #include <linux/uio.h>
 #include "loop.h"
-
+#include <linux/sched/task.h>
 #include <linux/uaccess.h>
 
 static DEFINE_IDR(loop_index_idr);
@@ -471,6 +471,8 @@  static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+	if (cmd->cgroup_task)
+		put_task_struct(cmd->cgroup_task);
 	cmd->ret = ret;
 	blk_mq_complete_request(cmd->rq);
 }
@@ -502,11 +504,16 @@  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_complete = lo_rw_aio_complete;
 	cmd->iocb.ki_flags = IOCB_DIRECT;
 
+	if (cmd->cgroup_task)
+		current->cgroup_task = cmd->cgroup_task;
+
 	if (rw == WRITE)
 		ret = call_write_iter(file, &cmd->iocb, &iter);
 	else
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
+	current->cgroup_task = NULL;
+
 	if (ret != -EIOCBQUEUED)
 		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
 	return 0;
@@ -1705,6 +1712,11 @@  static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
+	if (cmd->use_aio) {
+		cmd->cgroup_task = current;
+		get_task_struct(current);
+	} else
+		cmd->cgroup_task = NULL;
 	kthread_queue_work(&lo->worker, &cmd->work);
 
 	return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 2c096b9..eb98d4d 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -73,6 +73,7 @@  struct loop_cmd {
 	bool use_aio;           /* use AIO interface to handle I/O */
 	long ret;
 	struct kiocb iocb;
+	struct task_struct *cgroup_task;
 };
 
 /* Support for loadable transfer modules */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9d92153..38a5517 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -232,7 +232,8 @@  static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
 	if (bio && bio->bi_css)
 		return css_to_blkcg(bio->bi_css);
-	return task_blkcg(current);
+	return task_blkcg(current->cgroup_task ?
+			current->cgroup_task : current);
 }
 
 static inline struct cgroup_subsys_state *
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8337e2d..a5958b0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -897,6 +897,7 @@  struct task_struct {
 	struct css_set __rcu		*cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head		cg_list;
+	struct task_struct		*cgroup_task;
 #endif
 #ifdef CONFIG_INTEL_RDT_A
 	int				closid;