diff mbox

[07/11] blktrace: export cgroup info in trace

Message ID e9f2d4ba1b376f7bfc920a4f3dcc94aa13ff8555.1496432591.git.shli@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shaohua Li June 2, 2017, 9:54 p.m. UTC
From: Shaohua Li <shli@fb.com>

Currently blktrace isn't cgroup aware. blktrace prints out task name of
current context, but the task of current context isn't always in the
cgroup where the BIO comes from. We can't use task name to find out IO
cgroup. For example, Writeback BIOs always comes from flusher thread but
the BIOs are for different blk cgroups. Request could be requeued and
dispatched from completely different tasks. MD/DM are another examples.

This patch tries to fix the gap. We print out cgroup fhandle info in
blktrace. Userspace can use open_by_handle_at() syscall to find the
cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to
find fhandle for a cgroup and use a BPF program to filter out blktrace
for a specific cgroup.

We add a new 'blk_cgroup' trace option for blk tracer. It's default off.
Application which doesn't know the new option isn't affected.  When it's
on, we output fhandle info right after blk_io_trace with an extra bit
set in event action. So from application point of view, blktrace with
the option will output new actions.

I didn't change blk trace event yet, since I'm not sure if changing the
trace event output is an ABI issue. If not, I'll do it later.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/uapi/linux/blktrace_api.h |   3 +
 kernel/trace/blktrace.c           | 230 ++++++++++++++++++++++++++------------
 2 files changed, 160 insertions(+), 73 deletions(-)

Comments

kernel test robot June 3, 2017, 5:35 a.m. UTC | #1
Hi Shaohua,

[auto build test WARNING on driver-core/driver-core-testing]
[also build test WARNING on v4.12-rc3 next-20170602]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Shaohua-Li/kernfs-implement-i_generation/20170603-083132
reproduce: make htmldocs

All warnings (new ones prefixed by >>):

   WARNING: convert(1) not found, for SVG to PDF conversion install ImageMagick (https://www.imagemagick.org)
   arch/x86/include/asm/uaccess_32.h:1: warning: no structured comments found
>> kernel/trace/blktrace.c:750: warning: No description found for parameter 'cgid'
   kernel/trace/blktrace.c:807: warning: No description found for parameter 'cgid'
   include/linux/init.h:1: warning: no structured comments found
   include/linux/mod_devicetable.h:686: warning: Excess struct/union/enum/typedef member 'ver_major' description in 'fsl_mc_device_id'
   include/linux/mod_devicetable.h:686: warning: Excess struct/union/enum/typedef member 'ver_minor' description in 'fsl_mc_device_id'
   kernel/sched/core.c:2088: warning: No description found for parameter 'rf'
   kernel/sched/core.c:2088: warning: Excess function parameter 'cookie' description in 'try_to_wake_up_local'
   include/linux/kthread.h:26: warning: Excess function parameter '...' description in 'kthread_create'
   kernel/sys.c:1: warning: no structured comments found
   include/linux/device.h:969: warning: No description found for parameter 'dma_ops'
   drivers/dma-buf/seqno-fence.c:1: warning: no structured comments found
   include/linux/iio/iio.h:597: warning: No description found for parameter 'trig_readonly'
   include/linux/iio/trigger.h:151: warning: No description found for parameter 'indio_dev'
   include/linux/iio/trigger.h:151: warning: No description found for parameter 'trig'
   include/linux/device.h:970: warning: No description found for parameter 'dma_ops'
   include/linux/usb/gadget.h:230: warning: No description found for parameter 'claimed'
   include/linux/usb/gadget.h:230: warning: No description found for parameter 'enabled'
   include/linux/usb/gadget.h:408: warning: No description found for parameter 'quirk_altset_not_supp'
   include/linux/usb/gadget.h:408: warning: No description found for parameter 'quirk_stall_not_supp'
   include/linux/usb/gadget.h:408: warning: No description found for parameter 'quirk_zlp_not_supp'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'set_busid'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'irq_handler'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'irq_preinstall'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'irq_postinstall'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'irq_uninstall'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'debugfs_init'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_open_object'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_close_object'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'prime_handle_to_fd'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'prime_fd_to_handle'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_export'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_import'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_pin'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_unpin'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_res_obj'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_get_sg_table'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_import_sg_table'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_vmap'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_vunmap'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_prime_mmap'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'gem_vm_ops'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'major'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'minor'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'patchlevel'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'name'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'desc'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'date'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'driver_features'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'ioctls'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'num_ioctls'
   include/drm/drm_drv.h:524: warning: No description found for parameter 'fops'
   include/drm/drm_color_mgmt.h:1: warning: no structured comments found
   drivers/gpu/drm/drm_plane_helper.c:403: warning: No description found for parameter 'ctx'
   drivers/gpu/drm/drm_plane_helper.c:404: warning: No description found for parameter 'ctx'
   drivers/gpu/drm/i915/intel_lpe_audio.c:355: warning: No description found for parameter 'dp_output'
   drivers/gpu/drm/i915/intel_lpe_audio.c:355: warning: No description found for parameter 'link_rate'
   drivers/gpu/drm/i915/intel_lpe_audio.c:356: warning: No description found for parameter 'dp_output'
   drivers/gpu/drm/i915/intel_lpe_audio.c:356: warning: No description found for parameter 'link_rate'
   Documentation/core-api/assoc_array.rst:13: WARNING: Enumerated list ends without a blank line; unexpected unindent.
   Documentation/doc-guide/sphinx.rst:126: ERROR: Unknown target name: "sphinx c domain".
   kernel/sched/fair.c:7650: WARNING: Inline emphasis start-string without end-string.
   kernel/time/timer.c:1200: ERROR: Unexpected indentation.
   kernel/time/timer.c:1202: ERROR: Unexpected indentation.
   kernel/time/timer.c:1203: WARNING: Block quote ends without a blank line; unexpected unindent.
   include/linux/wait.h:122: WARNING: Block quote ends without a blank line; unexpected unindent.
   include/linux/wait.h:125: ERROR: Unexpected indentation.
   include/linux/wait.h:127: WARNING: Block quote ends without a blank line; unexpected unindent.
   kernel/time/hrtimer.c:990: WARNING: Block quote ends without a blank line; unexpected unindent.
   kernel/signal.c:322: WARNING: Inline literal start-string without end-string.
   include/linux/iio/iio.h:219: ERROR: Unexpected indentation.
   include/linux/iio/iio.h:220: WARNING: Block quote ends without a blank line; unexpected unindent.
   include/linux/iio/iio.h:226: WARNING: Definition list ends without a blank line; unexpected unindent.
   drivers/iio/industrialio-core.c:638: ERROR: Unknown target name: "iio_val".
   drivers/iio/industrialio-core.c:645: ERROR: Unknown target name: "iio_val".
   drivers/message/fusion/mptbase.c:5051: WARNING: Definition list ends without a blank line; unexpected unindent.
   drivers/tty/serial/serial_core.c:1898: WARNING: Definition list ends without a blank line; unexpected unindent.
   drivers/pci/pci.c:3456: ERROR: Unexpected indentation.
   include/linux/regulator/driver.h:271: ERROR: Unknown target name: "regulator_regmap_x_voltage".
   include/linux/spi/spi.h:370: ERROR: Unexpected indentation.
   drivers/gpu/drm/drm_scdc_helper.c:203: ERROR: Unexpected indentation.
   drivers/gpu/drm/drm_scdc_helper.c:204: WARNING: Block quote ends without a blank line; unexpected unindent.
   drivers/gpu/drm/drm_ioctl.c:690: WARNING: Definition list ends without a blank line; unexpected unindent.
   Documentation/gpu/todo.rst:111: ERROR: Unknown target name: "drm_fb".
   sound/soc/soc-core.c:2670: ERROR: Unknown target name: "snd_soc_daifmt".
   sound/core/jack.c:312: ERROR: Unknown target name: "snd_jack_btn".
   Documentation/userspace-api/unshare.rst:108: WARNING: Inline emphasis start-string without end-string.
   Documentation/usb/typec.rst:: WARNING: document isn't included in any toctree
   Documentation/usb/usb3-debug-port.rst:: WARNING: document isn't included in any toctree
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 43: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 56: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 69: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 82: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 96: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 109: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 122: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 133: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 164: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "/home/kbuild/.config/fontconfig/fonts.conf", line 193: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "~/.fonts.conf", line 43: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "~/.fonts.conf", line 56: Having multiple values in <test> isn't supported and may not work as expected
   Fontconfig warning: "~/.fonts.conf", line 69: Having multiple values in <test> isn't supported and may not work as expected

vim +/cgid +750 kernel/trace/blktrace.c

5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  734  
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  735  /**
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  736   * blk_add_trace_rq - Add a trace for a request oriented action
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  737   * @rq:		the source request
caf7df12 kernel/trace/blktrace.c Christoph Hellwig        2017-04-20  738   * @error:	return status to log
af5040da kernel/trace/blktrace.c Roman Pen                2014-03-04  739   * @nr_bytes:	number of completed bytes
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  740   * @what:	the action
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  741   *
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  742   * Description:
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  743   *     Records an action against a request. Will log the bio offset + size.
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  744   *
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  745   **/
caf7df12 kernel/trace/blktrace.c Christoph Hellwig        2017-04-20  746  static void blk_add_trace_rq(struct request *rq, int error,
f46b1436 kernel/trace/blktrace.c Shaohua Li               2017-06-02  747  			     unsigned int nr_bytes, u32 what,
f46b1436 kernel/trace/blktrace.c Shaohua Li               2017-06-02  748  			     struct kernfs_node_id *cgid)
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  749  {
caf7df12 kernel/trace/blktrace.c Christoph Hellwig        2017-04-20 @750  	struct blk_trace *bt = rq->q->blk_trace;
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  751  
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  752  	if (likely(!bt))
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  753  		return;
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  754  
57292b58 kernel/trace/blktrace.c Christoph Hellwig        2017-01-31  755  	if (blk_rq_is_passthrough(rq))
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  756  		what |= BLK_TC_ACT(BLK_TC_PC);
48b77ad6 kernel/trace/blktrace.c Christoph Hellwig        2017-01-27  757  	else
5f3ea37c block/blktrace.c        Arnaldo Carvalho de Melo 2008-10-30  758  		what |= BLK_TC_ACT(BLK_TC_FS);

:::::: The code at line 750 was first introduced by commit
:::::: caf7df12272118e0274c8353bcfeaf60c7743a47 block: remove the errors field from struct request

:::::: TO: Christoph Hellwig <hch@lst.de>
:::::: CC: Jens Axboe <axboe@fb.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index c590ca6..9cdaede 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -52,6 +52,7 @@  enum blktrace_act {
 	__BLK_TA_REMAP,			/* bio was remapped */
 	__BLK_TA_ABORT,			/* request aborted */
 	__BLK_TA_DRV_DATA,		/* driver-specific binary data */
+	__BLK_TA_CGROUP = 1 << 8,	/* from a cgroup*/
 };
 
 /*
@@ -61,6 +62,7 @@  enum blktrace_notify {
 	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
 	__BLK_TN_TIMESTAMP,		/* include system clock */
 	__BLK_TN_MESSAGE,		/* Character string message */
+	__BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */
 };
 
 
@@ -107,6 +109,7 @@  struct blk_io_trace {
 	__u32 cpu;		/* on what cpu did it happen */
 	__u16 error;		/* completion error */
 	__u16 pdu_len;		/* length of data after this trace */
+	/* cgroup id will be stored here if exists */
 };
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 193c5f5..e351837 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@ 
 #include <linux/time.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
+#include <linux/blk-cgroup.h>
 
 #include "../../block/blk.h"
 
@@ -46,10 +47,14 @@  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC	0x1
+#define TRACE_BLK_OPT_CGROUP	0x2
 
 static struct tracer_opt blk_tracer_opts[] = {
 	/* Default disable the minimalistic output */
 	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+	{ TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+#endif
 	{ }
 };
 
@@ -68,7 +73,8 @@  static void blk_unregister_tracepoints(void);
  * Send out a notify message.
  */
 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
-		       const void *data, size_t len)
+		       const void *data, size_t len,
+		       struct kernfs_node_id *cgid)
 {
 	struct blk_io_trace *t;
 	struct ring_buffer_event *event = NULL;
@@ -76,12 +82,13 @@  static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	int pc = 0;
 	int cpu = smp_processor_id();
 	bool blk_tracer = blk_tracer_enabled;
+	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
 
 	if (blk_tracer) {
 		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
-						  sizeof(*t) + len,
+						  sizeof(*t) + len + cgid_len,
 						  0, pc);
 		if (!event)
 			return;
@@ -92,17 +99,19 @@  static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 	if (!bt->rchan)
 		return;
 
-	t = relay_reserve(bt->rchan, sizeof(*t) + len);
+	t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
 	if (t) {
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->time = ktime_to_ns(ktime_get());
 record_it:
 		t->device = bt->dev;
-		t->action = action;
+		t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
 		t->pid = pid;
 		t->cpu = cpu;
-		t->pdu_len = len;
-		memcpy((void *) t + sizeof(*t), data, len);
+		t->pdu_len = len + cgid_len;
+		if (cgid)
+			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
+		memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
 
 		if (blk_tracer)
 			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -122,7 +131,7 @@  static void trace_note_tsk(struct task_struct *tsk)
 	spin_lock_irqsave(&running_trace_lock, flags);
 	list_for_each_entry(bt, &running_trace_list, running_list) {
 		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
-			   sizeof(tsk->comm));
+			   sizeof(tsk->comm), NULL);
 	}
 	spin_unlock_irqrestore(&running_trace_lock, flags);
 }
@@ -139,7 +148,7 @@  static void trace_note_time(struct blk_trace *bt)
 	words[1] = now.tv_nsec;
 
 	local_irq_save(flags);
-	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL);
 	local_irq_restore(flags);
 }
 
@@ -167,7 +176,7 @@  void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
 
-	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(__trace_note_message);
@@ -204,7 +213,7 @@  static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
  */
 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int op, int op_flags, u32 what, int error, int pdu_len,
-		     void *pdu_data)
+		     void *pdu_data, struct kernfs_node_id *cgid)
 {
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
@@ -215,6 +224,7 @@  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	pid_t pid;
 	int cpu, pc = 0;
 	bool blk_tracer = blk_tracer_enabled;
+	ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
 
 	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
@@ -229,6 +239,8 @@  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);
+	if (cgid)
+		what |= __BLK_TA_CGROUP;
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -241,7 +253,7 @@  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		buffer = blk_tr->trace_buffer.buffer;
 		pc = preempt_count();
 		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
-						  sizeof(*t) + pdu_len,
+						  sizeof(*t) + pdu_len + cgid_len,
 						  0, pc);
 		if (!event)
 			return;
@@ -258,7 +270,7 @@  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	 * from coming in and stepping on our toes.
 	 */
 	local_irq_save(flags);
-	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
 	if (t) {
 		sequence = per_cpu_ptr(bt->sequence, cpu);
 
@@ -280,10 +292,12 @@  static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		t->action = what;
 		t->device = bt->dev;
 		t->error = error;
-		t->pdu_len = pdu_len;
+		t->pdu_len = pdu_len + cgid_len;
 
+		if (cgid_len)
+			memcpy((void *)t + sizeof(*t), cgid, cgid_len);
 		if (pdu_len)
-			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+			memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
 
 		if (blk_tracer) {
 			trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
@@ -684,6 +698,36 @@  void blk_trace_shutdown(struct request_queue *q)
 	}
 }
 
+#ifdef CONFIG_BLK_CGROUP
+static struct kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+		return NULL;
+
+	if (!bio->bi_css)
+		return NULL;
+	return cgroup_get_node_id(bio->bi_css->cgroup);
+}
+#else
+static struct kernfs_node_id *
+blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
+{
+	return NULL;
+}
+#endif
+
+static struct kernfs_node_id *
+blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
+{
+	if (!rq->bio)
+		return NULL;
+	/* Use the first bio */
+	return blk_trace_bio_get_cgid(q, rq->bio);
+}
+
 /*
  * blktrace probes
  */
@@ -700,7 +744,8 @@  void blk_trace_shutdown(struct request_queue *q)
  *
  **/
 static void blk_add_trace_rq(struct request *rq, int error,
-			     unsigned int nr_bytes, u32 what)
+			     unsigned int nr_bytes, u32 what,
+			     struct kernfs_node_id *cgid)
 {
 	struct blk_trace *bt = rq->q->blk_trace;
 
@@ -713,32 +758,36 @@  static void blk_add_trace_rq(struct request *rq, int error,
 		what |= BLK_TC_ACT(BLK_TC_FS);
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-			rq->cmd_flags, what, error, 0, NULL);
+			rq->cmd_flags, what, error, 0, NULL, cgid);
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
 				    struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_issue(void *ignore,
 				   struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_requeue(void *ignore,
 				     struct request_queue *q,
 				     struct request *rq)
 {
-	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
+			 blk_trace_request_get_cgid(q, rq));
 }
 
 static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
 			int error, unsigned int nr_bytes)
 {
-	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
+	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
+			 blk_trace_request_get_cgid(rq->q, rq));
 }
 
 /**
@@ -753,7 +802,7 @@  static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-			      u32 what, int error)
+			      u32 what, int error, struct kernfs_node_id *cgid)
 {
 	struct blk_trace *bt = q->blk_trace;
 
@@ -761,20 +810,22 @@  static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, what, error, 0, NULL);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
@@ -782,7 +833,8 @@  static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request *rq,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
+			 blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -790,13 +842,15 @@  static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request *rq,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
+			  blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -804,13 +858,14 @@  static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
+				  blk_trace_bio_get_cgid(q, bio));
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
-					NULL);
+					NULL, NULL);
 	}
 }
 
@@ -820,13 +875,14 @@  static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
+				  blk_trace_bio_get_cgid(q, bio));
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
 		if (bt)
 			__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
-					0, 0, NULL);
+					0, 0, NULL, NULL);
 	}
 }
 
@@ -835,7 +891,7 @@  static void blk_add_trace_plug(void *ignore, struct request_queue *q)
 	struct blk_trace *bt = q->blk_trace;
 
 	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+		__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL);
 }
 
 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -852,7 +908,7 @@  static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
 		else
 			what = BLK_TA_UNPLUG_TIMER;
 
-		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+		__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL);
 	}
 }
 
@@ -868,7 +924,7 @@  static void blk_add_trace_split(void *ignore,
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
 				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
 				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
-				&rpdu);
+				&rpdu, blk_trace_bio_get_cgid(q, bio));
 	}
 }
 
@@ -901,7 +957,7 @@  static void blk_add_trace_bio_remap(void *ignore,
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
 			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
-			sizeof(r), &r);
+			sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
 }
 
 /**
@@ -934,7 +990,7 @@  static void blk_add_trace_rq_remap(void *ignore,
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
 			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
-			sizeof(r), &r);
+			sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
 }
 
 /**
@@ -958,7 +1014,8 @@  void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
-				BLK_TA_DRV_DATA, 0, len, data);
+				BLK_TA_DRV_DATA, 0, len, data,
+				blk_trace_request_get_cgid(q, rq));
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -1031,7 +1088,7 @@  static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 	int i = 0;
 	int tc = t->action >> BLK_TC_SHIFT;
 
-	if (t->action == BLK_TN_MESSAGE) {
+	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
 		rwbs[i++] = 'N';
 		goto out;
 	}
@@ -1066,9 +1123,21 @@  const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
 	return (const struct blk_io_trace *)ent;
 }
 
-static inline const void *pdu_start(const struct trace_entry *ent)
+static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
 {
-	return te_blk_io_trace(ent) + 1;
+	return (void *)(te_blk_io_trace(ent) + 1) +
+		(has_cg ? sizeof(struct kernfs_node_id) : 0);
+}
+
+static inline const void *cgid_start(const struct trace_entry *ent)
+{
+	return (void *)(te_blk_io_trace(ent) + 1);
+}
+
+static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
+{
+	return te_blk_io_trace(ent)->pdu_len -
+			(has_cg ? sizeof(struct kernfs_node_id) : 0);
 }
 
 static inline u32 t_action(const struct trace_entry *ent)
@@ -1096,16 +1165,16 @@  static inline __u16 t_error(const struct trace_entry *ent)
 	return te_blk_io_trace(ent)->error;
 }
 
-static __u64 get_pdu_int(const struct trace_entry *ent)
+static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
 {
-	const __u64 *val = pdu_start(ent);
+	const __u64 *val = pdu_start(ent, has_cg);
 	return be64_to_cpu(*val);
 }
 
 static void get_pdu_remap(const struct trace_entry *ent,
-			  struct blk_io_trace_remap *r)
+			  struct blk_io_trace_remap *r, bool has_cg)
 {
-	const struct blk_io_trace_remap *__r = pdu_start(ent);
+	const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
 	__u64 sector_from = __r->sector_from;
 
 	r->device_from = be32_to_cpu(__r->device_from);
@@ -1113,9 +1182,11 @@  static void get_pdu_remap(const struct trace_entry *ent,
 	r->sector_from = be64_to_cpu(sector_from);
 }
 
-typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
+	bool has_cg);
 
-static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
+	bool has_cg)
 {
 	char rwbs[RWBS_LEN];
 	unsigned long long ts  = iter->ts;
@@ -1131,24 +1202,33 @@  static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
 			 secs, nsec_rem, iter->ent->pid, act, rwbs);
 }
 
-static void blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act,
+	bool has_cg)
 {
 	char rwbs[RWBS_LEN];
 	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
 	fill_rwbs(rwbs, t);
-	trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
-			 MAJOR(t->device), MINOR(t->device), act, rwbs);
+	if (has_cg) {
+		const struct kernfs_node_id *id = cgid_start(iter->ent);
+
+		trace_seq_printf(&iter->seq, "%3d,%-3d %8x,%-8x %2s %3s ",
+				 MAJOR(t->device), MINOR(t->device),
+				 id->ino, id->gen, act, rwbs);
+	} else
+		trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+				 MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
-static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s,
+	const struct trace_entry *ent, bool has_cg)
 {
 	const unsigned char *pdu_buf;
 	int pdu_len;
 	int i, end;
 
-	pdu_buf = pdu_start(ent);
-	pdu_len = te_blk_io_trace(ent)->pdu_len;
+	pdu_buf = pdu_start(ent, has_cg);
+	pdu_len = pdu_real_len(ent, has_cg);
 
 	if (!pdu_len)
 		return;
@@ -1179,7 +1259,7 @@  static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 	trace_seq_puts(s, ") ");
 }
 
-static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
@@ -1187,7 +1267,7 @@  static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 
 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
 		trace_seq_printf(s, "%u ", t_bytes(ent));
-		blk_log_dump_pdu(s, ent);
+		blk_log_dump_pdu(s, ent, has_cg);
 		trace_seq_printf(s, "[%s]\n", cmd);
 	} else {
 		if (t_sec(ent))
@@ -1199,10 +1279,10 @@  static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 }
 
 static void blk_log_with_error(struct trace_seq *s,
-			      const struct trace_entry *ent)
+			      const struct trace_entry *ent, bool has_cg)
 {
 	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
-		blk_log_dump_pdu(s, ent);
+		blk_log_dump_pdu(s, ent, has_cg);
 		trace_seq_printf(s, "[%d]\n", t_error(ent));
 	} else {
 		if (t_sec(ent))
@@ -1215,18 +1295,18 @@  static void blk_log_with_error(struct trace_seq *s,
 	}
 }
 
-static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	struct blk_io_trace_remap r = { .device_from = 0, };
 
-	get_pdu_remap(ent, &r);
+	get_pdu_remap(ent, &r, has_cg);
 	trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
 			 t_sector(ent), t_sec(ent),
 			 MAJOR(r.device_from), MINOR(r.device_from),
 			 (unsigned long long)r.sector_from);
 }
 
-static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
@@ -1235,30 +1315,31 @@  static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 	trace_seq_printf(s, "[%s]\n", cmd);
 }
 
-static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
-	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+	trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
 }
 
-static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
 	trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-			 get_pdu_int(ent), cmd);
+			 get_pdu_int(ent, has_cg), cmd);
 }
 
-static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
+			bool has_cg)
 {
-	const struct blk_io_trace *t = te_blk_io_trace(ent);
 
-	trace_seq_putmem(s, t + 1, t->pdu_len);
+	trace_seq_putmem(s, pdu_start(ent, has_cg),
+		pdu_real_len(ent, has_cg));
 	trace_seq_putc(s, '\n');
 }
 
@@ -1298,7 +1379,8 @@  static void blk_tracer_reset(struct trace_array *tr)
 
 static const struct {
 	const char *act[2];
-	void	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+	void	   (*print)(struct trace_seq *s, const struct trace_entry *ent,
+			    bool has_cg);
 } what2act[] = {
 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
@@ -1326,23 +1408,25 @@  static enum print_line_t print_one_line(struct trace_iterator *iter,
 	u16 what;
 	bool long_act;
 	blk_log_action_t *log_action;
+	bool has_cg;
 
 	t	   = te_blk_io_trace(iter->ent);
-	what	   = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	what	   = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
 	long_act   = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
+	has_cg	   = t->action & __BLK_TA_CGROUP;
 
-	if (t->action == BLK_TN_MESSAGE) {
-		log_action(iter, long_act ? "message" : "m");
-		blk_log_msg(s, iter->ent);
+	if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
+		log_action(iter, long_act ? "message" : "m", has_cg);
+		blk_log_msg(s, iter->ent, has_cg);
 		return trace_handle_return(s);
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
-		log_action(iter, what2act[what].act[long_act]);
-		what2act[what].print(s, iter->ent);
+		log_action(iter, what2act[what].act[long_act], has_cg);
+		what2act[what].print(s, iter->ent, has_cg);
 	}
 
 	return trace_handle_return(s);