diff mbox series

[v2] blktrace: put bounds on BLKTRACESETUP buf_size and buf_nr

Message ID 20200611013326.218542-1-harshadshirwadkar@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] blktrace: put bounds on BLKTRACESETUP buf_size and buf_nr | expand

Commit Message

harshad shirwadkar June 11, 2020, 1:33 a.m. UTC
Make sure that user requested memory via BLKTRACESETUP is within
bounds. This can be easily exploited by setting really large values
for buf_size and buf_nr in BLKTRACESETUP ioctl.

The blktrace program has the following hardcoded values for bufsize and bufnr:
BUF_SIZE=(512 * 1024)
BUF_NR=(4)

This is very easy to exploit. Setting buf_size / buf_nr in a userspace
program to big values makes the kernel go OOM.

This patch adds a new sysfs tunable called "blktrace_max_alloc"
with the default value as:
blktrace_max_alloc=(1024 * 1024 * 16)

Verified that the fix makes BLKTRACESETUP return -E2BIG if the
buf_size * buf_nr crosses the configured upper bound in the device's
sysfs file. Verified that the bound checking is turned off when we
write 0 to the sysfs file.

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
---
 Documentation/block/queue-sysfs.rst |  8 ++++++++
 block/blk-settings.c                |  1 +
 block/blk-sysfs.c                   | 27 +++++++++++++++++++++++++++
 include/linux/blkdev.h              |  5 +++++
 kernel/trace/blktrace.c             |  8 ++++++++
 5 files changed, 49 insertions(+)

Comments

Chaitanya Kulkarni June 11, 2020, 1:53 a.m. UTC | #1
On 6/10/20 6:33 PM, Harshad Shirwadkar wrote:
> Make sure that user requested memory via BLKTRACESETUP is within
> bounds. This can be easily exploited by setting really large values
> for buf_size and buf_nr in BLKTRACESETUP ioctl.
> 
> blktrace program has following hardcoded values for bufsize and bufnr:
> BUF_SIZE=(512 * 1024)
> BUF_NR=(4)
> 
> This is very easy to exploit. Setting buf_size / buf_nr in userspace
> program to big values make kernel go oom.
> 
> This patch adds a new new sysfs tunable called "blktrace_max_alloc"
> with the default value as:
> blktrace_max_alloc=(1024 * 1024 * 16)
> 
> Verified that the fix makes BLKTRACESETUP return -E2BIG if the
> buf_size * buf_nr crosses the configured upper bound in the device's
> sysfs file. Verified that the bound checking is turned off when we
> write 0 to the sysfs file.
> 
> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
> ---
>   Documentation/block/queue-sysfs.rst |  8 ++++++++
>   block/blk-settings.c                |  1 +
>   block/blk-sysfs.c                   | 27 +++++++++++++++++++++++++++
>   include/linux/blkdev.h              |  5 +++++
>   kernel/trace/blktrace.c             |  8 ++++++++
>   5 files changed, 49 insertions(+)
> 
> diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
> index 6a8513af9201..ef4eec68cd24 100644
> --- a/Documentation/block/queue-sysfs.rst
> +++ b/Documentation/block/queue-sysfs.rst
> @@ -251,4 +251,12 @@ devices are described in the ZBC (Zoned Block Commands) and ZAC
>   do not support zone commands, they will be treated as regular block devices
>   and zoned will report "none".
>   
> +blktrace_max_alloc (RW)
> +----------
nit:-based on the pattern present in the same file how about following?

blk_trace_max_alloc (RW)
------------------------

> +BLKTRACESETUP ioctl takes the number of buffers and the size of each
> +buffer as an argument and uses these values to allocate kernel memory
> +for tracing purpose. This is the limit on the maximum kernel memory
> +that can be allocated by BLKTRACESETUP ioctl. The default limit is
> +16MB. Value of 0 indicates that this bound checking is disabled.
> +
>   Jens Axboe <jens.axboe@oracle.com>, February 2009
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 9a2c23cd9700..38535aa146f4 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
>   	lim->io_opt = 0;
>   	lim->misaligned = 0;
>   	lim->zoned = BLK_ZONED_NONE;
> +	lim->blktrace_max_alloc = BLKTRACE_MAX_ALLOC;
>   }
>   EXPORT_SYMBOL(blk_set_default_limits);
>   
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 02643e149d5e..e849e80930c4 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -496,6 +496,26 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
>   	return count;
>   }
>   
> +static ssize_t queue_blktrace_max_alloc_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->limits.blktrace_max_alloc, page);
> +}
> +
> +static ssize_t queue_blktrace_max_alloc_store(struct request_queue *q,
> +					      const char *page,
> +					      size_t count)
> +{
> +	unsigned long blktrace_max_alloc;
> +	int ret;
> +
> +	ret = queue_var_store(&blktrace_max_alloc, page, count);
> +	if (ret < 0)
> +		return ret;
> +
> +	q->limits.blktrace_max_alloc = blktrace_max_alloc;
> +	return count;
> +}
> +
>   static ssize_t queue_wc_show(struct request_queue *q, char *page)
>   {
>   	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
> @@ -731,6 +751,12 @@ static struct queue_sysfs_entry queue_wb_lat_entry = {
>   	.store = queue_wb_lat_store,
>   };
>   
> +static struct queue_sysfs_entry queue_blktrace_max_alloc_entry = {
> +	.attr = {.name = "blktrace_max_alloc", .mode = 0644 },
> +	.show = queue_blktrace_max_alloc_show,
> +	.store = queue_blktrace_max_alloc_store,
> +};
> +
>   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>   static struct queue_sysfs_entry throtl_sample_time_entry = {
>   	.attr = {.name = "throttle_sample_time", .mode = 0644 },
> @@ -779,6 +805,7 @@ static struct attribute *queue_attrs[] = {
>   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>   	&throtl_sample_time_entry.attr,
>   #endif
> +	&queue_blktrace_max_alloc_entry.attr,
Is above attribute needs to be under CONFIG_BLK_DEV_IO_TRACE guard ?
>   	NULL,
>   };
>   
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 8fd900998b4e..3a72b0fd723e 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -309,6 +309,10 @@ enum blk_queue_state {
>   #define BLK_SCSI_MAX_CMDS	(256)
>   #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
>   
> +#define BLKTRACE_MAX_BUFSIZ	(1024 * 1024)
nit:- macro name ending with SIZ is unusual for include/linux/blkdev.h.
Consider renaming it to BLKTRACE_MAX_BUF_SIZE.
> +#define BLKTRACE_MAX_BUFNR	16
nit:- just like BLKTRACE_MAX_BUFSIZ value of BLKTRACE_MAX_BUFNR needs to 
be guarded by () ?
> +#define BLKTRACE_MAX_ALLOC	((BLKTRACE_MAX_BUFSIZ) * (BLKTRACE_MAX_BUFNR))
nit:- The macro BLKTRACE_MAX_BUFSIZ already has () we can get rid of the 
extra inner () for BLKTRACE_MAX_BUFSIZ in BLKTRACE_MAX_ALLOC.
> +
>   /*
>    * Zoned block device models (zoned limit).
>    */
> @@ -322,6 +326,7 @@ struct queue_limits {
>   	unsigned long		bounce_pfn;
>   	unsigned long		seg_boundary_mask;
>   	unsigned long		virt_boundary_mask;
> +	unsigned long		blktrace_max_alloc;
>   
>   	unsigned int		max_hw_sectors;
>   	unsigned int		max_dev_sectors;
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index ea47f2084087..de028bdbb148 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -477,11 +477,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
>   {
>   	struct blk_trace *bt = NULL;
>   	struct dentry *dir = NULL;
> +	u32 alloc_siz;
nit:- please use full names 's/alloc_siz/alloc_size/'
>   	int ret;
>   
>   	if (!buts->buf_size || !buts->buf_nr)
>   		return -EINVAL;
>   
> +	if (check_mul_overflow(buts->buf_size, buts->buf_nr, &alloc_siz))
> +		return -E2BIG;
> +
> +	if (q->limits.blktrace_max_alloc &&
> +	    alloc_siz > q->limits.blktrace_max_alloc)
> +		return -E2BIG;
> +
>   	if (!blk_debugfs_root)
>   		return -ENOENT;
>   
>
Bart Van Assche June 11, 2020, 1:41 p.m. UTC | #2
On 2020-06-10 18:33, Harshad Shirwadkar wrote:
> +static struct queue_sysfs_entry queue_blktrace_max_alloc_entry = {
> +	.attr = {.name = "blktrace_max_alloc", .mode = 0644 },
> +	.show = queue_blktrace_max_alloc_show,
> +	.store = queue_blktrace_max_alloc_store,
> +};

I may have sent you in the wrong direction with my comment about
introducing sysfs attributes to control the blktrace_max_alloc limit.
That comment was made before I fully understood the issue this patch
addresses. There is a concern with introducing any limit for the amount
of memory allocated for blktrace, namely that any such limit may be seen
as a regression in the behavior of the BLKTRACESETUP ioctl. How about
not limiting the memory allocated for blktrace buffers at all?

If triggering an out-of-memory complaint is a big concern, how about
modifying relay_open() such that it becomes possible to pass a flag like
__GFP_ACCOUNT to the memory allocations relay_open() performs? From
memory-allocations.rst:

  * Untrusted allocations triggered from userspace should be a subject
    of kmem accounting and must have ``__GFP_ACCOUNT`` bit set. There
    is the handy ``GFP_KERNEL_ACCOUNT`` shortcut for ``GFP_KERNEL``
    allocations that should be accounted.

> @@ -322,6 +326,7 @@ struct queue_limits {
>  	unsigned long		bounce_pfn;
>  	unsigned long		seg_boundary_mask;
>  	unsigned long		virt_boundary_mask;
> +	unsigned long		blktrace_max_alloc;
>  
>  	unsigned int		max_hw_sectors;
>  	unsigned int		max_dev_sectors;

Is this the wrong structure to add a limit like blktrace_max_alloc? As
far as I can see struct queue_limits is only used for limits that are
affected by stacking block devices. My understanding is that the
blktrace_max_alloc limit is not affected by device stacking. See also
blk_stack_limits().

> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index ea47f2084087..de028bdbb148 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -477,11 +477,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
>  {
>  	struct blk_trace *bt = NULL;
>  	struct dentry *dir = NULL;
> +	u32 alloc_siz;
>  	int ret;
>  
>  	if (!buts->buf_size || !buts->buf_nr)
>  		return -EINVAL;
>  
> +	if (check_mul_overflow(buts->buf_size, buts->buf_nr, &alloc_siz))
> +		return -E2BIG;
> +
> +	if (q->limits.blktrace_max_alloc &&
> +	    alloc_siz > q->limits.blktrace_max_alloc)
> +		return -E2BIG;
> +
>  	if (!blk_debugfs_root)
>  		return -ENOENT;

Please add the check_mul_overflow() check in the code that performs the
multiplication, namely relay_open(), such that all relay_open() callers
are protected against multiplication overflows.

Thanks,

Bart.
Chaitanya Kulkarni June 13, 2020, 12:09 a.m. UTC | #3
> Is this the wrong structure to add a limit like blktrace_max_alloc? As
> far as I can see struct queue_limits is only used for limits that are
> affected by stacking block devices. My understanding is that the
> blktrace_max_alloc limit is not affected by device stacking. See also
> blk_stack_limits().

I can also see that, how about adding a parameter in
struct request_queue under CONFIG_BLK_DEV_IO_TRACE if we are going that 
route ?
harshad shirwadkar June 13, 2020, 12:57 a.m. UTC | #4
Thanks for the feedback Chaitanya and Bart.

In an offline sync-up with Shakeel and Bart, Shakeel mentioned that
probably doing the GFP_ACCOUNT is the right approach as long as the
trace buffer allocation lifetime can be tied with the process
lifetime. Given that, we can drop this patch for now. Whenever I get
time, I'll try to submit a new patch with that handled.

On Fri, Jun 12, 2020 at 5:09 PM Chaitanya Kulkarni
<Chaitanya.Kulkarni@wdc.com> wrote:
>
> > Is this the wrong structure to add a limit like blktrace_max_alloc? As
> > far as I can see struct queue_limits is only used for limits that are
> > affected by stacking block devices. My understanding is that the
> > blktrace_max_alloc limit is not affected by device stacking. See also
> > blk_stack_limits().
>
> I can also see that, how about adding a parameter in
> struct request_queue under CONFIG_BLK_DEV_IO_TRACE if we are going that
> route ?
>
Christoph Hellwig June 15, 2020, 11:52 a.m. UTC | #5
On Wed, Jun 10, 2020 at 06:33:26PM -0700, Harshad Shirwadkar wrote:
> Make sure that user requested memory via BLKTRACESETUP is within
> bounds. This can be easily exploited by setting really large values
> for buf_size and buf_nr in BLKTRACESETUP ioctl.
> 
> blktrace program has following hardcoded values for bufsize and bufnr:
> BUF_SIZE=(512 * 1024)
> BUF_NR=(4)
> 
> This is very easy to exploit. Setting buf_size / buf_nr in userspace
> program to big values make kernel go oom.
> 
> This patch adds a new new sysfs tunable called "blktrace_max_alloc"
> with the default value as:
> blktrace_max_alloc=(1024 * 1024 * 16)
> 
> Verified that the fix makes BLKTRACESETUP return -E2BIG if the
> buf_size * buf_nr crosses the configured upper bound in the device's
> sysfs file. Verified that the bound checking is turned off when we
> write 0 to the sysfs file.

I don't think the configurability makes any sense.  We need to put
a hard upper cap on, preferably one that doesn't break widely used
userspace.  Just pick a reasonably large value to get started.
Bart Van Assche June 15, 2020, 2:01 p.m. UTC | #6
On 2020-06-15 04:52, Christoph Hellwig wrote:
> I don't think the configurability makes any sense.  We need to put
> a hard upper cap on, preferably one that doesn't break widely used
> userspace.  Just pick a reasonable large value to get started.

Hi Christoph,

Something that has not been mentioned in the patch description is that
this patch addresses an issue that was discovered by syzbot. Do you
agree that the following changes are useful on top of a hard upper limit
for the blktrace buffer and that these changes will address more
potential syzbot issues?
- Use __GFP_ACCOUNT in the relay_open() call that allocates blktrace
buffers such that the memory allocation is accounted to a process and
such that the OOM killer becomes aware of blktrace buffer allocations.
- Making blkdev_close(), raw_release() and sg_release() shut down
tracing in case the BLKTRACETEARDOWN ioctl has not been invoked. That
will cause the blktrace buffers to be freed when e.g. the process that
owns these buffers is killed.

Thanks,

Bart.
Christoph Hellwig June 15, 2020, 2:04 p.m. UTC | #7
On Mon, Jun 15, 2020 at 07:01:00AM -0700, Bart Van Assche wrote:
> On 2020-06-15 04:52, Christoph Hellwig wrote:
> > I don't think the configurability makes any sense.  We need to put
> > a hard upper cap on, preferably one that doesn't break widely used
> > userspace.  Just pick a reasonable large value to get started.
> 
> Hi Christoph,
> 
> Something that has not been mentioned in the patch description is that
> this patch addresses an issue that was discovered by syzbot. Do you
> agree that the following changes are useful on top of a hard upper limit
> for the blktrace buffer and that these changes will address more
> potential syzbot issues?
> - Use __GFP_ACCOUNT in the relay_open() call that allocates blktrace
> buffers such that the memory allocation is accounted to a process and
> such that the OOM killer becomes aware of blktrace buffer allocations.

Sounds ok. 

> - Making blkdev_close(), raw_release() and sg_release() shut down
> tracing in case the BLKTRACETEARDOWN ioctl has not been invoked. That
> will cause the blktrace buffers to be freed when e.g. the process that
> owns these buffers is killed.

While this sounds useful I think it might very well break existing
use cases.
diff mbox series

Patch

diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
index 6a8513af9201..ef4eec68cd24 100644
--- a/Documentation/block/queue-sysfs.rst
+++ b/Documentation/block/queue-sysfs.rst
@@ -251,4 +251,12 @@  devices are described in the ZBC (Zoned Block Commands) and ZAC
 do not support zone commands, they will be treated as regular block devices
 and zoned will report "none".
 
+blktrace_max_alloc (RW)
+----------
+BLKTRACESETUP ioctl takes the number of buffers and the size of each
+buffer as an argument and uses these values to allocate kernel memory
+for tracing purpose. This is the limit on the maximum kernel memory
+that can be allocated by BLKTRACESETUP ioctl. The default limit is
+16MB. Value of 0 indicates that this bound checking is disabled.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9a2c23cd9700..38535aa146f4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@  void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->blktrace_max_alloc = BLKTRACE_MAX_ALLOC;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 02643e149d5e..e849e80930c4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -496,6 +496,26 @@  static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_blktrace_max_alloc_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.blktrace_max_alloc, page);
+}
+
+static ssize_t queue_blktrace_max_alloc_store(struct request_queue *q,
+					      const char *page,
+					      size_t count)
+{
+	unsigned long blktrace_max_alloc;
+	int ret;
+
+	ret = queue_var_store(&blktrace_max_alloc, page, count);
+	if (ret < 0)
+		return ret;
+
+	q->limits.blktrace_max_alloc = blktrace_max_alloc;
+	return count;
+}
+
 static ssize_t queue_wc_show(struct request_queue *q, char *page)
 {
 	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -731,6 +751,12 @@  static struct queue_sysfs_entry queue_wb_lat_entry = {
 	.store = queue_wb_lat_store,
 };
 
+static struct queue_sysfs_entry queue_blktrace_max_alloc_entry = {
+	.attr = {.name = "blktrace_max_alloc", .mode = 0644 },
+	.show = queue_blktrace_max_alloc_show,
+	.store = queue_blktrace_max_alloc_store,
+};
+
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static struct queue_sysfs_entry throtl_sample_time_entry = {
 	.attr = {.name = "throttle_sample_time", .mode = 0644 },
@@ -779,6 +805,7 @@  static struct attribute *queue_attrs[] = {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif
+	&queue_blktrace_max_alloc_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8fd900998b4e..3a72b0fd723e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -309,6 +309,10 @@  enum blk_queue_state {
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
 
+#define BLKTRACE_MAX_BUFSIZ	(1024 * 1024)
+#define BLKTRACE_MAX_BUFNR	16
+#define BLKTRACE_MAX_ALLOC	((BLKTRACE_MAX_BUFSIZ) * (BLKTRACE_MAX_BUFNR))
+
 /*
  * Zoned block device models (zoned limit).
  */
@@ -322,6 +326,7 @@  struct queue_limits {
 	unsigned long		bounce_pfn;
 	unsigned long		seg_boundary_mask;
 	unsigned long		virt_boundary_mask;
+	unsigned long		blktrace_max_alloc;
 
 	unsigned int		max_hw_sectors;
 	unsigned int		max_dev_sectors;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ea47f2084087..de028bdbb148 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -477,11 +477,19 @@  static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 {
 	struct blk_trace *bt = NULL;
 	struct dentry *dir = NULL;
+	u32 alloc_siz;
 	int ret;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
 
+	if (check_mul_overflow(buts->buf_size, buts->buf_nr, &alloc_siz))
+		return -E2BIG;
+
+	if (q->limits.blktrace_max_alloc &&
+	    alloc_siz > q->limits.blktrace_max_alloc)
+		return -E2BIG;
+
 	if (!blk_debugfs_root)
 		return -ENOENT;