diff mbox series

audit: io_uring openat triggers audit reference count underflow in worker thread

Message ID MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com (mailing list archive)
State Changes Requested
Delegated to: Paul Moore
Headers show
Series audit: io_uring openat triggers audit reference count underflow in worker thread | expand

Commit Message

Dan Clash Oct. 6, 2023, 8:09 p.m. UTC
This discussion is about how to fix an audit reference count decrement
race between two io_uring threads.

Original discussion link:
https : / / github . com / axboe / liburing / issues / 958

Details:

The test program below hangs indefinitely waiting for an openat cqe.  The
reproduction is with a distro kernel Ubuntu-azure-6.2-6.2.0-1012.12_22.04.1.
However, the bug seems possible with an upstream kernel.  An experiment of
changing the reference count in struct filename from int to refcount_t allows
the test program to complete.

The bug did not occur with this test program until a kernel containing
commit 5bd2182d58e9 was used.  I have not found a matching reported issue
or upstream commit yet.

The dmseg log shows an audit related path:

[27883.992550] kernel BUG at fs/namei.c:262!
[27883.994051] invalid opcode: 0000 [#15] SMP PTI
[27883.995719] CPU: 3 PID: 84988 Comm: iou-wrk-84835 Tainted: G      D
               6.2.0-1012-azure #12~22.04.1-Ubuntu
[27883.999064] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine,
               BIOS Hyper-V UEFI Release v4.1 05/09/2022
[27884.002734] RIP: 0010:putname+0x68/0x70
...
[27884.032893] Call Trace:
[27884.034032]  <TASK>
[27884.035117]  ? show_regs+0x6a/0x80
[27884.036763]  ? die+0x38/0xa0
[27884.038023]  ? do_trap+0xd0/0xf0
[27884.039359]  ? do_error_trap+0x70/0x90
[27884.040861]  ? putname+0x68/0x70
[27884.042201]  ? exc_invalid_op+0x53/0x70
[27884.043698]  ? putname+0x68/0x70
[27884.045076]  ? asm_exc_invalid_op+0x1b/0x20
[27884.047051]  ? putname+0x68/0x70
[27884.048415]  audit_reset_context.part.0.constprop.0+0xe1/0x300
[27884.050511]  __audit_uring_exit+0xda/0x1c0
[27884.052100]  io_issue_sqe+0x1f3/0x450
[27884.053702]  ? lock_timer_base+0x3b/0xd0
[27884.055283]  io_wq_submit_work+0x8d/0x2b0
[27884.056848]  ? __try_to_del_timer_sync+0x67/0xa0
[27884.058577]  io_worker_handle_work+0x17c/0x2b0
[27884.060267]  io_wqe_worker+0x10a/0x350
[27884.061714]  ? raw_spin_rq_unlock+0x10/0x30
[27884.063295]  ? finish_task_switch.isra.0+0x8b/0x2c0
[27884.065537]  ? __pfx_io_wqe_worker+0x10/0x10
[27884.067215]  ret_from_fork+0x2c/0x50
[27884.068733] RIP: 0033:0x0
...

Test program usage:

./io_uring_open_close_audit_hang --directory /tmp/deleteme --count 10000

Test program source:

// Note: The test program is C++ but could be converted to C.
#include <cassert>
#include <fcntl.h>
#include <filesystem>
#include <getopt.h>
#include <iostream>
#include <liburing.h>

// open and close a file.  the file is created if it does not exist.

void
openClose(struct io_uring& ring, std::string fileName)
{
    int ret;
    struct io_uring_cqe* cqe {};
    struct io_uring_sqe* sqe {};
    int fd {};
    int flags {O_RDWR | O_CREAT};
    mode_t mode {0666};

    // openat2

    sqe = io_uring_get_sqe(&ring);
    assert(sqe != nullptr);

    io_uring_prep_openat(sqe, AT_FDCWD, fileName.data(), flags, mode);
    io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

    ret = io_uring_submit(&ring);
    assert(ret == 1);

    ret = io_uring_wait_cqe(&ring, &cqe);
    assert(ret == 0);

    fd = cqe->res;
    assert(fd > 0);

    io_uring_cqe_seen(&ring, cqe);

    // close

    sqe = io_uring_get_sqe(&ring);
    assert(sqe != nullptr);

    io_uring_prep_close(sqe, fd);
    io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

    ret = io_uring_submit(&ring);
    assert(ret == 1);

    // wait for the close to complete.
    ret = io_uring_wait_cqe(&ring, &cqe);
    assert(ret == 0);

    // verify that close succeeded.
    assert(cqe->res == 0);

    io_uring_cqe_seen(&ring, cqe);
}

// create 100 files and then open each file twice.

void
openCloseHang(std::string filePath)
{
    int ret;
    struct io_uring ring;

    ret = io_uring_queue_init(8, &ring, 0);
    assert(0 == ret);

    int repeat {3};
    int numFiles {100};

    std::filesystem::create_directory(filePath);

    // files of length 0 are created in the j==0 iteration below.
    // those files are opened and closed during the j>0 iteraions.
    // a repeat of 3 results in a fairly reliable reproduction.

    for (int j = 0; j < repeat; j += 1) {
        for (int i = 0; i < numFiles; i += 1) {
            std::string fileName(filePath + "/file" + std::to_string(i));
            openClose(ring, fileName);
        }
    }

    std::filesystem::remove_all(filePath);

    io_uring_queue_exit(&ring);
}

int
main(int argc, char** argv)
{
    std::string filePath {};
    int iterations {};

    struct option options[]
    {
        {"help", no_argument, 0, 'h'}, {"directory", required_argument, 0, 'd'},
            {"count", required_argument, 0, 'c'},
        {
            0, 0, 0, 0
        }
    };
    bool printUsage {false};
    int val {};

    while ((val = getopt_long_only(argc, argv, "", options, nullptr)) != -1) {
        if (val == 'h') {
            printUsage = true;
        } else if (val == 'd') {
            filePath = optarg;
            if (std::filesystem::exists(filePath)) {
                printUsage = true;
                std::cerr << "directory must not exist" << std::endl;
            }
        } else if (val == 'c') {
            iterations = atoi(optarg);
            if (0 == iterations) {
                printUsage = true;
            }
        } else {
            printUsage = true;
        }
    }

    if ((0 == iterations) || (filePath.empty())) {
        printUsage = true;
    }

    if (printUsage || (optind < argc)) {
        std::cerr << "io_uring_open_close_audit_hang.cc --directory DIR --count COUNT" << std::endl;
        exit(1);
    }

    for (int i = 0; i < iterations; i += 1) {
        if (0 == (i % 100)) {
            std::cout << "i=" << std::to_string(i) << std::endl;
        }
        openCloseHang(filePath);
    }
    return 0;
}

Changing the reference count from int to refcount_t allows the test program
to complete using the v6.2 distro kernel.  The patch applies and builds on the
upstream v6.1.55 kernel.

Signed-off-by: Dan Clash <dan.clash@microsoft.com>
---

Comments

Jens Axboe Oct. 7, 2023, 2:32 a.m. UTC | #1
On 10/6/23 2:09 PM, Dan Clash wrote:
> diff --git a/fs/namei.c b/fs/namei.c
> index 2a8baa6ce3e8..4f7ac131c9d1 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
>  		}
>  	}
> 
> -	result->refcnt = 1;
> +	refcount_set(&result->refcnt, 1);
>  	/* The empty path is special. */
>  	if (unlikely(!len)) {
>  		if (empty)
> @@ -248,7 +248,7 @@ getname_kernel(const char * filename)
>  	memcpy((char *)result->name, filename, len);
>  	result->uptr = NULL;
>  	result->aname = NULL;
> -	result->refcnt = 1;
> +	refcount_set(&result->refcnt, 1);
>  	audit_getname(result);
> 
>  	return result;
> @@ -259,9 +259,10 @@ void putname(struct filename *name)
>  	if (IS_ERR(name))
>  		return;
> 
> -	BUG_ON(name->refcnt <= 0);
> +	BUG_ON(refcount_read(&name->refcnt) == 0);
> +	BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED);
> 
> -	if (--name->refcnt > 0)
> +	if (!refcount_dec_and_test(&name->refcnt))
>  		return;
> 
>  	if (name->name != name->iname) {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index d0a54e9aac7a..8217e07726d4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2719,7 +2719,7 @@ struct audit_names;
>  struct filename {
>  	const char		*name;	/* pointer to actual string */
>  	const __user char	*uptr;	/* original userland pointer */
> -	int			refcnt;
> +	refcount_t		refcnt;
>  	struct audit_names	*aname;
>  	const char		iname[];
>  };
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 37cded22497e..232e0be9f6d9 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr)
>  		if (!n->name)
>  			continue;
>  		if (n->name->uptr == uptr) {
> -			n->name->refcnt++;
> +			refcount_inc(&n->name->refcnt);
>  			return n->name;
>  		}
>  	}
> @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name)
>  	n->name = name;
>  	n->name_len = AUDIT_NAME_FULL;
>  	name->aname = n;
> -	name->refcnt++;
> +	refcount_inc(&name->refcnt);
>  }
> 
>  static inline int audit_copy_fcaps(struct audit_names *name,
> @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
>  		return;
>  	if (name) {
>  		n->name = name;
> -		name->refcnt++;
> +		refcount_inc(&name->refcnt);
>  	}
> 
>  out:
> @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent,
>  		if (found_parent) {
>  			found_child->name = found_parent->name;
>  			found_child->name_len = AUDIT_NAME_FULL;
> -			found_child->name->refcnt++;
> +			refcount_inc(&found_child->name->refcnt);
>  		}
>  	}

I'm not fully aware of what audit is doing with struct filename outside
of needing it for the audit log. Rather than impose the atomic
references for everyone, would it be doable to simply dupe the struct
instead of grabbing the (non-atomic) reference to the existing one?

If not, since there's no over/underflow handling right now, it'd
certainly be cheaper to use an atomic_t here rather than a full
refcount.
Jens Axboe Oct. 7, 2023, 1:11 p.m. UTC | #2
On 10/6/23 8:32 PM, Jens Axboe wrote:
> On 10/6/23 2:09 PM, Dan Clash wrote:
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 2a8baa6ce3e8..4f7ac131c9d1 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
>>  		}
>>  	}
>>
>> -	result->refcnt = 1;
>> +	refcount_set(&result->refcnt, 1);
>>  	/* The empty path is special. */
>>  	if (unlikely(!len)) {
>>  		if (empty)
>> @@ -248,7 +248,7 @@ getname_kernel(const char * filename)
>>  	memcpy((char *)result->name, filename, len);
>>  	result->uptr = NULL;
>>  	result->aname = NULL;
>> -	result->refcnt = 1;
>> +	refcount_set(&result->refcnt, 1);
>>  	audit_getname(result);
>>
>>  	return result;
>> @@ -259,9 +259,10 @@ void putname(struct filename *name)
>>  	if (IS_ERR(name))
>>  		return;
>>
>> -	BUG_ON(name->refcnt <= 0);
>> +	BUG_ON(refcount_read(&name->refcnt) == 0);
>> +	BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED);
>>
>> -	if (--name->refcnt > 0)
>> +	if (!refcount_dec_and_test(&name->refcnt))
>>  		return;
>>
>>  	if (name->name != name->iname) {
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index d0a54e9aac7a..8217e07726d4 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -2719,7 +2719,7 @@ struct audit_names;
>>  struct filename {
>>  	const char		*name;	/* pointer to actual string */
>>  	const __user char	*uptr;	/* original userland pointer */
>> -	int			refcnt;
>> +	refcount_t		refcnt;
>>  	struct audit_names	*aname;
>>  	const char		iname[];
>>  };
>> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
>> index 37cded22497e..232e0be9f6d9 100644
>> --- a/kernel/auditsc.c
>> +++ b/kernel/auditsc.c
>> @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr)
>>  		if (!n->name)
>>  			continue;
>>  		if (n->name->uptr == uptr) {
>> -			n->name->refcnt++;
>> +			refcount_inc(&n->name->refcnt);
>>  			return n->name;
>>  		}
>>  	}
>> @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name)
>>  	n->name = name;
>>  	n->name_len = AUDIT_NAME_FULL;
>>  	name->aname = n;
>> -	name->refcnt++;
>> +	refcount_inc(&name->refcnt);
>>  }
>>
>>  static inline int audit_copy_fcaps(struct audit_names *name,
>> @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
>>  		return;
>>  	if (name) {
>>  		n->name = name;
>> -		name->refcnt++;
>> +		refcount_inc(&name->refcnt);
>>  	}
>>
>>  out:
>> @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent,
>>  		if (found_parent) {
>>  			found_child->name = found_parent->name;
>>  			found_child->name_len = AUDIT_NAME_FULL;
>> -			found_child->name->refcnt++;
>> +			refcount_inc(&found_child->name->refcnt);
>>  		}
>>  	}
> 
> I'm not fully aware of what audit is doing with struct filename outside
> of needing it for the audit log. Rather than impose the atomic
> references for everyone, would it be doable to simply dupe the struct
> instead of grabbing the (non-atomic) reference to the existing one?
> 
> If not, since there's no over/underflow handling right now, it'd
> certainly be cheaper to use an atomic_t here rather than a full
> refcount.

After taking a closer look at this, I think the best course of action
would be to make the struct filename refcnt and atomic_t. With audit in
the picture, it's quite possible to have multiple threads manipulating
the filename refcnt at the same time, which is obviously not currently
safe.

Dan, would you mind sending that as a patch? Include a link to your
original email:

Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/

and a Fixes tag as well:

Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")

and CC linux-fsdevel@vger.kernel.org and
Christian Brauner <brauner@kernel.org> as well.

Thanks!
Paul Moore Oct. 7, 2023, 3:13 p.m. UTC | #3
On Sat, Oct 7, 2023 at 9:11 AM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/6/23 8:32 PM, Jens Axboe wrote:
> > On 10/6/23 2:09 PM, Dan Clash wrote:

...

> > I'm not fully aware of what audit is doing with struct filename outside
> > of needing it for the audit log. Rather than impose the atomic
> > references for everyone, would it be doable to simply dupe the struct
> > instead of grabbing the (non-atomic) reference to the existing one?
> >
> > If not, since there's no over/underflow handling right now, it'd
> > certainly be cheaper to use an atomic_t here rather than a full
> > refcount.
>
> After taking a closer look at this, I think the best course of action
> would be to make the struct filename refcnt and atomic_t. With audit in
> the picture, it's quite possible to have multiple threads manipulating
> the filename refcnt at the same time, which is obviously not currently
> safe.

Thanks Jens.

I personally would feel a bit better with the additional safety
provided by refount_t, but I agree that there is little chance of an
overflow/underflow in this case so the additional refcount_t checking
is not likely to be needed here.

For the record, this should only be an issue when audit is combined
with io_uring, prior to io_uring there wasn't an issue with multiple
threads attempting to cleanup the filename objects so we didn't have
to worry about racing on filename::refcnt updates.  However, for those
systems where both audit and io_uring are in use we definitely have a
problem and will need the upcoming fix from Dan to ensure the safety
of the system.

Thanks for spotting this Dan and doing the initial investigation into
the problem, if you run into any problems with the patch or need a
hand let me know, I'm happy to help.

> Dan, would you mind sending that as a patch? Include a link to your
> original email:
>
> Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
>
> and a Fixes tag as well:
>
> Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
>
> and CC linux-fsdevel@vger.kernel.org and
> Christian Brauner <brauner@kernel.org> as well.

I'm going to CC Christian on this email just so he has a heads-up
about the problem and knows to expect a patch.

--
paul-moore.com
Dan Clash Oct. 9, 2023, 2:38 a.m. UTC | #4
I retested with the following change as a sanity check:

-       BUG_ON(name->refcnt <= 0);
+       BUG_ON(atomic_read(&name->refcnt) <= 0);

checkpatch.pl suggests using WARN_ON_ONCE rather than BUG.

devvm ~ $ ~/linux/scripts/checkpatch.pl --patch ~/io_uring_audit_hang_atomic.patch 
WARNING: Do not crash the kernel unless it is absolutely unavoidable
  --use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants
#28: FILE: fs/namei.c:262:
+       BUG_ON(atomic_read(&name->refcnt) <= 0);
...

refcount_t uses WARN_ON_ONCE.

I can think of three choices:

1. Use atomic_t and remove the BUG line.
2. Use refcount_t and remove the BUG line. 
3. Use atomic_t and partially implement the warn behavior of refcount_t.

Choice 1 and 2 seem better than choice 3.
Jens Axboe Oct. 9, 2023, 1:40 p.m. UTC | #5
On 10/8/23 8:38 PM, Dan Clash wrote:
> I retested with the following change as a sanity check:
> 
> -       BUG_ON(name->refcnt <= 0);
> +       BUG_ON(atomic_read(&name->refcnt) <= 0);
> 
> checkpatch.pl suggests using WARN_ON_ONCE rather than BUG.
> 
> devvm ~ $ ~/linux/scripts/checkpatch.pl --patch ~/io_uring_audit_hang_atomic.patch 
> WARNING: Do not crash the kernel unless it is absolutely unavoidable
>   --use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants
> #28: FILE: fs/namei.c:262:
> +       BUG_ON(atomic_read(&name->refcnt) <= 0);
> ...
> 
> refcount_t uses WARN_ON_ONCE.
> 
> I can think of three choices:
> 
> 1. Use atomic_t and remove the BUG line.
> 2. Use refcount_t and remove the BUG line. 
> 3. Use atomic_t and partially implement the warn behavior of refcount_t.
> 
> Choice 1 and 2 seem better than choice 3.

I'd probably just make it:

if (WARN_ON_ONCE(!atomic_read(name->refcnt)))
	return;

to make it a straightforward conversion.
Dan Clash Oct. 12, 2023, 9:12 p.m. UTC | #6
The patch will be sent from my alternate email address daclash@linux.microsoft.com.
Paul and Jens, thank you for your guidance so far.

Dan
diff mbox series

Patch

diff --git a/fs/namei.c b/fs/namei.c
index 2a8baa6ce3e8..4f7ac131c9d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -187,7 +187,7 @@  getname_flags(const char __user *filename, int flags, int *empty)
 		}
 	}

-	result->refcnt = 1;
+	refcount_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
 		if (empty)
@@ -248,7 +248,7 @@  getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
-	result->refcnt = 1;
+	refcount_set(&result->refcnt, 1);
 	audit_getname(result);

 	return result;
@@ -259,9 +259,10 @@  void putname(struct filename *name)
 	if (IS_ERR(name))
 		return;

-	BUG_ON(name->refcnt <= 0);
+	BUG_ON(refcount_read(&name->refcnt) == 0);
+	BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED);

-	if (--name->refcnt > 0)
+	if (!refcount_dec_and_test(&name->refcnt))
 		return;

 	if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d0a54e9aac7a..8217e07726d4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2719,7 +2719,7 @@  struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	refcount_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37cded22497e..232e0be9f6d9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2188,7 +2188,7 @@  __audit_reusename(const __user char *uptr)
 		if (!n->name)
 			continue;
 		if (n->name->uptr == uptr) {
-			n->name->refcnt++;
+			refcount_inc(&n->name->refcnt);
 			return n->name;
 		}
 	}
@@ -2217,7 +2217,7 @@  void __audit_getname(struct filename *name)
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
 	name->aname = n;
-	name->refcnt++;
+	refcount_inc(&name->refcnt);
 }

 static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2349,7 +2349,7 @@  void __audit_inode(struct filename *name, const struct dentry *dentry,
 		return;
 	if (name) {
 		n->name = name;
-		name->refcnt++;
+		refcount_inc(&name->refcnt);
 	}

 out:
@@ -2474,7 +2474,7 @@  void __audit_inode_child(struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			found_child->name->refcnt++;
+			refcount_inc(&found_child->name->refcnt);
 		}
 	}