diff mbox

fs: fix data races on inode->i_flctx

Message ID 1442821386-24807-1-git-send-email-dvyukov@google.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dmitry Vyukov Sept. 21, 2015, 7:43 a.m. UTC
locks_get_lock_context() uses cmpxchg() to install i_flctx.
cmpxchg() is a release operation which is correct. But it uses
a plain load to load i_flctx. This is incorrect. Subsequent loads
from i_flctx can hoist above the load of i_flctx pointer itself
and observe uninitialized garbage there. This in turn can lead
to corruption of ctx->flc_lock and other members.

Documentation/memory-barriers.txt explicitly requires to use
a barrier in such context:
"A load-load control dependency requires a full read memory barrier".

Use smp_load_acquire() in locks_get_lock_context() and in a bunch
of other functions that can proceed concurrently with
locks_get_lock_context().

The data race was found with KernelThreadSanitizer (KTSAN).

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
---
For the record, here is the KTSAN report:

ThreadSanitizer: data-race in _raw_spin_lock

Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
 [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
 [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
 [<     inline     >] spin_lock include/linux/spinlock.h:312
 [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
 [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
 [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904

Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
 [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
 [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
 [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
 [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
 [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
 [<     inline     >] SYSC_flock fs/locks.c:1941
 [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
 [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
---
 fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

Comments

Jeff Layton Sept. 21, 2015, 10:50 a.m. UTC | #1
On Mon, 21 Sep 2015 09:43:06 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> cmpxchg() is a release operation which is correct. But it uses
> a plain load to load i_flctx. This is incorrect. Subsequent loads
> from i_flctx can hoist above the load of i_flctx pointer itself
> and observe uninitialized garbage there. This in turn can lead
> to corruption of ctx->flc_lock and other members.
> 

I don't get this bit. The i_flctx field is initialized to zero when the
inode is allocated, and we only assign it with cmpxchg(). How would you
ever see uninitialized garbage in there? It should either be NULL or a
valid pointer, no?

If that's the case, then most of the places where you're adding
smp_load_acquire are places that can tolerate seeing NULL there. e.g.
you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
something.

> Documentation/memory-barriers.txt explicitly requires to use
> a barrier in such context:
> "A load-load control dependency requires a full read memory barrier".
> 

The same file also says:

"Any atomic operation that modifies some state in memory and returns information
about the state (old or new) implies an SMP-conditional general memory barrier
(smp_mb()) on each side of the actual operation (with the exception of
explicit lock operations, described later).  These include:

...
/* when succeeds */
cmpxchg();
"

If there is an implied smp_mb() before and after the cmpxchg(), how
could the CPU hoist anything before that point?

Note that I'm not saying that you're wrong, I just want to make sure I
fully understand the problem before we go trying to fix it.

> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> of other functions that can proceed concurrently with
> locks_get_lock_context().
> 
> The data race was found with KernelThreadSanitizer (KTSAN).
> 
> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> ---
> For the record, here is the KTSAN report:
> 
> ThreadSanitizer: data-race in _raw_spin_lock
> 
> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> 
> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>  [<     inline     >] SYSC_flock fs/locks.c:1941
>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> ---
>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 36 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/locks.c b/fs/locks.c
> index 2a54c80..316e474 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>  static struct file_lock_context *
>  locks_get_lock_context(struct inode *inode, int type)
>  {
> -	struct file_lock_context *new;
> +	struct file_lock_context *ctx;
>  
> -	if (likely(inode->i_flctx) || type == F_UNLCK)
> +	/* paired with cmpxchg() below */
> +	ctx = smp_load_acquire(&inode->i_flctx);
> +	if (likely(ctx) || type == F_UNLCK)
>  		goto out;
>  
> -	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> -	if (!new)
> +	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> +	if (!ctx)
>  		goto out;
>  
> -	spin_lock_init(&new->flc_lock);
> -	INIT_LIST_HEAD(&new->flc_flock);
> -	INIT_LIST_HEAD(&new->flc_posix);
> -	INIT_LIST_HEAD(&new->flc_lease);
> +	spin_lock_init(&ctx->flc_lock);
> +	INIT_LIST_HEAD(&ctx->flc_flock);
> +	INIT_LIST_HEAD(&ctx->flc_posix);
> +	INIT_LIST_HEAD(&ctx->flc_lease);
>  
>  	/*
>  	 * Assign the pointer if it's not already assigned. If it is, then
>  	 * free the context we just allocated.
>  	 */
> -	if (cmpxchg(&inode->i_flctx, NULL, new))
> -		kmem_cache_free(flctx_cache, new);
> +	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> +		kmem_cache_free(flctx_cache, ctx);
> +		ctx = smp_load_acquire(&inode->i_flctx);
> +	}
>  out:
> -	return inode->i_flctx;
> +	return ctx;
>  }
>  
>  void
> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>  	struct file_lock_context *ctx;
>  	struct inode *inode = file_inode(filp);
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>  		fl->fl_type = F_UNLCK;
>  		return;
> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>  	struct file_lock_context *ctx;
>  	struct file_lock *fl;
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx || list_empty_careful(&ctx->flc_posix))
>  		return 0;
>  
> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>  {
>  	int error = 0;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	struct file_lock *new_fl, *fl, *tmp;
>  	unsigned long break_time;
>  	int want_write = (mode & O_ACCMODE) != O_RDONLY;
> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>  	new_fl->fl_flags = type;
>  
>  	/* typically we will check that ctx is non-NULL before calling */
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx) {
>  		WARN_ON_ONCE(1);
>  		return error;
> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>  {
>  	bool has_lease = false;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	struct file_lock *fl;
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>  		spin_lock(&ctx->flc_lock);
>  		if (!list_empty(&ctx->flc_lease)) {
> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>  {
>  	struct file_lock *fl;
>  	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	int type = F_UNLCK;
>  	LIST_HEAD(dispose);
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>  		spin_lock(&ctx->flc_lock);
>  		time_out_leases(file_inode(filp), &dispose);
> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>  	struct file_lock *fl, *victim = NULL;
>  	struct dentry *dentry = filp->f_path.dentry;
>  	struct inode *inode = dentry->d_inode;
> -	struct file_lock_context *ctx = inode->i_flctx;
> +	struct file_lock_context *ctx;
>  	LIST_HEAD(dispose);
>  
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx) {
>  		trace_generic_delete_lease(inode, NULL);
>  		return error;
> @@ -2359,13 +2367,14 @@ out:
>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>  {
>  	struct file_lock lock;
> -	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> +	struct file_lock_context *ctx;
>  
>  	/*
>  	 * If there are no locks held on this file, we don't need to call
>  	 * posix_lock_file().  Another process could be setting a lock on this
>  	 * file at the same time, but we wouldn't remove that lock anyway.
>  	 */
> +	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>  	if (!ctx || list_empty(&ctx->flc_posix))
>  		return;
>  
> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>  
>  /* The i_flctx must be valid when calling into here */
>  static void
> -locks_remove_flock(struct file *filp)
> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>  {
>  	struct file_lock fl = {
>  		.fl_owner = filp,
> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>  		.fl_end = OFFSET_MAX,
>  	};
>  	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *flctx = inode->i_flctx;
>  
>  	if (list_empty(&flctx->flc_flock))
>  		return;
> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>  
>  /* The i_flctx must be valid when calling into here */
>  static void
> -locks_remove_lease(struct file *filp)
> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>  {
> -	struct inode *inode = file_inode(filp);
> -	struct file_lock_context *ctx = inode->i_flctx;
>  	struct file_lock *fl, *tmp;
>  	LIST_HEAD(dispose);
>  
> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>   */
>  void locks_remove_file(struct file *filp)
>  {
> -	if (!file_inode(filp)->i_flctx)
> +	struct file_lock_context *ctx;
> +
> +	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> +	if (!ctx)
>  		return;
>  
>  	/* remove any OFD locks */
>  	locks_remove_posix(filp, filp);
>  
>  	/* remove flock locks */
> -	locks_remove_flock(filp);
> +	locks_remove_flock(filp, ctx);
>  
>  	/* remove any leases */
> -	locks_remove_lease(filp);
> +	locks_remove_lease(filp, ctx);
>  }
>  
>  /**
> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>  	struct file_lock_context *ctx;
>  	int id = 0;
>  
> -	ctx = inode->i_flctx;
> +	ctx = smp_load_acquire(&inode->i_flctx);
>  	if (!ctx)
>  		return;
>
Dmitry Vyukov Sept. 21, 2015, 10:53 a.m. UTC | #2
On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 09:43:06 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> cmpxchg() is a release operation which is correct. But it uses
>> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> from i_flctx can hoist above the load of i_flctx pointer itself
>> and observe uninitialized garbage there. This in turn can lead
>> to corruption of ctx->flc_lock and other members.
>>
>
> I don't get this bit. The i_flctx field is initialized to zero when the
> inode is allocated, and we only assign it with cmpxchg(). How would you
> ever see uninitialized garbage in there? It should either be NULL or a
> valid pointer, no?

This is not about i_flctx pointer, this is about i_flctx object
contents (pointee).
Readers can see a non-NULL pointer, but read garbage from *i_flctx.

> If that'st the case, then most of the places where you're adding
> smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> something.
>
>> Documentation/memory-barriers.txt explicitly requires to use
>> a barrier in such context:
>> "A load-load control dependency requires a full read memory barrier".
>>
>
> The same file also says:
>
> "Any atomic operation that modifies some state in memory and returns information
> about the state (old or new) implies an SMP-conditional general memory barrier
> (smp_mb()) on each side of the actual operation (with the exception of
> explicit lock operations, described later).  These include:
>
> ...
> /* when succeeds */
> cmpxchg();
> "
>
> If there is an implied smp_mb() before and after the cmpxchg(), how
> could the CPU hoist anything before that point?
>
> Note that I'm not saying that you're wrong, I just want to make sure I
> fully understand the problem before we go trying to fix it.
>
>> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> of other functions that can proceed concurrently with
>> locks_get_lock_context().
>>
>> The data race was found with KernelThreadSanitizer (KTSAN).
>>
>> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> ---
>> For the record, here is the KTSAN report:
>>
>> ThreadSanitizer: data-race in _raw_spin_lock
>>
>> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>>
>> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>>  [<     inline     >] SYSC_flock fs/locks.c:1941
>>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> ---
>>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>>  1 file changed, 36 insertions(+), 27 deletions(-)
>>
>> diff --git a/fs/locks.c b/fs/locks.c
>> index 2a54c80..316e474 100644
>> --- a/fs/locks.c
>> +++ b/fs/locks.c
>> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>>  static struct file_lock_context *
>>  locks_get_lock_context(struct inode *inode, int type)
>>  {
>> -     struct file_lock_context *new;
>> +     struct file_lock_context *ctx;
>>
>> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> +     /* paired with cmpxchg() below */
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>> +     if (likely(ctx) || type == F_UNLCK)
>>               goto out;
>>
>> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> -     if (!new)
>> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> +     if (!ctx)
>>               goto out;
>>
>> -     spin_lock_init(&new->flc_lock);
>> -     INIT_LIST_HEAD(&new->flc_flock);
>> -     INIT_LIST_HEAD(&new->flc_posix);
>> -     INIT_LIST_HEAD(&new->flc_lease);
>> +     spin_lock_init(&ctx->flc_lock);
>> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> +     INIT_LIST_HEAD(&ctx->flc_lease);
>>
>>       /*
>>        * Assign the pointer if it's not already assigned. If it is, then
>>        * free the context we just allocated.
>>        */
>> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> -             kmem_cache_free(flctx_cache, new);
>> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> +             kmem_cache_free(flctx_cache, ctx);
>> +             ctx = smp_load_acquire(&inode->i_flctx);
>> +     }
>>  out:
>> -     return inode->i_flctx;
>> +     return ctx;
>>  }
>>
>>  void
>> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>>       struct file_lock_context *ctx;
>>       struct inode *inode = file_inode(filp);
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>>               fl->fl_type = F_UNLCK;
>>               return;
>> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>>       struct file_lock_context *ctx;
>>       struct file_lock *fl;
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>>               return 0;
>>
>> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>>  {
>>       int error = 0;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       struct file_lock *new_fl, *fl, *tmp;
>>       unsigned long break_time;
>>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>>       new_fl->fl_flags = type;
>>
>>       /* typically we will check that ctx is non-NULL before calling */
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx) {
>>               WARN_ON_ONCE(1);
>>               return error;
>> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>>  {
>>       bool has_lease = false;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       struct file_lock *fl;
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>>               spin_lock(&ctx->flc_lock);
>>               if (!list_empty(&ctx->flc_lease)) {
>> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>>  {
>>       struct file_lock *fl;
>>       struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       int type = F_UNLCK;
>>       LIST_HEAD(dispose);
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>>               spin_lock(&ctx->flc_lock);
>>               time_out_leases(file_inode(filp), &dispose);
>> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>>       struct file_lock *fl, *victim = NULL;
>>       struct dentry *dentry = filp->f_path.dentry;
>>       struct inode *inode = dentry->d_inode;
>> -     struct file_lock_context *ctx = inode->i_flctx;
>> +     struct file_lock_context *ctx;
>>       LIST_HEAD(dispose);
>>
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx) {
>>               trace_generic_delete_lease(inode, NULL);
>>               return error;
>> @@ -2359,13 +2367,14 @@ out:
>>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>>  {
>>       struct file_lock lock;
>> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> +     struct file_lock_context *ctx;
>>
>>       /*
>>        * If there are no locks held on this file, we don't need to call
>>        * posix_lock_file().  Another process could be setting a lock on this
>>        * file at the same time, but we wouldn't remove that lock anyway.
>>        */
>> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>>       if (!ctx || list_empty(&ctx->flc_posix))
>>               return;
>>
>> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>>
>>  /* The i_flctx must be valid when calling into here */
>>  static void
>> -locks_remove_flock(struct file *filp)
>> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>>  {
>>       struct file_lock fl = {
>>               .fl_owner = filp,
>> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>>               .fl_end = OFFSET_MAX,
>>       };
>>       struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *flctx = inode->i_flctx;
>>
>>       if (list_empty(&flctx->flc_flock))
>>               return;
>> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>>
>>  /* The i_flctx must be valid when calling into here */
>>  static void
>> -locks_remove_lease(struct file *filp)
>> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>>  {
>> -     struct inode *inode = file_inode(filp);
>> -     struct file_lock_context *ctx = inode->i_flctx;
>>       struct file_lock *fl, *tmp;
>>       LIST_HEAD(dispose);
>>
>> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>>   */
>>  void locks_remove_file(struct file *filp)
>>  {
>> -     if (!file_inode(filp)->i_flctx)
>> +     struct file_lock_context *ctx;
>> +
>> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> +     if (!ctx)
>>               return;
>>
>>       /* remove any OFD locks */
>>       locks_remove_posix(filp, filp);
>>
>>       /* remove flock locks */
>> -     locks_remove_flock(filp);
>> +     locks_remove_flock(filp, ctx);
>>
>>       /* remove any leases */
>> -     locks_remove_lease(filp);
>> +     locks_remove_lease(filp, ctx);
>>  }
>>
>>  /**
>> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>>       struct file_lock_context *ctx;
>>       int id = 0;
>>
>> -     ctx = inode->i_flctx;
>> +     ctx = smp_load_acquire(&inode->i_flctx);
>>       if (!ctx)
>>               return;
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Jeff Layton Sept. 21, 2015, 10:56 a.m. UTC | #3
On Mon, 21 Sep 2015 12:53:40 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 09:43:06 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> cmpxchg() is a release operation which is correct. But it uses
> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> and observe uninitialized garbage there. This in turn can lead
> >> to corruption of ctx->flc_lock and other members.
> >>
> >
> > I don't get this bit. The i_flctx field is initialized to zero when the
> > inode is allocated, and we only assign it with cmpxchg(). How would you
> > ever see uninitialized garbage in there? It should either be NULL or a
> > valid pointer, no?
> 
> This is not about i_flctx pointer, this is about i_flctx object
> contents (pointee).
> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> 

Again, I don't get it though. The cmpxchg happens after we've
initialized the structure. Given that there are implicit full memory
barriers before and after the cmpxchg(), how do you end up with another
task seeing the pointer before it was ever initialized?

The store should only happen after the initialization has occurred, and
the loads in the other tasks shouldn't be able to see the results of
that store until then. No?

> > If that'st the case, then most of the places where you're adding
> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> > something.
> >
> >> Documentation/memory-barriers.txt explicitly requires to use
> >> a barrier in such context:
> >> "A load-load control dependency requires a full read memory barrier".
> >>
> >
> > The same file also says:
> >
> > "Any atomic operation that modifies some state in memory and returns information
> > about the state (old or new) implies an SMP-conditional general memory barrier
> > (smp_mb()) on each side of the actual operation (with the exception of
> > explicit lock operations, described later).  These include:
> >
> > ...
> > /* when succeeds */
> > cmpxchg();
> > "
> >
> > If there is an implied smp_mb() before and after the cmpxchg(), how
> > could the CPU hoist anything before that point?
> >
> > Note that I'm not saying that you're wrong, I just want to make sure I
> > fully understand the problem before we go trying to fix it.
> >
> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> of other functions that can proceed concurrently with
> >> locks_get_lock_context().
> >>
> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >>
> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> ---
> >> For the record, here is the KTSAN report:
> >>
> >> ThreadSanitizer: data-race in _raw_spin_lock
> >>
> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >>
> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> ---
> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >>
> >> diff --git a/fs/locks.c b/fs/locks.c
> >> index 2a54c80..316e474 100644
> >> --- a/fs/locks.c
> >> +++ b/fs/locks.c
> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >>  static struct file_lock_context *
> >>  locks_get_lock_context(struct inode *inode, int type)
> >>  {
> >> -     struct file_lock_context *new;
> >> +     struct file_lock_context *ctx;
> >>
> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> +     /* paired with cmpxchg() below */
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> +     if (likely(ctx) || type == F_UNLCK)
> >>               goto out;
> >>
> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> -     if (!new)
> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> +     if (!ctx)
> >>               goto out;
> >>
> >> -     spin_lock_init(&new->flc_lock);
> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> +     spin_lock_init(&ctx->flc_lock);
> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >>
> >>       /*
> >>        * Assign the pointer if it's not already assigned. If it is, then
> >>        * free the context we just allocated.
> >>        */
> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> -             kmem_cache_free(flctx_cache, new);
> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> +             kmem_cache_free(flctx_cache, ctx);
> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> +     }
> >>  out:
> >> -     return inode->i_flctx;
> >> +     return ctx;
> >>  }
> >>
> >>  void
> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >>       struct file_lock_context *ctx;
> >>       struct inode *inode = file_inode(filp);
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >>               fl->fl_type = F_UNLCK;
> >>               return;
> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >>       struct file_lock_context *ctx;
> >>       struct file_lock *fl;
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >>               return 0;
> >>
> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >>  {
> >>       int error = 0;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       struct file_lock *new_fl, *fl, *tmp;
> >>       unsigned long break_time;
> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >>       new_fl->fl_flags = type;
> >>
> >>       /* typically we will check that ctx is non-NULL before calling */
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx) {
> >>               WARN_ON_ONCE(1);
> >>               return error;
> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >>  {
> >>       bool has_lease = false;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       struct file_lock *fl;
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >>               spin_lock(&ctx->flc_lock);
> >>               if (!list_empty(&ctx->flc_lease)) {
> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >>  {
> >>       struct file_lock *fl;
> >>       struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       int type = F_UNLCK;
> >>       LIST_HEAD(dispose);
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >>               spin_lock(&ctx->flc_lock);
> >>               time_out_leases(file_inode(filp), &dispose);
> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >>       struct file_lock *fl, *victim = NULL;
> >>       struct dentry *dentry = filp->f_path.dentry;
> >>       struct inode *inode = dentry->d_inode;
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>       LIST_HEAD(dispose);
> >>
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx) {
> >>               trace_generic_delete_lease(inode, NULL);
> >>               return error;
> >> @@ -2359,13 +2367,14 @@ out:
> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >>  {
> >>       struct file_lock lock;
> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> +     struct file_lock_context *ctx;
> >>
> >>       /*
> >>        * If there are no locks held on this file, we don't need to call
> >>        * posix_lock_file().  Another process could be setting a lock on this
> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >>        */
> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >>               return;
> >>
> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >>
> >>  /* The i_flctx must be valid when calling into here */
> >>  static void
> >> -locks_remove_flock(struct file *filp)
> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >>  {
> >>       struct file_lock fl = {
> >>               .fl_owner = filp,
> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >>               .fl_end = OFFSET_MAX,
> >>       };
> >>       struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >>
> >>       if (list_empty(&flctx->flc_flock))
> >>               return;
> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >>
> >>  /* The i_flctx must be valid when calling into here */
> >>  static void
> >> -locks_remove_lease(struct file *filp)
> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >>  {
> >> -     struct inode *inode = file_inode(filp);
> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >>       struct file_lock *fl, *tmp;
> >>       LIST_HEAD(dispose);
> >>
> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >>   */
> >>  void locks_remove_file(struct file *filp)
> >>  {
> >> -     if (!file_inode(filp)->i_flctx)
> >> +     struct file_lock_context *ctx;
> >> +
> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> +     if (!ctx)
> >>               return;
> >>
> >>       /* remove any OFD locks */
> >>       locks_remove_posix(filp, filp);
> >>
> >>       /* remove flock locks */
> >> -     locks_remove_flock(filp);
> >> +     locks_remove_flock(filp, ctx);
> >>
> >>       /* remove any leases */
> >> -     locks_remove_lease(filp);
> >> +     locks_remove_lease(filp, ctx);
> >>  }
> >>
> >>  /**
> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >>       struct file_lock_context *ctx;
> >>       int id = 0;
> >>
> >> -     ctx = inode->i_flctx;
> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >>       if (!ctx)
> >>               return;
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
>
Dmitry Vyukov Sept. 21, 2015, 11 a.m. UTC | #4
On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 12:53:40 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 09:43:06 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> cmpxchg() is a release operation which is correct. But it uses
>> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> and observe uninitialized garbage there. This in turn can lead
>> >> to corruption of ctx->flc_lock and other members.
>> >>
>> >
>> > I don't get this bit. The i_flctx field is initialized to zero when the
>> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> > ever see uninitialized garbage in there? It should either be NULL or a
>> > valid pointer, no?
>>
>> This is not about i_flctx pointer, this is about i_flctx object
>> contents (pointee).
>> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>>
>
> Again, I don't get it though. The cmpxchg happens after we've
> initialized the structure. Given that there are implicit full memory
> barriers before and after the cmpxchg(), how do you end up with another
> task seeing the pointer before it was ever initialized?
>
> The store should only happen after the initialization has occurred, and
> the loads in the other tasks shouldn't be able to see the results of
> that store until then. No?

If a reader looks at things in the opposite order, then it does not
matter how you store them. Reader still can observe them in the wrong
order.
Memory barriers are always a game of two. Writer needs to do stores in
the correct order and reader must do reads in the correct order. A
single memory barrier cannot possibly make any sense.


>> > If that'st the case, then most of the places where you're adding
>> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> > something.
>> >
>> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> a barrier in such context:
>> >> "A load-load control dependency requires a full read memory barrier".
>> >>
>> >
>> > The same file also says:
>> >
>> > "Any atomic operation that modifies some state in memory and returns information
>> > about the state (old or new) implies an SMP-conditional general memory barrier
>> > (smp_mb()) on each side of the actual operation (with the exception of
>> > explicit lock operations, described later).  These include:
>> >
>> > ...
>> > /* when succeeds */
>> > cmpxchg();
>> > "
>> >
>> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> > could the CPU hoist anything before that point?
>> >
>> > Note that I'm not saying that you're wrong, I just want to make sure I
>> > fully understand the problem before we go trying to fix it.
>> >
>> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> of other functions that can proceed concurrently with
>> >> locks_get_lock_context().
>> >>
>> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >>
>> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> ---
>> >> For the record, here is the KTSAN report:
>> >>
>> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >>
>> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >>
>> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> ---
>> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >>
>> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> index 2a54c80..316e474 100644
>> >> --- a/fs/locks.c
>> >> +++ b/fs/locks.c
>> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >>  static struct file_lock_context *
>> >>  locks_get_lock_context(struct inode *inode, int type)
>> >>  {
>> >> -     struct file_lock_context *new;
>> >> +     struct file_lock_context *ctx;
>> >>
>> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> +     /* paired with cmpxchg() below */
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> +     if (likely(ctx) || type == F_UNLCK)
>> >>               goto out;
>> >>
>> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> -     if (!new)
>> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> +     if (!ctx)
>> >>               goto out;
>> >>
>> >> -     spin_lock_init(&new->flc_lock);
>> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> +     spin_lock_init(&ctx->flc_lock);
>> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >>
>> >>       /*
>> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >>        * free the context we just allocated.
>> >>        */
>> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> -             kmem_cache_free(flctx_cache, new);
>> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> +     }
>> >>  out:
>> >> -     return inode->i_flctx;
>> >> +     return ctx;
>> >>  }
>> >>
>> >>  void
>> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >>       struct file_lock_context *ctx;
>> >>       struct inode *inode = file_inode(filp);
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >>               fl->fl_type = F_UNLCK;
>> >>               return;
>> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >>       struct file_lock_context *ctx;
>> >>       struct file_lock *fl;
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >>               return 0;
>> >>
>> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >>  {
>> >>       int error = 0;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       struct file_lock *new_fl, *fl, *tmp;
>> >>       unsigned long break_time;
>> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >>       new_fl->fl_flags = type;
>> >>
>> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx) {
>> >>               WARN_ON_ONCE(1);
>> >>               return error;
>> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >>  {
>> >>       bool has_lease = false;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       struct file_lock *fl;
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >>               spin_lock(&ctx->flc_lock);
>> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >>  {
>> >>       struct file_lock *fl;
>> >>       struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       int type = F_UNLCK;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >>               spin_lock(&ctx->flc_lock);
>> >>               time_out_leases(file_inode(filp), &dispose);
>> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >>       struct file_lock *fl, *victim = NULL;
>> >>       struct dentry *dentry = filp->f_path.dentry;
>> >>       struct inode *inode = dentry->d_inode;
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx) {
>> >>               trace_generic_delete_lease(inode, NULL);
>> >>               return error;
>> >> @@ -2359,13 +2367,14 @@ out:
>> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >>  {
>> >>       struct file_lock lock;
>> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> +     struct file_lock_context *ctx;
>> >>
>> >>       /*
>> >>        * If there are no locks held on this file, we don't need to call
>> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >>        */
>> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >>               return;
>> >>
>> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >>
>> >>  /* The i_flctx must be valid when calling into here */
>> >>  static void
>> >> -locks_remove_flock(struct file *filp)
>> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >>  {
>> >>       struct file_lock fl = {
>> >>               .fl_owner = filp,
>> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >>               .fl_end = OFFSET_MAX,
>> >>       };
>> >>       struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >>
>> >>       if (list_empty(&flctx->flc_flock))
>> >>               return;
>> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >>
>> >>  /* The i_flctx must be valid when calling into here */
>> >>  static void
>> >> -locks_remove_lease(struct file *filp)
>> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >>  {
>> >> -     struct inode *inode = file_inode(filp);
>> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >>       struct file_lock *fl, *tmp;
>> >>       LIST_HEAD(dispose);
>> >>
>> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >>   */
>> >>  void locks_remove_file(struct file *filp)
>> >>  {
>> >> -     if (!file_inode(filp)->i_flctx)
>> >> +     struct file_lock_context *ctx;
>> >> +
>> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> +     if (!ctx)
>> >>               return;
>> >>
>> >>       /* remove any OFD locks */
>> >>       locks_remove_posix(filp, filp);
>> >>
>> >>       /* remove flock locks */
>> >> -     locks_remove_flock(filp);
>> >> +     locks_remove_flock(filp, ctx);
>> >>
>> >>       /* remove any leases */
>> >> -     locks_remove_lease(filp);
>> >> +     locks_remove_lease(filp, ctx);
>> >>  }
>> >>
>> >>  /**
>> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >>       struct file_lock_context *ctx;
>> >>       int id = 0;
>> >>
>> >> -     ctx = inode->i_flctx;
>> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >>       if (!ctx)
>> >>               return;
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Jeff Layton Sept. 21, 2015, 11:17 a.m. UTC | #5
On Mon, 21 Sep 2015 13:00:04 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 12:53:40 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> > On Mon, 21 Sep 2015 09:43:06 +0200
> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >
> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> >> cmpxchg() is a release operation which is correct. But it uses
> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> >> and observe uninitialized garbage there. This in turn can lead
> >> >> to corruption of ctx->flc_lock and other members.
> >> >>
> >> >
> >> > I don't get this bit. The i_flctx field is initialized to zero when the
> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
> >> > ever see uninitialized garbage in there? It should either be NULL or a
> >> > valid pointer, no?
> >>
> >> This is not about i_flctx pointer, this is about i_flctx object
> >> contents (pointee).
> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> >>
> >
> > Again, I don't get it though. The cmpxchg happens after we've
> > initialized the structure. Given that there are implicit full memory
> > barriers before and after the cmpxchg(), how do you end up with another
> > task seeing the pointer before it was ever initialized?
> >
> > The store should only happen after the initialization has occurred, and
> > the loads in the other tasks shouldn't be able to see the results of
> > that store until then. No?
> 
> If a reader looks at things in the opposite order, then it does not
> matter how you store them. Reader still can observe them in the wrong
> order.
> Memory barriers are always a game of two. Writer needs to do stores in
> the correct order and reader must do reads in the correct order. A
> single memory barrier cannot possibly make any sense.
> 

I get that, but...isn't there a data dependency there? In order for the
reader to see bogus fields inside of i_flctx, it needs to be able to
see a valid pointer in i_flctx...and in order for that to occur the
store has to have occurred.

I guess the concern is that the reader CPU could stumble across some
memory that is eventually going to be part of i_flctx prior to loading
that pointer, and assume that its contents are still valid after the
pointer store has occurred? Is that the basic scenario?

> 
> >> > If that'st the case, then most of the places where you're adding
> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> >> > something.
> >> >
> >> >> Documentation/memory-barriers.txt explicitly requires to use
> >> >> a barrier in such context:
> >> >> "A load-load control dependency requires a full read memory barrier".
> >> >>
> >> >
> >> > The same file also says:
> >> >
> >> > "Any atomic operation that modifies some state in memory and returns information
> >> > about the state (old or new) implies an SMP-conditional general memory barrier
> >> > (smp_mb()) on each side of the actual operation (with the exception of
> >> > explicit lock operations, described later).  These include:
> >> >
> >> > ...
> >> > /* when succeeds */
> >> > cmpxchg();
> >> > "
> >> >
> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
> >> > could the CPU hoist anything before that point?
> >> >
> >> > Note that I'm not saying that you're wrong, I just want to make sure I
> >> > fully understand the problem before we go trying to fix it.
> >> >
> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> >> of other functions that can proceed concurrently with
> >> >> locks_get_lock_context().
> >> >>
> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >> >>
> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> >> ---
> >> >> For the record, here is the KTSAN report:
> >> >>
> >> >> ThreadSanitizer: data-race in _raw_spin_lock
> >> >>
> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >> >>
> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> >> ---
> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >> >>
> >> >> diff --git a/fs/locks.c b/fs/locks.c
> >> >> index 2a54c80..316e474 100644
> >> >> --- a/fs/locks.c
> >> >> +++ b/fs/locks.c
> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >> >>  static struct file_lock_context *
> >> >>  locks_get_lock_context(struct inode *inode, int type)
> >> >>  {
> >> >> -     struct file_lock_context *new;
> >> >> +     struct file_lock_context *ctx;
> >> >>
> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> >> +     /* paired with cmpxchg() below */
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> +     if (likely(ctx) || type == F_UNLCK)
> >> >>               goto out;
> >> >>
> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> -     if (!new)
> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> +     if (!ctx)
> >> >>               goto out;
> >> >>
> >> >> -     spin_lock_init(&new->flc_lock);
> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> >> +     spin_lock_init(&ctx->flc_lock);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >> >>
> >> >>       /*
> >> >>        * Assign the pointer if it's not already assigned. If it is, then
> >> >>        * free the context we just allocated.
> >> >>        */
> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> >> -             kmem_cache_free(flctx_cache, new);
> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> >> +             kmem_cache_free(flctx_cache, ctx);
> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> >> +     }
> >> >>  out:
> >> >> -     return inode->i_flctx;
> >> >> +     return ctx;
> >> >>  }
> >> >>
> >> >>  void
> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >> >>       struct file_lock_context *ctx;
> >> >>       struct inode *inode = file_inode(filp);
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >> >>               fl->fl_type = F_UNLCK;
> >> >>               return;
> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >> >>       struct file_lock_context *ctx;
> >> >>       struct file_lock *fl;
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >> >>               return 0;
> >> >>
> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >>  {
> >> >>       int error = 0;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       struct file_lock *new_fl, *fl, *tmp;
> >> >>       unsigned long break_time;
> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >>       new_fl->fl_flags = type;
> >> >>
> >> >>       /* typically we will check that ctx is non-NULL before calling */
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx) {
> >> >>               WARN_ON_ONCE(1);
> >> >>               return error;
> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >> >>  {
> >> >>       bool has_lease = false;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       struct file_lock *fl;
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >>               spin_lock(&ctx->flc_lock);
> >> >>               if (!list_empty(&ctx->flc_lease)) {
> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >> >>  {
> >> >>       struct file_lock *fl;
> >> >>       struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       int type = F_UNLCK;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >>               spin_lock(&ctx->flc_lock);
> >> >>               time_out_leases(file_inode(filp), &dispose);
> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >> >>       struct file_lock *fl, *victim = NULL;
> >> >>       struct dentry *dentry = filp->f_path.dentry;
> >> >>       struct inode *inode = dentry->d_inode;
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx) {
> >> >>               trace_generic_delete_lease(inode, NULL);
> >> >>               return error;
> >> >> @@ -2359,13 +2367,14 @@ out:
> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >> >>  {
> >> >>       struct file_lock lock;
> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> >> +     struct file_lock_context *ctx;
> >> >>
> >> >>       /*
> >> >>        * If there are no locks held on this file, we don't need to call
> >> >>        * posix_lock_file().  Another process could be setting a lock on this
> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >> >>        */
> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >> >>               return;
> >> >>
> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >> >>
> >> >>  /* The i_flctx must be valid when calling into here */
> >> >>  static void
> >> >> -locks_remove_flock(struct file *filp)
> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >> >>  {
> >> >>       struct file_lock fl = {
> >> >>               .fl_owner = filp,
> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >> >>               .fl_end = OFFSET_MAX,
> >> >>       };
> >> >>       struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >> >>
> >> >>       if (list_empty(&flctx->flc_flock))
> >> >>               return;
> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >> >>
> >> >>  /* The i_flctx must be valid when calling into here */
> >> >>  static void
> >> >> -locks_remove_lease(struct file *filp)
> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >> >>  {
> >> >> -     struct inode *inode = file_inode(filp);
> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >>       struct file_lock *fl, *tmp;
> >> >>       LIST_HEAD(dispose);
> >> >>
> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >> >>   */
> >> >>  void locks_remove_file(struct file *filp)
> >> >>  {
> >> >> -     if (!file_inode(filp)->i_flctx)
> >> >> +     struct file_lock_context *ctx;
> >> >> +
> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> +     if (!ctx)
> >> >>               return;
> >> >>
> >> >>       /* remove any OFD locks */
> >> >>       locks_remove_posix(filp, filp);
> >> >>
> >> >>       /* remove flock locks */
> >> >> -     locks_remove_flock(filp);
> >> >> +     locks_remove_flock(filp, ctx);
> >> >>
> >> >>       /* remove any leases */
> >> >> -     locks_remove_lease(filp);
> >> >> +     locks_remove_lease(filp, ctx);
> >> >>  }
> >> >>
> >> >>  /**
> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >> >>       struct file_lock_context *ctx;
> >> >>       int id = 0;
> >> >>
> >> >> -     ctx = inode->i_flctx;
> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >>       if (!ctx)
> >> >>               return;
> >> >>
> >> >
> >> >
> >> > --
> >> > Jeff Layton <jlayton@poochiereds.net>
> >>
> >>
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
>
Dmitry Vyukov Sept. 21, 2015, 11:38 a.m. UTC | #6
Yes. That processor is Alpha and that's documented in DATA DEPENDENCY
BARRIERS section of Documentation/memory-barriers.txt.



On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 13:00:04 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 12:53:40 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> > On Mon, 21 Sep 2015 09:43:06 +0200
>> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >
>> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> >> cmpxchg() is a release operation which is correct. But it uses
>> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> >> and observe uninitialized garbage there. This in turn can lead
>> >> >> to corruption of ctx->flc_lock and other members.
>> >> >>
>> >> >
>> >> > I don't get this bit. The i_flctx field is initialized to zero when the
>> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> >> > ever see uninitialized garbage in there? It should either be NULL or a
>> >> > valid pointer, no?
>> >>
>> >> This is not about i_flctx pointer, this is about i_flctx object
>> >> contents (pointee).
>> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>> >>
>> >
>> > Again, I don't get it though. The cmpxchg happens after we've
>> > initialized the structure. Given that there are implicit full memory
>> > barriers before and after the cmpxchg(), how do you end up with another
>> > task seeing the pointer before it was ever initialized?
>> >
>> > The store should only happen after the initialization has occurred, and
>> > the loads in the other tasks shouldn't be able to see the results of
>> > that store until then. No?
>>
>> If a reader looks at things in the opposite order, then it does not
>> matter how you store them. Reader still can observe them in the wrong
>> order.
>> Memory barriers are always a game of two. The writer needs to do stores in
>> the correct order and the reader must do reads in the correct order. A
>> single memory barrier cannot possibly make any sense.
>>
>
> I get that, but...isn't there a data dependency there? In order for the
> reader to see bogus fields inside of i_flctx, it needs to be able to
> see a valid pointer in i_flctx...and in order for that to occur the
> store has to have occurred.
>
> I guess the concern is that the reader CPU could stumble across some
> memory that is eventually going to be part of i_flctx prior to loading
> that pointer, and assume that its contents are still valid after the
> pointer store has occurred? Is that the basic scenario?
>
>>
>> >> > If that's the case, then most of the places where you're adding
>> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> >> > something.
>> >> >
>> >> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> >> a barrier in such context:
>> >> >> "A load-load control dependency requires a full read memory barrier".
>> >> >>
>> >> >
>> >> > The same file also says:
>> >> >
>> >> > "Any atomic operation that modifies some state in memory and returns information
>> >> > about the state (old or new) implies an SMP-conditional general memory barrier
>> >> > (smp_mb()) on each side of the actual operation (with the exception of
>> >> > explicit lock operations, described later).  These include:
>> >> >
>> >> > ...
>> >> > /* when succeeds */
>> >> > cmpxchg();
>> >> > "
>> >> >
>> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> >> > could the CPU hoist anything before that point?
>> >> >
>> >> > Note that I'm not saying that you're wrong, I just want to make sure I
>> >> > fully understand the problem before we go trying to fix it.
>> >> >
>> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> >> of other functions that can proceed concurrently with
>> >> >> locks_get_lock_context().
>> >> >>
>> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >> >>
>> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> >> ---
>> >> >> For the record, here is the KTSAN report:
>> >> >>
>> >> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >> >>
>> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >> >>
>> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> >> ---
>> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >> >>
>> >> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> >> index 2a54c80..316e474 100644
>> >> >> --- a/fs/locks.c
>> >> >> +++ b/fs/locks.c
>> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >> >>  static struct file_lock_context *
>> >> >>  locks_get_lock_context(struct inode *inode, int type)
>> >> >>  {
>> >> >> -     struct file_lock_context *new;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>
>> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> >> +     /* paired with cmpxchg() below */
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> +     if (likely(ctx) || type == F_UNLCK)
>> >> >>               goto out;
>> >> >>
>> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> -     if (!new)
>> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> +     if (!ctx)
>> >> >>               goto out;
>> >> >>
>> >> >> -     spin_lock_init(&new->flc_lock);
>> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> >> +     spin_lock_init(&ctx->flc_lock);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >> >>
>> >> >>       /*
>> >> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >> >>        * free the context we just allocated.
>> >> >>        */
>> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> >> -             kmem_cache_free(flctx_cache, new);
>> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> +     }
>> >> >>  out:
>> >> >> -     return inode->i_flctx;
>> >> >> +     return ctx;
>> >> >>  }
>> >> >>
>> >> >>  void
>> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >> >>       struct file_lock_context *ctx;
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >> >>               fl->fl_type = F_UNLCK;
>> >> >>               return;
>> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >> >>       struct file_lock_context *ctx;
>> >> >>       struct file_lock *fl;
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >> >>               return 0;
>> >> >>
>> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >>  {
>> >> >>       int error = 0;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       struct file_lock *new_fl, *fl, *tmp;
>> >> >>       unsigned long break_time;
>> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >>       new_fl->fl_flags = type;
>> >> >>
>> >> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx) {
>> >> >>               WARN_ON_ONCE(1);
>> >> >>               return error;
>> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >> >>  {
>> >> >>       bool has_lease = false;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       struct file_lock *fl;
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >>               spin_lock(&ctx->flc_lock);
>> >> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >> >>  {
>> >> >>       struct file_lock *fl;
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       int type = F_UNLCK;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >>               spin_lock(&ctx->flc_lock);
>> >> >>               time_out_leases(file_inode(filp), &dispose);
>> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >> >>       struct file_lock *fl, *victim = NULL;
>> >> >>       struct dentry *dentry = filp->f_path.dentry;
>> >> >>       struct inode *inode = dentry->d_inode;
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx) {
>> >> >>               trace_generic_delete_lease(inode, NULL);
>> >> >>               return error;
>> >> >> @@ -2359,13 +2367,14 @@ out:
>> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >> >>  {
>> >> >>       struct file_lock lock;
>> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> >> +     struct file_lock_context *ctx;
>> >> >>
>> >> >>       /*
>> >> >>        * If there are no locks held on this file, we don't need to call
>> >> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >> >>        */
>> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >> >>               return;
>> >> >>
>> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >> >>
>> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >>  static void
>> >> >> -locks_remove_flock(struct file *filp)
>> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >> >>  {
>> >> >>       struct file_lock fl = {
>> >> >>               .fl_owner = filp,
>> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >> >>               .fl_end = OFFSET_MAX,
>> >> >>       };
>> >> >>       struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >> >>
>> >> >>       if (list_empty(&flctx->flc_flock))
>> >> >>               return;
>> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >> >>
>> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >>  static void
>> >> >> -locks_remove_lease(struct file *filp)
>> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >> >>  {
>> >> >> -     struct inode *inode = file_inode(filp);
>> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >>       struct file_lock *fl, *tmp;
>> >> >>       LIST_HEAD(dispose);
>> >> >>
>> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >> >>   */
>> >> >>  void locks_remove_file(struct file *filp)
>> >> >>  {
>> >> >> -     if (!file_inode(filp)->i_flctx)
>> >> >> +     struct file_lock_context *ctx;
>> >> >> +
>> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> +     if (!ctx)
>> >> >>               return;
>> >> >>
>> >> >>       /* remove any OFD locks */
>> >> >>       locks_remove_posix(filp, filp);
>> >> >>
>> >> >>       /* remove flock locks */
>> >> >> -     locks_remove_flock(filp);
>> >> >> +     locks_remove_flock(filp, ctx);
>> >> >>
>> >> >>       /* remove any leases */
>> >> >> -     locks_remove_lease(filp);
>> >> >> +     locks_remove_lease(filp, ctx);
>> >> >>  }
>> >> >>
>> >> >>  /**
>> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >> >>       struct file_lock_context *ctx;
>> >> >>       int id = 0;
>> >> >>
>> >> >> -     ctx = inode->i_flctx;
>> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >>       if (!ctx)
>> >> >>               return;
>> >> >>
>> >> >
>> >> >
>> >> > --
>> >> > Jeff Layton <jlayton@poochiereds.net>
>> >>
>> >>
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
Jeff Layton Sept. 21, 2015, 11:44 a.m. UTC | #7
On Mon, 21 Sep 2015 13:38:45 +0200
Dmitry Vyukov <dvyukov@google.com> wrote:

> Yes. That processor is Alpha, and that's documented in the DATA DEPENDENCY
> BARRIERS section of Documentation/memory-barriers.txt.
> 
> 

Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
and merge it for v4.4. Let me know though if you think this needs to go
in sooner.

Thanks,
Jeff

> 
> On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> > On Mon, 21 Sep 2015 13:00:04 +0200
> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> >> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> > On Mon, 21 Sep 2015 12:53:40 +0200
> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >
> >> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> >> >> > On Mon, 21 Sep 2015 09:43:06 +0200
> >> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
> >> >> >
> >> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
> >> >> >> cmpxchg() is a release operation which is correct. But it uses
> >> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
> >> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
> >> >> >> and observe uninitialized garbage there. This in turn can lead
> >> >> >> to corruption of ctx->flc_lock and other members.
> >> >> >>
> >> >> >
> >> >> > I don't get this bit. The i_flctx field is initialized to zero when the
> >> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
> >> >> > ever see uninitialized garbage in there? It should either be NULL or a
> >> >> > valid pointer, no?
> >> >>
> >> >> This is not about i_flctx pointer, this is about i_flctx object
> >> >> contents (pointee).
> >> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
> >> >>
> >> >
> >> > Again, I don't get it though. The cmpxchg happens after we've
> >> > initialized the structure. Given that there are implicit full memory
> >> > barriers before and after the cmpxchg(), how do you end up with another
> >> > task seeing the pointer before it was ever initialized?
> >> >
> >> > The store should only happen after the initialization has occurred, and
> >> > the loads in the other tasks shouldn't be able to see the results of
> >> > that store until then. No?
> >>
> >> If a reader looks at things in the opposite order, then it does not
> >> matter how you store them. Reader still can observe them in the wrong
> >> order.
> >> Memory barriers are always a game of two. The writer needs to do stores in
> >> the correct order and the reader must do reads in the correct order. A
> >> single memory barrier cannot possibly make any sense.
> >>
> >
> > I get that, but...isn't there a data dependency there? In order for the
> > reader to see bogus fields inside of i_flctx, it needs to be able to
> > see a valid pointer in i_flctx...and in order for that to occur the
> > store has to have occurred.
> >
> > I guess the concern is that the reader CPU could stumble across some
> > memory that is eventually going to be part of i_flctx prior to loading
> > that pointer, and assume that its contents are still valid after the
> > pointer store has occurred? Is that the basic scenario?
> >
> >>
> >> >> > If that's the case, then most of the places where you're adding
> >> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
> >> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
> >> >> > something.
> >> >> >
> >> >> >> Documentation/memory-barriers.txt explicitly requires to use
> >> >> >> a barrier in such context:
> >> >> >> "A load-load control dependency requires a full read memory barrier".
> >> >> >>
> >> >> >
> >> >> > The same file also says:
> >> >> >
> >> >> > "Any atomic operation that modifies some state in memory and returns information
> >> >> > about the state (old or new) implies an SMP-conditional general memory barrier
> >> >> > (smp_mb()) on each side of the actual operation (with the exception of
> >> >> > explicit lock operations, described later).  These include:
> >> >> >
> >> >> > ...
> >> >> > /* when succeeds */
> >> >> > cmpxchg();
> >> >> > "
> >> >> >
> >> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
> >> >> > could the CPU hoist anything before that point?
> >> >> >
> >> >> > Note that I'm not saying that you're wrong, I just want to make sure I
> >> >> > fully understand the problem before we go trying to fix it.
> >> >> >
> >> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
> >> >> >> of other functions that can proceed concurrently with
> >> >> >> locks_get_lock_context().
> >> >> >>
> >> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
> >> >> >>
> >> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
> >> >> >> ---
> >> >> >> For the record, here is the KTSAN report:
> >> >> >>
> >> >> >> ThreadSanitizer: data-race in _raw_spin_lock
> >> >> >>
> >> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
> >> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
> >> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
> >> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
> >> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
> >> >> >>
> >> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
> >> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
> >> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
> >> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
> >> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
> >> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
> >> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
> >> >> >> ---
> >> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
> >> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
> >> >> >>
> >> >> >> diff --git a/fs/locks.c b/fs/locks.c
> >> >> >> index 2a54c80..316e474 100644
> >> >> >> --- a/fs/locks.c
> >> >> >> +++ b/fs/locks.c
> >> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
> >> >> >>  static struct file_lock_context *
> >> >> >>  locks_get_lock_context(struct inode *inode, int type)
> >> >> >>  {
> >> >> >> -     struct file_lock_context *new;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>
> >> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
> >> >> >> +     /* paired with cmpxchg() below */
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >> +     if (likely(ctx) || type == F_UNLCK)
> >> >> >>               goto out;
> >> >> >>
> >> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> >> -     if (!new)
> >> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
> >> >> >> +     if (!ctx)
> >> >> >>               goto out;
> >> >> >>
> >> >> >> -     spin_lock_init(&new->flc_lock);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
> >> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
> >> >> >> +     spin_lock_init(&ctx->flc_lock);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
> >> >> >>
> >> >> >>       /*
> >> >> >>        * Assign the pointer if it's not already assigned. If it is, then
> >> >> >>        * free the context we just allocated.
> >> >> >>        */
> >> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
> >> >> >> -             kmem_cache_free(flctx_cache, new);
> >> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
> >> >> >> +             kmem_cache_free(flctx_cache, ctx);
> >> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >> +     }
> >> >> >>  out:
> >> >> >> -     return inode->i_flctx;
> >> >> >> +     return ctx;
> >> >> >>  }
> >> >> >>
> >> >> >>  void
> >> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
> >> >> >>               fl->fl_type = F_UNLCK;
> >> >> >>               return;
> >> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       struct file_lock *fl;
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
> >> >> >>               return 0;
> >> >> >>
> >> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
> >> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >> >>  {
> >> >> >>       int error = 0;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       struct file_lock *new_fl, *fl, *tmp;
> >> >> >>       unsigned long break_time;
> >> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
> >> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
> >> >> >>       new_fl->fl_flags = type;
> >> >> >>
> >> >> >>       /* typically we will check that ctx is non-NULL before calling */
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx) {
> >> >> >>               WARN_ON_ONCE(1);
> >> >> >>               return error;
> >> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
> >> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
> >> >> >>  {
> >> >> >>       bool has_lease = false;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       struct file_lock *fl;
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >> >>               spin_lock(&ctx->flc_lock);
> >> >> >>               if (!list_empty(&ctx->flc_lease)) {
> >> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
> >> >> >>  {
> >> >> >>       struct file_lock *fl;
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       int type = F_UNLCK;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
> >> >> >>               spin_lock(&ctx->flc_lock);
> >> >> >>               time_out_leases(file_inode(filp), &dispose);
> >> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
> >> >> >>       struct file_lock *fl, *victim = NULL;
> >> >> >>       struct dentry *dentry = filp->f_path.dentry;
> >> >> >>       struct inode *inode = dentry->d_inode;
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx) {
> >> >> >>               trace_generic_delete_lease(inode, NULL);
> >> >> >>               return error;
> >> >> >> @@ -2359,13 +2367,14 @@ out:
> >> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
> >> >> >>  {
> >> >> >>       struct file_lock lock;
> >> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >>
> >> >> >>       /*
> >> >> >>        * If there are no locks held on this file, we don't need to call
> >> >> >>        * posix_lock_file().  Another process could be setting a lock on this
> >> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
> >> >> >>        */
> >> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
> >> >> >>               return;
> >> >> >>
> >> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
> >> >> >>
> >> >> >>  /* The i_flctx must be valid when calling into here */
> >> >> >>  static void
> >> >> >> -locks_remove_flock(struct file *filp)
> >> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
> >> >> >>  {
> >> >> >>       struct file_lock fl = {
> >> >> >>               .fl_owner = filp,
> >> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
> >> >> >>               .fl_end = OFFSET_MAX,
> >> >> >>       };
> >> >> >>       struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
> >> >> >>
> >> >> >>       if (list_empty(&flctx->flc_flock))
> >> >> >>               return;
> >> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
> >> >> >>
> >> >> >>  /* The i_flctx must be valid when calling into here */
> >> >> >>  static void
> >> >> >> -locks_remove_lease(struct file *filp)
> >> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
> >> >> >>  {
> >> >> >> -     struct inode *inode = file_inode(filp);
> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
> >> >> >>       struct file_lock *fl, *tmp;
> >> >> >>       LIST_HEAD(dispose);
> >> >> >>
> >> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
> >> >> >>   */
> >> >> >>  void locks_remove_file(struct file *filp)
> >> >> >>  {
> >> >> >> -     if (!file_inode(filp)->i_flctx)
> >> >> >> +     struct file_lock_context *ctx;
> >> >> >> +
> >> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
> >> >> >> +     if (!ctx)
> >> >> >>               return;
> >> >> >>
> >> >> >>       /* remove any OFD locks */
> >> >> >>       locks_remove_posix(filp, filp);
> >> >> >>
> >> >> >>       /* remove flock locks */
> >> >> >> -     locks_remove_flock(filp);
> >> >> >> +     locks_remove_flock(filp, ctx);
> >> >> >>
> >> >> >>       /* remove any leases */
> >> >> >> -     locks_remove_lease(filp);
> >> >> >> +     locks_remove_lease(filp, ctx);
> >> >> >>  }
> >> >> >>
> >> >> >>  /**
> >> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
> >> >> >>       struct file_lock_context *ctx;
> >> >> >>       int id = 0;
> >> >> >>
> >> >> >> -     ctx = inode->i_flctx;
> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
> >> >> >>       if (!ctx)
> >> >> >>               return;
> >> >> >>
> >> >> >
> >> >> >
> >> >> > --
> >> >> > Jeff Layton <jlayton@poochiereds.net>
> >> >>
> >> >>
> >> >>
> >> >
> >> >
> >> > --
> >> > Jeff Layton <jlayton@poochiereds.net>
> >>
> >>
> >>
> >
> >
> > --
> > Jeff Layton <jlayton@poochiereds.net>
> 
> 
>
Dmitry Vyukov Sept. 21, 2015, 11:53 a.m. UTC | #8
No hurry on my side.
Thanks

On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 21 Sep 2015 13:38:45 +0200
> Dmitry Vyukov <dvyukov@google.com> wrote:
>
>> Yes. That processor is Alpha, and that's documented in the DATA DEPENDENCY
>> BARRIERS section of Documentation/memory-barriers.txt.
>>
>>
>
> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
> and merge it for v4.4. Let me know though if you think this needs to go
> in sooner.
>
> Thanks,
> Jeff
>
>>
>> On Mon, Sep 21, 2015 at 1:17 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> > On Mon, 21 Sep 2015 13:00:04 +0200
>> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >
>> >> On Mon, Sep 21, 2015 at 12:56 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> > On Mon, 21 Sep 2015 12:53:40 +0200
>> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >
>> >> >> On Mon, Sep 21, 2015 at 12:50 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> >> >> > On Mon, 21 Sep 2015 09:43:06 +0200
>> >> >> > Dmitry Vyukov <dvyukov@google.com> wrote:
>> >> >> >
>> >> >> >> locks_get_lock_context() uses cmpxchg() to install i_flctx.
>> >> >> >> cmpxchg() is a release operation which is correct. But it uses
>> >> >> >> a plain load to load i_flctx. This is incorrect. Subsequent loads
>> >> >> >> from i_flctx can hoist above the load of i_flctx pointer itself
>> >> >> >> and observe uninitialized garbage there. This in turn can lead
>> >> >> >> to corruption of ctx->flc_lock and other members.
>> >> >> >>
>> >> >> >
>> >> >> > I don't get this bit. The i_flctx field is initialized to zero when the
>> >> >> > inode is allocated, and we only assign it with cmpxchg(). How would you
>> >> >> > ever see uninitialized garbage in there? It should either be NULL or a
>> >> >> > valid pointer, no?
>> >> >>
>> >> >> This is not about i_flctx pointer, this is about i_flctx object
>> >> >> contents (pointee).
>> >> >> Readers can see a non-NULL pointer, but read garbage from *i_flctx.
>> >> >>
>> >> >
>> >> > Again, I don't get it though. The cmpxchg happens after we've
>> >> > initialized the structure. Given that there are implicit full memory
>> >> > barriers before and after the cmpxchg(), how do you end up with another
>> >> > task seeing the pointer before it was ever initialized?
>> >> >
>> >> > The store should only happen after the initialization has occurred, and
>> >> > the loads in the other tasks shouldn't be able to see the results of
>> >> > that store until then. No?
>> >>
>> >> If a reader looks at things in the opposite order, then it does not
>> >> matter how you store them. Reader still can observe them in the wrong
>> >> order.
>> >> Memory barriers are always a game of two. The writer needs to do stores in
>> >> the correct order and the reader must do reads in the correct order. A
>> >> single memory barrier cannot possibly make any sense.
>> >>
>> >
>> > I get that, but...isn't there a data dependency there? In order for the
>> > reader to see bogus fields inside of i_flctx, it needs to be able to
>> > see a valid pointer in i_flctx...and in order for that to occur the
>> > store has to have occurred.
>> >
>> > I guess the concern is that the reader CPU could stumble across some
>> > memory that is eventually going to be part of i_flctx prior to loading
>> > that pointer, and assume that its contents are still valid after the
>> > pointer store has occurred? Is that the basic scenario?
>> >
>> >>
>> >> >> > If that's the case, then most of the places where you're adding
>> >> >> > smp_load_acquire are places that can tolerate seeing NULL there. e.g.
>> >> >> > you have racing LOCK_EX/LOCK_UN flock() calls in different tasks or
>> >> >> > something.
>> >> >> >
>> >> >> >> Documentation/memory-barriers.txt explicitly requires to use
>> >> >> >> a barrier in such context:
>> >> >> >> "A load-load control dependency requires a full read memory barrier".
>> >> >> >>
>> >> >> >
>> >> >> > The same file also says:
>> >> >> >
>> >> >> > "Any atomic operation that modifies some state in memory and returns information
>> >> >> > about the state (old or new) implies an SMP-conditional general memory barrier
>> >> >> > (smp_mb()) on each side of the actual operation (with the exception of
>> >> >> > explicit lock operations, described later).  These include:
>> >> >> >
>> >> >> > ...
>> >> >> > /* when succeeds */
>> >> >> > cmpxchg();
>> >> >> > "
>> >> >> >
>> >> >> > If there is an implied smp_mb() before and after the cmpxchg(), how
>> >> >> > could the CPU hoist anything before that point?
>> >> >> >
>> >> >> > Note that I'm not saying that you're wrong, I just want to make sure I
>> >> >> > fully understand the problem before we go trying to fix it.
>> >> >> >
>> >> >> >> Use smp_load_acquire() in locks_get_lock_context() and in bunch
>> >> >> >> of other functions that can proceed concurrently with
>> >> >> >> locks_get_lock_context().
>> >> >> >>
>> >> >> >> The data race was found with KernelThreadSanitizer (KTSAN).
>> >> >> >>
>> >> >> >> Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
>> >> >> >> ---
>> >> >> >> For the record, here is the KTSAN report:
>> >> >> >>
>> >> >> >> ThreadSanitizer: data-race in _raw_spin_lock
>> >> >> >>
>> >> >> >> Read at 0xffff8800bae50da0 of size 1 by thread 3060 on CPU 2:
>> >> >> >>  [<     inline     >] __raw_spin_lock include/linux/spinlock_api_smp.h:158
>> >> >> >>  [<ffffffff81ee37d0>] _raw_spin_lock+0x50/0x70 kernel/locking/spinlock.c:151
>> >> >> >>  [<     inline     >] spin_lock include/linux/spinlock.h:312
>> >> >> >>  [<ffffffff812e1187>] flock_lock_inode+0xb7/0x440 fs/locks.c:895
>> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x23
>> >> >> >>
>> >> >> >> Previous write at 0xffff8800bae50da0 of size 8 by thread 3063 on CPU 8:
>> >> >> >>  [<ffffffff81248628>] kmem_cache_alloc+0xd8/0x2f0 mm/slab.c:3420
>> >> >> >>  [<ffffffff812ddba0>] locks_get_lock_context+0x60/0x140 fs/locks.c:213
>> >> >> >>  [<ffffffff812e112a>] flock_lock_inode+0x5a/0x440 fs/locks.c:882
>> >> >> >>  [<ffffffff812e16e6>] flock_lock_inode_wait+0x46/0x110 fs/locks.c:1871
>> >> >> >>  [<     inline     >] flock_lock_file_wait include/linux/fs.h:1219
>> >> >> >>  [<     inline     >] SYSC_flock fs/locks.c:1941
>> >> >> >>  [<ffffffff812e2814>] SyS_flock+0x224/0x230 fs/locks.c:1904
>> >> >> >>  [<ffffffff81ee3e11>] entry_SYSCALL_64_fastpath+0x31/0x95 arch/x86/entry/entry_64.S:188
>> >> >> >> ---
>> >> >> >>  fs/locks.c | 63 +++++++++++++++++++++++++++++++++++---------------------------
>> >> >> >>  1 file changed, 36 insertions(+), 27 deletions(-)
>> >> >> >>
>> >> >> >> diff --git a/fs/locks.c b/fs/locks.c
>> >> >> >> index 2a54c80..316e474 100644
>> >> >> >> --- a/fs/locks.c
>> >> >> >> +++ b/fs/locks.c
>> >> >> >> @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
>> >> >> >>  static struct file_lock_context *
>> >> >> >>  locks_get_lock_context(struct inode *inode, int type)
>> >> >> >>  {
>> >> >> >> -     struct file_lock_context *new;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>
>> >> >> >> -     if (likely(inode->i_flctx) || type == F_UNLCK)
>> >> >> >> +     /* paired with cmpxchg() below */
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >> +     if (likely(ctx) || type == F_UNLCK)
>> >> >> >>               goto out;
>> >> >> >>
>> >> >> >> -     new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> >> -     if (!new)
>> >> >> >> +     ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
>> >> >> >> +     if (!ctx)
>> >> >> >>               goto out;
>> >> >> >>
>> >> >> >> -     spin_lock_init(&new->flc_lock);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_flock);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_posix);
>> >> >> >> -     INIT_LIST_HEAD(&new->flc_lease);
>> >> >> >> +     spin_lock_init(&ctx->flc_lock);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_flock);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_posix);
>> >> >> >> +     INIT_LIST_HEAD(&ctx->flc_lease);
>> >> >> >>
>> >> >> >>       /*
>> >> >> >>        * Assign the pointer if it's not already assigned. If it is, then
>> >> >> >>        * free the context we just allocated.
>> >> >> >>        */
>> >> >> >> -     if (cmpxchg(&inode->i_flctx, NULL, new))
>> >> >> >> -             kmem_cache_free(flctx_cache, new);
>> >> >> >> +     if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
>> >> >> >> +             kmem_cache_free(flctx_cache, ctx);
>> >> >> >> +             ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >> +     }
>> >> >> >>  out:
>> >> >> >> -     return inode->i_flctx;
>> >> >> >> +     return ctx;
>> >> >> >>  }
>> >> >> >>
>> >> >> >>  void
>> >> >> >> @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix)) {
>> >> >> >>               fl->fl_type = F_UNLCK;
>> >> >> >>               return;
>> >> >> >> @@ -1203,7 +1207,7 @@ int locks_mandatory_locked(struct file *file)
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *fl;
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx || list_empty_careful(&ctx->flc_posix))
>> >> >> >>               return 0;
>> >> >> >>
>> >> >> >> @@ -1388,7 +1392,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
>> >> >> >>  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >> >>  {
>> >> >> >>       int error = 0;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *new_fl, *fl, *tmp;
>> >> >> >>       unsigned long break_time;
>> >> >> >>       int want_write = (mode & O_ACCMODE) != O_RDONLY;
>> >> >> >> @@ -1400,6 +1404,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>> >> >> >>       new_fl->fl_flags = type;
>> >> >> >>
>> >> >> >>       /* typically we will check that ctx is non-NULL before calling */
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx) {
>> >> >> >>               WARN_ON_ONCE(1);
>> >> >> >>               return error;
>> >> >> >> @@ -1494,9 +1499,10 @@ EXPORT_SYMBOL(__break_lease);
>> >> >> >>  void lease_get_mtime(struct inode *inode, struct timespec *time)
>> >> >> >>  {
>> >> >> >>       bool has_lease = false;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       struct file_lock *fl;
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >> >>               spin_lock(&ctx->flc_lock);
>> >> >> >>               if (!list_empty(&ctx->flc_lease)) {
>> >> >> >> @@ -1543,10 +1549,11 @@ int fcntl_getlease(struct file *filp)
>> >> >> >>  {
>> >> >> >>       struct file_lock *fl;
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       int type = F_UNLCK;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (ctx && !list_empty_careful(&ctx->flc_lease)) {
>> >> >> >>               spin_lock(&ctx->flc_lock);
>> >> >> >>               time_out_leases(file_inode(filp), &dispose);
>> >> >> >> @@ -1713,9 +1720,10 @@ static int generic_delete_lease(struct file *filp, void *owner)
>> >> >> >>       struct file_lock *fl, *victim = NULL;
>> >> >> >>       struct dentry *dentry = filp->f_path.dentry;
>> >> >> >>       struct inode *inode = dentry->d_inode;
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx) {
>> >> >> >>               trace_generic_delete_lease(inode, NULL);
>> >> >> >>               return error;
>> >> >> >> @@ -2359,13 +2367,14 @@ out:
>> >> >> >>  void locks_remove_posix(struct file *filp, fl_owner_t owner)
>> >> >> >>  {
>> >> >> >>       struct file_lock lock;
>> >> >> >> -     struct file_lock_context *ctx = file_inode(filp)->i_flctx;
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >>
>> >> >> >>       /*
>> >> >> >>        * If there are no locks held on this file, we don't need to call
>> >> >> >>        * posix_lock_file().  Another process could be setting a lock on this
>> >> >> >>        * file at the same time, but we wouldn't remove that lock anyway.
>> >> >> >>        */
>> >> >> >> +     ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> >>       if (!ctx || list_empty(&ctx->flc_posix))
>> >> >> >>               return;
>> >> >> >>
>> >> >> >> @@ -2389,7 +2398,7 @@ EXPORT_SYMBOL(locks_remove_posix);
>> >> >> >>
>> >> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >> >>  static void
>> >> >> >> -locks_remove_flock(struct file *filp)
>> >> >> >> +locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
>> >> >> >>  {
>> >> >> >>       struct file_lock fl = {
>> >> >> >>               .fl_owner = filp,
>> >> >> >> @@ -2400,7 +2409,6 @@ locks_remove_flock(struct file *filp)
>> >> >> >>               .fl_end = OFFSET_MAX,
>> >> >> >>       };
>> >> >> >>       struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *flctx = inode->i_flctx;
>> >> >> >>
>> >> >> >>       if (list_empty(&flctx->flc_flock))
>> >> >> >>               return;
>> >> >> >> @@ -2416,10 +2424,8 @@ locks_remove_flock(struct file *filp)
>> >> >> >>
>> >> >> >>  /* The i_flctx must be valid when calling into here */
>> >> >> >>  static void
>> >> >> >> -locks_remove_lease(struct file *filp)
>> >> >> >> +locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
>> >> >> >>  {
>> >> >> >> -     struct inode *inode = file_inode(filp);
>> >> >> >> -     struct file_lock_context *ctx = inode->i_flctx;
>> >> >> >>       struct file_lock *fl, *tmp;
>> >> >> >>       LIST_HEAD(dispose);
>> >> >> >>
>> >> >> >> @@ -2439,17 +2445,20 @@ locks_remove_lease(struct file *filp)
>> >> >> >>   */
>> >> >> >>  void locks_remove_file(struct file *filp)
>> >> >> >>  {
>> >> >> >> -     if (!file_inode(filp)->i_flctx)
>> >> >> >> +     struct file_lock_context *ctx;
>> >> >> >> +
>> >> >> >> +     ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
>> >> >> >> +     if (!ctx)
>> >> >> >>               return;
>> >> >> >>
>> >> >> >>       /* remove any OFD locks */
>> >> >> >>       locks_remove_posix(filp, filp);
>> >> >> >>
>> >> >> >>       /* remove flock locks */
>> >> >> >> -     locks_remove_flock(filp);
>> >> >> >> +     locks_remove_flock(filp, ctx);
>> >> >> >>
>> >> >> >>       /* remove any leases */
>> >> >> >> -     locks_remove_lease(filp);
>> >> >> >> +     locks_remove_lease(filp, ctx);
>> >> >> >>  }
>> >> >> >>
>> >> >> >>  /**
>> >> >> >> @@ -2616,7 +2625,7 @@ void show_fd_locks(struct seq_file *f,
>> >> >> >>       struct file_lock_context *ctx;
>> >> >> >>       int id = 0;
>> >> >> >>
>> >> >> >> -     ctx = inode->i_flctx;
>> >> >> >> +     ctx = smp_load_acquire(&inode->i_flctx);
>> >> >> >>       if (!ctx)
>> >> >> >>               return;
>> >> >> >>
>> >> >> >
>> >> >> >
>> >> >> > --
>> >> >> > Jeff Layton <jlayton@poochiereds.net>
>> >> >>
>> >> >>
>> >> >>
>> >> >
>> >> >
>> >> > --
>> >> > Jeff Layton <jlayton@poochiereds.net>
>> >>
>> >>
>> >>
>> >
>> >
>> > --
>> > Jeff Layton <jlayton@poochiereds.net>
>>
>>
>>
>
>
> --
> Jeff Layton <jlayton@poochiereds.net>
William Dauchy Oct. 19, 2015, 3:18 p.m. UTC | #9
Hello Dmitry,

On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
> and merge it for v4.4. Let me know though if you think this needs to go
> in sooner.

I am getting a null deref on a v4.1.x
Do you think your patch could fix the following trace? It looks
similar in my opinion.

BUG: unable to handle kernel NULL pointer dereference at 00000000000001c8
IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
PGD 0
Oops: 0000 [#1] SMP
CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
Workqueue: rpciod ffffffff8164fff0
task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
locks_get_lock_context+0x3/0xc0
RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
FS:  0000000000000000(0000) GS:ffff88103fc20000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
Stack:
 ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
 ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
 ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
Call Trace:
 [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
 [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
 [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
 [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
 [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
 [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
 [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
 [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
 [<ffffffff81086095>] process_one_work+0x1a5/0x450
 [<ffffffff81086024>] ? process_one_work+0x134/0x450
 [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
 [<ffffffff81086340>] ? process_one_work+0x450/0x450
 [<ffffffff81086340>] ? process_one_work+0x450/0x450
 [<ffffffff8108d777>] kthread+0xf7/0x110
 [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
 [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
 [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02 <48>
8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
 RSP <ffff881036007bb0>
CR2: 00000000000001c8
---[ end trace 2da9686dda1b5574 ]---


Thanks,
Dmitry Vyukov Oct. 19, 2015, 3:27 p.m. UTC | #10
I would expect that you see something else.
The issue that I fixed would fire very infrequently and unreproducibly.


On Mon, Oct 19, 2015 at 5:18 PM, William Dauchy <wdauchy@gmail.com> wrote:
> Hello Dmitry,
>
> On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> Ok, thanks for the explanation. Patch looks fine to me. I'll go ahead
>> and merge it for v4.4. Let me know though if you think this needs to go
>> in sooner.
>
> I am getting a null deref on a v4.1.x
> Do you think your patch could fix the following trace? It looks
> similar in my opinion.
>
> BUG: unable to handle kernel NULL pointer dereference at 00000000000001c8
> IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
> PGD 0
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
> Workqueue: rpciod ffffffff8164fff0
> task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
> RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
> locks_get_lock_context+0x3/0xc0
> RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
> RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
> RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
> RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
> R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
> R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
> FS:  0000000000000000(0000) GS:ffff88103fc20000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
> Stack:
>  ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
>  ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
>  ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
> Call Trace:
>  [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
>  [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
>  [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
>  [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
>  [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
>  [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
>  [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
>  [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
>  [<ffffffff81086095>] process_one_work+0x1a5/0x450
>  [<ffffffff81086024>] ? process_one_work+0x134/0x450
>  [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff8108d777>] kthread+0xf7/0x110
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
>  [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
> Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
> c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02 <48>
> 8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
> RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
>  RSP <ffff881036007bb0>
> CR2: 00000000000001c8
> ---[ end trace 2da9686dda1b5574 ]---
>
>
> Thanks,
> --
> William
>
> --
> You received this message because you are subscribed to the Google Groups "ktsan" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to ktsan+unsubscribe@googlegroups.com.
> To post to this group, send email to ktsan@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/ktsan/CAJ75kXYAFAWUh6RScKqsFbt4D462Rbc3%2BW3dd6x4-bh7EOKXcQ%40mail.gmail.com.
> For more options, visit https://groups.google.com/d/optout.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
William Dauchy Oct. 19, 2015, 3:36 p.m. UTC | #11
Hello Dmitry,

On Mon, Oct 19, 2015 at 5:27 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
> I would expect that you see something else.
> The issue that I fixed would fire very infrequently and unreproducibly.

Thanks for the quick reply. I will open another thread for this issue.
Jeff Layton Oct. 19, 2015, 4:44 p.m. UTC | #12
On Mon, 2015-10-19 at 17:18 +0200, William Dauchy wrote:
> Hello Dmitry,
> 
> On Mon, Sep 21, 2015 at 1:44 PM, Jeff Layton <jlayton@poochiereds.net
> > wrote:
> > Ok, thanks for the explanation. Patch looks fine to me. I'll go
> > ahead
> > and merge it for v4.4. Let me know though if you think this needs
> > to go
> > in sooner.
> 
> I am getting a null deref on a v4.1.x
> Do you think your patch could fix the following trace? It looks
> similar in my opinion.
> 
> BUG: unable to handle kernel NULL pointer dereference at
> 00000000000001c8
> IP: [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
> PGD 0
> Oops: 0000 [#1] SMP
> CPU: 1 PID: 1773 Comm: kworker/1:1H Not tainted 4.1.11-rc1 #1
> Workqueue: rpciod ffffffff8164fff0
> task: ffff8810374deba0 ti: ffff8810374df150 task.ti: ffff8810374df150
> RIP: 0010:[<ffffffff811d0cf3>]  [<ffffffff811d0cf3>]
> locks_get_lock_context+0x3/0xc0
> RSP: 0000:ffff881036007bb0  EFLAGS: 00010246
> RAX: ffff881036007c30 RBX: ffff881001981880 RCX: 0000000000000002
> RDX: 00000000000006ed RSI: 0000000000000002 RDI: 0000000000000000
> RBP: ffff881036007c08 R08: 0000000000000000 R09: 0000000000000001
> R10: 0000000000000000 R11: ffff88101db69948 R12: ffff8810019818d8
> R13: ffff881036007bc8 R14: ffff880e225d81c0 R15: ffff881edfd2b400
> FS:  0000000000000000(0000) GS:ffff88103fc20000(0000)
> knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000000001c8 CR3: 000000000169b000 CR4: 00000000000606f0
> Stack:
>  ffffffff811d2710 ffff881036007bc8 ffffffff819f1af1 ffff881036007bc8
>  ffff881036007bc8 ffff881036007c08 ffff881001981880 ffff8810019818d8
>  ffff881036007c48 ffff880e225d81c0 ffff881edfd2b400 ffff881036007c88
> Call Trace:
>  [<ffffffff811d2710>] ? flock_lock_file+0x30/0x270
>  [<ffffffff811d3ad1>] flock_lock_file_wait+0x41/0xf0
>  [<ffffffff8168be66>] ? _raw_spin_unlock+0x26/0x40
>  [<ffffffff81268de9>] do_vfs_lock+0x19/0x40
>  [<ffffffff812695cc>] nfs4_locku_done+0x5c/0xf0
>  [<ffffffff8164f3f4>] rpc_exit_task+0x34/0xb0
>  [<ffffffff8164fcd9>] __rpc_execute+0x79/0x390
>  [<ffffffff81650000>] rpc_async_schedule+0x10/0x20
>  [<ffffffff81086095>] process_one_work+0x1a5/0x450
>  [<ffffffff81086024>] ? process_one_work+0x134/0x450
>  [<ffffffff8108638b>] worker_thread+0x4b/0x4a0
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff81086340>] ? process_one_work+0x450/0x450
>  [<ffffffff8108d777>] kthread+0xf7/0x110
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
>  [<ffffffff8168ce3e>] ret_from_fork+0x3e/0x70
>  [<ffffffff8108d680>] ? __kthread_parkme+0xa0/0xa0
> Code: 48 b8 00 00 00 00 00 00 00 80 55 48 89 e5 48 09 c1 ff d1 5d 85
> c0 0f 95 c0 0f b6 c0 eb b9 66 2e 0f 1f 84 00 00 00 00 00 83 fe 02
> <48>
> 8b 87 c8 01 00 00 0f 84 a0 00 00 00 48 85 c0 0f 85 97 00 00
> RIP  [<ffffffff811d0cf3>] locks_get_lock_context+0x3/0xc0
>  RSP <ffff881036007bb0>
> CR2: 00000000000001c8
> ---[ end trace 2da9686dda1b5574 ]---
> 
> 
> Thanks,


This should be fixed by this series of four commits that are already in
mainline:

   
bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
b3e388538d9

The basic problem is that when the struct file is being torn down,
file_inode(file) may already return NULL. So we need to be able to pass
in the inode directly from the reference held by the nfs_open_context.

Are those patches reasonable to pull in?
William Dauchy Oct. 19, 2015, 4:53 p.m. UTC | #13
Hi Jeff,

Thank you for you reply.

On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> This should be fixed by this series of four commits that are already in
> mainline:
> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
> b3e388538d9

Am I missing something, I see three of them between
bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6b3e388538d9
(and not four)

ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
posix_lock_file_wait and flock_lock_file_wait
83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock take
an inode pointer
29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
flock_lock_inode_wait and posix_lock_inode_wait

Thanks,
William Dauchy Oct. 19, 2015, 5:24 p.m. UTC | #14
On Mon, Oct 19, 2015 at 6:53 PM, William Dauchy <wdauchy@gmail.com> wrote:
> On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> This should be fixed by this series of four commits that are already in
>> mainline:
>> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6
>> b3e388538d9
>
> Am I missing something, I see three of them between
> bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b65c6b3e388538d9
> (and not four)
>
> ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
> posix_lock_file_wait and flock_lock_file_wait
> 83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock take
> an inode pointer
> 29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
> flock_lock_inode_wait and posix_lock_inode_wait

I suppose I found the missing one
bcd7f78 locks: have flock_lock_file take an inode pointer instead of a filp
Jeff Layton Oct. 19, 2015, 5:46 p.m. UTC | #15
On Mon, 2015-10-19 at 19:24 +0200, William Dauchy wrote:
> On Mon, Oct 19, 2015 at 6:53 PM, William Dauchy <wdauchy@gmail.com>
> wrote:
> > On Mon, Oct 19, 2015 at 6:44 PM, Jeff Layton <
> > jlayton@poochiereds.net> wrote:
> > > This should be fixed by this series of four commits that are
> > > already in
> > > mainline:
> > > bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b
> > > 5b65c6
> > > b3e388538d9
> > 
> > Am I missing something, I see three of them between
> > bcd7f78d078ff6197715c1ed070c92aca57ec12c..ee296d7c5709440f8abd36b5b
> > 65c6b3e388538d9
> > (and not four)
> > 
> > ee296d7c5709440f8abd36b5b65c6b3e388538d9 locks: inline
> > posix_lock_file_wait and flock_lock_file_wait
> > 83bfff23e9ed19f37c4ef0bba84e75bd88e5cf21 nfs4: have do_vfs_lock
> > take
> > an inode pointer
> > 29d01b22eaa18d8b46091d3c98c6001c49f78e4a locks: new helpers -
> > flock_lock_inode_wait and posix_lock_inode_wait
> 
> I suppose I found the missing one
> bcd7f78 locks: have flock_lock_file take an inode pointer instead of
> a filp
> 

Ahh yes, sorry -- that one is also required.

Thanks!
diff mbox

Patch

diff --git a/fs/locks.c b/fs/locks.c
index 2a54c80..316e474 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -205,28 +205,32 @@  static struct kmem_cache *filelock_cache __read_mostly;
 static struct file_lock_context *
 locks_get_lock_context(struct inode *inode, int type)
 {
-	struct file_lock_context *new;
+	struct file_lock_context *ctx;
 
-	if (likely(inode->i_flctx) || type == F_UNLCK)
+	/* paired with cmpxchg() below */
+	ctx = smp_load_acquire(&inode->i_flctx);
+	if (likely(ctx) || type == F_UNLCK)
 		goto out;
 
-	new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
-	if (!new)
+	ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+	if (!ctx)
 		goto out;
 
-	spin_lock_init(&new->flc_lock);
-	INIT_LIST_HEAD(&new->flc_flock);
-	INIT_LIST_HEAD(&new->flc_posix);
-	INIT_LIST_HEAD(&new->flc_lease);
+	spin_lock_init(&ctx->flc_lock);
+	INIT_LIST_HEAD(&ctx->flc_flock);
+	INIT_LIST_HEAD(&ctx->flc_posix);
+	INIT_LIST_HEAD(&ctx->flc_lease);
 
 	/*
 	 * Assign the pointer if it's not already assigned. If it is, then
 	 * free the context we just allocated.
 	 */
-	if (cmpxchg(&inode->i_flctx, NULL, new))
-		kmem_cache_free(flctx_cache, new);
+	if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+		kmem_cache_free(flctx_cache, ctx);
+		ctx = smp_load_acquire(&inode->i_flctx);
+	}
 out:
-	return inode->i_flctx;
+	return ctx;
 }
 
 void
@@ -762,7 +766,7 @@  posix_test_lock(struct file *filp, struct file_lock *fl)
 	struct file_lock_context *ctx;
 	struct inode *inode = file_inode(filp);
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
 		fl->fl_type = F_UNLCK;
 		return;
@@ -1203,7 +1207,7 @@  int locks_mandatory_locked(struct file *file)
 	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx || list_empty_careful(&ctx->flc_posix))
 		return 0;
 
@@ -1388,7 +1392,7 @@  any_leases_conflict(struct inode *inode, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *new_fl, *fl, *tmp;
 	unsigned long break_time;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1404,7 @@  int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	new_fl->fl_flags = type;
 
 	/* typically we will check that ctx is non-NULL before calling */
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		WARN_ON_ONCE(1);
 		return error;
@@ -1494,9 +1499,10 @@  EXPORT_SYMBOL(__break_lease);
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
 	bool has_lease = false;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	struct file_lock *fl;
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		if (!list_empty(&ctx->flc_lease)) {
@@ -1543,10 +1549,11 @@  int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	int type = F_UNLCK;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(file_inode(filp), &dispose);
@@ -1713,9 +1720,10 @@  static int generic_delete_lease(struct file *filp, void *owner)
 	struct file_lock *fl, *victim = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct file_lock_context *ctx = inode->i_flctx;
+	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
 
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx) {
 		trace_generic_delete_lease(inode, NULL);
 		return error;
@@ -2359,13 +2367,14 @@  out:
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
 	struct file_lock lock;
-	struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+	struct file_lock_context *ctx;
 
 	/*
 	 * If there are no locks held on this file, we don't need to call
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
+	ctx =  smp_load_acquire(&file_inode(filp)->i_flctx);
 	if (!ctx || list_empty(&ctx->flc_posix))
 		return;
 
@@ -2389,7 +2398,7 @@  EXPORT_SYMBOL(locks_remove_posix);
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 {
 	struct file_lock fl = {
 		.fl_owner = filp,
@@ -2400,7 +2409,6 @@  locks_remove_flock(struct file *filp)
 		.fl_end = OFFSET_MAX,
 	};
 	struct inode *inode = file_inode(filp);
-	struct file_lock_context *flctx = inode->i_flctx;
 
 	if (list_empty(&flctx->flc_flock))
 		return;
@@ -2416,10 +2424,8 @@  locks_remove_flock(struct file *filp)
 
 /* The i_flctx must be valid when calling into here */
 static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
-	struct file_lock_context *ctx = inode->i_flctx;
 	struct file_lock *fl, *tmp;
 	LIST_HEAD(dispose);
 
@@ -2439,17 +2445,20 @@  locks_remove_lease(struct file *filp)
  */
 void locks_remove_file(struct file *filp)
 {
-	if (!file_inode(filp)->i_flctx)
+	struct file_lock_context *ctx;
+
+	ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+	if (!ctx)
 		return;
 
 	/* remove any OFD locks */
 	locks_remove_posix(filp, filp);
 
 	/* remove flock locks */
-	locks_remove_flock(filp);
+	locks_remove_flock(filp, ctx);
 
 	/* remove any leases */
-	locks_remove_lease(filp);
+	locks_remove_lease(filp, ctx);
 }
 
 /**
@@ -2616,7 +2625,7 @@  void show_fd_locks(struct seq_file *f,
 	struct file_lock_context *ctx;
 	int id = 0;
 
-	ctx = inode->i_flctx;
+	ctx = smp_load_acquire(&inode->i_flctx);
 	if (!ctx)
 		return;