[RFC,v2,3/4] locks: Split insert/delete block functions into flock/posix parts

Message ID 1425306313-7234-4-git-send-email-daniel.wagner@bmw-carit.de (mailing list archive)
State New, archived

Commit Message

Daniel Wagner March 2, 2015, 2:25 p.m. UTC
The locks_insert/delete_block() functions are used for the flock, posix
and lease lock types. blocked_lock_lock is used to serialize all access
to fl_link, fl_block, fl_next and blocked_hash. Here, we set the stage
for using blocked_lock_lock to protect only blocked_hash.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

Comments

Jeff Layton March 3, 2015, 12:55 a.m. UTC | #1
On Mon,  2 Mar 2015 15:25:12 +0100
Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> The locks_insert/delete_block() functions are used for the flock, posix
> and lease lock types. blocked_lock_lock is used to serialize all access
> to fl_link, fl_block, fl_next and blocked_hash. Here, we set the stage
> for using blocked_lock_lock to protect only blocked_hash.
> 
> Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> Cc: Jeff Layton <jlayton@poochiereds.net>
> Cc: "J. Bruce Fields" <bfields@fieldses.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> ---
>  fs/locks.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 40 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/locks.c b/fs/locks.c
> index 4498da0..02821dd 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -611,11 +611,20 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
>   */
>  static void __locks_delete_block(struct file_lock *waiter)
>  {
> -	locks_delete_global_blocked(waiter);
>  	list_del_init(&waiter->fl_block);
>  	waiter->fl_next = NULL;
>  }
>  
> +/* Posix block variant of __locks_delete_block.
> + *
> + * Must be called with blocked_lock_lock held.
> + */
> +static void __locks_delete_posix_block(struct file_lock *waiter)
> +{
> +	locks_delete_global_blocked(waiter);
> +	__locks_delete_block(waiter);
> +}
> +
>  static void locks_delete_block(struct file_lock *waiter)
>  {
>  	spin_lock(&blocked_lock_lock);
> @@ -623,6 +632,13 @@ static void locks_delete_block(struct file_lock *waiter)
>  	spin_unlock(&blocked_lock_lock);
>  }
>  
> +static void locks_delete_posix_block(struct file_lock *waiter)
> +{
> +	spin_lock(&blocked_lock_lock);
> +	__locks_delete_posix_block(waiter);
> +	spin_unlock(&blocked_lock_lock);
> +}
> +
>  /* Insert waiter into blocker's block list.
>   * We use a circular list so that processes can be easily woken up in
>   * the order they blocked. The documentation doesn't require this but
> @@ -639,7 +655,17 @@ static void __locks_insert_block(struct file_lock *blocker,
>  	BUG_ON(!list_empty(&waiter->fl_block));
>  	waiter->fl_next = blocker;
>  	list_add_tail(&waiter->fl_block, &blocker->fl_block);
> -	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
> +}
> +
> +/* Posix block variant of __locks_insert_block.
> + *
> + * Must be called with flc_lock and blocked_lock_lock held.
> + */
> +static void __locks_insert_posix_block(struct file_lock *blocker,
> +					struct file_lock *waiter)
> +{
> +	__locks_insert_block(blocker, waiter);
> +	if (!IS_OFDLCK(blocker))
>  		locks_insert_global_blocked(waiter);
>  }
>

In many ways OFD locks act more like flock locks than POSIX ones. In
particular, there is no deadlock detection there, so once your
conversion is done to more widely use the percpu locks, then you should
be able to avoid taking the blocked_lock_lock for OFD locks as well.
The 4th patch in this series doesn't currently do that.

You may want to revisit this patch such that the IS_OFDLCK checks are
done earlier, so that you can avoid taking the blocked_lock_lock if
IS_POSIX and !IS_OFDLCK.
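
A rough sketch of that rework, reusing the helpers this patch introduces
(an illustration of the suggestion, not code from the posted series):

	static void locks_delete_posix_block(struct file_lock *waiter)
	{
		/* OFD locks are never put in blocked_hash (no deadlock
		 * detection), so they don't need blocked_lock_lock here.
		 * This assumes fl_block/fl_next are serialized by the
		 * per-file lock after the percpu conversion in patch 4. */
		if (IS_OFDLCK(waiter)) {
			__locks_delete_block(waiter);
			return;
		}

		spin_lock(&blocked_lock_lock);
		__locks_delete_posix_block(waiter);
		spin_unlock(&blocked_lock_lock);
	}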

> @@ -675,7 +701,10 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
>  
>  		waiter = list_first_entry(&blocker->fl_block,
>  				struct file_lock, fl_block);
> -		__locks_delete_block(waiter);
> +		if (IS_POSIX(blocker))

...again, you probably want to make this read:

		if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))

...so that later you can avoid taking the blocked_lock_lock if IS_OFDLCK(blocker).
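
Applied to the locks_wake_up_blocks() hunk, that would be something like:

		if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
			__locks_delete_posix_block(waiter);
		else
			__locks_delete_block(waiter);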


> +			__locks_delete_posix_block(waiter);
> +		else
> +			__locks_delete_block(waiter);
>  		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
>  			waiter->fl_lmops->lm_notify(waiter);
>  		else
> @@ -985,7 +1014,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
>  			spin_lock(&blocked_lock_lock);
>  			if (likely(!posix_locks_deadlock(request, fl))) {
>  				error = FILE_LOCK_DEFERRED;
> -				__locks_insert_block(fl, request);
> +				__locks_insert_posix_block(fl, request);
>  			}
>  			spin_unlock(&blocked_lock_lock);
>  			goto out;
> @@ -1186,7 +1215,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
>  		if (!error)
>  			continue;
>  
> -		locks_delete_block(fl);
> +		locks_delete_posix_block(fl);
>  		break;
>  	}
>  	return error;
> @@ -1283,7 +1312,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
>  				continue;
>  		}
>  
> -		locks_delete_block(&fl);
> +		locks_delete_posix_block(&fl);
>  		break;
>  	}
>  
> @@ -2103,7 +2132,10 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
>  		if (!error)
>  			continue;
>  
> -		locks_delete_block(fl);
> +		if (IS_POSIX(fl))
> +			locks_delete_posix_block(fl);
> +		else
> +			locks_delete_block(fl);
>  		break;
>  	}
>  
> @@ -2467,7 +2499,7 @@ posix_unblock_lock(struct file_lock *waiter)
>  
>  	spin_lock(&blocked_lock_lock);
>  	if (waiter->fl_next)
> -		__locks_delete_block(waiter);
> +		__locks_delete_posix_block(waiter);
>  	else
>  		status = -ENOENT;
>  	spin_unlock(&blocked_lock_lock);
Daniel Wagner March 4, 2015, 2:20 p.m. UTC | #2
On 03/03/2015 01:55 AM, Jeff Layton wrote:
> On Mon,  2 Mar 2015 15:25:12 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
> 
>> The locks_insert/delete_block() functions are used for the flock, posix
>> and lease lock types. blocked_lock_lock is used to serialize all access
>> to fl_link, fl_block, fl_next and blocked_hash. Here, we set the stage
>> for using blocked_lock_lock to protect only blocked_hash.
<>
> 
> In many ways OFD locks act more like flock locks than POSIX ones. In
> particular, there is no deadlock detection there, so once your
> conversion is done to more widely use the percpu locks, then you should
> be able to avoid taking the blocked_lock_lock for OFD locks as well.
> The 4th patch in this series doesn't currently do that.
> 
> You may want to revisit this patch such that the IS_OFDLCK checks are
> done earlier, so that you can avoid taking the blocked_lock_lock if
> IS_POSIX and !IS_OFDLCK.

Thanks for the explanation. I was not entirely sure what to do here
and forgot to ask.

I have fixed that stuff and now I am testing it. Though it seems
that there is a memory leak which can be triggered with 

	while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done

and this happens also without any of my patches. Still trying to
figure out what's happening. Hopefully I just see a ghost.

slabtop tells me that ftrace_event_field is constantly growing:

 Active / Total Objects (% used)    : 968819303 / 968828665 (100.0%)
 Active / Total Slabs (% used)      : 11404623 / 11404623 (100.0%)
 Active / Total Caches (% used)     : 72 / 99 (72.7%)
 Active / Total Size (% used)       : 45616199.68K / 45619608.73K (100.0%)
 Minimum / Average / Maximum Object : 0.01K / 0.05K / 16.00K

  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
967510630 967510630   2%    0.05K 11382478       85  45529912K ftrace_event_field
154368 154368 100%    0.03K   1206      128      4824K kmalloc-32
121856 121856 100%    0.01K    238      512       952K kmalloc-8
121227 121095  99%    0.08K   2377       51      9508K Acpi-State

This is on proper hardware. On a kvm guest, fasync_cache grows fast and finally the
guest runs out of memory. systemd tries hard to restart everything and fails constantly:

[  187.021758] systemd invoked oom-killer: gfp_mask=0x200da, order=0, oom_score_adj=0
[  187.022337] systemd cpuset=/ mems_allowed=0
[  187.022662] CPU: 3 PID: 1 Comm: systemd Not tainted 4.0.0-rc1+ #380
[  187.023117] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[  187.023801]  ffff88007c918000 ffff88007c9179c8 ffffffff81b4f9be ffffffff8116a9cc
[  187.024373]  0000000000000000 ffff88007c917a88 ffffffff8116a9d1 000000007c917a58
[  187.024940]  ffffffff8224bc98 ffff88007c917a28 0000000000000092 ffffffff81c1b780
[  187.025515] Call Trace:
[  187.025698]  [<ffffffff81b4f9be>] dump_stack+0x4c/0x65
[  187.026083]  [<ffffffff8116a9cc>] ? dump_header.isra.13+0x7c/0x450
[  187.026525]  [<ffffffff8116a9d1>] dump_header.isra.13+0x81/0x450
[  187.026958]  [<ffffffff810a45c6>] ? trace_hardirqs_on_caller+0x16/0x240
[  187.027437]  [<ffffffff810a47fd>] ? trace_hardirqs_on+0xd/0x10
[  187.027859]  [<ffffffff814fe5c4>] ? ___ratelimit+0x84/0x110
[  187.028264]  [<ffffffff8116b378>] oom_kill_process+0x1e8/0x4c0
[  187.028683]  [<ffffffff8105fda5>] ? has_ns_capability_noaudit+0x5/0x170
[  187.029167]  [<ffffffff8116baf4>] __out_of_memory+0x4a4/0x510
[  187.029579]  [<ffffffff8116bd2b>] out_of_memory+0x5b/0x80
[  187.029970]  [<ffffffff81170f2e>] __alloc_pages_nodemask+0xa0e/0xb60
[  187.030434]  [<ffffffff811ad863>] read_swap_cache_async+0xe3/0x180
[  187.030881]  [<ffffffff811ad9ed>] swapin_readahead+0xed/0x190
[  187.031300]  [<ffffffff8119bcae>] handle_mm_fault+0xbbe/0x1180
[  187.031719]  [<ffffffff81046bed>] __do_page_fault+0x1ed/0x4c0
[  187.032138]  [<ffffffff81046ecc>] do_page_fault+0xc/0x10
[  187.032520]  [<ffffffff81b5ddc2>] page_fault+0x22/0x30
[  187.032889] Mem-Info:
[  187.033066] DMA per-cpu:
[  187.033254] CPU    0: hi:    0, btch:   1 usd:   0
[  187.033596] CPU    1: hi:    0, btch:   1 usd:   0
[  187.033941] CPU    2: hi:    0, btch:   1 usd:   0
[  187.034292] CPU    3: hi:    0, btch:   1 usd:   0
[  187.034637] DMA32 per-cpu:
[  187.034837] CPU    0: hi:  186, btch:  31 usd:  51
[  187.035185] CPU    1: hi:  186, btch:  31 usd:   0
[  187.035529] CPU    2: hi:  186, btch:  31 usd:   0
[  187.035873] CPU    3: hi:  186, btch:  31 usd:  32
[  187.036221] active_anon:5 inactive_anon:0 isolated_anon:0
[  187.036221]  active_file:238 inactive_file:194 isolated_file:0
[  187.036221]  unevictable:0 dirty:0 writeback:8 unstable:0
[  187.036221]  free:3361 slab_reclaimable:4651 slab_unreclaimable:493909
[  187.036221]  mapped:347 shmem:0 pagetables:400 bounce:0
[  187.036221]  free_cma:0
[  187.038385] DMA free:7848kB min:44kB low:52kB high:64kB active_anon:4kB inactive_anon:12kB active_file:0kB inactive_file:4kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15992kB managed:15908kB mlocked:0kB dirty:0kB writeback:8kB mapped:4kB shmem:0kB slab_reclaimable:12kB slab_unreclaimable:7880kB kernel_stack:32kB pagetables:36kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:132 all_unreclaimable? yes
[  187.041138] lowmem_reserve[]: 0 1952 1952 1952
[  187.041510] DMA32 free:5596kB min:5628kB low:7032kB high:8440kB active_anon:16kB inactive_anon:0kB active_file:952kB inactive_file:772kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2080640kB managed:2004912kB mlocked:0kB dirty:0kB writeback:24kB mapped:1384kB shmem:0kB slab_reclaimable:18592kB slab_unreclaimable:1967756kB kernel_stack:1968kB pagetables:1564kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:12716 all_unreclaimable? yes
[  187.044442] lowmem_reserve[]: 0 0 0 0
[  187.044756] DMA: 4*4kB (UEM) 2*8kB (UM) 5*16kB (UEM) 2*32kB (UE) 2*64kB (EM) 3*128kB (UEM) 2*256kB (EM) 3*512kB (UEM) 3*1024kB (UEM) 1*2048kB (R) 0*4096kB = 7856kB
[  187.046022] DMA32: 190*4kB (UER) 6*8kB (R) 1*16kB (R) 1*32kB (R) 0*64kB 0*128kB 1*256kB (R) 1*512kB (R) 0*1024kB 0*2048kB 1*4096kB (R) = 5720kB
[  187.047128] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  187.047724] 554 total pagecache pages
[  187.047991] 60 pages in swap cache
[  187.048259] Swap cache stats: add 102769, delete 102709, find 75688/136761
[  187.048748] Free swap  = 1041456kB
[  187.048995] Total swap = 1048572kB
[  187.049250] 524158 pages RAM
[  187.049463] 0 pages HighMem/MovableOnly
[  187.049739] 18953 pages reserved
[  187.049974] [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
[  187.050587] [ 1293]     0  1293    10283        1      23       2      131         -1000 systemd-udevd
[  187.051253] [ 1660]     0  1660    12793       57      24       2      134         -1000 auditd
[  187.051872] [ 1681]    81  1681     6637        1      18       2      124          -900 dbus-daemon
[  187.052529] [ 1725]     0  1725    20707        0      42       3      216         -1000 sshd
[  187.053146] [ 2344]     0  2344     3257        0      11       2       49             0 systemd-cgroups
[  187.053820] [ 2345]     0  2345     3257        0      11       2       55             0 systemd-cgroups
[  187.054497] [ 2350]     0  2350     3257        0      11       2       35             0 systemd-cgroups
[  187.055175] [ 2352]     0  2352     3257        0      12       2       37             0 systemd-cgroups
[  187.055846] [ 2354]     0  2354     3257        0      11       2       43             0 systemd-cgroups
[  187.056530] [ 2355]     0  2355     3257        0      11       2       40             0 systemd-cgroups
[  187.057212] [ 2356]     0  2356     3257        0      11       2       44             0 systemd-cgroups
[  187.057886] [ 2362]     0  2362     3257        0      11       3       33             0 systemd-cgroups
[  187.058564] [ 2371]     0  2371     3257        0      11       2       33             0 systemd-cgroups
[  187.059244] [ 2372]     0  2372     3257        0      10       2       44             0 systemd-cgroups
[  187.059917] [ 2373]     0  2373     3257        0      11       2       39             0 systemd-cgroups
[  187.060600] [ 2376]     0  2376     3257        0      11       2       34             0 systemd-cgroups
[  187.061280] [ 2377]     0  2377     3257        0      10       2       43             0 systemd-cgroups
[  187.061942] [ 2378]     0  2378     3257        0      12       3       34             0 systemd-cgroups
[  187.062598] [ 2379]     0  2379    27502        0      10       3       33             0 agetty
[  187.063200] [ 2385]     0  2385     3257        0      12       2       44             0 systemd-cgroups
[  187.063859] [ 2390]     0  2390     3257        0      11       2       43             0 systemd-cgroups
[  187.064520] [ 2394]     0  2394     3257        0      11       2       41             0 systemd-cgroups
[  187.065182] [ 2397]     0  2397     3257        0      11       2       43             0 systemd-cgroups
[  187.065833] [ 2402]     0  2402     3257        0      11       2       42             0 systemd-cgroups
[  187.066490] [ 2403]     0  2403     3257        0      11       2       44             0 systemd-cgroups
[  187.067148] [ 2404]     0  2404    27502        0      13       3       30             0 agetty
[  187.067743] [ 2410]     0  2410     3257        0      11       2       43             0 systemd-cgroups
[  187.068407] [ 2413]     0  2413     3257        0      11       2       36             0 systemd-cgroups
[  187.069072] [ 2416]     0  2416     3257        0      11       2       49             0 systemd-cgroups
[  187.069720] [ 2417]     0  2417    11861      173      26       2      334             0 (journald)
[  187.070368] Out of memory: Kill process 2417 ((journald)) score 0 or sacrifice child
[  187.070943] Killed process 2417 ((journald)) total-vm:47444kB, anon-rss:0kB, file-rss:692kB
[  187.513857] systemd[1]: Unit systemd-logind.service entered failed state.
[  188.262477] systemd[1]: Unit systemd-journald.service entered failed state.
[  188.315222] systemd[1]: systemd-logind.service holdoff time over, scheduling restart.
[  188.334194] systemd[1]: Stopping Login Service...
[  188.341556] systemd[1]: Starting Login Service...
[  188.408787] systemd[1]: systemd-journald.service holdoff time over, scheduling restart.
[  189.284506] systemd[1]: Stopping Journal Service...
[  189.330806] systemd[1]: Starting Journal Service...
[  189.384800] systemd[1]: Started Journal Service.


cheers,
daniel

Boaz Harrosh March 4, 2015, 3 p.m. UTC | #3
On 03/04/2015 04:20 PM, Daniel Wagner wrote:
> On 03/03/2015 01:55 AM, Jeff Layton wrote:
>> On Mon,  2 Mar 2015 15:25:12 +0100
>> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>>
<>
> I have fixed that stuff and now I am testing it. Though it seems
> that there is a memory leak which can be triggered with 
> 
> 	while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done
> 
> and this happens also without any of my patches. Still trying to
> figure out what's happening. Hopefully I just see a ghost.
> 
> slabtop tells me that ftrace_event_field is constantly growing:
> 

check out the kernel's leak detector; it is perfect for showing you
the exact call stack of the leaked memory.

(Tell me if you need the exact Kconfig key to enable it; it should not
 be hard to find.)
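
The detector in question is kmemleak; a minimal usage sketch, assuming
a kernel built with CONFIG_DEBUG_KMEMLEAK=y and debugfs mounted:

	# trigger an on-demand scan, then dump any suspected leaks
	echo scan > /sys/kernel/debug/kmemleak
	cat /sys/kernel/debug/kmemleak
	# forget the current suspects before the next test run
	echo clear > /sys/kernel/debug/kmemleak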

Cheers
Boaz

Daniel Wagner March 4, 2015, 3:32 p.m. UTC | #4
On 03/04/2015 04:00 PM, Boaz Harrosh wrote:
> On 03/04/2015 04:20 PM, Daniel Wagner wrote:
>> On 03/03/2015 01:55 AM, Jeff Layton wrote:
>>> On Mon,  2 Mar 2015 15:25:12 +0100
>>> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>>>
> <>
>> I have fixed that stuff and now I am testing it. Though it seems
>> that there is a memory leak which can be triggered with 
>>
>> 	while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done
>>
>> and this happens also without any of my patches. Still trying to
>> figure out what's happening. Hopefully I just see a ghost.
>>
>> slabtop tells me that ftrace_event_field is constantly growing:
>>
> 
> check out the kernel's leak detector; it is perfect for showing you
> the exact call stack of the leaked memory.

Thanks for the tip. Will use it in future :)

I have done a quick bisect, limiting the search to fs/locks.c.
I suspect that the file_lock_context refactoring is the source of the leak,
and bisect agrees with me:


8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
commit 8634b51f6ca298fb8b07aa4847340764903533ab
Author: Jeff Layton <jlayton@primarydata.com>
Date:   Fri Jan 16 15:05:55 2015 -0500

    locks: convert lease handling to file_lock_context
    
    Signed-off-by: Jeff Layton <jlayton@primarydata.com>
    Acked-by: Christoph Hellwig <hch@lst.de>

:040000 040000 4114db9392dc4dadb30664b71a954321e5e87bab 5b9abbaf1808a7c926c09fa2164044e0cc26fd54 M      fs
:040000 040000 bd569f527a195edf673c4f7d0e80bf356c7f8d1b 6362646e04dd83efc1a9e92877900797ac879e9a M      include
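
A path-scoped bisect like this can be driven with something along these
lines (a sketch; the good/bad endpoints and the helper script are
assumptions):

	# only visit commits that touch fs/locks.c
	git bisect start v4.0-rc1 v3.19 -- fs/locks.c
	# hypothetical script: run the lease02 loop and exit non-zero
	# when the ftrace_event_field/fasync_cache slabs keep growing
	git bisect run ./check-fasync-leak.sh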

Jeff Layton March 4, 2015, 5:59 p.m. UTC | #5
On Wed, 4 Mar 2015 16:32:57 +0100
Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> On 03/04/2015 04:00 PM, Boaz Harrosh wrote:
<>
> > 
> > check out the kernel's leak detector; it is perfect for showing you
> > the exact call stack of the leaked memory.
> 
> Thanks for the tip. Will use it in future :)
> 
> I have done a quick bisect, limiting the search to fs/locks.c.
> I suspect that the file_lock_context refactoring is the source of the leak,
> and bisect agrees with me:
> 
> 
> 8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
> commit 8634b51f6ca298fb8b07aa4847340764903533ab
> Author: Jeff Layton <jlayton@primarydata.com>
> Date:   Fri Jan 16 15:05:55 2015 -0500
> 
>     locks: convert lease handling to file_lock_context
>     
>     Signed-off-by: Jeff Layton <jlayton@primarydata.com>
>     Acked-by: Christoph Hellwig <hch@lst.de>
> 
> :040000 040000 4114db9392dc4dadb30664b71a954321e5e87bab 5b9abbaf1808a7c926c09fa2164044e0cc26fd54 M      fs
> :040000 040000 bd569f527a195edf673c4f7d0e80bf356c7f8d1b 6362646e04dd83efc1a9e92877900797ac879e9a M      include
> 

Thanks. I'll take a look.
Jeff Layton March 4, 2015, 7:16 p.m. UTC | #6
On Wed, 4 Mar 2015 12:59:23 -0500
Jeff Layton <jlayton@poochiereds.net> wrote:

> On Wed, 4 Mar 2015 16:32:57 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
> 
> > On 03/04/2015 04:00 PM, Boaz Harrosh wrote:
<>
> > > 
> > > check out the kernel's leak detector; it is perfect for showing you
> > > the exact call stack of the leaked memory.
> > 
> > Thanks for the tip. Will use it in future :)
> > 
> > I have done a quick bisect, limiting the search to fs/locks.c.
> > I suspect that the file_lock_context refactoring is the source of the leak,
> > and bisect agrees with me:
> > 
> > 
> > 8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
> > commit 8634b51f6ca298fb8b07aa4847340764903533ab
> > Author: Jeff Layton <jlayton@primarydata.com>
> > Date:   Fri Jan 16 15:05:55 2015 -0500
> > 
> >     locks: convert lease handling to file_lock_context
> >     
> >     Signed-off-by: Jeff Layton <jlayton@primarydata.com>
> >     Acked-by: Christoph Hellwig <hch@lst.de>
> > 
> > :040000 040000 4114db9392dc4dadb30664b71a954321e5e87bab 5b9abbaf1808a7c926c09fa2164044e0cc26fd54 M      fs
> > :040000 040000 bd569f527a195edf673c4f7d0e80bf356c7f8d1b 6362646e04dd83efc1a9e92877900797ac879e9a M      include
> > 
> 
> Thanks. I'll take a look.
> 

Huh. I was a bit surprised by this as I didn't really touch how the
fasync entries get handled. I added a bit of printk debugging
(primitive, I know...) and I see this:

[  458.715319] lease_modify: calling fasync_helper on ffff880035a942d0

So, fasync_helper is getting called on the fasync entry, but it's
definitely not getting freed. When I look at the object in the
debugger, it looks like call_rcu has been called on it though:

  fa_file = 0x0, 
  fa_rcu = {
    next = 0xffff8800ccd6a8a0, 
    func = 0xffffffff8122b1c0 <fasync_free_rcu>
  }

...it's almost like the RCU grace period isn't ending properly? I'll
keep poking at it, though, to see if I can figure out what's going wrong.
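
For reference, the RCU-deferred free path in question (abridged from
fs/fcntl.c of that era):

	static void fasync_free_rcu(struct rcu_head *head)
	{
		kmem_cache_free(fasync_cache,
				container_of(head, struct fasync_struct, fa_rcu));
	}

	static void fasync_free(struct fasync_struct *fa)
	{
		call_rcu(&fa->fa_rcu, fasync_free_rcu);
	}

...so once call_rcu() has been queued, as the debugger output above
suggests, the entry should be freed at the end of the next grace period.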
Jeff Layton March 4, 2015, 9:01 p.m. UTC | #7
On Wed, 4 Mar 2015 15:20:33 +0100
Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> On 03/03/2015 01:55 AM, Jeff Layton wrote:
<>
> > 
> > You may want to revisit this patch such that the IS_OFDLCK checks are
> > done earlier, so that you can avoid taking the blocked_lock_lock if
> > IS_POSIX and !IS_OFDLCK.
> 
> Thanks for the explanation. I was not entirely sure what to do here
> and forgot to ask.
> 
> I have fixed that stuff and now I am testing it. Though it seems
> that there is a memory leak which can be triggered with 
> 
> 	while true; rm -rf /tmp/a; ./lease02 /tmp/a; done
> 
> and this happens also without any of my patches. Still trying to
> figure out what's happening. Hopefully I just see a ghost.
> 
> slabtop tells me that ftrace_event_field is constantly growing:
> 
>  Active / Total Objects (% used)    : 968819303 / 968828665 (100.0%)
>  Active / Total Slabs (% used)      : 11404623 / 11404623 (100.0%)
>  Active / Total Caches (% used)     : 72 / 99 (72.7%)
>  Active / Total Size (% used)       : 45616199.68K / 45619608.73K (100.0%)
>  Minimum / Average / Maximum Object : 0.01K / 0.05K / 16.00K
> 
>   OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
> 967510630 967510630   2%    0.05K 11382478       85  45529912K ftrace_event_field
> 154368 154368 100%    0.03K   1206      128      4824K kmalloc-32
> 121856 121856 100%    0.01K    238      512       952K kmalloc-8
> 121227 121095  99%    0.08K   2377       51      9508K Acpi-State
> 
> This is on proper hardware. On a kvm guest, fasync_cache grows fast and finally the
> guest runs out of memory. systemd tries hard to restart everything and fails constantly:
> 
<>
> 
> 
> cheers,
> daniel
> 

I pulled down the most recent Fedora rawhide kernel today:

    4.0.0-0.rc2.git0.1.fc23.x86_64

...and with that, I can't reproduce this. The ftrace_event_field slab
(which is shared by the fasync_struct cache) seems to stay under
control. I see it hover around 3-4M in size while the test is running
but the box isn't falling over or anything.

Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
whether you're still able to reproduce it with the most recent mainline
kernels?
Jeff Layton March 4, 2015, 9:12 p.m. UTC | #8
On Wed, 4 Mar 2015 16:01:36 -0500
Jeff Layton <jlayton@poochiereds.net> wrote:

> On Wed, 4 Mar 2015 15:20:33 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
> 
<>
> 
> I pulled down the most recent Fedora rawhide kernel today:
> 
>     4.0.0-0.rc2.git0.1.fc23.x86_64
> 
> ...and with that, I can't reproduce this. The ftrace_event_field slab
> (which is shared by the fasync_struct cache) seems to stay under
> control. I see it hover around 3-4M in size while the test is running
> but the box isn't falling over or anything.
> 
> Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
> whether you're still able to reproduce it with the most recent mainline
> kernels?
> 

Oh! I take it back. I was testing with your patched lease02 test. With
the unpatched one I can definitely still reproduce it. That suggests
that this problem occurs when we go to clean up leases as the file is
closed. I'll have a close look at that code.
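
For reference, the close-time teardown being referred to goes roughly
like this (reconstructed from the 4.0-rc fs/locks.c; names and
signatures are approximate):

	/* __fput() -> locks_remove_file() -> locks_remove_lease() */
	static void locks_remove_lease(struct file *filp)
	{
		struct file_lock_context *ctx = file_inode(filp)->i_flctx;
		struct file_lock *fl, *tmp;
		LIST_HEAD(dispose);

		if (!ctx)
			return;

		spin_lock(&ctx->flc_lock);
		list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
			lease_modify(fl, F_UNLCK, &dispose);
		spin_unlock(&ctx->flc_lock);
		locks_dispose_list(&dispose);
	}

...where lease_modify() is what ends up calling fasync_helper() on the
entry, per the printk above.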
Daniel Wagner March 4, 2015, 9:13 p.m. UTC | #9
On 03/04/2015 10:01 PM, Jeff Layton wrote:
> I pulled down the most recent Fedora rawhide kernel today:
>
>      4.0.0-0.rc2.git0.1.fc23.x86_64
>
> ...and with that, I can't reproduce this. The ftrace_event_field slab
> (which is shared by the fasync_struct cache) seems to stay under
> control. I see it hover around 3-4M in size while the test is running
> but the box isn't falling over or anything.
>
> Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
> whether you're still able to reproduce it with the most recent mainline
> kernels?

Sure, I'll give it a spin tomorrow.



Patch

diff --git a/fs/locks.c b/fs/locks.c
index 4498da0..02821dd 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -611,11 +611,20 @@  static void locks_delete_global_blocked(struct file_lock *waiter)
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
-	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
 	waiter->fl_next = NULL;
 }
 
+/* Posix block variant of __locks_delete_block.
+ *
+ * Must be called with blocked_lock_lock held.
+ */
+static void __locks_delete_posix_block(struct file_lock *waiter)
+{
+	locks_delete_global_blocked(waiter);
+	__locks_delete_block(waiter);
+}
+
 static void locks_delete_block(struct file_lock *waiter)
 {
 	spin_lock(&blocked_lock_lock);
@@ -623,6 +632,13 @@  static void locks_delete_block(struct file_lock *waiter)
 	spin_unlock(&blocked_lock_lock);
 }
 
+static void locks_delete_posix_block(struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_delete_posix_block(waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
@@ -639,7 +655,17 @@  static void __locks_insert_block(struct file_lock *blocker,
 	BUG_ON(!list_empty(&waiter->fl_block));
 	waiter->fl_next = blocker;
 	list_add_tail(&waiter->fl_block, &blocker->fl_block);
-	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+}
+
+/* Posix block variant of __locks_insert_block.
+ *
+ * Must be called with flc_lock and blocked_lock_lock held.
+ */
+static void __locks_insert_posix_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	__locks_insert_block(blocker, waiter);
+	if (!IS_OFDLCK(blocker))
 		locks_insert_global_blocked(waiter);
 }
 
@@ -675,7 +701,10 @@  static void locks_wake_up_blocks(struct file_lock *blocker)
 
 		waiter = list_first_entry(&blocker->fl_block,
 				struct file_lock, fl_block);
-		__locks_delete_block(waiter);
+		if (IS_POSIX(blocker))
+			__locks_delete_posix_block(waiter);
+		else
+			__locks_delete_block(waiter);
 		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
 			waiter->fl_lmops->lm_notify(waiter);
 		else
@@ -985,7 +1014,7 @@  static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			spin_lock(&blocked_lock_lock);
 			if (likely(!posix_locks_deadlock(request, fl))) {
 				error = FILE_LOCK_DEFERRED;
-				__locks_insert_block(fl, request);
+				__locks_insert_posix_block(fl, request);
 			}
 			spin_unlock(&blocked_lock_lock);
 			goto out;
@@ -1186,7 +1215,7 @@  int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 		if (!error)
 			continue;
 
-		locks_delete_block(fl);
+		locks_delete_posix_block(fl);
 		break;
 	}
 	return error;
@@ -1283,7 +1312,7 @@  int locks_mandatory_area(int read_write, struct inode *inode,
 				continue;
 		}
 
-		locks_delete_block(&fl);
+		locks_delete_posix_block(&fl);
 		break;
 	}
 
@@ -2103,7 +2132,10 @@  static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 		if (!error)
 			continue;
 
-		locks_delete_block(fl);
+		if (IS_POSIX(fl))
+			locks_delete_posix_block(fl);
+		else
+			locks_delete_block(fl);
 		break;
 	}
 
@@ -2467,7 +2499,7 @@  posix_unblock_lock(struct file_lock *waiter)
 
 	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
-		__locks_delete_block(waiter);
+		__locks_delete_posix_block(waiter);
 	else
 		status = -ENOENT;
 	spin_unlock(&blocked_lock_lock);