diff mbox

[bug,report,v4.8] fs/locks.c: kernel oops during posix lock stress test

Message ID CACVXFVPsHjh3CWjdUrKB_r6=hkXK=qS3wpykbacdKe1rzz1H8Q@mail.gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ming Lei Nov. 28, 2016, 3:10 a.m. UTC
Hi Guys,

When I run stress-ng via the following steps on an ARM64 dual-socket
system (Cavium Thunder), the kernel oops[1] can often be
triggered after running the stress test for several hours (sometimes
it may take longer):

- git clone git://kernel.ubuntu.com/cking/stress-ng.git
- apply the attached patch, which just makes the POSIX file
lock stress test more aggressive
- run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'


From the oops log, it looks like a garbage file_lock node is fetched
from the 'ctx->flc_posix' linked list when the issue happens.

BTW, the issue hasn't been observed on a single-socket Cavium Thunder yet,
and the same issue can be seen on Ubuntu Xenial (v4.4-based kernel)
too.

Thanks,
Ming

[1] kernel oops log
ubuntu@ubuntu:~/git/stress-ng$ ./stress-ng --lockf 128 --aggressive
stress-ng: info:  [63828] defaulting to a 86400 second run per stressor
stress-ng: info:  [63828] dispatching hogs: 128 lockf
stress-ng: info:  [63828] cache allocate: default cache size: 16384K
[80659.799092] Unable to handle kernel NULL pointer dereference at
virtual address 00000030
[80659.807219] pgd = ffff81001f365800
[80659.810683] [00000030] *pgd=000001001a290003,
*pud=000001001a290003, *pmd=0000010fa07f0003, *pte=0000000000000000
[80659.821029] Internal error: Oops: 96000007 [#1] SMP
[80659.825901] Modules linked in:
[80659.828962] CPU: 15 PID: 63848 Comm: stress-ng-lockf Tainted: G
   W       4.8.0 #167
[80659.837132] Hardware name: Cavium ThunderX CRB/To be filled by
O.E.M., BIOS 5.11 12/12/2012
[80659.845479] task: ffff81001ee78580 task.stack: ffff81001f798000
[80659.851402] PC is at posix_locks_conflict+0x94/0xc0
[80659.856282] LR is at posix_lock_inode+0x90/0x6b0
[80659.860896] pc : [<ffff00000828c694>] lr : [<ffff00000828cd90>]
pstate: a0000145
[80659.868285] sp : ffff81001f79bca0
[80659.871596] x29: ffff81001f79bca0 x28: ffff81001f798000
[80659.876915] x27: ffff800fdffbc160 x26: 0000000000000000
[80659.882234] x25: ffff800fd2da2b30 x24: ffff800fce927430
[80659.887551] x23: ffff800fce92d8f0 x22: ffff81001f79bd30
[80659.892869] x21: ffff800fd2da2b18 x20: fffffffffffffff8
[80659.898187] x19: ffff800fdffbc160 x18: 0000000000001140
[80659.903504] x17: 0000ffff8870a578 x16: ffff000008245768
[80659.908821] x15: 0000ffff888bc000 x14: 0000000000000000
[80659.914139] x13: 00000003e8000000 x12: 0000000000000018
[80659.919457] x11: 00000000000e6a17 x10: 00000000ffffffd0
[80659.924776] x9 : 0000000000000000 x8 : ffff800fce927500
[80659.930094] x7 : 0000000000000000 x6 : 000000000000007f
[80659.935413] x5 : 0000000000000080 x4 : ffff800fce927438
[80659.940729] x3 : ffff800fce927458 x2 : 00000000000026b9
[80659.946047] x1 : ffff81001f37f300 x0 : 0000000000000000
[80659.951363]
[80659.952851] Process stress-ng-lockf (pid: 63848, stack limit =
0xffff81001f798020)
[80659.960415] Stack: (0xffff81001f79bca0 to 0xffff81001f79c000)
[80659.966158] bca0: ffff81001f79bcc0 ffff00000828cd90
fffffffffffffff8 ffff800fa3a66568
[80659.973986] bcc0: ffff81001f79bd40 ffff00000828d5f0
ffff800f8185c700 ffff800fdffbc160
[80659.981812] bce0: 0000000000000006 0000000000000000
ffff81001f79bdd0 0000000000000006
[80659.989638] bd00: 0000000000000120 0000000000000019
ffff0000088b1000 ffff81001f798000
[80659.997465] bd20: ffff81001f79bd40 ffff000008403fec
ffff81001f79bd30 ffff81001f79bd30
[80660.005292] bd40: ffff81001f79bd70 ffff00000828d8bc
ffff800f8185c700 ffff800fdffbc160
[80660.013118] bd60: ffff800fdffbc1b8 ffff800f8185c700
ffff81001f79bde0 ffff00000828ef10
[80660.020944] bd80: ffff800f8185c700 0000000000000000
ffff800fdffbc160 ffff800fa3a66568
[80660.028770] bda0: 0000000000000006 0000000000000004
ffff81001f79bde0 ffff00000828ee14
[80660.036596] bdc0: ffff800f8185c700 00000000fffffff2
ffff800fdffbc160 ffff810ff99aae80
[80660.044423] bde0: ffff81001f79be70 ffff000008245b84
ffff800f8185c700 ffff800f8185c700
[80660.052249] be00: 0000000000000000 0000000000000006
0000ffffdad5d4b0 0000000000000004
[80660.060087] be20: 0000000000000120 000000000000003e
0000000000010001 0000000000000000
[80660.067916] be40: 0000000000000008 0000000000000000
0000000000010001 0000000000000000
[80660.075742] be60: 0000000000000008 0000000000000000
0000000000000000 ffff0000080836f0
[80660.083568] be80: 0000000000000000 00000000005c5000
ffffffffffffffff 0000ffff8870a3b8
[80660.091394] bea0: 0000000080000000 0000000000000015
0000000080000000 00000000005c5000
[80660.099220] bec0: 0000000000000004 0000000000000006
0000ffffdad5d4b0 00000000ffffff80
[80660.107046] bee0: 0000ffffdad5d490 0000000026c26373
000000000000176f 0000000000004650
[80660.114873] bf00: 0000000000000019 0000000000006536
00000000ffffffd0 00000000000e6a17
[80660.122698] bf20: 0000000000000018 00000003e8000000
0000000000000000 0000ffff888bc000
[80660.130524] bf40: 000000000048a170 0000ffff8870a578
0000000000001140 000000000000055f
[80660.138351] bf60: 00000000005c5000 0000000000000004
0000ffff879f9008 0000000000000000
[80660.146177] bf80: 0000000000000002 000000000048b530
2001000800400201 0000ffffdad60758
[80660.154004] bfa0: 000000000048b008 0000ffffdad5d390
0000ffff8870a518 0000ffffdad5d390
[80660.161830] bfc0: 0000ffff8870a3b8 0000000080000000
0000000000000004 0000000000000019
[80660.169656] bfe0: 0000000000000000 0000000000000000
0000000000000000 0000000000000000
[80660.177481] Call trace:
[80660.179928] Exception stack(0xffff81001f79bad0 to 0xffff81001f79bc00)
[80660.186365] bac0:
ffff800fdffbc160 0001000000000000
[80660.194192] bae0: ffff81001f79bca0 ffff00000828c694
ffff800fc0002c00 ffff81001ee78600
[80660.202017] bb00: ffff81001f79bb70 ffff00000820b57c
ffff800fcb2a6d88 ffff800fc0002c00
[80660.209843] bb20: 0000000000000001 ffff810008ddbf00
ffff81001f79bc30 ffff81001f79bc30
[80660.217670] bb40: 0000000000000000 ffff810fa0712be8
ffff800f81dfd680 ffff810fa0712be8
[80660.225496] bb60: 0000000000000001 ffff810008ddbf00
0000000000000000 ffff81001f37f300
[80660.233322] bb80: 00000000000026b9 ffff800fce927458
ffff800fce927438 0000000000000080
[80660.241148] bba0: 000000000000007f 0000000000000000
ffff800fce927500 0000000000000000
[80660.248974] bbc0: 00000000ffffffd0 00000000000e6a17
0000000000000018 00000003e8000000
[80660.256800] bbe0: 0000000000000000 0000ffff888bc000
ffff000008245768 0000ffff8870a578
[80660.264636] [<ffff00000828c694>] posix_locks_conflict+0x94/0xc0
[80660.270559] [<ffff00000828cd90>] posix_lock_inode+0x90/0x6b0
[80660.276220] [<ffff00000828d5f0>] vfs_lock_file+0x68/0x78
[80660.281537] [<ffff00000828d8bc>] do_lock_file_wait+0x54/0xe0
[80660.287199] [<ffff00000828ef10>] fcntl_setlk+0x1c0/0x308
[80660.292513] [<ffff000008245b84>] SyS_fcntl+0x41c/0x5b8
[80660.297653] [<ffff0000080836f0>] el0_svc_naked+0x24/0x28
[80660.302961] Code: a8c27bfd d65f03c0 d503201f f9401e61 (f9401e80)
[80660.309188] ---[ end trace aa50050684d3a3fe ]---

Comments

Will Deacon Nov. 28, 2016, 10:52 a.m. UTC | #1
Hi Ming,

On Mon, Nov 28, 2016 at 11:10:14AM +0800, Ming Lei wrote:
> When I run stress-ng via the following steps on one ARM64 dual
> socket system(Cavium Thunder), the kernel oops[1] can often be
> triggered after running the stress test for several hours(sometimes
> it may take longer):
> 
> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
> - apply the attachment patch which just makes the posix file
> lock stress test more aggressive
> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
> 
> 
> From the oops log, looks one garbage file_lock node is got
> from the linked list of 'ctx->flc_posix' when the issue happens.
> 
> BTW, the issue isn't observed on single socket Cavium Thunder yet,
> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
> too.

I've seen issues with the LSE atomics on the Thunder platform -- can you
try disabling those (CONFIG_ARM64_LSE_ATOMICS) and see if the problem
persists, please?

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Nov. 28, 2016, 12:39 p.m. UTC | #2
Hi Will,

On Mon, Nov 28, 2016 at 6:52 PM, Will Deacon <will.deacon@arm.com> wrote:
> Hi Ming,
>
> On Mon, Nov 28, 2016 at 11:10:14AM +0800, Ming Lei wrote:
>> When I run stress-ng via the following steps on one ARM64 dual
>> socket system(Cavium Thunder), the kernel oops[1] can often be
>> triggered after running the stress test for several hours(sometimes
>> it may take longer):
>>
>> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
>> - apply the attachment patch which just makes the posix file
>> lock stress test more aggressive
>> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
>>
>>
>> From the oops log, looks one garbage file_lock node is got
>> from the linked list of 'ctx->flc_posix' when the issue happens.
>>
>> BTW, the issue isn't observed on single socket Cavium Thunder yet,
>> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
>> too.
>
> I've seen issues with the LSE atomics on the Thunder platform -- can you
> try disabling those (CONFIG_ARM64_LSE_ATOMICS) and see if the problem
> persists, please?
>

Ubuntu Xenial doesn't enable CONFIG_ARM64_LSE_ATOMICS, and it
is disabled in my v4.8 kernel config too; please see the attachment.


Thanks,
Ming
Jeff Layton Nov. 28, 2016, 1:40 p.m. UTC | #3
On Mon, 2016-11-28 at 11:10 +0800, Ming Lei wrote:
> Hi Guys,
> 
> When I run stress-ng via the following steps on one ARM64 dual
> socket system(Cavium Thunder), the kernel oops[1] can often be
> triggered after running the stress test for several hours(sometimes
> it may take longer):
> 
> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
> - apply the attachment patch which just makes the posix file
> lock stress test more aggressive
> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
> 
> 
> From the oops log, looks one garbage file_lock node is got
> from the linked list of 'ctx->flc_posix' when the issue happens.
> 
> BTW, the issue isn't observed on single socket Cavium Thunder yet,
> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
> too.
> 
> Thanks,
> Ming
> 

Some questions just for clarification:

- I assume this is being run on a local fs of some sort? ext4 or xfs or
something?

- have you seen this on any other arch, besides ARM?

The file locking code does do some lockless checking to see whether the
i_flctx is even present and whether the list is empty in
locks_remove_posix. It's possible we have some barrier problems there,
but I don't quite see how that would cause us to have a corrupt lock on
the flc_posix list.

> [1] kernel oops log
> ubuntu@ubuntu:~/git/stress-ng$ ./stress-ng --lockf 128 --aggressive
> stress-ng: info:  [63828] defaulting to a 86400 second run per stressor
> stress-ng: info:  [63828] dispatching hogs: 128 lockf
> stress-ng: info:  [63828] cache allocate: default cache size: 16384K
> [80659.799092] Unable to handle kernel NULL pointer dereference at
> virtual address 00000030
> [80659.807219] pgd = ffff81001f365800
> [80659.810683] [00000030] *pgd=000001001a290003,
> *pud=000001001a290003, *pmd=0000010fa07f0003, *pte=0000000000000000
> [80659.821029] Internal error: Oops: 96000007 [#1] SMP
> [80659.825901] Modules linked in:
> [80659.828962] CPU: 15 PID: 63848 Comm: stress-ng-lockf Tainted: G
>    W       4.8.0 #167
> [80659.837132] Hardware name: Cavium ThunderX CRB/To be filled by
> O.E.M., BIOS 5.11 12/12/2012
> [80659.845479] task: ffff81001ee78580 task.stack: ffff81001f798000
> [80659.851402] PC is at posix_locks_conflict+0x94/0xc0
> [80659.856282] LR is at posix_lock_inode+0x90/0x6b0
> [80659.860896] pc : [<ffff00000828c694>] lr : [<ffff00000828cd90>]
> pstate: a0000145
> [80659.868285] sp : ffff81001f79bca0
> [80659.871596] x29: ffff81001f79bca0 x28: ffff81001f798000
> [80659.876915] x27: ffff800fdffbc160 x26: 0000000000000000
> [80659.882234] x25: ffff800fd2da2b30 x24: ffff800fce927430
> [80659.887551] x23: ffff800fce92d8f0 x22: ffff81001f79bd30
> [80659.892869] x21: ffff800fd2da2b18 x20: fffffffffffffff8
> [80659.898187] x19: ffff800fdffbc160 x18: 0000000000001140
> [80659.903504] x17: 0000ffff8870a578 x16: ffff000008245768
> [80659.908821] x15: 0000ffff888bc000 x14: 0000000000000000
> [80659.914139] x13: 00000003e8000000 x12: 0000000000000018
> [80659.919457] x11: 00000000000e6a17 x10: 00000000ffffffd0
> [80659.924776] x9 : 0000000000000000 x8 : ffff800fce927500
> [80659.930094] x7 : 0000000000000000 x6 : 000000000000007f
> [80659.935413] x5 : 0000000000000080 x4 : ffff800fce927438
> [80659.940729] x3 : ffff800fce927458 x2 : 00000000000026b9
> [80659.946047] x1 : ffff81001f37f300 x0 : 0000000000000000
> [80659.951363]
> [80659.952851] Process stress-ng-lockf (pid: 63848, stack limit =
> 0xffff81001f798020)
> [80659.960415] Stack: (0xffff81001f79bca0 to 0xffff81001f79c000)
> [80659.966158] bca0: ffff81001f79bcc0 ffff00000828cd90
> fffffffffffffff8 ffff800fa3a66568
> [80659.973986] bcc0: ffff81001f79bd40 ffff00000828d5f0
> ffff800f8185c700 ffff800fdffbc160
> [80659.981812] bce0: 0000000000000006 0000000000000000
> ffff81001f79bdd0 0000000000000006
> [80659.989638] bd00: 0000000000000120 0000000000000019
> ffff0000088b1000 ffff81001f798000
> [80659.997465] bd20: ffff81001f79bd40 ffff000008403fec
> ffff81001f79bd30 ffff81001f79bd30
> [80660.005292] bd40: ffff81001f79bd70 ffff00000828d8bc
> ffff800f8185c700 ffff800fdffbc160
> [80660.013118] bd60: ffff800fdffbc1b8 ffff800f8185c700
> ffff81001f79bde0 ffff00000828ef10
> [80660.020944] bd80: ffff800f8185c700 0000000000000000
> ffff800fdffbc160 ffff800fa3a66568
> [80660.028770] bda0: 0000000000000006 0000000000000004
> ffff81001f79bde0 ffff00000828ee14
> [80660.036596] bdc0: ffff800f8185c700 00000000fffffff2
> ffff800fdffbc160 ffff810ff99aae80
> [80660.044423] bde0: ffff81001f79be70 ffff000008245b84
> ffff800f8185c700 ffff800f8185c700
> [80660.052249] be00: 0000000000000000 0000000000000006
> 0000ffffdad5d4b0 0000000000000004
> [80660.060087] be20: 0000000000000120 000000000000003e
> 0000000000010001 0000000000000000
> [80660.067916] be40: 0000000000000008 0000000000000000
> 0000000000010001 0000000000000000
> [80660.075742] be60: 0000000000000008 0000000000000000
> 0000000000000000 ffff0000080836f0
> [80660.083568] be80: 0000000000000000 00000000005c5000
> ffffffffffffffff 0000ffff8870a3b8
> [80660.091394] bea0: 0000000080000000 0000000000000015
> 0000000080000000 00000000005c5000
> [80660.099220] bec0: 0000000000000004 0000000000000006
> 0000ffffdad5d4b0 00000000ffffff80
> [80660.107046] bee0: 0000ffffdad5d490 0000000026c26373
> 000000000000176f 0000000000004650
> [80660.114873] bf00: 0000000000000019 0000000000006536
> 00000000ffffffd0 00000000000e6a17
> [80660.122698] bf20: 0000000000000018 00000003e8000000
> 0000000000000000 0000ffff888bc000
> [80660.130524] bf40: 000000000048a170 0000ffff8870a578
> 0000000000001140 000000000000055f
> [80660.138351] bf60: 00000000005c5000 0000000000000004
> 0000ffff879f9008 0000000000000000
> [80660.146177] bf80: 0000000000000002 000000000048b530
> 2001000800400201 0000ffffdad60758
> [80660.154004] bfa0: 000000000048b008 0000ffffdad5d390
> 0000ffff8870a518 0000ffffdad5d390
> [80660.161830] bfc0: 0000ffff8870a3b8 0000000080000000
> 0000000000000004 0000000000000019
> [80660.169656] bfe0: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [80660.177481] Call trace:
> [80660.179928] Exception stack(0xffff81001f79bad0 to 0xffff81001f79bc00)
> [80660.186365] bac0:
> ffff800fdffbc160 0001000000000000
> [80660.194192] bae0: ffff81001f79bca0 ffff00000828c694
> ffff800fc0002c00 ffff81001ee78600
> [80660.202017] bb00: ffff81001f79bb70 ffff00000820b57c
> ffff800fcb2a6d88 ffff800fc0002c00
> [80660.209843] bb20: 0000000000000001 ffff810008ddbf00
> ffff81001f79bc30 ffff81001f79bc30
> [80660.217670] bb40: 0000000000000000 ffff810fa0712be8
> ffff800f81dfd680 ffff810fa0712be8
> [80660.225496] bb60: 0000000000000001 ffff810008ddbf00
> 0000000000000000 ffff81001f37f300
> [80660.233322] bb80: 00000000000026b9 ffff800fce927458
> ffff800fce927438 0000000000000080
> [80660.241148] bba0: 000000000000007f 0000000000000000
> ffff800fce927500 0000000000000000
> [80660.248974] bbc0: 00000000ffffffd0 00000000000e6a17
> 0000000000000018 00000003e8000000
> [80660.256800] bbe0: 0000000000000000 0000ffff888bc000
> ffff000008245768 0000ffff8870a578
> [80660.264636] [<ffff00000828c694>] posix_locks_conflict+0x94/0xc0
> [80660.270559] [<ffff00000828cd90>] posix_lock_inode+0x90/0x6b0
> [80660.276220] [<ffff00000828d5f0>] vfs_lock_file+0x68/0x78
> [80660.281537] [<ffff00000828d8bc>] do_lock_file_wait+0x54/0xe0
> [80660.287199] [<ffff00000828ef10>] fcntl_setlk+0x1c0/0x308
> [80660.292513] [<ffff000008245b84>] SyS_fcntl+0x41c/0x5b8
> [80660.297653] [<ffff0000080836f0>] el0_svc_naked+0x24/0x28
> [80660.302961] Code: a8c27bfd d65f03c0 d503201f f9401e61 (f9401e80)
> [80660.309188] ---[ end trace aa50050684d3a3fe ]---
Ming Lei Nov. 29, 2016, 1:14 a.m. UTC | #4
Hi Jeff,

On Mon, Nov 28, 2016 at 9:40 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 2016-11-28 at 11:10 +0800, Ming Lei wrote:
>> Hi Guys,
>>
>> When I run stress-ng via the following steps on one ARM64 dual
>> socket system(Cavium Thunder), the kernel oops[1] can often be
>> triggered after running the stress test for several hours(sometimes
>> it may take longer):
>>
>> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
>> - apply the attachment patch which just makes the posix file
>> lock stress test more aggressive
>> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
>>
>>
>> From the oops log, looks one garbage file_lock node is got
>> from the linked list of 'ctx->flc_posix' when the issue happens.
>>
>> BTW, the issue isn't observed on single socket Cavium Thunder yet,
>> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
>> too.
>>
>> Thanks,
>> Ming
>>
>
> Some questions just for clarification:
>
> - I assume this is being run on a local fs of some sort? ext4 or xfs or
> something?

Yes, I have only tested it on local ext4, and have not tested it on other filesystems yet.

>
> - have you seen this on any other arch, besides ARM?

I ran the same tests on x86 before, and did not see the issue.

>
> The file locking code does do some lockless checking to see whether the
> i_flctx is even present and whether the list is empty in
> locks_remove_posix. It's possible we have some barrier problems there,
> but I don't quite see how that would cause us to have a corrupt lock on
> the flc_posix list.

Yeah, I looked at posix_lock_inode(), and it seems both the add and
remove operations are protected by the lock.

Thanks,
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon Dec. 1, 2016, 11:30 a.m. UTC | #5
On Mon, Nov 28, 2016 at 11:10:14AM +0800, Ming Lei wrote:
> When I run stress-ng via the following steps on one ARM64 dual
> socket system(Cavium Thunder), the kernel oops[1] can often be
> triggered after running the stress test for several hours(sometimes
> it may take longer):
> 
> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
> - apply the attachment patch which just makes the posix file
> lock stress test more aggressive
> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
> 
> 
> From the oops log, looks one garbage file_lock node is got
> from the linked list of 'ctx->flc_posix' when the issue happens.
> 
> BTW, the issue isn't observed on single socket Cavium Thunder yet,
> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
> too.

FWIW, I've been running this on Seattle for 24 hours with your patch applied
and not seen any problems yet. That said, Thomas did just fix an rt_mutex
race which only seemed to pop up on Thunder, so you could give those
patches a try.

  https://lkml.kernel.org/r/20161130205431.629977871@linutronix.de

Will
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Dec. 6, 2016, 9:53 a.m. UTC | #6
Hi Will,

On Thu, Dec 1, 2016 at 7:30 PM, Will Deacon <will.deacon@arm.com> wrote:
> On Mon, Nov 28, 2016 at 11:10:14AM +0800, Ming Lei wrote:
>> When I run stress-ng via the following steps on one ARM64 dual
>> socket system(Cavium Thunder), the kernel oops[1] can often be
>> triggered after running the stress test for several hours(sometimes
>> it may take longer):
>>
>> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
>> - apply the attachment patch which just makes the posix file
>> lock stress test more aggressive
>> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
>>
>>
>> From the oops log, looks one garbage file_lock node is got
>> from the linked list of 'ctx->flc_posix' when the issue happens.
>>
>> BTW, the issue isn't observed on single socket Cavium Thunder yet,
>> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
>> too.
>
> FWIW, I've been running this on Seattle for 24 hours with your patch applied
> and not seen any problems yet. That said, Thomas did just fix an rt_mutex
> race which only seemed to pop up on Thunder, so you could give those
> patches a try.
>
>   https://lkml.kernel.org/r/20161130205431.629977871@linutronix.de

I applied the patch against the Ubuntu Yakkety kernel (v4.8-based), and ran
the test again on a dual-socket Cavium ThunderX system, and the
issue can still be triggered.

So it looks like this is not the same issue as David Daney's.

Anyway, thank you for providing this input!

Thanks,
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Dec. 8, 2016, 3:57 p.m. UTC | #7
Hi,

On Mon, Nov 28, 2016 at 9:40 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
> On Mon, 2016-11-28 at 11:10 +0800, Ming Lei wrote:
>> Hi Guys,
>>
>> When I run stress-ng via the following steps on one ARM64 dual
>> socket system(Cavium Thunder), the kernel oops[1] can often be
>> triggered after running the stress test for several hours(sometimes
>> it may take longer):
>>
>> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
>> - apply the attachment patch which just makes the posix file
>> lock stress test more aggressive
>> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
>>
>>
>> From the oops log, looks one garbage file_lock node is got
>> from the linked list of 'ctx->flc_posix' when the issue happens.
>>
>> BTW, the issue isn't observed on single socket Cavium Thunder yet,
>> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
>> too.
>>
>> Thanks,
>> Ming
>>
>
> Some questions just for clarification:
>
> - I assume this is being run on a local fs of some sort? ext4 or xfs or
> something?
>
> - have you seen this on any other arch, besides ARM?
>
> The file locking code does do some lockless checking to see whether the
> i_flctx is even present and whether the list is empty in
> locks_remove_posix. It's possible we have some barrier problems there,

I have used eBPF tracing to see what is going on while 'stress-ng --lockf'
is running, and almost all exported symbols in fs/locks.c are covered.

Except for locks_alloc/locks_free/locks_copy/locks_init, the only observable
symbols are fcntl_setlk, vfs_lock_file and locks_remove_posix, but
locks_remove_posix() is only run at the beginning and end of the
test.

So it seems the issue is not related to locks_remove_posix().

So it looks like only fcntl_setlk() is running from different contexts
during the test, but in this path 'ctx->flc_lock' is always held when
operating on the list.
That said, it is very strange to see the list corrupted even though it is
protected by the lock.

Thanks,
Ming

> but I don't quite see how that would cause us to have a corrupt lock on
> the flc_posix list.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ming Lei Dec. 19, 2016, 9:59 a.m. UTC | #8
Hi,

On Thu, Dec 8, 2016 at 11:57 PM, Ming Lei <ming.lei@canonical.com> wrote:
> Hi,
>
> On Mon, Nov 28, 2016 at 9:40 PM, Jeff Layton <jlayton@poochiereds.net> wrote:
>> On Mon, 2016-11-28 at 11:10 +0800, Ming Lei wrote:
>>> Hi Guys,
>>>
>>> When I run stress-ng via the following steps on one ARM64 dual
>>> socket system(Cavium Thunder), the kernel oops[1] can often be
>>> triggered after running the stress test for several hours(sometimes
>>> it may take longer):
>>>
>>> - git clone git://kernel.ubuntu.com/cking/stress-ng.git
>>> - apply the attachment patch which just makes the posix file
>>> lock stress test more aggressive
>>> - run the test via '~/git/stress-ng$./stress-ng --lockf 128 --aggressive'
>>>
>>>
>>> From the oops log, looks one garbage file_lock node is got
>>> from the linked list of 'ctx->flc_posix' when the issue happens.
>>>
>>> BTW, the issue isn't observed on single socket Cavium Thunder yet,
>>> and the same issue can be seen on Ubuntu Xenial(v4.4 based kernel)
>>> too.
>>>
>>> Thanks,
>>> Ming
>>>
>>
>> Some questions just for clarification:
>>
>> - I assume this is being run on a local fs of some sort? ext4 or xfs or
>> something?
>>
>> - have you seen this on any other arch, besides ARM?
>>
>> The file locking code does do some lockless checking to see whether the
>> i_flctx is even present and whether the list is empty in
>> locks_remove_posix. It's possible we have some barrier problems there,
>
> I have used ebpf trace to see what is going on when 'stress-ng --lockf'
> is running, and almost all exported symbols in fs/locks.c are covered.
>
> Except for locks_alloc/locks_free/locks_copy/locks_init, the only observable
> symbols are fcntl_setlk, vfs_lock_file and locks_remove_posix, but
> locks_remove_posix() is just run at the begining and ending of the
> test.
>
> So seems not related with locks_remove_posix().
>
> Then looks only fcntl_setlk() is running from different contexts
> during the test,
> but in this path, the 'ctx->flc_lock' is always held when operating the list.
> That said it is very strange to see the list corrupted even though it is
> protected by the lock.

After some analysis of traces collected recently, there are a few findings:

1) the spinlock usage (ctx->flc_lock) is correct

2) the kernel oops (file lock corruption) always happens in the
child task of stress-ng-lockf, which isn't affected by
sched_setaffinity(), while the stress-ng-lockf process itself is scheduled
at random from one CPU to another CPU on the other socket, according to
the sched_setaffinity() calls made from the stress-ng main task.

Thanks,
Ming
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/stress-lockf.c b/stress-lockf.c
index fb2f7b5f085e..232c269d0ec8 100644
--- a/stress-lockf.c
+++ b/stress-lockf.c
@@ -43,7 +43,7 @@ 
 
 #define LOCK_FILE_SIZE	(64 * 1024)
 #define LOCK_SIZE	(8)
-#define LOCK_MAX	(1024)
+#define LOCK_MAX	(8192)
 
 typedef struct lockf_info {
 	off_t	offset;
diff --git a/stress-ng.c b/stress-ng.c
index 7ab2ab1c42a6..de48d2a34738 100644
--- a/stress-ng.c
+++ b/stress-ng.c
@@ -1642,12 +1642,22 @@  static void kill_procs(const int sig)
 		int j;
 
 		for (j = 0; j < procs[i].started_procs; j++) {
-			if (procs[i].pids[j])
-				(void)kill(procs[i].pids[j], signum);
+			if (procs[i].pids[j].pid)
+				(void)kill(procs[i].pids[j].pid, signum);
 		}
 	}
 }
 
+static int on_one_sock(unsigned cpu1, unsigned cpu2, unsigned cpus)
+{
+	unsigned mid = cpus / 2;
+	if (cpu1 < mid && cpu2 < mid)
+		return 1;
+	if (cpu1 >= mid && cpu2 >= mid)
+		return 1;
+	return 0;
+}
+
 /*
  *  wait_procs()
  * 	wait for procs
@@ -1670,7 +1680,8 @@  static void MLOCKED wait_procs(bool *success, bool *resource_success)
 		cpu_set_t proc_mask;
 		unsigned long int cpu = 0;
 		const uint32_t ticks_per_sec = stress_get_ticks_per_second() * 5;
-		const useconds_t usec_sleep = ticks_per_sec ? 1000000 / ticks_per_sec : 1000000 / 250;
+		//const useconds_t usec_sleep = ticks_per_sec ? 1000000 / ticks_per_sec : 1000000 / 250;
+		const useconds_t usec_sleep = 50000;
 
 		while (opt_do_wait) {
 			const int32_t cpus = stress_get_processors_configured();
@@ -1685,7 +1696,9 @@  static void MLOCKED wait_procs(bool *success, bool *resource_success)
 				int j;
 
 				for (j = 0; j < procs[i].started_procs; j++) {
-					const pid_t pid = procs[i].pids[j];
+					const pid_t pid = procs[i].pids[j].pid;
+					unsigned last_cpu =
+						procs[i].pids[j].last_cpu;
 					if (pid) {
 						cpu_set_t mask;
 						int32_t cpu_num;
@@ -1694,10 +1707,16 @@  static void MLOCKED wait_procs(bool *success, bool *resource_success)
 							cpu_num = mwc32() % cpus;
 						} while (!(CPU_ISSET(cpu_num, &proc_mask)));
 
+						if (on_one_sock(last_cpu,
+									cpu_num,
+									cpus))
+							cpu_num = cpus - cpu_num -1;
+
 						CPU_ZERO(&mask);
 						CPU_SET(cpu_num, &mask);
 						if (sched_setaffinity(pid, sizeof(mask), &mask) < 0)
 							goto do_wait;
+						procs[i].pids[j].last_cpu = cpu_num;
 					}
 				}
 			}
@@ -1713,7 +1732,7 @@  do_wait:
 		for (j = 0; j < procs[i].started_procs; j++) {
 			pid_t pid;
 redo:
-			pid = procs[i].pids[j];
+			pid = procs[i].pids[j].pid;
 			if (pid) {
 				int status, ret;
 
@@ -1752,7 +1771,7 @@  redo:
 						*success = false;
 						break;
 					}
-					proc_finished(&procs[i].pids[j]);
+					proc_finished(&procs[i].pids[j].pid);
 					pr_dbg(stderr, "process [%d] terminated\n", ret);
 				} else if (ret == -1) {
 					/* Somebody interrupted the wait */
@@ -1760,7 +1779,7 @@  redo:
 						goto redo;
 					/* This child did not exist, mark it done anyhow */
 					if (errno == ECHILD)
-						proc_finished(&procs[i].pids[j]);
+						proc_finished(&procs[i].pids[j].pid);
 				}
 			}
 		}
@@ -1901,7 +1920,7 @@  again:
 				default:
 					if (pid > -1) {
 						(void)setpgid(pid, pgrp);
-						procs[i].pids[j] = pid;
+						procs[i].pids[j].pid = pid;
 						procs[i].started_procs++;
 					}
 
@@ -2952,7 +2971,7 @@  next_opt:
 		/* Sequential mode has no bogo ops threshold */
 		for (i = 0; i < STRESS_MAX; i++) {
 			procs[i].bogo_ops = 0;
-			procs[i].pids = calloc(opt_sequential, sizeof(pid_t));
+			procs[i].pids = calloc(opt_sequential, sizeof(my_pid_t));
 			if (!procs[i].pids) {
 				pr_err(stderr, "cannot allocate pid list\n");
 				free_procs();
@@ -2980,7 +2999,7 @@  next_opt:
 			if (max_procs < procs[i].num_procs)
 				max_procs = procs[i].num_procs;
 			if (procs[i].num_procs) {
-				procs[i].pids = calloc(procs[i].num_procs, sizeof(pid_t));
+				procs[i].pids = calloc(procs[i].num_procs, sizeof(my_pid_t));
 				if (!procs[i].pids) {
 					pr_err(stderr, "cannot allocate pid list\n");
 					free_procs();
@@ -3008,7 +3027,7 @@  next_opt:
 			if (max_procs < procs[i].num_procs)
 				max_procs = procs[i].num_procs;
 			if (procs[i].num_procs) {
-				procs[i].pids = calloc(procs[i].num_procs, sizeof(pid_t));
+				procs[i].pids = calloc(procs[i].num_procs, sizeof(my_pid_t));
 				if (!procs[i].pids) {
 					pr_err(stderr, "cannot allocate pid list\n");
 					free_procs();
diff --git a/stress-ng.h b/stress-ng.h
index c5104a322e45..4f4098fc0d11 100644
--- a/stress-ng.h
+++ b/stress-ng.h
@@ -1713,7 +1713,12 @@  typedef struct {
 } stress_t;
 
 typedef struct {
-	pid_t	*pids;			/* process id */
+	pid_t pid;
+	unsigned last_cpu;
+} my_pid_t;
+
+typedef struct {
+	my_pid_t *pids;			/* process id */
 	int32_t started_procs;		/* count of started processes */
 	int32_t num_procs;		/* number of process per stressor */
 	uint64_t bogo_ops;		/* number of bogo ops */