Message ID | 1408747772-37938-1-git-send-email-trond.myklebust@primarydata.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Trond, Thanks for looking into this issue. PF_FSTRANS is also used in file net/sunrpc/xprtrdma/transport.c - xprt_rdma_connect_worker(), do we also add PF_MEMALLOC_NOIO there? Also PF_FSTRANS seemed duplicated with PF_MEMALLOC_NOIO, can we directly replace it with PF_MEMALLOC_NOIO? Thanks, Junxiao. On 08/23/2014 06:49 AM, Trond Myklebust wrote: > Junxiao Bi reports seeing the following deadlock: > > @ crash> bt 1539 > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > @ #12 [ffff88178f64d978] kmem_getpages at ffffffff8110e18b > @ #13 [ffff88178f64d9a8] fallback_alloc at ffffffff8110e35e > @ #14 [ffff88178f64da08] ____cache_alloc_node at ffffffff8110e51f > @ #15 [ffff88178f64da48] __kmalloc at ffffffff8110efba > @ #16 [ffff88178f64da98] xs_setup_xprt at ffffffffa00a563f [sunrpc] > @ #17 [ffff88178f64dad8] xs_setup_tcp at ffffffffa00a7648 [sunrpc] > @ #18 [ffff88178f64daf8] xprt_create_transport at ffffffffa00a478f [sunrpc] > @ #19 [ffff88178f64db18] rpc_create at ffffffffa00a2d7a [sunrpc] > @ #20 [ffff88178f64dbf8] rpcb_create at ffffffffa00b026b [sunrpc] > @ #21 [ffff88178f64dc98] rpcb_getport_async at ffffffffa00b0c94 [sunrpc] > @ #22 [ffff88178f64ddf8] call_bind at ffffffffa00a11f8 [sunrpc] > @ #23 [ffff88178f64de18] 
__rpc_execute at ffffffffa00a88ef [sunrpc] > @ #24 [ffff88178f64de58] rpc_async_schedule at ffffffffa00a9187 [sunrpc] > @ #25 [ffff88178f64de78] worker_thread at ffffffff81072ed2 > @ #26 [ffff88178f64dee8] kthread at ffffffff81076df3 > @ #27 [ffff88178f64df48] kernel_thread at ffffffff81012e2a > @ crash> > > Junxiao notes that the problem is not limited to the rpcbind client. In > fact we can trigger the exact same problem when trying to reconnect to > the server, and we find ourselves calling sock_alloc(). > > The following solution should work for all kernels that support the > PF_MEMALLOC_NOIO flag (i.e. Linux 3.9 and newer). > > Link: http://lkml.kernel.org/r/53F6F772.6020708@oracle.com > Reported-by: Junxiao Bi <junxiao.bi@oracle.com> > Cc: stable@vger.kernel.org # 3.9+ > Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> > --- > net/sunrpc/sched.c | 5 +++-- > net/sunrpc/xprtsock.c | 15 ++++++++------- > 2 files changed, 11 insertions(+), 9 deletions(-) > > diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c > index 9358c79fd589..ab3aff71ff93 100644 > --- a/net/sunrpc/sched.c > +++ b/net/sunrpc/sched.c > @@ -19,6 +19,7 @@ > #include <linux/spinlock.h> > #include <linux/mutex.h> > #include <linux/freezer.h> > +#include <linux/sched.h> > > #include <linux/sunrpc/clnt.h> > > @@ -821,9 +822,9 @@ void rpc_execute(struct rpc_task *task) > > static void rpc_async_schedule(struct work_struct *work) > { > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /** > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c > index 43cd89eacfab..1d6d4d84b299 100644 > --- a/net/sunrpc/xprtsock.c > +++ b/net/sunrpc/xprtsock.c > @@ -38,6 +38,7 @@ > #include <linux/sunrpc/svcsock.h> > #include <linux/sunrpc/xprtsock.h> > #include <linux/file.h> > +#include 
<linux/sched.h> > #ifdef CONFIG_SUNRPC_BACKCHANNEL > #include <linux/sunrpc/bc_xprt.h> > #endif > @@ -1927,7 +1928,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) > struct socket *sock; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); > status = __sock_create(xprt->xprt_net, AF_LOCAL, > @@ -1968,7 +1969,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) > out: > xprt_clear_connecting(xprt); > xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > return status; > } > > @@ -2071,7 +2072,7 @@ static void xs_udp_setup_socket(struct work_struct *work) > struct socket *sock = transport->sock; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > /* Start by resetting any existing state */ > xs_reset_transport(transport); > @@ -2092,7 +2093,7 @@ static void xs_udp_setup_socket(struct work_struct *work) > out: > xprt_clear_connecting(xprt); > xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /* > @@ -2229,7 +2230,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) > struct rpc_xprt *xprt = &transport->xprt; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > if (!sock) { > clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); > @@ -2276,7 +2277,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) > case -EINPROGRESS: > case -EALREADY: > xprt_clear_connecting(xprt); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > return; > case -EINVAL: > /* Happens, for instance, if the user specified a link > @@ -2294,7 +2295,7 @@ out_eagain: > out: > xprt_clear_connecting(xprt); > 
xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /** > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust <trond.myklebust@primarydata.com> wrote: > Junxiao Bi reports seeing the following deadlock: > > @ crash> bt 1539 > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 This stack trace (from 2.6.32) cannot happen in mainline, though it took me a while to remember/discover exactly why. try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup set to NULL. shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if it is NULL, wait_on_page_writeback is *not* called. So we can only hit this deadlock if mem-cgroup limits are imposed on a process which is using NFS - which is quite possible but probably not common. The fact that a deadlock can happen only when memcg limits are imposed seems very fragile. People aren't going to test that case much so there could well be other deadlock possibilities lurking. Mel: might there be some other way we could get out of this deadlock? Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out wait or something? Any other way out of this deadlock other than setting PF_MEMALLOC_NOIO everywhere? 
Thanks, NeilBrown > @ #12 [ffff88178f64d978] kmem_getpages at ffffffff8110e18b > @ #13 [ffff88178f64d9a8] fallback_alloc at ffffffff8110e35e > @ #14 [ffff88178f64da08] ____cache_alloc_node at ffffffff8110e51f > @ #15 [ffff88178f64da48] __kmalloc at ffffffff8110efba > @ #16 [ffff88178f64da98] xs_setup_xprt at ffffffffa00a563f [sunrpc] > @ #17 [ffff88178f64dad8] xs_setup_tcp at ffffffffa00a7648 [sunrpc] > @ #18 [ffff88178f64daf8] xprt_create_transport at ffffffffa00a478f [sunrpc] > @ #19 [ffff88178f64db18] rpc_create at ffffffffa00a2d7a [sunrpc] > @ #20 [ffff88178f64dbf8] rpcb_create at ffffffffa00b026b [sunrpc] > @ #21 [ffff88178f64dc98] rpcb_getport_async at ffffffffa00b0c94 [sunrpc] > @ #22 [ffff88178f64ddf8] call_bind at ffffffffa00a11f8 [sunrpc] > @ #23 [ffff88178f64de18] __rpc_execute at ffffffffa00a88ef [sunrpc] > @ #24 [ffff88178f64de58] rpc_async_schedule at ffffffffa00a9187 [sunrpc] > @ #25 [ffff88178f64de78] worker_thread at ffffffff81072ed2 > @ #26 [ffff88178f64dee8] kthread at ffffffff81076df3 > @ #27 [ffff88178f64df48] kernel_thread at ffffffff81012e2a > @ crash> > > Junxiao notes that the problem is not limited to the rpcbind client. In > fact we can trigger the exact same problem when trying to reconnect to > the server, and we find ourselves calling sock_alloc(). > > The following solution should work for all kernels that support the > PF_MEMALLOC_NOIO flag (i.e. Linux 3.9 and newer). 
> > Link: http://lkml.kernel.org/r/53F6F772.6020708@oracle.com > Reported-by: Junxiao Bi <junxiao.bi@oracle.com> > Cc: stable@vger.kernel.org # 3.9+ > Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> > --- > net/sunrpc/sched.c | 5 +++-- > net/sunrpc/xprtsock.c | 15 ++++++++------- > 2 files changed, 11 insertions(+), 9 deletions(-) > > diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c > index 9358c79fd589..ab3aff71ff93 100644 > --- a/net/sunrpc/sched.c > +++ b/net/sunrpc/sched.c > @@ -19,6 +19,7 @@ > #include <linux/spinlock.h> > #include <linux/mutex.h> > #include <linux/freezer.h> > +#include <linux/sched.h> > > #include <linux/sunrpc/clnt.h> > > @@ -821,9 +822,9 @@ void rpc_execute(struct rpc_task *task) > > static void rpc_async_schedule(struct work_struct *work) > { > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /** > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c > index 43cd89eacfab..1d6d4d84b299 100644 > --- a/net/sunrpc/xprtsock.c > +++ b/net/sunrpc/xprtsock.c > @@ -38,6 +38,7 @@ > #include <linux/sunrpc/svcsock.h> > #include <linux/sunrpc/xprtsock.h> > #include <linux/file.h> > +#include <linux/sched.h> > #ifdef CONFIG_SUNRPC_BACKCHANNEL > #include <linux/sunrpc/bc_xprt.h> > #endif > @@ -1927,7 +1928,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) > struct socket *sock; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); > status = __sock_create(xprt->xprt_net, AF_LOCAL, > @@ -1968,7 +1969,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) > out: > xprt_clear_connecting(xprt); > xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= 
~(PF_FSTRANS | PF_MEMALLOC_NOIO); > return status; > } > > @@ -2071,7 +2072,7 @@ static void xs_udp_setup_socket(struct work_struct *work) > struct socket *sock = transport->sock; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > /* Start by resetting any existing state */ > xs_reset_transport(transport); > @@ -2092,7 +2093,7 @@ static void xs_udp_setup_socket(struct work_struct *work) > out: > xprt_clear_connecting(xprt); > xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /* > @@ -2229,7 +2230,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) > struct rpc_xprt *xprt = &transport->xprt; > int status = -EIO; > > - current->flags |= PF_FSTRANS; > + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; > > if (!sock) { > clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); > @@ -2276,7 +2277,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) > case -EINPROGRESS: > case -EALREADY: > xprt_clear_connecting(xprt); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > return; > case -EINVAL: > /* Happens, for instance, if the user specified a link > @@ -2294,7 +2295,7 @@ out_eagain: > out: > xprt_clear_connecting(xprt); > xprt_wake_pending_tasks(xprt, status); > - current->flags &= ~PF_FSTRANS; > + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); > } > > /**
On 08/25/2014 02:48 PM, NeilBrown wrote: > On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > <trond.myklebust@primarydata.com> wrote: > >> Junxiao Bi reports seeing the following deadlock: >> >> @ crash> bt 1539 >> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > > This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > while to remember/discover exactly why. > > try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > set to NULL. > shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > it is NULL, wait_on_page_writeback is *not* called. > > So we can only hit this deadlock if mem-cgroup limits are imposed on a > process which is using NFS - which is quite possible but probably not common. > > The fact that a dead-lock can happen only when memcg limits are imposed seems > very fragile. People aren't going to test that case much so there could well > be other deadlock possibilities lurking. > > Mel: might there be some other way we could get out of this deadlock? > Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > wait or something? 
Any other wait out of this deadlock other than setting > PF_MEMALLOC_NOIO everywhere? Not only the wait_on_page_writeback() causes the deadlock but also the next pageout()-> (mapping->a_ops->writepage), Trond's second patch fixes this. So fixing the wait_on_page_writeback() is not enough to fix the deadlock. Thanks, Junxiao. > > Thanks, > NeilBrown > > > >> @ #12 [ffff88178f64d978] kmem_getpages at ffffffff8110e18b >> @ #13 [ffff88178f64d9a8] fallback_alloc at ffffffff8110e35e >> @ #14 [ffff88178f64da08] ____cache_alloc_node at ffffffff8110e51f >> @ #15 [ffff88178f64da48] __kmalloc at ffffffff8110efba >> @ #16 [ffff88178f64da98] xs_setup_xprt at ffffffffa00a563f [sunrpc] >> @ #17 [ffff88178f64dad8] xs_setup_tcp at ffffffffa00a7648 [sunrpc] >> @ #18 [ffff88178f64daf8] xprt_create_transport at ffffffffa00a478f [sunrpc] >> @ #19 [ffff88178f64db18] rpc_create at ffffffffa00a2d7a [sunrpc] >> @ #20 [ffff88178f64dbf8] rpcb_create at ffffffffa00b026b [sunrpc] >> @ #21 [ffff88178f64dc98] rpcb_getport_async at ffffffffa00b0c94 [sunrpc] >> @ #22 [ffff88178f64ddf8] call_bind at ffffffffa00a11f8 [sunrpc] >> @ #23 [ffff88178f64de18] __rpc_execute at ffffffffa00a88ef [sunrpc] >> @ #24 [ffff88178f64de58] rpc_async_schedule at ffffffffa00a9187 [sunrpc] >> @ #25 [ffff88178f64de78] worker_thread at ffffffff81072ed2 >> @ #26 [ffff88178f64dee8] kthread at ffffffff81076df3 >> @ #27 [ffff88178f64df48] kernel_thread at ffffffff81012e2a >> @ crash> >> >> Junxiao notes that the problem is not limited to the rpcbind client. In >> fact we can trigger the exact same problem when trying to reconnect to >> the server, and we find ourselves calling sock_alloc(). >> >> The following solution should work for all kernels that support the >> PF_MEMALLOC_NOIO flag (i.e. Linux 3.9 and newer). 
>> >> Link: http://lkml.kernel.org/r/53F6F772.6020708@oracle.com >> Reported-by: Junxiao Bi <junxiao.bi@oracle.com> >> Cc: stable@vger.kernel.org # 3.9+ >> Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> >> --- >> net/sunrpc/sched.c | 5 +++-- >> net/sunrpc/xprtsock.c | 15 ++++++++------- >> 2 files changed, 11 insertions(+), 9 deletions(-) >> >> diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c >> index 9358c79fd589..ab3aff71ff93 100644 >> --- a/net/sunrpc/sched.c >> +++ b/net/sunrpc/sched.c >> @@ -19,6 +19,7 @@ >> #include <linux/spinlock.h> >> #include <linux/mutex.h> >> #include <linux/freezer.h> >> +#include <linux/sched.h> >> >> #include <linux/sunrpc/clnt.h> >> >> @@ -821,9 +822,9 @@ void rpc_execute(struct rpc_task *task) >> >> static void rpc_async_schedule(struct work_struct *work) >> { >> - current->flags |= PF_FSTRANS; >> + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; >> __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); >> - current->flags &= ~PF_FSTRANS; >> + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); >> } >> >> /** >> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c >> index 43cd89eacfab..1d6d4d84b299 100644 >> --- a/net/sunrpc/xprtsock.c >> +++ b/net/sunrpc/xprtsock.c >> @@ -38,6 +38,7 @@ >> #include <linux/sunrpc/svcsock.h> >> #include <linux/sunrpc/xprtsock.h> >> #include <linux/file.h> >> +#include <linux/sched.h> >> #ifdef CONFIG_SUNRPC_BACKCHANNEL >> #include <linux/sunrpc/bc_xprt.h> >> #endif >> @@ -1927,7 +1928,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) >> struct socket *sock; >> int status = -EIO; >> >> - current->flags |= PF_FSTRANS; >> + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; >> >> clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); >> status = __sock_create(xprt->xprt_net, AF_LOCAL, >> @@ -1968,7 +1969,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) >> out: >> xprt_clear_connecting(xprt); >> xprt_wake_pending_tasks(xprt, status); >> - 
current->flags &= ~PF_FSTRANS; >> + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); >> return status; >> } >> >> @@ -2071,7 +2072,7 @@ static void xs_udp_setup_socket(struct work_struct *work) >> struct socket *sock = transport->sock; >> int status = -EIO; >> >> - current->flags |= PF_FSTRANS; >> + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; >> >> /* Start by resetting any existing state */ >> xs_reset_transport(transport); >> @@ -2092,7 +2093,7 @@ static void xs_udp_setup_socket(struct work_struct *work) >> out: >> xprt_clear_connecting(xprt); >> xprt_wake_pending_tasks(xprt, status); >> - current->flags &= ~PF_FSTRANS; >> + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); >> } >> >> /* >> @@ -2229,7 +2230,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) >> struct rpc_xprt *xprt = &transport->xprt; >> int status = -EIO; >> >> - current->flags |= PF_FSTRANS; >> + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; >> >> if (!sock) { >> clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); >> @@ -2276,7 +2277,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) >> case -EINPROGRESS: >> case -EALREADY: >> xprt_clear_connecting(xprt); >> - current->flags &= ~PF_FSTRANS; >> + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); >> return; >> case -EINVAL: >> /* Happens, for instance, if the user specified a link >> @@ -2294,7 +2295,7 @@ out_eagain: >> out: >> xprt_clear_connecting(xprt); >> xprt_wake_pending_tasks(xprt, status); >> - current->flags &= ~PF_FSTRANS; >> + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); >> } >> >> /** > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 26 Aug 2014 13:43:47 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: > On 08/25/2014 02:48 PM, NeilBrown wrote: > > On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > > <trond.myklebust@primarydata.com> wrote: > > > >> Junxiao Bi reports seeing the following deadlock: > >> > >> @ crash> bt 1539 > >> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > >> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > >> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > >> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > >> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > >> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > >> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > >> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > >> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > >> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > >> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > >> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > >> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > > > > This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > > while to remember/discover exactly why. > > > > try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > > set to NULL. > > shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > > it is NULL, wait_on_page_writeback is *not* called. > > > > So we can only hit this deadlock if mem-cgroup limits are imposed on a > > process which is using NFS - which is quite possible but probably not common. > > > > The fact that a dead-lock can happen only when memcg limits are imposed seems > > very fragile. People aren't going to test that case much so there could well > > be other deadlock possibilities lurking. > > > > Mel: might there be some other way we could get out of this deadlock? 
> > Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > > wait or something? Any other wait out of this deadlock other than setting > > PF_MEMALLOC_NOIO everywhere? > > Not only the wait_on_page_writeback() cause the deadlock but also the > next pageout()-> (mapping->a_ops->writepage), Trond's second patch fix > this. So fix the wait_on_page_writeback is not enough to fix deadlock. Shortly before the only place that pageout() is called there is this code: if (page_is_file_cache(page) && (!current_is_kswapd() || !zone_is_reclaim_dirty(zone))) { ..... goto keep_locked; So pageout() only gets called by kswapd() .... or for swap. swap-over-NFS is already very cautious about memory allocations, and uses nfs_direct_IO, not nfs_writepage. So nfs_writepage will never get called during direct reclaim. There is no memory-allocate deadlock risk there. NeilBrown
On 08/26/2014 02:21 PM, NeilBrown wrote: > On Tue, 26 Aug 2014 13:43:47 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: > >> On 08/25/2014 02:48 PM, NeilBrown wrote: >>> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >>> <trond.myklebust@primarydata.com> wrote: >>> >>>> Junxiao Bi reports seeing the following deadlock: >>>> >>>> @ crash> bt 1539 >>>> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >>>> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >>>> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >>>> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >>>> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >>>> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >>>> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >>>> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >>>> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >>>> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >>>> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >>>> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >>>> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >>> >>> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >>> while to remember/discover exactly why. >>> >>> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >>> set to NULL. >>> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >>> it is NULL, wait_on_page_writeback is *not* called. >>> >>> So we can only hit this deadlock if mem-cgroup limits are imposed on a >>> process which is using NFS - which is quite possible but probably not common. >>> >>> The fact that a dead-lock can happen only when memcg limits are imposed seems >>> very fragile. People aren't going to test that case much so there could well >>> be other deadlock possibilities lurking. 
>>> >>> Mel: might there be some other way we could get out of this deadlock? >>> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >>> wait or something? Any other wait out of this deadlock other than setting >>> PF_MEMALLOC_NOIO everywhere? >> >> Not only the wait_on_page_writeback() cause the deadlock but also the >> next pageout()-> (mapping->a_ops->writepage), Trond's second patch fix >> this. So fix the wait_on_page_writeback is not enough to fix deadlock. > > Shortly before the only place that pageout() is called there is this code: > > if (page_is_file_cache(page) && > (!current_is_kswapd() || > !zone_is_reclaim_dirty(zone))) { > ..... > goto keep_locked; > > > So pageout() only gets called by kswapd() .... or for swap. swap-over-NFS is > already very cautious about memory allocations, and uses nfs_direct_IO, not > nfs_writepage. > > So nfs_writepage will never get called during direct reclaim. There is no > memory-allocate deadlock risk there. Yes, thanks for explaining this. But is it possible that rpciod is blocked somewhere by a memory allocation using GFP_KERNEL while kswapd is trying to page out NFS dirty pages and is blocked by rpciod? Thanks, Junxiao. > > NeilBrown > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 26 Aug 2014 14:49:01 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: > On 08/26/2014 02:21 PM, NeilBrown wrote: > > On Tue, 26 Aug 2014 13:43:47 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: > > > >> On 08/25/2014 02:48 PM, NeilBrown wrote: > >>> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > >>> <trond.myklebust@primarydata.com> wrote: > >>> > >>>> Junxiao Bi reports seeing the following deadlock: > >>>> > >>>> @ crash> bt 1539 > >>>> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > >>>> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > >>>> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > >>>> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > >>>> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > >>>> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > >>>> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > >>>> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > >>>> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > >>>> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > >>>> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > >>>> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > >>>> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > >>> > >>> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > >>> while to remember/discover exactly why. > >>> > >>> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > >>> set to NULL. > >>> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > >>> it is NULL, wait_on_page_writeback is *not* called. > >>> > >>> So we can only hit this deadlock if mem-cgroup limits are imposed on a > >>> process which is using NFS - which is quite possible but probably not common. > >>> > >>> The fact that a dead-lock can happen only when memcg limits are imposed seems > >>> very fragile. 
People aren't going to test that case much so there could well > >>> be other deadlock possibilities lurking. > >>> > >>> Mel: might there be some other way we could get out of this deadlock? > >>> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > >>> wait or something? Any other wait out of this deadlock other than setting > >>> PF_MEMALLOC_NOIO everywhere? > >> > >> Not only the wait_on_page_writeback() cause the deadlock but also the > >> next pageout()-> (mapping->a_ops->writepage), Trond's second patch fix > >> this. So fix the wait_on_page_writeback is not enough to fix deadlock. > > > > Shortly before the only place that pageout() is called there is this code: > > > > if (page_is_file_cache(page) && > > (!current_is_kswapd() || > > !zone_is_reclaim_dirty(zone))) { > > ..... > > goto keep_locked; > > > > > > So pageout() only gets called by kswapd() .... or for swap. swap-over-NFS is > > already very cautious about memory allocations, and uses nfs_direct_IO, not > > nfs_writepage. > > > > So nfs_writepage will never get called during direct reclaim. There is no > > memory-allocate deadlock risk there. > Yes, thanks for explaining this. > But is it possible rpciod blocked somewhere by memory allocation using > GFP_KERNEL and kswapd is trying to pageout nfs dirty pages and blocked > by rpciod? I don't think so, no. Only 40% of memory (/proc/sys/vm/dirty_ratio) can be dirty. The direct reclaim procedure will eventually find some non-dirty memory it can use. If it cannot, and cannot write anything out to swap either, it will eventually trigger the OOM killer. Direct reclaim shouldn't ever block indefinitely. It will sometimes wait for a short while (e.g. congestion_wait()) but it should then push on until it finds something it can do: free a clean page, write something to swap, or kill a memory-hog with the OOM killer. NeilBrown
On 08/26/2014 03:04 PM, NeilBrown wrote: > On Tue, 26 Aug 2014 14:49:01 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: > >> On 08/26/2014 02:21 PM, NeilBrown wrote: >>> On Tue, 26 Aug 2014 13:43:47 +0800 Junxiao Bi <junxiao.bi@oracle.com> wrote: >>> >>>> On 08/25/2014 02:48 PM, NeilBrown wrote: >>>>> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >>>>> <trond.myklebust@primarydata.com> wrote: >>>>> >>>>>> Junxiao Bi reports seeing the following deadlock: >>>>>> >>>>>> @ crash> bt 1539 >>>>>> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >>>>>> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >>>>>> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >>>>>> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >>>>>> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >>>>>> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >>>>>> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >>>>>> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >>>>>> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >>>>>> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >>>>>> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >>>>>> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >>>>>> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >>>>> >>>>> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >>>>> while to remember/discover exactly why. >>>>> >>>>> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >>>>> set to NULL. >>>>> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >>>>> it is NULL, wait_on_page_writeback is *not* called. >>>>> >>>>> So we can only hit this deadlock if mem-cgroup limits are imposed on a >>>>> process which is using NFS - which is quite possible but probably not common. 
>>>>> >>>>> The fact that a dead-lock can happen only when memcg limits are imposed seems >>>>> very fragile. People aren't going to test that case much so there could well >>>>> be other deadlock possibilities lurking. >>>>> >>>>> Mel: might there be some other way we could get out of this deadlock? >>>>> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >>>>> wait or something? Any other wait out of this deadlock other than setting >>>>> PF_MEMALLOC_NOIO everywhere? >>>> >>>> Not only the wait_on_page_writeback() cause the deadlock but also the >>>> next pageout()-> (mapping->a_ops->writepage), Trond's second patch fix >>>> this. So fix the wait_on_page_writeback is not enough to fix deadlock. >>> >>> Shortly before the only place that pageout() is called there is this code: >>> >>> if (page_is_file_cache(page) && >>> (!current_is_kswapd() || >>> !zone_is_reclaim_dirty(zone))) { >>> ..... >>> goto keep_locked; >>> >>> >>> So pageout() only gets called by kswapd() .... or for swap. swap-over-NFS is >>> already very cautious about memory allocations, and uses nfs_direct_IO, not >>> nfs_writepage. >>> >>> So nfs_writepage will never get called during direct reclaim. There is no >>> memory-allocate deadlock risk there. >> Yes, thanks for explaining this. >> But is it possible rpciod blocked somewhere by memory allocation using >> GFP_KERNEL and kswapd is trying to pageout nfs dirty pages and blocked >> by rpciod? > > I don't think so, no. > > Only 40% of memory (/proc/sys/vm/dirty_ratio) can be dirty. The direct > reclaim procedure will eventually find some non-dirty memory it can use. > If it cannot, and cannot write anything out to swap either, it will > eventually trigger the OOM killer. > > Direct reclaim shouldn't ever block indefinitely. It will sometimes wait for > a short while (e.g. 
congestion_wait()) but it should then push on until it > finds something it can do: free a clean page, write something to swap, or > kill a memory-hog with the OOM killer. That makes sense. Thanks. Junxiao. > > NeilBrown > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: > On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > <trond.myklebust@primarydata.com> wrote: > > > Junxiao Bi reports seeing the following deadlock: > > > > @ crash> bt 1539 > > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > > This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > while to remember/discover exactly why. > > try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > set to NULL. > shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > it is NULL, wait_on_page_writeback is *not* called. > wait_on_page_writeback has a host of other damage associated with it which is why we don't do it from reclaim any more. If the storage is very slow then a process can be stalled by unrelated IO to slow storage. If the storage is broken and the writeback can never complete then it causes other issues. That kind of thing. > So we can only hit this deadlock if mem-cgroup limits are imposed on a > process which is using NFS - which is quite possible but probably not common. > > The fact that a dead-lock can happen only when memcg limits are imposed seems > very fragile. 
People aren't going to test that case much so there could well > be other deadlock possibilities lurking. > memcgs still can call wait_on_page_writeback and this is known to be a hand-grenade to the memcg people but I've never heard of them trying to tackle the problem. > Mel: might there be some other way we could get out of this deadlock? > Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > wait or something? Any other wait out of this deadlock other than setting > PF_MEMALLOC_NOIO everywhere? > I don't have the full thread as it was not cc'd to lkml so I don't know what circumstances reached this deadlock in the first place. If this is on 2.6.32 and the deadlock cannot happen during reclaim in mainline then why is mainline being patched? Do not alter wait_on_page_writeback() to timeout as it will blow up spectacularly -- swap unuse races, data would no longer be synced correctly to disk, sync IO would be flaky, stable page writes would be fired out the window etc.
On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >> <trond.myklebust@primarydata.com> wrote: >> >> > Junxiao Bi reports seeing the following deadlock: >> > >> > @ crash> bt 1539 >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >> >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >> while to remember/discover exactly why. >> >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >> set to NULL. >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >> it is NULL, wait_on_page_writeback is *not* called. >> > > wait_on_page_writeback has a host of other damage associated with it which > is why we don't do it from reclaim any more. If the storage is very slow > then a process can be stalled by unrelated IO to slow storage. If the > storage is broken and the writeback can never complete then it causes other > issues. That kind of thing. 
> >> So we can only hit this deadlock if mem-cgroup limits are imposed on a >> process which is using NFS - which is quite possible but probably not common. >> >> The fact that a dead-lock can happen only when memcg limits are imposed seems >> very fragile. People aren't going to test that case much so there could well >> be other deadlock possibilities lurking. >> > > memcgs still can call wait_on_page_writeback and this is known to be a > hand-grenade to the memcg people but I've never heard of them trying to > tackle the problem. > >> Mel: might there be some other way we could get out of this deadlock? >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >> wait or something? Any other wait out of this deadlock other than setting >> PF_MEMALLOC_NOIO everywhere? >> > > I don't have the full thread as it was not cc'd to lkml so I don't know > what circumstances reached this deadlock in the first place. If this is > on 2.6.32 and the deadline cannot happen during reclaim in mainline then > why is mainline being patched? > > Do not alter wait_on_page_writeback() to timeout as it will blow > up spectacularly -- swap unuse races, data would not longer be synced > correctly to disk, sync IO would be flaky, stable page writes would be > fired out the window etc. Hi Mel, The above stack trace really is the entire deadlock: the rpciod work queue, which drives I/O on behalf of NFS, gets caught in a shrink_page_list() situation where it ends up waiting on page writeback. Boom.... Even if this can only happen for non-trivial memcg situations, then it still needs to be addressed: if rpciod blocks, then all NFS I/O will block and we can no longer write out the dirty pages. This is why we need a mainline fix. Cheers Trond
On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > >> <trond.myklebust@primarydata.com> wrote: > >> > >> > Junxiao Bi reports seeing the following deadlock: > >> > > >> > @ crash> bt 1539 > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > >> > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > >> while to remember/discover exactly why. > >> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > >> set to NULL. > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > >> it is NULL, wait_on_page_writeback is *not* called. > >> > > > > wait_on_page_writeback has a host of other damage associated with it which > > is why we don't do it from reclaim any more. If the storage is very slow > > then a process can be stalled by unrelated IO to slow storage. If the > > storage is broken and the writeback can never complete then it causes other > > issues. That kind of thing. 
> > > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a > >> process which is using NFS - which is quite possible but probably not common. > >> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems > >> very fragile. People aren't going to test that case much so there could well > >> be other deadlock possibilities lurking. > >> > > > > memcgs still can call wait_on_page_writeback and this is known to be a > > hand-grenade to the memcg people but I've never heard of them trying to > > tackle the problem. > > > >> Mel: might there be some other way we could get out of this deadlock? > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > >> wait or something? Any other wait out of this deadlock other than setting > >> PF_MEMALLOC_NOIO everywhere? > >> > > > > I don't have the full thread as it was not cc'd to lkml so I don't know > > what circumstances reached this deadlock in the first place. If this is > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then > > why is mainline being patched? > > > > Do not alter wait_on_page_writeback() to timeout as it will blow > > up spectacularly -- swap unuse races, data would not longer be synced > > correctly to disk, sync IO would be flaky, stable page writes would be > > fired out the window etc. > > Hi Mel, > > The above stack trace really is the entire deadlock: the rpciod work > queue, which drives I/O on behalf of NFS, gets caught in a > shrink_page_list() situation where it ends up waiting on page > writeback. Boom.... > > Even if this can only happen for non-trivial memcg situations, then it > still needs to be addressed: if rpciod blocks, then all NFS I/O will > block and we can no longer write out the dirty pages. This is why we > need a mainline fix. > In that case I'm adding the memcg people. I recognise that rpciod should never block on writeback for similar reasons why flushers should never block. 
memcg blocking on writeback is dangerous for reasons other than NFS but adding a variant that times out just means that on occasion processes get stalled for long periods of time timing out on these writeback pages. In that case, forward progress of rpciod would be painfully slow. On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in an ideal world is massive overkill and while it will work, there will be other consequences -- unable to swap pages for example, unable to release buffers to free clean pages etc. It'd be nice if the memcg people could comment on whether they plan to handle the fact that memcg is the only caller of wait_on_page_writeback in direct reclaim paths.
On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: > On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: > > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: > > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: > > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > > >> <trond.myklebust@primarydata.com> wrote: > > >> > > >> > Junxiao Bi reports seeing the following deadlock: > > >> > > > >> > @ crash> bt 1539 > > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > > >> > > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > > >> while to remember/discover exactly why. > > >> > > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > > >> set to NULL. > > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > > >> it is NULL, wait_on_page_writeback is *not* called. > > >> > > > > > > wait_on_page_writeback has a host of other damage associated with it which > > > is why we don't do it from reclaim any more. 
If the storage is very slow > > > then a process can be stalled by unrelated IO to slow storage. If the > > > storage is broken and the writeback can never complete then it causes other > > > issues. That kind of thing. > > > > > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a > > >> process which is using NFS - which is quite possible but probably not common. > > >> > > >> The fact that a dead-lock can happen only when memcg limits are imposed seems > > >> very fragile. People aren't going to test that case much so there could well > > >> be other deadlock possibilities lurking. > > >> > > > > > > memcgs still can call wait_on_page_writeback and this is known to be a > > > hand-grenade to the memcg people but I've never heard of them trying to > > > tackle the problem. > > > > > >> Mel: might there be some other way we could get out of this deadlock? > > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > > >> wait or something? Any other wait out of this deadlock other than setting > > >> PF_MEMALLOC_NOIO everywhere? > > >> > > > > > > I don't have the full thread as it was not cc'd to lkml so I don't know > > > what circumstances reached this deadlock in the first place. If this is > > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then > > > why is mainline being patched? > > > > > > Do not alter wait_on_page_writeback() to timeout as it will blow > > > up spectacularly -- swap unuse races, data would not longer be synced > > > correctly to disk, sync IO would be flaky, stable page writes would be > > > fired out the window etc. > > > > Hi Mel, > > > > The above stack trace really is the entire deadlock: the rpciod work > > queue, which drives I/O on behalf of NFS, gets caught in a > > shrink_page_list() situation where it ends up waiting on page > > writeback. Boom.... 
> > > > Even if this can only happen for non-trivial memcg situations, then it > > still needs to be addressed: if rpciod blocks, then all NFS I/O will > > block and we can no longer write out the dirty pages. This is why we > > need a mainline fix. > > > > In that case I'm adding the memcg people. I recognise that rpciod should > never block on writeback for similar reasons why flushers should never block. > memcg blocking on writeback is dangerous for reasons other than NFS but > adding a variant that times out just means that on occasion processes get > stalled for long periods of time timing out on these writeback pages. In > that case, forward progress of rpciod would be painfully slow. > > On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in > an ideal world is massive overkill and while it will work, there will be > other consequences -- unable to swap pages for example, unable to release > buffers to free clean pages etc. > > It'd be nice of the memcg people could comment on whether they plan to > handle the fact that memcg is the only called of wait_on_page_writeback > in direct reclaim paths. wait_on_page_writeback() is a hammer, and we need to be better about this once we have per-memcg dirty writeback and throttling, but I think that really misses the point. Even if memcg writeback waiting were smarter, any length of time spent waiting for yourself to make progress is absurd. We just shouldn't be solving deadlock scenarios through arbitrary timeouts on one side. If you can't wait for IO to finish, you shouldn't be passing __GFP_IO. Can't you use mempools like the other IO paths? -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: >> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: >> > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: >> > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: >> > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >> > >> <trond.myklebust@primarydata.com> wrote: >> > >> >> > >> > Junxiao Bi reports seeing the following deadlock: >> > >> > >> > >> > @ crash> bt 1539 >> > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >> > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >> > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >> > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >> > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >> > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >> > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >> > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >> > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >> > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >> > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >> > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >> > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >> > >> >> > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >> > >> while to remember/discover exactly why. >> > >> >> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >> > >> set to NULL. >> > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >> > >> it is NULL, wait_on_page_writeback is *not* called. 
>> > >> >> > > >> > > wait_on_page_writeback has a host of other damage associated with it which >> > > is why we don't do it from reclaim any more. If the storage is very slow >> > > then a process can be stalled by unrelated IO to slow storage. If the >> > > storage is broken and the writeback can never complete then it causes other >> > > issues. That kind of thing. >> > > >> > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a >> > >> process which is using NFS - which is quite possible but probably not common. >> > >> >> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems >> > >> very fragile. People aren't going to test that case much so there could well >> > >> be other deadlock possibilities lurking. >> > >> >> > > >> > > memcgs still can call wait_on_page_writeback and this is known to be a >> > > hand-grenade to the memcg people but I've never heard of them trying to >> > > tackle the problem. >> > > >> > >> Mel: might there be some other way we could get out of this deadlock? >> > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >> > >> wait or something? Any other wait out of this deadlock other than setting >> > >> PF_MEMALLOC_NOIO everywhere? >> > >> >> > > >> > > I don't have the full thread as it was not cc'd to lkml so I don't know >> > > what circumstances reached this deadlock in the first place. If this is >> > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then >> > > why is mainline being patched? >> > > >> > > Do not alter wait_on_page_writeback() to timeout as it will blow >> > > up spectacularly -- swap unuse races, data would not longer be synced >> > > correctly to disk, sync IO would be flaky, stable page writes would be >> > > fired out the window etc. 
>> > >> > Hi Mel, >> > >> > The above stack trace really is the entire deadlock: the rpciod work >> > queue, which drives I/O on behalf of NFS, gets caught in a >> > shrink_page_list() situation where it ends up waiting on page >> > writeback. Boom.... >> > >> > Even if this can only happen for non-trivial memcg situations, then it >> > still needs to be addressed: if rpciod blocks, then all NFS I/O will >> > block and we can no longer write out the dirty pages. This is why we >> > need a mainline fix. >> > >> >> In that case I'm adding the memcg people. I recognise that rpciod should >> never block on writeback for similar reasons why flushers should never block. >> memcg blocking on writeback is dangerous for reasons other than NFS but >> adding a variant that times out just means that on occasion processes get >> stalled for long periods of time timing out on these writeback pages. In >> that case, forward progress of rpciod would be painfully slow. >> >> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in >> an ideal world is massive overkill and while it will work, there will be >> other consequences -- unable to swap pages for example, unable to release >> buffers to free clean pages etc. >> >> It'd be nice of the memcg people could comment on whether they plan to >> handle the fact that memcg is the only called of wait_on_page_writeback >> in direct reclaim paths. > > wait_on_page_writeback() is a hammer, and we need to be better about > this once we have per-memcg dirty writeback and throttling, but I > think that really misses the point. Even if memcg writeback waiting > were smarter, any length of time spent waiting for yourself to make > progress is absurd. We just shouldn't be solving deadlock scenarios > through arbitrary timeouts on one side. If you can't wait for IO to > finish, you shouldn't be passing __GFP_IO. > > Can't you use mempools like the other IO paths? 
There is no way to pass any allocation flags at all to an operation such as __sock_create() (which may be needed if the server disconnects). So in general, the answer is no.
On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust <trond.myklebust@primarydata.com> wrote: > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: >> On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: >>> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: >>> > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: >>> > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: >>> > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >>> > >> <trond.myklebust@primarydata.com> wrote: >>> > >> >>> > >> > Junxiao Bi reports seeing the following deadlock: >>> > >> > >>> > >> > @ crash> bt 1539 >>> > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >>> > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >>> > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >>> > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >>> > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >>> > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >>> > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >>> > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >>> > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >>> > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >>> > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >>> > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >>> > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >>> > >> >>> > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >>> > >> while to remember/discover exactly why. >>> > >> >>> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >>> > >> set to NULL. >>> > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >>> > >> it is NULL, wait_on_page_writeback is *not* called. 
>>> > >> >>> > > >>> > > wait_on_page_writeback has a host of other damage associated with it which >>> > > is why we don't do it from reclaim any more. If the storage is very slow >>> > > then a process can be stalled by unrelated IO to slow storage. If the >>> > > storage is broken and the writeback can never complete then it causes other >>> > > issues. That kind of thing. >>> > > >>> > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a >>> > >> process which is using NFS - which is quite possible but probably not common. >>> > >> >>> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems >>> > >> very fragile. People aren't going to test that case much so there could well >>> > >> be other deadlock possibilities lurking. >>> > >> >>> > > >>> > > memcgs still can call wait_on_page_writeback and this is known to be a >>> > > hand-grenade to the memcg people but I've never heard of them trying to >>> > > tackle the problem. >>> > > >>> > >> Mel: might there be some other way we could get out of this deadlock? >>> > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >>> > >> wait or something? Any other wait out of this deadlock other than setting >>> > >> PF_MEMALLOC_NOIO everywhere? >>> > >> >>> > > >>> > > I don't have the full thread as it was not cc'd to lkml so I don't know >>> > > what circumstances reached this deadlock in the first place. If this is >>> > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then >>> > > why is mainline being patched? >>> > > >>> > > Do not alter wait_on_page_writeback() to timeout as it will blow >>> > > up spectacularly -- swap unuse races, data would not longer be synced >>> > > correctly to disk, sync IO would be flaky, stable page writes would be >>> > > fired out the window etc. 
>>> > >>> > Hi Mel, >>> > >>> > The above stack trace really is the entire deadlock: the rpciod work >>> > queue, which drives I/O on behalf of NFS, gets caught in a >>> > shrink_page_list() situation where it ends up waiting on page >>> > writeback. Boom.... >>> > >>> > Even if this can only happen for non-trivial memcg situations, then it >>> > still needs to be addressed: if rpciod blocks, then all NFS I/O will >>> > block and we can no longer write out the dirty pages. This is why we >>> > need a mainline fix. >>> > >>> >>> In that case I'm adding the memcg people. I recognise that rpciod should >>> never block on writeback for similar reasons why flushers should never block. >>> memcg blocking on writeback is dangerous for reasons other than NFS but >>> adding a variant that times out just means that on occasion processes get >>> stalled for long periods of time timing out on these writeback pages. In >>> that case, forward progress of rpciod would be painfully slow. >>> >>> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in >>> an ideal world is massive overkill and while it will work, there will be >>> other consequences -- unable to swap pages for example, unable to release >>> buffers to free clean pages etc. >>> >>> It'd be nice of the memcg people could comment on whether they plan to >>> handle the fact that memcg is the only called of wait_on_page_writeback >>> in direct reclaim paths. >> >> wait_on_page_writeback() is a hammer, and we need to be better about >> this once we have per-memcg dirty writeback and throttling, but I >> think that really misses the point. Even if memcg writeback waiting >> were smarter, any length of time spent waiting for yourself to make >> progress is absurd. We just shouldn't be solving deadlock scenarios >> through arbitrary timeouts on one side. If you can't wait for IO to >> finish, you shouldn't be passing __GFP_IO. >> >> Can't you use mempools like the other IO paths? 
> > There is no way to pass any allocation flags at all to an operation > such as __sock_create() (which may be needed if the server > disconnects). So in general, the answer is no. > Actually, one question that should probably be raised before anything else: is it at all possible for a workqueue like rpciod to have a non-trivial setting for ->target_mem_cgroup? If not, then the whole question is moot.
On Tue, 26 Aug 2014 19:19:38 -0400 Johannes Weiner <hannes@cmpxchg.org> wrote: > On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: > > It'd be nice of the memcg people could comment on whether they plan to > > handle the fact that memcg is the only called of wait_on_page_writeback > > in direct reclaim paths. > > wait_on_page_writeback() is a hammer, and we need to be better about > this once we have per-memcg dirty writeback and throttling, but I > think that really misses the point. Even if memcg writeback waiting > were smarter, any length of time spent waiting for yourself to make > progress is absurd. We just shouldn't be solving deadlock scenarios > through arbitrary timeouts on one side. If you can't wait for IO to > finish, you shouldn't be passing __GFP_IO. I think that is overly simplistic. Certainly "waiting for yourself" is absurd, but it can be hard to know if that is what you are doing. Refusing to wait at all just because you might be waiting for yourself is also absurd. Direct reclaim already has "congestion_wait()" calls which wait a little while, just in case. Doing that when you find a page in writeback might not be such a bad thing. When this becomes an issue, writeout is already slowing everything down, and maybe slowing down a bit more isn't much cost. > > Can't you use mempools like the other IO paths? mempools and other pre-allocation strategies are appropriate for block devices and critical for any "swap out" path. Filesystems have traditionally got by without them, using GFP_NOFS when necessary. GFP_NOFS was originally meant to be set when holding filesystem-internal locks. Setting it everywhere that memory might be allocated while handling write-out is a very different use-case. Setting GFP_NOFS in more and more places doesn't really scale very well and is particularly awkward for NFS as lots of network interfaces don't allow setting GFP flags, and the network maintainers really don't want them to. 
The recent direct-reclaim changes to get kswapd and the flush- threads to do most of the work made it much easier to avoid deadlocks. Direct reclaim no longer calls ->writepage and doesn't wait_on_page_writeback(). Except when handling memory pressure for a memcg. It's not an easy problem, but I don't think that "use mempools" is a valid answer. A simple rule like "direct reclaim never blocks indefinitely" is, I think, quite achievable and would resolve a whole class of deadlocks. Thanks, NeilBrown
On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > <trond.myklebust@primarydata.com> wrote: > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > >> On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: > >>> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: > >>> > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: > >>> > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: > >>> > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > >>> > >> <trond.myklebust@primarydata.com> wrote: > >>> > >> > >>> > >> > Junxiao Bi reports seeing the following deadlock: > >>> > >> > > >>> > >> > @ crash> bt 1539 > >>> > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > >>> > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > >>> > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > >>> > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > >>> > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > >>> > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > >>> > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > >>> > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > >>> > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > >>> > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > >>> > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > >>> > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > >>> > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > >>> > >> > >>> > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a > >>> > >> while to remember/discover exactly why. > >>> > >> > >>> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > >>> > >> set to NULL. 
> >>> > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > >>> > >> it is NULL, wait_on_page_writeback is *not* called. > >>> > >> > >>> > > > >>> > > wait_on_page_writeback has a host of other damage associated with it which > >>> > > is why we don't do it from reclaim any more. If the storage is very slow > >>> > > then a process can be stalled by unrelated IO to slow storage. If the > >>> > > storage is broken and the writeback can never complete then it causes other > >>> > > issues. That kind of thing. > >>> > > > >>> > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a > >>> > >> process which is using NFS - which is quite possible but probably not common. > >>> > >> > >>> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems > >>> > >> very fragile. People aren't going to test that case much so there could well > >>> > >> be other deadlock possibilities lurking. > >>> > >> > >>> > > > >>> > > memcgs still can call wait_on_page_writeback and this is known to be a > >>> > > hand-grenade to the memcg people but I've never heard of them trying to > >>> > > tackle the problem. > >>> > > > >>> > >> Mel: might there be some other way we could get out of this deadlock? > >>> > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > >>> > >> wait or something? Any other wait out of this deadlock other than setting > >>> > >> PF_MEMALLOC_NOIO everywhere? > >>> > >> > >>> > > > >>> > > I don't have the full thread as it was not cc'd to lkml so I don't know > >>> > > what circumstances reached this deadlock in the first place. If this is > >>> > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then > >>> > > why is mainline being patched? 
> >>> > > > >>> > > Do not alter wait_on_page_writeback() to timeout as it will blow > >>> > > up spectacularly -- swap unuse races, data would not longer be synced > >>> > > correctly to disk, sync IO would be flaky, stable page writes would be > >>> > > fired out the window etc. > >>> > > >>> > Hi Mel, > >>> > > >>> > The above stack trace really is the entire deadlock: the rpciod work > >>> > queue, which drives I/O on behalf of NFS, gets caught in a > >>> > shrink_page_list() situation where it ends up waiting on page > >>> > writeback. Boom.... > >>> > > >>> > Even if this can only happen for non-trivial memcg situations, then it > >>> > still needs to be addressed: if rpciod blocks, then all NFS I/O will > >>> > block and we can no longer write out the dirty pages. This is why we > >>> > need a mainline fix. > >>> > > >>> > >>> In that case I'm adding the memcg people. I recognise that rpciod should > >>> never block on writeback for similar reasons why flushers should never block. > >>> memcg blocking on writeback is dangerous for reasons other than NFS but > >>> adding a variant that times out just means that on occasion processes get > >>> stalled for long periods of time timing out on these writeback pages. In > >>> that case, forward progress of rpciod would be painfully slow. > >>> > >>> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in > >>> an ideal world is massive overkill and while it will work, there will be > >>> other consequences -- unable to swap pages for example, unable to release > >>> buffers to free clean pages etc. > >>> > >>> It'd be nice of the memcg people could comment on whether they plan to > >>> handle the fact that memcg is the only called of wait_on_page_writeback > >>> in direct reclaim paths. > >> > >> wait_on_page_writeback() is a hammer, and we need to be better about > >> this once we have per-memcg dirty writeback and throttling, but I > >> think that really misses the point. 
Even if memcg writeback waiting > >> were smarter, any length of time spent waiting for yourself to make > >> progress is absurd. We just shouldn't be solving deadlock scenarios > >> through arbitrary timeouts on one side. If you can't wait for IO to > >> finish, you shouldn't be passing __GFP_IO. > >> > >> Can't you use mempools like the other IO paths? > > > > There is no way to pass any allocation flags at all to an operation > > such as __sock_create() (which may be needed if the server > > disconnects). So in general, the answer is no. > > > > Actually, one question that should probably be raised before anything > else: is it at all possible for a workqueue like rpciod to have a > non-trivial setting for ->target_mem_cgroup? If not, then the whole > question is moot. > AFAIK, today it's not possible to add kernel threads (which rpciod is one) to a memcg so the issue is entirely theoretical at the moment. Even if this was to change, it's not clear to me what adding kernel threads to a memcg would mean as kernel threads have no RSS. Even if kernel resources were accounted for, I cannot see why a kernel thread would join a memcg. I expect that it's currently impossible for rpciod to have a non-trivial target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there are plans to change that for some reason.
On Wed, Aug 27, 2014 at 11:36 AM, Mel Gorman <mgorman@suse.com> wrote: > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: >> On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust >> <trond.myklebust@primarydata.com> wrote: >> > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: >> >> On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: >> >>> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: >> >>> > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: >> >>> > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: >> >>> > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >> >>> > >> <trond.myklebust@primarydata.com> wrote: >> >>> > >> >> >>> > >> > Junxiao Bi reports seeing the following deadlock: >> >>> > >> > >> >>> > >> > @ crash> bt 1539 >> >>> > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >> >>> > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >> >>> > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >> >>> > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >> >>> > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >> >>> > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >> >>> > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >> >>> > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >> >>> > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >> >>> > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >> >>> > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >> >>> > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >> >>> > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >> >>> > >> >> >>> > >> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >> >>> > >> while to remember/discover exactly why. 
>> >>> > >> >> >>> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >> >>> > >> set to NULL. >> >>> > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >> >>> > >> it is NULL, wait_on_page_writeback is *not* called. >> >>> > >> >> >>> > > >> >>> > > wait_on_page_writeback has a host of other damage associated with it which >> >>> > > is why we don't do it from reclaim any more. If the storage is very slow >> >>> > > then a process can be stalled by unrelated IO to slow storage. If the >> >>> > > storage is broken and the writeback can never complete then it causes other >> >>> > > issues. That kind of thing. >> >>> > > >> >>> > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a >> >>> > >> process which is using NFS - which is quite possible but probably not common. >> >>> > >> >> >>> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems >> >>> > >> very fragile. People aren't going to test that case much so there could well >> >>> > >> be other deadlock possibilities lurking. >> >>> > >> >> >>> > > >> >>> > > memcgs still can call wait_on_page_writeback and this is known to be a >> >>> > > hand-grenade to the memcg people but I've never heard of them trying to >> >>> > > tackle the problem. >> >>> > > >> >>> > >> Mel: might there be some other way we could get out of this deadlock? >> >>> > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >> >>> > >> wait or something? Any other wait out of this deadlock other than setting >> >>> > >> PF_MEMALLOC_NOIO everywhere? >> >>> > >> >> >>> > > >> >>> > > I don't have the full thread as it was not cc'd to lkml so I don't know >> >>> > > what circumstances reached this deadlock in the first place. If this is >> >>> > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then >> >>> > > why is mainline being patched? 
>> >>> > > >> >>> > > Do not alter wait_on_page_writeback() to timeout as it will blow >> >>> > > up spectacularly -- swap unuse races, data would not longer be synced >> >>> > > correctly to disk, sync IO would be flaky, stable page writes would be >> >>> > > fired out the window etc. >> >>> > >> >>> > Hi Mel, >> >>> > >> >>> > The above stack trace really is the entire deadlock: the rpciod work >> >>> > queue, which drives I/O on behalf of NFS, gets caught in a >> >>> > shrink_page_list() situation where it ends up waiting on page >> >>> > writeback. Boom.... >> >>> > >> >>> > Even if this can only happen for non-trivial memcg situations, then it >> >>> > still needs to be addressed: if rpciod blocks, then all NFS I/O will >> >>> > block and we can no longer write out the dirty pages. This is why we >> >>> > need a mainline fix. >> >>> > >> >>> >> >>> In that case I'm adding the memcg people. I recognise that rpciod should >> >>> never block on writeback for similar reasons why flushers should never block. >> >>> memcg blocking on writeback is dangerous for reasons other than NFS but >> >>> adding a variant that times out just means that on occasion processes get >> >>> stalled for long periods of time timing out on these writeback pages. In >> >>> that case, forward progress of rpciod would be painfully slow. >> >>> >> >>> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in >> >>> an ideal world is massive overkill and while it will work, there will be >> >>> other consequences -- unable to swap pages for example, unable to release >> >>> buffers to free clean pages etc. >> >>> >> >>> It'd be nice of the memcg people could comment on whether they plan to >> >>> handle the fact that memcg is the only called of wait_on_page_writeback >> >>> in direct reclaim paths. 
>> >> >> >> wait_on_page_writeback() is a hammer, and we need to be better about >> >> this once we have per-memcg dirty writeback and throttling, but I >> >> think that really misses the point. Even if memcg writeback waiting >> >> were smarter, any length of time spent waiting for yourself to make >> >> progress is absurd. We just shouldn't be solving deadlock scenarios >> >> through arbitrary timeouts on one side. If you can't wait for IO to >> >> finish, you shouldn't be passing __GFP_IO. >> >> >> >> Can't you use mempools like the other IO paths? >> > >> > There is no way to pass any allocation flags at all to an operation >> > such as __sock_create() (which may be needed if the server >> > disconnects). So in general, the answer is no. >> > >> >> Actually, one question that should probably be raised before anything >> else: is it at all possible for a workqueue like rpciod to have a >> non-trivial setting for ->target_mem_cgroup? If not, then the whole >> question is moot. >> > > AFAIK, today it's not possible to add kernel threads (which rpciod is one) > to a memcg so the issue is entirely theoritical at the moment. Even if > this was to change, it's not clear to me what adding kernel threads to a > memcg would mean as kernel threads have no RSS. Even if kernel resources > were accounted for, I cannot see why a kernel thread would join a memcg. > > I expec that it's currently impossible for rpciod to have a non-trivial > target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there > are plans to change that for some reason. > Thanks! Then I'll assume that the problem is nonexistent in upstream for now, and drop the idea of using PF_MEMALLOC_NOIO. Perhaps we can then encourage Junxiao to look into backporting some of the VM changes in order to fix his Oracle legacy kernel issues? Cheers Trond
On Wed, Aug 27, 2014 at 12:15:33PM -0400, Trond Myklebust wrote: > On Wed, Aug 27, 2014 at 11:36 AM, Mel Gorman <mgorman@suse.com> wrote: > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > >> On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > >> <trond.myklebust@primarydata.com> wrote: > >> > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > >> >> On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: > >> >>> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: > >> >>> > On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: > >> >>> > > On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: > >> >>> > >> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust > >> >>> > >> <trond.myklebust@primarydata.com> wrote: > >> >>> > >> > >> >>> > >> > Junxiao Bi reports seeing the following deadlock: > >> >>> > >> > > >> >>> > >> > @ crash> bt 1539 > >> >>> > >> > @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" > >> >>> > >> > @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a > >> >>> > >> > @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c > >> >>> > >> > @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 > >> >>> > >> > @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b > >> >>> > >> > @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe > >> >>> > >> > @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a > >> >>> > >> > @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 > >> >>> > >> > @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe > >> >>> > >> > @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e > >> >>> > >> > @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 > >> >>> > >> > @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 > >> >>> > >> > @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 > >> >>> > >> > >> >>> > >> This stack trace (from 2.6.32) cannot happen in 
mainline, though it took me a > >> >>> > >> while to remember/discover exactly why. > >> >>> > >> > >> >>> > >> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup > >> >>> > >> set to NULL. > >> >>> > >> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if > >> >>> > >> it is NULL, wait_on_page_writeback is *not* called. > >> >>> > >> > >> >>> > > > >> >>> > > wait_on_page_writeback has a host of other damage associated with it which > >> >>> > > is why we don't do it from reclaim any more. If the storage is very slow > >> >>> > > then a process can be stalled by unrelated IO to slow storage. If the > >> >>> > > storage is broken and the writeback can never complete then it causes other > >> >>> > > issues. That kind of thing. > >> >>> > > > >> >>> > >> So we can only hit this deadlock if mem-cgroup limits are imposed on a > >> >>> > >> process which is using NFS - which is quite possible but probably not common. > >> >>> > >> > >> >>> > >> The fact that a dead-lock can happen only when memcg limits are imposed seems > >> >>> > >> very fragile. People aren't going to test that case much so there could well > >> >>> > >> be other deadlock possibilities lurking. > >> >>> > >> > >> >>> > > > >> >>> > > memcgs still can call wait_on_page_writeback and this is known to be a > >> >>> > > hand-grenade to the memcg people but I've never heard of them trying to > >> >>> > > tackle the problem. > >> >>> > > > >> >>> > >> Mel: might there be some other way we could get out of this deadlock? > >> >>> > >> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out > >> >>> > >> wait or something? Any other wait out of this deadlock other than setting > >> >>> > >> PF_MEMALLOC_NOIO everywhere? > >> >>> > >> > >> >>> > > > >> >>> > > I don't have the full thread as it was not cc'd to lkml so I don't know > >> >>> > > what circumstances reached this deadlock in the first place. 
If this is > >> >>> > > on 2.6.32 and the deadline cannot happen during reclaim in mainline then > >> >>> > > why is mainline being patched? > >> >>> > > > >> >>> > > Do not alter wait_on_page_writeback() to timeout as it will blow > >> >>> > > up spectacularly -- swap unuse races, data would not longer be synced > >> >>> > > correctly to disk, sync IO would be flaky, stable page writes would be > >> >>> > > fired out the window etc. > >> >>> > > >> >>> > Hi Mel, > >> >>> > > >> >>> > The above stack trace really is the entire deadlock: the rpciod work > >> >>> > queue, which drives I/O on behalf of NFS, gets caught in a > >> >>> > shrink_page_list() situation where it ends up waiting on page > >> >>> > writeback. Boom.... > >> >>> > > >> >>> > Even if this can only happen for non-trivial memcg situations, then it > >> >>> > still needs to be addressed: if rpciod blocks, then all NFS I/O will > >> >>> > block and we can no longer write out the dirty pages. This is why we > >> >>> > need a mainline fix. > >> >>> > > >> >>> > >> >>> In that case I'm adding the memcg people. I recognise that rpciod should > >> >>> never block on writeback for similar reasons why flushers should never block. > >> >>> memcg blocking on writeback is dangerous for reasons other than NFS but > >> >>> adding a variant that times out just means that on occasion processes get > >> >>> stalled for long periods of time timing out on these writeback pages. In > >> >>> that case, forward progress of rpciod would be painfully slow. > >> >>> > >> >>> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in > >> >>> an ideal world is massive overkill and while it will work, there will be > >> >>> other consequences -- unable to swap pages for example, unable to release > >> >>> buffers to free clean pages etc. 
> >> >>> > >> >>> It'd be nice of the memcg people could comment on whether they plan to > >> >>> handle the fact that memcg is the only called of wait_on_page_writeback > >> >>> in direct reclaim paths. > >> >> > >> >> wait_on_page_writeback() is a hammer, and we need to be better about > >> >> this once we have per-memcg dirty writeback and throttling, but I > >> >> think that really misses the point. Even if memcg writeback waiting > >> >> were smarter, any length of time spent waiting for yourself to make > >> >> progress is absurd. We just shouldn't be solving deadlock scenarios > >> >> through arbitrary timeouts on one side. If you can't wait for IO to > >> >> finish, you shouldn't be passing __GFP_IO. > >> >> > >> >> Can't you use mempools like the other IO paths? > >> > > >> > There is no way to pass any allocation flags at all to an operation > >> > such as __sock_create() (which may be needed if the server > >> > disconnects). So in general, the answer is no. > >> > > >> > >> Actually, one question that should probably be raised before anything > >> else: is it at all possible for a workqueue like rpciod to have a > >> non-trivial setting for ->target_mem_cgroup? If not, then the whole > >> question is moot. > >> > > > > AFAIK, today it's not possible to add kernel threads (which rpciod is one) > > to a memcg so the issue is entirely theoritical at the moment. Even if > > this was to change, it's not clear to me what adding kernel threads to a > > memcg would mean as kernel threads have no RSS. Even if kernel resources > > were accounted for, I cannot see why a kernel thread would join a memcg. > > > > I expec that it's currently impossible for rpciod to have a non-trivial > > target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there > > are plans to change that for some reason. > > > > Thanks! Then I'll assume that the problem is nonexistent in upstream > for now, and drop the idea of using PF_MEMALLOC_NOIO. 
Perhaps we can > then encourage Junxiao to look into backporting some of the VM changes > in order to fix his Oracle legacy kernel issues? > Sounds like a plan to me. The other alternative would be backporting the handling of wait_on_page_writeback and writeback handling from reclaim but that would be much harder considering the rate of change in vmscan.c and the problems that were experienced with high CPU usage from kswapd during that transition.
On 08/28/2014 04:30 PM, Mel Gorman wrote: > On Wed, Aug 27, 2014 at 12:15:33PM -0400, Trond Myklebust wrote: >> On Wed, Aug 27, 2014 at 11:36 AM, Mel Gorman <mgorman@suse.com> wrote: >>> On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: >>>> On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust >>>> <trond.myklebust@primarydata.com> wrote: >>>>> On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: >>>>>> On Tue, Aug 26, 2014 at 02:26:24PM +0100, Mel Gorman wrote: >>>>>>> On Tue, Aug 26, 2014 at 08:58:36AM -0400, Trond Myklebust wrote: >>>>>>>> On Tue, Aug 26, 2014 at 6:53 AM, Mel Gorman <mgorman@suse.com> wrote: >>>>>>>>> On Mon, Aug 25, 2014 at 04:48:52PM +1000, NeilBrown wrote: >>>>>>>>>> On Fri, 22 Aug 2014 18:49:31 -0400 Trond Myklebust >>>>>>>>>> <trond.myklebust@primarydata.com> wrote: >>>>>>>>>> >>>>>>>>>>> Junxiao Bi reports seeing the following deadlock: >>>>>>>>>>> >>>>>>>>>>> @ crash> bt 1539 >>>>>>>>>>> @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" >>>>>>>>>>> @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a >>>>>>>>>>> @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c >>>>>>>>>>> @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 >>>>>>>>>>> @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b >>>>>>>>>>> @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe >>>>>>>>>>> @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a >>>>>>>>>>> @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 >>>>>>>>>>> @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe >>>>>>>>>>> @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e >>>>>>>>>>> @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 >>>>>>>>>>> @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 >>>>>>>>>>> @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 >>>>>>>>>> >>>>>>>>>> This stack trace (from 2.6.32) cannot happen in mainline, though it took me a >>>>>>>>>> 
while to remember/discover exactly why. >>>>>>>>>> >>>>>>>>>> try_to_free_pages() creates a 'struct scan_control' with ->target_mem_cgroup >>>>>>>>>> set to NULL. >>>>>>>>>> shrink_page_list() checks ->target_mem_cgroup using global_reclaim() and if >>>>>>>>>> it is NULL, wait_on_page_writeback is *not* called. >>>>>>>>>> >>>>>>>>> >>>>>>>>> wait_on_page_writeback has a host of other damage associated with it which >>>>>>>>> is why we don't do it from reclaim any more. If the storage is very slow >>>>>>>>> then a process can be stalled by unrelated IO to slow storage. If the >>>>>>>>> storage is broken and the writeback can never complete then it causes other >>>>>>>>> issues. That kind of thing. >>>>>>>>> >>>>>>>>>> So we can only hit this deadlock if mem-cgroup limits are imposed on a >>>>>>>>>> process which is using NFS - which is quite possible but probably not common. >>>>>>>>>> >>>>>>>>>> The fact that a dead-lock can happen only when memcg limits are imposed seems >>>>>>>>>> very fragile. People aren't going to test that case much so there could well >>>>>>>>>> be other deadlock possibilities lurking. >>>>>>>>>> >>>>>>>>> >>>>>>>>> memcgs still can call wait_on_page_writeback and this is known to be a >>>>>>>>> hand-grenade to the memcg people but I've never heard of them trying to >>>>>>>>> tackle the problem. >>>>>>>>> >>>>>>>>>> Mel: might there be some other way we could get out of this deadlock? >>>>>>>>>> Could the wait_on_page_writeback() in shrink_page_list() be made a timed-out >>>>>>>>>> wait or something? Any other wait out of this deadlock other than setting >>>>>>>>>> PF_MEMALLOC_NOIO everywhere? >>>>>>>>>> >>>>>>>>> >>>>>>>>> I don't have the full thread as it was not cc'd to lkml so I don't know >>>>>>>>> what circumstances reached this deadlock in the first place. If this is >>>>>>>>> on 2.6.32 and the deadline cannot happen during reclaim in mainline then >>>>>>>>> why is mainline being patched? 
>>>>>>>>> >>>>>>>>> Do not alter wait_on_page_writeback() to timeout as it will blow >>>>>>>>> up spectacularly -- swap unuse races, data would not longer be synced >>>>>>>>> correctly to disk, sync IO would be flaky, stable page writes would be >>>>>>>>> fired out the window etc. >>>>>>>> >>>>>>>> Hi Mel, >>>>>>>> >>>>>>>> The above stack trace really is the entire deadlock: the rpciod work >>>>>>>> queue, which drives I/O on behalf of NFS, gets caught in a >>>>>>>> shrink_page_list() situation where it ends up waiting on page >>>>>>>> writeback. Boom.... >>>>>>>> >>>>>>>> Even if this can only happen for non-trivial memcg situations, then it >>>>>>>> still needs to be addressed: if rpciod blocks, then all NFS I/O will >>>>>>>> block and we can no longer write out the dirty pages. This is why we >>>>>>>> need a mainline fix. >>>>>>>> >>>>>>> >>>>>>> In that case I'm adding the memcg people. I recognise that rpciod should >>>>>>> never block on writeback for similar reasons why flushers should never block. >>>>>>> memcg blocking on writeback is dangerous for reasons other than NFS but >>>>>>> adding a variant that times out just means that on occasion processes get >>>>>>> stalled for long periods of time timing out on these writeback pages. In >>>>>>> that case, forward progress of rpciod would be painfully slow. >>>>>>> >>>>>>> On the other hand, forcing PF_MEMALLOC_NOIO for all rpciod allocations in >>>>>>> an ideal world is massive overkill and while it will work, there will be >>>>>>> other consequences -- unable to swap pages for example, unable to release >>>>>>> buffers to free clean pages etc. >>>>>>> >>>>>>> It'd be nice of the memcg people could comment on whether they plan to >>>>>>> handle the fact that memcg is the only called of wait_on_page_writeback >>>>>>> in direct reclaim paths. 
>>>>>> >>>>>> wait_on_page_writeback() is a hammer, and we need to be better about >>>>>> this once we have per-memcg dirty writeback and throttling, but I >>>>>> think that really misses the point. Even if memcg writeback waiting >>>>>> were smarter, any length of time spent waiting for yourself to make >>>>>> progress is absurd. We just shouldn't be solving deadlock scenarios >>>>>> through arbitrary timeouts on one side. If you can't wait for IO to >>>>>> finish, you shouldn't be passing __GFP_IO. >>>>>> >>>>>> Can't you use mempools like the other IO paths? >>>>> >>>>> There is no way to pass any allocation flags at all to an operation >>>>> such as __sock_create() (which may be needed if the server >>>>> disconnects). So in general, the answer is no. >>>>> >>>> >>>> Actually, one question that should probably be raised before anything >>>> else: is it at all possible for a workqueue like rpciod to have a >>>> non-trivial setting for ->target_mem_cgroup? If not, then the whole >>>> question is moot. >>>> >>> >>> AFAIK, today it's not possible to add kernel threads (which rpciod is one) >>> to a memcg so the issue is entirely theoritical at the moment. Even if >>> this was to change, it's not clear to me what adding kernel threads to a >>> memcg would mean as kernel threads have no RSS. Even if kernel resources >>> were accounted for, I cannot see why a kernel thread would join a memcg. >>> >>> I expec that it's currently impossible for rpciod to have a non-trivial >>> target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there >>> are plans to change that for some reason. >>> >> >> Thanks! Then I'll assume that the problem is nonexistent in upstream >> for now, and drop the idea of using PF_MEMALLOC_NOIO. Perhaps we can >> then encourage Junxiao to look into backporting some of the VM changes >> in order to fix his Oracle legacy kernel issues? >> > > Sounds like a plan to me. 
The other alternative would be backporting the > handling of wait_on_page_writeback and writeback handling from reclaim but > that would be much harder considering the rate of change in vmscan.c and > the problems that were experienced with high CPU usage from kswapd during > that transition. Backporting the vm changes may cause a lot of risk due to lots of changes; I am thinking of checking the PF_FSTRANS flag in shrink_page_list() to bypass the fs ops in our old kernel. Can this cause other issues? Thanks, Junxiao. > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Aug 28, 2014 at 04:49:46PM +0800, Junxiao Bi wrote: > >>>>>> <SNIP> > >>>>>> Can't you use mempools like the other IO paths? > >>>>> > >>>>> There is no way to pass any allocation flags at all to an operation > >>>>> such as __sock_create() (which may be needed if the server > >>>>> disconnects). So in general, the answer is no. > >>>>> > >>>> > >>>> Actually, one question that should probably be raised before anything > >>>> else: is it at all possible for a workqueue like rpciod to have a > >>>> non-trivial setting for ->target_mem_cgroup? If not, then the whole > >>>> question is moot. > >>>> > >>> > >>> AFAIK, today it's not possible to add kernel threads (which rpciod is one) > >>> to a memcg so the issue is entirely theoritical at the moment. Even if > >>> this was to change, it's not clear to me what adding kernel threads to a > >>> memcg would mean as kernel threads have no RSS. Even if kernel resources > >>> were accounted for, I cannot see why a kernel thread would join a memcg. > >>> > >>> I expec that it's currently impossible for rpciod to have a non-trivial > >>> target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there > >>> are plans to change that for some reason. > >>> > >> > >> Thanks! Then I'll assume that the problem is nonexistent in upstream > >> for now, and drop the idea of using PF_MEMALLOC_NOIO. Perhaps we can > >> then encourage Junxiao to look into backporting some of the VM changes > >> in order to fix his Oracle legacy kernel issues? > >> > > > > Sounds like a plan to me. The other alternative would be backporting the > > handling of wait_on_page_writeback and writeback handling from reclaim but > > that would be much harder considering the rate of change in vmscan.c and > > the problems that were experienced with high CPU usage from kswapd during > > that transition. 
> > Backport the vm changes may cause a lot of risk due to lots of changes, > i am thinking to check PF_FSTRANS flag in shrink_page_list() to bypass > the fs ops in our old kernel. Can this cause other issue? > I'm afraid that depends on exactly how the kernel you are backporting to interprets PF_FSTRANS. Your original bug was related to wait_on_page_writeback so you'll need to check if PF_FSTRANS is interpreted as !may_enter_fs in reclaim context in your kernel to avoid the wait_on_page_writeback.
[Sorry for jumping in so late - I've been busy last days] On Wed 27-08-14 16:36:44, Mel Gorman wrote: > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > <trond.myklebust@primarydata.com> wrote: > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: [...] > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > >> this once we have per-memcg dirty writeback and throttling, but I > > >> think that really misses the point. Even if memcg writeback waiting > > >> were smarter, any length of time spent waiting for yourself to make > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > >> finish, you shouldn't be passing __GFP_IO. Exactly! > > >> Can't you use mempools like the other IO paths? > > > > > > There is no way to pass any allocation flags at all to an operation > > > such as __sock_create() (which may be needed if the server > > > disconnects). So in general, the answer is no. > > > > > > > Actually, one question that should probably be raised before anything > > else: is it at all possible for a workqueue like rpciod to have a > > non-trivial setting for ->target_mem_cgroup? If not, then the whole > > question is moot. > > AFAIK, today it's not possible to add kernel threads (which rpciod is one) > to a memcg so the issue is entirely theoritical at the moment. Even if > this was to change, it's not clear to me what adding kernel threads to a > memcg would mean as kernel threads have no RSS. Even if kernel resources > were accounted for, I cannot see why a kernel thread would join a memcg. It doesn't make _any_ sense to assign kernel threads to memcg as they do not have mm_struct associated with them and mm_struct is the central association point to memcg. I do not see any reason this to change in the future. 
> I expect that it's currently impossible for rpciod to have a non-trivial > target_mem_cgroup. The memcg folk will correct me if I'm wrong or if there > are plans to change that for some reason. The only possible case would be if the rpciod would use_mm of current for some reason.
On Thu, 4 Sep 2014 15:54:27 +0200 Michal Hocko <mhocko@suse.cz> wrote: > [Sorry for jumping in so late - I've been busy last days] > > On Wed 27-08-14 16:36:44, Mel Gorman wrote: > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > > <trond.myklebust@primarydata.com> wrote: > > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > [...] > > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > > >> this once we have per-memcg dirty writeback and throttling, but I > > > >> think that really misses the point. Even if memcg writeback waiting > > > >> were smarter, any length of time spent waiting for yourself to make > > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > > >> finish, you shouldn't be passing __GFP_IO. > > Exactly! This is overly simplistic. The code that cannot wait may be further up the call chain and not in a position to avoid passing __GFP_IO. In many case it isn't that "you can't wait for IO" in general, but that you cannot wait for one specific IO request. wait_on_page_writeback() waits for a specific IO and so is dangerous. congestion_wait() or similar waits for IO in general and so is much safer. NeilBrown
On Tue 09-09-14 12:33:46, Neil Brown wrote: > On Thu, 4 Sep 2014 15:54:27 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > [Sorry for jumping in so late - I've been busy last days] > > > > On Wed 27-08-14 16:36:44, Mel Gorman wrote: > > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > > > <trond.myklebust@primarydata.com> wrote: > > > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > > [...] > > > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > > > >> this once we have per-memcg dirty writeback and throttling, but I > > > > >> think that really misses the point. Even if memcg writeback waiting > > > > >> were smarter, any length of time spent waiting for yourself to make > > > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > > > >> finish, you shouldn't be passing __GFP_IO. > > > > Exactly! > > This is overly simplistic. > The code that cannot wait may be further up the call chain and not in a > position to avoid passing __GFP_IO. > In many case it isn't that "you can't wait for IO" in general, but that you > cannot wait for one specific IO request. Could you be more specific, please? Why would a particular IO make any difference to general IO from the same path? My understanding was that once the page is marked PG_writeback then it is about to be written to its destination and if there is any need for memory allocation it should better not allow IO from reclaim. > wait_on_page_writeback() waits for a specific IO and so is dangerous. > congestion_wait() or similar waits for IO in general and so is much safer. congestion_wait was actually not sufficient to prevent from OOM with heavy writer in a small memcg. 
We simply do not know how long the IO will last so any "wait for a random timeout" will end up causing some troubles.
On Wed, 10 Sep 2014 15:48:43 +0200 Michal Hocko <mhocko@suse.cz> wrote: > On Tue 09-09-14 12:33:46, Neil Brown wrote: > > On Thu, 4 Sep 2014 15:54:27 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > > > [Sorry for jumping in so late - I've been busy last days] > > > > > > On Wed 27-08-14 16:36:44, Mel Gorman wrote: > > > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > > > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > > > > <trond.myklebust@primarydata.com> wrote: > > > > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > > > [...] > > > > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > > > > >> this once we have per-memcg dirty writeback and throttling, but I > > > > > >> think that really misses the point. Even if memcg writeback waiting > > > > > >> were smarter, any length of time spent waiting for yourself to make > > > > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > > > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > > > > >> finish, you shouldn't be passing __GFP_IO. > > > > > > Exactly! > > > > This is overly simplistic. > > The code that cannot wait may be further up the call chain and not in a > > position to avoid passing __GFP_IO. > > In many case it isn't that "you can't wait for IO" in general, but that you > > cannot wait for one specific IO request. > > Could you be more specific, please? Why would a particular IO make any > difference to general IO from the same path? My understanding was that > once the page is marked PG_writeback then it is about to be written to > its destination and if there is any need for memory allocation it should > better not allow IO from reclaim. The more complex the filesystem, the harder it is to "not allow IO from reclaim". 
For NFS (which started this thread) there might be a need to open a new connection - so allocating in the networking code would all need to be careful. And it isn't impossible that a 'gss' credential needs to be re-negotiated, and that might even need user-space interaction (not sure of details). What you say certainly used to be the case, and very often still is. But it doesn't really scale with complexity of filesystems. I don't think there is (yet) any need to optimised for allocations that don't disallow IO happening in the writeout path. But I do think waiting indefinitely for a particular IO is unjustifiable. > > > wait_on_page_writeback() waits for a specific IO and so is dangerous. > > congestion_wait() or similar waits for IO in general and so is much safer. > > congestion_wait was actually not sufficient to prevent from OOM with > heavy writer in a small memcg. We simply do not know how long will the > IO last so any "wait for a random timeout" will end up causing some > troubles. I certainly accept that "congestion_wait" isn't a sufficient solution. The thing I like about it is that it combines a timeout with a measure of activity. As long as writebacks are completing, it is reasonable to wait_on_page_writeback(). But if no writebacks have completed for a while, then it seems pointless waiting on this page any more. Best to try to make forward progress with whatever memory you can find. NeilBrown
On Thu 11-09-14 09:57:43, Neil Brown wrote: > On Wed, 10 Sep 2014 15:48:43 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > On Tue 09-09-14 12:33:46, Neil Brown wrote: > > > On Thu, 4 Sep 2014 15:54:27 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > > > > > [Sorry for jumping in so late - I've been busy last days] > > > > > > > > On Wed 27-08-14 16:36:44, Mel Gorman wrote: > > > > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > > > > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > > > > > <trond.myklebust@primarydata.com> wrote: > > > > > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > [...] > > > > > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > > > > > >> this once we have per-memcg dirty writeback and throttling, but I > > > > > > >> think that really misses the point. Even if memcg writeback waiting > > > > > > >> were smarter, any length of time spent waiting for yourself to make > > > > > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > > > > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > > > > > >> finish, you shouldn't be passing __GFP_IO. > > > > > > > > Exactly! > > > > > > This is overly simplistic. > > > The code that cannot wait may be further up the call chain and not in a > > > position to avoid passing __GFP_IO. > > > In many case it isn't that "you can't wait for IO" in general, but that you > > > cannot wait for one specific IO request. > > > > Could you be more specific, please? Why would a particular IO make any > > difference to general IO from the same path? My understanding was that > > once the page is marked PG_writeback then it is about to be written to > > its destination and if there is any need for memory allocation it should > > better not allow IO from reclaim. > > The more complex the filesystem, the harder it is to "not allow IO from > reclaim". 
> For NFS (which started this thread) there might be a need to open a new > connection - so allocating in the networking code would all need to be > careful. memalloc_noio_{save,restore} might help in that regard. > And it isn't impossible that a 'gss' credential needs to be re-negotiated, > and that might even need user-space interaction (not sure of details). OK, so if I understand you correctly all those allocations might happen _after_ the page has been marked PG_writeback. This would be bad indeed if such a path could appear in the memcg limit reclaim. The outcome of the previous discussion was that this doesn't happen in practice for nfs code, though, because the real flushing doesn't happen from a user context. The issue was reported for an old kernel where the flushing happened from the user context. It would be a huge problem to have a flusher within a restricted environment not only because of this path. > What you say certainly used to be the case, and very often still is. But it > doesn't really scale with complexity of filesystems. > > I don't think there is (yet) any need to optimise for allocations that don't > disallow IO happening in the writeout path. But I do think waiting > indefinitely for a particular IO is unjustifiable. Well, as Johannes already pointed out. The right way to fix memcg reclaim is to implement proper memcg aware dirty pages throttling and flushing. This is a song of distant future I am afraid. Until then we have to live with workarounds. I would be happy to make this one more robust but timeout based solutions just sound too fragile and triggering OOM is a big risk. Maybe we can disable waiting if current->flags & PF_LESS_THROTTLE. I would be even tempted to WARN_ON(current->flags & PF_LESS_THROTTLE) in that path to catch a potential misconfiguration when the flusher is a part of restricted environment. 
The only real user of the flag is nfsd though and it runs from a kernel thread so this wouldn't help much to catch potentially buggy code. So I am not really sure how much of an improvement this would be.
On Thu, 11 Sep 2014 10:50:47 +0200 Michal Hocko <mhocko@suse.cz> wrote: > On Thu 11-09-14 09:57:43, Neil Brown wrote: > > On Wed, 10 Sep 2014 15:48:43 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > > > On Tue 09-09-14 12:33:46, Neil Brown wrote: > > > > On Thu, 4 Sep 2014 15:54:27 +0200 Michal Hocko <mhocko@suse.cz> wrote: > > > > > > > > > [Sorry for jumping in so late - I've been busy last days] > > > > > > > > > > On Wed 27-08-14 16:36:44, Mel Gorman wrote: > > > > > > On Tue, Aug 26, 2014 at 08:00:20PM -0400, Trond Myklebust wrote: > > > > > > > On Tue, Aug 26, 2014 at 7:51 PM, Trond Myklebust > > > > > > > <trond.myklebust@primarydata.com> wrote: > > > > > > > > On Tue, Aug 26, 2014 at 7:19 PM, Johannes Weiner <hannes@cmpxchg.org> wrote: > > > > > [...] > > > > > > > >> wait_on_page_writeback() is a hammer, and we need to be better about > > > > > > > >> this once we have per-memcg dirty writeback and throttling, but I > > > > > > > >> think that really misses the point. Even if memcg writeback waiting > > > > > > > >> were smarter, any length of time spent waiting for yourself to make > > > > > > > >> progress is absurd. We just shouldn't be solving deadlock scenarios > > > > > > > >> through arbitrary timeouts on one side. If you can't wait for IO to > > > > > > > >> finish, you shouldn't be passing __GFP_IO. > > > > > > > > > > Exactly! > > > > > > > > This is overly simplistic. > > > > The code that cannot wait may be further up the call chain and not in a > > > > position to avoid passing __GFP_IO. > > > > In many case it isn't that "you can't wait for IO" in general, but that you > > > > cannot wait for one specific IO request. > > > > > > Could you be more specific, please? Why would a particular IO make any > > > difference to general IO from the same path? 
My understanding was that > > > once the page is marked PG_writeback then it is about to be written to > > > its destination and if there is any need for memory allocation it should > > > better not allow IO from reclaim. > > > > The more complex the filesystem, the harder it is to "not allow IO from > > reclaim". > > For NFS (which started this thread) there might be a need to open a new > > connection - so allocating in the networking code would all need to be > > careful. > > memalloc_noio_{save,restor} might help in that regards. It might. It is a bit of a heavy stick though. Especially as "nofs" is what is really wanted (I think). > > > And it isn't impossible that a 'gss' credential needs to be re-negotiated, > > and that might even need user-space interaction (not sure of details). > > OK, so if I understand you correctly all those allocations tmight happen > _after_ the page has been marked PG_writeback. This would be bad indeed > if such a path could appear in the memcg limit reclaim. The outcome of > the previous discussion was that this doesn't happen in practice for > nfs code, though, because the real flushing doesn't happen from a user > context. The issue was reported for an old kernel where the flushing > happened from the user context. It would be a huge problem to have a > flusher within a restricted environment not only because of this path. > > > What you say certainly used to be the case, and very often still is. But it > > doesn't really scale with complexity of filesystems. > > > > I don't think there is (yet) any need to optimised for allocations that don't > > disallow IO happening in the writeout path. But I do think waiting > > indefinitely for a particular IO is unjustifiable. > > Well, as Johannes already pointed out. The right way to fix memcg > reclaim is to implement proper memcg aware dirty pages throttling and > flushing. This is a song of distant future I am afraid. Until then we > have to live with workarounds. 
I would be happy to make this one more > robust but timeout based solutions just sound too fragile and triggering > OOM is a big risk. > > Maybe we can disbale waiting if current->flags & PF_LESS_THROTTLE. I > would be even tempted to WARN_ON(current->flags & PF_LESS_THROTTLE) in > that path to catch a potential misconfiguration when the flusher is a > part of restricted environment. The only real user of the flag is nfsd > though and it runs from a kernel thread so this wouldn't help much to > catch potentialy buggy code. So I am not really sure how much of an > improvement this would be. > I think it would be inappropriate to use PF_LESS_THROTTLE. That is really about throttling the dirtying of pages, not their writeback. As has been said, there isn't really a bug that needs fixing at present, so delving too deeply into designing a solution is probably pointless. Using global flags is sometimes suitable, but it doesn't help when you are waiting for memory allocation to happen in another process. Using timeouts is sometimes suitable, but only if the backup plan isn't too drastic. My feeling is that the "ideal" would be to wait until: - this thread can make forward progress, or - no thread (in this memcg?) can make forward progress In the first case we succeed. In the second we take the most gentle backup solution (e.g. use the last dregs of memory, or trigger OOM). Detecting when no other thread can make forward progress is probably not trivial, but it doesn't need to be cheap. Hopefully when a real issue arises we'll be able to figure something out. Thanks, NeilBrown
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9358c79fd589..ab3aff71ff93 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -19,6 +19,7 @@ #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/freezer.h> +#include <linux/sched.h> #include <linux/sunrpc/clnt.h> @@ -821,9 +822,9 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { - current->flags |= PF_FSTRANS; + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); - current->flags &= ~PF_FSTRANS; + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); } /** diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 43cd89eacfab..1d6d4d84b299 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -38,6 +38,7 @@ #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprtsock.h> #include <linux/file.h> +#include <linux/sched.h> #ifdef CONFIG_SUNRPC_BACKCHANNEL #include <linux/sunrpc/bc_xprt.h> #endif @@ -1927,7 +1928,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct socket *sock; int status = -EIO; - current->flags |= PF_FSTRANS; + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, @@ -1968,7 +1969,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); return status; } @@ -2071,7 +2072,7 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - current->flags |= PF_FSTRANS; + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; /* Start by resetting any existing state */ xs_reset_transport(transport); @@ -2092,7 +2093,7 @@ static void xs_udp_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); 
xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); } /* @@ -2229,7 +2230,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - current->flags |= PF_FSTRANS; + current->flags |= PF_FSTRANS | PF_MEMALLOC_NOIO; if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); @@ -2276,7 +2277,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); - current->flags &= ~PF_FSTRANS; + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2294,7 +2295,7 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; + current->flags &= ~(PF_FSTRANS | PF_MEMALLOC_NOIO); } /**
Junxiao Bi reports seeing the following deadlock: @ crash> bt 1539 @ PID: 1539 TASK: ffff88178f64a040 CPU: 1 COMMAND: "rpciod/1" @ #0 [ffff88178f64d2c0] schedule at ffffffff8145833a @ #1 [ffff88178f64d348] io_schedule at ffffffff8145842c @ #2 [ffff88178f64d368] sync_page at ffffffff810d8161 @ #3 [ffff88178f64d378] __wait_on_bit at ffffffff8145895b @ #4 [ffff88178f64d3b8] wait_on_page_bit at ffffffff810d82fe @ #5 [ffff88178f64d418] wait_on_page_writeback at ffffffff810e2a1a @ #6 [ffff88178f64d438] shrink_page_list at ffffffff810e34e1 @ #7 [ffff88178f64d588] shrink_list at ffffffff810e3dbe @ #8 [ffff88178f64d6f8] shrink_zone at ffffffff810e425e @ #9 [ffff88178f64d7b8] do_try_to_free_pages at ffffffff810e4978 @ #10 [ffff88178f64d828] try_to_free_pages at ffffffff810e4c31 @ #11 [ffff88178f64d8c8] __alloc_pages_nodemask at ffffffff810de370 @ #12 [ffff88178f64d978] kmem_getpages at ffffffff8110e18b @ #13 [ffff88178f64d9a8] fallback_alloc at ffffffff8110e35e @ #14 [ffff88178f64da08] ____cache_alloc_node at ffffffff8110e51f @ #15 [ffff88178f64da48] __kmalloc at ffffffff8110efba @ #16 [ffff88178f64da98] xs_setup_xprt at ffffffffa00a563f [sunrpc] @ #17 [ffff88178f64dad8] xs_setup_tcp at ffffffffa00a7648 [sunrpc] @ #18 [ffff88178f64daf8] xprt_create_transport at ffffffffa00a478f [sunrpc] @ #19 [ffff88178f64db18] rpc_create at ffffffffa00a2d7a [sunrpc] @ #20 [ffff88178f64dbf8] rpcb_create at ffffffffa00b026b [sunrpc] @ #21 [ffff88178f64dc98] rpcb_getport_async at ffffffffa00b0c94 [sunrpc] @ #22 [ffff88178f64ddf8] call_bind at ffffffffa00a11f8 [sunrpc] @ #23 [ffff88178f64de18] __rpc_execute at ffffffffa00a88ef [sunrpc] @ #24 [ffff88178f64de58] rpc_async_schedule at ffffffffa00a9187 [sunrpc] @ #25 [ffff88178f64de78] worker_thread at ffffffff81072ed2 @ #26 [ffff88178f64dee8] kthread at ffffffff81076df3 @ #27 [ffff88178f64df48] kernel_thread at ffffffff81012e2a @ crash> Junxiao notes that the problem is not limited to the rpcbind client. 
In fact we can trigger the exact same problem when trying to reconnect to the server, and we find ourselves calling sock_alloc(). The following solution should work for all kernels that support the PF_MEMALLOC_NOIO flag (i.e. Linux 3.9 and newer). Link: http://lkml.kernel.org/r/53F6F772.6020708@oracle.com Reported-by: Junxiao Bi <junxiao.bi@oracle.com> Cc: stable@vger.kernel.org # 3.9+ Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> --- net/sunrpc/sched.c | 5 +++-- net/sunrpc/xprtsock.c | 15 ++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-)