Message ID | 1360113112.17267.1.camel@fli24-HP-Compaq-8100-Elite-CMT-PC (mailing list archive) |
---|---|
State | RFC, archived |
Headers | show |
On Wed, Feb 6, 2013 at 2:11 AM, Li Fei <fei.li@intel.com> wrote: > > There is well known issue that freezing will fail in case that fuse > daemon is frozen firstly with some requests not handled, as the fuse > usage task is waiting for the response from fuse daemon and can't be > frozen. > > To solve the issue above, make fuse daemon frozen after all all user > space processes frozen and during the kernel threads frozen phase. > PF_FREEZE_DAEMON flag is added to indicate that the current thread is > the fuse daemon, Okay and how do you know which thread, process or processes belong to the "fuse daemon"? The only entity which may have a chance of defining the correct answer to the above question is the fuse daemon itself. What I mean is that automatically setting PF_FREEZE_DAEMON by the kernel is the wrong approach. On the other hand an ioctl to set this flag on a defined task or task group (*) would actually make some sense. Add some, possibly optional, inheritence behavior (i.e. a cloned process would inherit the PF_FREEZE_DAMON flag) and it might actually take care of most of the cases. But I think it definitely needs more thought and testing. Your patch looks like it won't work on most of the unprivileged fuse filesystems which use the fusermount(1) helper to perform the actual mounting. Thanks, Miklos (*) setting this flag for other than the current process should, I think, be a privileged operation, otherwise it might allow misuse. > set on connection, and clear on disconnection. > It works as all fuse requests are handled during user space processes > frozen, there will no further fuse request, and it's safe to continue > to freeze fuse daemon together with kernel freezable tasks. > > Signed-off-by: Wang Biao <biao.wang@intel.com> > Signed-off-by: Liu Chuansheng <chuansheng.liu@intel.com> > Signed-off-by: Li Fei <fei.li@intel.com> > --- > fs/fuse/inode.c | 9 +++++++++ > include/linux/freezer.h | 4 ++++ > include/linux/sched.h | 2 ++ > kernel/freezer.c | 33 ++++++++++++++++++++++++++++++++- > kernel/power/process.c | 2 +- > 5 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 73ca6b7..fe476e8 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -20,6 +20,7 @@ > #include <linux/random.h> > #include <linux/sched.h> > #include <linux/exportfs.h> > +#include <linux/freezer.h> > > MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); > MODULE_DESCRIPTION("Filesystem in Userspace"); > @@ -367,6 +368,10 @@ void fuse_conn_kill(struct fuse_conn *fc) > wake_up_all(&fc->waitq); > wake_up_all(&fc->blocked_waitq); > wake_up_all(&fc->reserved_req_waitq); > + > + /* Clear daemon flag after disconnection */ > + clear_freeze_daemon_flag(); > + > } > EXPORT_SYMBOL_GPL(fuse_conn_kill); > > @@ -1054,6 +1059,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) > goto err_unlock; > > list_add_tail(&fc->entry, &fuse_conn_list); > + > + /* Set daemon flag just before connection */ > + set_freeze_daemon_flag(); > + > sb->s_root = root_dentry; > fc->connected = 1; > file->private_data = fuse_conn_get(fc); > diff --git a/include/linux/freezer.h b/include/linux/freezer.h > index e4238ce..9f0d0cf 100644 > --- a/include/linux/freezer.h > +++ b/include/linux/freezer.h > @@ -51,6 +51,8 @@ static inline bool try_to_freeze(void) > > extern bool freeze_task(struct task_struct *p); > extern bool set_freezable(void); > +extern bool set_freeze_daemon_flag(void); > +extern bool clear_freeze_daemon_flag(void); > > #ifdef CONFIG_CGROUP_FREEZER > extern bool cgroup_freezing(struct task_struct *task); > @@ -217,6 +219,8 @@ static inline void freezer_do_not_count(void) {} > static inline void freezer_count(void) {} > static inline int freezer_should_skip(struct task_struct *p) { return 0; } > static inline void set_freezable(void) {} > +static inline void set_freeze_daemon_flag(void) {} > +static inline void clear_freeze_daemon_flag(void) {} > > #define freezable_schedule() schedule() > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index d211247..6cdc969 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1826,6 +1826,8 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, > #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ > #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ > #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ > +/* Daemon threads to be frozen along with kernel threads */ > +#define PF_FREEZER_DAEMON 0x80000000 > > /* > * Only the _current_ task can read/write to tsk->flags, but other > diff --git a/kernel/freezer.c b/kernel/freezer.c > index c38893b..eff9440 100644 > --- a/kernel/freezer.c > +++ b/kernel/freezer.c > @@ -39,7 +39,8 @@ bool freezing_slow_path(struct task_struct *p) > if (pm_nosig_freezing || cgroup_freezing(p)) > return true; > > - if (pm_freezing && !(p->flags & PF_KTHREAD)) > + if (pm_freezing && !(p->flags & PF_KTHREAD) && > + !(p->flags & PF_FREEZE_DAEMON)) > return true; > > return false; > @@ -162,3 +163,33 @@ bool set_freezable(void) > return try_to_freeze(); > } > EXPORT_SYMBOL(set_freezable); > + > +/** > + * set_freeze_daemon_flag - make %current as daemon > + * > + * Make %current being frozen by freezer along with kernel threads > + */ > +bool set_freeze_daemon_flag(void) > +{ > + spin_lock_irq(&freezer_lock); > + current->flags |= PF_FREEZE_DAEMON; > + spin_unlock_irq(&freezer_lock); > + > + return try_to_freeze(); > +} > +EXPORT_SYMBOL(set_freeze_daemon_flag); > + > +/** > + * clear_freeze_daemon_flag - make %current as usual user space process > + * > + * Make %current being frozen by freezer along with user space processes > + */ > +bool clear_freeze_daemon_flag(void) > +{ > + spin_lock_irq(&freezer_lock); > + current->flags &= ~PF_FREEZE_DAEMON; > + spin_unlock_irq(&freezer_lock); > + > + return try_to_freeze(); > +} > +EXPORT_SYMBOL(clear_freeze_daemon_flag); > diff --git a/kernel/power/process.c b/kernel/power/process.c > index d5a258b..a4489ae 100644 > --- a/kernel/power/process.c > +++ b/kernel/power/process.c > @@ -199,7 +199,7 @@ void thaw_kernel_threads(void) > > read_lock(&tasklist_lock); > do_each_thread(g, p) { > - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) > + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER | PF_FREEZE_DAEMON)) > __thaw_task(p); > } while_each_thread(g, p); > read_unlock(&tasklist_lock); > -- > 1.7.4.1 > > > > -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Feb 6, 2013 at 2:11 AM, Li Fei <fei.li@intel.com> wrote: > > There is well known issue that freezing will fail in case that fuse > daemon is frozen firstly with some requests not handled, as the fuse > usage task is waiting for the response from fuse daemon and can't be > frozen. > > To solve the issue above, make fuse daemon frozen after all all user > space processes frozen and during the kernel threads frozen phase. > PF_FREEZE_DAEMON flag is added to indicate that the current thread is > the fuse daemon, set on connection, and clear on disconnection. > It works as all fuse requests are handled during user space processes > frozen, there will no further fuse request, and it's safe to continue > to freeze fuse daemon together with kernel freezable tasks. Will this work correctly if one FUSE daemon is opening files in from another FUSE filesystem?
On Wed, Feb 6, 2013 at 10:56 AM, Han-Wen Nienhuys <hanwenn@gmail.com> wrote: > On Wed, Feb 6, 2013 at 2:11 AM, Li Fei <fei.li@intel.com> wrote: >> >> There is well known issue that freezing will fail in case that fuse >> daemon is frozen firstly with some requests not handled, as the fuse >> usage task is waiting for the response from fuse daemon and can't be >> frozen. >> >> To solve the issue above, make fuse daemon frozen after all all user >> space processes frozen and during the kernel threads frozen phase. >> PF_FREEZE_DAEMON flag is added to indicate that the current thread is >> the fuse daemon, set on connection, and clear on disconnection. >> It works as all fuse requests are handled during user space processes >> frozen, there will no further fuse request, and it's safe to continue >> to freeze fuse daemon together with kernel freezable tasks. > > Will this work correctly if one FUSE daemon is opening files in from > another FUSE filesystem? As long as only non-fuse-daemon processes are generating the requests (i.e. fuse daemons don't spontaneously generate new requests) it should work. I think more problematic is that userspace processes tend to communicate with each other, sometimes in a non-trivial way. For example gethostbyname(3) will turn to nscd(8) for name service cache results. So a fuse daemon that might call gethostbyname() should mark nscd with PF_FREEZE_DAEMON but that requires careful audit (or nscd might mark itself PF_FREEZE_DAEMON for that reason). And that's just an example of a most basic C library function. There are many more such hidden interactions that can trip up this scheme. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[CC list restored] On Thu, Feb 7, 2013 at 9:41 AM, Goswin von Brederlow <goswin-v-b@web.de> wrote: > On Wed, Feb 06, 2013 at 10:27:40AM +0100, Miklos Szeredi wrote: >> On Wed, Feb 6, 2013 at 2:11 AM, Li Fei <fei.li@intel.com> wrote: >> > >> > There is well known issue that freezing will fail in case that fuse >> > daemon is frozen firstly with some requests not handled, as the fuse >> > usage task is waiting for the response from fuse daemon and can't be >> > frozen. >> > >> > To solve the issue above, make fuse daemon frozen after all all user >> > space processes frozen and during the kernel threads frozen phase. >> > PF_FREEZE_DAEMON flag is added to indicate that the current thread is >> > the fuse daemon, >> >> Okay and how do you know which thread, process or processes belong to >> the "fuse daemon"? > > Maybe I'm talking about the wrong thing but isn't any process having > /dev/fuse open "the fuse daemon"? And that doesn't even cover cases > where one thread reads requests from the kernel and hands them to > worker threads (that do not have /dev/fuse themself). Or the fuse > request might need mysql to finish a request. > > I believe figuring out what processes handle fuse requests is a lost > proposition. Pretty much. > > > Secondly how does freezing the daemon second garanty that it has > replied to all pending requests? Or how is leaving it thawed the right > decision? > > Instead the kernel side of fuse should be half frozen and stop sending > out new requests. Then it should wait for all pending requests to > complete. Then and only then can userspace processes be frozen safely. The problem with that is one fuse filesystem might be calling into another. Say two fuse filesystems are mounted at /mnt/A and /mnt/B, Process X starts a read request on /mnt/A. This is handled by process A, which in turn starts a read request on /mnt/B, which is handled by B. If we freeze the system at the moment when A starts handling the request but hasn't yet sent the request to B then things wil be stuck and the freezing will fail. So the order should be: Freeze the "topmost" fuse filesystems (A in the example) first and if all pending requests are completed then freeze the next ones in the hierarchy, etc... This would work if this dependency between filesystems were known. But it's not and again it looks like an impossible task. The only way to *reliably* freeze fuse filesystems is to let it freeze even if there are outstanding requests. But that's the hardest to implement, because then it needs to allow freezing of tasks waiting on i_mutex, for example, which is currently not possible. But this is the only way I see that would not have unsolvable corner cases that prop up at the worst moment. And yes, it would be prudent to wait some time for pending requests to finish before freezing. But it's not a requirement, things *should* work without that: suspending a machine is just like a very long pause by the CPU, as long as no hardware is involved. And with fuse filesystems there is no hardware involved directly by definition. But I'm open to ideas and at this stage I think even patches that improve the situation for the majority of the cases would be acceptable, since this is turning out to be a major problem for a lot of people. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Feb 07, 2013 at 10:59:19AM +0100, Miklos Szeredi wrote: > [CC list restored] > > On Thu, Feb 7, 2013 at 9:41 AM, Goswin von Brederlow <goswin-v-b@web.de> wrote: > > On Wed, Feb 06, 2013 at 10:27:40AM +0100, Miklos Szeredi wrote: > >> On Wed, Feb 6, 2013 at 2:11 AM, Li Fei <fei.li@intel.com> wrote: > >> > > >> > There is well known issue that freezing will fail in case that fuse > >> > daemon is frozen firstly with some requests not handled, as the fuse > >> > usage task is waiting for the response from fuse daemon and can't be > >> > frozen. > >> > > >> > To solve the issue above, make fuse daemon frozen after all all user > >> > space processes frozen and during the kernel threads frozen phase. > >> > PF_FREEZE_DAEMON flag is added to indicate that the current thread is > >> > the fuse daemon, > >> > >> Okay and how do you know which thread, process or processes belong to > >> the "fuse daemon"? > > > > Maybe I'm talking about the wrong thing but isn't any process having > > /dev/fuse open "the fuse daemon"? And that doesn't even cover cases > > where one thread reads requests from the kernel and hands them to > > worker threads (that do not have /dev/fuse themself). Or the fuse > > request might need mysql to finish a request. > > > > I believe figuring out what processes handle fuse requests is a lost > > proposition. > > Pretty much. > > > > > > > Secondly how does freezing the daemon second garanty that it has > > replied to all pending requests? Or how is leaving it thawed the right > > decision? > > > > Instead the kernel side of fuse should be half frozen and stop sending > > out new requests. Then it should wait for all pending requests to > > complete. Then and only then can userspace processes be frozen safely. > > The problem with that is one fuse filesystem might be calling into > another. Say two fuse filesystems are mounted at /mnt/A and /mnt/B, > Process X starts a read request on /mnt/A. This is handled by process > A, which in turn starts a read request on /mnt/B, which is handled by > B. If we freeze the system at the moment when A starts handling the > request but hasn't yet sent the request to B then things wil be stuck > and the freezing will fail. > > So the order should be: Freeze the "topmost" fuse filesystems (A in > the example) first and if all pending requests are completed then > freeze the next ones in the hierarchy, etc... This would work if this > dependency between filesystems were known. But it's not and again it > looks like an impossible task. What is topmost? The kernel can't know that for sure. > The only way to *reliably* freeze fuse filesystems is to let it freeze > even if there are outstanding requests. But that's the hardest to > implement, because then it needs to allow freezing of tasks waiting on > i_mutex, for example, which is currently not possible. But this is > the only way I see that would not have unsolvable corner cases that > prop up at the worst moment. > > And yes, it would be prudent to wait some time for pending requests > to finish before freezing. But it's not a requirement, things > *should* work without that: suspending a machine is just like a very > long pause by the CPU, as long as no hardware is involved. And with > fuse filesystems there is no hardware involved directly by definition. > > But I'm open to ideas and at this stage I think even patches that > improve the situation for the majority of the cases would be > acceptable, since this is turning out to be a major problem for a lot > of people. > > Thanks, > Miklos For shutdown in userspace there is the sendsigs.omit.d/ to avoid the problem of halting/killing processes of the fuse filesystems (or other services) prematurely. I guess something similar needs to be done for freeze. The fuse filesystem has to tell the kernel what is up. MfG Goswin -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi! > > The only way to *reliably* freeze fuse filesystems is to let it freeze > > even if there are outstanding requests. But that's the hardest to > > implement, because then it needs to allow freezing of tasks waiting on > > i_mutex, for example, which is currently not possible. But this is > > the only way I see that would not have unsolvable corner cases that > > prop up at the worst moment. > > > > And yes, it would be prudent to wait some time for pending requests > > to finish before freezing. But it's not a requirement, things > > *should* work without that: suspending a machine is just like a very > > long pause by the CPU, as long as no hardware is involved. And with > > fuse filesystems there is no hardware involved directly by definition. > > > > But I'm open to ideas and at this stage I think even patches that > > improve the situation for the majority of the cases would be > > acceptable, since this is turning out to be a major problem for a lot > > of people. > > For shutdown in userspace there is the sendsigs.omit.d/ to avoid the > problem of halting/killing processes of the fuse filesystems (or other > services) prematurely. I guess something similar needs to be done for > freeze. The fuse filesystem has to tell the kernel what is up. Would it be feasible to create some kind of fuse-stop-script.sh, and run it before suspend (from userspace)? It should be pretty similar to sendsigs.omit.d/ mechanism AFAICT. I'm sorry, freezer is not too suitable for fuse. (BTW: for suspend, we may be able to improve it so that it is possible to remove freezer from it. For hibernation, it would be very hard.) Pavel
Hi, On Saturday, February 09, 2013 06:49:10 PM Pavel Machek wrote: > Hi! > > > > The only way to *reliably* freeze fuse filesystems is to let it freeze > > > even if there are outstanding requests. But that's the hardest to > > > implement, because then it needs to allow freezing of tasks waiting on > > > i_mutex, for example, which is currently not possible. But this is > > > the only way I see that would not have unsolvable corner cases that > > > prop up at the worst moment. > > > > > > And yes, it would be prudent to wait some time for pending requests > > > to finish before freezing. But it's not a requirement, things > > > *should* work without that: suspending a machine is just like a very > > > long pause by the CPU, as long as no hardware is involved. And with > > > fuse filesystems there is no hardware involved directly by definition. > > > > > > But I'm open to ideas and at this stage I think even patches that > > > improve the situation for the majority of the cases would be > > > acceptable, since this is turning out to be a major problem for a lot > > > of people. > > > > For shutdown in userspace there is the sendsigs.omit.d/ to avoid the > > problem of halting/killing processes of the fuse filesystems (or other > > services) prematurely. I guess something similar needs to be done for > > freeze. The fuse filesystem has to tell the kernel what is up. > > Would it be feasible to create some kind of fuse-stop-script.sh, and > run it before suspend (from userspace)? It should be pretty similar to > sendsigs.omit.d/ mechanism AFAICT. > > I'm sorry, freezer is not too suitable for fuse. > > (BTW: for suspend, we may be able to improve it so that it is possible > to remove freezer from it. For hibernation, it would be very hard.) Well, this is so incorrect that it's not even funny. :-( The whole memory shrinking we do for hibernation is now done by allocating memory, so the freezer is not necessary for *that* and there's *zero* difference between suspend and hibernation with respect to why the freezer is used. And *the* reason why it's used is that intercepting all of the possible ways user space could interfere with the suspend process without the freezer is simply totally impractical. System calls (including ioctls and fctls), sysfs accesses, /proc accesses, debugfs, to name a few, and mmap (I wonder how you would handle *that* alone). And I'm sure there's more I didn't even think about. Moreover, all of the drivers we have assume that user space will be frozen before their suspend callbacks are run by now and changing that is totally unrealistic. So please don't tell people that something is doable while it practically realisticly isn't. Thanks, Rafael
Hi! > > > For shutdown in userspace there is the sendsigs.omit.d/ to avoid the > > > problem of halting/killing processes of the fuse filesystems (or other > > > services) prematurely. I guess something similar needs to be done for > > > freeze. The fuse filesystem has to tell the kernel what is up. > > > > Would it be feasible to create some kind of fuse-stop-script.sh, and > > run it before suspend (from userspace)? It should be pretty similar to > > sendsigs.omit.d/ mechanism AFAICT. > > > > I'm sorry, freezer is not too suitable for fuse. > > > > (BTW: for suspend, we may be able to improve it so that it is possible > > to remove freezer from it. For hibernation, it would be very hard.) > > Well, this is so incorrect that it's not even funny. :-( > > The whole memory shrinking we do for hibernation is now done by allocating > memory, so the freezer is not necessary for *that* and there's *zero* > difference between suspend and hibernation with respect to why the freezer is > used. Funny. Freezer was put there so that hibernation image was safe to write out. You need disk subsystems in workable state for hibernation. > And *the* reason why it's used is that intercepting all of the possible ways > user space could interfere with the suspend process without the freezer is > simply totally impractical. System calls (including ioctls and > fctls), It is a lot of work, agreed. But runtime power management is already moving in that direction. And I could imagine introducing "big suspend lock" to be grabbed pretty much everywhere. (And IIRC n900 pretty much suspends when idle.) > sysfs accesses, /proc accesses, debugfs, to name a few, and mmap (I wonder how > you would handle *that* alone). And I'm sure there's more I didn't > even mmap... what is problem with mmap? For suspend, memory is powered, so you can permit people changing it. > So please don't tell people that something is doable while it practically > realisticly isn't. I can imagine doing that for suspend, and believe we are moving in that direction with runtime suspend. For hibernation, we need to write image the image to the disk (which is the important difference, not memory shrinking), and "big suspend lock" would interfere with that. I can not imagine reasonable solution for that. Pavel
On Sunday, February 10, 2013 11:33:45 AM Pavel Machek wrote: > Hi! > > > > > For shutdown in userspace there is the sendsigs.omit.d/ to avoid the > > > > problem of halting/killing processes of the fuse filesystems (or other > > > > services) prematurely. I guess something similar needs to be done for > > > > freeze. The fuse filesystem has to tell the kernel what is up. > > > > > > Would it be feasible to create some kind of fuse-stop-script.sh, and > > > run it before suspend (from userspace)? It should be pretty similar to > > > sendsigs.omit.d/ mechanism AFAICT. > > > > > > I'm sorry, freezer is not too suitable for fuse. > > > > > > (BTW: for suspend, we may be able to improve it so that it is possible > > > to remove freezer from it. For hibernation, it would be very hard.) > > > > Well, this is so incorrect that it's not even funny. :-( > > > > The whole memory shrinking we do for hibernation is now done by allocating > > memory, so the freezer is not necessary for *that* and there's *zero* > > difference between suspend and hibernation with respect to why the freezer is > > used. > > Funny. Freezer was put there so that hibernation image was safe to > write out. You need disk subsystems in workable state for hibernation. I'm not really sure what you're talking about. Why do you think the freezer is necessary for that? > > And *the* reason why it's used is that intercepting all of the possible ways > > user space could interfere with the suspend process without the freezer is > > simply totally impractical. System calls (including ioctls and > > fctls), > > It is a lot of work, agreed. But runtime power management is already > moving in that direction. Runtime PM is a different thing. It works on the "this device is not in use right now, so it can be suspended" basis, while the system suspend's principle is "I want everything to quiesce now and go low-power". So system suspend can trigger while devices are still in use and there are good reasons for that to happen sometimes. > And I could imagine introducing "big suspend lock" to be grabbed pretty much > everywhere. Sorry, I can't imagine *that*. > (And IIRC n900 pretty much suspends when idle.) > > > sysfs accesses, /proc accesses, debugfs, to name a few, and mmap (I wonder how > > you would handle *that* alone). And I'm sure there's more I didn't > > even > > mmap... what is problem with mmap? For suspend, memory is powered, so > you can permit people changing it. Suppose mmap is used to make the registers of some device available to user space. Yes, that can happen. > > So please don't tell people that something is doable while it practically > > realisticly isn't. > > I can imagine doing that for suspend, and believe we are moving in > that direction with runtime suspend. For hibernation, we need to write > image the image to the disk (which is the important difference, not > memory shrinking), and "big suspend lock" would interfere with > that. I can not imagine reasonable solution for that. Again, I'm not sure what you're talking about. Once the image has been created, it can be saved while user space is running just fine. And that "big suspend lock" would be pretty much the same as the BKL that people spent quite some time an effort to remove, so I don't think your idea to reintroduce something similar just for suspend purposes would be popular. And since the freezer is now used for other things beyond suspend and hibernation, introducing such a lock wouldn't even address the Fuse problem. Thanks, Rafael
On Sunday, February 10, 2013 02:51:22 PM Rafael J. Wysocki wrote: > On Sunday, February 10, 2013 11:33:45 AM Pavel Machek wrote: > > Hi! > > > > > > > For shutdown in userspace there is the sendsigs.omit.d/ to avoid the > > > > > problem of halting/killing processes of the fuse filesystems (or other > > > > > services) prematurely. I guess something similar needs to be done for > > > > > freeze. The fuse filesystem has to tell the kernel what is up. > > > > > > > > Would it be feasible to create some kind of fuse-stop-script.sh, and > > > > run it before suspend (from userspace)? It should be pretty similar to > > > > sendsigs.omit.d/ mechanism AFAICT. > > > > > > > > I'm sorry, freezer is not too suitable for fuse. > > > > > > > > (BTW: for suspend, we may be able to improve it so that it is possible > > > > to remove freezer from it. For hibernation, it would be very hard.) > > > > > > Well, this is so incorrect that it's not even funny. :-( > > > > > > The whole memory shrinking we do for hibernation is now done by allocating > > > memory, so the freezer is not necessary for *that* and there's *zero* > > > difference between suspend and hibernation with respect to why the freezer is > > > used. > > > > Funny. Freezer was put there so that hibernation image was safe to > > write out. You need disk subsystems in workable state for hibernation. > > I'm not really sure what you're talking about. Why do you think the freezer is > necessary for that? > > > > And *the* reason why it's used is that intercepting all of the possible ways > > > user space could interfere with the suspend process without the freezer is > > > simply totally impractical. System calls (including ioctls and > > > fctls), > > > > It is a lot of work, agreed. But runtime power management is already > > moving in that direction. > > Runtime PM is a different thing. It works on the "this device is not in use > right now, so it can be suspended" basis, while the system suspend's principle > is "I want everything to quiesce now and go low-power". So system suspend can > trigger while devices are still in use and there are good reasons for that to > happen sometimes. > > > And I could imagine introducing "big suspend lock" to be grabbed pretty much > > everywhere. > > Sorry, I can't imagine *that*. > > > (And IIRC n900 pretty much suspends when idle.) > > > > > sysfs accesses, /proc accesses, debugfs, to name a few, and mmap (I wonder how > > > you would handle *that* alone). And I'm sure there's more I didn't > > > even > > > > mmap... what is problem with mmap? For suspend, memory is powered, so > > you can permit people changing it. > > Suppose mmap is used to make the registers of some device available to user > space. Yes, that can happen. > > > > So please don't tell people that something is doable while it practically > > > realisticly isn't. > > > > I can imagine doing that for suspend, and believe we are moving in > > that direction with runtime suspend. For hibernation, we need to write > > image the image to the disk (which is the important difference, not > > memory shrinking), and "big suspend lock" would interfere with > > that. I can not imagine reasonable solution for that. > > Again, I'm not sure what you're talking about. Once the image has been > created, it can be saved while user space is running just fine. Of course, we don't want random changes to be made to persistent storage after the image has been created, because those changes might not be consistent with the memory contents stored in the image, and that's why user space is still frozen when we're saving the image. > And that "big suspend lock" would be pretty much the same as the BKL that > people spent quite some time an effort to remove, so I don't think your idea > to reintroduce something similar just for suspend purposes would be popular. > > And since the freezer is now used for other things beyond suspend and > hibernation, introducing such a lock wouldn't even address the Fuse problem. BTW, Miklos, wouldn't that help if the kernel notified user space somehow before starting to freeze processes? Perhaps the Fuse's user space part could subscribe to such notifications and do something before the freezing kicks in? Rafael
Hi! > > > > The whole memory shrinking we do for hibernation is now done by allocating > > > > memory, so the freezer is not necessary for *that* and there's *zero* > > > > difference between suspend and hibernation with respect to why the freezer is > > > > used. > > > > > > Funny. Freezer was put there so that hibernation image was safe to > > > write out. You need disk subsystems in workable state for hibernation. > > > > I'm not really sure what you're talking about. Why do you think the freezer is > > necessary for that? Well, from freezer you need: 1) user process frozen. 2) essential locks _not_ held so that block devices are still functional. > > > mmap... what is problem with mmap? For suspend, memory is powered, so > > > you can permit people changing it. > > > > Suppose mmap is used to make the registers of some device available to user > > space. Yes, that can happen. "Don't do it, then". Yes, can happen, but hopefully is not too common these days. [And... freezer doing 1) but not 2) would be enough to handle that. Freezer doing 1) but not 2) would also be simpler...] > > Again, I'm not sure what you're talking about. Once the image has been > > created, it can be saved while user space is running just fine. > > Of course, we don't want random changes to be made to persistent storage after > the image has been created, because those changes might not be consistent with > the memory contents stored in the image, and that's why user space is still > frozen when we're saving the image. Yes. That's the reason 1) for freezer. Pavel
On Sunday, February 10, 2013 07:55:05 PM Pavel Machek wrote: > Hi! > > > > > > The whole memory shrinking we do for hibernation is now done by allocating > > > > > memory, so the freezer is not necessary for *that* and there's *zero* > > > > > difference between suspend and hibernation with respect to why the freezer is > > > > > used. > > > > > > > > Funny. Freezer was put there so that hibernation image was safe to > > > > write out. You need disk subsystems in workable state for hibernation. > > > > > > I'm not really sure what you're talking about. Why do you think the freezer is > > > necessary for that? > > Well, from freezer you need: > > 1) user process frozen. > > 2) essential locks _not_ held so that block devices are still functional. > > > > > mmap... what is problem with mmap? For suspend, memory is powered, so > > > > you can permit people changing it. > > > > > > Suppose mmap is used to make the registers of some device available to user > > > space. Yes, that can happen. > > "Don't do it, then". Yes, can happen, but hopefully is not too common > these days. [And... freezer doing 1) but not 2) would be enough to > handle that. Freezer doing 1) but not 2) would also be simpler...] Again, I'm not sure what you mean. Are you trying to say that it would be OK to freeze user space tasks in the D state? Rafael
On Mon, Feb 11, 2013 at 12:31 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > On Sunday, February 10, 2013 07:55:05 PM Pavel Machek wrote: >> Well, from freezer you need: >> >> 1) user process frozen. >> >> 2) essential locks _not_ held so that block devices are still functional. >> >> > > > mmap... what is problem with mmap? For suspend, memory is powered, so >> > > > you can permit people changing it. >> > > >> > > Suppose mmap is used to make the registers of some device available to user >> > > space. Yes, that can happen. >> >> "Don't do it, then". Yes, can happen, but hopefully is not too common >> these days. [And... freezer doing 1) but not 2) would be enough to >> handle that. Freezer doing 1) but not 2) would also be simpler...] > > Again, I'm not sure what you mean. > > Are you trying to say that it would be OK to freeze user space tasks in > the D state? I think that's what Pavel is saying. Processes in D state sleeping on non-device mutexes _are_ actually OK to freeze. And that would nicely solve the fuse freeze problem. The only little detail is how do we implement that... Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi! > > > > > > The whole memory shrinking we do for hibernation is now done by allocating > > > > > > memory, so the freezer is not necessary for *that* and there's *zero* > > > > > > difference between suspend and hibernation with respect to why the freezer is > > > > > > used. > > > > > > > > > > Funny. Freezer was put there so that hibernation image was safe to > > > > > write out. You need disk subsystems in workable state for hibernation. > > > > > > > > I'm not really sure what you're talking about. Why do you think the freezer is > > > > necessary for that? > > > > Well, from freezer you need: > > > > 1) user process frozen. > > > > 2) essential locks _not_ held so that block devices are still functional. > > > > > > > mmap... what is problem with mmap? For suspend, memory is powered, so > > > > > you can permit people changing it. > > > > > > > > Suppose mmap is used to make the registers of some device available to user > > > > space. Yes, that can happen. > > > > "Don't do it, then". Yes, can happen, but hopefully is not too common > > these days. [And... freezer doing 1) but not 2) would be enough to > > handle that. Freezer doing 1) but not 2) would also be simpler...] > > Again, I'm not sure what you mean. > > Are you trying to say that it would be OK to freeze user space tasks in > the D state? Yes. For suspend, that should pretty much work. (With caveats, as Miklos noticed). Unfortunately, that is not going to be too easy for hibernation. We could mae the situation easier by specifying we only support hibernation to local SATA or SCSI disk, not over NBD, iSCSI, etc... That would reduce number of locks that need to be available for hibernation. Unfortunately, uswsusp did not make that easy as userland may want to do arbitrary stuff with the image. Pavel
On Monday, February 11, 2013 11:11:40 AM Miklos Szeredi wrote: > On Mon, Feb 11, 2013 at 12:31 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > On Sunday, February 10, 2013 07:55:05 PM Pavel Machek wrote: > > >> Well, from freezer you need: > >> > >> 1) user process frozen. > >> > >> 2) essential locks _not_ held so that block devices are still functional. > >> > >> > > > mmap... what is problem with mmap? For suspend, memory is powered, so > >> > > > you can permit people changing it. > >> > > > >> > > Suppose mmap is used to make the registers of some device available to user > >> > > space. Yes, that can happen. > >> > >> "Don't do it, then". Yes, can happen, but hopefully is not too common > >> these days. [And... freezer doing 1) but not 2) would be enough to > >> handle that. Freezer doing 1) but not 2) would also be simpler...] > > > > Again, I'm not sure what you mean. > > > > Are you trying to say that it would be OK to freeze user space tasks in > > the D state? > > I think that's what Pavel is saying. Processes in D state sleeping > on non-device mutexes _are_ actually OK to freeze. And that would > nicely solve the fuse freeze problem. That's potentially deeadlock-prone, because a task waiting for mutex X may very well be holding mutex Y, so if there's another task waiting for mutex Y, it needs to be frozen at the same time. > The only little detail is how do we implement that... This means the only way I can see would be to hack the mutex code so that the try_to_freeze() was called for user space tasks after the sched_preempt_enable_no_resched() before schedule(). That shouldn't be a big deal performance-wise, because we are in the slow path anyway then. I'm not sure if Peter Z will like it, though. Moreover, a task waiting for a mutex may be holding a semaphore or be participating in some other mutual-exclusion mechanism, so we'd need to address them all. Plus, as noted by Pavel, freezing those things would make it difficult to save hibernation images to us. What about having a "freeze me after all of my children" flag that will be inherited from parents? Would that help the fuse case? Rafael
On Mon, Feb 11, 2013 at 1:08 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > On Monday, February 11, 2013 11:11:40 AM Miklos Szeredi wrote: >> On Mon, Feb 11, 2013 at 12:31 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote: >> > On Sunday, February 10, 2013 07:55:05 PM Pavel Machek wrote: >> >> >> Well, from freezer you need: >> >> >> >> 1) user process frozen. >> >> >> >> 2) essential locks _not_ held so that block devices are still functional. >> >> >> >> > > > mmap... what is problem with mmap? For suspend, memory is powered, so >> >> > > > you can permit people changing it. >> >> > > >> >> > > Suppose mmap is used to make the registers of some device available to user >> >> > > space. Yes, that can happen. >> >> >> >> "Don't do it, then". Yes, can happen, but hopefully is not too common >> >> these days. [And... freezer doing 1) but not 2) would be enough to >> >> handle that. Freezer doing 1) but not 2) would also be simpler...] >> > >> > Again, I'm not sure what you mean. >> > >> > Are you trying to say that it would be OK to freeze user space tasks in >> > the D state? >> >> I think that's what Pavel is saying. Processes in D state sleeping >> on non-device mutexes _are_ actually OK to freeze. And that would >> nicely solve the fuse freeze problem. > > That's potentially deeadlock-prone, because a task waiting for mutex X may > very well be holding mutex Y, so if there's another task waiting for mutex Y, > it needs to be frozen at the same time. > >> The only little detail is how do we implement that... > > This means the only way I can see would be to hack the mutex code so that the > try_to_freeze() was called for user space tasks after the > sched_preempt_enable_no_resched() before schedule(). > > That shouldn't be a big deal performance-wise, because we are in the slow > path anyway then. I'm not sure if Peter Z will like it, though. > > Moreover, a task waiting for a mutex may be holding a semaphore or be > participating in some other mutual-exclusion mechanism, so we'd need to > address them all. Plus, as noted by Pavel, freezing those things would make > it difficult to save hibernation images to us. Well, as a first step I could cook up a patch that adds a flag to the mutex indicating that it's freezable. Fuse would mark its mutexes (and the mutexes that VFS uses on its behalf) as freezable. That way we don't interfere with hibernation, except if hibernation uses fuse but that's a very special case. > > What about having a "freeze me after all of my children" flag that will be > inherited from parents? Would that help the fuse case? With kernel filesystems there's a clear distinction between tasks that may originate filesystem requests (userspace processes) and those that serve these requests (kernel threads). So its possible to freeze the originators first and the servers afterwards. With fuse there's no such difference. Even if it were known which processes are servers and which are originators (it is not known in the general case) then there would be a problem that some processes are servers AND originators at the same time. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Monday, February 11, 2013 02:59:56 PM Miklos Szeredi wrote: > On Mon, Feb 11, 2013 at 1:08 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > On Monday, February 11, 2013 11:11:40 AM Miklos Szeredi wrote: > >> On Mon, Feb 11, 2013 at 12:31 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > >> > On Sunday, February 10, 2013 07:55:05 PM Pavel Machek wrote: > >> > >> >> Well, from freezer you need: > >> >> > >> >> 1) user process frozen. > >> >> > >> >> 2) essential locks _not_ held so that block devices are still functional. > >> >> > >> >> > > > mmap... what is problem with mmap? For suspend, memory is powered, so > >> >> > > > you can permit people changing it. > >> >> > > > >> >> > > Suppose mmap is used to make the registers of some device available to user > >> >> > > space. Yes, that can happen. > >> >> > >> >> "Don't do it, then". Yes, can happen, but hopefully is not too common > >> >> these days. [And... freezer doing 1) but not 2) would be enough to > >> >> handle that. Freezer doing 1) but not 2) would also be simpler...] > >> > > >> > Again, I'm not sure what you mean. > >> > > >> > Are you trying to say that it would be OK to freeze user space tasks in > >> > the D state? > >> > >> I think that's what Pavel is saying. Processes in D state sleeping > >> on non-device mutexes _are_ actually OK to freeze. And that would > >> nicely solve the fuse freeze problem. > > > > That's potentially deeadlock-prone, because a task waiting for mutex X may > > very well be holding mutex Y, so if there's another task waiting for mutex Y, > > it needs to be frozen at the same time. > > > >> The only little detail is how do we implement that... > > > > This means the only way I can see would be to hack the mutex code so that the > > try_to_freeze() was called for user space tasks after the > > sched_preempt_enable_no_resched() before schedule(). > > > > That shouldn't be a big deal performance-wise, because we are in the slow > > path anyway then. I'm not sure if Peter Z will like it, though. > > > > Moreover, a task waiting for a mutex may be holding a semaphore or be > > participating in some other mutual-exclusion mechanism, so we'd need to > > address them all. Plus, as noted by Pavel, freezing those things would make > > it difficult to save hibernation images to us. > > Well, as a first step I could cook up a patch that adds a flag to the > mutex indicating that it's freezable. Fuse would mark its mutexes > (and the mutexes that VFS uses on its behalf) as freezable. That way > we don't interfere with hibernation, except if hibernation uses fuse > but that's a very special case. Yes, that may be worth a shot. :-) Thanks, Rafael
Hi! > > That's potentially deeadlock-prone, because a task waiting for mutex X may > > very well be holding mutex Y, so if there's another task waiting for mutex Y, > > it needs to be frozen at the same time. > > > >> The only little detail is how do we implement that... > > > > This means the only way I can see would be to hack the mutex code so that the > > try_to_freeze() was called for user space tasks after the > > sched_preempt_enable_no_resched() before schedule(). > > > > That shouldn't be a big deal performance-wise, because we are in the slow > > path anyway then. I'm not sure if Peter Z will like it, though. > > > > Moreover, a task waiting for a mutex may be holding a semaphore or be > > participating in some other mutual-exclusion mechanism, so we'd need to > > address them all. Plus, as noted by Pavel, freezing those things would make > > it difficult to save hibernation images to us. > > Well, as a first step I could cook up a patch that adds a flag to the > mutex indicating that it's freezable. Fuse would mark its mutexes > (and the mutexes that VFS uses on its behalf) as freezable. That way > we don't interfere with hibernation, except if hibernation uses fuse > but that's a very special case. Yes please. If that is doable, that's the right solution. (After all, with FUSE, filesystem clients are just doing IPC. In ideal world, that would be freezeable and killable with -9). Pavel
On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: > (After all, with FUSE, filesystem clients are just doing IPC. In ideal > world, that would be freezeable and killable with -9). Exactly. Attaching a patch > > Pavel > -- > (english) http://www.livejournal.com/~pavelmachek > (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Feb 12, 2013 at 2:13 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: > >> (After all, with FUSE, filesystem clients are just doing IPC. In ideal >> world, that would be freezeable and killable with -9). > > Exactly. > > Attaching a patch And this time for real... Sorry for not sending it inline, I'm having a Mail Client Crisis. Thanks, Miklos
On Tue, Feb 12, 2013 at 2:17 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > On Tue, Feb 12, 2013 at 2:13 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >> On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: >> >>> (After all, with FUSE, filesystem clients are just doing IPC. In ideal >>> world, that would be freezeable and killable with -9). Well, I thought this can be constrained to locks directly related to the fuse filesystem itself, but it can't... The reason is page faults. Pretty much everyone and their brother uses get_user_pages*, holding various locks (mmap_sem for sure but others as well). A fuse filesystem frozen during a page read will block those. Separating those parts of the kernel that can be frozen from those that can't looks increasingly difficult. So I think the PF_FREEZE_DAEMON idea (the patch from Li Fei that started this thread) may still be our best bet at handling this situation. The idea being that pure "originator" processes (ones that take no part in serving filesystem syscalls) can be frozen up-front. Then the "fuse daemon" (or "server") processes are hopefully in a quiescent state and can be frozen without difficulty. Unfortunately it needs help from userspace: the kernel can't easily guess which processes are part of a "fuse daemon" and which aren't. Fortunately we have a standard library (libfuse) that can tell it to the kernel for the vast majority of cases. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed 2013-02-13 18:34:16, Miklos Szeredi wrote: > On Tue, Feb 12, 2013 at 2:17 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Tue, Feb 12, 2013 at 2:13 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > >> On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: > >> > >>> (After all, with FUSE, filesystem clients are just doing IPC. In ideal > >>> world, that would be freezeable and killable with -9). > > Well, I thought this can be constrained to locks directly related to > the fuse filesystem itself, but it can't... The reason is page And it looked so easy and nice... > faults. Pretty much everyone and their brother uses get_user_pages*, > holding various locks (mmap_sem for sure but others as well). A fuse > filesystem frozen during a page read will block those. Is it even valid for get_user_pages to be used on FUSE? What happens if fused tries to do some operation (it is userspace, it can do lot of operations) that needs one of those locks? It seems to me that fused will want to do operations hibernation would want to do, at the very least (reading/writing block devices). Thus, if it does not deadlock with fused, it should not deadlock with hibernation, either... [Or am i mistaken and FUSE's get_user_pages* needs FUSE locks but does not wait for fused to finish the operation?] > Separating those parts of the kernel that can be frozen from those > that can't looks increasingly difficult. :-(. Pavel
On Wednesday, February 13, 2013 06:34:16 PM Miklos Szeredi wrote: > On Tue, Feb 12, 2013 at 2:17 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > > On Tue, Feb 12, 2013 at 2:13 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: > >> On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: > >> > >>> (After all, with FUSE, filesystem clients are just doing IPC. In ideal > >>> world, that would be freezeable and killable with -9). > > Well, I thought this can be constrained to locks directly related to > the fuse filesystem itself, but it can't... The reason is page > faults. Pretty much everyone and their brother uses get_user_pages*, > holding various locks (mmap_sem for sure but others as well). A fuse > filesystem frozen during a page read will block those. > > Separating those parts of the kernel that can be frozen from those > that can't looks increasingly difficult. > > So I think the PF_FREEZE_DAEMON idea (the patch from Li Fei that > started this thread) may still be our best bet at handling this > situation. The idea being that pure "originator" processes (ones that > take no part in serving filesystem syscalls) can be frozen up-front. > Then the "fuse daemon" (or "server") processes are hopefully in a > quiescent state and can be frozen without difficulty. > > Unfortunately it needs help from userspace: the kernel can't easily > guess which processes are part of a "fuse daemon" and which aren't. > Fortunately we have a standard library (libfuse) that can tell it to > the kernel for the vast majority of cases. So basically the idea would be to introduce something like PF_FREEZE_LATE for user space processes that need to be frozen after all of the other (non-PF_FREEZE_LATE) user space processes have been frozen and hack fuse to use that flag? Rafael
On Wed, Feb 13, 2013 at 9:16 PM, Pavel Machek <pavel@ucw.cz> wrote: > On Wed 2013-02-13 18:34:16, Miklos Szeredi wrote: >> On Tue, Feb 12, 2013 at 2:17 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >> > On Tue, Feb 12, 2013 at 2:13 PM, Miklos Szeredi <miklos@szeredi.hu> wrote: >> >> On Tue, Feb 12, 2013 at 11:46 AM, Pavel Machek <pavel@ucw.cz> wrote: >> >> >> >>> (After all, with FUSE, filesystem clients are just doing IPC. In ideal >> >>> world, that would be freezeable and killable with -9). >> >> Well, I thought this can be constrained to locks directly related to >> the fuse filesystem itself, but it can't... The reason is page > > And it looked so easy and nice... > >> faults. Pretty much everyone and their brother uses get_user_pages*, >> holding various locks (mmap_sem for sure but others as well). A fuse >> filesystem frozen during a page read will block those. > > Is it even valid for get_user_pages to be used on FUSE? Disabling all mmap (which is what you question) would make fuse a useless piece of toy. So yes, definitely it's valid. > > What happens if fused tries to do some operation (it is userspace, it > can do lot of operations) that needs one of those locks? E.g. mmap_sem is taken for the process _accessing_ a vma for a fuse filesystem. The server process better not do this (i.e. access the filesystem it's serving) or deadlocking (*) is basically guaranteed. That's nothing new. The thing that makes this hard for freeze is that those locks (mmap_sem, etc.) are only indirectly related to the fuse filesystem by the _actions_ of a process (accessing mmaped fuse area). Thanks, Miklos (*) userspace induced deadlocks in fuse are not hard deadlocks, they can be forced open by "umount -f" or "echo > /sys/fs/fuse/connections/$DEV/abort". -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Feb 13, 2013 at 10:21 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > On Wednesday, February 13, 2013 06:34:16 PM Miklos Szeredi wrote: >> >> So I think the PF_FREEZE_DAEMON idea (the patch from Li Fei that >> started this thread) may still be our best bet at handling this >> situation. The idea being that pure "originator" processes (ones that >> take no part in serving filesystem syscalls) can be frozen up-front. >> Then the "fuse daemon" (or "server") processes are hopefully in a >> quiescent state and can be frozen without difficulty. >> >> Unfortunately it needs help from userspace: the kernel can't easily >> guess which processes are part of a "fuse daemon" and which aren't. >> Fortunately we have a standard library (libfuse) that can tell it to >> the kernel for the vast majority of cases. > > So basically the idea would be to introduce something like PF_FREEZE_LATE > for user space processes that need to be frozen after all of the other > (non-PF_FREEZE_LATE) user space processes have been frozen and hack fuse > to use that flag? Yes. It is essentially the same mechanism that is used to delay the freezing of kernel threads after userspace tasks have been frozen. Except it's a lot more difficult to determine which userspace tasks need to be suspended late and which aren't. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thursday, February 14, 2013 11:41:16 AM Miklos Szeredi wrote: > On Wed, Feb 13, 2013 at 10:21 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > On Wednesday, February 13, 2013 06:34:16 PM Miklos Szeredi wrote: > > >> > >> So I think the PF_FREEZE_DAEMON idea (the patch from Li Fei that > >> started this thread) may still be our best bet at handling this > >> situation. The idea being that pure "originator" processes (ones that > >> take no part in serving filesystem syscalls) can be frozen up-front. > >> Then the "fuse daemon" (or "server") processes are hopefully in a > >> quiescent state and can be frozen without difficulty. > >> > >> Unfortunately it needs help from userspace: the kernel can't easily > >> guess which processes are part of a "fuse daemon" and which aren't. > >> Fortunately we have a standard library (libfuse) that can tell it to > >> the kernel for the vast majority of cases. > > > > So basically the idea would be to introduce something like PF_FREEZE_LATE > > for user space processes that need to be frozen after all of the other > > (non-PF_FREEZE_LATE) user space processes have been frozen and hack fuse > > to use that flag? > > Yes. > > It is essentially the same mechanism that is used to delay the > freezing of kernel threads after userspace tasks have been frozen. > Except it's a lot more difficult to determine which userspace tasks > need to be suspended late and which aren't. Well, I suppose that information is available to user space. Do we need an interface for a process to mark itself as PF_FREEZE_LATE or do we need an interface for one process to mark another process as PF_FREEZE_LATE, or both? Rafael
On Thu, Feb 14, 2013 at 1:11 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > On Thursday, February 14, 2013 11:41:16 AM Miklos Szeredi wrote: >> >> It is essentially the same mechanism that is used to delay the >> freezing of kernel threads after userspace tasks have been frozen. >> Except it's a lot more difficult to determine which userspace tasks >> need to be suspended late and which aren't. > > Well, I suppose that information is available to user space. > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE or > do we need an interface for one process to mark another process as > PF_FREEZE_LATE, or both? As a first step marking self with PF_FREEZE_LATE and inheriting this flag across fork/clone would work for most cases, I think. Marking an unrelated process would have all sorts of issues: Who has permission to do this? Won't it be misused to "fix" random freezer issues. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thursday, February 14, 2013 02:09:50 PM Miklos Szeredi wrote: > On Thu, Feb 14, 2013 at 1:11 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > On Thursday, February 14, 2013 11:41:16 AM Miklos Szeredi wrote: > > >> > >> It is essentially the same mechanism that is used to delay the > >> freezing of kernel threads after userspace tasks have been frozen. > >> Except it's a lot more difficult to determine which userspace tasks > >> need to be suspended late and which aren't. > > > > Well, I suppose that information is available to user space. > > > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE or > > do we need an interface for one process to mark another process as > > PF_FREEZE_LATE, or both? > > As a first step marking self with PF_FREEZE_LATE and inheriting this > flag across fork/clone would work for most cases, I think. OK, so we can just have a switch for that in /proc I suppose. > Marking an unrelated process would have all sorts of issues: Who has > permission to do this? Won't it be misused to "fix" random freezer > issues. Yes, that's why I was asking. Thanks, Rafael
> -----Original Message----- > From: Rafael J. Wysocki [mailto:rjw@sisk.pl] > Sent: Friday, February 15, 2013 1:38 AM > To: Miklos Szeredi > Cc: Pavel Machek; Goswin von Brederlow; Li, Fei; Brown, Len; > mingo@redhat.com; peterz@infradead.org; Wang, Biao; > linux-pm@vger.kernel.org; fuse-devel@lists.sourceforge.net; > linux-kernel@vger.kernel.org; Liu, Chuansheng > Subject: Re: Getting rid of freezer for suspend [was Re: [fuse-devel] [PATCH] fuse: > make fuse daemon frozen along with kernel threads] > > On Thursday, February 14, 2013 02:09:50 PM Miklos Szeredi wrote: > > On Thu, Feb 14, 2013 at 1:11 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > > On Thursday, February 14, 2013 11:41:16 AM Miklos Szeredi wrote: > > > > >> > > >> It is essentially the same mechanism that is used to delay the > > >> freezing of kernel threads after userspace tasks have been frozen. > > >> Except it's a lot more difficult to determine which userspace tasks > > >> need to be suspended late and which aren't. > > > > > > Well, I suppose that information is available to user space. > > > > > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE or > > > do we need an interface for one process to mark another process as > > > PF_FREEZE_LATE, or both? > > > > As a first step marking self with PF_FREEZE_LATE and inheriting this > > flag across fork/clone would work for most cases, I think. > > OK, so we can just have a switch for that in /proc I suppose. Thanks for feedback and suggestion. We have ever tried similar idea, expose interface /sys/power/pm_freeze_daemon, userspace tasks write 1 to this attribute to make itself to be frozen at the same time with kernel tasks, and it worked in our experiment. Do you think it's suitable and enough to use such attribute /sys/power/pm_freeze_late, or other more suitable place under /proc suggested? If needed, I can prepare the patch. Best Regards, Li Fei > > Marking an unrelated process would have all sorts of issues: Who has > > permission to do this? Won't it be misused to "fix" random freezer > > issues. > > Yes, that's why I was asking. > > Thanks, > Rafael > > > -- > I speak only for myself. > Rafael J. Wysocki, Intel Open Source Technology Center.
On Monday, February 18, 2013 06:26:34 AM Li, Fei wrote: > > > -----Original Message----- > > From: Rafael J. Wysocki [mailto:rjw@sisk.pl] > > Sent: Friday, February 15, 2013 1:38 AM > > To: Miklos Szeredi > > Cc: Pavel Machek; Goswin von Brederlow; Li, Fei; Brown, Len; > > mingo@redhat.com; peterz@infradead.org; Wang, Biao; > > linux-pm@vger.kernel.org; fuse-devel@lists.sourceforge.net; > > linux-kernel@vger.kernel.org; Liu, Chuansheng > > Subject: Re: Getting rid of freezer for suspend [was Re: [fuse-devel] [PATCH] fuse: > > make fuse daemon frozen along with kernel threads] > > > > On Thursday, February 14, 2013 02:09:50 PM Miklos Szeredi wrote: > > > On Thu, Feb 14, 2013 at 1:11 PM, Rafael J. Wysocki <rjw@sisk.pl> wrote: > > > > On Thursday, February 14, 2013 11:41:16 AM Miklos Szeredi wrote: > > > > > > >> > > > >> It is essentially the same mechanism that is used to delay the > > > >> freezing of kernel threads after userspace tasks have been frozen. > > > >> Except it's a lot more difficult to determine which userspace tasks > > > >> need to be suspended late and which aren't. > > > > > > > > Well, I suppose that information is available to user space. > > > > > > > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE or > > > > do we need an interface for one process to mark another process as > > > > PF_FREEZE_LATE, or both? > > > > > > As a first step marking self with PF_FREEZE_LATE and inheriting this > > > flag across fork/clone would work for most cases, I think. > > > > OK, so we can just have a switch for that in /proc I suppose. > > Thanks for feedback and suggestion. > > We have ever tried similar idea, expose interface /sys/power/pm_freeze_daemon, > userspace tasks write 1 to this attribute to make itself to be frozen at the same time > with kernel tasks, and it worked in our experiment. > > Do you think it's suitable and enough to use such attribute /sys/power/pm_freeze_late, > or other more suitable place under /proc suggested? I think it should be inder /proc, because that's where controls related to process behavior are located. E.g. /proc/PID/freeze_late or something like that. Thanks, Rafael
Hi! > > > > > Well, I suppose that information is available to user space. > > > > > > > > > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE or > > > > > do we need an interface for one process to mark another process as > > > > > PF_FREEZE_LATE, or both? > > > > > > > > As a first step marking self with PF_FREEZE_LATE and inheriting this > > > > flag across fork/clone would work for most cases, I think. > > > > > > OK, so we can just have a switch for that in /proc I suppose. > > > > Thanks for feedback and suggestion. > > > > We have ever tried similar idea, expose interface /sys/power/pm_freeze_daemon, > > userspace tasks write 1 to this attribute to make itself to be frozen at the same time > > with kernel tasks, and it worked in our experiment. > > > > Do you think it's suitable and enough to use such attribute /sys/power/pm_freeze_late, > > or other more suitable place under /proc suggested? > > I think it should be inder /proc, because that's where controls related to > process behavior are located. E.g. /proc/PID/freeze_late or something like > that. freeze_priority? I _hope_ we will not need more than three priorities, (user, fused, kernel), but I hoped not no need more than two before, so... Pavel
> -----Original Message----- > From: Pavel Machek [mailto:pavel@ucw.cz] > Sent: Tuesday, February 19, 2013 6:47 PM > To: Rafael J. Wysocki > Cc: Li, Fei; Miklos Szeredi; Goswin von Brederlow; Brown, Len; > mingo@redhat.com; peterz@infradead.org; Wang, Biao; > linux-pm@vger.kernel.org; fuse-devel@lists.sourceforge.net; > linux-kernel@vger.kernel.org; Liu, Chuansheng > Subject: Re: Getting rid of freezer for suspend [was Re: [fuse-devel] [PATCH] fuse: > make fuse daemon frozen along with kernel threads] > > Hi! > > > > > > > Well, I suppose that information is available to user space. > > > > > > > > > > > > Do we need an interface for a process to mark itself as PF_FREEZE_LATE > or > > > > > > do we need an interface for one process to mark another process as > > > > > > PF_FREEZE_LATE, or both? > > > > > > > > > > As a first step marking self with PF_FREEZE_LATE and inheriting this > > > > > flag across fork/clone would work for most cases, I think. > > > > > > > > OK, so we can just have a switch for that in /proc I suppose. > > > > > > Thanks for feedback and suggestion. > > > > > > We have ever tried similar idea, expose interface > /sys/power/pm_freeze_daemon, > > > userspace tasks write 1 to this attribute to make itself to be frozen at the > same time > > > with kernel tasks, and it worked in our experiment. > > > > > > Do you think it's suitable and enough to use such attribute > /sys/power/pm_freeze_late, > > > or other more suitable place under /proc suggested? > > > > I think it should be inder /proc, because that's where controls related to > > process behavior are located. E.g. /proc/PID/freeze_late or something like > > that. > > freeze_priority? > > I _hope_ we will not need more than three priorities, (user, fused, kernel), but > I hoped not no need more than two before, so... > Pavel > -- IMHO, we still use two priorities, with the exception that user space process such as fuse daemon can be configured to be frozen in kernel threads frozen phase instead of user space processes frozen phase. I submit a patch on https://lkml.org/lkml/2013/2/19/705 for such implementation. Could you please help to check whether it's suitable? Thanks and Regards, Li Fei > (english) http://www.livejournal.com/~pavelmachek > (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Feb 20, 2013 at 3:54 AM, Li, Fei <fei.li@intel.com> wrote: >> From: Pavel Machek [mailto:pavel@ucw.cz] >> >> freeze_priority? >> >> I _hope_ we will not need more than three priorities, (user, fused, kernel), but >> I hoped not no need more than two before, so... > > IMHO, we still use two priorities, with the exception that user space process such > as fuse daemon can be configured to be frozen in kernel threads frozen phase > instead of user space processes frozen phase. I'm not sure that will work. Fuse processes are in fact ordinary userspace processes, and so should be frozen before kernel threads. Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-pm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b7..fe476e8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -20,6 +20,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/freezer.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -367,6 +368,10 @@ void fuse_conn_kill(struct fuse_conn *fc) wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); wake_up_all(&fc->reserved_req_waitq); + + /* Clear daemon flag after disconnection */ + clear_freeze_daemon_flag(); + } EXPORT_SYMBOL_GPL(fuse_conn_kill); @@ -1054,6 +1059,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_unlock; list_add_tail(&fc->entry, &fuse_conn_list); + + /* Set daemon flag just before connection */ + set_freeze_daemon_flag(); + sb->s_root = root_dentry; fc->connected = 1; file->private_data = fuse_conn_get(fc); diff --git a/include/linux/freezer.h b/include/linux/freezer.h index e4238ce..9f0d0cf 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -51,6 +51,8 @@ static inline bool try_to_freeze(void) extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); +extern bool set_freeze_daemon_flag(void); +extern bool clear_freeze_daemon_flag(void); #ifdef CONFIG_CGROUP_FREEZER extern bool cgroup_freezing(struct task_struct *task); @@ -217,6 +219,8 @@ static inline void freezer_do_not_count(void) {} static inline void freezer_count(void) {} static inline int freezer_should_skip(struct task_struct *p) { return 0; } static inline void set_freezable(void) {} +static inline void set_freeze_daemon_flag(void) {} +static inline void clear_freeze_daemon_flag(void) {} #define freezable_schedule() schedule() diff --git a/include/linux/sched.h b/include/linux/sched.h index d211247..6cdc969 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1826,6 +1826,8 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +/* Daemon threads to be frozen along with kernel threads */ +#define PF_FREEZER_DAEMON 0x80000000 /* * Only the _current_ task can read/write to tsk->flags, but other diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b..eff9440 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -39,7 +39,8 @@ bool freezing_slow_path(struct task_struct *p) if (pm_nosig_freezing || cgroup_freezing(p)) return true; - if (pm_freezing && !(p->flags & PF_KTHREAD)) + if (pm_freezing && !(p->flags & PF_KTHREAD) && + !(p->flags & PF_FREEZE_DAEMON)) return true; return false; @@ -162,3 +163,33 @@ bool set_freezable(void) return try_to_freeze(); } EXPORT_SYMBOL(set_freezable); + +/** + * set_freeze_daemon_flag - make %current as daemon + * + * Make %current being frozen by freezer along with kernel threads + */ +bool set_freeze_daemon_flag(void) +{ + spin_lock_irq(&freezer_lock); + current->flags |= PF_FREEZE_DAEMON; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); +} +EXPORT_SYMBOL(set_freeze_daemon_flag); + +/** + * clear_freeze_daemon_flag - make %current as usual user space process + * + * Make %current being frozen by freezer along with user space processes + */ +bool clear_freeze_daemon_flag(void) +{ + spin_lock_irq(&freezer_lock); + current->flags &= ~PF_FREEZE_DAEMON; + spin_unlock_irq(&freezer_lock); + + return try_to_freeze(); +} +EXPORT_SYMBOL(clear_freeze_daemon_flag); diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b..a4489ae 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -199,7 +199,7 @@ void thaw_kernel_threads(void) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER | PF_FREEZE_DAEMON)) __thaw_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock);