diff mbox

[v2] inotify: Extend ioctl to allow to request id of new watch descriptor

Message ID 20180209125656.e440e0518540d6b76ae42bc0@linux-foundation.org (mailing list archive)
State New, archived
Headers show

Commit Message

Andrew Morton Feb. 9, 2018, 8:56 p.m. UTC
On Fri, 9 Feb 2018 18:04:54 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:

> Watch descriptor is id of the watch created by inotify_add_watch().
> It is allocated in inotify_add_to_idr(), and takes the numbers
> starting from 1. Every new inotify watch obtains next available
> number (usually, old + 1), as served by idr_alloc_cyclic().
> 
> CRIU (Checkpoint/Restore In Userspace) project supports inotify
> files, and restores watched descriptors with the same numbers,
> they had before dump. Since there was no kernel support, we
> had to use cycle to add a watch with specific descriptor id:
> 
> 	while (1) {
> 		int wd;
> 
> 		wd = inotify_add_watch(inotify_fd, path, mask);
> 		if (wd < 0) {
> 			break;
> 		} else if (wd == desired_wd_id) {
> 			ret = 0;
> 			break;
> 		}
> 
> 		inotify_rm_watch(inotify_fd, wd);
> 	}
> 
> (You may find the actual code at the below link:
>  https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)
> 
> The cycle is suboptimal and very expensive, but since there is no better
> kernel support, it was the only way to restore that. Happily, we had met
> mostly descriptors with small id, and this approach had worked somehow.
> 
> But recently, containers with inotify watches having big watch descriptors
> began to appear, and this approach stopped working at all. When the descriptor
> id is something like 0x34d71d6, the restoring process spins in a busy loop
> for a long time, the restore hangs, and a delay of migration from node
> to node can easily be observed.
> 
> This patch aims to solve this problem. It introduces new ioctl
> INOTIFY_IOC_SETNEXTWD, which allows to request the number of next created
> watch descriptor from userspace. It simply calls idr_set_cursor() primitive
> to populate idr::idr_next, so that next idr_alloc_cyclic() allocation
> will return this id, if it is not occupied. This is the way which is
> used to restore some other resources from userspace. For example,
> /proc/sys/kernel/ns_last_pid works the same for task pids.
> 
> The new code is under CONFIG_CHECKPOINT_RESTORE #define, so small system
> may exclude it.
> 

Reviewed-by: Andrew Morton <akpm@linux-foundation.org>

With a little cleanup:

Comments

Kirill Tkhai Feb. 9, 2018, 10:45 p.m. UTC | #1
On 09.02.2018 23:56, Andrew Morton wrote:
> On Fri, 9 Feb 2018 18:04:54 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> 
>> Watch descriptor is id of the watch created by inotify_add_watch().
>> It is allocated in inotify_add_to_idr(), and takes the numbers
>> starting from 1. Every new inotify watch obtains next available
>> number (usually, old + 1), as served by idr_alloc_cyclic().
>>
>> CRIU (Checkpoint/Restore In Userspace) project supports inotify
>> files, and restores watched descriptors with the same numbers,
>> they had before dump. Since there was no kernel support, we
>> had to use cycle to add a watch with specific descriptor id:
>>
>> 	while (1) {
>> 		int wd;
>>
>> 		wd = inotify_add_watch(inotify_fd, path, mask);
>> 		if (wd < 0) {
>> 			break;
>> 		} else if (wd == desired_wd_id) {
>> 			ret = 0;
>> 			break;
>> 		}
>>
>> 		inotify_rm_watch(inotify_fd, wd);
>> 	}
>>
>> (You may find the actual code at the below link:
>>  https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)
>>
>> The cycle is suboptimal and very expensive, but since there is no better
>> kernel support, it was the only way to restore that. Happily, we had met
>> mostly descriptors with small id, and this approach had worked somehow.
>>
>> But recently, containers with inotify watches having big watch descriptors
>> began to appear, and this approach stopped working at all. When the descriptor
>> id is something like 0x34d71d6, the restoring process spins in a busy loop
>> for a long time, the restore hangs, and a delay of migration from node
>> to node can easily be observed.
>>
>> This patch aims to solve this problem. It introduces new ioctl
>> INOTIFY_IOC_SETNEXTWD, which allows to request the number of next created
>> watch descriptor from userspace. It simply calls idr_set_cursor() primitive
>> to populate idr::idr_next, so that next idr_alloc_cyclic() allocation
>> will return this id, if it is not occupied. This is the way which is
>> used to restore some other resources from userspace. For example,
>> /proc/sys/kernel/ns_last_pid works the same for task pids.
>>
>> The new code is under CONFIG_CHECKPOINT_RESTORE #define, so small system
>> may exclude it.
>>
> 
> Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
> 
> With a little cleanup:
> 
> --- a/fs/notify/inotify/inotify_user.c~inotify-extend-ioctl-to-allow-to-request-id-of-new-watch-descriptor-fix
> +++ a/fs/notify/inotify/inotify_user.c
> @@ -285,7 +285,6 @@ static int inotify_release(struct inode
>  static long inotify_ioctl(struct file *file, unsigned int cmd,
>  			  unsigned long arg)
>  {
> -	struct inotify_group_private_data *data __maybe_unused;
>  	struct fsnotify_group *group;
>  	struct fsnotify_event *fsn_event;
>  	void __user *p;
> @@ -294,7 +293,6 @@ static long inotify_ioctl(struct file *f
>  
>  	group = file->private_data;
>  	p = (void __user *) arg;
> -	data = &group->inotify_data;
>  
>  	pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
>  
> @@ -313,6 +311,9 @@ static long inotify_ioctl(struct file *f
>  	case INOTIFY_IOC_SETNEXTWD:
>  		ret = -EINVAL;
>  		if (arg >= 1 && arg <= INT_MAX) {
> +			struct inotify_group_private_data *data;
> +
> +			data = &group->inotify_data;
>  			spin_lock(&data->idr_lock);
>  			idr_set_cursor(&data->idr, (unsigned int)arg);
>  			spin_unlock(&data->idr_lock);

I have no objections.

Thanks,
Kirill
Stef Bon Feb. 11, 2018, 11:30 a.m. UTC | #2
2018-02-09 23:45 GMT+01:00 Kirill Tkhai <ktkhai@virtuozzo.com>:
> On 09.02.2018 23:56, Andrew Morton wrote:
>> On Fri, 9 Feb 2018 18:04:54 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>
>>> Watch descriptor is id of the watch created by inotify_add_watch().
>>> It is allocated in inotify_add_to_idr(), and takes the numbers
>>> starting from 1. Every new inotify watch obtains next available
>>> number (usually, old + 1), as served by idr_alloc_cyclic().
>>>
>>> CRIU (Checkpoint/Restore In Userspace) project supports inotify
>>> files, and restores watched descriptors with the same numbers,
>>> they had before dump. Since there was no kernel support, we
>>> had to use cycle to add a watch with specific descriptor id:
>>>
>>>      while (1) {
>>>              int wd;
>>>
>>>              wd = inotify_add_watch(inotify_fd, path, mask);
>>>              if (wd < 0) {
>>>                      break;
>>>              } else if (wd == desired_wd_id) {
>>>                      ret = 0;
>>>                      break;
>>>              }
>>>
>>>              inotify_rm_watch(inotify_fd, wd);
>>>      }
>>>
>>> (You may find the actual code at the below link:
>>>  https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)

Well, using an ioctl command to force a specific wd is possible, but
isn't it also possible to do a "freeze" of all (inotify) watches which
are involved, and to "unfreeze" them when restoring?

Stef
Kirill Tkhai Feb. 12, 2018, 8:42 a.m. UTC | #3
On 11.02.2018 14:30, Stef Bon wrote:
> 2018-02-09 23:45 GMT+01:00 Kirill Tkhai <ktkhai@virtuozzo.com>:
>> On 09.02.2018 23:56, Andrew Morton wrote:
>>> On Fri, 9 Feb 2018 18:04:54 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>>>
>>>> Watch descriptor is id of the watch created by inotify_add_watch().
>>>> It is allocated in inotify_add_to_idr(), and takes the numbers
>>>> starting from 1. Every new inotify watch obtains next available
>>>> number (usually, old + 1), as served by idr_alloc_cyclic().
>>>>
>>>> CRIU (Checkpoint/Restore In Userspace) project supports inotify
>>>> files, and restores watched descriptors with the same numbers,
>>>> they had before dump. Since there was no kernel support, we
>>>> had to use cycle to add a watch with specific descriptor id:
>>>>
>>>>      while (1) {
>>>>              int wd;
>>>>
>>>>              wd = inotify_add_watch(inotify_fd, path, mask);
>>>>              if (wd < 0) {
>>>>                      break;
>>>>              } else if (wd == desired_wd_id) {
>>>>                      ret = 0;
>>>>                      break;
>>>>              }
>>>>
>>>>              inotify_rm_watch(inotify_fd, wd);
>>>>      }
>>>>
>>>> (You may find the actual code at the below link:
>>>>  https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)
> 
> Well using a ioctl command to force a specific wd is possible, but
> isn't it also possible
> to do a "freeze" of all (inotify) watches which are involved, and
> "unfreeze" when restoring?

Regarding C/R, all inotifies are involved ;) Also, all regular files, sockets,
memory mappings, etc. 

Checkpoint code attaches to a process via ptrace() and injects parasite code,
which collects data and metadata of all the process's entities. It's a rather
difficult action, because several processes may be checkpointed, and they may
share files/memory mappings/fs/etc. Also, they may belong to different
namespaces. It's a long story. You may dive into the CRIU code, if you're interested.

Then, the restore code tries to recreate the processes from the ground up
(possibly on another physical machine). It uses standard Linux system calls to do
that, i.e., it starts from clone() and then creates everything else. When it is
time to restore a file (an inotify in our case), the standard Linux inotify_init1()
is called. We create the inotify fd, then dup2() it to the appropriate number.
Then, we need to add the watched files/directories to the inotify. And they must
be added with the same watch descriptor id as they had at checkpoint time.
We use inotify_add_watch() and it returns id == 1, as you can see in the kernel
code. But we need another id, say, 0xfffff. And there is no syscall like dup2()
for inotify watch descriptors. So, we use cyclic inotify_add_watch()/inotify_rm_watch(),
as the next inotify_add_watch() returns an incremented id (see the kernel) even though
inotify_rm_watch() was called to remove the old one. After 0xfffff-1 iterations,
inotify_add_watch() reaches the id we need and returns it. This scheme is very slow,
and the patch allows restoring an inotify using only 2 syscalls (ioctl+inotify_add_watch).

So, answering your question: No, it's not possible to use freeze/unfreeze
to do that.

Kirill
Jan Kara Feb. 14, 2018, 10:18 a.m. UTC | #4
On Sat 10-02-18 01:45:16, Kirill Tkhai wrote:
> On 09.02.2018 23:56, Andrew Morton wrote:
> > On Fri, 9 Feb 2018 18:04:54 +0300 Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
> > 
> >> Watch descriptor is id of the watch created by inotify_add_watch().
> >> It is allocated in inotify_add_to_idr(), and takes the numbers
> >> starting from 1. Every new inotify watch obtains next available
> >> number (usually, old + 1), as served by idr_alloc_cyclic().
> >>
> >> CRIU (Checkpoint/Restore In Userspace) project supports inotify
> >> files, and restores watched descriptors with the same numbers,
> >> they had before dump. Since there was no kernel support, we
> >> had to use cycle to add a watch with specific descriptor id:
> >>
> >> 	while (1) {
> >> 		int wd;
> >>
> >> 		wd = inotify_add_watch(inotify_fd, path, mask);
> >> 		if (wd < 0) {
> >> 			break;
> >> 		} else if (wd == desired_wd_id) {
> >> 			ret = 0;
> >> 			break;
> >> 		}
> >>
> >> 		inotify_rm_watch(inotify_fd, wd);
> >> 	}
> >>
> >> (You may find the actual code at the below link:
> >>  https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)
> >>
> >> The cycle is suboptimal and very expensive, but since there is no better
> >> kernel support, it was the only way to restore that. Happily, we had met
> >> mostly descriptors with small id, and this approach had worked somehow.
> >>
> >> But recently, containers with inotify watches having big watch descriptors
> >> began to appear, and this approach stopped working at all. When the descriptor
> >> id is something like 0x34d71d6, the restoring process spins in a busy loop
> >> for a long time, the restore hangs, and a delay of migration from node
> >> to node can easily be observed.
> >>
> >> This patch aims to solve this problem. It introduces new ioctl
> >> INOTIFY_IOC_SETNEXTWD, which allows to request the number of next created
> >> watch descriptor from userspace. It simply calls idr_set_cursor() primitive
> >> to populate idr::idr_next, so that next idr_alloc_cyclic() allocation
> >> will return this id, if it is not occupied. This is the way which is
> >> used to restore some other resources from userspace. For example,
> >> /proc/sys/kernel/ns_last_pid works the same for task pids.
> >>
> >> The new code is under CONFIG_CHECKPOINT_RESTORE #define, so small system
> >> may exclude it.
> >>
> > 
> > Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
> > 
> > With a little cleanup:
> > 
> > --- a/fs/notify/inotify/inotify_user.c~inotify-extend-ioctl-to-allow-to-request-id-of-new-watch-descriptor-fix
> > +++ a/fs/notify/inotify/inotify_user.c
> > @@ -285,7 +285,6 @@ static int inotify_release(struct inode
> >  static long inotify_ioctl(struct file *file, unsigned int cmd,
> >  			  unsigned long arg)
> >  {
> > -	struct inotify_group_private_data *data __maybe_unused;
> >  	struct fsnotify_group *group;
> >  	struct fsnotify_event *fsn_event;
> >  	void __user *p;
> > @@ -294,7 +293,6 @@ static long inotify_ioctl(struct file *f
> >  
> >  	group = file->private_data;
> >  	p = (void __user *) arg;
> > -	data = &group->inotify_data;
> >  
> >  	pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
> >  
> > @@ -313,6 +311,9 @@ static long inotify_ioctl(struct file *f
> >  	case INOTIFY_IOC_SETNEXTWD:
> >  		ret = -EINVAL;
> >  		if (arg >= 1 && arg <= INT_MAX) {
> > +			struct inotify_group_private_data *data;
> > +
> > +			data = &group->inotify_data;
> >  			spin_lock(&data->idr_lock);
> >  			idr_set_cursor(&data->idr, (unsigned int)arg);
> >  			spin_unlock(&data->idr_lock);
> 
> I have no objections.

Thanks guys, I have added the patch (with cleanup included) to my tree.

								Honza
diff mbox

Patch

--- a/fs/notify/inotify/inotify_user.c~inotify-extend-ioctl-to-allow-to-request-id-of-new-watch-descriptor-fix
+++ a/fs/notify/inotify/inotify_user.c
@@ -285,7 +285,6 @@  static int inotify_release(struct inode
 static long inotify_ioctl(struct file *file, unsigned int cmd,
 			  unsigned long arg)
 {
-	struct inotify_group_private_data *data __maybe_unused;
 	struct fsnotify_group *group;
 	struct fsnotify_event *fsn_event;
 	void __user *p;
@@ -294,7 +293,6 @@  static long inotify_ioctl(struct file *f
 
 	group = file->private_data;
 	p = (void __user *) arg;
-	data = &group->inotify_data;
 
 	pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd);
 
@@ -313,6 +311,9 @@  static long inotify_ioctl(struct file *f
 	case INOTIFY_IOC_SETNEXTWD:
 		ret = -EINVAL;
 		if (arg >= 1 && arg <= INT_MAX) {
+			struct inotify_group_private_data *data;
+
+			data = &group->inotify_data;
 			spin_lock(&data->idr_lock);
 			idr_set_cursor(&data->idr, (unsigned int)arg);
 			spin_unlock(&data->idr_lock);