diff mbox series

[v3,4/6] util/osdep: Introduce qemu_close_range()

Message ID 20230617053621.50359-5-bmeng@tinylab.org (mailing list archive)
State New, archived
Headers show
Series net/tap: Fix QEMU frozen issue when the maximum number of file descriptors is very large | expand

Commit Message

Bin Meng June 17, 2023, 5:36 a.m. UTC
This introduces a new QEMU API qemu_close_range() that closes all
open file descriptors from first to last (included).

This API will try a more efficient call to close_range(), or walk
through of /proc/self/fd whenever these are possible, otherwise it
falls back to a plain close loop.

Co-developed-by: Zhangjin Wu <falcon@tinylab.org>
Signed-off-by: Bin Meng <bmeng@tinylab.org>

---

Changes in v3:
- fix win32 build failure

Changes in v2:
- new patch: "util/osdep: Introduce qemu_close_range()"

 include/qemu/osdep.h |  1 +
 util/osdep.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

Comments

Claudio Imbrenda June 19, 2023, 9:18 a.m. UTC | #1
On Sat, 17 Jun 2023 13:36:19 +0800
Bin Meng <bmeng@tinylab.org> wrote:

> This introduces a new QEMU API qemu_close_range() that closes all
> open file descriptors from first to last (included).
> 
> This API will try a more efficient call to close_range(), or walk
> through of /proc/self/fd whenever these are possible, otherwise it
> falls back to a plain close loop.
> 
> Co-developed-by: Zhangjin Wu <falcon@tinylab.org>
> Signed-off-by: Bin Meng <bmeng@tinylab.org>
> 
> ---
> 
> Changes in v3:
> - fix win32 build failure
> 
> Changes in v2:
> - new patch: "util/osdep: Introduce qemu_close_range()"
> 
>  include/qemu/osdep.h |  1 +
>  util/osdep.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 49 insertions(+)
> 
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index cc61b00ba9..e22434ce10 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...);
>  int qemu_open(const char *name, int flags, Error **errp);
>  int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
>  int qemu_close(int fd);
> +int qemu_close_range(unsigned int first, unsigned int last);
>  int qemu_unlink(const char *name);
>  #ifndef _WIN32
>  int qemu_dup_flags(int fd, int flags);
> diff --git a/util/osdep.c b/util/osdep.c
> index e996c4744a..91275e70f8 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -30,6 +30,7 @@
>  #include "qemu/mprotect.h"
>  #include "qemu/hw-version.h"
>  #include "monitor/monitor.h"
> +#include <dirent.h>
>  
>  static const char *hw_version = QEMU_HW_VERSION;
>  
> @@ -411,6 +412,53 @@ int qemu_close(int fd)
>      return close(fd);
>  }
>  
> +int qemu_close_range(unsigned int first, unsigned int last)
> +{
> +    DIR *dir = NULL;
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +    int r = close_range(first, last, 0);
> +    if (!r) {
> +        /* Success, no need to try other ways. */
> +        return 0;
> +    }
> +#endif
> +
> +#ifdef __linux__
> +    dir = opendir("/proc/self/fd");
> +#endif
> +    if (!dir) {
> +        /*
> +         * If /proc is not mounted or /proc/self/fd is not supported,
> +         * try close() from first to last.
> +         */
> +        for (int i = first; i <= last; i++) {
> +            close(i);

will this compile on windows?

> +        }
> +
> +        return 0;
> +    }
> +
> +#ifndef _WIN32
> +    /* Avoid closing the directory */
> +    int dfd = dirfd(dir);
> +
> +    for (struct dirent *de = readdir(dir); de; de = readdir(dir)) {
> +        int fd = atoi(de->d_name);
> +        if (fd < first || fd > last) {
> +            /* Exclude the fds outside the target range */
> +            continue;
> +        }
> +        if (fd != dfd) {
> +            close(fd);
> +        }
> +    }
> +    closedir(dir);
> +#endif /* _WIN32 */
> +
> +    return 0;
> +}
> +
>  /*
>   * Delete a file from the filesystem, unless the filename is /dev/fdset/...
>   *
Markus Armbruster June 19, 2023, 9:22 a.m. UTC | #2
Bin Meng <bmeng@tinylab.org> writes:

> This introduces a new QEMU API qemu_close_range() that closes all
> open file descriptors from first to last (included).
>
> This API will try a more efficient call to close_range(), or walk
> through of /proc/self/fd whenever these are possible, otherwise it
> falls back to a plain close loop.
>
> Co-developed-by: Zhangjin Wu <falcon@tinylab.org>
> Signed-off-by: Bin Meng <bmeng@tinylab.org>
>
> ---
>
> Changes in v3:
> - fix win32 build failure
>
> Changes in v2:
> - new patch: "util/osdep: Introduce qemu_close_range()"
>
>  include/qemu/osdep.h |  1 +
>  util/osdep.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 49 insertions(+)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index cc61b00ba9..e22434ce10 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...);
>  int qemu_open(const char *name, int flags, Error **errp);
>  int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
>  int qemu_close(int fd);
> +int qemu_close_range(unsigned int first, unsigned int last);
>  int qemu_unlink(const char *name);
>  #ifndef _WIN32
>  int qemu_dup_flags(int fd, int flags);
> diff --git a/util/osdep.c b/util/osdep.c
> index e996c4744a..91275e70f8 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -30,6 +30,7 @@
>  #include "qemu/mprotect.h"
>  #include "qemu/hw-version.h"
>  #include "monitor/monitor.h"
> +#include <dirent.h>
>  
>  static const char *hw_version = QEMU_HW_VERSION;
>  
> @@ -411,6 +412,53 @@ int qemu_close(int fd)
>      return close(fd);
>  }
>  
> +int qemu_close_range(unsigned int first, unsigned int last)
> +{
> +    DIR *dir = NULL;
> +
> +#ifdef CONFIG_CLOSE_RANGE
> +    int r = close_range(first, last, 0);

close_range(2) explains flag

       CLOSE_RANGE_UNSHARE
              Unshare  the specified file descriptors from any other processes
              before closing them, avoiding races with other  threads  sharing
              the file descriptor table.

Can anybody explain the races this avoids?

> +    if (!r) {
> +        /* Success, no need to try other ways. */
> +        return 0;
> +    }

What are the failure modes of close_range() where the other ways are
worth trying?

> +#endif
> +
> +#ifdef __linux__
> +    dir = opendir("/proc/self/fd");
> +#endif
> +    if (!dir) {
> +        /*
> +         * If /proc is not mounted or /proc/self/fd is not supported,
> +         * try close() from first to last.
> +         */
> +        for (int i = first; i <= last; i++) {
> +            close(i);
> +        }
> +
> +        return 0;
> +    }
> +
> +#ifndef _WIN32
> +    /* Avoid closing the directory */
> +    int dfd = dirfd(dir);

This directory contains "." "..", "0", "1", ...

> +
> +    for (struct dirent *de = readdir(dir); de; de = readdir(dir)) {
> +        int fd = atoi(de->d_name);

Maps "." and ".." to 0.  Unclean.

Please use qemu_strtoi(de->d_name, NULL, 10, &fd), and skip entries
where it fails.

> +        if (fd < first || fd > last) {
> +            /* Exclude the fds outside the target range */
> +            continue;
> +        }
> +        if (fd != dfd) {
> +            close(fd);
> +        }
> +    }
> +    closedir(dir);
> +#endif /* _WIN32 */
> +
> +    return 0;
> +}

I'd prefer to order this from most to least preferred:

    close_range()
    iterate over /proc/self/fd
    iterate from first to last

Unlike close_range(), qemu_close_range() returns 0 when last < first.
If we want to deviate from close_range(), we better document the
differences.

This is a generalized version of async-teardown.c's close_all_open_fd().
I'd mention this in the commit message.  Suggestion, not demand.

> +
>  /*
>   * Delete a file from the filesystem, unless the filename is /dev/fdset/...
>   *
Bin Meng June 28, 2023, 3:12 p.m. UTC | #3
On 2023/6/19 17:18:53, "Claudio Imbrenda" <imbrenda@linux.ibm.com> 
wrote:

>On Sat, 17 Jun 2023 13:36:19 +0800
>Bin Meng <bmeng@tinylab.org> wrote:
>
>>  This introduces a new QEMU API qemu_close_range() that closes all
>>  open file descriptors from first to last (included).
>>
>>  This API will try a more efficient call to close_range(), or walk
>>  through of /proc/self/fd whenever these are possible, otherwise it
>>  falls back to a plain close loop.
>>
>>  Co-developed-by: Zhangjin Wu <falcon@tinylab.org>
>>  Signed-off-by: Bin Meng <bmeng@tinylab.org>
>>
>>  ---
>>
>>  Changes in v3:
>>  - fix win32 build failure
>>
>>  Changes in v2:
>>  - new patch: "util/osdep: Introduce qemu_close_range()"
>>
>>   include/qemu/osdep.h |  1 +
>>   util/osdep.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 49 insertions(+)
>>
>>  diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>>  index cc61b00ba9..e22434ce10 100644
>>  --- a/include/qemu/osdep.h
>>  +++ b/include/qemu/osdep.h
>>  @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...);
>>   int qemu_open(const char *name, int flags, Error **errp);
>>   int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
>>   int qemu_close(int fd);
>>  +int qemu_close_range(unsigned int first, unsigned int last);
>>   int qemu_unlink(const char *name);
>>   #ifndef _WIN32
>>   int qemu_dup_flags(int fd, int flags);
>>  diff --git a/util/osdep.c b/util/osdep.c
>>  index e996c4744a..91275e70f8 100644
>>  --- a/util/osdep.c
>>  +++ b/util/osdep.c
>>  @@ -30,6 +30,7 @@
>>   #include "qemu/mprotect.h"
>>   #include "qemu/hw-version.h"
>>   #include "monitor/monitor.h"
>>  +#include <dirent.h>
>>
>>   static const char *hw_version = QEMU_HW_VERSION;
>>
>>  @@ -411,6 +412,53 @@ int qemu_close(int fd)
>>       return close(fd);
>>   }
>>
>>  +int qemu_close_range(unsigned int first, unsigned int last)
>>  +{
>>  +    DIR *dir = NULL;
>>  +
>>  +#ifdef CONFIG_CLOSE_RANGE
>>  +    int r = close_range(first, last, 0);
>>  +    if (!r) {
>>  +        /* Success, no need to try other ways. */
>>  +        return 0;
>>  +    }
>>  +#endif
>>  +
>>  +#ifdef __linux__
>>  +    dir = opendir("/proc/self/fd");
>>  +#endif
>>  +    if (!dir) {
>>  +        /*
>>  +         * If /proc is not mounted or /proc/self/fd is not supported,
>>  +         * try close() from first to last.
>>  +         */
>>  +        for (int i = first; i <= last; i++) {
>>  +            close(i);
>
>will this compile on windows?

Yes, it will.

Regards,
Bin
Bin Meng June 28, 2023, 3:40 p.m. UTC | #4
On 2023/6/19 17:22:53, "Markus Armbruster" <armbru@redhat.com> wrote:

>Bin Meng <bmeng@tinylab.org> writes:
>
>>  This introduces a new QEMU API qemu_close_range() that closes all
>>  open file descriptors from first to last (included).
>>
>>  This API will try a more efficient call to close_range(), or walk
>>  through of /proc/self/fd whenever these are possible, otherwise it
>>  falls back to a plain close loop.
>>
>>  Co-developed-by: Zhangjin Wu <falcon@tinylab.org>
>>  Signed-off-by: Bin Meng <bmeng@tinylab.org>
>>
>>  ---
>>
>>  Changes in v3:
>>  - fix win32 build failure
>>
>>  Changes in v2:
>>  - new patch: "util/osdep: Introduce qemu_close_range()"
>>
>>   include/qemu/osdep.h |  1 +
>>   util/osdep.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 49 insertions(+)
>>
>>  diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
>>  index cc61b00ba9..e22434ce10 100644
>>  --- a/include/qemu/osdep.h
>>  +++ b/include/qemu/osdep.h
>>  @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...);
>>   int qemu_open(const char *name, int flags, Error **errp);
>>   int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
>>   int qemu_close(int fd);
>>  +int qemu_close_range(unsigned int first, unsigned int last);
>>   int qemu_unlink(const char *name);
>>   #ifndef _WIN32
>>   int qemu_dup_flags(int fd, int flags);
>>  diff --git a/util/osdep.c b/util/osdep.c
>>  index e996c4744a..91275e70f8 100644
>>  --- a/util/osdep.c
>>  +++ b/util/osdep.c
>>  @@ -30,6 +30,7 @@
>>   #include "qemu/mprotect.h"
>>   #include "qemu/hw-version.h"
>>   #include "monitor/monitor.h"
>>  +#include <dirent.h>
>>
>>   static const char *hw_version = QEMU_HW_VERSION;
>>
>>  @@ -411,6 +412,53 @@ int qemu_close(int fd)
>>       return close(fd);
>>   }
>>
>>  +int qemu_close_range(unsigned int first, unsigned int last)
>>  +{
>>  +    DIR *dir = NULL;
>>  +
>>  +#ifdef CONFIG_CLOSE_RANGE
>>  +    int r = close_range(first, last, 0);
>
>close_range(2) explains flag
>
>        CLOSE_RANGE_UNSHARE
>               Unshare  the specified file descriptors from any other processes
>               before closing them, avoiding races with other  threads  sharing
>               the file descriptor table.
>
>Can anybody explain the races this avoids?

The kernel commit [1] which adds the close_range syscall says:

unshare(CLONE_FILES) should only be needed if the calling
task is multi-threaded and shares the file descriptor table with another
thread in which case two threads could race with one thread allocating 
file
descriptors and the other one closing them via close_range().

>
>
>>  +    if (!r) {
>>  +        /* Success, no need to try other ways. */
>>  +        return 0;
>>  +    }
>
>What are the failure modes of close_range() where the other ways are
>worth trying?

Added first < last check in v4 so that the technically close_range() 
should not fail.

>
>
>>  +#endif
>>  +
>>  +#ifdef __linux__
>>  +    dir = opendir("/proc/self/fd");
>>  +#endif
>>  +    if (!dir) {
>>  +        /*
>>  +         * If /proc is not mounted or /proc/self/fd is not supported,
>>  +         * try close() from first to last.
>>  +         */
>>  +        for (int i = first; i <= last; i++) {
>>  +            close(i);
>>  +        }
>>  +
>>  +        return 0;
>>  +    }
>>  +
>>  +#ifndef _WIN32
>>  +    /* Avoid closing the directory */
>>  +    int dfd = dirfd(dir);
>
>This directory contains "." "..", "0", "1", ...
>
>>  +
>>  +    for (struct dirent *de = readdir(dir); de; de = readdir(dir)) {
>>  +        int fd = atoi(de->d_name);
>
>Maps "." and ".." to 0.  Unclean.
>
>Please use qemu_strtoi(de->d_name, NULL, 10, &fd), and skip entries
>where it fails.

Fixed in v4.

>
>
>>  +        if (fd < first || fd > last) {
>>  +            /* Exclude the fds outside the target range */
>>  +            continue;
>>  +        }
>>  +        if (fd != dfd) {
>>  +            close(fd);
>>  +        }
>>  +    }
>>  +    closedir(dir);
>>  +#endif /* _WIN32 */
>>  +
>>  +    return 0;
>>  +}
>
>I'd prefer to order this from most to least preferred:
>
>     close_range()
>     iterate over /proc/self/fd
>     iterate from first to last
>
>Unlike close_range(), qemu_close_range() returns 0 when last < first.
>If we want to deviate from close_range(), we better document the
>differences.
>
>This is a generalized version of async-teardown.c's close_all_open_fd().
>I'd mention this in the commit message.  Suggestion, not demand.

[1] 
https://github.com/torvalds/linux/commit/278a5fbaed89dacd04e9d052f4594ffd0e0585de

Regards,
Bin
diff mbox series

Patch

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index cc61b00ba9..e22434ce10 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -560,6 +560,7 @@  int qemu_open_old(const char *name, int flags, ...);
 int qemu_open(const char *name, int flags, Error **errp);
 int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
 int qemu_close(int fd);
+int qemu_close_range(unsigned int first, unsigned int last);
 int qemu_unlink(const char *name);
 #ifndef _WIN32
 int qemu_dup_flags(int fd, int flags);
diff --git a/util/osdep.c b/util/osdep.c
index e996c4744a..91275e70f8 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -30,6 +30,7 @@ 
 #include "qemu/mprotect.h"
 #include "qemu/hw-version.h"
 #include "monitor/monitor.h"
+#include <dirent.h>
 
 static const char *hw_version = QEMU_HW_VERSION;
 
@@ -411,6 +412,53 @@  int qemu_close(int fd)
     return close(fd);
 }
 
+int qemu_close_range(unsigned int first, unsigned int last)
+{
+    DIR *dir = NULL;
+
+#ifdef CONFIG_CLOSE_RANGE
+    int r = close_range(first, last, 0);
+    if (!r) {
+        /* Success, no need to try other ways. */
+        return 0;
+    }
+#endif
+
+#ifdef __linux__
+    dir = opendir("/proc/self/fd");
+#endif
+    if (!dir) {
+        /*
+         * If /proc is not mounted or /proc/self/fd is not supported,
+         * try close() from first to last.
+         */
+        for (int i = first; i <= last; i++) {
+            close(i);
+        }
+
+        return 0;
+    }
+
+#ifndef _WIN32
+    /* Avoid closing the directory */
+    int dfd = dirfd(dir);
+
+    for (struct dirent *de = readdir(dir); de; de = readdir(dir)) {
+        int fd = atoi(de->d_name);
+        if (fd < first || fd > last) {
+            /* Exclude the fds outside the target range */
+            continue;
+        }
+        if (fd != dfd) {
+            close(fd);
+        }
+    }
+    closedir(dir);
+#endif /* _WIN32 */
+
+    return 0;
+}
+
 /*
  * Delete a file from the filesystem, unless the filename is /dev/fdset/...
  *