Message ID | 20230617053621.50359-5-bmeng@tinylab.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | net/tap: Fix QEMU frozen issue when the maximum number of file descriptors is very large | expand |
On Sat, 17 Jun 2023 13:36:19 +0800 Bin Meng <bmeng@tinylab.org> wrote: > This introduces a new QEMU API qemu_close_range() that closes all > open file descriptors from first to last (included). > > This API will try a more efficient call to close_range(), or walk > through of /proc/self/fd whenever these are possible, otherwise it > falls back to a plain close loop. > > Co-developed-by: Zhangjin Wu <falcon@tinylab.org> > Signed-off-by: Bin Meng <bmeng@tinylab.org> > > --- > > Changes in v3: > - fix win32 build failure > > Changes in v2: > - new patch: "util/osdep: Introduce qemu_close_range()" > > include/qemu/osdep.h | 1 + > util/osdep.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 49 insertions(+) > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index cc61b00ba9..e22434ce10 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...); > int qemu_open(const char *name, int flags, Error **errp); > int qemu_create(const char *name, int flags, mode_t mode, Error **errp); > int qemu_close(int fd); > +int qemu_close_range(unsigned int first, unsigned int last); > int qemu_unlink(const char *name); > #ifndef _WIN32 > int qemu_dup_flags(int fd, int flags); > diff --git a/util/osdep.c b/util/osdep.c > index e996c4744a..91275e70f8 100644 > --- a/util/osdep.c > +++ b/util/osdep.c > @@ -30,6 +30,7 @@ > #include "qemu/mprotect.h" > #include "qemu/hw-version.h" > #include "monitor/monitor.h" > +#include <dirent.h> > > static const char *hw_version = QEMU_HW_VERSION; > > @@ -411,6 +412,53 @@ int qemu_close(int fd) > return close(fd); > } > > +int qemu_close_range(unsigned int first, unsigned int last) > +{ > + DIR *dir = NULL; > + > +#ifdef CONFIG_CLOSE_RANGE > + int r = close_range(first, last, 0); > + if (!r) { > + /* Success, no need to try other ways. */ > + return 0; > + } > +#endif > + > +#ifdef __linux__ > + dir = opendir("/proc/self/fd"); > +#endif > + if (!dir) { > + /* > + * If /proc is not mounted or /proc/self/fd is not supported, > + * try close() from first to last. > + */ > + for (int i = first; i <= last; i++) { > + close(i); will this compile on windows? > + } > + > + return 0; > + } > + > +#ifndef _WIN32 > + /* Avoid closing the directory */ > + int dfd = dirfd(dir); > + > + for (struct dirent *de = readdir(dir); de; de = readdir(dir)) { > + int fd = atoi(de->d_name); > + if (fd < first || fd > last) { > + /* Exclude the fds outside the target range */ > + continue; > + } > + if (fd != dfd) { > + close(fd); > + } > + } > + closedir(dir); > +#endif /* _WIN32 */ > + > + return 0; > +} > + > /* > * Delete a file from the filesystem, unless the filename is /dev/fdset/... > *
Bin Meng <bmeng@tinylab.org> writes: > This introduces a new QEMU API qemu_close_range() that closes all > open file descriptors from first to last (included). > > This API will try a more efficient call to close_range(), or walk > through of /proc/self/fd whenever these are possible, otherwise it > falls back to a plain close loop. > > Co-developed-by: Zhangjin Wu <falcon@tinylab.org> > Signed-off-by: Bin Meng <bmeng@tinylab.org> > > --- > > Changes in v3: > - fix win32 build failure > > Changes in v2: > - new patch: "util/osdep: Introduce qemu_close_range()" > > include/qemu/osdep.h | 1 + > util/osdep.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 49 insertions(+) > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index cc61b00ba9..e22434ce10 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...); > int qemu_open(const char *name, int flags, Error **errp); > int qemu_create(const char *name, int flags, mode_t mode, Error **errp); > int qemu_close(int fd); > +int qemu_close_range(unsigned int first, unsigned int last); > int qemu_unlink(const char *name); > #ifndef _WIN32 > int qemu_dup_flags(int fd, int flags); > diff --git a/util/osdep.c b/util/osdep.c > index e996c4744a..91275e70f8 100644 > --- a/util/osdep.c > +++ b/util/osdep.c > @@ -30,6 +30,7 @@ > #include "qemu/mprotect.h" > #include "qemu/hw-version.h" > #include "monitor/monitor.h" > +#include <dirent.h> > > static const char *hw_version = QEMU_HW_VERSION; > > @@ -411,6 +412,53 @@ int qemu_close(int fd) > return close(fd); > } > > +int qemu_close_range(unsigned int first, unsigned int last) > +{ > + DIR *dir = NULL; > + > +#ifdef CONFIG_CLOSE_RANGE > + int r = close_range(first, last, 0); close_range(2) explains flag CLOSE_RANGE_UNSHARE Unshare the specified file descriptors from any other processes before closing them, avoiding races with other threads sharing the file descriptor table. Can anybody explain the races this avoids? > + if (!r) { > + /* Success, no need to try other ways. */ > + return 0; > + } What are the failure modes of close_range() where the other ways are worth trying? > +#endif > + > +#ifdef __linux__ > + dir = opendir("/proc/self/fd"); > +#endif > + if (!dir) { > + /* > + * If /proc is not mounted or /proc/self/fd is not supported, > + * try close() from first to last. > + */ > + for (int i = first; i <= last; i++) { > + close(i); > + } > + > + return 0; > + } > + > +#ifndef _WIN32 > + /* Avoid closing the directory */ > + int dfd = dirfd(dir); This directory contains "." "..", "0", "1", ... > + > + for (struct dirent *de = readdir(dir); de; de = readdir(dir)) { > + int fd = atoi(de->d_name); Maps "." and ".." to 0. Unclean. Please use qemu_strtoi(de->d_name, NULL, 10, &fd), and skip entries where it fails. > + if (fd < first || fd > last) { > + /* Exclude the fds outside the target range */ > + continue; > + } > + if (fd != dfd) { > + close(fd); > + } > + } > + closedir(dir); > +#endif /* _WIN32 */ > + > + return 0; > +} I'd prefer to order this from most to least preferred: close_range() iterate over /proc/self/fd iterate from first to last Unlike close_range(), qemu_close_range() returns 0 when last < first. If we want to deviate from close_range(), we better document the differences. This is a generalized version of async-teardown.c's close_all_open_fd(). I'd mention this in the commit message. Suggestion, not demand. > + > /* > * Delete a file from the filesystem, unless the filename is /dev/fdset/... > *
On 2023/6/19 17:18:53, "Claudio Imbrenda" <imbrenda@linux.ibm.com> wrote: >On Sat, 17 Jun 2023 13:36:19 +0800 >Bin Meng <bmeng@tinylab.org> wrote: > >> This introduces a new QEMU API qemu_close_range() that closes all >> open file descriptors from first to last (included). >> >> This API will try a more efficient call to close_range(), or walk >> through of /proc/self/fd whenever these are possible, otherwise it >> falls back to a plain close loop. >> >> Co-developed-by: Zhangjin Wu <falcon@tinylab.org> >> Signed-off-by: Bin Meng <bmeng@tinylab.org> >> >> --- >> >> Changes in v3: >> - fix win32 build failure >> >> Changes in v2: >> - new patch: "util/osdep: Introduce qemu_close_range()" >> >> include/qemu/osdep.h | 1 + >> util/osdep.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 49 insertions(+) >> >> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h >> index cc61b00ba9..e22434ce10 100644 >> --- a/include/qemu/osdep.h >> +++ b/include/qemu/osdep.h >> @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...); >> int qemu_open(const char *name, int flags, Error **errp); >> int qemu_create(const char *name, int flags, mode_t mode, Error **errp); >> int qemu_close(int fd); >> +int qemu_close_range(unsigned int first, unsigned int last); >> int qemu_unlink(const char *name); >> #ifndef _WIN32 >> int qemu_dup_flags(int fd, int flags); >> diff --git a/util/osdep.c b/util/osdep.c >> index e996c4744a..91275e70f8 100644 >> --- a/util/osdep.c >> +++ b/util/osdep.c >> @@ -30,6 +30,7 @@ >> #include "qemu/mprotect.h" >> #include "qemu/hw-version.h" >> #include "monitor/monitor.h" >> +#include <dirent.h> >> >> static const char *hw_version = QEMU_HW_VERSION; >> >> @@ -411,6 +412,53 @@ int qemu_close(int fd) >> return close(fd); >> } >> >> +int qemu_close_range(unsigned int first, unsigned int last) >> +{ >> + DIR *dir = NULL; >> + >> +#ifdef CONFIG_CLOSE_RANGE >> + int r = close_range(first, last, 0); >> + if (!r) { >> + /* Success, no need to try other ways. */ >> + return 0; >> + } >> +#endif >> + >> +#ifdef __linux__ >> + dir = opendir("/proc/self/fd"); >> +#endif >> + if (!dir) { >> + /* >> + * If /proc is not mounted or /proc/self/fd is not supported, >> + * try close() from first to last. >> + */ >> + for (int i = first; i <= last; i++) { >> + close(i); > >will this compile on windows? Yes, it will. Regards, Bin
On 2023/6/19 17:22:53, "Markus Armbruster" <armbru@redhat.com> wrote: >Bin Meng <bmeng@tinylab.org> writes: > >> This introduces a new QEMU API qemu_close_range() that closes all >> open file descriptors from first to last (included). >> >> This API will try a more efficient call to close_range(), or walk >> through of /proc/self/fd whenever these are possible, otherwise it >> falls back to a plain close loop. >> >> Co-developed-by: Zhangjin Wu <falcon@tinylab.org> >> Signed-off-by: Bin Meng <bmeng@tinylab.org> >> >> --- >> >> Changes in v3: >> - fix win32 build failure >> >> Changes in v2: >> - new patch: "util/osdep: Introduce qemu_close_range()" >> >> include/qemu/osdep.h | 1 + >> util/osdep.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 49 insertions(+) >> >> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h >> index cc61b00ba9..e22434ce10 100644 >> --- a/include/qemu/osdep.h >> +++ b/include/qemu/osdep.h >> @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...); >> int qemu_open(const char *name, int flags, Error **errp); >> int qemu_create(const char *name, int flags, mode_t mode, Error **errp); >> int qemu_close(int fd); >> +int qemu_close_range(unsigned int first, unsigned int last); >> int qemu_unlink(const char *name); >> #ifndef _WIN32 >> int qemu_dup_flags(int fd, int flags); >> diff --git a/util/osdep.c b/util/osdep.c >> index e996c4744a..91275e70f8 100644 >> --- a/util/osdep.c >> +++ b/util/osdep.c >> @@ -30,6 +30,7 @@ >> #include "qemu/mprotect.h" >> #include "qemu/hw-version.h" >> #include "monitor/monitor.h" >> +#include <dirent.h> >> >> static const char *hw_version = QEMU_HW_VERSION; >> >> @@ -411,6 +412,53 @@ int qemu_close(int fd) >> return close(fd); >> } >> >> +int qemu_close_range(unsigned int first, unsigned int last) >> +{ >> + DIR *dir = NULL; >> + >> +#ifdef CONFIG_CLOSE_RANGE >> + int r = close_range(first, last, 0); > >close_range(2) explains flag > > CLOSE_RANGE_UNSHARE > Unshare the specified file descriptors from any other processes > before closing them, avoiding races with other threads sharing > the file descriptor table. > >Can anybody explain the races this avoids? The kernel commit [1] which adds the close_range syscall says: unshare(CLONE_FILES) should only be needed if the calling task is multi-threaded and shares the file descriptor table with another thread in which case two threads could race with one thread allocating file descriptors and the other one closing them via close_range(). > > >> + if (!r) { >> + /* Success, no need to try other ways. */ >> + return 0; >> + } > >What are the failure modes of close_range() where the other ways are >worth trying? Added first < last check in v4 so that the technically close_range() should not fail. > > >> +#endif >> + >> +#ifdef __linux__ >> + dir = opendir("/proc/self/fd"); >> +#endif >> + if (!dir) { >> + /* >> + * If /proc is not mounted or /proc/self/fd is not supported, >> + * try close() from first to last. >> + */ >> + for (int i = first; i <= last; i++) { >> + close(i); >> + } >> + >> + return 0; >> + } >> + >> +#ifndef _WIN32 >> + /* Avoid closing the directory */ >> + int dfd = dirfd(dir); > >This directory contains "." "..", "0", "1", ... > >> + >> + for (struct dirent *de = readdir(dir); de; de = readdir(dir)) { >> + int fd = atoi(de->d_name); > >Maps "." and ".." to 0. Unclean. > >Please use qemu_strtoi(de->d_name, NULL, 10, &fd), and skip entries >where it fails. Fixed in v4. > > >> + if (fd < first || fd > last) { >> + /* Exclude the fds outside the target range */ >> + continue; >> + } >> + if (fd != dfd) { >> + close(fd); >> + } >> + } >> + closedir(dir); >> +#endif /* _WIN32 */ >> + >> + return 0; >> +} > >I'd prefer to order this from most to least preferred: > > close_range() > iterate over /proc/self/fd > iterate from first to last > >Unlike close_range(), qemu_close_range() returns 0 when last < first. >If we want to deviate from close_range(), we better document the >differences. > >This is a generalized version of async-teardown.c's close_all_open_fd(). >I'd mention this in the commit message. Suggestion, not demand. [1] https://github.com/torvalds/linux/commit/278a5fbaed89dacd04e9d052f4594ffd0e0585de Regards, Bin
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index cc61b00ba9..e22434ce10 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -560,6 +560,7 @@ int qemu_open_old(const char *name, int flags, ...); int qemu_open(const char *name, int flags, Error **errp); int qemu_create(const char *name, int flags, mode_t mode, Error **errp); int qemu_close(int fd); +int qemu_close_range(unsigned int first, unsigned int last); int qemu_unlink(const char *name); #ifndef _WIN32 int qemu_dup_flags(int fd, int flags); diff --git a/util/osdep.c b/util/osdep.c index e996c4744a..91275e70f8 100644 --- a/util/osdep.c +++ b/util/osdep.c @@ -30,6 +30,7 @@ #include "qemu/mprotect.h" #include "qemu/hw-version.h" #include "monitor/monitor.h" +#include <dirent.h> static const char *hw_version = QEMU_HW_VERSION; @@ -411,6 +412,53 @@ int qemu_close(int fd) return close(fd); } +int qemu_close_range(unsigned int first, unsigned int last) +{ + DIR *dir = NULL; + +#ifdef CONFIG_CLOSE_RANGE + int r = close_range(first, last, 0); + if (!r) { + /* Success, no need to try other ways. */ + return 0; + } +#endif + +#ifdef __linux__ + dir = opendir("/proc/self/fd"); +#endif + if (!dir) { + /* + * If /proc is not mounted or /proc/self/fd is not supported, + * try close() from first to last. + */ + for (int i = first; i <= last; i++) { + close(i); + } + + return 0; + } + +#ifndef _WIN32 + /* Avoid closing the directory */ + int dfd = dirfd(dir); + + for (struct dirent *de = readdir(dir); de; de = readdir(dir)) { + int fd = atoi(de->d_name); + if (fd < first || fd > last) { + /* Exclude the fds outside the target range */ + continue; + } + if (fd != dfd) { + close(fd); + } + } + closedir(dir); +#endif /* _WIN32 */ + + return 0; +} + /* * Delete a file from the filesystem, unless the filename is /dev/fdset/... *
This introduces a new QEMU API qemu_close_range() that closes all open file descriptors from first to last (included). This API will try a more efficient call to close_range(), or walk through of /proc/self/fd whenever these are possible, otherwise it falls back to a plain close loop. Co-developed-by: Zhangjin Wu <falcon@tinylab.org> Signed-off-by: Bin Meng <bmeng@tinylab.org> --- Changes in v3: - fix win32 build failure Changes in v2: - new patch: "util/osdep: Introduce qemu_close_range()" include/qemu/osdep.h | 1 + util/osdep.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+)