diff mbox series

[v4,04/16] hw/9pfs: Implement Windows specific xxxdir() APIs

Message ID 20230130095202.2773186-5-bin.meng@windriver.com (mailing list archive)
State New, archived
Headers show
Series hw/9pfs: Add 9pfs support for Windows | expand

Commit Message

Bin Meng Jan. 30, 2023, 9:51 a.m. UTC
From: Guohuai Shi <guohuai.shi@windriver.com>

This commit implements Windows specific xxxdir() APIs for safety
directory access.

Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
Signed-off-by: Bin Meng <bin.meng@windriver.com>
---

 hw/9pfs/9p-util.h       |   6 +
 hw/9pfs/9p-util-win32.c | 296 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 302 insertions(+)

Comments

Christian Schoenebeck Feb. 3, 2023, 12:24 p.m. UTC | #1
On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> From: Guohuai Shi <guohuai.shi@windriver.com>
> 
> This commit implements Windows specific xxxdir() APIs for safety
> directory access.
> 

This issue deserves a link to either the previous discussion

Link: https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/

and/or a link to this continuation of the discussion here, as it's not a
trivial issue, with pros and cons been discussed for the individual, possible
solutions.

> Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> Signed-off-by: Bin Meng <bin.meng@windriver.com>
> ---
> 
>  hw/9pfs/9p-util.h       |   6 +
>  hw/9pfs/9p-util-win32.c | 296 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 302 insertions(+)
> 
> diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h
> index 0f159fb4ce..c1c251fbd1 100644
> --- a/hw/9pfs/9p-util.h
> +++ b/hw/9pfs/9p-util.h
> @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char *pathname, int flags);
>  int statfs_win32(const char *root_path, struct statfs *stbuf);
>  int openat_dir(int dirfd, const char *name);
>  int openat_file(int dirfd, const char *name, int flags, mode_t mode);
> +DIR *opendir_win32(const char *full_file_name);
> +int closedir_win32(DIR *pDir);
> +struct dirent *readdir_win32(DIR *pDir);
> +void rewinddir_win32(DIR *pDir);
> +void seekdir_win32(DIR *pDir, long pos);
> +long telldir_win32(DIR *pDir);
>  #endif
>  
>  static inline void close_preserve_errno(int fd)
> diff --git a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c
> index a99d579a06..5503199300 100644
> --- a/hw/9pfs/9p-util-win32.c
> +++ b/hw/9pfs/9p-util-win32.c
> @@ -37,6 +37,13 @@
>   *    Windows does not support opendir, the directory fd is created by
>   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open will
>   *    lock and protect the directory (can not be modified or replaced)
> + *
> + * 5. Windows and MinGW does not provide safety directory accessing functions.
> + *    readdir(), seekdir() and telldir() may get or set wrong value because
> + *    directory entry data is not protected.

I would rephrase that sentence, as it doesn't cover the root problem
adequately. Maybe something like this:

5. Neither Windows native APIs, nor MinGW provide a POSIX compatible API for
acquiring directory entries in a safe way. Calling those APIs (native
_findfirst() and _findnext() or MinGW's readdir(), seekdir() and telldir())
directly can lead to an inconsistent state if directory is modified in
between, e.g. the same directory appearing more than once in output, or
directories not appearing at all in output even though they were neither newly
created nor deleted. POSIX does not define what happens with deleted or newly
created directories in between, but it guarantees a consistent state.

> + *
> + *    This file re-write POSIX directory accessing functions and cache all
> + *    directory entries during opening.
>   */
>  
>  #include "qemu/osdep.h"
> @@ -51,6 +58,27 @@
>  
>  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
>  
> +/*
> + * MinGW and Windows does not provide safety way to seek directory while other
> + * thread is modifying same directory.
> + *
> + * The two structures are used to cache all directory entries when opening it.
> + * Cached entries are always returned for read or seek.
> + */
> +struct dir_win32_entry {
> +    QSLIST_ENTRY(dir_win32_entry) node;
> +    struct _finddata_t dd_data;
> +};
> +
> +struct dir_win32 {
> +    struct dirent dd_dir;
> +    uint32_t offset;
> +    uint32_t total_entries;
> +    QSLIST_HEAD(, dir_win32_entry) head;
> +    struct dir_win32_entry *current;
> +    char dd_name[1];
> +};
> +
>  /*
>   * win32_error_to_posix - convert Win32 error to POSIX error number
>   *
> @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev)
>      errno = ENOTSUP;
>      return -1;
>  }
> +
> +/*
> + * opendir_win32 - open a directory
> + *
> + * This function opens a directory and caches all directory entries.
> + */
> +DIR *opendir_win32(const char *full_file_name)
> +{
> +    HANDLE hDir = INVALID_HANDLE_VALUE;
> +    DWORD attribute;
> +    intptr_t dd_handle = -1;
> +    struct _finddata_t dd_data;
> +
> +    struct dir_win32 *stream = NULL;
> +    struct dir_win32_entry *dir_entry;
> +    struct dir_win32_entry *prev;
> +    struct dir_win32_entry *next;
> +
> +    int err = 0;
> +    int find_status;
> +    uint32_t index;
> +
> +    /* open directory to prevent it being removed */
> +
> +    hDir = CreateFile(full_file_name, GENERIC_READ,
> +                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
> +                      NULL,
> +                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
> +
> +    if (hDir == INVALID_HANDLE_VALUE) {
> +        err = win32_error_to_posix(GetLastError());
> +        goto out;
> +    }
> +
> +    attribute = GetFileAttributes(full_file_name);
> +
> +    /* symlink is not allow */
> +    if (attribute == INVALID_FILE_ATTRIBUTES
> +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> +        err = EACCES;
> +        goto out;
> +    }
> +
> +    /* check if it is a directory */
> +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> +        err = ENOTDIR;
> +        goto out;
> +    }
> +
> +    /*
> +     * findfirst() need suffix format name like "\dir1\dir2\*", allocate more
> +     * buffer to store suffix.
> +     */
> +    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) + 3);
> +    QSLIST_INIT(&stream->head);
> +
> +    strcpy(stream->dd_name, full_file_name);
> +    strcat(stream->dd_name, "\\*");
> +
> +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> +
> +    if (dd_handle == -1) {
> +        err = errno;
> +        goto out;
> +    }
> +
> +    index = 0;
> +
> +    /* read all entries to link list */
> +    do {
> +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> +        if (index == 0) {
> +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> +        } else {
> +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> +        }
> +
> +        prev = dir_entry;
> +        find_status = _findnext(dd_handle, &dd_data);
> +
> +        index++;
> +    } while (find_status == 0);

So you decided to go for the solution that caches all entries of a directory
in RAM.

So don't you think my last suggested solution that would call native
_findfirst() and _findnext() directly, but without any chaching and instead
picking the relevent entry simply by inode number, might be a better candidate
as a starting point for landing Windows support? Link to that previous
suggestion:

https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/

> +
> +    if (errno == ENOENT) {
> +        /* No more matching files could be found, clean errno */
> +        errno = 0;
> +    } else {
> +        err = errno;
> +        goto out;
> +    }
> +
> +    stream->total_entries = index;
> +    stream->current = QSLIST_FIRST(&stream->head);
> +
> +out:
> +    if (err != 0) {
> +        errno = err;
> +        /* free whole list */
> +        if (stream != NULL) {
> +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> +                QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
> +                g_free(dir_entry);
> +            }
> +            g_free(stream);
> +            stream = NULL;
> +        }
> +    }
> +
> +    /* after cached all entries, this handle is useless */
> +    if (dd_handle != -1) {
> +        _findclose(dd_handle);
> +    }
> +
> +    if (hDir != INVALID_HANDLE_VALUE) {
> +        CloseHandle(hDir);
> +    }
> +
> +    return (DIR *)stream;
> +}
> +
> +/*
> + * closedir_win32 - close a directory
> + *
> + * This function closes directory and free all cached resources.
> + */
> +int closedir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +    struct dir_win32_entry *dir_entry;
> +    struct dir_win32_entry *next;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return -1;
> +    }
> +
> +    /* free all resources */
> +
> +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> +        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
> +        g_free(dir_entry);
> +    }
> +
> +    g_free(stream);
> +
> +    return 0;
> +}
> +
> +/*
> + * readdir_win32 - read a directory
> + *
> + * This function reads a directory entry from cached entry list.
> + */
> +struct dirent *readdir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return NULL;
> +    }
> +
> +    if (stream->offset >= stream->total_entries) {
> +        /* reach to the end, return NULL without set errno */
> +        return NULL;
> +    }
> +
> +    memcpy(stream->dd_dir.d_name,
> +           stream->current->dd_data.name,
> +           sizeof(stream->dd_dir.d_name));
> +
> +    /* Windows does not provide inode number */
> +    stream->dd_dir.d_ino = 0;
> +    stream->dd_dir.d_reclen = 0;
> +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> +
> +    stream->offset++;
> +    stream->current = QSLIST_NEXT(stream->current, node);
> +
> +    return &stream->dd_dir;
> +}
> +
> +/*
> + * rewinddir_win32 - reset directory stream
> + *
> + * This function resets the position of the directory stream to the
> + * beginning of the directory.
> + */
> +void rewinddir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return;
> +    }
> +
> +    stream->offset = 0;
> +    stream->current = QSLIST_FIRST(&stream->head);
> +
> +    return;
> +}
> +
> +/*
> + * seekdir_win32 - set the position of the next readdir() call in the directory
> + *
> + * This function sets the position of the next readdir() call in the directory
> + * from which the next readdir() call will start.
> + */
> +void seekdir_win32(DIR *pDir, long pos)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +    uint32_t index;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return;
> +    }
> +
> +    if (pos < -1) {
> +        errno = EINVAL;
> +        return;
> +    }
> +
> +    if (pos == -1 || pos >= (long)stream->total_entries) {
> +        /* seek to the end */
> +        stream->offset = stream->total_entries;
> +        return;
> +    }
> +
> +    if (pos - (long)stream->offset == 0) {
> +        /* no need to seek */
> +        return;
> +    }
> +
> +    /* seek position from list head */
> +
> +    stream->current = QSLIST_FIRST(&stream->head);
> +
> +    for (index = 0; index < (uint32_t)pos; index++) {
> +        stream->current = QSLIST_NEXT(stream->current, node);
> +    }
> +    stream->offset = index;
> +
> +    return;
> +}
> +
> +/*
> + * telldir_win32 - return current location in directory
> + *
> + * This function returns current location in directory.
> + */
> +long telldir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return -1;
> +    }
> +
> +    if (stream->offset > stream->total_entries) {
> +        return -1;
> +    }
> +
> +    return (long)stream->offset;
> +}
>
Shi, Guohuai Feb. 3, 2023, 1:34 p.m. UTC | #2
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Friday, February 3, 2023 20:25
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> <Bin.Meng@windriver.com>; Marc-André Lureau <marcandre.lureau@redhat.com>;
> Daniel P. Berrangé <berrange@redhat.com>
> Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> > From: Guohuai Shi <guohuai.shi@windriver.com>
> >
> > This commit implements Windows specific xxxdir() APIs for safety
> > directory access.
> >
> 
> This issue deserves a link to either the previous discussion
> 
> Link: https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/
> 
> and/or a link to this continuation of the discussion here, as it's not a
> trivial issue, with pros and cons been discussed for the individual, possible
> solutions.
> 
> > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > ---
> >
> >  hw/9pfs/9p-util.h       |   6 +
> >  hw/9pfs/9p-util-win32.c | 296
> > ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 302 insertions(+)
> >
> > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > 0f159fb4ce..c1c251fbd1 100644
> > --- a/hw/9pfs/9p-util.h
> > +++ b/hw/9pfs/9p-util.h
> > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > *pathname, int flags);  int statfs_win32(const char *root_path, struct
> > statfs *stbuf);  int openat_dir(int dirfd, const char *name);  int
> > openat_file(int dirfd, const char *name, int flags, mode_t mode);
> > +DIR *opendir_win32(const char *full_file_name); int
> > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR *pDir);
> > +void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR *pDir, long
> > +pos); long telldir_win32(DIR *pDir);
> >  #endif
> >
> >  static inline void close_preserve_errno(int fd) diff --git
> > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > a99d579a06..5503199300 100644
> > --- a/hw/9pfs/9p-util-win32.c
> > +++ b/hw/9pfs/9p-util-win32.c
> > @@ -37,6 +37,13 @@
> >   *    Windows does not support opendir, the directory fd is created by
> >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open
> will
> >   *    lock and protect the directory (can not be modified or replaced)
> > + *
> > + * 5. Windows and MinGW does not provide safety directory accessing
> functions.
> > + *    readdir(), seekdir() and telldir() may get or set wrong value
> because
> > + *    directory entry data is not protected.
> 
> I would rephrase that sentence, as it doesn't cover the root problem
> adequately. Maybe something like this:
> 
> 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible API for
> acquiring directory entries in a safe way. Calling those APIs (native
> _findfirst() and _findnext() or MinGW's readdir(), seekdir() and telldir())
> directly can lead to an inconsistent state if directory is modified in
> between, e.g. the same directory appearing more than once in output, or
> directories not appearing at all in output even though they were neither
> newly created nor deleted. POSIX does not define what happens with deleted or
> newly created directories in between, but it guarantees a consistent state.
> 
> > + *
> > + *    This file re-write POSIX directory accessing functions and cache all
> > + *    directory entries during opening.
> >   */
> >
> >  #include "qemu/osdep.h"
> > @@ -51,6 +58,27 @@
> >
> >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> >
> > +/*
> > + * MinGW and Windows does not provide safety way to seek directory
> > +while other
> > + * thread is modifying same directory.
> > + *
> > + * The two structures are used to cache all directory entries when opening
> it.
> > + * Cached entries are always returned for read or seek.
> > + */
> > +struct dir_win32_entry {
> > +    QSLIST_ENTRY(dir_win32_entry) node;
> > +    struct _finddata_t dd_data;
> > +};
> > +
> > +struct dir_win32 {
> > +    struct dirent dd_dir;
> > +    uint32_t offset;
> > +    uint32_t total_entries;
> > +    QSLIST_HEAD(, dir_win32_entry) head;
> > +    struct dir_win32_entry *current;
> > +    char dd_name[1];
> > +};
> > +
> >  /*
> >   * win32_error_to_posix - convert Win32 error to POSIX error number
> >   *
> > @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char *filename,
> mode_t mode, dev_t dev)
> >      errno = ENOTSUP;
> >      return -1;
> >  }
> > +
> > +/*
> > + * opendir_win32 - open a directory
> > + *
> > + * This function opens a directory and caches all directory entries.
> > + */
> > +DIR *opendir_win32(const char *full_file_name) {
> > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > +    DWORD attribute;
> > +    intptr_t dd_handle = -1;
> > +    struct _finddata_t dd_data;
> > +
> > +    struct dir_win32 *stream = NULL;
> > +    struct dir_win32_entry *dir_entry;
> > +    struct dir_win32_entry *prev;
> > +    struct dir_win32_entry *next;
> > +
> > +    int err = 0;
> > +    int find_status;
> > +    uint32_t index;
> > +
> > +    /* open directory to prevent it being removed */
> > +
> > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> FILE_SHARE_DELETE,
> > +                      NULL,
> > +                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS,
> > + NULL);
> > +
> > +    if (hDir == INVALID_HANDLE_VALUE) {
> > +        err = win32_error_to_posix(GetLastError());
> > +        goto out;
> > +    }
> > +
> > +    attribute = GetFileAttributes(full_file_name);
> > +
> > +    /* symlink is not allow */
> > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > +        err = EACCES;
> > +        goto out;
> > +    }
> > +
> > +    /* check if it is a directory */
> > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > +        err = ENOTDIR;
> > +        goto out;
> > +    }
> > +
> > +    /*
> > +     * findfirst() need suffix format name like "\dir1\dir2\*", allocate
> more
> > +     * buffer to store suffix.
> > +     */
> > +    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) +
> 3);
> > +    QSLIST_INIT(&stream->head);
> > +
> > +    strcpy(stream->dd_name, full_file_name);
> > +    strcat(stream->dd_name, "\\*");
> > +
> > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > +
> > +    if (dd_handle == -1) {
> > +        err = errno;
> > +        goto out;
> > +    }
> > +
> > +    index = 0;
> > +
> > +    /* read all entries to link list */
> > +    do {
> > +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> > +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> > +        if (index == 0) {
> > +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> > +        } else {
> > +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> > +        }
> > +
> > +        prev = dir_entry;
> > +        find_status = _findnext(dd_handle, &dd_data);
> > +
> > +        index++;
> > +    } while (find_status == 0);
> 
> So you decided to go for the solution that caches all entries of a directory
> in RAM.
> 
> So don't you think my last suggested solution that would call native
> _findfirst() and _findnext() directly, but without any chaching and instead
> picking the relevent entry simply by inode number, might be a better
> candidate as a starting point for landing Windows support? Link to that
> previous
> suggestion:
> 
> https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> 

I did a quick test for caching data without name entry, but it failed for reading + deleting directory on Windows host (like "rm -rf" for a directory).
The root cause is: Windows's directory entry is not cached.
If there is 100 files in a directory:

File1
File2
...
File100

When "rm -rf" is working:

It read first 10 entries, and remove them. 9pfs may seek and re-seek to offset 10 to read next 10 entries.
But Windows and MinGW does not provide rewinddir.
If we using findfirst() and findnext to seek to offset 10, then we will not get File11 but get File 21 (because we skipped 10 entries by seekdir()).

If we removed some entries in directory, inode number is useless because we can not find it again.


Thanks
Guohuai


> > +
> > +    if (errno == ENOENT) {
> > +        /* No more matching files could be found, clean errno */
> > +        errno = 0;
> > +    } else {
> > +        err = errno;
> > +        goto out;
> > +    }
> > +
> > +    stream->total_entries = index;
> > +    stream->current = QSLIST_FIRST(&stream->head);
> > +
> > +out:
> > +    if (err != 0) {
> > +        errno = err;
> > +        /* free whole list */
> > +        if (stream != NULL) {
> > +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > +                QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry,
> node);
> > +                g_free(dir_entry);
> > +            }
> > +            g_free(stream);
> > +            stream = NULL;
> > +        }
> > +    }
> > +
> > +    /* after cached all entries, this handle is useless */
> > +    if (dd_handle != -1) {
> > +        _findclose(dd_handle);
> > +    }
> > +
> > +    if (hDir != INVALID_HANDLE_VALUE) {
> > +        CloseHandle(hDir);
> > +    }
> > +
> > +    return (DIR *)stream;
> > +}
> > +
> > +/*
> > + * closedir_win32 - close a directory
> > + *
> > + * This function closes directory and free all cached resources.
> > + */
> > +int closedir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +    struct dir_win32_entry *dir_entry;
> > +    struct dir_win32_entry *next;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return -1;
> > +    }
> > +
> > +    /* free all resources */
> > +
> > +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > +        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
> > +        g_free(dir_entry);
> > +    }
> > +
> > +    g_free(stream);
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * readdir_win32 - read a directory
> > + *
> > + * This function reads a directory entry from cached entry list.
> > + */
> > +struct dirent *readdir_win32(DIR *pDir) {
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return NULL;
> > +    }
> > +
> > +    if (stream->offset >= stream->total_entries) {
> > +        /* reach to the end, return NULL without set errno */
> > +        return NULL;
> > +    }
> > +
> > +    memcpy(stream->dd_dir.d_name,
> > +           stream->current->dd_data.name,
> > +           sizeof(stream->dd_dir.d_name));
> > +
> > +    /* Windows does not provide inode number */
> > +    stream->dd_dir.d_ino = 0;
> > +    stream->dd_dir.d_reclen = 0;
> > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > +
> > +    stream->offset++;
> > +    stream->current = QSLIST_NEXT(stream->current, node);
> > +
> > +    return &stream->dd_dir;
> > +}
> > +
> > +/*
> > + * rewinddir_win32 - reset directory stream
> > + *
> > + * This function resets the position of the directory stream to the
> > + * beginning of the directory.
> > + */
> > +void rewinddir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return;
> > +    }
> > +
> > +    stream->offset = 0;
> > +    stream->current = QSLIST_FIRST(&stream->head);
> > +
> > +    return;
> > +}
> > +
> > +/*
> > + * seekdir_win32 - set the position of the next readdir() call in the
> > +directory
> > + *
> > + * This function sets the position of the next readdir() call in the
> > +directory
> > + * from which the next readdir() call will start.
> > + */
> > +void seekdir_win32(DIR *pDir, long pos) {
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +    uint32_t index;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return;
> > +    }
> > +
> > +    if (pos < -1) {
> > +        errno = EINVAL;
> > +        return;
> > +    }
> > +
> > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > +        /* seek to the end */
> > +        stream->offset = stream->total_entries;
> > +        return;
> > +    }
> > +
> > +    if (pos - (long)stream->offset == 0) {
> > +        /* no need to seek */
> > +        return;
> > +    }
> > +
> > +    /* seek position from list head */
> > +
> > +    stream->current = QSLIST_FIRST(&stream->head);
> > +
> > +    for (index = 0; index < (uint32_t)pos; index++) {
> > +        stream->current = QSLIST_NEXT(stream->current, node);
> > +    }
> > +    stream->offset = index;
> > +
> > +    return;
> > +}
> > +
> > +/*
> > + * telldir_win32 - return current location in directory
> > + *
> > + * This function returns current location in directory.
> > + */
> > +long telldir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return -1;
> > +    }
> > +
> > +    if (stream->offset > stream->total_entries) {
> > +        return -1;
> > +    }
> > +
> > +    return (long)stream->offset;
> > +}
> >
>
Christian Schoenebeck Feb. 3, 2023, 2:40 p.m. UTC | #3
On Friday, February 3, 2023 2:34:13 PM CET Shi, Guohuai wrote:
> 
> > -----Original Message-----
> > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > Sent: Friday, February 3, 2023 20:25
> > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > <Bin.Meng@windriver.com>; Marc-André Lureau <marcandre.lureau@redhat.com>;
> > Daniel P. Berrangé <berrange@redhat.com>
> > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> > APIs
> > 
> > CAUTION: This email comes from a non Wind River email account!
> > Do not click links or open attachments unless you recognize the sender and
> > know the content is safe.
> > 
> > On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > >
> > > This commit implements Windows specific xxxdir() APIs for safety
> > > directory access.
> > >
> > 
> > This issue deserves a link to either the previous discussion
> > 
> > Link: https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/
> > 
> > and/or a link to this continuation of the discussion here, as it's not a
> > trivial issue, with pros and cons been discussed for the individual, possible
> > solutions.
> > 
> > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > ---
> > >
> > >  hw/9pfs/9p-util.h       |   6 +
> > >  hw/9pfs/9p-util-win32.c | 296
> > > ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 302 insertions(+)
> > >
> > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > 0f159fb4ce..c1c251fbd1 100644
> > > --- a/hw/9pfs/9p-util.h
> > > +++ b/hw/9pfs/9p-util.h
> > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > *pathname, int flags);  int statfs_win32(const char *root_path, struct
> > > statfs *stbuf);  int openat_dir(int dirfd, const char *name);  int
> > > openat_file(int dirfd, const char *name, int flags, mode_t mode);
> > > +DIR *opendir_win32(const char *full_file_name); int
> > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR *pDir);
> > > +void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR *pDir, long
> > > +pos); long telldir_win32(DIR *pDir);
> > >  #endif
> > >
> > >  static inline void close_preserve_errno(int fd) diff --git
> > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > a99d579a06..5503199300 100644
> > > --- a/hw/9pfs/9p-util-win32.c
> > > +++ b/hw/9pfs/9p-util-win32.c
> > > @@ -37,6 +37,13 @@
> > >   *    Windows does not support opendir, the directory fd is created by
> > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open
> > will
> > >   *    lock and protect the directory (can not be modified or replaced)
> > > + *
> > > + * 5. Windows and MinGW does not provide safety directory accessing
> > functions.
> > > + *    readdir(), seekdir() and telldir() may get or set wrong value
> > because
> > > + *    directory entry data is not protected.
> > 
> > I would rephrase that sentence, as it doesn't cover the root problem
> > adequately. Maybe something like this:
> > 
> > 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible API for
> > acquiring directory entries in a safe way. Calling those APIs (native
> > _findfirst() and _findnext() or MinGW's readdir(), seekdir() and telldir())
> > directly can lead to an inconsistent state if directory is modified in
> > between, e.g. the same directory appearing more than once in output, or
> > directories not appearing at all in output even though they were neither
> > newly created nor deleted. POSIX does not define what happens with deleted or
> > newly created directories in between, but it guarantees a consistent state.
> > 
> > > + *
> > > + *    This file re-write POSIX directory accessing functions and cache all
> > > + *    directory entries during opening.
> > >   */
> > >
> > >  #include "qemu/osdep.h"
> > > @@ -51,6 +58,27 @@
> > >
> > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > >
> > > +/*
> > > + * MinGW and Windows does not provide safety way to seek directory
> > > +while other
> > > + * thread is modifying same directory.
> > > + *
> > > + * The two structures are used to cache all directory entries when opening
> > it.
> > > + * Cached entries are always returned for read or seek.
> > > + */
> > > +struct dir_win32_entry {
> > > +    QSLIST_ENTRY(dir_win32_entry) node;
> > > +    struct _finddata_t dd_data;
> > > +};
> > > +
> > > +struct dir_win32 {
> > > +    struct dirent dd_dir;
> > > +    uint32_t offset;
> > > +    uint32_t total_entries;
> > > +    QSLIST_HEAD(, dir_win32_entry) head;
> > > +    struct dir_win32_entry *current;
> > > +    char dd_name[1];
> > > +};
> > > +
> > >  /*
> > >   * win32_error_to_posix - convert Win32 error to POSIX error number
> > >   *
> > > @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char *filename,
> > mode_t mode, dev_t dev)
> > >      errno = ENOTSUP;
> > >      return -1;
> > >  }
> > > +
> > > +/*
> > > + * opendir_win32 - open a directory
> > > + *
> > > + * This function opens a directory and caches all directory entries.
> > > + */
> > > +DIR *opendir_win32(const char *full_file_name) {
> > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > +    DWORD attribute;
> > > +    intptr_t dd_handle = -1;
> > > +    struct _finddata_t dd_data;
> > > +
> > > +    struct dir_win32 *stream = NULL;
> > > +    struct dir_win32_entry *dir_entry;
> > > +    struct dir_win32_entry *prev;
> > > +    struct dir_win32_entry *next;
> > > +
> > > +    int err = 0;
> > > +    int find_status;
> > > +    uint32_t index;
> > > +
> > > +    /* open directory to prevent it being removed */
> > > +
> > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > FILE_SHARE_DELETE,
> > > +                      NULL,
> > > +                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS,
> > > + NULL);
> > > +
> > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > +        err = win32_error_to_posix(GetLastError());
> > > +        goto out;
> > > +    }
> > > +
> > > +    attribute = GetFileAttributes(full_file_name);
> > > +
> > > +    /* symlink is not allow */
> > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > +        err = EACCES;
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* check if it is a directory */
> > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > +        err = ENOTDIR;
> > > +        goto out;
> > > +    }
> > > +
> > > +    /*
> > > +     * findfirst() need suffix format name like "\dir1\dir2\*", allocate
> > more
> > > +     * buffer to store suffix.
> > > +     */
> > > +    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) +
> > 3);
> > > +    QSLIST_INIT(&stream->head);
> > > +
> > > +    strcpy(stream->dd_name, full_file_name);
> > > +    strcat(stream->dd_name, "\\*");
> > > +
> > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > +
> > > +    if (dd_handle == -1) {
> > > +        err = errno;
> > > +        goto out;
> > > +    }
> > > +
> > > +    index = 0;
> > > +
> > > +    /* read all entries to link list */
> > > +    do {
> > > +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> > > +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> > > +        if (index == 0) {
> > > +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> > > +        } else {
> > > +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> > > +        }
> > > +
> > > +        prev = dir_entry;
> > > +        find_status = _findnext(dd_handle, &dd_data);
> > > +
> > > +        index++;
> > > +    } while (find_status == 0);
> > 
> > So you decided to go for the solution that caches all entries of a directory
> > in RAM.
> > 
> > So don't you think my last suggested solution that would call native
> > _findfirst() and _findnext() directly, but without any chaching and instead
> > picking the relevent entry simply by inode number, might be a better
> > candidate as a starting point for landing Windows support? Link to that
> > previous
> > suggestion:
> > 
> > https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > 
> 
> I did a quick test for caching data without name entry, but it failed for reading + deleting directory on Windows host (like "rm -rf" for a directory).
> The root cause is: Windows's directory entry is not cached.
> If there is 100 files in a directory:
> 
> File1
> File2
> ...
> File100
> 
> When "rm -rf" is working:
> 
> It read first 10 entries, and remove them. 9pfs may seek and re-seek to offset 10 to read next 10 entries.
> But Windows and MinGW does not provide rewinddir.
> If we using findfirst() and findnext to seek to offset 10, then we will not get File11 but get File 21 (because we skipped 10 entries by seekdir()).

I assume you are referring to a simple solution like MinGW does, i.e. a
consecutive dense index (0,1,2,3,...n-1 where n is the current total amount of
directory entries). That would not work, yes. But that's not what I suggested.

With an inode number based lookup you would not seek to an incorrect entry ...

> If we removed some entries in directory, inode number is useless because we can not find it again.

You *can* recover from the previous inode number, even if any directory entry
has been deleted in the meantime: you would lookup the entry with the next
higher inode number.

Example, say initial directory state on host is:

name   inode-nr
aaa    8
bbb    3
ccc    4
ddd    2
eee    9

Say client is looking up exactly 2 entries, you would return to client in this
order (by inode-nr):

1. ddd
2. bbb

Now say "bbb" (a.k.a. previous) and "ccc" (a.k.a next) are removed. Directory
state on host is now:

name   inode-nr
aaa    8
ddd    2
eee    9

Subsequently the last directory entries are requested by client. Previous
inode number (stored in RAM) was 3, which no longer exists, so you lookup the
entry with the next higher inode number than 3, which is now 8 in this
example. Hence you would eventually return to client (in this order):

3. aaa
4. eee

> 
> 
> Thanks
> Guohuai
> 
> 
> > > +
> > > +    if (errno == ENOENT) {
> > > +        /* No more matching files could be found, clean errno */
> > > +        errno = 0;
> > > +    } else {
> > > +        err = errno;
> > > +        goto out;
> > > +    }
> > > +
> > > +    stream->total_entries = index;
> > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > +
> > > +out:
> > > +    if (err != 0) {
> > > +        errno = err;
> > > +        /* free whole list */
> > > +        if (stream != NULL) {
> > > +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > > +                QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry,
> > node);
> > > +                g_free(dir_entry);
> > > +            }
> > > +            g_free(stream);
> > > +            stream = NULL;
> > > +        }
> > > +    }
> > > +
> > > +    /* after cached all entries, this handle is useless */
> > > +    if (dd_handle != -1) {
> > > +        _findclose(dd_handle);
> > > +    }
> > > +
> > > +    if (hDir != INVALID_HANDLE_VALUE) {
> > > +        CloseHandle(hDir);
> > > +    }
> > > +
> > > +    return (DIR *)stream;
> > > +}
> > > +
> > > +/*
> > > + * closedir_win32 - close a directory
> > > + *
> > > + * This function closes directory and free all cached resources.
> > > + */
> > > +int closedir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +    struct dir_win32_entry *dir_entry;
> > > +    struct dir_win32_entry *next;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return -1;
> > > +    }
> > > +
> > > +    /* free all resources */
> > > +
> > > +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > > +        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
> > > +        g_free(dir_entry);
> > > +    }
> > > +
> > > +    g_free(stream);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +/*
> > > + * readdir_win32 - read a directory
> > > + *
> > > + * This function reads a directory entry from cached entry list.
> > > + */
> > > +struct dirent *readdir_win32(DIR *pDir) {
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return NULL;
> > > +    }
> > > +
> > > +    if (stream->offset >= stream->total_entries) {
> > > +        /* reach to the end, return NULL without set errno */
> > > +        return NULL;
> > > +    }
> > > +
> > > +    memcpy(stream->dd_dir.d_name,
> > > +           stream->current->dd_data.name,
> > > +           sizeof(stream->dd_dir.d_name));
> > > +
> > > +    /* Windows does not provide inode number */
> > > +    stream->dd_dir.d_ino = 0;
> > > +    stream->dd_dir.d_reclen = 0;
> > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > +
> > > +    stream->offset++;
> > > +    stream->current = QSLIST_NEXT(stream->current, node);
> > > +
> > > +    return &stream->dd_dir;
> > > +}
> > > +
> > > +/*
> > > + * rewinddir_win32 - reset directory stream
> > > + *
> > > + * This function resets the position of the directory stream to the
> > > + * beginning of the directory.
> > > + */
> > > +void rewinddir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return;
> > > +    }
> > > +
> > > +    stream->offset = 0;
> > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > +
> > > +    return;
> > > +}
> > > +
> > > +/*
> > > + * seekdir_win32 - set the position of the next readdir() call in the
> > > +directory
> > > + *
> > > + * This function sets the position of the next readdir() call in the
> > > +directory
> > > + * from which the next readdir() call will start.
> > > + */
> > > +void seekdir_win32(DIR *pDir, long pos) {
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +    uint32_t index;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos < -1) {
> > > +        errno = EINVAL;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > +        /* seek to the end */
> > > +        stream->offset = stream->total_entries;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos - (long)stream->offset == 0) {
> > > +        /* no need to seek */
> > > +        return;
> > > +    }
> > > +
> > > +    /* seek position from list head */
> > > +
> > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > +
> > > +    for (index = 0; index < (uint32_t)pos; index++) {
> > > +        stream->current = QSLIST_NEXT(stream->current, node);
> > > +    }
> > > +    stream->offset = index;
> > > +
> > > +    return;
> > > +}
> > > +
> > > +/*
> > > + * telldir_win32 - return current location in directory
> > > + *
> > > + * This function returns current location in directory.
> > > + */
> > > +long telldir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (stream->offset > stream->total_entries) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    return (long)stream->offset;
> > > +}
> > >
> > 
> 
> 
>
Shi, Guohuai Feb. 3, 2023, 4:30 p.m. UTC | #4
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Friday, February 3, 2023 22:41
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Meng, Bin <Bin.Meng@windriver.com>; Marc-André Lureau
> <marcandre.lureau@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>; Shi,
> Guohuai <Guohuai.Shi@windriver.com>
> Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Friday, February 3, 2023 2:34:13 PM CET Shi, Guohuai wrote:
> >
> > > -----Original Message-----
> > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > Sent: Friday, February 3, 2023 20:25
> > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > > <Bin.Meng@windriver.com>; Marc-André Lureau
> > > <marcandre.lureau@redhat.com>; Daniel P. Berrangé
> > > <berrange@redhat.com>
> > > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific
> > > xxxdir() APIs
> > >
> > > CAUTION: This email comes from a non Wind River email account!
> > > Do not click links or open attachments unless you recognize the
> > > sender and know the content is safe.
> > >
> > > On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> > > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > > >
> > > > This commit implements Windows specific xxxdir() APIs for safety
> > > > directory access.
> > > >
> > >
> > > This issue deserves a link to either the previous discussion
> > >
> > > Link: https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/
> > >
> > > and/or a link to this continuation of the discussion here, as it's
> > > not a trivial issue, with pros and cons been discussed for the
> > > individual, possible solutions.
> > >
> > > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > > ---
> > > >
> > > >  hw/9pfs/9p-util.h       |   6 +
> > > >  hw/9pfs/9p-util-win32.c | 296
> > > > ++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 302 insertions(+)
> > > >
> > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > > 0f159fb4ce..c1c251fbd1 100644
> > > > --- a/hw/9pfs/9p-util.h
> > > > +++ b/hw/9pfs/9p-util.h
> > > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > > *pathname, int flags);  int statfs_win32(const char *root_path,
> > > > struct statfs *stbuf);  int openat_dir(int dirfd, const char
> > > > *name);  int openat_file(int dirfd, const char *name, int flags,
> > > > mode_t mode);
> > > > +DIR *opendir_win32(const char *full_file_name); int
> > > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR
> > > > +*pDir); void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR
> > > > +*pDir, long pos); long telldir_win32(DIR *pDir);
> > > >  #endif
> > > >
> > > >  static inline void close_preserve_errno(int fd) diff --git
> > > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > > a99d579a06..5503199300 100644
> > > > --- a/hw/9pfs/9p-util-win32.c
> > > > +++ b/hw/9pfs/9p-util-win32.c
> > > > @@ -37,6 +37,13 @@
> > > >   *    Windows does not support opendir, the directory fd is created by
> > > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd
> open
> > > will
> > > >   *    lock and protect the directory (can not be modified or replaced)
> > > > + *
> > > > + * 5. Windows and MinGW does not provide safety directory
> > > > + accessing
> > > functions.
> > > > + *    readdir(), seekdir() and telldir() may get or set wrong value
> > > because
> > > > + *    directory entry data is not protected.
> > >
> > > I would rephrase that sentence, as it doesn't cover the root problem
> > > adequately. Maybe something like this:
> > >
> > > 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible
> > > API for acquiring directory entries in a safe way. Calling those
> > > APIs (native
> > > _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > telldir()) directly can lead to an inconsistent state if directory
> > > is modified in between, e.g. the same directory appearing more than
> > > once in output, or directories not appearing at all in output even
> > > though they were neither newly created nor deleted. POSIX does not
> > > define what happens with deleted or newly created directories in between,
> but it guarantees a consistent state.
> > >
> > > > + *
> > > > + *    This file re-write POSIX directory accessing functions and cache
> all
> > > > + *    directory entries during opening.
> > > >   */
> > > >
> > > >  #include "qemu/osdep.h"
> > > > @@ -51,6 +58,27 @@
> > > >
> > > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > > >
> > > > +/*
> > > > + * MinGW and Windows does not provide safety way to seek
> > > > +directory while other
> > > > + * thread is modifying same directory.
> > > > + *
> > > > + * The two structures are used to cache all directory entries
> > > > +when opening
> > > it.
> > > > + * Cached entries are always returned for read or seek.
> > > > + */
> > > > +struct dir_win32_entry {
> > > > +    QSLIST_ENTRY(dir_win32_entry) node;
> > > > +    struct _finddata_t dd_data;
> > > > +};
> > > > +
> > > > +struct dir_win32 {
> > > > +    struct dirent dd_dir;
> > > > +    uint32_t offset;
> > > > +    uint32_t total_entries;
> > > > +    QSLIST_HEAD(, dir_win32_entry) head;
> > > > +    struct dir_win32_entry *current;
> > > > +    char dd_name[1];
> > > > +};
> > > > +
> > > >  /*
> > > >   * win32_error_to_posix - convert Win32 error to POSIX error number
> > > >   *
> > > > @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char
> > > > *filename,
> > > mode_t mode, dev_t dev)
> > > >      errno = ENOTSUP;
> > > >      return -1;
> > > >  }
> > > > +
> > > > +/*
> > > > + * opendir_win32 - open a directory
> > > > + *
> > > > + * This function opens a directory and caches all directory entries.
> > > > + */
> > > > +DIR *opendir_win32(const char *full_file_name) {
> > > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > > +    DWORD attribute;
> > > > +    intptr_t dd_handle = -1;
> > > > +    struct _finddata_t dd_data;
> > > > +
> > > > +    struct dir_win32 *stream = NULL;
> > > > +    struct dir_win32_entry *dir_entry;
> > > > +    struct dir_win32_entry *prev;
> > > > +    struct dir_win32_entry *next;
> > > > +
> > > > +    int err = 0;
> > > > +    int find_status;
> > > > +    uint32_t index;
> > > > +
> > > > +    /* open directory to prevent it being removed */
> > > > +
> > > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > > FILE_SHARE_DELETE,
> > > > +                      NULL,
> > > > +                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS,
> > > > + NULL);
> > > > +
> > > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > > +        err = win32_error_to_posix(GetLastError());
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    attribute = GetFileAttributes(full_file_name);
> > > > +
> > > > +    /* symlink is not allow */
> > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > +        err = EACCES;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /* check if it is a directory */
> > > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > > +        err = ENOTDIR;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /*
> > > > +     * findfirst() need suffix format name like "\dir1\dir2\*",
> > > > + allocate
> > > more
> > > > +     * buffer to store suffix.
> > > > +     */
> > > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > > + strlen(full_file_name) +
> > > 3);
> > > > +    QSLIST_INIT(&stream->head);
> > > > +
> > > > +    strcpy(stream->dd_name, full_file_name);
> > > > +    strcat(stream->dd_name, "\\*");
> > > > +
> > > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > > +
> > > > +    if (dd_handle == -1) {
> > > > +        err = errno;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    index = 0;
> > > > +
> > > > +    /* read all entries to link list */
> > > > +    do {
> > > > +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> > > > +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> > > > +        if (index == 0) {
> > > > +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> > > > +        } else {
> > > > +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> > > > +        }
> > > > +
> > > > +        prev = dir_entry;
> > > > +        find_status = _findnext(dd_handle, &dd_data);
> > > > +
> > > > +        index++;
> > > > +    } while (find_status == 0);
> > >
> > > So you decided to go for the solution that caches all entries of a
> > > directory in RAM.
> > >
> > > So don't you think my last suggested solution that would call native
> > > _findfirst() and _findnext() directly, but without any chaching and
> > > instead picking the relevent entry simply by inode number, might be
> > > a better candidate as a starting point for landing Windows support?
> > > Link to that previous
> > > suggestion:
> > >
> > > https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > >
> >
> > I did a quick test for caching data without name entry, but it failed for
> reading + deleting directory on Windows host (like "rm -rf" for a directory).
> > The root cause is: Windows's directory entry is not cached.
> > If there is 100 files in a directory:
> >
> > File1
> > File2
> > ...
> > File100
> >
> > When "rm -rf" is working:
> >
> > It read first 10 entries, and remove them. 9pfs may seek and re-seek to
> offset 10 to read next 10 entries.
> > But Windows and MinGW does not provide rewinddir.
> > If we using findfirst() and findnext to seek to offset 10, then we will not
> get File11 but get File 21 (because we skipped 10 entries by seekdir()).
> 
> I assume you are referring to a simple solution like MinGW does, i.e. a
> consecutive dense index (0,1,2,3,...n-1 where n is the current total amount
> of directory entries). That would not work, yes. But that's not what I
> suggested.
> 
> With an inode number based lookup you would not seek to an incorrect entry
> ...
> 
> > If we removed some entries in directory, inode number is useless because we
> can not find it again.
> 
> You *can* recover from the previous inode number, even if any directory entry
> has been deleted in the meantime: you would lookup the entry with the next
> higher inode number.
> 
> Example, say initial directory state on host is:
> 
> name   inode-nr
> aaa    8
> bbb    3
> ccc    4
> ddd    2
> eee    9
> 
> Say client is looking up exactly 2 entries, you would return to client in
> this order (by inode-nr):
> 
> 1. ddd
> 2. bbb
> 
> Now say "bbb" (a.k.a. previous) and "ccc" (a.k.a next) are removed. Directory
> state on host is now:
> 
> name   inode-nr
> aaa    8
> ddd    2
> eee    9
> 
> Subsequently the last directory entries are requested by client. Previous
> inode number (stored in RAM) was 3, which no longer exists, so you lookup the
> entry with the next higher inode number than 3, which is now 8 in this
> example. Hence you would eventually return to client (in this order):
> 
> 3. aaa
> 4. eee
> 

Yes, it can work by using inode number (called File ID on Windows host: https://learn.microsoft.com/en-us/windows/win32/api/winbase/ns-winbase-file_id_info).
However, Windows does not provide a function to get file information by file ID.
That means, for anytime of seeking directory, 9pfs need to do the following sequence work to locate a name entry:

1. findfirst
2. CreateFile to get file handle
3. GetFileInformationByHandleEx to get file ID (https://learn.microsoft.com/en-us/windows/win32/api/minwinbase/ne-minwinbase-file_info_by_handle_class)
4. Close file handle and return if the file ID is match
5. findnext
6. repeat to step #2

Windows does not short file name entry by file ID and the file ID is 128-bit integer.
When there are many entries in directory, seeking directory will cause a very bad performance.

So I think store all name entries would be better than store all file ID.


> >
> >
> > Thanks
> > Guohuai
> >
> >
> > > > +
> > > > +    if (errno == ENOENT) {
> > > > +        /* No more matching files could be found, clean errno */
> > > > +        errno = 0;
> > > > +    } else {
> > > > +        err = errno;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    stream->total_entries = index;
> > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > +
> > > > +out:
> > > > +    if (err != 0) {
> > > > +        errno = err;
> > > > +        /* free whole list */
> > > > +        if (stream != NULL) {
> > > > +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next)
> {
> > > > +                QSLIST_REMOVE(&stream->head, dir_entry,
> > > > +dir_win32_entry,
> > > node);
> > > > +                g_free(dir_entry);
> > > > +            }
> > > > +            g_free(stream);
> > > > +            stream = NULL;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    /* after cached all entries, this handle is useless */
> > > > +    if (dd_handle != -1) {
> > > > +        _findclose(dd_handle);
> > > > +    }
> > > > +
> > > > +    if (hDir != INVALID_HANDLE_VALUE) {
> > > > +        CloseHandle(hDir);
> > > > +    }
> > > > +
> > > > +    return (DIR *)stream;
> > > > +}
> > > > +
> > > > +/*
> > > > + * closedir_win32 - close a directory
> > > > + *
> > > > + * This function closes directory and free all cached resources.
> > > > + */
> > > > +int closedir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +    struct dir_win32_entry *dir_entry;
> > > > +    struct dir_win32_entry *next;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    /* free all resources */
> > > > +
> > > > +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > > > +        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry,
> node);
> > > > +        g_free(dir_entry);
> > > > +    }
> > > > +
> > > > +    g_free(stream);
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * readdir_win32 - read a directory
> > > > + *
> > > > + * This function reads a directory entry from cached entry list.
> > > > + */
> > > > +struct dirent *readdir_win32(DIR *pDir) {
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +    if (stream->offset >= stream->total_entries) {
> > > > +        /* reach to the end, return NULL without set errno */
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +    memcpy(stream->dd_dir.d_name,
> > > > +           stream->current->dd_data.name,
> > > > +           sizeof(stream->dd_dir.d_name));
> > > > +
> > > > +    /* Windows does not provide inode number */
> > > > +    stream->dd_dir.d_ino = 0;
> > > > +    stream->dd_dir.d_reclen = 0;
> > > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > > +
> > > > +    stream->offset++;
> > > > +    stream->current = QSLIST_NEXT(stream->current, node);
> > > > +
> > > > +    return &stream->dd_dir;
> > > > +}
> > > > +
> > > > +/*
> > > > + * rewinddir_win32 - reset directory stream
> > > > + *
> > > > + * This function resets the position of the directory stream to
> > > > +the
> > > > + * beginning of the directory.
> > > > + */
> > > > +void rewinddir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    stream->offset = 0;
> > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > > +/*
> > > > + * seekdir_win32 - set the position of the next readdir() call in
> > > > +the directory
> > > > + *
> > > > + * This function sets the position of the next readdir() call in
> > > > +the directory
> > > > + * from which the next readdir() call will start.
> > > > + */
> > > > +void seekdir_win32(DIR *pDir, long pos) {
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +    uint32_t index;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos < -1) {
> > > > +        errno = EINVAL;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > > +        /* seek to the end */
> > > > +        stream->offset = stream->total_entries;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos - (long)stream->offset == 0) {
> > > > +        /* no need to seek */
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    /* seek position from list head */
> > > > +
> > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > +
> > > > +    for (index = 0; index < (uint32_t)pos; index++) {
> > > > +        stream->current = QSLIST_NEXT(stream->current, node);
> > > > +    }
> > > > +    stream->offset = index;
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > > +/*
> > > > + * telldir_win32 - return current location in directory
> > > > + *
> > > > + * This function returns current location in directory.
> > > > + */
> > > > +long telldir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (stream->offset > stream->total_entries) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    return (long)stream->offset;
> > > > +}
> > > >
> > >
> >
> >
> >
> 
>
Christian Schoenebeck Feb. 3, 2023, 5:55 p.m. UTC | #5
On Friday, February 3, 2023 5:30:35 PM CET Shi, Guohuai wrote:
> 
> > -----Original Message-----
> > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > Sent: Friday, February 3, 2023 22:41
> > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > Cc: Meng, Bin <Bin.Meng@windriver.com>; Marc-André Lureau
> > <marcandre.lureau@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>; Shi,
> > Guohuai <Guohuai.Shi@windriver.com>
> > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> > APIs
> > 
> > CAUTION: This email comes from a non Wind River email account!
> > Do not click links or open attachments unless you recognize the sender and
> > know the content is safe.
> > 
> > On Friday, February 3, 2023 2:34:13 PM CET Shi, Guohuai wrote:
> > >
> > > > -----Original Message-----
> > > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > Sent: Friday, February 3, 2023 20:25
> > > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > > > <Bin.Meng@windriver.com>; Marc-André Lureau
> > > > <marcandre.lureau@redhat.com>; Daniel P. Berrangé
> > > > <berrange@redhat.com>
> > > > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific
> > > > xxxdir() APIs
> > > >
> > > > CAUTION: This email comes from a non Wind River email account!
> > > > Do not click links or open attachments unless you recognize the
> > > > sender and know the content is safe.
> > > >
> > > > On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> > > > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > > > >
> > > > > This commit implements Windows specific xxxdir() APIs for safety
> > > > > directory access.
> > > > >
> > > >
> > > > This issue deserves a link to either the previous discussion
> > > >
> > > > Link: https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/
> > > >
> > > > and/or a link to this continuation of the discussion here, as it's
> > > > not a trivial issue, with pros and cons been discussed for the
> > > > individual, possible solutions.
> > > >
> > > > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > > > ---
> > > > >
> > > > >  hw/9pfs/9p-util.h       |   6 +
> > > > >  hw/9pfs/9p-util-win32.c | 296
> > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 302 insertions(+)
> > > > >
> > > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > > > 0f159fb4ce..c1c251fbd1 100644
> > > > > --- a/hw/9pfs/9p-util.h
> > > > > +++ b/hw/9pfs/9p-util.h
> > > > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > > > *pathname, int flags);  int statfs_win32(const char *root_path,
> > > > > struct statfs *stbuf);  int openat_dir(int dirfd, const char
> > > > > *name);  int openat_file(int dirfd, const char *name, int flags,
> > > > > mode_t mode);
> > > > > +DIR *opendir_win32(const char *full_file_name); int
> > > > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR
> > > > > +*pDir); void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR
> > > > > +*pDir, long pos); long telldir_win32(DIR *pDir);
> > > > >  #endif
> > > > >
> > > > >  static inline void close_preserve_errno(int fd) diff --git
> > > > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > > > a99d579a06..5503199300 100644
> > > > > --- a/hw/9pfs/9p-util-win32.c
> > > > > +++ b/hw/9pfs/9p-util-win32.c
> > > > > @@ -37,6 +37,13 @@
> > > > >   *    Windows does not support opendir, the directory fd is created by
> > > > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd
> > open
> > > > will
> > > > >   *    lock and protect the directory (can not be modified or replaced)
> > > > > + *
> > > > > + * 5. Windows and MinGW does not provide safety directory
> > > > > + accessing
> > > > functions.
> > > > > + *    readdir(), seekdir() and telldir() may get or set wrong value
> > > > because
> > > > > + *    directory entry data is not protected.
> > > >
> > > > I would rephrase that sentence, as it doesn't cover the root problem
> > > > adequately. Maybe something like this:
> > > >
> > > > 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible
> > > > API for acquiring directory entries in a safe way. Calling those
> > > > APIs (native
> > > > _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > > telldir()) directly can lead to an inconsistent state if directory
> > > > is modified in between, e.g. the same directory appearing more than
> > > > once in output, or directories not appearing at all in output even
> > > > though they were neither newly created nor deleted. POSIX does not
> > > > define what happens with deleted or newly created directories in between,
> > but it guarantees a consistent state.
> > > >
> > > > > + *
> > > > > + *    This file re-write POSIX directory accessing functions and cache
> > all
> > > > > + *    directory entries during opening.
> > > > >   */
> > > > >
> > > > >  #include "qemu/osdep.h"
> > > > > @@ -51,6 +58,27 @@
> > > > >
> > > > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > > > >
> > > > > +/*
> > > > > + * MinGW and Windows does not provide safety way to seek
> > > > > +directory while other
> > > > > + * thread is modifying same directory.
> > > > > + *
> > > > > + * The two structures are used to cache all directory entries
> > > > > +when opening
> > > > it.
> > > > > + * Cached entries are always returned for read or seek.
> > > > > + */
> > > > > +struct dir_win32_entry {
> > > > > +    QSLIST_ENTRY(dir_win32_entry) node;
> > > > > +    struct _finddata_t dd_data;
> > > > > +};
> > > > > +
> > > > > +struct dir_win32 {
> > > > > +    struct dirent dd_dir;
> > > > > +    uint32_t offset;
> > > > > +    uint32_t total_entries;
> > > > > +    QSLIST_HEAD(, dir_win32_entry) head;
> > > > > +    struct dir_win32_entry *current;
> > > > > +    char dd_name[1];
> > > > > +};
> > > > > +
> > > > >  /*
> > > > >   * win32_error_to_posix - convert Win32 error to POSIX error number
> > > > >   *
> > > > > @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char
> > > > > *filename,
> > > > mode_t mode, dev_t dev)
> > > > >      errno = ENOTSUP;
> > > > >      return -1;
> > > > >  }
> > > > > +
> > > > > +/*
> > > > > + * opendir_win32 - open a directory
> > > > > + *
> > > > > + * This function opens a directory and caches all directory entries.
> > > > > + */
> > > > > +DIR *opendir_win32(const char *full_file_name) {
> > > > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > > > +    DWORD attribute;
> > > > > +    intptr_t dd_handle = -1;
> > > > > +    struct _finddata_t dd_data;
> > > > > +
> > > > > +    struct dir_win32 *stream = NULL;
> > > > > +    struct dir_win32_entry *dir_entry;
> > > > > +    struct dir_win32_entry *prev;
> > > > > +    struct dir_win32_entry *next;
> > > > > +
> > > > > +    int err = 0;
> > > > > +    int find_status;
> > > > > +    uint32_t index;
> > > > > +
> > > > > +    /* open directory to prevent it being removed */
> > > > > +
> > > > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > > > FILE_SHARE_DELETE,
> > > > > +                      NULL,
> > > > > +                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS,
> > > > > + NULL);
> > > > > +
> > > > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > > > +        err = win32_error_to_posix(GetLastError());
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    attribute = GetFileAttributes(full_file_name);
> > > > > +
> > > > > +    /* symlink is not allow */
> > > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > > +        err = EACCES;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /* check if it is a directory */
> > > > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > > > +        err = ENOTDIR;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /*
> > > > > +     * findfirst() need suffix format name like "\dir1\dir2\*",
> > > > > + allocate
> > > > more
> > > > > +     * buffer to store suffix.
> > > > > +     */
> > > > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > > > + strlen(full_file_name) +
> > > > 3);
> > > > > +    QSLIST_INIT(&stream->head);
> > > > > +
> > > > > +    strcpy(stream->dd_name, full_file_name);
> > > > > +    strcat(stream->dd_name, "\\*");
> > > > > +
> > > > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > > > +
> > > > > +    if (dd_handle == -1) {
> > > > > +        err = errno;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    index = 0;
> > > > > +
> > > > > +    /* read all entries to link list */
> > > > > +    do {
> > > > > +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> > > > > +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> > > > > +        if (index == 0) {
> > > > > +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> > > > > +        } else {
> > > > > +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> > > > > +        }
> > > > > +
> > > > > +        prev = dir_entry;
> > > > > +        find_status = _findnext(dd_handle, &dd_data);
> > > > > +
> > > > > +        index++;
> > > > > +    } while (find_status == 0);
> > > >
> > > > So you decided to go for the solution that caches all entries of a
> > > > directory in RAM.
> > > >
> > > > So don't you think my last suggested solution that would call native
> > > > _findfirst() and _findnext() directly, but without any chaching and
> > > > instead picking the relevent entry simply by inode number, might be
> > > > a better candidate as a starting point for landing Windows support?
> > > > Link to that previous
> > > > suggestion:
> > > >
> > > > https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > > >
> > >
> > > I did a quick test for caching data without name entry, but it failed for
> > reading + deleting directory on Windows host (like "rm -rf" for a directory).
> > > The root cause is: Windows's directory entry is not cached.
> > > If there is 100 files in a directory:
> > >
> > > File1
> > > File2
> > > ...
> > > File100
> > >
> > > When "rm -rf" is working:
> > >
> > > It read first 10 entries, and remove them. 9pfs may seek and re-seek to
> > offset 10 to read next 10 entries.
> > > But Windows and MinGW does not provide rewinddir.
> > > If we using findfirst() and findnext to seek to offset 10, then we will not
> > get File11 but get File 21 (because we skipped 10 entries by seekdir()).
> > 
> > I assume you are referring to a simple solution like MinGW does, i.e. a
> > consecutive dense index (0,1,2,3,...n-1 where n is the current total amount
> > of directory entries). That would not work, yes. But that's not what I
> > suggested.
> > 
> > With an inode number based lookup you would not seek to an incorrect entry
> > ...
> > 
> > > If we removed some entries in directory, inode number is useless because we
> > can not find it again.
> > 
> > You *can* recover from the previous inode number, even if any directory entry
> > has been deleted in the meantime: you would lookup the entry with the next
> > higher inode number.
> > 
> > Example, say initial directory state on host is:
> > 
> > name   inode-nr
> > aaa    8
> > bbb    3
> > ccc    4
> > ddd    2
> > eee    9
> > 
> > Say client is looking up exactly 2 entries, you would return to client in
> > this order (by inode-nr):
> > 
> > 1. ddd
> > 2. bbb
> > 
> > Now say "bbb" (a.k.a. previous) and "ccc" (a.k.a next) are removed. Directory
> > state on host is now:
> > 
> > name   inode-nr
> > aaa    8
> > ddd    2
> > eee    9
> > 
> > Subsequently the last directory entries are requested by client. Previous
> > inode number (stored in RAM) was 3, which no longer exists, so you lookup the
> > entry with the next higher inode number than 3, which is now 8 in this
> > example. Hence you would eventually return to client (in this order):
> > 
> > 3. aaa
> > 4. eee
> > 
> 
> Yes, it can work by using inode number (called File ID on Windows host: https://learn.microsoft.com/en-us/windows/win32/api/winbase/ns-winbase-file_id_info).
> However, Windows does not provide a function to get file information by file ID.
> That means, for anytime of seeking directory, 9pfs need to do the following sequence work to locate a name entry:
> 
> 1. findfirst
> 2. CreateFile to get file handle
> 3. GetFileInformationByHandleEx to get file ID (https://learn.microsoft.com/en-us/windows/win32/api/minwinbase/ne-minwinbase-file_info_by_handle_class)
> 4. Close file handle and return if the file ID is match
> 5. findnext
> 6. repeat to step #2
> 
> Windows does not short file name entry by file ID and the file ID is 128-bit integer.
> When there are many entries in directory, seeking directory will cause a very bad performance.

I know, it's an n-square performance issue and what I already wrote in the
summary of the linked original suggestion [1] in v3 before, quote:

  + Relatively straight-forward to implement.

  + No (major) changes in 9pfs code base required.

  - Still n-square performance issue (neglectable to land Windows host support
    IMO).

  o Consistency assured for "most" cases, except one: if hardlinks are
    inserted in between then it might fail

[1] https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/

The idea was to use that just as a starting point to land Windows host support
ASAP, slower on large dirs compared to other solutions, yes, but with
guaranteed correct and deterministic behaviour. And then on the long run we
would of course replace that with a more performant solution.

I mean, this is really simple to implement, so I would at least test it. If it
really runs horribly slow we could still discuss faster solutions, which are
however all much more tricky.

> So I think store all name entries would be better than store all file ID.

As already discussed, NTFS allows up to (2^32 - 1) = 4,294,967,295 entries
per directory. So caching only one directory (entirely) in RAM can already
exceed the available RAM, which would crash QEMU. Multiplied by an expected
amount of directory lookups by client and we even get into much higher
categories, even with much smaller individual directory sizes.

> 
> 
> > >
> > >
> > > Thanks
> > > Guohuai
> > >
> > >
> > > > > +
> > > > > +    if (errno == ENOENT) {
> > > > > +        /* No more matching files could be found, clean errno */
> > > > > +        errno = 0;
> > > > > +    } else {
> > > > > +        err = errno;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    stream->total_entries = index;
> > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > +
> > > > > +out:
> > > > > +    if (err != 0) {
> > > > > +        errno = err;
> > > > > +        /* free whole list */
> > > > > +        if (stream != NULL) {
> > > > > +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next)
> > {
> > > > > +                QSLIST_REMOVE(&stream->head, dir_entry,
> > > > > +dir_win32_entry,
> > > > node);
> > > > > +                g_free(dir_entry);
> > > > > +            }
> > > > > +            g_free(stream);
> > > > > +            stream = NULL;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    /* after cached all entries, this handle is useless */
> > > > > +    if (dd_handle != -1) {
> > > > > +        _findclose(dd_handle);
> > > > > +    }
> > > > > +
> > > > > +    if (hDir != INVALID_HANDLE_VALUE) {
> > > > > +        CloseHandle(hDir);
> > > > > +    }
> > > > > +
> > > > > +    return (DIR *)stream;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * closedir_win32 - close a directory
> > > > > + *
> > > > > + * This function closes directory and free all cached resources.
> > > > > + */
> > > > > +int closedir_win32(DIR *pDir)
> > > > > +{
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +    struct dir_win32_entry *dir_entry;
> > > > > +    struct dir_win32_entry *next;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    /* free all resources */
> > > > > +
> > > > > +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
> > > > > +        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry,
> > node);
> > > > > +        g_free(dir_entry);
> > > > > +    }
> > > > > +
> > > > > +    g_free(stream);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * readdir_win32 - read a directory
> > > > > + *
> > > > > + * This function reads a directory entry from cached entry list.
> > > > > + */
> > > > > +struct dirent *readdir_win32(DIR *pDir) {
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return NULL;
> > > > > +    }
> > > > > +
> > > > > +    if (stream->offset >= stream->total_entries) {
> > > > > +        /* reach to the end, return NULL without set errno */
> > > > > +        return NULL;
> > > > > +    }
> > > > > +
> > > > > +    memcpy(stream->dd_dir.d_name,
> > > > > +           stream->current->dd_data.name,
> > > > > +           sizeof(stream->dd_dir.d_name));
> > > > > +
> > > > > +    /* Windows does not provide inode number */
> > > > > +    stream->dd_dir.d_ino = 0;
> > > > > +    stream->dd_dir.d_reclen = 0;
> > > > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > > > +
> > > > > +    stream->offset++;
> > > > > +    stream->current = QSLIST_NEXT(stream->current, node);
> > > > > +
> > > > > +    return &stream->dd_dir;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * rewinddir_win32 - reset directory stream
> > > > > + *
> > > > > + * This function resets the position of the directory stream to
> > > > > +the
> > > > > + * beginning of the directory.
> > > > > + */
> > > > > +void rewinddir_win32(DIR *pDir)
> > > > > +{
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    stream->offset = 0;
> > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * seekdir_win32 - set the position of the next readdir() call in
> > > > > +the directory
> > > > > + *
> > > > > + * This function sets the position of the next readdir() call in
> > > > > +the directory
> > > > > + * from which the next readdir() call will start.
> > > > > + */
> > > > > +void seekdir_win32(DIR *pDir, long pos) {
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +    uint32_t index;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos < -1) {
> > > > > +        errno = EINVAL;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > > > +        /* seek to the end */
> > > > > +        stream->offset = stream->total_entries;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos - (long)stream->offset == 0) {
> > > > > +        /* no need to seek */
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    /* seek position from list head */
> > > > > +
> > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > +
> > > > > +    for (index = 0; index < (uint32_t)pos; index++) {
> > > > > +        stream->current = QSLIST_NEXT(stream->current, node);
> > > > > +    }
> > > > > +    stream->offset = index;
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * telldir_win32 - return current location in directory
> > > > > + *
> > > > > + * This function returns current location in directory.
> > > > > + */
> > > > > +long telldir_win32(DIR *pDir)
> > > > > +{
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (stream->offset > stream->total_entries) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    return (long)stream->offset;
> > > > > +}
> > > > >
> > > >
> > >
> > >
> > >
> > 
> > 
> 
> 
>
Shi, Guohuai Feb. 6, 2023, 5:37 a.m. UTC | #6
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Saturday, February 4, 2023 01:55
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Meng, Bin <Bin.Meng@windriver.com>; Marc-André Lureau
> <marcandre.lureau@redhat.com>; Daniel P. Berrangé
> <berrange@redhat.com>; Shi, Guohuai <Guohuai.Shi@windriver.com>
> Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Friday, February 3, 2023 5:30:35 PM CET Shi, Guohuai wrote:
> >
> > > -----Original Message-----
> > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > Sent: Friday, February 3, 2023 22:41
> > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > Cc: Meng, Bin <Bin.Meng@windriver.com>; Marc-André Lureau
> > > <marcandre.lureau@redhat.com>; Daniel P. Berrangé
> > > <berrange@redhat.com>; Shi, Guohuai <Guohuai.Shi@windriver.com>
> > > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific
> > > xxxdir() APIs
> > >
> > > CAUTION: This email comes from a non Wind River email account!
> > > Do not click links or open attachments unless you recognize the
> > > sender and know the content is safe.
> > >
> > > On Friday, February 3, 2023 2:34:13 PM CET Shi, Guohuai wrote:
> > > >
> > > > > -----Original Message-----
> > > > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > > Sent: Friday, February 3, 2023 20:25
> > > > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > > > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > > > > <Bin.Meng@windriver.com>; Marc-André Lureau
> > > > > <marcandre.lureau@redhat.com>; Daniel P. Berrangé
> > > > > <berrange@redhat.com>
> > > > > Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows
> > > > > specific
> > > > > xxxdir() APIs
> > > > >
> > > > > CAUTION: This email comes from a non Wind River email account!
> > > > > Do not click links or open attachments unless you recognize the
> > > > > sender and know the content is safe.
> > > > >
> > > > > On Monday, January 30, 2023 10:51:50 AM CET Bin Meng wrote:
> > > > > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > > > > >
> > > > > > This commit implements Windows specific xxxdir() APIs for
> > > > > > safety directory access.
> > > > > >
> > > > >
> > > > > This issue deserves a link to either the previous discussion
> > > > >
> > > > > Link:
> > > > > https://lore.kernel.org/qemu-devel/2830993.GtbaR8S6b6@silver/
> > > > >
> > > > > and/or a link to this continuation of the discussion here, as
> > > > > it's not a trivial issue, with pros and cons been discussed for
> > > > > the individual, possible solutions.
> > > > >
> > > > > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > > > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > > > > ---
> > > > > >
> > > > > >  hw/9pfs/9p-util.h       |   6 +
> > > > > >  hw/9pfs/9p-util-win32.c | 296
> > > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > > >  2 files changed, 302 insertions(+)
> > > > > >
> > > > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > > > > 0f159fb4ce..c1c251fbd1 100644
> > > > > > --- a/hw/9pfs/9p-util.h
> > > > > > +++ b/hw/9pfs/9p-util.h
> > > > > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > > > > *pathname, int flags);  int statfs_win32(const char
> > > > > > *root_path, struct statfs *stbuf);  int openat_dir(int dirfd,
> > > > > > const char *name);  int openat_file(int dirfd, const char
> > > > > > *name, int flags, mode_t mode);
> > > > > > +DIR *opendir_win32(const char *full_file_name); int
> > > > > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR
> > > > > > +*pDir); void rewinddir_win32(DIR *pDir); void
> > > > > > +seekdir_win32(DIR *pDir, long pos); long telldir_win32(DIR
> > > > > > +*pDir);
> > > > > >  #endif
> > > > > >
> > > > > >  static inline void close_preserve_errno(int fd) diff --git
> > > > > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > > > > a99d579a06..5503199300 100644
> > > > > > --- a/hw/9pfs/9p-util-win32.c
> > > > > > +++ b/hw/9pfs/9p-util-win32.c
> > > > > > @@ -37,6 +37,13 @@
> > > > > >   *    Windows does not support opendir, the directory fd is created
> by
> > > > > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd
> > > open
> > > > > will
> > > > > >   *    lock and protect the directory (can not be modified or replaced)
> > > > > > + *
> > > > > > + * 5. Windows and MinGW does not provide safety directory
> > > > > > + accessing
> > > > > functions.
> > > > > > + *    readdir(), seekdir() and telldir() may get or set wrong value
> > > > > because
> > > > > > + *    directory entry data is not protected.
> > > > >
> > > > > I would rephrase that sentence, as it doesn't cover the root
> > > > > problem adequately. Maybe something like this:
> > > > >
> > > > > 5. Neither Windows native APIs, nor MinGW provide a POSIX
> > > > > compatible API for acquiring directory entries in a safe way.
> > > > > Calling those APIs (native
> > > > > _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > > > telldir()) directly can lead to an inconsistent state if
> > > > > directory is modified in between, e.g. the same directory
> > > > > appearing more than once in output, or directories not appearing
> > > > > at all in output even though they were neither newly created nor
> > > > > deleted. POSIX does not define what happens with deleted or
> > > > > newly created directories in between,
> > > but it guarantees a consistent state.
> > > > >
> > > > > > + *
> > > > > > + *    This file re-write POSIX directory accessing functions and cache
> > > all
> > > > > > + *    directory entries during opening.
> > > > > >   */
> > > > > >
> > > > > >  #include "qemu/osdep.h"
> > > > > > @@ -51,6 +58,27 @@
> > > > > >
> > > > > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > > > > >
> > > > > > +/*
> > > > > > + * MinGW and Windows does not provide safety way to seek
> > > > > > +directory while other
> > > > > > + * thread is modifying same directory.
> > > > > > + *
> > > > > > + * The two structures are used to cache all directory entries
> > > > > > +when opening
> > > > > it.
> > > > > > + * Cached entries are always returned for read or seek.
> > > > > > + */
> > > > > > +struct dir_win32_entry {
> > > > > > +    QSLIST_ENTRY(dir_win32_entry) node;
> > > > > > +    struct _finddata_t dd_data; };
> > > > > > +
> > > > > > +struct dir_win32 {
> > > > > > +    struct dirent dd_dir;
> > > > > > +    uint32_t offset;
> > > > > > +    uint32_t total_entries;
> > > > > > +    QSLIST_HEAD(, dir_win32_entry) head;
> > > > > > +    struct dir_win32_entry *current;
> > > > > > +    char dd_name[1];
> > > > > > +};
> > > > > > +
> > > > > >  /*
> > > > > >   * win32_error_to_posix - convert Win32 error to POSIX error
> number
> > > > > >   *
> > > > > > @@ -977,3 +1005,271 @@ int qemu_mknodat(int dirfd, const char
> > > > > > *filename,
> > > > > mode_t mode, dev_t dev)
> > > > > >      errno = ENOTSUP;
> > > > > >      return -1;
> > > > > >  }
> > > > > > +
> > > > > > +/*
> > > > > > + * opendir_win32 - open a directory
> > > > > > + *
> > > > > > + * This function opens a directory and caches all directory entries.
> > > > > > + */
> > > > > > +DIR *opendir_win32(const char *full_file_name) {
> > > > > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > > > > +    DWORD attribute;
> > > > > > +    intptr_t dd_handle = -1;
> > > > > > +    struct _finddata_t dd_data;
> > > > > > +
> > > > > > +    struct dir_win32 *stream = NULL;
> > > > > > +    struct dir_win32_entry *dir_entry;
> > > > > > +    struct dir_win32_entry *prev;
> > > > > > +    struct dir_win32_entry *next;
> > > > > > +
> > > > > > +    int err = 0;
> > > > > > +    int find_status;
> > > > > > +    uint32_t index;
> > > > > > +
> > > > > > +    /* open directory to prevent it being removed */
> > > > > > +
> > > > > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > > > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > > > > FILE_SHARE_DELETE,
> > > > > > +                      NULL,
> > > > > > +                      OPEN_EXISTING,
> > > > > > + FILE_FLAG_BACKUP_SEMANTICS, NULL);
> > > > > > +
> > > > > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > > > > +        err = win32_error_to_posix(GetLastError());
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    attribute = GetFileAttributes(full_file_name);
> > > > > > +
> > > > > > +    /* symlink is not allow */
> > > > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > > > +        err = EACCES;
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    /* check if it is a directory */
> > > > > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > > > > +        err = ENOTDIR;
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    /*
> > > > > > +     * findfirst() need suffix format name like
> > > > > > + "\dir1\dir2\*", allocate
> > > > > more
> > > > > > +     * buffer to store suffix.
> > > > > > +     */
> > > > > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > > > > + strlen(full_file_name) +
> > > > > 3);
> > > > > > +    QSLIST_INIT(&stream->head);
> > > > > > +
> > > > > > +    strcpy(stream->dd_name, full_file_name);
> > > > > > +    strcat(stream->dd_name, "\\*");
> > > > > > +
> > > > > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > > > > +
> > > > > > +    if (dd_handle == -1) {
> > > > > > +        err = errno;
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    index = 0;
> > > > > > +
> > > > > > +    /* read all entries to link list */
> > > > > > +    do {
> > > > > > +        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
> > > > > > +        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
> > > > > > +        if (index == 0) {
> > > > > > +            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
> > > > > > +        } else {
> > > > > > +            QSLIST_INSERT_AFTER(prev, dir_entry, node);
> > > > > > +        }
> > > > > > +
> > > > > > +        prev = dir_entry;
> > > > > > +        find_status = _findnext(dd_handle, &dd_data);
> > > > > > +
> > > > > > +        index++;
> > > > > > +    } while (find_status == 0);
> > > > >
> > > > > So you decided to go for the solution that caches all entries of
> > > > > a directory in RAM.
> > > > >
> > > > > So don't you think my last suggested solution that would call
> > > > > native
> > > > > _findfirst() and _findnext() directly, but without any chaching
> > > > > and instead picking the relevent entry simply by inode number,
> > > > > might be a better candidate as a starting point for landing Windows
> support?
> > > > > Link to that previous
> > > > > suggestion:
> > > > >
> > > > > https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > > > >
> > > >
> > > > I did a quick test for caching data without name entry, but it
> > > > failed for
> > > reading + deleting directory on Windows host (like "rm -rf" for a
> directory).
> > > > The root cause is: Windows's directory entry is not cached.
> > > > If there is 100 files in a directory:
> > > >
> > > > File1
> > > > File2
> > > > ...
> > > > File100
> > > >
> > > > When "rm -rf" is working:
> > > >
> > > > It read first 10 entries, and remove them. 9pfs may seek and
> > > > re-seek to
> > > offset 10 to read next 10 entries.
> > > > But Windows and MinGW does not provide rewinddir.
> > > > If we using findfirst() and findnext to seek to offset 10, then we
> > > > will not
> > > get File11 but get File 21 (because we skipped 10 entries by seekdir()).
> > >
> > > I assume you are referring to a simple solution like MinGW does,
> > > i.e. a consecutive dense index (0,1,2,3,...n-1 where n is the
> > > current total amount of directory entries). That would not work,
> > > yes. But that's not what I suggested.
> > >
> > > With an inode number based lookup you would not seek to an incorrect
> > > entry ...
> > >
> > > > If we removed some entries in directory, inode number is useless
> > > > because we
> > > can not find it again.
> > >
> > > You *can* recover from the previous inode number, even if any
> > > directory entry has been deleted in the meantime: you would lookup
> > > the entry with the next higher inode number.
> > >
> > > Example, say initial directory state on host is:
> > >
> > > name   inode-nr
> > > aaa    8
> > > bbb    3
> > > ccc    4
> > > ddd    2
> > > eee    9
> > >
> > > Say client is looking up exactly 2 entries, you would return to
> > > client in this order (by inode-nr):
> > >
> > > 1. ddd
> > > 2. bbb
> > >
> > > Now say "bbb" (a.k.a. previous) and "ccc" (a.k.a next) are removed.
> > > Directory state on host is now:
> > >
> > > name   inode-nr
> > > aaa    8
> > > ddd    2
> > > eee    9
> > >
> > > Subsequently the last directory entries are requested by client.
> > > Previous inode number (stored in RAM) was 3, which no longer exists,
> > > so you lookup the entry with the next higher inode number than 3,
> > > which is now 8 in this example. Hence you would eventually return to
> client (in this order):
> > >
> > > 3. aaa
> > > 4. eee
> > >
> >
> > Yes, it can work by using inode number (called File ID on Windows host:
> https://learn.microsoft.com/en-us/windows/win32/api/winbase/ns-
> winbase-file_id_info).
> > However, Windows does not provide a function to get file information by
> file ID.
> > That means, for anytime of seeking directory, 9pfs need to do the following
> sequence work to locate a name entry:
> >
> > 1. findfirst
> > 2. CreateFile to get file handle
> > 3. GetFileInformationByHandleEx to get file ID
> > (https://learn.microsoft.com/en-us/windows/win32/api/minwinbase/ne-
> min
> > winbase-file_info_by_handle_class)
> > 4. Close file handle and return if the file ID is match 5. findnext 6.
> > repeat to step #2
> >
> > Windows does not short file name entry by file ID and the file ID is 128-bit
> integer.
> > When there are many entries in directory, seeking directory will cause a
> very bad performance.
> 
> I know, it's an n-square performance issue and what I already wrote in the
> summary of the linked original suggestion [1] in v3 before, quote:
> 
>   + Relatively straight-forward to implement.
> 
>   + No (major) changes in 9pfs code base required.
> 
>   - Still n-square performance issue (neglectable to land Windows host
> support
>     IMO).
> 
>   o Consistency assured for "most" cases, except one: if hardlinks are
>     inserted in between then it might fail

readdir() on Linux host may also return the deleted entries.
And POSIX specification does not mention about the consistency issue.

NTFS file id is the $MFT index id. It will keen unique until file is deleted.
But the index id may be reuse if delete and re-create many files.

Saving file id instead of name will make consistency better, but may not cover all status.
Because read directory is not a "atomic" operation.
> 
> [1] https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> 
> The idea was to use that just as a starting point to land Windows host support
> ASAP, slower on large dirs compared to other solutions, yes, but with
> guaranteed correct and deterministic behaviour. And then on the long run
> we would of course replace that with a more performant solution.
> 
> I mean, this is really simple to implement, so I would at least test it. If it really
> runs horribly slow we could still discuss faster solutions, which are however
> all much more tricky.
> 

I did a basic test on Windows host, here is the code:

    st = clock();
    pDir = opendir_win32(TEST_DIR);

    if (pDir == NULL)
        return -1;
    
    while ((pEnt = readdir_win32(pDir)) != NULL)
    {
        totals++;
    }
    closedir_win32(pDir);
    ed = clock();

    printf("total = %d clocks = %d %d\n", totals, ed - st, CLOCKS_PER_SEC);

My local storage is SSD disk.

Run this test for 100, 1000, 10000 entries.
For file name cache solution, the time cost is: 2, 9, 44 (in ms).
For file id cache solution, the time cost: 3, 438, 4338 (in ms).
I already used OpenFileById() to make it faster instead of CreateFile(). If I use CreateFile, it need more than 80 seconds.

The performance looks like not good. 
And actually, it would be worse in 9pfs.
Because in current design, 9pfs  may seek forward and seek back several times during reading directory, which may cause the performance worse.

> > So I think store all name entries would be better than store all file ID.
> 
> As already discussed, NTFS allows up to (2^32 - 1) = 4,294,967,295 entries per
> directory. So caching only one directory (entirely) in RAM can already exceed
> the available RAM, which would crash QEMU. Multiplied by an expected
> amount of directory lookups by client and we even get into much higher
> categories, even with much smaller individual directory sizes.
> 

Windows file id structure is 24 bytes, which is not a small structure.
If you think the performance is acceptable, I can rework this commit based on file id.

> >
> >
> > > >
> > > >
> > > > Thanks
> > > > Guohuai
> > > >
> > > >
> > > > > > +
> > > > > > +    if (errno == ENOENT) {
> > > > > > +        /* No more matching files could be found, clean errno */
> > > > > > +        errno = 0;
> > > > > > +    } else {
> > > > > > +        err = errno;
> > > > > > +        goto out;
> > > > > > +    }
> > > > > > +
> > > > > > +    stream->total_entries = index;
> > > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > > +
> > > > > > +out:
> > > > > > +    if (err != 0) {
> > > > > > +        errno = err;
> > > > > > +        /* free whole list */
> > > > > > +        if (stream != NULL) {
> > > > > > +            QSLIST_FOREACH_SAFE(dir_entry, &stream->head,
> > > > > > +node, next)
> > > {
> > > > > > +                QSLIST_REMOVE(&stream->head, dir_entry,
> > > > > > +dir_win32_entry,
> > > > > node);
> > > > > > +                g_free(dir_entry);
> > > > > > +            }
> > > > > > +            g_free(stream);
> > > > > > +            stream = NULL;
> > > > > > +        }
> > > > > > +    }
> > > > > > +
> > > > > > +    /* after cached all entries, this handle is useless */
> > > > > > +    if (dd_handle != -1) {
> > > > > > +        _findclose(dd_handle);
> > > > > > +    }
> > > > > > +
> > > > > > +    if (hDir != INVALID_HANDLE_VALUE) {
> > > > > > +        CloseHandle(hDir);
> > > > > > +    }
> > > > > > +
> > > > > > +    return (DIR *)stream;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * closedir_win32 - close a directory
> > > > > > + *
> > > > > > + * This function closes directory and free all cached resources.
> > > > > > + */
> > > > > > +int closedir_win32(DIR *pDir) {
> > > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > > +    struct dir_win32_entry *dir_entry;
> > > > > > +    struct dir_win32_entry *next;
> > > > > > +
> > > > > > +    if (stream == NULL) {
> > > > > > +        errno = EBADF;
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    /* free all resources */
> > > > > > +
> > > > > > +    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next)
> {
> > > > > > +        QSLIST_REMOVE(&stream->head, dir_entry,
> > > > > > + dir_win32_entry,
> > > node);
> > > > > > +        g_free(dir_entry);
> > > > > > +    }
> > > > > > +
> > > > > > +    g_free(stream);
> > > > > > +
> > > > > > +    return 0;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * readdir_win32 - read a directory
> > > > > > + *
> > > > > > + * This function reads a directory entry from cached entry list.
> > > > > > + */
> > > > > > +struct dirent *readdir_win32(DIR *pDir) {
> > > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > > +
> > > > > > +    if (stream == NULL) {
> > > > > > +        errno = EBADF;
> > > > > > +        return NULL;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (stream->offset >= stream->total_entries) {
> > > > > > +        /* reach to the end, return NULL without set errno */
> > > > > > +        return NULL;
> > > > > > +    }
> > > > > > +
> > > > > > +    memcpy(stream->dd_dir.d_name,
> > > > > > +           stream->current->dd_data.name,
> > > > > > +           sizeof(stream->dd_dir.d_name));
> > > > > > +
> > > > > > +    /* Windows does not provide inode number */
> > > > > > +    stream->dd_dir.d_ino = 0;
> > > > > > +    stream->dd_dir.d_reclen = 0;
> > > > > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > > > > +
> > > > > > +    stream->offset++;
> > > > > > +    stream->current = QSLIST_NEXT(stream->current, node);
> > > > > > +
> > > > > > +    return &stream->dd_dir;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * rewinddir_win32 - reset directory stream
> > > > > > + *
> > > > > > + * This function resets the position of the directory stream
> > > > > > +to the
> > > > > > + * beginning of the directory.
> > > > > > + */
> > > > > > +void rewinddir_win32(DIR *pDir) {
> > > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > > +
> > > > > > +    if (stream == NULL) {
> > > > > > +        errno = EBADF;
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    stream->offset = 0;
> > > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > > +
> > > > > > +    return;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * seekdir_win32 - set the position of the next readdir()
> > > > > > +call in the directory
> > > > > > + *
> > > > > > + * This function sets the position of the next readdir() call
> > > > > > +in the directory
> > > > > > + * from which the next readdir() call will start.
> > > > > > + */
> > > > > > +void seekdir_win32(DIR *pDir, long pos) {
> > > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > > +    uint32_t index;
> > > > > > +
> > > > > > +    if (stream == NULL) {
> > > > > > +        errno = EBADF;
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (pos < -1) {
> > > > > > +        errno = EINVAL;
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > > > > +        /* seek to the end */
> > > > > > +        stream->offset = stream->total_entries;
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (pos - (long)stream->offset == 0) {
> > > > > > +        /* no need to seek */
> > > > > > +        return;
> > > > > > +    }
> > > > > > +
> > > > > > +    /* seek position from list head */
> > > > > > +
> > > > > > +    stream->current = QSLIST_FIRST(&stream->head);
> > > > > > +
> > > > > > +    for (index = 0; index < (uint32_t)pos; index++) {
> > > > > > +        stream->current = QSLIST_NEXT(stream->current, node);
> > > > > > +    }
> > > > > > +    stream->offset = index;
> > > > > > +
> > > > > > +    return;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > + * telldir_win32 - return current location in directory
> > > > > > + *
> > > > > > + * This function returns current location in directory.
> > > > > > + */
> > > > > > +long telldir_win32(DIR *pDir) {
> > > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > > +
> > > > > > +    if (stream == NULL) {
> > > > > > +        errno = EBADF;
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    if (stream->offset > stream->total_entries) {
> > > > > > +        return -1;
> > > > > > +    }
> > > > > > +
> > > > > > +    return (long)stream->offset; }
> > > > > >
> > > > >
> > > >
> > > >
> > > >
> > >
> > >
> >
> >
> >
> 
>
Christian Schoenebeck Feb. 7, 2023, 10:11 a.m. UTC | #7
On Monday, February 6, 2023 6:37:16 AM CET Shi, Guohuai wrote:
[...]
> > I know, it's an n-square performance issue and what I already wrote in the
> > summary of the linked original suggestion [1] in v3 before, quote:
> > 
> >   + Relatively straight-forward to implement.
> > 
> >   + No (major) changes in 9pfs code base required.
> > 
> >   - Still n-square performance issue (neglectable to land Windows host
> > support
> >     IMO).
> > 
> >   o Consistency assured for "most" cases, except one: if hardlinks are
> >     inserted in between then it might fail
> 
> readdir() on Linux host may also return the deleted entries.
> And POSIX specification does not mention about the consistency issue.

POSIX explicitly specifies that 1. new and 2. deleted entries may or may not
appear in result and leaves that implementation specific. That was never our
concern.

And yes, POSIX does not explicitly discuss consistency concerning entries that
have neither been added or removed, but this expectation is implied. In
practice double entries are probably less of an issue, client might be able to
handle that without misbehaviour (haven't checked this at all yet), but if the
implementation would lead to chances that entries may *never* appear to
clients at all, even after refreshing periodically, I mean how could you work
with a file system like that?

> NTFS file id is the $MFT index id. It will keen unique until file is deleted.
> But the index id may be reuse if delete and re-create many files.
> 
> Saving file id instead of name will make consistency better, but may not cover all status.
> Because read directory is not a "atomic" operation.

I don't see an issue with that, because these are entries that were either
added or removed, we don't care about them. And their file IDs would not
affect fetching the other directory entries that have not been touched in
between.

And we are also not questioning atomicity here, but consistency.

> > [1] https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > 
> > The idea was to use that just as a starting point to land Windows host support
> > ASAP, slower on large dirs compared to other solutions, yes, but with
> > guaranteed correct and deterministic behaviour. And then on the long run
> > we would of course replace that with a more performant solution.
> > 
> > I mean, this is really simple to implement, so I would at least test it. If it really
> > runs horribly slow we could still discuss faster solutions, which are however
> > all much more tricky.
> > 
> 
> I did a basic test on Windows host, here is the code:
> 
>     st = clock();
>     pDir = opendir_win32(TEST_DIR);
> 
>     if (pDir == NULL)
>         return -1;
>     
>     while ((pEnt = readdir_win32(pDir)) != NULL)
>     {
>         totals++;
>     }
>     closedir_win32(pDir);
>     ed = clock();
> 
>     printf("total = %d clocks = %d %d\n", totals, ed - st, CLOCKS_PER_SEC);
> 
> My local storage is SSD disk.
> 
> Run this test for 100, 1000, 10000 entries.
> For file name cache solution, the time cost is: 2, 9, 44 (in ms).
> For file id cache solution, the time cost: 3, 438, 4338 (in ms).
> I already used OpenFileById() to make it faster instead of CreateFile(). If I use CreateFile, it need more than 80 seconds.
> 
> The performance looks like not good. 
> And actually, it would be worse in 9pfs.
> Because in current design, 9pfs  may seek forward and seek back several times during reading directory, which may cause the performance worse.

Poor performance, yes, probably a bit worse than I would have expected.

So it is about choosing your poison (potential crash vs. poor performance).

I mean, I am not keen into suggesting any kind of bike shredding for you on
this issue, but if this is merged, then people expect it to behave reliably
and not allowing a guest to crash QEMU host process by simply creating a large
number of directory entries on guest.

I was also considering to make it a QEMU option, but OTOH, this is a temporary
situation and those options would be wiped once we have an oppropriate
solution a bit later.

I am open for suggestions. Could we probably just mark Windows host support as
experimental for now, is that even allowed by QEMU policies?

> > > So I think store all name entries would be better than store all file ID.
> > 
> > As already discussed, NTFS allows up to (2^32 - 1) = 4,294,967,295 entries per
> > directory. So caching only one directory (entirely) in RAM can already exceed
> > the available RAM, which would crash QEMU. Multiplied by an expected
> > amount of directory lookups by client and we even get into much higher
> > categories, even with much smaller individual directory sizes.
> > 
> 
> Windows file id structure is 24 bytes, which is not a small structure.
> If you think the performance is acceptable, I can rework this commit based on file id.
Shi, Guohuai Feb. 7, 2023, 5:55 p.m. UTC | #8
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Tuesday, February 7, 2023 18:12
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Meng, Bin <Bin.Meng@windriver.com>; Marc-André Lureau
> <marcandre.lureau@redhat.com>; Daniel P. Berrangé <berrange@redhat.com>; Shi,
> Guohuai <Guohuai.Shi@windriver.com>
> Subject: Re: [PATCH v4 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Monday, February 6, 2023 6:37:16 AM CET Shi, Guohuai wrote:
> [...]
> > > I know, it's an n-square performance issue and what I already wrote
> > > in the summary of the linked original suggestion [1] in v3 before, quote:
> > >
> > >   + Relatively straight-forward to implement.
> > >
> > >   + No (major) changes in 9pfs code base required.
> > >
> > >   - Still n-square performance issue (neglectable to land Windows
> > > host support
> > >     IMO).
> > >
> > >   o Consistency assured for "most" cases, except one: if hardlinks are
> > >     inserted in between then it might fail
> >
> > readdir() on Linux host may also return the deleted entries.
> > And POSIX specification does not mention about the consistency issue.
> 
> POSIX explicitly specifies that 1. new and 2. deleted entries may or may not
> appear in result and leaves that implementation specific. That was never our
> concern.
> 
> And yes, POSIX does not explicitly discuss consistency concerning entries
> that have neither been added or removed, but this expectation is implied. In
> practice double entries are probably less of an issue, client might be able
> to handle that without misbehaviour (haven't checked this at all yet), but if
> the implementation would lead to chances that entries may *never* appear to
> clients at all, even after refreshing periodically, I mean how could you work
> with a file system like that?
> 
> > NTFS file id is the $MFT index id. It will keen unique until file is
> deleted.
> > But the index id may be reuse if delete and re-create many files.
> >
> > Saving file id instead of name will make consistency better, but may not
> cover all status.
> > Because read directory is not a "atomic" operation.
> 
> I don't see an issue with that, because these are entries that were either
> added or removed, we don't care about them. And their file IDs would not
> affect fetching the other directory entries that have not been touched in
> between.
> 
> And we are also not questioning atomicity here, but consistency.
> 
> > > [1] https://lore.kernel.org/qemu-devel/2468168.SvRIHAoRfs@silver/
> > >
> > > The idea was to use that just as a starting point to land Windows
> > > host support ASAP, slower on large dirs compared to other solutions,
> > > yes, but with guaranteed correct and deterministic behaviour. And
> > > then on the long run we would of course replace that with a more
> performant solution.
> > >
> > > I mean, this is really simple to implement, so I would at least test
> > > it. If it really runs horribly slow we could still discuss faster
> > > solutions, which are however all much more tricky.
> > >
> >
> > I did a basic test on Windows host, here is the code:
> >
> >     st = clock();
> >     pDir = opendir_win32(TEST_DIR);
> >
> >     if (pDir == NULL)
> >         return -1;
> >
> >     while ((pEnt = readdir_win32(pDir)) != NULL)
> >     {
> >         totals++;
> >     }
> >     closedir_win32(pDir);
> >     ed = clock();
> >
> >     printf("total = %d clocks = %d %d\n", totals, ed - st,
> > CLOCKS_PER_SEC);
> >
> > My local storage is SSD disk.
> >
> > Run this test for 100, 1000, 10000 entries.
> > For file name cache solution, the time cost is: 2, 9, 44 (in ms).
> > For file id cache solution, the time cost: 3, 438, 4338 (in ms).
> > I already used OpenFileById() to make it faster instead of CreateFile(). If
> I use CreateFile, it need more than 80 seconds.
> >
> > The performance looks like not good.
> > And actually, it would be worse in 9pfs.
> > Because in current design, 9pfs  may seek forward and seek back several
> times during reading directory, which may cause the performance worse.
> 
> Poor performance, yes, probably a bit worse than I would have expected.
> 
> So it is about choosing your poison (potential crash vs. poor performance).
> 
> I mean, I am not keen into suggesting any kind of bike shredding for you on
> this issue, but if this is merged, then people expect it to behave reliably
> and not allowing a guest to crash QEMU host process by simply creating a
> large number of directory entries on guest.
> 
> I was also considering to make it a QEMU option, but OTOH, this is a
> temporary situation and those options would be wiped once we have an
> oppropriate solution a bit later.
> 
> I am open for suggestions. Could we probably just mark Windows host support
> as experimental for now, is that even allowed by QEMU policies?

Yes, it is hard to choose:

a) 1 file id entry is 24 bytes, to reduce memory fragment, I used an array to store the file ids.
b) 1 file name entry is ~300 bytes, by using link list.

If there are 1-million files in one directory, a) need 24 MB continues memory buffer, b) need 300 MB memory (no need continues).
If there are 10-million files in one directory, a) need 240 MB continues memory buffer, b) need 3 GB memory (no need continues).

Both #a and #b are need more and more memory buffer. If there no more free memory, opendir() will be failed.
However, is it a normal status that a directory contains more than 1-million files?

I will prepare an new version solution just for this commit with storing file id.
The new patch would be ready tomorrow.

Thanks
Guohuai

> 
> > > > So I think store all name entries would be better than store all file
> ID.
> > >
> > > As already discussed, NTFS allows up to (2^32 - 1) = 4,294,967,295
> > > entries per directory. So caching only one directory (entirely) in
> > > RAM can already exceed the available RAM, which would crash QEMU.
> > > Multiplied by an expected amount of directory lookups by client and
> > > we even get into much higher categories, even with much smaller
> individual directory sizes.
> > >
> >
> > Windows file id structure is 24 bytes, which is not a small structure.
> > If you think the performance is acceptable, I can rework this commit based
> on file id.
>
diff mbox series

Patch

diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h
index 0f159fb4ce..c1c251fbd1 100644
--- a/hw/9pfs/9p-util.h
+++ b/hw/9pfs/9p-util.h
@@ -141,6 +141,12 @@  int unlinkat_win32(int dirfd, const char *pathname, int flags);
 int statfs_win32(const char *root_path, struct statfs *stbuf);
 int openat_dir(int dirfd, const char *name);
 int openat_file(int dirfd, const char *name, int flags, mode_t mode);
+DIR *opendir_win32(const char *full_file_name);
+int closedir_win32(DIR *pDir);
+struct dirent *readdir_win32(DIR *pDir);
+void rewinddir_win32(DIR *pDir);
+void seekdir_win32(DIR *pDir, long pos);
+long telldir_win32(DIR *pDir);
 #endif
 
 static inline void close_preserve_errno(int fd)
diff --git a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c
index a99d579a06..5503199300 100644
--- a/hw/9pfs/9p-util-win32.c
+++ b/hw/9pfs/9p-util-win32.c
@@ -37,6 +37,13 @@ 
  *    Windows does not support opendir, the directory fd is created by
  *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open will
  *    lock and protect the directory (can not be modified or replaced)
+ *
+ * 5. Windows and MinGW does not provide safety directory accessing functions.
+ *    readdir(), seekdir() and telldir() may get or set wrong value because
+ *    directory entry data is not protected.
+ *
+ *    This file re-write POSIX directory accessing functions and cache all
+ *    directory entries during opening.
  */
 
 #include "qemu/osdep.h"
@@ -51,6 +58,27 @@ 
 
 #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
 
+/*
+ * MinGW and Windows does not provide safety way to seek directory while other
+ * thread is modifying same directory.
+ *
+ * The two structures are used to cache all directory entries when opening it.
+ * Cached entries are always returned for read or seek.
+ */
+struct dir_win32_entry {
+    QSLIST_ENTRY(dir_win32_entry) node;
+    struct _finddata_t dd_data;
+};
+
+struct dir_win32 {
+    struct dirent dd_dir;
+    uint32_t offset;
+    uint32_t total_entries;
+    QSLIST_HEAD(, dir_win32_entry) head;
+    struct dir_win32_entry *current;
+    char dd_name[1];
+};
+
 /*
  * win32_error_to_posix - convert Win32 error to POSIX error number
  *
@@ -977,3 +1005,271 @@  int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev)
     errno = ENOTSUP;
     return -1;
 }
+
+/*
+ * opendir_win32 - open a directory
+ *
+ * This function opens a directory and caches all directory entries.
+ */
+DIR *opendir_win32(const char *full_file_name)
+{
+    HANDLE hDir = INVALID_HANDLE_VALUE;
+    DWORD attribute;
+    intptr_t dd_handle = -1;
+    struct _finddata_t dd_data;
+
+    struct dir_win32 *stream = NULL;
+    struct dir_win32_entry *dir_entry;
+    struct dir_win32_entry *prev;
+    struct dir_win32_entry *next;
+
+    int err = 0;
+    int find_status;
+    uint32_t index;
+
+    /* open directory to prevent it being removed */
+
+    hDir = CreateFile(full_file_name, GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      NULL,
+                      OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+    if (hDir == INVALID_HANDLE_VALUE) {
+        err = win32_error_to_posix(GetLastError());
+        goto out;
+    }
+
+    attribute = GetFileAttributes(full_file_name);
+
+    /* symlink is not allow */
+    if (attribute == INVALID_FILE_ATTRIBUTES
+        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
+        err = EACCES;
+        goto out;
+    }
+
+    /* check if it is a directory */
+    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+        err = ENOTDIR;
+        goto out;
+    }
+
+    /*
+     * findfirst() need suffix format name like "\dir1\dir2\*", allocate more
+     * buffer to store suffix.
+     */
+    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) + 3);
+    QSLIST_INIT(&stream->head);
+
+    strcpy(stream->dd_name, full_file_name);
+    strcat(stream->dd_name, "\\*");
+
+    dd_handle = _findfirst(stream->dd_name, &dd_data);
+
+    if (dd_handle == -1) {
+        err = errno;
+        goto out;
+    }
+
+    index = 0;
+
+    /* read all entries to link list */
+    do {
+        dir_entry = g_malloc0(sizeof(struct dir_win32_entry));
+        memcpy(&dir_entry->dd_data, &dd_data, sizeof(dd_data));
+        if (index == 0) {
+            QSLIST_INSERT_HEAD(&stream->head, dir_entry, node);
+        } else {
+            QSLIST_INSERT_AFTER(prev, dir_entry, node);
+        }
+
+        prev = dir_entry;
+        find_status = _findnext(dd_handle, &dd_data);
+
+        index++;
+    } while (find_status == 0);
+
+    if (errno == ENOENT) {
+        /* No more matching files could be found, clean errno */
+        errno = 0;
+    } else {
+        err = errno;
+        goto out;
+    }
+
+    stream->total_entries = index;
+    stream->current = QSLIST_FIRST(&stream->head);
+
+out:
+    if (err != 0) {
+        errno = err;
+        /* free whole list */
+        if (stream != NULL) {
+            QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
+                QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
+                g_free(dir_entry);
+            }
+            g_free(stream);
+            stream = NULL;
+        }
+    }
+
+    /* after cached all entries, this handle is useless */
+    if (dd_handle != -1) {
+        _findclose(dd_handle);
+    }
+
+    if (hDir != INVALID_HANDLE_VALUE) {
+        CloseHandle(hDir);
+    }
+
+    return (DIR *)stream;
+}
+
+/*
+ * closedir_win32 - close a directory
+ *
+ * This function closes directory and free all cached resources.
+ */
+int closedir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+    struct dir_win32_entry *dir_entry;
+    struct dir_win32_entry *next;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    /* free all resources */
+
+    QSLIST_FOREACH_SAFE(dir_entry, &stream->head, node, next) {
+        QSLIST_REMOVE(&stream->head, dir_entry, dir_win32_entry, node);
+        g_free(dir_entry);
+    }
+
+    g_free(stream);
+
+    return 0;
+}
+
+/*
+ * readdir_win32 - read a directory
+ *
+ * This function reads a directory entry from cached entry list.
+ */
+struct dirent *readdir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return NULL;
+    }
+
+    if (stream->offset >= stream->total_entries) {
+        /* reach to the end, return NULL without set errno */
+        return NULL;
+    }
+
+    memcpy(stream->dd_dir.d_name,
+           stream->current->dd_data.name,
+           sizeof(stream->dd_dir.d_name));
+
+    /* Windows does not provide inode number */
+    stream->dd_dir.d_ino = 0;
+    stream->dd_dir.d_reclen = 0;
+    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
+
+    stream->offset++;
+    stream->current = QSLIST_NEXT(stream->current, node);
+
+    return &stream->dd_dir;
+}
+
+/*
+ * rewinddir_win32 - reset directory stream
+ *
+ * This function resets the position of the directory stream to the
+ * beginning of the directory.
+ */
+void rewinddir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return;
+    }
+
+    stream->offset = 0;
+    stream->current = QSLIST_FIRST(&stream->head);
+
+    return;
+}
+
+/*
+ * seekdir_win32 - set the position of the next readdir() call in the directory
+ *
+ * This function sets the position of the next readdir() call in the directory
+ * from which the next readdir() call will start.
+ */
+void seekdir_win32(DIR *pDir, long pos)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+    uint32_t index;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return;
+    }
+
+    if (pos < -1) {
+        errno = EINVAL;
+        return;
+    }
+
+    if (pos == -1 || pos >= (long)stream->total_entries) {
+        /* seek to the end */
+        stream->offset = stream->total_entries;
+        return;
+    }
+
+    if (pos - (long)stream->offset == 0) {
+        /* no need to seek */
+        return;
+    }
+
+    /* seek position from list head */
+
+    stream->current = QSLIST_FIRST(&stream->head);
+
+    for (index = 0; index < (uint32_t)pos; index++) {
+        stream->current = QSLIST_NEXT(stream->current, node);
+    }
+    stream->offset = index;
+
+    return;
+}
+
+/*
+ * telldir_win32 - return current location in directory
+ *
+ * This function returns current location in directory.
+ */
+long telldir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    if (stream->offset > stream->total_entries) {
+        return -1;
+    }
+
+    return (long)stream->offset;
+}