diff mbox series

[v5,04/16] hw/9pfs: Implement Windows specific xxxdir() APIs

Message ID 20230220100815.1624266-5-bin.meng@windriver.com (mailing list archive)
State New, archived
Headers show
Series hw/9pfs: Add 9pfs support for Windows | expand

Commit Message

Bin Meng Feb. 20, 2023, 10:08 a.m. UTC
From: Guohuai Shi <guohuai.shi@windriver.com>

This commit implements Windows specific xxxdir() APIs for safety
directory access.

Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
Signed-off-by: Bin Meng <bin.meng@windriver.com>
---

 hw/9pfs/9p-util.h       |   6 +
 hw/9pfs/9p-util-win32.c | 443 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 449 insertions(+)

Comments

Christian Schoenebeck March 14, 2023, 4:05 p.m. UTC | #1
On Monday, February 20, 2023 11:08:03 AM CET Bin Meng wrote:
> From: Guohuai Shi <guohuai.shi@windriver.com>
> 
> This commit implements Windows specific xxxdir() APIs for safety
> directory access.

That comment is seriously too short for this patch.

1. You should describe the behaviour implementation that you have chosen and
why you have chosen it.

2. Like already said in the previous version of the patch, you should place a
link to the discussion we had on this issue.

> Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> Signed-off-by: Bin Meng <bin.meng@windriver.com>
> ---
> 
>  hw/9pfs/9p-util.h       |   6 +
>  hw/9pfs/9p-util-win32.c | 443 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 449 insertions(+)
> 
> diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h
> index 0f159fb4ce..c1c251fbd1 100644
> --- a/hw/9pfs/9p-util.h
> +++ b/hw/9pfs/9p-util.h
> @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char *pathname, int flags);
>  int statfs_win32(const char *root_path, struct statfs *stbuf);
>  int openat_dir(int dirfd, const char *name);
>  int openat_file(int dirfd, const char *name, int flags, mode_t mode);
> +DIR *opendir_win32(const char *full_file_name);
> +int closedir_win32(DIR *pDir);
> +struct dirent *readdir_win32(DIR *pDir);
> +void rewinddir_win32(DIR *pDir);
> +void seekdir_win32(DIR *pDir, long pos);
> +long telldir_win32(DIR *pDir);
>  #endif
>  
>  static inline void close_preserve_errno(int fd)
> diff --git a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c
> index a99d579a06..e9408f3c45 100644
> --- a/hw/9pfs/9p-util-win32.c
> +++ b/hw/9pfs/9p-util-win32.c
> @@ -37,6 +37,16 @@
>   *    Windows does not support opendir, the directory fd is created by
>   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open will
>   *    lock and protect the directory (can not be modified or replaced)
> + *
> + * 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible API for
> + *    acquiring directory entries in a safe way. Calling those APIs (native
> + *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> + *    telldir()) directly can lead to an inconsistent state if directory is
> + *    modified in between, e.g. the same directory appearing more than once
> + *    in output, or directories not appearing at all in output even though they
> + *    were neither newly created nor deleted. POSIX does not define what happens
> + *    with deleted or newly created directories in between, but it guarantees a
> + *    consistent state.
>   */
>  
>  #include "qemu/osdep.h"
> @@ -51,6 +61,25 @@
>  
>  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
>  
> +/*
> + * MinGW and Windows does not provide a safe way to seek directory while other
> + * thread is modifying the same directory.
> + *
> + * This structure is used to store sorted file id and ensure directory seek
> + * consistency.
> + */
> +struct dir_win32 {
> +    struct dirent dd_dir;
> +    uint32_t offset;
> +    uint32_t total_entries;
> +    HANDLE hDir;
> +    uint32_t dir_name_len;
> +    uint64_t dot_id;
> +    uint64_t dot_dot_id;
> +    uint64_t *file_id_list;
> +    char dd_name[1];
> +};
> +
>  /*
>   * win32_error_to_posix - convert Win32 error to POSIX error number
>   *
> @@ -977,3 +1006,417 @@ int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev)
>      errno = ENOTSUP;
>      return -1;
>  }
> +
> +static int file_id_compare(const void *id_ptr1, const void *id_ptr2)
> +{
> +    uint64_t id[2];
> +
> +    id[0] = *(uint64_t *)id_ptr1;
> +    id[1] = *(uint64_t *)id_ptr2;
> +
> +    if (id[0] > id[1]) {
> +        return 1;
> +    } else if (id[0] < id[1]) {
> +        return -1;
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static int get_next_entry(struct dir_win32 *stream)
> +{
> +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> +    char *entry_name;
> +    char *entry_start;
> +    FILE_ID_DESCRIPTOR fid;
> +    DWORD attribute;
> +
> +    if (stream->file_id_list[stream->offset] == stream->dot_id) {
> +        strcpy(stream->dd_dir.d_name, ".");
> +        return 0;
> +    }
> +
> +    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
> +        strcpy(stream->dd_dir.d_name, "..");
> +        return 0;
> +    }
> +
> +    fid.dwSize = sizeof(fid);
> +    fid.Type = FileIdType;
> +
> +    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
> +
> +    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
> +                             FILE_SHARE_READ | FILE_SHARE_WRITE
> +                             | FILE_SHARE_DELETE,
> +                             NULL,
> +                             FILE_FLAG_BACKUP_SEMANTICS
> +                             | FILE_FLAG_OPEN_REPARSE_POINT);

What's the purpose of FILE_FLAG_OPEN_REPARSE_POINT here? As it's apparently
not obvious, please add a comment.

> +
> +    if (hDirEntry == INVALID_HANDLE_VALUE) {
> +        /*
> +         * Not open it successfully, it may be deleted.

Wrong English. "Open failed, it may have been deleted in the meantime.".

> +         * Try next id.
> +         */
> +        return -1;
> +    }
> +
> +    entry_name = get_full_path_win32(hDirEntry, NULL);
> +
> +    CloseHandle(hDirEntry);
> +
> +    if (entry_name == NULL) {
> +        return -1;
> +    }
> +
> +    attribute = GetFileAttributes(entry_name);
> +
> +    /* symlink is not allowed */
> +    if (attribute == INVALID_FILE_ATTRIBUTES
> +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> +        return -1;

Wouldn't it make sense to call warn_report_once() here to let the user know
that he has some symlinks that are never delivered to guest?

> +    }
> +
> +    if (memcmp(entry_name, stream->dd_name, stream->dir_name_len) != 0) {

No, that's unsafe. You want to use something like strncmp() instead.

> +        /*
> +         * The full entry file name should be a part of parent directory name,
> +         * except dot and dot_dot (is already handled).
> +         * If not, this entry should not be returned.
> +         */
> +        return -1;
> +    }
> +
> +    entry_start = entry_name + stream->dir_name_len;

s/entry_start/entry_basename/ ?

> +
> +    /* skip slash */
> +    while (*entry_start == '\\') {
> +        entry_start++;
> +    }
> +
> +    if (strchr(entry_start, '\\') != NULL) {
> +        return -1;
> +    }
> +
> +    if (strlen(entry_start) == 0
> +        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
> +        return -1;
> +    }
> +    strcpy(stream->dd_dir.d_name, entry_start);

g_path_get_basename() ? :)

> +
> +    return 0;
> +}
> +
> +/*
> + * opendir_win32 - open a directory
> + *
> + * This function opens a directory and caches all directory entries.

It just caches all file IDs, doesn't it?

> + */
> +DIR *opendir_win32(const char *full_file_name)
> +{
> +    HANDLE hDir = INVALID_HANDLE_VALUE;
> +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> +    char *full_dir_entry = NULL;
> +    DWORD attribute;
> +    intptr_t dd_handle = -1;
> +    struct _finddata_t dd_data;
> +    uint64_t file_id;
> +    uint64_t *file_id_list = NULL;
> +    BY_HANDLE_FILE_INFORMATION FileInfo;

FileInfo is the variable name, not a struct name, so no upper case for it
please.

> +    struct dir_win32 *stream = NULL;
> +    int err = 0;
> +    int find_status;
> +    int sort_first_two_entry = 0;
> +    uint32_t list_count = 16;

Magic number 16?

> +    uint32_t index = 0;
> +
> +    /* open directory to prevent it being removed */
> +
> +    hDir = CreateFile(full_file_name, GENERIC_READ,
> +                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
> +                      NULL,
> +                      OPEN_EXISTING,
> +                      FILE_FLAG_BACKUP_SEMANTICS | FILE_FLAG_OPEN_REPARSE_POINT,
> +                      NULL);
> +
> +    if (hDir == INVALID_HANDLE_VALUE) {
> +        err = win32_error_to_posix(GetLastError());
> +        goto out;
> +    }
> +
> +    attribute = GetFileAttributes(full_file_name);
> +
> +    /* symlink is not allow */
> +    if (attribute == INVALID_FILE_ATTRIBUTES
> +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> +        err = EACCES;
> +        goto out;
> +    }
> +
> +    /* check if it is a directory */
> +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> +        err = ENOTDIR;
> +        goto out;
> +    }
> +
> +    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
> +
> +    /*
> +     * findfirst() needs suffix format name like "\dir1\dir2\*",
> +     * allocate more buffer to store suffix.
> +     */
> +    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) + 3);

Not that I would care much, but +2 would be correct here, as you declared the
struct with one character already, so it is not a classic (zero size) flex
array:

  struct dir_win32 {
    ...
    char dd_name[1];
  };

> +
> +    strcpy(stream->dd_name, full_file_name);
> +    strcat(stream->dd_name, "\\*");
> +
> +    stream->hDir = hDir;
> +    stream->dir_name_len = strlen(full_file_name);
> +
> +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> +
> +    if (dd_handle == -1) {
> +        err = errno;
> +        goto out;
> +    }
> +
> +    /* read all entries to link list */

"read all entries as a linked list"

However there is no linked list here. It seems to be an array.

> +    do {
> +        full_dir_entry = get_full_path_win32(hDir, dd_data.name);
> +
> +        if (full_dir_entry == NULL) {
> +            err = ENOMEM;
> +            break;
> +        }
> +
> +        /*
> +         * Open every entry and get the file informations.
> +         *
> +         * Skip symbolic links during reading directory.
> +         */
> +        hDirEntry = CreateFile(full_dir_entry,
> +                               GENERIC_READ,
> +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> +                               | FILE_SHARE_DELETE,
> +                               NULL,
> +                               OPEN_EXISTING,
> +                               FILE_FLAG_BACKUP_SEMANTICS
> +                               | FILE_FLAG_OPEN_REPARSE_POINT, NULL);
> +
> +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> +            if (GetFileInformationByHandle(hDirEntry,
> +                                           &FileInfo) == TRUE) {
> +                attribute = FileInfo.dwFileAttributes;
> +
> +                /* only save validate entries */
> +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> +                    if (index >= list_count) {
> +                        list_count = list_count + 16;

Magic number 16 again.

> +                        file_id_list = g_realloc(file_id_list,
> +                                                 sizeof(uint64_t)
> +                                                 * list_count);

OK, so here we are finally at the point where you chose the overall behaviour
for this that we discussed before.

So you are constantly appending 16 entry chunks to the end of the array,
periodically reallocate the entire array, and potentially end up with one
giant dense array with *all* file IDs of the directory.

That's not really what I had in mind, as it still has the potential to easily
crash QEMU if there are large directories on host. Theoretically a Windows
directory might then consume up to 16 GB of RAM for looking up only one single
directory.

So is this the implementation that you said was very slow, or did you test a
different one? Remember, my orgiginal idea (as starting point for Windows) was
to only cache *one* file ID (the last being looked up). That's it. Not a list
of file IDs.

> +                    }
> +                    file_id = (uint64_t)FileInfo.nFileIndexLow
> +                              + (((uint64_t)FileInfo.nFileIndexHigh) << 32);
> +
> +
> +                    file_id_list[index] = file_id;
> +
> +                    if (strcmp(dd_data.name, ".") == 0) {
> +                        stream->dot_id = file_id_list[index];
> +                        if (index != 0) {
> +                            sort_first_two_entry = 1;
> +                        }
> +                    } else if (strcmp(dd_data.name, "..") == 0) {
> +                        stream->dot_dot_id = file_id_list[index];
> +                        if (index != 1) {
> +                            sort_first_two_entry = 1;
> +                        }
> +                    }
> +                    index++;
> +                }
> +            }
> +            CloseHandle(hDirEntry);
> +        }
> +        g_free(full_dir_entry);
> +        find_status = _findnext(dd_handle, &dd_data);
> +    } while (find_status == 0);
> +
> +    if (errno == ENOENT) {
> +        /* No more matching files could be found, clean errno */
> +        errno = 0;
> +    } else {
> +        err = errno;
> +        goto out;
> +    }
> +
> +    stream->total_entries = index;
> +    stream->file_id_list = file_id_list;
> +
> +    if (sort_first_two_entry == 0) {
> +        /*
> +         * If the first two entry is "." and "..", then do not sort them.
> +         *
> +         * If the guest OS always considers first two entries are "." and "..",
> +         * sort the two entries may cause confused display in guest OS.
> +         */
> +        qsort(&file_id_list[2], index - 2, sizeof(file_id), file_id_compare);
> +    } else {
> +        qsort(&file_id_list[0], index, sizeof(file_id), file_id_compare);
> +    }

Were there cases where you did not get "." and ".." ?

> +
> +out:
> +    if (err != 0) {
> +        errno = err;
> +        if (stream != NULL) {
> +            if (file_id_list != NULL) {
> +                g_free(file_id_list);
> +            }
> +            CloseHandle(hDir);
> +            g_free(stream);
> +            stream = NULL;
> +        }
> +    }
> +
> +    if (dd_handle != -1) {
> +        _findclose(dd_handle);
> +    }
> +
> +    return (DIR *)stream;
> +}
> +
> +/*
> + * closedir_win32 - close a directory
> + *
> + * This function closes directory and free all cached resources.
> + */
> +int closedir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return -1;
> +    }
> +
> +    /* free all resources */
> +    CloseHandle(stream->hDir);
> +
> +    g_free(stream->file_id_list);
> +
> +    g_free(stream);
> +
> +    return 0;
> +}
> +
> +/*
> + * readdir_win32 - read a directory
> + *
> + * This function reads a directory entry from cached entry list.
> + */
> +struct dirent *readdir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return NULL;
> +    }
> +
> +retry:
> +
> +    if (stream->offset >= stream->total_entries) {
> +        /* reach to the end, return NULL without set errno */
> +        return NULL;
> +    }
> +
> +    if (get_next_entry(stream) != 0) {
> +        stream->offset++;
> +        goto retry;
> +    }
> +
> +    /* Windows does not provide inode number */
> +    stream->dd_dir.d_ino = 0;
> +    stream->dd_dir.d_reclen = 0;
> +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> +
> +    stream->offset++;
> +
> +    return &stream->dd_dir;
> +}
> +
> +/*
> + * rewinddir_win32 - reset directory stream
> + *
> + * This function resets the position of the directory stream to the
> + * beginning of the directory.
> + */
> +void rewinddir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return;
> +    }
> +
> +    stream->offset = 0;
> +
> +    return;
> +}
> +
> +/*
> + * seekdir_win32 - set the position of the next readdir() call in the directory
> + *
> + * This function sets the position of the next readdir() call in the directory
> + * from which the next readdir() call will start.
> + */
> +void seekdir_win32(DIR *pDir, long pos)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return;
> +    }
> +
> +    if (pos < -1) {
> +        errno = EINVAL;
> +        return;
> +    }
> +
> +    if (pos == -1 || pos >= (long)stream->total_entries) {
> +        /* seek to the end */
> +        stream->offset = stream->total_entries;
> +        return;
> +    }
> +
> +    if (pos - (long)stream->offset == 0) {
> +        /* no need to seek */
> +        return;
> +    }
> +
> +    stream->offset = pos;
> +
> +    return;
> +}
> +
> +/*
> + * telldir_win32 - return current location in directory
> + *
> + * This function returns current location in directory.
> + */
> +long telldir_win32(DIR *pDir)
> +{
> +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> +
> +    if (stream == NULL) {
> +        errno = EBADF;
> +        return -1;
> +    }
> +
> +    if (stream->offset > stream->total_entries) {
> +        return -1;
> +    }
> +
> +    return (long)stream->offset;
> +}
>
Shi, Guohuai March 15, 2023, 7:05 p.m. UTC | #2
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Wednesday, March 15, 2023 00:06
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> <Bin.Meng@windriver.com>
> Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Monday, February 20, 2023 11:08:03 AM CET Bin Meng wrote:
> > From: Guohuai Shi <guohuai.shi@windriver.com>
> >
> > This commit implements Windows specific xxxdir() APIs for safety
> > directory access.
> 
> That comment is seriously too short for this patch.
> 
> 1. You should describe the behaviour implementation that you have chosen and
> why you have chosen it.
> 
> 2. Like already said in the previous version of the patch, you should place a
> link to the discussion we had on this issue.
> 
> > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > ---
> >
> >  hw/9pfs/9p-util.h       |   6 +
> >  hw/9pfs/9p-util-win32.c | 443
> > ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 449 insertions(+)
> >
> > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > 0f159fb4ce..c1c251fbd1 100644
> > --- a/hw/9pfs/9p-util.h
> > +++ b/hw/9pfs/9p-util.h
> > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > *pathname, int flags);  int statfs_win32(const char *root_path, struct
> > statfs *stbuf);  int openat_dir(int dirfd, const char *name);  int
> > openat_file(int dirfd, const char *name, int flags, mode_t mode);
> > +DIR *opendir_win32(const char *full_file_name); int
> > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR *pDir);
> > +void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR *pDir, long
> > +pos); long telldir_win32(DIR *pDir);
> >  #endif
> >
> >  static inline void close_preserve_errno(int fd) diff --git
> > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > a99d579a06..e9408f3c45 100644
> > --- a/hw/9pfs/9p-util-win32.c
> > +++ b/hw/9pfs/9p-util-win32.c
> > @@ -37,6 +37,16 @@
> >   *    Windows does not support opendir, the directory fd is created by
> >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open
> will
> >   *    lock and protect the directory (can not be modified or replaced)
> > + *
> > + * 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible
> API for
> > + *    acquiring directory entries in a safe way. Calling those APIs
> (native
> > + *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > + *    telldir()) directly can lead to an inconsistent state if directory
> is
> > + *    modified in between, e.g. the same directory appearing more than
> once
> > + *    in output, or directories not appearing at all in output even though
> they
> > + *    were neither newly created nor deleted. POSIX does not define what
> happens
> > + *    with deleted or newly created directories in between, but it
> guarantees a
> > + *    consistent state.
> >   */
> >
> >  #include "qemu/osdep.h"
> > @@ -51,6 +61,25 @@
> >
> >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> >
> > +/*
> > + * MinGW and Windows does not provide a safe way to seek directory
> > +while other
> > + * thread is modifying the same directory.
> > + *
> > + * This structure is used to store sorted file id and ensure
> > +directory seek
> > + * consistency.
> > + */
> > +struct dir_win32 {
> > +    struct dirent dd_dir;
> > +    uint32_t offset;
> > +    uint32_t total_entries;
> > +    HANDLE hDir;
> > +    uint32_t dir_name_len;
> > +    uint64_t dot_id;
> > +    uint64_t dot_dot_id;
> > +    uint64_t *file_id_list;
> > +    char dd_name[1];
> > +};
> > +
> >  /*
> >   * win32_error_to_posix - convert Win32 error to POSIX error number
> >   *
> > @@ -977,3 +1006,417 @@ int qemu_mknodat(int dirfd, const char *filename,
> mode_t mode, dev_t dev)
> >      errno = ENOTSUP;
> >      return -1;
> >  }
> > +
> > +static int file_id_compare(const void *id_ptr1, const void *id_ptr2)
> > +{
> > +    uint64_t id[2];
> > +
> > +    id[0] = *(uint64_t *)id_ptr1;
> > +    id[1] = *(uint64_t *)id_ptr2;
> > +
> > +    if (id[0] > id[1]) {
> > +        return 1;
> > +    } else if (id[0] < id[1]) {
> > +        return -1;
> > +    } else {
> > +        return 0;
> > +    }
> > +}
> > +
> > +static int get_next_entry(struct dir_win32 *stream) {
> > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > +    char *entry_name;
> > +    char *entry_start;
> > +    FILE_ID_DESCRIPTOR fid;
> > +    DWORD attribute;
> > +
> > +    if (stream->file_id_list[stream->offset] == stream->dot_id) {
> > +        strcpy(stream->dd_dir.d_name, ".");
> > +        return 0;
> > +    }
> > +
> > +    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
> > +        strcpy(stream->dd_dir.d_name, "..");
> > +        return 0;
> > +    }
> > +
> > +    fid.dwSize = sizeof(fid);
> > +    fid.Type = FileIdType;
> > +
> > +    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
> > +
> > +    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
> > +                             FILE_SHARE_READ | FILE_SHARE_WRITE
> > +                             | FILE_SHARE_DELETE,
> > +                             NULL,
> > +                             FILE_FLAG_BACKUP_SEMANTICS
> > +                             | FILE_FLAG_OPEN_REPARSE_POINT);
> 
> What's the purpose of FILE_FLAG_OPEN_REPARSE_POINT here? As it's apparently
> not obvious, please add a comment.
> 

If do not use this flag, and if file id is a symbolic link, then Windows will not symbolic link itself, but open the target file.
This flag is similar as O_NOFOLLOW flag.

> > +
> > +    if (hDirEntry == INVALID_HANDLE_VALUE) {
> > +        /*
> > +         * Not open it successfully, it may be deleted.
> 
> Wrong English. "Open failed, it may have been deleted in the meantime.".
> 
> > +         * Try next id.
> > +         */
> > +        return -1;
> > +    }
> > +
> > +    entry_name = get_full_path_win32(hDirEntry, NULL);
> > +
> > +    CloseHandle(hDirEntry);
> > +
> > +    if (entry_name == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    attribute = GetFileAttributes(entry_name);
> > +
> > +    /* symlink is not allowed */
> > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > +        return -1;
> 
> Wouldn't it make sense to call warn_report_once() here to let the user know
> that he has some symlinks that are never delivered to guest?

OK, Got it.

> 
> > +    }
> > +
> > +    if (memcmp(entry_name, stream->dd_name, stream->dir_name_len) !=
> > + 0) {
> 
> No, that's unsafe. You want to use something like strncmp() instead.
> 
> > +        /*
> > +         * The full entry file name should be a part of parent directory
> name,
> > +         * except dot and dot_dot (is already handled).
> > +         * If not, this entry should not be returned.
> > +         */
> > +        return -1;
> > +    }
> > +
> > +    entry_start = entry_name + stream->dir_name_len;
> 
> s/entry_start/entry_basename/ ?
> 
> > +
> > +    /* skip slash */
> > +    while (*entry_start == '\\') {
> > +        entry_start++;
> > +    }
> > +
> > +    if (strchr(entry_start, '\\') != NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (strlen(entry_start) == 0
> > +        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
> > +        return -1;
> > +    }
> > +    strcpy(stream->dd_dir.d_name, entry_start);
> 
> g_path_get_basename() ? :)

For above three comments:
This code is not good, should be fixed.
The code want to filter the following cases:
The parent directory path is not a part of entry's full path: 
Parent: C:\123\456, entry: C:\123, C:\
Entry contains more than one name components:
Parent: C:\123\456, entry: C:\123\456\789\abc
Entry is zero length or name buffer is too long

I will refactor this part.

> 
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * opendir_win32 - open a directory
> > + *
> > + * This function opens a directory and caches all directory entries.
> 
> It just caches all file IDs, doesn't it?
> 

Will fix it

> > + */
> > +DIR *opendir_win32(const char *full_file_name) {
> > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > +    char *full_dir_entry = NULL;
> > +    DWORD attribute;
> > +    intptr_t dd_handle = -1;
> > +    struct _finddata_t dd_data;
> > +    uint64_t file_id;
> > +    uint64_t *file_id_list = NULL;
> > +    BY_HANDLE_FILE_INFORMATION FileInfo;
> 
> FileInfo is the variable name, not a struct name, so no upper case for it
> please.

Will fix it.
> 
> > +    struct dir_win32 *stream = NULL;
> > +    int err = 0;
> > +    int find_status;
> > +    int sort_first_two_entry = 0;
> > +    uint32_t list_count = 16;
> 
> Magic number 16?

Will change it to a macro.
> 
> > +    uint32_t index = 0;
> > +
> > +    /* open directory to prevent it being removed */
> > +
> > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> FILE_SHARE_DELETE,
> > +                      NULL,
> > +                      OPEN_EXISTING,
> > +                      FILE_FLAG_BACKUP_SEMANTICS |
> FILE_FLAG_OPEN_REPARSE_POINT,
> > +                      NULL);
> > +
> > +    if (hDir == INVALID_HANDLE_VALUE) {
> > +        err = win32_error_to_posix(GetLastError());
> > +        goto out;
> > +    }
> > +
> > +    attribute = GetFileAttributes(full_file_name);
> > +
> > +    /* symlink is not allow */
> > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > +        err = EACCES;
> > +        goto out;
> > +    }
> > +
> > +    /* check if it is a directory */
> > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > +        err = ENOTDIR;
> > +        goto out;
> > +    }
> > +
> > +    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
> > +
> > +    /*
> > +     * findfirst() needs suffix format name like "\dir1\dir2\*",
> > +     * allocate more buffer to store suffix.
> > +     */
> > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > + strlen(full_file_name) + 3);
> 
> Not that I would care much, but +2 would be correct here, as you declared the
> struct with one character already, so it is not a classic (zero size) flex
> array:
> 
>   struct dir_win32 {
>     ...
>     char dd_name[1];
>   };
> 
Will fix it.

> > +
> > +    strcpy(stream->dd_name, full_file_name);
> > +    strcat(stream->dd_name, "\\*");
> > +
> > +    stream->hDir = hDir;
> > +    stream->dir_name_len = strlen(full_file_name);
> > +
> > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > +
> > +    if (dd_handle == -1) {
> > +        err = errno;
> > +        goto out;
> > +    }
> > +
> > +    /* read all entries to link list */
> 
> "read all entries as a linked list"
> 
> However there is no linked list here. It seems to be an array.

Will fix it.
> 
> > +    do {
> > +        full_dir_entry = get_full_path_win32(hDir, dd_data.name);
> > +
> > +        if (full_dir_entry == NULL) {
> > +            err = ENOMEM;
> > +            break;
> > +        }
> > +
> > +        /*
> > +         * Open every entry and get the file informations.
> > +         *
> > +         * Skip symbolic links during reading directory.
> > +         */
> > +        hDirEntry = CreateFile(full_dir_entry,
> > +                               GENERIC_READ,
> > +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> > +                               | FILE_SHARE_DELETE,
> > +                               NULL,
> > +                               OPEN_EXISTING,
> > +                               FILE_FLAG_BACKUP_SEMANTICS
> > +                               | FILE_FLAG_OPEN_REPARSE_POINT, NULL);
> > +
> > +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> > +            if (GetFileInformationByHandle(hDirEntry,
> > +                                           &FileInfo) == TRUE) {
> > +                attribute = FileInfo.dwFileAttributes;
> > +
> > +                /* only save validate entries */
> > +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> > +                    if (index >= list_count) {
> > +                        list_count = list_count + 16;
> 
> Magic number 16 again.
> 
> > +                        file_id_list = g_realloc(file_id_list,
> > +                                                 sizeof(uint64_t)
> > +                                                 * list_count);
> 
> OK, so here we are finally at the point where you chose the overall behaviour
> for this that we discussed before.
> 
> So you are constantly appending 16 entry chunks to the end of the array,
> periodically reallocate the entire array, and potentially end up with one
> giant dense array with *all* file IDs of the directory.
> 
> That's not really what I had in mind, as it still has the potential to easily
> crash QEMU if there are large directories on host. Theoretically a Windows
> directory might then consume up to 16 GB of RAM for looking up only one
> single directory.
> 
> So is this the implementation that you said was very slow, or did you test a
> different one? Remember, my orgiginal idea (as starting point for Windows)
> was to only cache *one* file ID (the last being looked up). That's it. Not a
> list of file IDs.

If only cache one file ID, that means for every read directory operation.
we need to look up whole directory to find out the next ID larger than last cached one.

I provided some performance test in last patch:
Run test for read directory with 100, 1000, 10000 entries
#1, For file name cache solution, the time cost is: 2, 9, 44 (in ms).
#2, For file id cache solution, the time cost: 3, 438, 4338 (in ms). This is current solution.
#3, for cache one id solution, I just tested it: 4, 4788, more than one minutes (in ms)

I think it is not a good idea to cache one file id, it would be very bad performance

> 
> > +                    }
> > +                    file_id = (uint64_t)FileInfo.nFileIndexLow
> > +                              + (((uint64_t)FileInfo.nFileIndexHigh)
> > + << 32);
> > +
> > +
> > +                    file_id_list[index] = file_id;
> > +
> > +                    if (strcmp(dd_data.name, ".") == 0) {
> > +                        stream->dot_id = file_id_list[index];
> > +                        if (index != 0) {
> > +                            sort_first_two_entry = 1;
> > +                        }
> > +                    } else if (strcmp(dd_data.name, "..") == 0) {
> > +                        stream->dot_dot_id = file_id_list[index];
> > +                        if (index != 1) {
> > +                            sort_first_two_entry = 1;
> > +                        }
> > +                    }
> > +                    index++;
> > +                }
> > +            }
> > +            CloseHandle(hDirEntry);
> > +        }
> > +        g_free(full_dir_entry);
> > +        find_status = _findnext(dd_handle, &dd_data);
> > +    } while (find_status == 0);
> > +
> > +    if (errno == ENOENT) {
> > +        /* No more matching files could be found, clean errno */
> > +        errno = 0;
> > +    } else {
> > +        err = errno;
> > +        goto out;
> > +    }
> > +
> > +    stream->total_entries = index;
> > +    stream->file_id_list = file_id_list;
> > +
> > +    if (sort_first_two_entry == 0) {
> > +        /*
> > +         * If the first two entry is "." and "..", then do not sort them.
> > +         *
> > +         * If the guest OS always considers first two entries are "." and
> "..",
> > +         * sort the two entries may cause confused display in guest OS.
> > +         */
> > +        qsort(&file_id_list[2], index - 2, sizeof(file_id),
> file_id_compare);
> > +    } else {
> > +        qsort(&file_id_list[0], index, sizeof(file_id), file_id_compare);
> > +    }
> 
> Were there cases where you did not get "." and ".." ?

NTFS always provides "." and "..".
I could add more checks here to fix this risk

> 
> > +
> > +out:
> > +    if (err != 0) {
> > +        errno = err;
> > +        if (stream != NULL) {
> > +            if (file_id_list != NULL) {
> > +                g_free(file_id_list);
> > +            }
> > +            CloseHandle(hDir);
> > +            g_free(stream);
> > +            stream = NULL;
> > +        }
> > +    }
> > +
> > +    if (dd_handle != -1) {
> > +        _findclose(dd_handle);
> > +    }
> > +
> > +    return (DIR *)stream;
> > +}
> > +
> > +/*
> > + * closedir_win32 - close a directory
> > + *
> > + * This function closes directory and free all cached resources.
> > + */
> > +int closedir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return -1;
> > +    }
> > +
> > +    /* free all resources */
> > +    CloseHandle(stream->hDir);
> > +
> > +    g_free(stream->file_id_list);
> > +
> > +    g_free(stream);
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * readdir_win32 - read a directory
> > + *
> > + * This function reads a directory entry from cached entry list.
> > + */
> > +struct dirent *readdir_win32(DIR *pDir) {
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return NULL;
> > +    }
> > +
> > +retry:
> > +
> > +    if (stream->offset >= stream->total_entries) {
> > +        /* reach to the end, return NULL without set errno */
> > +        return NULL;
> > +    }
> > +
> > +    if (get_next_entry(stream) != 0) {
> > +        stream->offset++;
> > +        goto retry;
> > +    }
> > +
> > +    /* Windows does not provide inode number */
> > +    stream->dd_dir.d_ino = 0;
> > +    stream->dd_dir.d_reclen = 0;
> > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > +
> > +    stream->offset++;
> > +
> > +    return &stream->dd_dir;
> > +}
> > +
> > +/*
> > + * rewinddir_win32 - reset directory stream
> > + *
> > + * This function resets the position of the directory stream to the
> > + * beginning of the directory.
> > + */
> > +void rewinddir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return;
> > +    }
> > +
> > +    stream->offset = 0;
> > +
> > +    return;
> > +}
> > +
> > +/*
> > + * seekdir_win32 - set the position of the next readdir() call in the
> > +directory
> > + *
> > + * This function sets the position of the next readdir() call in the
> > +directory
> > + * from which the next readdir() call will start.
> > + */
> > +void seekdir_win32(DIR *pDir, long pos) {
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return;
> > +    }
> > +
> > +    if (pos < -1) {
> > +        errno = EINVAL;
> > +        return;
> > +    }
> > +
> > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > +        /* seek to the end */
> > +        stream->offset = stream->total_entries;
> > +        return;
> > +    }
> > +
> > +    if (pos - (long)stream->offset == 0) {
> > +        /* no need to seek */
> > +        return;
> > +    }
> > +
> > +    stream->offset = pos;
> > +
> > +    return;
> > +}
> > +
> > +/*
> > + * telldir_win32 - return current location in directory
> > + *
> > + * This function returns current location in directory.
> > + */
> > +long telldir_win32(DIR *pDir)
> > +{
> > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > +
> > +    if (stream == NULL) {
> > +        errno = EBADF;
> > +        return -1;
> > +    }
> > +
> > +    if (stream->offset > stream->total_entries) {
> > +        return -1;
> > +    }
> > +
> > +    return (long)stream->offset;
> > +}
> >
>
Christian Schoenebeck March 16, 2023, 11:05 a.m. UTC | #3
On Wednesday, March 15, 2023 8:05:34 PM CET Shi, Guohuai wrote:
> 
> > -----Original Message-----
> > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > Sent: Wednesday, March 15, 2023 00:06
> > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > <Bin.Meng@windriver.com>
> > Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific xxxdir()
> > APIs
> > 
> > CAUTION: This email comes from a non Wind River email account!
> > Do not click links or open attachments unless you recognize the sender and
> > know the content is safe.
> > 
> > On Monday, February 20, 2023 11:08:03 AM CET Bin Meng wrote:
> > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > >
> > > This commit implements Windows specific xxxdir() APIs for safety
> > > directory access.
> > 
> > That comment is seriously too short for this patch.
> > 
> > 1. You should describe the behaviour implementation that you have chosen and
> > why you have chosen it.
> > 
> > 2. Like already said in the previous version of the patch, you should place a
> > link to the discussion we had on this issue.
> > 
> > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > ---
> > >
> > >  hw/9pfs/9p-util.h       |   6 +
> > >  hw/9pfs/9p-util-win32.c | 443
> > > ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 449 insertions(+)
> > >
> > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > 0f159fb4ce..c1c251fbd1 100644
> > > --- a/hw/9pfs/9p-util.h
> > > +++ b/hw/9pfs/9p-util.h
> > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > *pathname, int flags);  int statfs_win32(const char *root_path, struct
> > > statfs *stbuf);  int openat_dir(int dirfd, const char *name);  int
> > > openat_file(int dirfd, const char *name, int flags, mode_t mode);
> > > +DIR *opendir_win32(const char *full_file_name); int
> > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR *pDir);
> > > +void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR *pDir, long
> > > +pos); long telldir_win32(DIR *pDir);
> > >  #endif
> > >
> > >  static inline void close_preserve_errno(int fd) diff --git
> > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > a99d579a06..e9408f3c45 100644
> > > --- a/hw/9pfs/9p-util-win32.c
> > > +++ b/hw/9pfs/9p-util-win32.c
> > > @@ -37,6 +37,16 @@
> > >   *    Windows does not support opendir, the directory fd is created by
> > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open
> > will
> > >   *    lock and protect the directory (can not be modified or replaced)
> > > + *
> > > + * 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible
> > API for
> > > + *    acquiring directory entries in a safe way. Calling those APIs
> > (native
> > > + *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > + *    telldir()) directly can lead to an inconsistent state if directory
> > is
> > > + *    modified in between, e.g. the same directory appearing more than
> > once
> > > + *    in output, or directories not appearing at all in output even though
> > they
> > > + *    were neither newly created nor deleted. POSIX does not define what
> > happens
> > > + *    with deleted or newly created directories in between, but it
> > guarantees a
> > > + *    consistent state.
> > >   */
> > >
> > >  #include "qemu/osdep.h"
> > > @@ -51,6 +61,25 @@
> > >
> > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > >
> > > +/*
> > > + * MinGW and Windows does not provide a safe way to seek directory
> > > +while other
> > > + * thread is modifying the same directory.
> > > + *
> > > + * This structure is used to store sorted file id and ensure
> > > +directory seek
> > > + * consistency.
> > > + */
> > > +struct dir_win32 {
> > > +    struct dirent dd_dir;
> > > +    uint32_t offset;
> > > +    uint32_t total_entries;
> > > +    HANDLE hDir;
> > > +    uint32_t dir_name_len;
> > > +    uint64_t dot_id;
> > > +    uint64_t dot_dot_id;
> > > +    uint64_t *file_id_list;
> > > +    char dd_name[1];
> > > +};
> > > +
> > >  /*
> > >   * win32_error_to_posix - convert Win32 error to POSIX error number
> > >   *
> > > @@ -977,3 +1006,417 @@ int qemu_mknodat(int dirfd, const char *filename,
> > mode_t mode, dev_t dev)
> > >      errno = ENOTSUP;
> > >      return -1;
> > >  }
> > > +
> > > +static int file_id_compare(const void *id_ptr1, const void *id_ptr2)
> > > +{
> > > +    uint64_t id[2];
> > > +
> > > +    id[0] = *(uint64_t *)id_ptr1;
> > > +    id[1] = *(uint64_t *)id_ptr2;
> > > +
> > > +    if (id[0] > id[1]) {
> > > +        return 1;
> > > +    } else if (id[0] < id[1]) {
> > > +        return -1;
> > > +    } else {
> > > +        return 0;
> > > +    }
> > > +}
> > > +
> > > +static int get_next_entry(struct dir_win32 *stream) {
> > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > +    char *entry_name;
> > > +    char *entry_start;
> > > +    FILE_ID_DESCRIPTOR fid;
> > > +    DWORD attribute;
> > > +
> > > +    if (stream->file_id_list[stream->offset] == stream->dot_id) {
> > > +        strcpy(stream->dd_dir.d_name, ".");
> > > +        return 0;
> > > +    }
> > > +
> > > +    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
> > > +        strcpy(stream->dd_dir.d_name, "..");
> > > +        return 0;
> > > +    }
> > > +
> > > +    fid.dwSize = sizeof(fid);
> > > +    fid.Type = FileIdType;
> > > +
> > > +    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
> > > +
> > > +    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
> > > +                             FILE_SHARE_READ | FILE_SHARE_WRITE
> > > +                             | FILE_SHARE_DELETE,
> > > +                             NULL,
> > > +                             FILE_FLAG_BACKUP_SEMANTICS
> > > +                             | FILE_FLAG_OPEN_REPARSE_POINT);
> > 
> > What's the purpose of FILE_FLAG_OPEN_REPARSE_POINT here? As it's apparently
> > not obvious, please add a comment.
> > 
> 
> If do not use this flag, and if file id is a symbolic link, then Windows will not symbolic link itself, but open the target file.
> This flag is similar as O_NOFOLLOW flag.

OK, got it, thanks! But please add a comment in code that describes this.

> > > +
> > > +    if (hDirEntry == INVALID_HANDLE_VALUE) {
> > > +        /*
> > > +         * Not open it successfully, it may be deleted.
> > 
> > Wrong English. "Open failed, it may have been deleted in the meantime.".
> > 
> > > +         * Try next id.
> > > +         */
> > > +        return -1;
> > > +    }
> > > +
> > > +    entry_name = get_full_path_win32(hDirEntry, NULL);
> > > +
> > > +    CloseHandle(hDirEntry);
> > > +
> > > +    if (entry_name == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    attribute = GetFileAttributes(entry_name);
> > > +
> > > +    /* symlink is not allowed */
> > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > +        return -1;
> > 
> > Wouldn't it make sense to call warn_report_once() here to let the user know
> > that he has some symlinks that are never delivered to guest?
> 
> OK, Got it.
> 
> > 
> > > +    }
> > > +
> > > +    if (memcmp(entry_name, stream->dd_name, stream->dir_name_len) !=
> > > + 0) {
> > 
> > No, that's unsafe. You want to use something like strncmp() instead.
> > 
> > > +        /*
> > > +         * The full entry file name should be a part of parent directory
> > name,
> > > +         * except dot and dot_dot (is already handled).
> > > +         * If not, this entry should not be returned.
> > > +         */
> > > +        return -1;
> > > +    }
> > > +
> > > +    entry_start = entry_name + stream->dir_name_len;
> > 
> > s/entry_start/entry_basename/ ?
> > 
> > > +
> > > +    /* skip slash */
> > > +    while (*entry_start == '\\') {
> > > +        entry_start++;
> > > +    }
> > > +
> > > +    if (strchr(entry_start, '\\') != NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (strlen(entry_start) == 0
> > > +        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
> > > +        return -1;
> > > +    }
> > > +    strcpy(stream->dd_dir.d_name, entry_start);
> > 
> > g_path_get_basename() ? :)
> 
> For above three comments:
> This code is not good, should be fixed.
> The code want to filter the following cases:
> The parent directory path is not a part of entry's full path: 
> Parent: C:\123\456, entry: C:\123, C:\
> Entry contains more than one name components:
> Parent: C:\123\456, entry: C:\123\456\789\abc
> Entry is zero length or name buffer is too long
> 
> I will refactor this part.

In general: writing parsing code yourself is extremely error prone. That's why
it makes sense to use existing functions from glib, etc.

> > 
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +/*
> > > + * opendir_win32 - open a directory
> > > + *
> > > + * This function opens a directory and caches all directory entries.
> > 
> > It just caches all file IDs, doesn't it?
> > 
> 
> Will fix it
> 
> > > + */
> > > +DIR *opendir_win32(const char *full_file_name) {
> > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > +    char *full_dir_entry = NULL;
> > > +    DWORD attribute;
> > > +    intptr_t dd_handle = -1;
> > > +    struct _finddata_t dd_data;
> > > +    uint64_t file_id;
> > > +    uint64_t *file_id_list = NULL;
> > > +    BY_HANDLE_FILE_INFORMATION FileInfo;
> > 
> > FileInfo is the variable name, not a struct name, so no upper case for it
> > please.
> 
> Will fix it.
> > 
> > > +    struct dir_win32 *stream = NULL;
> > > +    int err = 0;
> > > +    int find_status;
> > > +    int sort_first_two_entry = 0;
> > > +    uint32_t list_count = 16;
> > 
> > Magic number 16?
> 
> Will change it to a macro.
> > 
> > > +    uint32_t index = 0;
> > > +
> > > +    /* open directory to prevent it being removed */
> > > +
> > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > FILE_SHARE_DELETE,
> > > +                      NULL,
> > > +                      OPEN_EXISTING,
> > > +                      FILE_FLAG_BACKUP_SEMANTICS |
> > FILE_FLAG_OPEN_REPARSE_POINT,
> > > +                      NULL);
> > > +
> > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > +        err = win32_error_to_posix(GetLastError());
> > > +        goto out;
> > > +    }
> > > +
> > > +    attribute = GetFileAttributes(full_file_name);
> > > +
> > > +    /* symlink is not allow */
> > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > +        err = EACCES;
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* check if it is a directory */
> > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > +        err = ENOTDIR;
> > > +        goto out;
> > > +    }
> > > +
> > > +    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
> > > +
> > > +    /*
> > > +     * findfirst() needs suffix format name like "\dir1\dir2\*",
> > > +     * allocate more buffer to store suffix.
> > > +     */
> > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > + strlen(full_file_name) + 3);
> > 
> > Not that I would care much, but +2 would be correct here, as you declared the
> > struct with one character already, so it is not a classic (zero size) flex
> > array:
> > 
> >   struct dir_win32 {
> >     ...
> >     char dd_name[1];
> >   };
> > 
> Will fix it.
> 
> > > +
> > > +    strcpy(stream->dd_name, full_file_name);
> > > +    strcat(stream->dd_name, "\\*");
> > > +
> > > +    stream->hDir = hDir;
> > > +    stream->dir_name_len = strlen(full_file_name);
> > > +
> > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > +
> > > +    if (dd_handle == -1) {
> > > +        err = errno;
> > > +        goto out;
> > > +    }
> > > +
> > > +    /* read all entries to link list */
> > 
> > "read all entries as a linked list"
> > 
> > However there is no linked list here. It seems to be an array.
> 
> Will fix it.
> > 
> > > +    do {
> > > +        full_dir_entry = get_full_path_win32(hDir, dd_data.name);
> > > +
> > > +        if (full_dir_entry == NULL) {
> > > +            err = ENOMEM;
> > > +            break;
> > > +        }
> > > +
> > > +        /*
> > > +         * Open every entry and get the file informations.
> > > +         *
> > > +         * Skip symbolic links during reading directory.
> > > +         */
> > > +        hDirEntry = CreateFile(full_dir_entry,
> > > +                               GENERIC_READ,
> > > +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> > > +                               | FILE_SHARE_DELETE,
> > > +                               NULL,
> > > +                               OPEN_EXISTING,
> > > +                               FILE_FLAG_BACKUP_SEMANTICS
> > > +                               | FILE_FLAG_OPEN_REPARSE_POINT, NULL);
> > > +
> > > +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> > > +            if (GetFileInformationByHandle(hDirEntry,
> > > +                                           &FileInfo) == TRUE) {
> > > +                attribute = FileInfo.dwFileAttributes;
> > > +
> > > +                /* only save validate entries */
> > > +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> > > +                    if (index >= list_count) {
> > > +                        list_count = list_count + 16;
> > 
> > Magic number 16 again.
> > 
> > > +                        file_id_list = g_realloc(file_id_list,
> > > +                                                 sizeof(uint64_t)
> > > +                                                 * list_count);
> > 
> > OK, so here we are finally at the point where you chose the overall behaviour
> > for this that we discussed before.
> > 
> > So you are constantly appending 16 entry chunks to the end of the array,
> > periodically reallocate the entire array, and potentially end up with one
> > giant dense array with *all* file IDs of the directory.
> > 
> > That's not really what I had in mind, as it still has the potential to easily
> > crash QEMU if there are large directories on host. Theoretically a Windows
> > directory might then consume up to 16 GB of RAM for looking up only one
> > single directory.
> > 
> > So is this the implementation that you said was very slow, or did you test a
> > different one? Remember, my orgiginal idea (as starting point for Windows)
> > was to only cache *one* file ID (the last being looked up). That's it. Not a
> > list of file IDs.
> 
> If only cache one file ID, that means for every read directory operation.
> we need to look up whole directory to find out the next ID larger than last cached one.
> 
> I provided some performance test in last patch:
> Run test for read directory with 100, 1000, 10000 entries
> #1, For file name cache solution, the time cost is: 2, 9, 44 (in ms).
> #2, For file id cache solution, the time cost: 3, 438, 4338 (in ms). This is current solution.
> #3, for cache one id solution, I just tested it: 4, 4788, more than one minutes (in ms)
> 
> I think it is not a good idea to cache one file id, it would be very bad performance

Yes, the performce would be lousy, but at least we would have a basis that
just works^TM. Correct behaviour always comes before performance. And from
there you could add additional patches on top to address performance
improvements. Because the point is: your implementation is also suboptimal,
and more importantly: prone to crashes like we discussed before.

Regarding performance: for instance you are re-allocating an entire dense
buffer on every 16 new entries. That will slow down things extremely. Please
use a container from glib, because these are handling resize operations more
smoothly for you out of the box, i.e. typically by doubling the container
capacity instead of re-allocating frequently with small chunks like you did.

However I am still not convinced that allocating a huge dense buffer with
*all* file IDs of a directory makes sense.

On the long-term it would make sense to do it like other implementations:
store a snapshot of the directory temporarily on disk. That way it would not
matter how huge the directory is. But that's a complex implementation, so not
something that I would do in this series already.

On the short/mid term I think we could simply make a mix of your solution and
the one-ID solution that I suggested: keeping a maximum of e.g. 1k file IDs in
RAM. And once guest seeks past that boundary, loading the subsequent 1k
entries, free-ing the previous 1k entries, and so on.

> > 
> > > +                    }
> > > +                    file_id = (uint64_t)FileInfo.nFileIndexLow
> > > +                              + (((uint64_t)FileInfo.nFileIndexHigh)
> > > + << 32);
> > > +
> > > +
> > > +                    file_id_list[index] = file_id;
> > > +
> > > +                    if (strcmp(dd_data.name, ".") == 0) {
> > > +                        stream->dot_id = file_id_list[index];
> > > +                        if (index != 0) {
> > > +                            sort_first_two_entry = 1;
> > > +                        }
> > > +                    } else if (strcmp(dd_data.name, "..") == 0) {
> > > +                        stream->dot_dot_id = file_id_list[index];
> > > +                        if (index != 1) {
> > > +                            sort_first_two_entry = 1;
> > > +                        }
> > > +                    }
> > > +                    index++;
> > > +                }
> > > +            }
> > > +            CloseHandle(hDirEntry);
> > > +        }
> > > +        g_free(full_dir_entry);
> > > +        find_status = _findnext(dd_handle, &dd_data);
> > > +    } while (find_status == 0);
> > > +
> > > +    if (errno == ENOENT) {
> > > +        /* No more matching files could be found, clean errno */
> > > +        errno = 0;
> > > +    } else {
> > > +        err = errno;
> > > +        goto out;
> > > +    }
> > > +
> > > +    stream->total_entries = index;
> > > +    stream->file_id_list = file_id_list;
> > > +
> > > +    if (sort_first_two_entry == 0) {
> > > +        /*
> > > +         * If the first two entry is "." and "..", then do not sort them.
> > > +         *
> > > +         * If the guest OS always considers first two entries are "." and
> > "..",
> > > +         * sort the two entries may cause confused display in guest OS.
> > > +         */
> > > +        qsort(&file_id_list[2], index - 2, sizeof(file_id),
> > file_id_compare);
> > > +    } else {
> > > +        qsort(&file_id_list[0], index, sizeof(file_id), file_id_compare);
> > > +    }
> > 
> > Were there cases where you did not get "." and ".." ?
> 
> NTFS always provides "." and "..".
> I could add more checks here to fix this risk

That's what I assumed. So you can probably just drop this code for simplicity.

> 
> > 
> > > +
> > > +out:
> > > +    if (err != 0) {
> > > +        errno = err;
> > > +        if (stream != NULL) {
> > > +            if (file_id_list != NULL) {
> > > +                g_free(file_id_list);
> > > +            }
> > > +            CloseHandle(hDir);
> > > +            g_free(stream);
> > > +            stream = NULL;
> > > +        }
> > > +    }
> > > +
> > > +    if (dd_handle != -1) {
> > > +        _findclose(dd_handle);
> > > +    }
> > > +
> > > +    return (DIR *)stream;
> > > +}
> > > +
> > > +/*
> > > + * closedir_win32 - close a directory
> > > + *
> > > + * This function closes directory and free all cached resources.
> > > + */
> > > +int closedir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return -1;
> > > +    }
> > > +
> > > +    /* free all resources */
> > > +    CloseHandle(stream->hDir);
> > > +
> > > +    g_free(stream->file_id_list);
> > > +
> > > +    g_free(stream);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +/*
> > > + * readdir_win32 - read a directory
> > > + *
> > > + * This function reads a directory entry from cached entry list.
> > > + */
> > > +struct dirent *readdir_win32(DIR *pDir) {
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return NULL;
> > > +    }
> > > +
> > > +retry:
> > > +
> > > +    if (stream->offset >= stream->total_entries) {
> > > +        /* reach to the end, return NULL without set errno */
> > > +        return NULL;
> > > +    }
> > > +
> > > +    if (get_next_entry(stream) != 0) {
> > > +        stream->offset++;
> > > +        goto retry;
> > > +    }
> > > +
> > > +    /* Windows does not provide inode number */
> > > +    stream->dd_dir.d_ino = 0;
> > > +    stream->dd_dir.d_reclen = 0;
> > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > +
> > > +    stream->offset++;
> > > +
> > > +    return &stream->dd_dir;
> > > +}
> > > +
> > > +/*
> > > + * rewinddir_win32 - reset directory stream
> > > + *
> > > + * This function resets the position of the directory stream to the
> > > + * beginning of the directory.
> > > + */
> > > +void rewinddir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return;
> > > +    }
> > > +
> > > +    stream->offset = 0;
> > > +
> > > +    return;
> > > +}
> > > +
> > > +/*
> > > + * seekdir_win32 - set the position of the next readdir() call in the
> > > +directory
> > > + *
> > > + * This function sets the position of the next readdir() call in the
> > > +directory
> > > + * from which the next readdir() call will start.
> > > + */
> > > +void seekdir_win32(DIR *pDir, long pos) {
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos < -1) {
> > > +        errno = EINVAL;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > +        /* seek to the end */
> > > +        stream->offset = stream->total_entries;
> > > +        return;
> > > +    }
> > > +
> > > +    if (pos - (long)stream->offset == 0) {
> > > +        /* no need to seek */
> > > +        return;
> > > +    }
> > > +
> > > +    stream->offset = pos;
> > > +
> > > +    return;
> > > +}
> > > +
> > > +/*
> > > + * telldir_win32 - return current location in directory
> > > + *
> > > + * This function returns current location in directory.
> > > + */
> > > +long telldir_win32(DIR *pDir)
> > > +{
> > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > +
> > > +    if (stream == NULL) {
> > > +        errno = EBADF;
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (stream->offset > stream->total_entries) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    return (long)stream->offset;
> > > +}
> > >
> > 
> 
> 
>
Shi, Guohuai March 16, 2023, 5:28 p.m. UTC | #4
> -----Original Message-----
> From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> Sent: Thursday, March 16, 2023 19:05
> To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Meng, Bin <Bin.Meng@windriver.com>; Shi, Guohuai
> <Guohuai.Shi@windriver.com>
> Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and
> know the content is safe.
> 
> On Wednesday, March 15, 2023 8:05:34 PM CET Shi, Guohuai wrote:
> >
> > > -----Original Message-----
> > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > Sent: Wednesday, March 15, 2023 00:06
> > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > > <Bin.Meng@windriver.com>
> > > Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific
> > > xxxdir() APIs
> > >
> > > CAUTION: This email comes from a non Wind River email account!
> > > Do not click links or open attachments unless you recognize the
> > > sender and know the content is safe.
> > >
> > > On Monday, February 20, 2023 11:08:03 AM CET Bin Meng wrote:
> > > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > > >
> > > > This commit implements Windows specific xxxdir() APIs for safety
> > > > directory access.
> > >
> > > That comment is seriously too short for this patch.
> > >
> > > 1. You should describe the behaviour implementation that you have
> > > chosen and why you have chosen it.
> > >
> > > 2. Like already said in the previous version of the patch, you
> > > should place a link to the discussion we had on this issue.
> > >
> > > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > > ---
> > > >
> > > >  hw/9pfs/9p-util.h       |   6 +
> > > >  hw/9pfs/9p-util-win32.c | 443
> > > > ++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 449 insertions(+)
> > > >
> > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > > 0f159fb4ce..c1c251fbd1 100644
> > > > --- a/hw/9pfs/9p-util.h
> > > > +++ b/hw/9pfs/9p-util.h
> > > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > > *pathname, int flags);  int statfs_win32(const char *root_path,
> > > > struct statfs *stbuf);  int openat_dir(int dirfd, const char
> > > > *name);  int openat_file(int dirfd, const char *name, int flags,
> > > > mode_t mode);
> > > > +DIR *opendir_win32(const char *full_file_name); int
> > > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR
> > > > +*pDir); void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR
> > > > +*pDir, long pos); long telldir_win32(DIR *pDir);
> > > >  #endif
> > > >
> > > >  static inline void close_preserve_errno(int fd) diff --git
> > > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > > a99d579a06..e9408f3c45 100644
> > > > --- a/hw/9pfs/9p-util-win32.c
> > > > +++ b/hw/9pfs/9p-util-win32.c
> > > > @@ -37,6 +37,16 @@
> > > >   *    Windows does not support opendir, the directory fd is created by
> > > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd
> open
> > > will
> > > >   *    lock and protect the directory (can not be modified or replaced)
> > > > + *
> > > > + * 5. Neither Windows native APIs, nor MinGW provide a POSIX
> > > > + compatible
> > > API for
> > > > + *    acquiring directory entries in a safe way. Calling those APIs
> > > (native
> > > > + *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > > + *    telldir()) directly can lead to an inconsistent state if
> directory
> > > is
> > > > + *    modified in between, e.g. the same directory appearing more than
> > > once
> > > > + *    in output, or directories not appearing at all in output even
> though
> > > they
> > > > + *    were neither newly created nor deleted. POSIX does not define
> what
> > > happens
> > > > + *    with deleted or newly created directories in between, but it
> > > guarantees a
> > > > + *    consistent state.
> > > >   */
> > > >
> > > >  #include "qemu/osdep.h"
> > > > @@ -51,6 +61,25 @@
> > > >
> > > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > > >
> > > > +/*
> > > > + * MinGW and Windows does not provide a safe way to seek
> > > > +directory while other
> > > > + * thread is modifying the same directory.
> > > > + *
> > > > + * This structure is used to store sorted file id and ensure
> > > > +directory seek
> > > > + * consistency.
> > > > + */
> > > > +struct dir_win32 {
> > > > +    struct dirent dd_dir;
> > > > +    uint32_t offset;
> > > > +    uint32_t total_entries;
> > > > +    HANDLE hDir;
> > > > +    uint32_t dir_name_len;
> > > > +    uint64_t dot_id;
> > > > +    uint64_t dot_dot_id;
> > > > +    uint64_t *file_id_list;
> > > > +    char dd_name[1];
> > > > +};
> > > > +
> > > >  /*
> > > >   * win32_error_to_posix - convert Win32 error to POSIX error number
> > > >   *
> > > > @@ -977,3 +1006,417 @@ int qemu_mknodat(int dirfd, const char
> > > > *filename,
> > > mode_t mode, dev_t dev)
> > > >      errno = ENOTSUP;
> > > >      return -1;
> > > >  }
> > > > +
> > > > +static int file_id_compare(const void *id_ptr1, const void
> > > > +*id_ptr2) {
> > > > +    uint64_t id[2];
> > > > +
> > > > +    id[0] = *(uint64_t *)id_ptr1;
> > > > +    id[1] = *(uint64_t *)id_ptr2;
> > > > +
> > > > +    if (id[0] > id[1]) {
> > > > +        return 1;
> > > > +    } else if (id[0] < id[1]) {
> > > > +        return -1;
> > > > +    } else {
> > > > +        return 0;
> > > > +    }
> > > > +}
> > > > +
> > > > +static int get_next_entry(struct dir_win32 *stream) {
> > > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > > +    char *entry_name;
> > > > +    char *entry_start;
> > > > +    FILE_ID_DESCRIPTOR fid;
> > > > +    DWORD attribute;
> > > > +
> > > > +    if (stream->file_id_list[stream->offset] == stream->dot_id) {
> > > > +        strcpy(stream->dd_dir.d_name, ".");
> > > > +        return 0;
> > > > +    }
> > > > +
> > > > +    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
> > > > +        strcpy(stream->dd_dir.d_name, "..");
> > > > +        return 0;
> > > > +    }
> > > > +
> > > > +    fid.dwSize = sizeof(fid);
> > > > +    fid.Type = FileIdType;
> > > > +
> > > > +    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
> > > > +
> > > > +    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
> > > > +                             FILE_SHARE_READ | FILE_SHARE_WRITE
> > > > +                             | FILE_SHARE_DELETE,
> > > > +                             NULL,
> > > > +                             FILE_FLAG_BACKUP_SEMANTICS
> > > > +                             | FILE_FLAG_OPEN_REPARSE_POINT);
> > >
> > > What's the purpose of FILE_FLAG_OPEN_REPARSE_POINT here? As it's
> > > apparently not obvious, please add a comment.
> > >
> >
> > If do not use this flag, and if file id is a symbolic link, then Windows
> will not symbolic link itself, but open the target file.
> > This flag is similar as O_NOFOLLOW flag.
> 
> OK, got it, thanks! But please add a comment in code that describes this.
> 
> > > > +
> > > > +    if (hDirEntry == INVALID_HANDLE_VALUE) {
> > > > +        /*
> > > > +         * Not open it successfully, it may be deleted.
> > >
> > > Wrong English. "Open failed, it may have been deleted in the meantime.".
> > >
> > > > +         * Try next id.
> > > > +         */
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    entry_name = get_full_path_win32(hDirEntry, NULL);
> > > > +
> > > > +    CloseHandle(hDirEntry);
> > > > +
> > > > +    if (entry_name == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    attribute = GetFileAttributes(entry_name);
> > > > +
> > > > +    /* symlink is not allowed */
> > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > +        return -1;
> > >
> > > Wouldn't it make sense to call warn_report_once() here to let the
> > > user know that he has some symlinks that are never delivered to guest?
> >
> > OK, Got it.
> >
> > >
> > > > +    }
> > > > +
> > > > +    if (memcmp(entry_name, stream->dd_name, stream->dir_name_len)
> > > > + !=
> > > > + 0) {
> > >
> > > No, that's unsafe. You want to use something like strncmp() instead.
> > >
> > > > +        /*
> > > > +         * The full entry file name should be a part of parent
> > > > + directory
> > > name,
> > > > +         * except dot and dot_dot (is already handled).
> > > > +         * If not, this entry should not be returned.
> > > > +         */
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    entry_start = entry_name + stream->dir_name_len;
> > >
> > > s/entry_start/entry_basename/ ?
> > >
> > > > +
> > > > +    /* skip slash */
> > > > +    while (*entry_start == '\\') {
> > > > +        entry_start++;
> > > > +    }
> > > > +
> > > > +    if (strchr(entry_start, '\\') != NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (strlen(entry_start) == 0
> > > > +        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
> > > > +        return -1;
> > > > +    }
> > > > +    strcpy(stream->dd_dir.d_name, entry_start);
> > >
> > > g_path_get_basename() ? :)
> >
> > For above three comments:
> > This code is not good, should be fixed.
> > The code want to filter the following cases:
> > The parent directory path is not a part of entry's full path:
> > Parent: C:\123\456, entry: C:\123, C:\ Entry contains more than one
> > name components:
> > Parent: C:\123\456, entry: C:\123\456\789\abc Entry is zero length or
> > name buffer is too long
> >
> > I will refactor this part.
> 
> In general: writing parsing code yourself is extremely error prone. That's
> why it makes sense to use existing functions from glib, etc.
> 
> > >
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * opendir_win32 - open a directory
> > > > + *
> > > > + * This function opens a directory and caches all directory entries.
> > >
> > > It just caches all file IDs, doesn't it?
> > >
> >
> > Will fix it
> >
> > > > + */
> > > > +DIR *opendir_win32(const char *full_file_name) {
> > > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > > +    char *full_dir_entry = NULL;
> > > > +    DWORD attribute;
> > > > +    intptr_t dd_handle = -1;
> > > > +    struct _finddata_t dd_data;
> > > > +    uint64_t file_id;
> > > > +    uint64_t *file_id_list = NULL;
> > > > +    BY_HANDLE_FILE_INFORMATION FileInfo;
> > >
> > > FileInfo is the variable name, not a struct name, so no upper case
> > > for it please.
> >
> > Will fix it.
> > >
> > > > +    struct dir_win32 *stream = NULL;
> > > > +    int err = 0;
> > > > +    int find_status;
> > > > +    int sort_first_two_entry = 0;
> > > > +    uint32_t list_count = 16;
> > >
> > > Magic number 16?
> >
> > Will change it to a macro.
> > >
> > > > +    uint32_t index = 0;
> > > > +
> > > > +    /* open directory to prevent it being removed */
> > > > +
> > > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > > FILE_SHARE_DELETE,
> > > > +                      NULL,
> > > > +                      OPEN_EXISTING,
> > > > +                      FILE_FLAG_BACKUP_SEMANTICS |
> > > FILE_FLAG_OPEN_REPARSE_POINT,
> > > > +                      NULL);
> > > > +
> > > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > > +        err = win32_error_to_posix(GetLastError());
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    attribute = GetFileAttributes(full_file_name);
> > > > +
> > > > +    /* symlink is not allow */
> > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > +        err = EACCES;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /* check if it is a directory */
> > > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > > +        err = ENOTDIR;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
> > > > +
> > > > +    /*
> > > > +     * findfirst() needs suffix format name like "\dir1\dir2\*",
> > > > +     * allocate more buffer to store suffix.
> > > > +     */
> > > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > > + strlen(full_file_name) + 3);
> > >
> > > Not that I would care much, but +2 would be correct here, as you
> > > declared the struct with one character already, so it is not a
> > > classic (zero size) flex
> > > array:
> > >
> > >   struct dir_win32 {
> > >     ...
> > >     char dd_name[1];
> > >   };
> > >
> > Will fix it.
> >
> > > > +
> > > > +    strcpy(stream->dd_name, full_file_name);
> > > > +    strcat(stream->dd_name, "\\*");
> > > > +
> > > > +    stream->hDir = hDir;
> > > > +    stream->dir_name_len = strlen(full_file_name);
> > > > +
> > > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > > +
> > > > +    if (dd_handle == -1) {
> > > > +        err = errno;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    /* read all entries to link list */
> > >
> > > "read all entries as a linked list"
> > >
> > > However there is no linked list here. It seems to be an array.
> >
> > Will fix it.
> > >
> > > > +    do {
> > > > +        full_dir_entry = get_full_path_win32(hDir, dd_data.name);
> > > > +
> > > > +        if (full_dir_entry == NULL) {
> > > > +            err = ENOMEM;
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /*
> > > > +         * Open every entry and get the file informations.
> > > > +         *
> > > > +         * Skip symbolic links during reading directory.
> > > > +         */
> > > > +        hDirEntry = CreateFile(full_dir_entry,
> > > > +                               GENERIC_READ,
> > > > +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> > > > +                               | FILE_SHARE_DELETE,
> > > > +                               NULL,
> > > > +                               OPEN_EXISTING,
> > > > +                               FILE_FLAG_BACKUP_SEMANTICS
> > > > +                               | FILE_FLAG_OPEN_REPARSE_POINT,
> > > > + NULL);
> > > > +
> > > > +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> > > > +            if (GetFileInformationByHandle(hDirEntry,
> > > > +                                           &FileInfo) == TRUE) {
> > > > +                attribute = FileInfo.dwFileAttributes;
> > > > +
> > > > +                /* only save validate entries */
> > > > +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> > > > +                    if (index >= list_count) {
> > > > +                        list_count = list_count + 16;
> > >
> > > Magic number 16 again.
> > >
> > > > +                        file_id_list = g_realloc(file_id_list,
> > > > +                                                 sizeof(uint64_t)
> > > > +                                                 * list_count);
> > >
> > > OK, so here we are finally at the point where you chose the overall
> > > behaviour for this that we discussed before.
> > >
> > > So you are constantly appending 16 entry chunks to the end of the
> > > array, periodically reallocate the entire array, and potentially end
> > > up with one giant dense array with *all* file IDs of the directory.
> > >
> > > That's not really what I had in mind, as it still has the potential
> > > to easily crash QEMU if there are large directories on host.
> > > Theoretically a Windows directory might then consume up to 16 GB of
> > > RAM for looking up only one single directory.
> > >
> > > So is this the implementation that you said was very slow, or did
> > > you test a different one? Remember, my orgiginal idea (as starting
> > > point for Windows) was to only cache *one* file ID (the last being
> > > looked up). That's it. Not a list of file IDs.
> >
> > If only cache one file ID, that means for every read directory operation.
> > we need to look up whole directory to find out the next ID larger than last
> cached one.
> >
> > I provided some performance test in last patch:
> > Run test for read directory with 100, 1000, 10000 entries #1, For file
> > name cache solution, the time cost is: 2, 9, 44 (in ms).
> > #2, For file id cache solution, the time cost: 3, 438, 4338 (in ms). This
> is current solution.
> > #3, for cache one id solution, I just tested it: 4, 4788, more than
> > one minutes (in ms)
> >
> > I think it is not a good idea to cache one file id, it would be very
> > bad performance
> 
> Yes, the performce would be lousy, but at least we would have a basis that
> just works^TM. Correct behaviour always comes before performance. And from
> there you could add additional patches on top to address performance
> improvements. Because the point is: your implementation is also suboptimal,
> and more importantly: prone to crashes like we discussed before.
> 
> Regarding performance: for instance you are re-allocating an entire dense
> buffer on every 16 new entries. That will slow down things extremely. Please
> use a container from glib, because these are handling resize operations more
> smoothly for you out of the box, i.e. typically by doubling the container
> capacity instead of re-allocating frequently with small chunks like you did.
> 
> However I am still not convinced that allocating a huge dense buffer with
> *all* file IDs of a directory makes sense.
> 
> On the long-term it would make sense to do it like other implementations:
> store a snapshot of the directory temporarily on disk. That way it would not
> matter how huge the directory is. But that's a complex implementation, so not
> something that I would do in this series already.
> 
> On the short/mid term I think we could simply make a mix of your solution and
> the one-ID solution that I suggested: keeping a maximum of e.g. 1k file IDs
> in RAM. And once guest seeks past that boundary, loading the subsequent 1k
> entries, free-ing the previous 1k entries, and so on.
> 

Please note that the performance data is tested in native OS, but not in QEMU.
It is even worse in QEMU.

I run Linux guest OS on Windows host, use "ls -l" command to list a directory with about 100 entries.
"ls -l" command need about 0.5 second to display one directory entry.

Caching only one node (file id, or file name, or others) will make 9pfs not usable: listing 100 directory entries need 50 seconds in guest OS.

> > >
> > > > +                    }
> > > > +                    file_id = (uint64_t)FileInfo.nFileIndexLow
> > > > +                              +
> > > > + (((uint64_t)FileInfo.nFileIndexHigh)
> > > > + << 32);
> > > > +
> > > > +
> > > > +                    file_id_list[index] = file_id;
> > > > +
> > > > +                    if (strcmp(dd_data.name, ".") == 0) {
> > > > +                        stream->dot_id = file_id_list[index];
> > > > +                        if (index != 0) {
> > > > +                            sort_first_two_entry = 1;
> > > > +                        }
> > > > +                    } else if (strcmp(dd_data.name, "..") == 0) {
> > > > +                        stream->dot_dot_id = file_id_list[index];
> > > > +                        if (index != 1) {
> > > > +                            sort_first_two_entry = 1;
> > > > +                        }
> > > > +                    }
> > > > +                    index++;
> > > > +                }
> > > > +            }
> > > > +            CloseHandle(hDirEntry);
> > > > +        }
> > > > +        g_free(full_dir_entry);
> > > > +        find_status = _findnext(dd_handle, &dd_data);
> > > > +    } while (find_status == 0);
> > > > +
> > > > +    if (errno == ENOENT) {
> > > > +        /* No more matching files could be found, clean errno */
> > > > +        errno = 0;
> > > > +    } else {
> > > > +        err = errno;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > > +    stream->total_entries = index;
> > > > +    stream->file_id_list = file_id_list;
> > > > +
> > > > +    if (sort_first_two_entry == 0) {
> > > > +        /*
> > > > +         * If the first two entry is "." and "..", then do not sort
> them.
> > > > +         *
> > > > +         * If the guest OS always considers first two entries are
> > > > + "." and
> > > "..",
> > > > +         * sort the two entries may cause confused display in guest
> OS.
> > > > +         */
> > > > +        qsort(&file_id_list[2], index - 2, sizeof(file_id),
> > > file_id_compare);
> > > > +    } else {
> > > > +        qsort(&file_id_list[0], index, sizeof(file_id),
> file_id_compare);
> > > > +    }
> > >
> > > Were there cases where you did not get "." and ".." ?
> >
> > NTFS always provides "." and "..".
> > I could add more checks here to fix this risk
> 
> That's what I assumed. So you can probably just drop this code for
> simplicity.
> 
> >
> > >
> > > > +
> > > > +out:
> > > > +    if (err != 0) {
> > > > +        errno = err;
> > > > +        if (stream != NULL) {
> > > > +            if (file_id_list != NULL) {
> > > > +                g_free(file_id_list);
> > > > +            }
> > > > +            CloseHandle(hDir);
> > > > +            g_free(stream);
> > > > +            stream = NULL;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    if (dd_handle != -1) {
> > > > +        _findclose(dd_handle);
> > > > +    }
> > > > +
> > > > +    return (DIR *)stream;
> > > > +}
> > > > +
> > > > +/*
> > > > + * closedir_win32 - close a directory
> > > > + *
> > > > + * This function closes directory and free all cached resources.
> > > > + */
> > > > +int closedir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    /* free all resources */
> > > > +    CloseHandle(stream->hDir);
> > > > +
> > > > +    g_free(stream->file_id_list);
> > > > +
> > > > +    g_free(stream);
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * readdir_win32 - read a directory
> > > > + *
> > > > + * This function reads a directory entry from cached entry list.
> > > > + */
> > > > +struct dirent *readdir_win32(DIR *pDir) {
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +retry:
> > > > +
> > > > +    if (stream->offset >= stream->total_entries) {
> > > > +        /* reach to the end, return NULL without set errno */
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +    if (get_next_entry(stream) != 0) {
> > > > +        stream->offset++;
> > > > +        goto retry;
> > > > +    }
> > > > +
> > > > +    /* Windows does not provide inode number */
> > > > +    stream->dd_dir.d_ino = 0;
> > > > +    stream->dd_dir.d_reclen = 0;
> > > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > > +
> > > > +    stream->offset++;
> > > > +
> > > > +    return &stream->dd_dir;
> > > > +}
> > > > +
> > > > +/*
> > > > + * rewinddir_win32 - reset directory stream
> > > > + *
> > > > + * This function resets the position of the directory stream to
> > > > +the
> > > > + * beginning of the directory.
> > > > + */
> > > > +void rewinddir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    stream->offset = 0;
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > > +/*
> > > > + * seekdir_win32 - set the position of the next readdir() call in
> > > > +the directory
> > > > + *
> > > > + * This function sets the position of the next readdir() call in
> > > > +the directory
> > > > + * from which the next readdir() call will start.
> > > > + */
> > > > +void seekdir_win32(DIR *pDir, long pos) {
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos < -1) {
> > > > +        errno = EINVAL;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > > +        /* seek to the end */
> > > > +        stream->offset = stream->total_entries;
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    if (pos - (long)stream->offset == 0) {
> > > > +        /* no need to seek */
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    stream->offset = pos;
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > > +/*
> > > > + * telldir_win32 - return current location in directory
> > > > + *
> > > > + * This function returns current location in directory.
> > > > + */
> > > > +long telldir_win32(DIR *pDir)
> > > > +{
> > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > +
> > > > +    if (stream == NULL) {
> > > > +        errno = EBADF;
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (stream->offset > stream->total_entries) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    return (long)stream->offset;
> > > > +}
> > > >
> > >
> >
> >
> >
> 
>
Shi, Guohuai March 17, 2023, 4:36 a.m. UTC | #5
> -----Original Message-----
> From: Shi, Guohuai
> Sent: Friday, March 17, 2023 01:28
> To: Christian Schoenebeck <qemu_oss@crudebyte.com>; Greg Kurz
> <groug@kaod.org>; qemu-devel@nongnu.org
> Cc: Meng, Bin <Bin.Meng@windriver.com>
> Subject: RE: [PATCH v5 04/16] hw/9pfs: Implement Windows specific xxxdir()
> APIs
> 
> 
> 
> > -----Original Message-----
> > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > Sent: Thursday, March 16, 2023 19:05
> > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > Cc: Meng, Bin <Bin.Meng@windriver.com>; Shi, Guohuai
> > <Guohuai.Shi@windriver.com>
> > Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific
> > xxxdir() APIs
> >
> > CAUTION: This email comes from a non Wind River email account!
> > Do not click links or open attachments unless you recognize the sender
> > and know the content is safe.
> >
> > On Wednesday, March 15, 2023 8:05:34 PM CET Shi, Guohuai wrote:
> > >
> > > > -----Original Message-----
> > > > From: Christian Schoenebeck <qemu_oss@crudebyte.com>
> > > > Sent: Wednesday, March 15, 2023 00:06
> > > > To: Greg Kurz <groug@kaod.org>; qemu-devel@nongnu.org
> > > > Cc: Shi, Guohuai <Guohuai.Shi@windriver.com>; Meng, Bin
> > > > <Bin.Meng@windriver.com>
> > > > Subject: Re: [PATCH v5 04/16] hw/9pfs: Implement Windows specific
> > > > xxxdir() APIs
> > > >
> > > > CAUTION: This email comes from a non Wind River email account!
> > > > Do not click links or open attachments unless you recognize the
> > > > sender and know the content is safe.
> > > >
> > > > On Monday, February 20, 2023 11:08:03 AM CET Bin Meng wrote:
> > > > > From: Guohuai Shi <guohuai.shi@windriver.com>
> > > > >
> > > > > This commit implements Windows specific xxxdir() APIs for safety
> > > > > directory access.
> > > >
> > > > That comment is seriously too short for this patch.
> > > >
> > > > 1. You should describe the behaviour implementation that you have
> > > > chosen and why you have chosen it.
> > > >
> > > > 2. Like already said in the previous version of the patch, you
> > > > should place a link to the discussion we had on this issue.
> > > >
> > > > > Signed-off-by: Guohuai Shi <guohuai.shi@windriver.com>
> > > > > Signed-off-by: Bin Meng <bin.meng@windriver.com>
> > > > > ---
> > > > >
> > > > >  hw/9pfs/9p-util.h       |   6 +
> > > > >  hw/9pfs/9p-util-win32.c | 443
> > > > > ++++++++++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 449 insertions(+)
> > > > >
> > > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index
> > > > > 0f159fb4ce..c1c251fbd1 100644
> > > > > --- a/hw/9pfs/9p-util.h
> > > > > +++ b/hw/9pfs/9p-util.h
> > > > > @@ -141,6 +141,12 @@ int unlinkat_win32(int dirfd, const char
> > > > > *pathname, int flags);  int statfs_win32(const char *root_path,
> > > > > struct statfs *stbuf);  int openat_dir(int dirfd, const char
> > > > > *name);  int openat_file(int dirfd, const char *name, int flags,
> > > > > mode_t mode);
> > > > > +DIR *opendir_win32(const char *full_file_name); int
> > > > > +closedir_win32(DIR *pDir); struct dirent *readdir_win32(DIR
> > > > > +*pDir); void rewinddir_win32(DIR *pDir); void seekdir_win32(DIR
> > > > > +*pDir, long pos); long telldir_win32(DIR *pDir);
> > > > >  #endif
> > > > >
> > > > >  static inline void close_preserve_errno(int fd) diff --git
> > > > > a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c index
> > > > > a99d579a06..e9408f3c45 100644
> > > > > --- a/hw/9pfs/9p-util-win32.c
> > > > > +++ b/hw/9pfs/9p-util-win32.c
> > > > > @@ -37,6 +37,16 @@
> > > > >   *    Windows does not support opendir, the directory fd is created by
> > > > >   *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd
> > open
> > > > will
> > > > >   *    lock and protect the directory (can not be modified or replaced)
> > > > > + *
> > > > > + * 5. Neither Windows native APIs, nor MinGW provide a POSIX
> > > > > + compatible
> > > > API for
> > > > > + *    acquiring directory entries in a safe way. Calling those APIs
> > > > (native
> > > > > + *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
> > > > > + *    telldir()) directly can lead to an inconsistent state if
> > directory
> > > > is
> > > > > + *    modified in between, e.g. the same directory appearing more
> than
> > > > once
> > > > > + *    in output, or directories not appearing at all in output even
> > though
> > > > they
> > > > > + *    were neither newly created nor deleted. POSIX does not define
> > what
> > > > happens
> > > > > + *    with deleted or newly created directories in between, but it
> > > > guarantees a
> > > > > + *    consistent state.
> > > > >   */
> > > > >
> > > > >  #include "qemu/osdep.h"
> > > > > @@ -51,6 +61,25 @@
> > > > >
> > > > >  #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
> > > > >
> > > > > +/*
> > > > > + * MinGW and Windows does not provide a safe way to seek
> > > > > +directory while other
> > > > > + * thread is modifying the same directory.
> > > > > + *
> > > > > + * This structure is used to store sorted file id and ensure
> > > > > +directory seek
> > > > > + * consistency.
> > > > > + */
> > > > > +struct dir_win32 {
> > > > > +    struct dirent dd_dir;
> > > > > +    uint32_t offset;
> > > > > +    uint32_t total_entries;
> > > > > +    HANDLE hDir;
> > > > > +    uint32_t dir_name_len;
> > > > > +    uint64_t dot_id;
> > > > > +    uint64_t dot_dot_id;
> > > > > +    uint64_t *file_id_list;
> > > > > +    char dd_name[1];
> > > > > +};
> > > > > +
> > > > >  /*
> > > > >   * win32_error_to_posix - convert Win32 error to POSIX error
> number
> > > > >   *
> > > > > @@ -977,3 +1006,417 @@ int qemu_mknodat(int dirfd, const char
> > > > > *filename,
> > > > mode_t mode, dev_t dev)
> > > > >      errno = ENOTSUP;
> > > > >      return -1;
> > > > >  }
> > > > > +
> > > > > +static int file_id_compare(const void *id_ptr1, const void
> > > > > +*id_ptr2) {
> > > > > +    uint64_t id[2];
> > > > > +
> > > > > +    id[0] = *(uint64_t *)id_ptr1;
> > > > > +    id[1] = *(uint64_t *)id_ptr2;
> > > > > +
> > > > > +    if (id[0] > id[1]) {
> > > > > +        return 1;
> > > > > +    } else if (id[0] < id[1]) {
> > > > > +        return -1;
> > > > > +    } else {
> > > > > +        return 0;
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +static int get_next_entry(struct dir_win32 *stream) {
> > > > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > > > +    char *entry_name;
> > > > > +    char *entry_start;
> > > > > +    FILE_ID_DESCRIPTOR fid;
> > > > > +    DWORD attribute;
> > > > > +
> > > > > +    if (stream->file_id_list[stream->offset] == stream->dot_id) {
> > > > > +        strcpy(stream->dd_dir.d_name, ".");
> > > > > +        return 0;
> > > > > +    }
> > > > > +
> > > > > +    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
> > > > > +        strcpy(stream->dd_dir.d_name, "..");
> > > > > +        return 0;
> > > > > +    }
> > > > > +
> > > > > +    fid.dwSize = sizeof(fid);
> > > > > +    fid.Type = FileIdType;
> > > > > +
> > > > > +    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
> > > > > +
> > > > > +    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
> > > > > +                             FILE_SHARE_READ | FILE_SHARE_WRITE
> > > > > +                             | FILE_SHARE_DELETE,
> > > > > +                             NULL,
> > > > > +                             FILE_FLAG_BACKUP_SEMANTICS
> > > > > +                             | FILE_FLAG_OPEN_REPARSE_POINT);
> > > >
> > > > What's the purpose of FILE_FLAG_OPEN_REPARSE_POINT here? As it's
> > > > apparently not obvious, please add a comment.
> > > >
> > >
> > > If do not use this flag, and if file id is a symbolic link, then
> > > Windows
> > will not symbolic link itself, but open the target file.
> > > This flag is similar as O_NOFOLLOW flag.
> >
> > OK, got it, thanks! But please add a comment in code that describes this.
> >
> > > > > +
> > > > > +    if (hDirEntry == INVALID_HANDLE_VALUE) {
> > > > > +        /*
> > > > > +         * Not open it successfully, it may be deleted.
> > > >
> > > > Wrong English. "Open failed, it may have been deleted in the
> meantime.".
> > > >
> > > > > +         * Try next id.
> > > > > +         */
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    entry_name = get_full_path_win32(hDirEntry, NULL);
> > > > > +
> > > > > +    CloseHandle(hDirEntry);
> > > > > +
> > > > > +    if (entry_name == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    attribute = GetFileAttributes(entry_name);
> > > > > +
> > > > > +    /* symlink is not allowed */
> > > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > > +        return -1;
> > > >
> > > > Wouldn't it make sense to call warn_report_once() here to let the
> > > > user know that he has some symlinks that are never delivered to guest?
> > >
> > > OK, Got it.
> > >
> > > >
> > > > > +    }
> > > > > +
> > > > > +    if (memcmp(entry_name, stream->dd_name,
> > > > > + stream->dir_name_len) !=
> > > > > + 0) {
> > > >
> > > > No, that's unsafe. You want to use something like strncmp() instead.
> > > >
> > > > > +        /*
> > > > > +         * The full entry file name should be a part of parent
> > > > > + directory
> > > > name,
> > > > > +         * except dot and dot_dot (is already handled).
> > > > > +         * If not, this entry should not be returned.
> > > > > +         */
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    entry_start = entry_name + stream->dir_name_len;
> > > >
> > > > s/entry_start/entry_basename/ ?
> > > >
> > > > > +
> > > > > +    /* skip slash */
> > > > > +    while (*entry_start == '\\') {
> > > > > +        entry_start++;
> > > > > +    }
> > > > > +
> > > > > +    if (strchr(entry_start, '\\') != NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (strlen(entry_start) == 0
> > > > > +        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +    strcpy(stream->dd_dir.d_name, entry_start);
> > > >
> > > > g_path_get_basename() ? :)
> > >
> > > For above three comments:
> > > This code is not good, should be fixed.
> > > The code want to filter the following cases:
> > > The parent directory path is not a part of entry's full path:
> > > Parent: C:\123\456, entry: C:\123, C:\ Entry contains more than one
> > > name components:
> > > Parent: C:\123\456, entry: C:\123\456\789\abc Entry is zero length
> > > or name buffer is too long
> > >
> > > I will refactor this part.
> >
> > In general: writing parsing code yourself is extremely error prone.
> > That's why it makes sense to use existing functions from glib, etc.
> >
> > > >
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * opendir_win32 - open a directory
> > > > > + *
> > > > > + * This function opens a directory and caches all directory entries.
> > > >
> > > > It just caches all file IDs, doesn't it?
> > > >
> > >
> > > Will fix it
> > >
> > > > > + */
> > > > > +DIR *opendir_win32(const char *full_file_name) {
> > > > > +    HANDLE hDir = INVALID_HANDLE_VALUE;
> > > > > +    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
> > > > > +    char *full_dir_entry = NULL;
> > > > > +    DWORD attribute;
> > > > > +    intptr_t dd_handle = -1;
> > > > > +    struct _finddata_t dd_data;
> > > > > +    uint64_t file_id;
> > > > > +    uint64_t *file_id_list = NULL;
> > > > > +    BY_HANDLE_FILE_INFORMATION FileInfo;
> > > >
> > > > FileInfo is the variable name, not a struct name, so no upper case
> > > > for it please.
> > >
> > > Will fix it.
> > > >
> > > > > +    struct dir_win32 *stream = NULL;
> > > > > +    int err = 0;
> > > > > +    int find_status;
> > > > > +    int sort_first_two_entry = 0;
> > > > > +    uint32_t list_count = 16;
> > > >
> > > > Magic number 16?
> > >
> > > Will change it to a macro.
> > > >
> > > > > +    uint32_t index = 0;
> > > > > +
> > > > > +    /* open directory to prevent it being removed */
> > > > > +
> > > > > +    hDir = CreateFile(full_file_name, GENERIC_READ,
> > > > > +                      FILE_SHARE_READ | FILE_SHARE_WRITE |
> > > > FILE_SHARE_DELETE,
> > > > > +                      NULL,
> > > > > +                      OPEN_EXISTING,
> > > > > +                      FILE_FLAG_BACKUP_SEMANTICS |
> > > > FILE_FLAG_OPEN_REPARSE_POINT,
> > > > > +                      NULL);
> > > > > +
> > > > > +    if (hDir == INVALID_HANDLE_VALUE) {
> > > > > +        err = win32_error_to_posix(GetLastError());
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    attribute = GetFileAttributes(full_file_name);
> > > > > +
> > > > > +    /* symlink is not allow */
> > > > > +    if (attribute == INVALID_FILE_ATTRIBUTES
> > > > > +        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
> > > > > +        err = EACCES;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /* check if it is a directory */
> > > > > +    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
> > > > > +        err = ENOTDIR;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
> > > > > +
> > > > > +    /*
> > > > > +     * findfirst() needs suffix format name like "\dir1\dir2\*",
> > > > > +     * allocate more buffer to store suffix.
> > > > > +     */
> > > > > +    stream = g_malloc0(sizeof(struct dir_win32) +
> > > > > + strlen(full_file_name) + 3);
> > > >
> > > > Not that I would care much, but +2 would be correct here, as you
> > > > declared the struct with one character already, so it is not a
> > > > classic (zero size) flex
> > > > array:
> > > >
> > > >   struct dir_win32 {
> > > >     ...
> > > >     char dd_name[1];
> > > >   };
> > > >
> > > Will fix it.
> > >
> > > > > +
> > > > > +    strcpy(stream->dd_name, full_file_name);
> > > > > +    strcat(stream->dd_name, "\\*");
> > > > > +
> > > > > +    stream->hDir = hDir;
> > > > > +    stream->dir_name_len = strlen(full_file_name);
> > > > > +
> > > > > +    dd_handle = _findfirst(stream->dd_name, &dd_data);
> > > > > +
> > > > > +    if (dd_handle == -1) {
> > > > > +        err = errno;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    /* read all entries to link list */
> > > >
> > > > "read all entries as a linked list"
> > > >
> > > > However there is no linked list here. It seems to be an array.
> > >
> > > Will fix it.
> > > >
> > > > > +    do {
> > > > > +        full_dir_entry = get_full_path_win32(hDir,
> > > > > + dd_data.name);
> > > > > +
> > > > > +        if (full_dir_entry == NULL) {
> > > > > +            err = ENOMEM;
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /*
> > > > > +         * Open every entry and get the file informations.
> > > > > +         *
> > > > > +         * Skip symbolic links during reading directory.
> > > > > +         */
> > > > > +        hDirEntry = CreateFile(full_dir_entry,
> > > > > +                               GENERIC_READ,
> > > > > +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> > > > > +                               | FILE_SHARE_DELETE,
> > > > > +                               NULL,
> > > > > +                               OPEN_EXISTING,
> > > > > +                               FILE_FLAG_BACKUP_SEMANTICS
> > > > > +                               | FILE_FLAG_OPEN_REPARSE_POINT,
> > > > > + NULL);
> > > > > +
> > > > > +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> > > > > +            if (GetFileInformationByHandle(hDirEntry,
> > > > > +                                           &FileInfo) == TRUE) {
> > > > > +                attribute = FileInfo.dwFileAttributes;
> > > > > +
> > > > > +                /* only save validate entries */
> > > > > +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> > > > > +                    if (index >= list_count) {
> > > > > +                        list_count = list_count + 16;
> > > >
> > > > Magic number 16 again.
> > > >
> > > > > +                        file_id_list = g_realloc(file_id_list,
> > > > > +                                                 sizeof(uint64_t)
> > > > > +                                                 * list_count);
> > > >
> > > > OK, so here we are finally at the point where you chose the
> > > > overall behaviour for this that we discussed before.
> > > >
> > > > So you are constantly appending 16 entry chunks to the end of the
> > > > array, periodically reallocate the entire array, and potentially
> > > > end up with one giant dense array with *all* file IDs of the directory.
> > > >
> > > > That's not really what I had in mind, as it still has the
> > > > potential to easily crash QEMU if there are large directories on host.
> > > > Theoretically a Windows directory might then consume up to 16 GB
> > > > of RAM for looking up only one single directory.
> > > >
> > > > So is this the implementation that you said was very slow, or did
> > > > you test a different one? Remember, my orgiginal idea (as starting
> > > > point for Windows) was to only cache *one* file ID (the last being
> > > > looked up). That's it. Not a list of file IDs.
> > >
> > > If only cache one file ID, that means for every read directory operation.
> > > we need to look up whole directory to find out the next ID larger
> > > than last
> > cached one.
> > >
> > > I provided some performance test in last patch:
> > > Run test for read directory with 100, 1000, 10000 entries #1, For
> > > file name cache solution, the time cost is: 2, 9, 44 (in ms).
> > > #2, For file id cache solution, the time cost: 3, 438, 4338 (in ms).
> > > This
> > is current solution.
> > > #3, for cache one id solution, I just tested it: 4, 4788, more than
> > > one minutes (in ms)
> > >
> > > I think it is not a good idea to cache one file id, it would be very
> > > bad performance
> >
> > Yes, the performce would be lousy, but at least we would have a basis
> > that just works^TM. Correct behaviour always comes before performance.
> > And from there you could add additional patches on top to address
> > performance improvements. Because the point is: your implementation is
> > also suboptimal, and more importantly: prone to crashes like we discussed
> before.
> >
> > Regarding performance: for instance you are re-allocating an entire
> > dense buffer on every 16 new entries. That will slow down things
> > extremely. Please use a container from glib, because these are
> > handling resize operations more smoothly for you out of the box, i.e.
> > typically by doubling the container capacity instead of re-allocating
> frequently with small chunks like you did.
> >
> > However I am still not convinced that allocating a huge dense buffer
> > with
> > *all* file IDs of a directory makes sense.
> >
> > On the long-term it would make sense to do it like other implementations:
> > store a snapshot of the directory temporarily on disk. That way it
> > would not matter how huge the directory is. But that's a complex
> > implementation, so not something that I would do in this series already.
> >
> > On the short/mid term I think we could simply make a mix of your
> > solution and the one-ID solution that I suggested: keeping a maximum
> > of e.g. 1k file IDs in RAM. And once guest seeks past that boundary,
> > loading the subsequent 1k entries, free-ing the previous 1k entries, and so
> on.
> >
> 
> Please note that the performance data is tested in native OS, but not in
> QEMU.
> It is even worse in QEMU.
> 
> I run Linux guest OS on Windows host, use "ls -l" command to list a directory
> with about 100 entries.
> "ls -l" command need about 0.5 second to display one directory entry.
> 
> Caching only one node (file id, or file name, or others) will make 9pfs not
> usable: listing 100 directory entries need 50 seconds in guest OS.

I have to point out that you missing about random accessing for a directory, this is the key of performance.
In QEMU 9p directory reading solution, it will try to read as many as possible entries (in function do_readdir_many).
When the butter is not enough, do_readdir_many will re-seek to the last read entry.
The key point is the "re-seek" directory.

Read directory is always read the next entry, so cache one id will be OK, and less performance impact.
But seek directory may seek to anywhere, seek directory need to cache all IDs.

Consider about this case:
There are 100 files in directory, name is from "file001" to "file100".

Currently, next read entry is "file050".
Now, user want to seek to directory offset 20 (should be "file020").
Because we only cached one id ("file050"), we do not know the file id for offset 20.
So we could only get the file id in offset 0 (need to search whole directory to get the minimal ID), and get the file id in offset 1, ... to offset 20.

So for the random accessing, seek to offset N in a directory with M-entries, we need to search whole directory for N times and reading totally M*N entries.
If there are 1000 files in a directory, and want seek to offset 1000 randomly, need to open file 1000*1000 times.
For the worst test case: read + seek + read for 1000 files, 9p on Windows host will need open files for 1000*(1 + 2 + 3 ... 1000) = 500500000 times. It may need several hours to finish it.

Another problem is: if only cache one ID, we can not detect which directory is deleted.
It is no difference with use MinGW native APIs, and we go back to the start point.
Cache one ID is useful for getting next entry, but not useful for telling us where is current offset.
Because after deleting some entries, guest OS may re-seek to the last offset. Storing only one ID is useless for re-seek to last offset.

Here is summarize of requirements:
1. Guest OS may seek directory randomly.
2. Some entries may be deleted during directory reading.

To match the requirements, a snapshot of directory may be the only solution.
So we should force on which information should be in snapshot (file id, or filename), and how to store it.
I do not think it is a big problem for large directory. Actually, if there are more than 1 million files in a directory, Windows File Explorer may not response.


> 
> > > >
> > > > > +                    }
> > > > > +                    file_id = (uint64_t)FileInfo.nFileIndexLow
> > > > > +                              +
> > > > > + (((uint64_t)FileInfo.nFileIndexHigh)
> > > > > + << 32);
> > > > > +
> > > > > +
> > > > > +                    file_id_list[index] = file_id;
> > > > > +
> > > > > +                    if (strcmp(dd_data.name, ".") == 0) {
> > > > > +                        stream->dot_id = file_id_list[index];
> > > > > +                        if (index != 0) {
> > > > > +                            sort_first_two_entry = 1;
> > > > > +                        }
> > > > > +                    } else if (strcmp(dd_data.name, "..") == 0) {
> > > > > +                        stream->dot_dot_id = file_id_list[index];
> > > > > +                        if (index != 1) {
> > > > > +                            sort_first_two_entry = 1;
> > > > > +                        }
> > > > > +                    }
> > > > > +                    index++;
> > > > > +                }
> > > > > +            }
> > > > > +            CloseHandle(hDirEntry);
> > > > > +        }
> > > > > +        g_free(full_dir_entry);
> > > > > +        find_status = _findnext(dd_handle, &dd_data);
> > > > > +    } while (find_status == 0);
> > > > > +
> > > > > +    if (errno == ENOENT) {
> > > > > +        /* No more matching files could be found, clean errno */
> > > > > +        errno = 0;
> > > > > +    } else {
> > > > > +        err = errno;
> > > > > +        goto out;
> > > > > +    }
> > > > > +
> > > > > +    stream->total_entries = index;
> > > > > +    stream->file_id_list = file_id_list;
> > > > > +
> > > > > +    if (sort_first_two_entry == 0) {
> > > > > +        /*
> > > > > +         * If the first two entry is "." and "..", then do not
> > > > > + sort
> > them.
> > > > > +         *
> > > > > +         * If the guest OS always considers first two entries
> > > > > + are "." and
> > > > "..",
> > > > > +         * sort the two entries may cause confused display in
> > > > > + guest
> > OS.
> > > > > +         */
> > > > > +        qsort(&file_id_list[2], index - 2, sizeof(file_id),
> > > > file_id_compare);
> > > > > +    } else {
> > > > > +        qsort(&file_id_list[0], index, sizeof(file_id),
> > file_id_compare);
> > > > > +    }
> > > >
> > > > Were there cases where you did not get "." and ".." ?
> > >
> > > NTFS always provides "." and "..".
> > > I could add more checks here to fix this risk
> >
> > That's what I assumed. So you can probably just drop this code for
> > simplicity.
> >
> > >
> > > >
> > > > > +
> > > > > +out:
> > > > > +    if (err != 0) {
> > > > > +        errno = err;
> > > > > +        if (stream != NULL) {
> > > > > +            if (file_id_list != NULL) {
> > > > > +                g_free(file_id_list);
> > > > > +            }
> > > > > +            CloseHandle(hDir);
> > > > > +            g_free(stream);
> > > > > +            stream = NULL;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    if (dd_handle != -1) {
> > > > > +        _findclose(dd_handle);
> > > > > +    }
> > > > > +
> > > > > +    return (DIR *)stream;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * closedir_win32 - close a directory
> > > > > + *
> > > > > + * This function closes directory and free all cached resources.
> > > > > + */
> > > > > +int closedir_win32(DIR *pDir)
> > > > > +{
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    /* free all resources */
> > > > > +    CloseHandle(stream->hDir);
> > > > > +
> > > > > +    g_free(stream->file_id_list);
> > > > > +
> > > > > +    g_free(stream);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * readdir_win32 - read a directory
> > > > > + *
> > > > > + * This function reads a directory entry from cached entry list.
> > > > > + */
> > > > > +struct dirent *readdir_win32(DIR *pDir) {
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return NULL;
> > > > > +    }
> > > > > +
> > > > > +retry:
> > > > > +
> > > > > +    if (stream->offset >= stream->total_entries) {
> > > > > +        /* reach to the end, return NULL without set errno */
> > > > > +        return NULL;
> > > > > +    }
> > > > > +
> > > > > +    if (get_next_entry(stream) != 0) {
> > > > > +        stream->offset++;
> > > > > +        goto retry;
> > > > > +    }
> > > > > +
> > > > > +    /* Windows does not provide inode number */
> > > > > +    stream->dd_dir.d_ino = 0;
> > > > > +    stream->dd_dir.d_reclen = 0;
> > > > > +    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
> > > > > +
> > > > > +    stream->offset++;
> > > > > +
> > > > > +    return &stream->dd_dir;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * rewinddir_win32 - reset directory stream
> > > > > + *
> > > > > + * This function resets the position of the directory stream to
> > > > > +the
> > > > > + * beginning of the directory.
> > > > > + */
> > > > > +void rewinddir_win32(DIR *pDir) {
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    stream->offset = 0;
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * seekdir_win32 - set the position of the next readdir() call
> > > > > +in the directory
> > > > > + *
> > > > > + * This function sets the position of the next readdir() call
> > > > > +in the directory
> > > > > + * from which the next readdir() call will start.
> > > > > + */
> > > > > +void seekdir_win32(DIR *pDir, long pos) {
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos < -1) {
> > > > > +        errno = EINVAL;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos == -1 || pos >= (long)stream->total_entries) {
> > > > > +        /* seek to the end */
> > > > > +        stream->offset = stream->total_entries;
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    if (pos - (long)stream->offset == 0) {
> > > > > +        /* no need to seek */
> > > > > +        return;
> > > > > +    }
> > > > > +
> > > > > +    stream->offset = pos;
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * telldir_win32 - return current location in directory
> > > > > + *
> > > > > + * This function returns current location in directory.
> > > > > + */
> > > > > +long telldir_win32(DIR *pDir)
> > > > > +{
> > > > > +    struct dir_win32 *stream = (struct dir_win32 *)pDir;
> > > > > +
> > > > > +    if (stream == NULL) {
> > > > > +        errno = EBADF;
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (stream->offset > stream->total_entries) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    return (long)stream->offset; }
> > > > >
> > > >
> > >
> > >
> > >
> >
> >
Christian Schoenebeck March 17, 2023, 12:16 p.m. UTC | #6
On Friday, March 17, 2023 5:36:37 AM CET Shi, Guohuai wrote:
[...]
> > > > > > +    do {
> > > > > > +        full_dir_entry = get_full_path_win32(hDir,
> > > > > > + dd_data.name);
> > > > > > +
> > > > > > +        if (full_dir_entry == NULL) {
> > > > > > +            err = ENOMEM;
> > > > > > +            break;
> > > > > > +        }
> > > > > > +
> > > > > > +        /*
> > > > > > +         * Open every entry and get the file informations.
> > > > > > +         *
> > > > > > +         * Skip symbolic links during reading directory.
> > > > > > +         */
> > > > > > +        hDirEntry = CreateFile(full_dir_entry,
> > > > > > +                               GENERIC_READ,
> > > > > > +                               FILE_SHARE_READ | FILE_SHARE_WRITE
> > > > > > +                               | FILE_SHARE_DELETE,
> > > > > > +                               NULL,
> > > > > > +                               OPEN_EXISTING,
> > > > > > +                               FILE_FLAG_BACKUP_SEMANTICS
> > > > > > +                               | FILE_FLAG_OPEN_REPARSE_POINT,
> > > > > > + NULL);
> > > > > > +
> > > > > > +        if (hDirEntry != INVALID_HANDLE_VALUE) {
> > > > > > +            if (GetFileInformationByHandle(hDirEntry,
> > > > > > +                                           &FileInfo) == TRUE) {
> > > > > > +                attribute = FileInfo.dwFileAttributes;
> > > > > > +
> > > > > > +                /* only save validate entries */
> > > > > > +                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
> > > > > > +                    if (index >= list_count) {
> > > > > > +                        list_count = list_count + 16;
> > > > >
> > > > > Magic number 16 again.
> > > > >
> > > > > > +                        file_id_list = g_realloc(file_id_list,
> > > > > > +                                                 sizeof(uint64_t)
> > > > > > +                                                 * list_count);
> > > > >
> > > > > OK, so here we are finally at the point where you chose the
> > > > > overall behaviour for this that we discussed before.
> > > > >
> > > > > So you are constantly appending 16 entry chunks to the end of the
> > > > > array, periodically reallocate the entire array, and potentially
> > > > > end up with one giant dense array with *all* file IDs of the directory.
> > > > >
> > > > > That's not really what I had in mind, as it still has the
> > > > > potential to easily crash QEMU if there are large directories on host.
> > > > > Theoretically a Windows directory might then consume up to 16 GB
> > > > > of RAM for looking up only one single directory.
> > > > >
> > > > > So is this the implementation that you said was very slow, or did
> > > > > you test a different one? Remember, my orgiginal idea (as starting
> > > > > point for Windows) was to only cache *one* file ID (the last being
> > > > > looked up). That's it. Not a list of file IDs.
> > > >
> > > > If only cache one file ID, that means for every read directory operation.
> > > > we need to look up whole directory to find out the next ID larger
> > > > than last
> > > cached one.
> > > >
> > > > I provided some performance test in last patch:
> > > > Run test for read directory with 100, 1000, 10000 entries #1, For
> > > > file name cache solution, the time cost is: 2, 9, 44 (in ms).
> > > > #2, For file id cache solution, the time cost: 3, 438, 4338 (in ms).
> > > > This
> > > is current solution.
> > > > #3, for cache one id solution, I just tested it: 4, 4788, more than
> > > > one minutes (in ms)
> > > >
> > > > I think it is not a good idea to cache one file id, it would be very
> > > > bad performance
> > >
> > > Yes, the performce would be lousy, but at least we would have a basis
> > > that just works^TM. Correct behaviour always comes before performance.
> > > And from there you could add additional patches on top to address
> > > performance improvements. Because the point is: your implementation is
> > > also suboptimal, and more importantly: prone to crashes like we discussed
> > before.
> > >
> > > Regarding performance: for instance you are re-allocating an entire
> > > dense buffer on every 16 new entries. That will slow down things
> > > extremely. Please use a container from glib, because these are
> > > handling resize operations more smoothly for you out of the box, i.e.
> > > typically by doubling the container capacity instead of re-allocating
> > frequently with small chunks like you did.
> > >
> > > However I am still not convinced that allocating a huge dense buffer
> > > with
> > > *all* file IDs of a directory makes sense.
> > >
> > > On the long-term it would make sense to do it like other implementations:
> > > store a snapshot of the directory temporarily on disk. That way it
> > > would not matter how huge the directory is. But that's a complex
> > > implementation, so not something that I would do in this series already.
> > >
> > > On the short/mid term I think we could simply make a mix of your
> > > solution and the one-ID solution that I suggested: keeping a maximum
> > > of e.g. 1k file IDs in RAM. And once guest seeks past that boundary,
> > > loading the subsequent 1k entries, free-ing the previous 1k entries, and so
> > on.
> > >
> > 
> > Please note that the performance data is tested in native OS, but not in
> > QEMU.
> > It is even worse in QEMU.
> > 
> > I run Linux guest OS on Windows host, use "ls -l" command to list a directory
> > with about 100 entries.
> > "ls -l" command need about 0.5 second to display one directory entry.
> > 
> > Caching only one node (file id, or file name, or others) will make 9pfs not
> > usable: listing 100 directory entries need 50 seconds in guest OS.

I think we have a misapprehension here, to make this more clear: I had no
intention to roll that one-entry-cache solution out to customers. The idea
rather was this to be the base patch, followed by whatever optimization
patch(es) on top of that. So this one-cache solution would basically just
end up being burried in git history, not being used by a regular user at all.

Reasons for this preliminary DOA patch:

1. An optimized solution with n file IDs (that would then in fact being rolled
out as official QEMU release to users) is a logical extension of a simple
implementation with only 1 file ID, and it always makes sense to split patches
at logical points.

2. If some problem arises, we can always tell people to rollback to this
simple implementation and check if the problem exists there as well (no matter
how long it takes to run the test).

3. If really necessary, we could even make this 1 file ID solution a runtime
option in a distant future, which would be overkill at this point though.

> I have to point out that you missing about random accessing for a directory, this is the key of performance.
> In QEMU 9p directory reading solution, it will try to read as many as possible entries (in function do_readdir_many).
> When the butter is not enough, do_readdir_many will re-seek to the last read entry.
> The key point is the "re-seek" directory.
> 
> Read directory is always read the next entry, so cache one id will be OK, and less performance impact.
> But seek directory may seek to anywhere, seek directory need to cache all IDs.

No, random access is not permitted anywhere! We have two aspects on this:

1. On guest user space level there is seekdir() and telldir(). But it's not
like that user could seek randomly like telldir() + n. In fact, many file
systems don't support this kind of operation, as often some kind of internal
file system dependent value is passed for performance reasons as "offset",
e.g. something like:

Filename  Offset
001.dat   240
002.dat   80
003.dat   586
...

Instead, POSIX defines that the argument passed to seekdir() *must* have been
obtained by a telldir() call before, exactly for the reason described above.

2. On 9p2000.L protocol level (the default 9p protocol version used by Linux
clients): here we have `Treaddir` only. Which is not a random access request,
instead it is designed to just split large directories into several, smaller
requests and passing the offset of the *previous* `Treaddir` response as
argument to the next `Treaddir` request:

https://github.com/chaos/diod/blob/master/protocol.md#readdir---read-a-directory

3. On 9p2000.u protocol level (a 9p protocol version that we already
discourage to use and are probably going to deprecate) there is no such thing
as `Treaddir`, instead `Tread` on a directory FID is used, however also in
this case the protocol specs are clear that random access is not allowed,
quote:

"For directories, read returns an integral number of direc- tory entries
exactly as in stat (see stat(5)), one for each member of the directory. The
read request message must have offset equal to zero or the value of offset in
the previous read on the directory, plus the number of bytes returned in the
previous read. In other words, seeking other than to the beginning is illegal
in a directory (see seek(2))."

http://ericvh.github.io/9p-rfc/rfc9p2000.html#anchor30

> Consider about this case:
> There are 100 files in directory, name is from "file001" to "file100".
> 
> Currently, next read entry is "file050".
> Now, user want to seek to directory offset 20 (should be "file020").
> Because we only cached one id ("file050"), we do not know the file id for offset 20.
> So we could only get the file id in offset 0 (need to search whole directory to get the minimal ID), and get the file id in offset 1, ... to offset 20.
> 
> So for the random accessing, seek to offset N in a directory with M-entries, we need to search whole directory for N times and reading totally M*N entries.

Whenever you are capturing other file IDs - no matter if only 1 different or
multiple different file IDs - you would need to *always* scan the entire
directory. Otherwise you would always risk incorrect behaviour.

That's why I suggested as subsequent patch on top of the 1-file-id patch, a
subsequent 2nd patch as optimization that would cache max. e.g. 1000 entries
directory entries in RAM, to avoid scanning the entire directory too often.

> If there are 1000 files in a directory, and want seek to offset 1000 randomly, need to open file 1000*1000 times.
> For the worst test case: read + seek + read for 1000 files, 9p on Windows host will need open files for 1000*(1 + 2 + 3 ... 1000) = 500500000 times. It may need several hours to finish it.
> 
> Another problem is: if only cache one ID, we can not detect which directory is deleted.

We don't care detecting whether or not entries were deleted.

> It is no difference with use MinGW native APIs, and we go back to the start point.

Yes it is! The essential difference is: with the MinGW API, when some entry is
deleted in between, then offsets are shifted such that guest might not receive
directory entries that *still* exist!

With the ordered file ID solution discussed here (no matter how many are
cached), as we would always return the directory entries sorted by file IDs to
guest, we can in contrast ensure that really all entries that *still* exist
are always returned to guest. And that's what we care about.

Another thing that I noticed when looking at your patch: you are first
obtaining only the file IDs of the individual directory entries and only
caching the file IDs. Which I understand, as you were really caching the 
entire directory in RAM.

It's absolutely OK to cache other directory entry info as well. And if we are
limiting caching to e.g. max. 1k entries or so, then we don't have a problem
with cached size either.

> Cache one ID is useful for getting next entry, but not useful for telling us where is current offset.
> Because after deleting some entries, guest OS may re-seek to the last offset. Storing only one ID is useless for re-seek to last offset.
> 
> Here is summarize of requirements:
> 1. Guest OS may seek directory randomly.
> 2. Some entries may be deleted during directory reading.
> 
> To match the requirements, a snapshot of directory may be the only solution.
> So we should force on which information should be in snapshot (file id, or filename), and how to store it.
> I do not think it is a big problem for large directory. Actually, if there are more than 1 million files in a directory, Windows File Explorer may not response.

:) That's the solution that I suggested as long-term solution several times
before, as I also pointed out that other file servers are using this solution
as well. And yes, that is "probably" the "best" solution. But I think you are
underestimating the complexity of this solution.

Of course you can easily capture all directory entries in one rush, serialize
them as raw struct to a temporary file, and deserialize those structs when
being accessed. That's not the thing. But there is a lot more on this: e.g.
where would you store these temporary files? How long would you store them
there and what would be the precise mechanism to drop them? Whatabout cleanup
mechanisms after an unclean QEMU shutdown? And would it really be faster than
say caching 1000 entries in RAM? Do we share directory snapshots, and if yes how?
diff mbox series

Patch

diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h
index 0f159fb4ce..c1c251fbd1 100644
--- a/hw/9pfs/9p-util.h
+++ b/hw/9pfs/9p-util.h
@@ -141,6 +141,12 @@  int unlinkat_win32(int dirfd, const char *pathname, int flags);
 int statfs_win32(const char *root_path, struct statfs *stbuf);
 int openat_dir(int dirfd, const char *name);
 int openat_file(int dirfd, const char *name, int flags, mode_t mode);
+DIR *opendir_win32(const char *full_file_name);
+int closedir_win32(DIR *pDir);
+struct dirent *readdir_win32(DIR *pDir);
+void rewinddir_win32(DIR *pDir);
+void seekdir_win32(DIR *pDir, long pos);
+long telldir_win32(DIR *pDir);
 #endif
 
 static inline void close_preserve_errno(int fd)
diff --git a/hw/9pfs/9p-util-win32.c b/hw/9pfs/9p-util-win32.c
index a99d579a06..e9408f3c45 100644
--- a/hw/9pfs/9p-util-win32.c
+++ b/hw/9pfs/9p-util-win32.c
@@ -37,6 +37,16 @@ 
  *    Windows does not support opendir, the directory fd is created by
  *    CreateFile and convert to fd by _open_osfhandle(). Keep the fd open will
  *    lock and protect the directory (can not be modified or replaced)
+ *
+ * 5. Neither Windows native APIs, nor MinGW provide a POSIX compatible API for
+ *    acquiring directory entries in a safe way. Calling those APIs (native
+ *    _findfirst() and _findnext() or MinGW's readdir(), seekdir() and
+ *    telldir()) directly can lead to an inconsistent state if directory is
+ *    modified in between, e.g. the same directory appearing more than once
+ *    in output, or directories not appearing at all in output even though they
+ *    were neither newly created nor deleted. POSIX does not define what happens
+ *    with deleted or newly created directories in between, but it guarantees a
+ *    consistent state.
  */
 
 #include "qemu/osdep.h"
@@ -51,6 +61,25 @@ 
 
 #define V9FS_MAGIC  0x53465039  /* string "9PFS" */
 
+/*
+ * MinGW and Windows does not provide a safe way to seek directory while other
+ * thread is modifying the same directory.
+ *
+ * This structure is used to store sorted file id and ensure directory seek
+ * consistency.
+ */
+struct dir_win32 {
+    struct dirent dd_dir;
+    uint32_t offset;
+    uint32_t total_entries;
+    HANDLE hDir;
+    uint32_t dir_name_len;
+    uint64_t dot_id;
+    uint64_t dot_dot_id;
+    uint64_t *file_id_list;
+    char dd_name[1];
+};
+
 /*
  * win32_error_to_posix - convert Win32 error to POSIX error number
  *
@@ -977,3 +1006,417 @@  int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev)
     errno = ENOTSUP;
     return -1;
 }
+
+static int file_id_compare(const void *id_ptr1, const void *id_ptr2)
+{
+    uint64_t id[2];
+
+    id[0] = *(uint64_t *)id_ptr1;
+    id[1] = *(uint64_t *)id_ptr2;
+
+    if (id[0] > id[1]) {
+        return 1;
+    } else if (id[0] < id[1]) {
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+static int get_next_entry(struct dir_win32 *stream)
+{
+    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
+    char *entry_name;
+    char *entry_start;
+    FILE_ID_DESCRIPTOR fid;
+    DWORD attribute;
+
+    if (stream->file_id_list[stream->offset] == stream->dot_id) {
+        strcpy(stream->dd_dir.d_name, ".");
+        return 0;
+    }
+
+    if (stream->file_id_list[stream->offset] == stream->dot_dot_id) {
+        strcpy(stream->dd_dir.d_name, "..");
+        return 0;
+    }
+
+    fid.dwSize = sizeof(fid);
+    fid.Type = FileIdType;
+
+    fid.FileId.QuadPart = stream->file_id_list[stream->offset];
+
+    hDirEntry = OpenFileById(stream->hDir, &fid, GENERIC_READ,
+                             FILE_SHARE_READ | FILE_SHARE_WRITE
+                             | FILE_SHARE_DELETE,
+                             NULL,
+                             FILE_FLAG_BACKUP_SEMANTICS
+                             | FILE_FLAG_OPEN_REPARSE_POINT);
+
+    if (hDirEntry == INVALID_HANDLE_VALUE) {
+        /*
+         * Not open it successfully, it may be deleted.
+         * Try next id.
+         */
+        return -1;
+    }
+
+    entry_name = get_full_path_win32(hDirEntry, NULL);
+
+    CloseHandle(hDirEntry);
+
+    if (entry_name == NULL) {
+        return -1;
+    }
+
+    attribute = GetFileAttributes(entry_name);
+
+    /* symlink is not allowed */
+    if (attribute == INVALID_FILE_ATTRIBUTES
+        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
+        return -1;
+    }
+
+    if (memcmp(entry_name, stream->dd_name, stream->dir_name_len) != 0) {
+        /*
+         * The full entry file name should be a part of parent directory name,
+         * except dot and dot_dot (is already handled).
+         * If not, this entry should not be returned.
+         */
+        return -1;
+    }
+
+    entry_start = entry_name + stream->dir_name_len;
+
+    /* skip slash */
+    while (*entry_start == '\\') {
+        entry_start++;
+    }
+
+    if (strchr(entry_start, '\\') != NULL) {
+        return -1;
+    }
+
+    if (strlen(entry_start) == 0
+        || strlen(entry_start) + 1 > sizeof(stream->dd_dir.d_name)) {
+        return -1;
+    }
+    strcpy(stream->dd_dir.d_name, entry_start);
+
+    return 0;
+}
+
+/*
+ * opendir_win32 - open a directory
+ *
+ * This function opens a directory and caches all directory entries.
+ */
+DIR *opendir_win32(const char *full_file_name)
+{
+    HANDLE hDir = INVALID_HANDLE_VALUE;
+    HANDLE hDirEntry = INVALID_HANDLE_VALUE;
+    char *full_dir_entry = NULL;
+    DWORD attribute;
+    intptr_t dd_handle = -1;
+    struct _finddata_t dd_data;
+    uint64_t file_id;
+    uint64_t *file_id_list = NULL;
+    BY_HANDLE_FILE_INFORMATION FileInfo;
+    struct dir_win32 *stream = NULL;
+    int err = 0;
+    int find_status;
+    int sort_first_two_entry = 0;
+    uint32_t list_count = 16;
+    uint32_t index = 0;
+
+    /* open directory to prevent it being removed */
+
+    hDir = CreateFile(full_file_name, GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      NULL,
+                      OPEN_EXISTING,
+                      FILE_FLAG_BACKUP_SEMANTICS | FILE_FLAG_OPEN_REPARSE_POINT,
+                      NULL);
+
+    if (hDir == INVALID_HANDLE_VALUE) {
+        err = win32_error_to_posix(GetLastError());
+        goto out;
+    }
+
+    attribute = GetFileAttributes(full_file_name);
+
+    /* symlink is not allow */
+    if (attribute == INVALID_FILE_ATTRIBUTES
+        || (attribute & FILE_ATTRIBUTE_REPARSE_POINT) != 0) {
+        err = EACCES;
+        goto out;
+    }
+
+    /* check if it is a directory */
+    if ((attribute & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+        err = ENOTDIR;
+        goto out;
+    }
+
+    file_id_list = g_malloc0(sizeof(uint64_t) * list_count);
+
+    /*
+     * findfirst() needs suffix format name like "\dir1\dir2\*",
+     * allocate more buffer to store suffix.
+     */
+    stream = g_malloc0(sizeof(struct dir_win32) + strlen(full_file_name) + 3);
+
+    strcpy(stream->dd_name, full_file_name);
+    strcat(stream->dd_name, "\\*");
+
+    stream->hDir = hDir;
+    stream->dir_name_len = strlen(full_file_name);
+
+    dd_handle = _findfirst(stream->dd_name, &dd_data);
+
+    if (dd_handle == -1) {
+        err = errno;
+        goto out;
+    }
+
+    /* read all entries to link list */
+    do {
+        full_dir_entry = get_full_path_win32(hDir, dd_data.name);
+
+        if (full_dir_entry == NULL) {
+            err = ENOMEM;
+            break;
+        }
+
+        /*
+         * Open every entry and get the file informations.
+         *
+         * Skip symbolic links during reading directory.
+         */
+        hDirEntry = CreateFile(full_dir_entry,
+                               GENERIC_READ,
+                               FILE_SHARE_READ | FILE_SHARE_WRITE
+                               | FILE_SHARE_DELETE,
+                               NULL,
+                               OPEN_EXISTING,
+                               FILE_FLAG_BACKUP_SEMANTICS
+                               | FILE_FLAG_OPEN_REPARSE_POINT, NULL);
+
+        if (hDirEntry != INVALID_HANDLE_VALUE) {
+            if (GetFileInformationByHandle(hDirEntry,
+                                           &FileInfo) == TRUE) {
+                attribute = FileInfo.dwFileAttributes;
+
+                /* only save validate entries */
+                if ((attribute & FILE_ATTRIBUTE_REPARSE_POINT) == 0) {
+                    if (index >= list_count) {
+                        list_count = list_count + 16;
+                        file_id_list = g_realloc(file_id_list,
+                                                 sizeof(uint64_t)
+                                                 * list_count);
+                    }
+                    file_id = (uint64_t)FileInfo.nFileIndexLow
+                              + (((uint64_t)FileInfo.nFileIndexHigh) << 32);
+
+
+                    file_id_list[index] = file_id;
+
+                    if (strcmp(dd_data.name, ".") == 0) {
+                        stream->dot_id = file_id_list[index];
+                        if (index != 0) {
+                            sort_first_two_entry = 1;
+                        }
+                    } else if (strcmp(dd_data.name, "..") == 0) {
+                        stream->dot_dot_id = file_id_list[index];
+                        if (index != 1) {
+                            sort_first_two_entry = 1;
+                        }
+                    }
+                    index++;
+                }
+            }
+            CloseHandle(hDirEntry);
+        }
+        g_free(full_dir_entry);
+        find_status = _findnext(dd_handle, &dd_data);
+    } while (find_status == 0);
+
+    if (errno == ENOENT) {
+        /* No more matching files could be found, clean errno */
+        errno = 0;
+    } else {
+        err = errno;
+        goto out;
+    }
+
+    stream->total_entries = index;
+    stream->file_id_list = file_id_list;
+
+    if (sort_first_two_entry == 0) {
+        /*
+         * If the first two entry is "." and "..", then do not sort them.
+         *
+         * If the guest OS always considers first two entries are "." and "..",
+         * sort the two entries may cause confused display in guest OS.
+         */
+        qsort(&file_id_list[2], index - 2, sizeof(file_id), file_id_compare);
+    } else {
+        qsort(&file_id_list[0], index, sizeof(file_id), file_id_compare);
+    }
+
+out:
+    if (err != 0) {
+        errno = err;
+        if (stream != NULL) {
+            if (file_id_list != NULL) {
+                g_free(file_id_list);
+            }
+            CloseHandle(hDir);
+            g_free(stream);
+            stream = NULL;
+        }
+    }
+
+    if (dd_handle != -1) {
+        _findclose(dd_handle);
+    }
+
+    return (DIR *)stream;
+}
+
+/*
+ * closedir_win32 - close a directory
+ *
+ * This function closes directory and free all cached resources.
+ */
+int closedir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    /* free all resources */
+    CloseHandle(stream->hDir);
+
+    g_free(stream->file_id_list);
+
+    g_free(stream);
+
+    return 0;
+}
+
+/*
+ * readdir_win32 - read a directory
+ *
+ * This function reads a directory entry from cached entry list.
+ */
+struct dirent *readdir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return NULL;
+    }
+
+retry:
+
+    if (stream->offset >= stream->total_entries) {
+        /* reach to the end, return NULL without set errno */
+        return NULL;
+    }
+
+    if (get_next_entry(stream) != 0) {
+        stream->offset++;
+        goto retry;
+    }
+
+    /* Windows does not provide inode number */
+    stream->dd_dir.d_ino = 0;
+    stream->dd_dir.d_reclen = 0;
+    stream->dd_dir.d_namlen = strlen(stream->dd_dir.d_name);
+
+    stream->offset++;
+
+    return &stream->dd_dir;
+}
+
+/*
+ * rewinddir_win32 - reset directory stream
+ *
+ * This function resets the position of the directory stream to the
+ * beginning of the directory.
+ */
+void rewinddir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return;
+    }
+
+    stream->offset = 0;
+
+    return;
+}
+
+/*
+ * seekdir_win32 - set the position of the next readdir() call in the directory
+ *
+ * This function sets the position of the next readdir() call in the directory
+ * from which the next readdir() call will start.
+ */
+void seekdir_win32(DIR *pDir, long pos)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return;
+    }
+
+    if (pos < -1) {
+        errno = EINVAL;
+        return;
+    }
+
+    if (pos == -1 || pos >= (long)stream->total_entries) {
+        /* seek to the end */
+        stream->offset = stream->total_entries;
+        return;
+    }
+
+    if (pos - (long)stream->offset == 0) {
+        /* no need to seek */
+        return;
+    }
+
+    stream->offset = pos;
+
+    return;
+}
+
+/*
+ * telldir_win32 - return current location in directory
+ *
+ * This function returns current location in directory.
+ */
+long telldir_win32(DIR *pDir)
+{
+    struct dir_win32 *stream = (struct dir_win32 *)pDir;
+
+    if (stream == NULL) {
+        errno = EBADF;
+        return -1;
+    }
+
+    if (stream->offset > stream->total_entries) {
+        return -1;
+    }
+
+    return (long)stream->offset;
+}