diff mbox series

fcntl: Add 32bit filesystem mode

Message ID 20200331133536.3328-1-linus.walleij@linaro.org (mailing list archive)
State New, archived
Headers show
Series fcntl: Add 32bit filesystem mode | expand

Commit Message

Linus Walleij March 31, 2020, 1:35 p.m. UTC
It was brought to my attention that this bug from 2018 was
still unresolved: 32 bit emulators like QEMU were given
64 bit hashes when running 32 bit emulation on 64 bit systems.

This adds a fcntl() operation to set the underlying filesystem
into 32bit mode even if the file hanle was opened using 64bit
mode without the compat syscalls.

Programs that need the 32 bit file system behavior need to
issue a fcntl() system call such as in this example:

  #define F_SET_FILE_32BIT_FS (1024 + 15)

  int main(int argc, char** argv) {
    DIR* dir;
    int err;
    int fd;

    dir = opendir("/boot");
    fd = dirfd(dir);
    err = fcntl(fd, F_SET_FILE_32BIT_FS);
    if (err) {
      printf("fcntl() failed! err=%d\n", err);
      return 1;
    }
    printf("dir=%p\n", dir);
    printf("readdir(dir)=%p\n", readdir(dir));
    printf("errno=%d: %s\n", errno, strerror(errno));
    return 0;
  }

This can be pretty hard to test since C libraries and linux
userspace security extensions aggressively filter the parameters
that are passed down and allowed to commit into actual system
calls.

Cc: Florian Weimer <fw@deneb.enyo.de>
Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Suggested-by: Theodore Ts'o <tytso@mit.edu>
Link: https://bugs.launchpad.net/qemu/+bug/1805913
Link: https://lore.kernel.org/lkml/87bm56vqg4.fsf@mid.deneb.enyo.de/
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205957
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 fs/fcntl.c                       | 4 ++++
 include/uapi/linux/fcntl.h       | 9 +++++++++
 tools/include/uapi/linux/fcntl.h | 9 +++++++++
 tools/perf/trace/beauty/fcntl.c  | 3 ++-
 4 files changed, 24 insertions(+), 1 deletion(-)

Comments

Peter Maydell April 20, 2020, 11:19 a.m. UTC | #1
On Tue, 31 Mar 2020 at 14:37, Linus Walleij <linus.walleij@linaro.org> wrote:
>
> It was brought to my attention that this bug from 2018 was
> still unresolved: 32 bit emulators like QEMU were given
> 64 bit hashes when running 32 bit emulation on 64 bit systems.
>
> This adds a fcntl() operation to set the underlying filesystem
> into 32bit mode even if the file hanle was opened using 64bit
> mode without the compat syscalls.
>
> Programs that need the 32 bit file system behavior need to
> issue a fcntl() system call such as in this example:
>
>   #define F_SET_FILE_32BIT_FS (1024 + 15)
>
>   int main(int argc, char** argv) {
>     DIR* dir;
>     int err;
>     int fd;
>
>     dir = opendir("/boot");
>     fd = dirfd(dir);
>     err = fcntl(fd, F_SET_FILE_32BIT_FS);
>     if (err) {
>       printf("fcntl() failed! err=%d\n", err);
>       return 1;
>     }
>     printf("dir=%p\n", dir);
>     printf("readdir(dir)=%p\n", readdir(dir));
>     printf("errno=%d: %s\n", errno, strerror(errno));
>     return 0;
>   }

I gave this a try with a modified QEMU, but it doesn't seem
to fix the problem. Here's the relevant chunk of the strace
output from stracing a QEMU that's running a 32-bit guest
binary that issues a getdents64 and fails (it's the 'readdir-bug'
test case from the launchpad bug):

openat(AT_FDCWD, ".", O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_DIRECTORY) = 3
fcntl(3, 0x40f /* F_??? */, 0x3)        = 0
fstat(3, {st_dev=makedev(0, 16), st_ino=4637237, st_mode=S_IFDIR|0755,
st_nlink=12, st_uid=1000, st_gid=1000, st_blksize=8192, st_blocks=8,
st_size=4096, st_atime=1587380917 /*
2020-04-20T11:08:37.756174607+0000 */, st_atime_nsec=756174607,
st_mtime=1587380910 /* 2020-04-20T11:08:30.356230179+0000 */,
st_mtime_nsec=356230179, st_ctime=1587380910 /*
2020-04-20T11:08:30.356230179+0000 */, st_ctime_nsec=356230179}) = 0
fstat(1, {st_dev=makedev(0, 2), st_ino=9017, st_mode=S_IFCHR|0600,
st_nlink=1, st_uid=0, st_gid=0, st_blksize=4096, st_blocks=0,
st_rdev=makedev(5, 1), st_atime=1587381196 /* 2020-04-20T11:13:16+0000
*/, st_atime_nsec=0, st_mtime=1587381196 /* 2020-04-20T11:13:16+0000
*/, st_mtime_nsec=0, st_ctime=1587381042 /*
2020-04-20T11:10:42.484981152+0000 */, st_ctime_nsec=484981152}) = 0
ioctl(1, TCGETS, {c_iflags=0x2502, c_oflags=0x5, c_cflags=0xcbd,
c_lflags=0x8a3b, c_line=0,
c_cc="\x03\x1c\x7f\x15\x04\x00\x01\x00\x11\x13\x1a\x00\x12\x0f\x17\x16\x00\x00\x00"})
= 0
write(1, "dir=0x76128\n", 12)           = 12
getdents64(3, [{d_ino=1, d_off=273341893525646730, d_reclen=24,
d_type=DT_DIR, d_name=".."}, {d_ino=4637239, d_off=849308795555391993,
d_reclen=24, d_type=DT_DIR, d_name="etc"}, {d_ino=4587984,
d_off=1620709961571101518, d_reclen=24, d_type=DT_LNK, d_name="usr"},
{d_ino=4637238, d_off=2787937917159437645, d_reclen=24, d_type=DT_DIR,
d_name="dev"}, {d_ino=4637244, d_off=3015508490233103491, d_reclen=24,
d_type=DT_DIR, d_name="sys"}, {d_ino=4587608,
d_off=3551089360661460833, d_reclen=24, d_type=DT_LNK, d_name="lib"},
{d_ino=4637246, d_off=3857320197951442970, d_reclen=24, d_type=DT_DIR,
d_name="var"}, {d_ino=4637242, d_off=4103122318823701457, d_reclen=24,
d_type=DT_DIR, d_name="proc"}, {d_ino=4587541,
d_off=4252201186220906002, d_reclen=24, d_type=DT_LNK, d_name="bin"},
{d_ino=4637245, d_off=4386533378951587638, d_reclen=24, d_type=DT_DIR,
d_name="tmp"}, {d_ino=4637241, d_off=4883206313583644962, d_reclen=24,
d_type=DT_DIR, d_name="host"}, {d_ino=4637237,
d_off=4941119754928488586, d_reclen=24, d_type=DT_DIR, d_name="."},
{d_ino=4637243, d_off=5301154723342888169, d_reclen=24, d_type=DT_DIR,
d_name="root"}, {d_ino=4587838, d_off=6989908915879243400,
d_reclen=32, d_type=DT_LNK, d_name="lib64"}, {d_ino=4587679,
d_off=7356513223657690979, d_reclen=32, d_type=DT_REG,
d_name="strace.log"}, {d_ino=4587847, d_off=7810090083157553519,
d_reclen=24, d_type=DT_LNK, d_name="sbin"}, {d_ino=4637240,
d_off=8254997891991845677, d_reclen=24, d_type=DT_DIR, d_name="home"},
{d_ino=4637248, d_off=9223372036854775807, d_reclen=24, d_type=DT_DIR,
d_name="virt"}], 32768) = 448
write(1, "readdir(dir)=(nil)\n", 19)    = 19
write(1, "errno=75: Value too large for de"..., 48) = 48
exit_group(0)                           = ?


We open fd 3 to read '.'; we issue the new fcntl, which
succeeds. Then there's some unrelated stuff operating on
stdout. Then we do a getdents64(), but the d_off values
we get back are still 64 bits. The guest binary doesn't
like those, so it fails. My expectation was that we would
get back d_off values here that were in the 32 bit range.

(To be clear, the guest binary here is doing a getdents64(),
which QEMU translates into a host getdents64().)

thanks
-- PMM
Florian Weimer April 20, 2020, 11:23 a.m. UTC | #2
* Peter Maydell:

> We open fd 3 to read '.'; we issue the new fcntl, which
> succeeds. Then there's some unrelated stuff operating on
> stdout. Then we do a getdents64(), but the d_off values
> we get back are still 64 bits. The guest binary doesn't
> like those, so it fails. My expectation was that we would
> get back d_off values here that were in the 32 bit range.

What's your file system?

I think not all of them have 32-bit hashes (some of them probably
can't, particularly in the network-based file system case).
Peter Maydell April 20, 2020, 11:38 a.m. UTC | #3
On Mon, 20 Apr 2020 at 12:23, Florian Weimer <fw@deneb.enyo.de> wrote:
>
> * Peter Maydell:
>
> > We open fd 3 to read '.'; we issue the new fcntl, which
> > succeeds. Then there's some unrelated stuff operating on
> > stdout. Then we do a getdents64(), but the d_off values
> > we get back are still 64 bits. The guest binary doesn't
> > like those, so it fails. My expectation was that we would
> > get back d_off values here that were in the 32 bit range.
>
> What's your file system?
>
> I think not all of them have 32-bit hashes (some of them probably
> can't, particularly in the network-based file system case).

Whoops, good point. I was testing this via lkvm, so it's
actually using a 9p filesystem... I'll see if I can figure
out how to test with an ext3 fs, which I think is the one
we most care about. It would be nice if the flag was
supported by other fses too, of course.

Appended is the QEMU patch I tested with.

thanks
-- PMM

From 73471e01733dd1d998ff3cd41edebb4c78793193 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 20 Apr 2020 11:54:22 +0100
Subject: [RFC] linux-user: Use new F_SET_FILE_32BIT_FS fcntl for 32-bit guests

If the guest is 32 bit then there is a potential problem if the
host gives us back a 64-bit sized value that we can't fit into
the ABI the guest requires. This is a theoretical issue for many
syscalls, but a real issue for directory reads where the host
is using ext3 or ext4. There the 'offset' values retured via
the getdents syscall are hashes, and on a 64-bit system they
will always fill the full 64 bits.

Use the F_SET_FILE_32BIT_FS fcntl to tell the kernel to stick
to 32-bit sized hashes for fds used by the guest.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
RFC patch because it depends on the kernel patch to provide
F_SET_FILE_32BIT_FS, which is still under discussion. All this
patch does is call the fcntl for every fd the guest opens.

 linux-user/syscall.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 674f70e70a5..8966d4881bd 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -884,6 +884,28 @@ static inline int host_to_target_sock_type(int host_type)
     return target_type;
 }

+/*
+ * If the guest is using a 32 bit ABI then we should try to ask the kernel
+ * to provide 32-bit offsets in getdents syscalls, as otherwise some
+ * filesystems will return 64-bit hash values which we can't fit into
+ * the field sizes the guest ABI mandates.
+ */
+#ifndef F_SET_FILE_32BIT_FS
+#define F_SET_FILE_32BIT_FS (1024 + 15)
+#endif
+
+static inline void request_32bit_fs(int fd)
+{
+#if HOST_LONG_BITS > TARGET_ABI_BITS
+    /*
+     * Ignore errors, which are likely due to the host kernel being too
+     * old to support this fcntl. We'll try anyway, which might or might
+     * not work, depending on the guest code and on the host filesystem.
+     */
+    fcntl(fd, F_SET_FILE_32BIT_FS);
+#endif
+}
+
 static abi_ulong target_brk;
 static abi_ulong target_original_brk;
 static abi_ulong brk_page;
@@ -7704,6 +7726,7 @@ static abi_long do_syscall1(void *cpu_env, int
num, abi_long arg1,
                                   target_to_host_bitmask(arg2,
fcntl_flags_tbl),
                                   arg3));
         fd_trans_unregister(ret);
+        request_32bit_fs(ret);
         unlock_user(p, arg1, 0);
         return ret;
 #endif
@@ -7714,6 +7737,7 @@ static abi_long do_syscall1(void *cpu_env, int
num, abi_long arg1,
                                   target_to_host_bitmask(arg3,
fcntl_flags_tbl),
                                   arg4));
         fd_trans_unregister(ret);
+        request_32bit_fs(ret);
         unlock_user(p, arg2, 0);
         return ret;
 #if defined(TARGET_NR_name_to_handle_at) && defined(CONFIG_OPEN_BY_HANDLE)
@@ -7725,6 +7749,7 @@ static abi_long do_syscall1(void *cpu_env, int
num, abi_long arg1,
     case TARGET_NR_open_by_handle_at:
         ret = do_open_by_handle_at(arg1, arg2, arg3);
         fd_trans_unregister(ret);
+        request_32bit_fs(ret);
         return ret;
 #endif
     case TARGET_NR_close:
@@ -7769,6 +7794,7 @@ static abi_long do_syscall1(void *cpu_env, int
num, abi_long arg1,
             return -TARGET_EFAULT;
         ret = get_errno(creat(p, arg2));
         fd_trans_unregister(ret);
+        request_32bit_fs(ret);
         unlock_user(p, arg1, 0);
         return ret;
 #endif
@@ -12393,6 +12419,7 @@ static abi_long do_syscall1(void *cpu_env, int
num, abi_long arg1,
         }
         ret = get_errno(memfd_create(p, arg2));
         fd_trans_unregister(ret);
+        request_32bit_fs(ret);
         unlock_user(p, arg1, 0);
         return ret;
 #endif
Peter Maydell April 20, 2020, 2:16 p.m. UTC | #4
On Mon, 20 Apr 2020 at 12:38, Peter Maydell <peter.maydell@linaro.org> wrote:
> Whoops, good point. I was testing this via lkvm, so it's
> actually using a 9p filesystem... I'll see if I can figure
> out how to test with an ext3 fs, which I think is the one
> we most care about.

After some effort wrestling with kvmtool (which assumes that
if you provide it a disk image then you must have wanted that
to be your rootfs and can only be persuaded otherwise via
some undocumented and arcane options), I did a test with ext4:

bash-4.4# /qemu-no-fix /readdir-bug
dir=0x76108
readdir(dir)=(nil)
errno=75: Value too large for defined data type
bash-4.4# /qemu-fixed /readdir-bug
dir=0x76108
readdir(dir)=0x76128
errno=0: Success

(where the host kernel has Linus' fcntl patch, qemu-no-fix
is a current-git-master QEMU and qemu-fixed is one with
the patch in my previous email).

So for Linus' patch:

Tested-by: Peter Maydell <peter.maydell@linaro.org>

If 9pfs could be persuaded to honour the fcntl flag too
that would be really nice.

thanks
-- PMM
Theodore Ts'o April 20, 2020, 3:13 p.m. UTC | #5
On Tue, Mar 31, 2020 at 03:35:36PM +0200, Linus Walleij wrote:
> It was brought to my attention that this bug from 2018 was
> still unresolved: 32 bit emulators like QEMU were given
> 64 bit hashes when running 32 bit emulation on 64 bit systems.
> 
> This adds a fcntl() operation to set the underlying filesystem
> into 32bit mode even if the file hanle was opened using 64bit
> mode without the compat syscalls.

s/hanle/handle/

The API that you've proposed as a way to set the 32-bit mode, but
there is no way to clear the 32-bit mode, nor there is a way to get
the current status mode.

My suggestion is to add a flag bit for F_GETFD and F_SETFD (set and
get file descriptor flags).  Currently the only file descriptor flag
is FD_CLOEXEC, so why not add a FD_32BIT_MODE bit?

Cheers,

						- Ted
Eric Blake April 20, 2020, 3:23 p.m. UTC | #6
On 4/20/20 10:13 AM, Theodore Y. Ts'o wrote:
> On Tue, Mar 31, 2020 at 03:35:36PM +0200, Linus Walleij wrote:
>> It was brought to my attention that this bug from 2018 was
>> still unresolved: 32 bit emulators like QEMU were given
>> 64 bit hashes when running 32 bit emulation on 64 bit systems.
>>
>> This adds a fcntl() operation to set the underlying filesystem
>> into 32bit mode even if the file hanle was opened using 64bit
>> mode without the compat syscalls.
> 
> s/hanle/handle/
> 
> The API that you've proposed as a way to set the 32-bit mode, but
> there is no way to clear the 32-bit mode, nor there is a way to get
> the current status mode.
> 
> My suggestion is to add a flag bit for F_GETFD and F_SETFD (set and
> get file descriptor flags).  Currently the only file descriptor flag
> is FD_CLOEXEC, so why not add a FD_32BIT_MODE bit?

Also, POSIX is proposing standardizing FD_CLOFORK, which would be 
another file descriptor flag worth considering in Linux (Solaris and BSD 
already have it):

https://www.austingroupbugs.net/view.php?id=1318

It will be interesting to find how much code (wrongly) assumes it can 
use a blind assignment of fcntl(fd, F_SETFD, 1) and thereby accidentally 
wipes out other existing flags, when it should have instead been doing a 
read-modify-write to protect flags other than FD_CLOEXEC.
Peter Maydell April 20, 2020, 3:29 p.m. UTC | #7
On Mon, 20 Apr 2020 at 16:24, Eric Blake <eblake@redhat.com> wrote:
> It will be interesting to find how much code (wrongly) assumes it can
> use a blind assignment of fcntl(fd, F_SETFD, 1) and thereby accidentally
> wipes out other existing flags, when it should have instead been doing a
> read-modify-write to protect flags other than FD_CLOEXEC.

For instance, a quick grep shows 4 instances of this in QEMU :-)

thanks
-- PMM
Theodore Ts'o April 20, 2020, 5:01 p.m. UTC | #8
On Mon, Apr 20, 2020 at 04:29:32PM +0100, Peter Maydell wrote:
> On Mon, 20 Apr 2020 at 16:24, Eric Blake <eblake@redhat.com> wrote:
> > It will be interesting to find how much code (wrongly) assumes it can
> > use a blind assignment of fcntl(fd, F_SETFD, 1) and thereby accidentally
> > wipes out other existing flags, when it should have instead been doing a
> > read-modify-write to protect flags other than FD_CLOEXEC.
> 
> For instance, a quick grep shows 4 instances of this in QEMU :-)

Fortunately, most applications aren't going to be interested in
forcing 32-bit mode for 64-bit applications, QEMU being the notable
exception.  We do need to make sure that for 32-bit applications, we
either make FD_32BIT_MODE a no-op (don't set the bit, and ignore the
bit).  We could allow the bit to be visible for 32-bit applications,
but we would want to disallow clearing the the bit for 32-bit
applications if it was visible.

If we did that, then blind assignments of fcntl(fd, F_SETFD, 1) should
be mostly harmless with respect to the FD_32BIT_MODE bit.

   	      	 	  	       - Ted
Andreas Dilger April 20, 2020, 11:51 p.m. UTC | #9
> From 73471e01733dd1d998ff3cd41edebb4c78793193 Mon Sep 17 00:00:00 2001
> From: Peter Maydell <peter.maydell@linaro.org>
> Date: Mon, 20 Apr 2020 11:54:22 +0100
> Subject: [RFC] linux-user: Use new F_SET_FILE_32BIT_FS fcntl for 32-bit guests
> 
> If the guest is 32 bit then there is a potential problem if the
> host gives us back a 64-bit sized value that we can't fit into
> the ABI the guest requires. This is a theoretical issue for many
> syscalls, but a real issue for directory reads where the host
> is using ext3 or ext4. There the 'offset' values retured via
> the getdents syscall are hashes, and on a 64-bit system they
> will always fill the full 64 bits.
> 
> Use the F_SET_FILE_32BIT_FS fcntl to tell the kernel to stick
> to 32-bit sized hashes for fds used by the guest.
> 
> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

Another question I had here is whether the filesystem needs to provide
32-bit values for other syscalls, such as stat() and statfs()?  For
ext4, stat() is not going to return a 64-bit inode number, but other
filesystems might (e.g. Lustre has a mode to do this).  Similarly,
should statfs() scale up f_bsize until it can return a 32-bit f_blocks
value?  We also had to do this ages ago for Lustre when 32-bit clients
couldn't handle > 16TB filesystems, but that is a single disk today.

Should that be added into F_SET_FILE_32BIT_FS also?

Cheers, Andreas

> ---
> RFC patch because it depends on the kernel patch to provide
> F_SET_FILE_32BIT_FS, which is still under discussion. All this
> patch does is call the fcntl for every fd the guest opens.
> 
> linux-user/syscall.c | 27 +++++++++++++++++++++++++++
> 1 file changed, 27 insertions(+)
> 
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 674f70e70a5..8966d4881bd 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -884,6 +884,28 @@ static inline int host_to_target_sock_type(int host_type)
>     return target_type;
> }
> 
> +/*
> + * If the guest is using a 32 bit ABI then we should try to ask the kernel
> + * to provide 32-bit offsets in getdents syscalls, as otherwise some
> + * filesystems will return 64-bit hash values which we can't fit into
> + * the field sizes the guest ABI mandates.
> + */
> +#ifndef F_SET_FILE_32BIT_FS
> +#define F_SET_FILE_32BIT_FS (1024 + 15)
> +#endif
> +
> +static inline void request_32bit_fs(int fd)
> +{
> +#if HOST_LONG_BITS > TARGET_ABI_BITS
> +    /*
> +     * Ignore errors, which are likely due to the host kernel being too
> +     * old to support this fcntl. We'll try anyway, which might or might
> +     * not work, depending on the guest code and on the host filesystem.
> +     */
> +    fcntl(fd, F_SET_FILE_32BIT_FS);
> +#endif
> +}
> +
> static abi_ulong target_brk;
> static abi_ulong target_original_brk;
> static abi_ulong brk_page;
> @@ -7704,6 +7726,7 @@ static abi_long do_syscall1(void *cpu_env, int
> num, abi_long arg1,
>                                   target_to_host_bitmask(arg2,
> fcntl_flags_tbl),
>                                   arg3));
>         fd_trans_unregister(ret);
> +        request_32bit_fs(ret);
>         unlock_user(p, arg1, 0);
>         return ret;
> #endif
> @@ -7714,6 +7737,7 @@ static abi_long do_syscall1(void *cpu_env, int
> num, abi_long arg1,
>                                   target_to_host_bitmask(arg3,
> fcntl_flags_tbl),
>                                   arg4));
>         fd_trans_unregister(ret);
> +        request_32bit_fs(ret);
>         unlock_user(p, arg2, 0);
>         return ret;
> #if defined(TARGET_NR_name_to_handle_at) && defined(CONFIG_OPEN_BY_HANDLE)
> @@ -7725,6 +7749,7 @@ static abi_long do_syscall1(void *cpu_env, int
> num, abi_long arg1,
>     case TARGET_NR_open_by_handle_at:
>         ret = do_open_by_handle_at(arg1, arg2, arg3);
>         fd_trans_unregister(ret);
> +        request_32bit_fs(ret);
>         return ret;
> #endif
>     case TARGET_NR_close:
> @@ -7769,6 +7794,7 @@ static abi_long do_syscall1(void *cpu_env, int
> num, abi_long arg1,
>             return -TARGET_EFAULT;
>         ret = get_errno(creat(p, arg2));
>         fd_trans_unregister(ret);
> +        request_32bit_fs(ret);
>         unlock_user(p, arg1, 0);
>         return ret;
> #endif
> @@ -12393,6 +12419,7 @@ static abi_long do_syscall1(void *cpu_env, int
> num, abi_long arg1,
>         }
>         ret = get_errno(memfd_create(p, arg2));
>         fd_trans_unregister(ret);
> +        request_32bit_fs(ret);
>         unlock_user(p, arg1, 0);
>         return ret;
> #endif
> --
> 2.20.1


Cheers, Andreas
Peter Maydell April 21, 2020, 1:02 p.m. UTC | #10
On Tue, 21 Apr 2020 at 00:51, Andreas Dilger <adilger@dilger.ca> wrote:
> Another question I had here is whether the filesystem needs to provide
> 32-bit values for other syscalls, such as stat() and statfs()?  For
> ext4, stat() is not going to return a 64-bit inode number, but other
> filesystems might (e.g. Lustre has a mode to do this).  Similarly,
> should statfs() scale up f_bsize until it can return a 32-bit f_blocks
> value?  We also had to do this ages ago for Lustre when 32-bit clients
> couldn't handle > 16TB filesystems, but that is a single disk today.
>
> Should that be added into F_SET_FILE_32BIT_FS also?

Interesting question. The directory-offset is the thing that's
got peoples' attention because it's what has actually been hit
in real-world situations, but other syscalls have the same
potential problem too. The closest I can think of to a 'general
rule' (in terms of what QEMU would like) would be "behave the
same way you would for a compat32 syscall if you had one, or
how you would behave on an actual 32-bit host".

thanks
-- PMM
diff mbox series

Patch

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2e4c0fa2074b..d194b1265bd4 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -426,6 +426,10 @@  static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SET_FILE_RW_HINT:
 		err = fcntl_rw_hint(filp, cmd, arg);
 		break;
+	case F_SET_FILE_32BIT_FS:
+		filp->f_mode |= FMODE_32BITHASH;
+		err = 0;
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index ca88b7bce553..b9ad934147e8 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -73,6 +73,15 @@ 
  */
 #define RWF_WRITE_LIFE_NOT_SET	RWH_WRITE_LIFE_NOT_SET
 
+/*
+ * This instructs the kernel to provide 32bit semantics (such as hashes) from
+ * the file system layer, when running a userland that depend on 32bit
+ * semantics on a kernel that supports 64bit userland, but does not use the
+ * compat ioctl() for e.g. open(), so that the kernel would otherwise assume
+ * that the userland process is capable of dealing with 64bit semantics.
+ */
+#define F_SET_FILE_32BIT_FS	(F_LINUX_SPECIFIC_BASE + 15)
+
 /*
  * Types of directory notifications that may be requested.
  */
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
index ca88b7bce553..b9ad934147e8 100644
--- a/tools/include/uapi/linux/fcntl.h
+++ b/tools/include/uapi/linux/fcntl.h
@@ -73,6 +73,15 @@ 
  */
 #define RWF_WRITE_LIFE_NOT_SET	RWH_WRITE_LIFE_NOT_SET
 
+/*
+ * This instructs the kernel to provide 32bit semantics (such as hashes) from
+ * the file system layer, when running a userland that depend on 32bit
+ * semantics on a kernel that supports 64bit userland, but does not use the
+ * compat ioctl() for e.g. open(), so that the kernel would otherwise assume
+ * that the userland process is capable of dealing with 64bit semantics.
+ */
+#define F_SET_FILE_32BIT_FS	(F_LINUX_SPECIFIC_BASE + 15)
+
 /*
  * Types of directory notifications that may be requested.
  */
diff --git a/tools/perf/trace/beauty/fcntl.c b/tools/perf/trace/beauty/fcntl.c
index 56ef83b3d130..da80264678cb 100644
--- a/tools/perf/trace/beauty/fcntl.c
+++ b/tools/perf/trace/beauty/fcntl.c
@@ -94,7 +94,8 @@  size_t syscall_arg__scnprintf_fcntl_arg(char *bf, size_t size, struct syscall_ar
 	    cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW || cmd == F_OFD_GETLK ||
 	    cmd == F_GETOWN_EX || cmd == F_SETOWN_EX ||
 	    cmd == F_GET_RW_HINT || cmd == F_SET_RW_HINT ||
-	    cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT)
+	    cmd == F_GET_FILE_RW_HINT || cmd == F_SET_FILE_RW_HINT ||
+	    cmd == F_SET_FILE_32BIT_FS)
 		return syscall_arg__scnprintf_hex(bf, size, arg);
 
 	return syscall_arg__scnprintf_long(bf, size, arg);