diff mbox series

[RFC,06/19] rust: fs: introduce `FileSystem::init_root`

Message ID 20231018122518.128049-7-wedsonaf@gmail.com (mailing list archive)
State New, archived
Headers show
Series Rust abstractions for VFS | expand

Commit Message

Wedson Almeida Filho Oct. 18, 2023, 12:25 p.m. UTC
From: Wedson Almeida Filho <walmeida@microsoft.com>

Allow Rust file systems to specify their root directory. Also allow them
to create (and do cache lookups of) directory inodes. (More types of
inodes are added in subsequent patches in the series.)

The `NewINode` type ensures that a new inode is properly initialised
before it is marked so. It also facilitates error paths by automatically
marking inodes as failed if they're not properly initialised.

Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
---
 rust/helpers.c            |  12 +++
 rust/kernel/fs.rs         | 178 +++++++++++++++++++++++++++++++-------
 samples/rust/rust_rofs.rs |  22 ++++-
 3 files changed, 181 insertions(+), 31 deletions(-)

Comments

Benno Lossin Oct. 19, 2023, 2:30 p.m. UTC | #1
On 18.10.23 14:25, Wedson Almeida Filho wrote:
> From: Wedson Almeida Filho <walmeida@microsoft.com>
> 
> Allow Rust file systems to specify their root directory. Also allow them
> to create (and do cache lookups of) directory inodes. (More types of
> inodes are added in subsequent patches in the series.)
> 
> The `NewINode` type ensures that a new inode is properly initialised
> before it is marked so. It also facilitates error paths by automatically
> marking inodes as failed if they're not properly initialised.
> 
> Signed-off-by: Wedson Almeida Filho <walmeida@microsoft.com>
> ---
>   rust/helpers.c            |  12 +++
>   rust/kernel/fs.rs         | 178 +++++++++++++++++++++++++++++++-------
>   samples/rust/rust_rofs.rs |  22 ++++-
>   3 files changed, 181 insertions(+), 31 deletions(-)
> 
> diff --git a/rust/helpers.c b/rust/helpers.c
> index fe45f8ddb31f..c5a2bec6467d 100644
> --- a/rust/helpers.c
> +++ b/rust/helpers.c
> @@ -145,6 +145,18 @@ struct kunit *rust_helper_kunit_get_current_test(void)
>   }
>   EXPORT_SYMBOL_GPL(rust_helper_kunit_get_current_test);
> 
> +void rust_helper_i_uid_write(struct inode *inode, uid_t uid)
> +{
> +	i_uid_write(inode, uid);
> +}
> +EXPORT_SYMBOL_GPL(rust_helper_i_uid_write);
> +
> +void rust_helper_i_gid_write(struct inode *inode, gid_t gid)
> +{
> +	i_gid_write(inode, gid);
> +}
> +EXPORT_SYMBOL_GPL(rust_helper_i_gid_write);
> +
>   off_t rust_helper_i_size_read(const struct inode *inode)
>   {
>   	return i_size_read(inode);
> diff --git a/rust/kernel/fs.rs b/rust/kernel/fs.rs
> index 30fa1f312f33..f3a41cf57502 100644
> --- a/rust/kernel/fs.rs
> +++ b/rust/kernel/fs.rs
> @@ -7,9 +7,9 @@
>   //! C headers: [`include/linux/fs.h`](../../include/linux/fs.h)
> 
>   use crate::error::{code::*, from_result, to_result, Error, Result};
> -use crate::types::{AlwaysRefCounted, Opaque};
> -use crate::{bindings, init::PinInit, str::CStr, try_pin_init, ThisModule};
> -use core::{marker::PhantomData, marker::PhantomPinned, pin::Pin, ptr};
> +use crate::types::{ARef, AlwaysRefCounted, Either, Opaque};
> +use crate::{bindings, init::PinInit, str::CStr, time::Timespec, try_pin_init, ThisModule};
> +use core::{marker::PhantomData, marker::PhantomPinned, mem::ManuallyDrop, pin::Pin, ptr};
>   use macros::{pin_data, pinned_drop};
> 
>   /// Maximum size of an inode.
> @@ -22,6 +22,12 @@ pub trait FileSystem {
> 
>       /// Returns the parameters to initialise a super block.
>       fn super_params(sb: &NewSuperBlock<Self>) -> Result<SuperParams>;
> +
> +    /// Initialises and returns the root inode of the given superblock.
> +    ///
> +    /// This is called during initialisation of a superblock after [`FileSystem::super_params`] has
> +    /// completed successfully.
> +    fn init_root(sb: &SuperBlock<Self>) -> Result<ARef<INode<Self>>>;
>   }
> 
>   /// A registration of a file system.
> @@ -143,12 +149,136 @@ unsafe fn dec_ref(obj: ptr::NonNull<Self>) {
>       }
>   }
> 
> +/// An inode that is locked and hasn't been initialised yet.
> +#[repr(transparent)]
> +pub struct NewINode<T: FileSystem + ?Sized>(ARef<INode<T>>);
> +
> +impl<T: FileSystem + ?Sized> NewINode<T> {
> +    /// Initialises the new inode with the given parameters.
> +    pub fn init(self, params: INodeParams) -> Result<ARef<INode<T>>> {
> +        // SAFETY: This is a new inode, so it's safe to manipulate it mutably.

How do you know that this is a new inode? Maybe add a type invariant?

> +        let inode = unsafe { &mut *self.0 .0.get() };
> +
> +        let mode = match params.typ {
> +            INodeType::Dir => {
> +                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
> +                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
> +
> +                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
> +                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
> +                bindings::S_IFDIR
> +            }
> +        };
> +
> +        inode.i_mode = (params.mode & 0o777) | u16::try_from(mode)?;
> +        inode.i_size = params.size;
> +        inode.i_blocks = params.blocks;
> +
> +        inode.__i_ctime = params.ctime.into();
> +        inode.i_mtime = params.mtime.into();
> +        inode.i_atime = params.atime.into();
> +
> +        // SAFETY: inode is a new inode, so it is valid for write.
> +        unsafe {
> +            bindings::set_nlink(inode, params.nlink);
> +            bindings::i_uid_write(inode, params.uid);
> +            bindings::i_gid_write(inode, params.gid);
> +            bindings::unlock_new_inode(inode);
> +        }
> +
> +        // SAFETY: We are manually destructuring `self` and preventing `drop` from being called.
> +        Ok(unsafe { (&ManuallyDrop::new(self).0 as *const ARef<INode<T>>).read() })

Add a comment that explains why you need to do this instead of `self.0`.

> +    }
> +}
> +
> +impl<T: FileSystem + ?Sized> Drop for NewINode<T> {
> +    fn drop(&mut self) {
> +        // SAFETY: The new inode failed to be turned into an initialised inode, so it's safe (and
> +        // in fact required) to call `iget_failed` on it.
> +        unsafe { bindings::iget_failed(self.0 .0.get()) };
> +    }
> +}
> +
> +/// The type of the inode.
> +#[derive(Copy, Clone)]
> +pub enum INodeType {
> +    /// Directory type.
> +    Dir,
> +}
> +
> +/// Required inode parameters.
> +///
> +/// This is used when creating new inodes.
> +pub struct INodeParams {
> +    /// The access mode. It's a mask that grants execute (1), write (2) and read (4) access to
> +    /// everyone, the owner group, and the owner.
> +    pub mode: u16,
> +
> +    /// Type of inode.
> +    ///
> +    /// Also carries additional per-type data.
> +    pub typ: INodeType,
> +
> +    /// Size of the contents of the inode.
> +    ///
> +    /// Its maximum value is [`MAX_LFS_FILESIZE`].
> +    pub size: i64,
> +
> +    /// Number of blocks.
> +    pub blocks: u64,
> +
> +    /// Number of links to the inode.
> +    pub nlink: u32,
> +
> +    /// User id.
> +    pub uid: u32,
> +
> +    /// Group id.
> +    pub gid: u32,
> +
> +    /// Creation time.
> +    pub ctime: Timespec,
> +
> +    /// Last modification time.
> +    pub mtime: Timespec,
> +
> +    /// Last access time.
> +    pub atime: Timespec,
> +}
> +
>   /// A file system super block.
>   ///
>   /// Wraps the kernel's `struct super_block`.
>   #[repr(transparent)]
>   pub struct SuperBlock<T: FileSystem + ?Sized>(Opaque<bindings::super_block>, PhantomData<T>);
> 
> +impl<T: FileSystem + ?Sized> SuperBlock<T> {
> +    /// Tries to get an existing inode or create a new one if it doesn't exist yet.
> +    pub fn get_or_create_inode(&self, ino: Ino) -> Result<Either<ARef<INode<T>>, NewINode<T>>> {
> +        // SAFETY: The only initialisation missing from the superblock is the root, and this
> +        // function is needed to create the root, so it's safe to call it.

This is a weird safety comment. Why is the superblock not fully
initialized? Why is it safe to call the function? This comment doesn't
really explain anything.

> +        let inode =
> +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
> +
> +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
> +        // an already-initialised inode), so we use `read_volatile` to read its current state.
> +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };

Are you sure that `read_volatile` is sufficient for this use case? The
documentation [1] clearly states that concurrent write operations are still
UB:

    Just like in C, whether an operation is volatile has no bearing
    whatsoever on questions involving concurrent access from multiple
    threads. Volatile accesses behave exactly like non-atomic accesses in
    that regard. In particular, a race between a read_volatile and any
    write operation to the same location is undefined behavior.

[1]: https://doc.rust-lang.org/core/ptr/fn.read_volatile.html
Boqun Feng Oct. 20, 2023, 12:30 a.m. UTC | #2
On Wed, Oct 18, 2023 at 09:25:05AM -0300, Wedson Almeida Filho wrote:
[...]
> +/// An inode that is locked and hasn't been initialised yet.
> +#[repr(transparent)]
> +pub struct NewINode<T: FileSystem + ?Sized>(ARef<INode<T>>);
> +
> +impl<T: FileSystem + ?Sized> NewINode<T> {
> +    /// Initialises the new inode with the given parameters.
> +    pub fn init(self, params: INodeParams) -> Result<ARef<INode<T>>> {
> +        // SAFETY: This is a new inode, so it's safe to manipulate it mutably.
> +        let inode = unsafe { &mut *self.0 .0.get() };
> +
> +        let mode = match params.typ {
> +            INodeType::Dir => {
> +                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
> +                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
> +
> +                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
> +                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
> +                bindings::S_IFDIR
> +            }
> +        };
> +
> +        inode.i_mode = (params.mode & 0o777) | u16::try_from(mode)?;
> +        inode.i_size = params.size;
> +        inode.i_blocks = params.blocks;
> +
> +        inode.__i_ctime = params.ctime.into();
> +        inode.i_mtime = params.mtime.into();
> +        inode.i_atime = params.atime.into();
> +
> +        // SAFETY: inode is a new inode, so it is valid for write.
> +        unsafe {
> +            bindings::set_nlink(inode, params.nlink);
> +            bindings::i_uid_write(inode, params.uid);
> +            bindings::i_gid_write(inode, params.gid);
> +            bindings::unlock_new_inode(inode);
> +        }
> +
> +        // SAFETY: We are manually destructuring `self` and preventing `drop` from being called.
> +        Ok(unsafe { (&ManuallyDrop::new(self).0 as *const ARef<INode<T>>).read() })

How do we feel about using transmute here? ;-) I.e.

	// SAFETY: `NewINode` is transparent to `ARef<INode<_>>`, and
	// the inode has been initialised, so it's safe to change the
	// object type.
	Ok(unsafe { core::mem::transmute(self) })

What we actually want here is changing the type of the object (i.e.
bitwise move from one type to another), seems to me that transmute is
the best fit here.

Thoughts?

Regards,
Boqun


> +    }
> +}
> +
> +impl<T: FileSystem + ?Sized> Drop for NewINode<T> {
> +    fn drop(&mut self) {
> +        // SAFETY: The new inode failed to be turned into an initialised inode, so it's safe (and
> +        // in fact required) to call `iget_failed` on it.
> +        unsafe { bindings::iget_failed(self.0 .0.get()) };
> +    }
> +}
> +
[...]
Boqun Feng Oct. 20, 2023, 12:52 a.m. UTC | #3
On Thu, Oct 19, 2023 at 02:30:56PM +0000, Benno Lossin wrote:
[...]
> > +        let inode =
> > +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
> > +
> > +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
> > +        // an already-initialised inode), so we use `read_volatile` to read its current state.
> > +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
> 
> Are you sure that `read_volatile` is sufficient for this use case? The
> documentation [1] clearly states that concurrent write operations are still
> UB:
> 
>     Just like in C, whether an operation is volatile has no bearing
>     whatsoever on questions involving concurrent access from multiple
>     threads. Volatile accesses behave exactly like non-atomic accesses in
>     that regard. In particular, a race between a read_volatile and any
>     write operation to the same location is undefined behavior.
> 

Right, `read_volatile` can have data race. I think what we can do here
is:

	// SAFETY: `i_state` in `inode` is `unsigned long`, therefore
	// it's safe to treat it as `AtomicUsize` and do a relaxed read.
	let state = unsafe { *(ptr::addr_of!((*inode.as_ptr()).i_state).cast::<AtomicUsize>()).load(Relaxed) };

Regards,
Boqun

> [1]: https://doc.rust-lang.org/core/ptr/fn.read_volatile.html
> 
> -- 
> Cheers,
> Benno
>
Benno Lossin Oct. 21, 2023, 1:48 p.m. UTC | #4
On 20.10.23 02:52, Boqun Feng wrote:
> On Thu, Oct 19, 2023 at 02:30:56PM +0000, Benno Lossin wrote:
> [...]
>>> +        let inode =
>>> +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
>>> +
>>> +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
>>> +        // an already-initialised inode), so we use `read_volatile` to read its current state.
>>> +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
>>
>> Are you sure that `read_volatile` is sufficient for this use case? The
>> documentation [1] clearly states that concurrent write operations are still
>> UB:
>>
>>      Just like in C, whether an operation is volatile has no bearing
>>      whatsoever on questions involving concurrent access from multiple
>>      threads. Volatile accesses behave exactly like non-atomic accesses in
>>      that regard. In particular, a race between a read_volatile and any
>>      write operation to the same location is undefined behavior.
>>
> 
> Right, `read_volatile` can have data race. I think what we can do here
> is:
> 
> 	// SAFETY: `i_state` in `inode` is `unsigned long`, therefore
> 	// it's safe to treat it as `AtomicUsize` and do a relaxed read.
> 	let state = unsafe { *(ptr::addr_of!((*inode.as_ptr()).i_state).cast::<AtomicUsize>()).load(Relaxed) };

I am not sure if that is enough. What kind of writes happen
concurrently on the C side? If they are atomic, then this should
be fine, if they are not synchronized at all, then it could be
problematic, as miri says that it is still UB:
https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=aa75fb6805c8d67ade8837531a2096d0
Boqun Feng Oct. 21, 2023, 3:57 p.m. UTC | #5
On Sat, Oct 21, 2023 at 01:48:28PM +0000, Benno Lossin wrote:
> On 20.10.23 02:52, Boqun Feng wrote:
> > On Thu, Oct 19, 2023 at 02:30:56PM +0000, Benno Lossin wrote:
> > [...]
> >>> +        let inode =
> >>> +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
> >>> +
> >>> +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
> >>> +        // an already-initialised inode), so we use `read_volatile` to read its current state.
> >>> +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
> >>
> >> Are you sure that `read_volatile` is sufficient for this use case? The
> >> documentation [1] clearly states that concurrent write operations are still
> >> UB:
> >>
> >>      Just like in C, whether an operation is volatile has no bearing
> >>      whatsoever on questions involving concurrent access from multiple
> >>      threads. Volatile accesses behave exactly like non-atomic accesses in
> >>      that regard. In particular, a race between a read_volatile and any
> >>      write operation to the same location is undefined behavior.
> >>
> > 
> > Right, `read_volatile` can have data race. I think what we can do here
> > is:
> > 
> > 	// SAFETY: `i_state` in `inode` is `unsigned long`, therefore
> > 	// it's safe to treat it as `AtomicUsize` and do a relaxed read.
> > 	let state = unsafe { *(ptr::addr_of!((*inode.as_ptr()).i_state).cast::<AtomicUsize>()).load(Relaxed) };
> 
> I am not sure if that is enough. What kind of writes happen
> concurrently on the C side? If they are atomic, then this should
> be fine, if they are not synchronized at all, then it could be
> problematic, as miri says that it is still UB:
> https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=aa75fb6805c8d67ade8837531a2096d0
> 

You're not wrong, my suggestion here had the assumption that write part
of ->i_state is atomic (I hadn't looked into that). Now a quick look tells
it isn't, for example in fs/f2fs/namei.c, there is:

	inode->i_state |= I_LINKABLE;

so I think we need to take the inode->i_lock here for a data-race free
solution. Or if we have something like:

	https://github.com/rust-lang/unsafe-code-guidelines/issues/321

in Rust.

Benno, notice my reasoning about whether a write is atomic is less
strict, since in C side, in the current rule of the kernel, plain
writes to machine words can be treated as atomic, in case you're
interested CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is the pointer ;-)

While we are at it, adding Marco, could kcsan work for Rust code? If I
understand correctly, as long as Rust compilers could generate these
__tsan_* instrument functions, it should work, right?

Regards,
Boqun

> -- 
> Cheers,
> Benno
Matthew Wilcox (Oracle) Oct. 21, 2023, 5:01 p.m. UTC | #6
On Sat, Oct 21, 2023 at 08:57:30AM -0700, Boqun Feng wrote:
> You're not wrong, my suggestion here had the assumption that write part
> of ->i_state is atomic (I hadn't look into that). Now a quick look tells
> it isn't, for example in fs/f2fs/namei.c, there is:
> 
> 	inode->i_state |= I_LINKABLE;

But it doesn't matter what f2fs does to _its_ inodes.  tarfs will never
see an f2fs inode.  I don't know what the rules are around inode->i_state;
I'm only an expert on the page cache, not the rest of the VFS.  So
what are the rules around modifying i_state for the VFS?
Boqun Feng Oct. 21, 2023, 7:33 p.m. UTC | #7
On Sat, Oct 21, 2023 at 06:01:02PM +0100, Matthew Wilcox wrote:
> On Sat, Oct 21, 2023 at 08:57:30AM -0700, Boqun Feng wrote:
> > You're not wrong, my suggestion here had the assumption that write part
> > of ->i_state is atomic (I hadn't look into that). Now a quick look tells
> > it isn't, for example in fs/f2fs/namei.c, there is:
> > 
> > 	inode->i_state |= I_LINKABLE;
> 
> But it doesn't matter what f2fs does to _its_ inodes.  tarfs will never
> see an f2fs inode.  I don't know what the rules are around inode->i_state;

Well, maybe I chose a bad example ;-) I agree that tarfs will never see
an f2fs inode and since tarfs is the only user right now, the data race
should really depend on tarfs right now. But this is general filesystem
Rust API, so it should in theory work with everything. Plus fs/dcache.c
has something similar:

	inode->i_state &= ~I_NEW & ~I_CREATING;

> I'm only an expert on the page cache, not the rest of the VFS.  So
> what are the rules around modifying i_state for the VFS?
> 

Agreed, same question here.

Regards,
Boqun

>
Dave Chinner Oct. 23, 2023, 5:29 a.m. UTC | #8
On Sat, Oct 21, 2023 at 12:33:57PM -0700, Boqun Feng wrote:
> On Sat, Oct 21, 2023 at 06:01:02PM +0100, Matthew Wilcox wrote:
> > I'm only an expert on the page cache, not the rest of the VFS.  So
> > what are the rules around modifying i_state for the VFS?
> 
> Agreed, same question here.

inode->i_state should only be modified under inode->i_lock.

And in most situations, you have to hold the inode->i_lock to read
state flags as well so that reads are serialised against
modifications which are typically non-atomic RMW operations.

There is, I think, one main exception to read side locking and this
is find_inode_rcu() which does an unlocked check for I_WILL_FREE |
I_FREEING. In this case, the inode->i_state updates in iput_final()
use WRITE_ONCE under the inode->i_lock to provide the necessary
semantics for the unlocked READ_ONCE() done under rcu_read_lock().

IOWs, if you follow the general rule that any inode->i_state access
(read or write) needs to hold inode->i_lock, you probably won't
screw up. 

-Dave.
Wedson Almeida Filho Oct. 23, 2023, 12:36 p.m. UTC | #9
On Thu, 19 Oct 2023 at 21:31, Boqun Feng <boqun.feng@gmail.com> wrote:
> On Wed, Oct 18, 2023 at 09:25:05AM -0300, Wedson Almeida Filho wrote:
> > +        // SAFETY: We are manually destructuring `self` and preventing `drop` from being called.
> > +        Ok(unsafe { (&ManuallyDrop::new(self).0 as *const ARef<INode<T>>).read() })
>
> How do we feel about using transmute here? ;-) I.e.
>
>         // SAFETY: `NewINode` is transparent to `ARef<INode<_>>`, and
>         // the inode has been initialised, so it's safety to change the
>         // object type.
>         Ok(unsafe { core::mem::transmute(self) })
>
> What we actually want here is changing the type of the object (i.e.
> bitwise move from one type to another), seems to me that transmute is
> the best fit here.
>
> Thoughts?

That's much nicer. I'll do this in v2.
Wedson Almeida Filho Oct. 23, 2023, 12:55 p.m. UTC | #10
On Mon, 23 Oct 2023 at 02:29, Dave Chinner <david@fromorbit.com> wrote:
>
> On Sat, Oct 21, 2023 at 12:33:57PM -0700, Boqun Feng wrote:
> > On Sat, Oct 21, 2023 at 06:01:02PM +0100, Matthew Wilcox wrote:
> > > I'm only an expert on the page cache, not the rest of the VFS.  So
> > > what are the rules around modifying i_state for the VFS?
> >
> > Agreed, same question here.
>
> inode->i_state should only be modified under inode->i_lock.
>
> And in most situations, you have to hold the inode->i_lock to read
> state flags as well so that reads are serialised against
> modifications which are typically non-atomic RMW operations.
>
> There is, I think, one main exception to read side locking and this
> is find_inode_rcu() which does an unlocked check for I_WILL_FREE |
> I_FREEING. In this case, the inode->i_state updates in iput_final()
> use WRITE_ONCE under the inode->i_lock to provide the necessary
> semantics for the unlocked READ_ONCE() done under rcu_read_lock().
>
> IOWs, if you follow the general rule that any inode->i_state access
> (read or write) needs to hold inode->i_lock, you probably won't
> screw up.

I don't see filesystems doing this though. In particular, see
iget_locked() -- if a new inode is returned, then it is locked, but if
a cached one is found, it's not locked.

So we're in this situation where a returned inode may or may not be
locked. And the way to determine if it's locked or not is to read
i_state.

Here are examples of kernfs, ext2, ext4 and squashfs doing it:
https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/kernfs/inode.c#L252
https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/ext2/inode.c#L1392
https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/ext4/inode.c#L4707
https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/squashfs/inode.c#L82

They all call iget_locked(), and if I_NEW is set, they initialise the
inode and unlock it with unlock_new_inode(); otherwise they just
return the unlocked inode.
Dave Chinner Oct. 30, 2023, 2:29 a.m. UTC | #11
On Mon, Oct 23, 2023 at 09:55:08AM -0300, Wedson Almeida Filho wrote:
> On Mon, 23 Oct 2023 at 02:29, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Sat, Oct 21, 2023 at 12:33:57PM -0700, Boqun Feng wrote:
> > > On Sat, Oct 21, 2023 at 06:01:02PM +0100, Matthew Wilcox wrote:
> > > > I'm only an expert on the page cache, not the rest of the VFS.  So
> > > > what are the rules around modifying i_state for the VFS?
> > >
> > > Agreed, same question here.
> >
> > inode->i_state should only be modified under inode->i_lock.
> >
> > And in most situations, you have to hold the inode->i_lock to read
> > state flags as well so that reads are serialised against
> > modifications which are typically non-atomic RMW operations.
> >
> > There is, I think, one main exception to read side locking and this
> > is find_inode_rcu() which does an unlocked check for I_WILL_FREE |
> > I_FREEING. In this case, the inode->i_state updates in iput_final()
> > use WRITE_ONCE under the inode->i_lock to provide the necessary
> > semantics for the unlocked READ_ONCE() done under rcu_read_lock().
> >
> > IOWs, if you follow the general rule that any inode->i_state access
> > (read or write) needs to hold inode->i_lock, you probably won't
> > screw up.
> 
> I don't see filesystems doing this though. In particular, see
> iget_locked() -- if a new inode is returned, then it is locked, but if
> a cached one is found, it's not locked.

I did say "if you follow the general rule".

And where there is a "general rule" there is the implication that
there are special cases where the "general rule" doesn't get
applied, yes? :)

I_NEW is the exception to the general rule, and very few people
writing filesystems actually know about it let alone care about
it...

> So we're in this situation where a returned inode may or may not be
> locked. And the way to determine if it's locked or not is to read
> i_state.
> 
> Here are examples of kernfs, ext2, ext4 and squashfs doing it:
> https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/kernfs/inode.c#L252
> https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/ext2/inode.c#L1392
> https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/ext4/inode.c#L4707
> https://elixir.bootlin.com/linux/v6.6-rc7/source/fs/squashfs/inode.c#L82
> 
> They all call iget_locked(), and if I_NEW is set, they initialise the
> inode and unlock it with unlock_new_inode(); otherwise they just
> return the unlocked inode.

All of them are perfectly fine.

I_NEW is the bit we use to synchronise inode initialisation - we
have to ensure there is only a single initialisation running while
there are concurrent lookups that can find the inode whilst it is
being initialised. We cannot hold a spin lock over inode
initialisation (it may have to do IO!), so we set the I_NEW flag
under the i_lock and the inode_hash_lock during hash insertion so
that they are set atomically from the hash lookup POV. If the inode
is then found in cache, wait_on_inode() does the serialisation
against the running initialisation indicated by the __I_NEW bit in
the i_state word.

Hence if the caller of iget_locked() ever sees I_NEW, it is
guaranteed to have exclusive access to the inode and -must- first
initialise the inode and then call unlock_new_inode() when it has
completed. It doesn't need to hold inode->i_lock in this case
because there's nothing it needs to serialise against as
iget_locked() has already done all that work.

If the inode is found in cache by iget_locked, then the
wait_on_inode() call is guaranteed to ensure that I_NEW is not set
when it returns. The atomic bit operations on __I_NEW and the memory
barriers in unlock_new_inode() play an important part in this
dance, and they guarantee that I_NEW has been cleared before
iget_locked() returns. No need for inode->i_lock to be held in this
case, either, because iget_locked() did all the serialisation for
us.

This special dance is an optimisation that avoids the need to take
inode->i_lock in the inode lookup fast path just to check I_NEW. It
is an exception to the general rule but internally it uses
inode->i_lock in the places it is needed to ensure anything using
the general rule about accessing i_state still behaves correctly.

Cheers,

Dave.
Wedson Almeida Filho Oct. 31, 2023, 8:49 p.m. UTC | #12
On Sun, 29 Oct 2023 at 23:29, Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Oct 23, 2023 at 09:55:08AM -0300, Wedson Almeida Filho wrote:
> > On Mon, 23 Oct 2023 at 02:29, Dave Chinner <david@fromorbit.com> wrote:
> > > IOWs, if you follow the general rule that any inode->i_state access
> > > (read or write) needs to hold inode->i_lock, you probably won't
> > > screw up.
> >
> > I don't see filesystems doing this though. In particular, see
> > iget_locked() -- if a new inode is returned, then it is locked, but if
> > a cached one is found, it's not locked.
>
> I did say "if you follow the general rule".
>
> And where there is a "general rule" there is the implication that
> there are special cases where the "general rule" doesn't get
> applied, yes? :)

Sure. But when you say "if _you_ do X", it gives me the impression
that I have a choice. But if I want to use `iget_locked`, I don't have
the option to follow the "general rule" you state.

I guess I have the option to ignore `iget_locked`. :)

> I_NEW is the exception to the general rule, and very few people
> writing filesystems actually know about it let alone care about
> it...
<snip>
> All of them are perfectly fine.

I'm not sure I agree with this. They may be fine, but I wouldn't say
perfectly. :)

> I_NEW is the bit we use to synchronise inode initialisation - we
> have to ensure there is only a single initialisation running while
> there are concurrent lookups that can find the inode whilst it is
> being initialised. We cannot hold a spin lock over inode
> initialisation (it may have to do IO!), so we set the I_NEW flag
> under the i_lock and the inode_hash_lock during hash insertion so
> that they are set atomically from the hash lookup POV. If the inode
> is then found in cache, wait_on_inode() does the serialisation
> against the running initialisation indicated by the __I_NEW bit in
> the i_state word.
>
> Hence if the caller of iget_locked() ever sees I_NEW, it is
> guaranteed to have exclusive access to the inode and -must- first
> initialise the inode and then call unlock_new_inode() when it has
> completed. It doesn't need to hold inode->i_lock in this case
> because there's nothing it needs to serialise against as
> iget_locked() has already done all that work.
>
> If the inode is found in cache by iget_locked, then the
> wait_on_inode() call is guaranteed to ensure that I_NEW is not set
> when it returns. The atomic bit operations on __I_NEW and the memory
> barriers in unlock_new_inode() plays an important part in this
> dance, and they guarantee that I_NEW has been cleared before
> iget_locked() returns. No need for inode->i_lock to be held in this
> case, either, because iget_locked() did all the serialisation for
> us.

Thanks for the explanation!

Let's consider the case when I call `inode_get`, and it finds an inode
that _has_ been fully initialised before, so I_NEW is not set in
inode->i_state and the inode is _not_ locked.

But the only means of checking that is by inspecting the i_state
field, so I do something like:

if (!(inode->i_state & I_NEW))
    return inode;

But now suppose that while I'm doing a naked load on inode->i_state,
another cpu is running concurrently and happens to be holding the
inode->i_lock, so it is within its right to write to inode->i_state,
for example through a call to __inode_add_lru, which has the
following:

inode->i_state |= I_REFERENCED;

So we have a thread doing a naked read and another thread doing a
naked write, no ordering between them.

Would you agree that this is a data race? (Note that I'm not asking if
"it will be ok" or "the compilers today generate the right code", I'm
asking merely if you agree this is a data race.)

If you do, then you'd have to agree that we are in undefined-behaviour
territory. I can quote the spec if you'd like.

Anyway, the discussion here is that this is also undefined behaviour
in Rust. And we're trying really hard to avoid that. Of course, in
cases like this there's not much we can do on the Rust side alone so
the conclusion now appears to be that we'll introduce helper functions
for this now and live with it. If one day we have a better solution,
we'll update just one place.

But we want to be very deliberate about these. We don't want to
accidentally introduce data races (and therefore potential undefined
behaviour).

Cheers,
-Wedson
Dave Chinner Nov. 8, 2023, 4:54 a.m. UTC | #13
On Tue, Oct 31, 2023 at 05:49:19PM -0300, Wedson Almeida Filho wrote:
> On Sun, 29 Oct 2023 at 23:29, Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Mon, Oct 23, 2023 at 09:55:08AM -0300, Wedson Almeida Filho wrote:
> > > On Mon, 23 Oct 2023 at 02:29, Dave Chinner <david@fromorbit.com> wrote:
> > > > IOWs, if you follow the general rule that any inode->i_state access
> > > > (read or write) needs to hold inode->i_lock, you probably won't
> > > > screw up.
> > >
> > > I don't see filesystems doing this though. In particular, see
> > > iget_locked() -- if a new inode is returned, then it is locked, but if
> > > a cached one is found, it's not locked.
> >
> > I did say "if you follow the general rule".
> >
> > And where there is a "general rule" there is the implication that
> > there are special cases where the "general rule" doesn't get
> > applied, yes? :)
> 
> Sure. But when you say "if _you_ do X", it gives me the impression
> that I have a choice. But if I want to use `iget_locked`, I don't have
> the option to follow the "general rule" you state.
> 
> I guess I have the option to ignore `iget_locked`. :)
> 
> > I_NEW is the exception to the general rule, and very few people
> > writing filesystems actually know about it let alone care about
> > it...
> <snip>
> > All of them are perfectly fine.
> 
> I'm not sure I agree with this. They may be fine, but I wouldn't say
> perfectly. :)
> 
> > I_NEW is the bit we use to synchronise inode initialisation - we
> > have to ensure there is only a single initialisation running while
> > there are concurrent lookups that can find the inode whilst it is
> > being initialised. We cannot hold a spin lock over inode
> > initialisation (it may have to do IO!), so we set the I_NEW flag
> > under the i_lock and the inode_hash_lock during hash insertion so
> > that they are set atomically from the hash lookup POV. If the inode
> > is then found in cache, wait_on_inode() does the serialisation
> > against the running initialisation indicated by the __I_NEW bit in
> > the i_state word.
> >
> > Hence if the caller of iget_locked() ever sees I_NEW, it is
> > guaranteed to have exclusive access to the inode and -must- first
> > initialise the inode and then call unlock_new_inode() when it has
> > completed. It doesn't need to hold inode->i_lock in this case
> > because there's nothing it needs to serialise against as
> > iget_locked() has already done all that work.
> >
> > If the inode is found in cache by iget_locked, then the
> > wait_on_inode() call is guaranteed to ensure that I_NEW is not set
> > when it returns. The atomic bit operations on __I_NEW and the memory
> > barriers in unlock_new_inode() play an important part in this
> > dance, and they guarantee that I_NEW has been cleared before
> > iget_locked() returns. No need for inode->i_lock to be held in this
> > case, either, because iget_locked() did all the serialisation for
> > us.
> 
> Thanks for the explanation!
> 
> Let's consider the case when I call `inode_get`, and it finds an inode
> that _has_ been fully initialised before, so I_NEW is not set in
> inode->i_state and the inode is _not_ locked.
> 
> But the only means of checking that is by inspecting the i_state
> field, so I do something like:
> 
> if (!(inode->i_state & I_NEW))
>     return inode;
> 
> But now suppose that while I'm doing a naked load on inode->i_state,
> another cpu is running concurrently and happens to be holding the
> inode->i_lock, so it is within its right to write to inode->i_state,
> for example through a call to __inode_add_lru, which has the
> following:
> 
> inode->i_state |= I_REFERENCED;
> 
> So we have a thread doing a naked read and another thread doing a
> naked write, no ordering between them.
> 
> Would you agree that this is a data race? (Note that I'm not asking if
> "it will be ok" or "the compilers today generate the right code", I'm
> asking merely if you agree this is a data race.)

I'll agree that technically it is a data race on the entire i_state
word. Practically, however, it is not a data race on the I_NEW bit
within that word. The I_NEW bit remains unchanged across the entire
operation.

i.e. it does not matter where the read of i_state intersects with
the RMW of I_REFERENCED bit, the I_NEW bit remains unchanged in
memory across the operation. If the above operation results in the
I_NEW bit changing state in memory - even transiently - then the
compiler implementation is simply broken...

> If you do, then you'd have to agree that we are in undefined-behaviour
> territory. I can quote the spec if you'd like.

/me shrugs

I can point you at lots of code that will break if bit operations
are allowed to randomly change other bits in the word transiently.

> Anyway, the discussion here is that this is also undefined behaviour
> in Rust. And we're trying really hard to avoid that. Of course, in
> cases like this there's not much we can do on the Rust side alone so
> the conclusion now appears to be that we'll introduce helper functions
> for this now and live with it. If one day we have a better solution,
> we'll update just one place.

All the rust code that calls iget_locked() needs to do to "be safe"
is the rust equivalent of:

	spin_lock(&inode->i_lock);
	if (!(inode->i_state & I_NEW)) {
		spin_unlock(&inode->i_lock);
		return inode;
	}
	spin_unlock(&inode->i_lock);

IOWs, we solve the "safety" concern by ensuring that Rust filesystem
implementations follow the general rule of "always hold the i_lock
when accessing inode->i_state" I originally outlined, yes?

> But we want to be very deliberate about these. We don't want to
> accidentally introduce data races (and therefore potential undefined
> behaviour).

Then stop looking at the C code and all the exceptions we make for
special case optimisations and just code to the generic rules for
safe access to given fields. Yes, rust will then have to give up the
optimisations we make in the C code, but there's always a price for
safety...

-Dave.
Wedson Almeida Filho Nov. 8, 2023, 6:15 a.m. UTC | #14
On Wed, 8 Nov 2023 at 01:54, Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Oct 31, 2023 at 05:49:19PM -0300, Wedson Almeida Filho wrote:
> > On Sun, 29 Oct 2023 at 23:29, Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > On Mon, Oct 23, 2023 at 09:55:08AM -0300, Wedson Almeida Filho wrote:
> > > > On Mon, 23 Oct 2023 at 02:29, Dave Chinner <david@fromorbit.com> wrote:
>
> > If you do, then you'd have to agree that we are in undefined-behaviour
> > territory. I can quote the spec if you'd like.
>
> /me shrugs
>
> I can point you at lots of code that will break if bit operations
> are allowed to randomly change other bits in the word transiently.

Sure, in C you have chosen to rely on behaviour that the language spec
says is undefined.

In Rust, we're trying to avoid it. When it's unavoidable, we're trying to
clearly mark it so that we can try to fix it later.

> All the rust code that calls iget_locked() needs to do to "be safe"
> is the rust equivalent of:
>
>         spin_lock(&inode->i_lock);
>         if (!(inode->i_state & I_NEW)) {
>                 spin_unlock(&inode->i_lock);
>                 return inode;
>         }
>         spin_unlock(&inode->i_lock);
>
> IOWs, we solve the "safety" concern by ensuring that Rust filesystem
> implementations follow the general rule of "always hold the i_lock
> when accessing inode->i_state" I originally outlined, yes?

Ah, the name of the functions iget_locked() and unlock_new_inode()
threw me off, I thought I wouldn't be able to lock inode->i_lock.

Ok, I will do this for now, I think it's better than relying on
undefined behaviour. Thanks!

Actually, looking at the implementation of iget_locked(), there's a
single place where it returns a new inode. Wouldn't it be better to
just return this piece of information (whether the inode is new or
not) to the caller? Then we would eliminate the data races in C and
the need to lock in Rust, and we would also eliminate a memory load
from inode->i_state in all callers.

> > But we want to be very deliberate about these. We don't want to
> > accidentally introduce data races (and therefore potential undefined
> > behaviour).
>
> Then stop looking at the C code and all the exceptions we make for
> special case optimisations and just code to the generic rules for
> safe access to given fields. Yes, rust will then have to give up the
> optimisations we make in the C code, but there's always a price for
> safety...

I'm not trying to do clever optimisations at all. I'm trying to figure
out how to do things by looking at imperfect documentation in
filesystems/porting.rst (which, BTW, checks I_NEW without a lock) and
the functions I call. So I look at what existing filesystems do to
learn the hopefully most up to date way of doing things. If you have a
recommendation on how to do this more efficiently, I'm all ears!

Thanks,
-Wedson
Andreas Hindborg Jan. 3, 2024, 1:29 p.m. UTC | #15
Wedson Almeida Filho <wedsonaf@gmail.com> writes:

[...]

>  
> +/// An inode that is locked and hasn't been initialised yet.
> +#[repr(transparent)]
> +pub struct NewINode<T: FileSystem + ?Sized>(ARef<INode<T>>);
> +
> +impl<T: FileSystem + ?Sized> NewINode<T> {
> +    /// Initialises the new inode with the given parameters.
> +    pub fn init(self, params: INodeParams) -> Result<ARef<INode<T>>> {
> +        // SAFETY: This is a new inode, so it's safe to manipulate it mutably.
> +        let inode = unsafe { &mut *self.0 .0.get() };

Perhaps it would make sense with a `UniqueARef` that guarantees
uniqueness, in line with `alloc::UniqueRc`?

[...]

>  
> +impl<T: FileSystem + ?Sized> SuperBlock<T> {
> +    /// Tries to get an existing inode or create a new one if it doesn't exist yet.
> +    pub fn get_or_create_inode(&self, ino: Ino) -> Result<Either<ARef<INode<T>>, NewINode<T>>> {
> +        // SAFETY: The only initialisation missing from the superblock is the root, and this
> +        // function is needed to create the root, so it's safe to call it.
> +        let inode =
> +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;

I can't parse this safety comment properly.

> +
> +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
> +        // an already-initialised inode), so we use `read_volatile` to read its current state.
> +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
> +        if state & u64::from(bindings::I_NEW) == 0 {
> +            // The inode is cached. Just return it.
> +            //
> +            // SAFETY: `inode` had its refcount incremented by `iget_locked`; this increment is now
> +            // owned by `ARef`.
> +            Ok(Either::Left(unsafe { ARef::from_raw(inode.cast()) }))
> +        } else {
> +            // SAFETY: The new inode is valid but not fully initialised yet, so it's ok to create a
> +            // `NewINode`.
> +            Ok(Either::Right(NewINode(unsafe {
> +                ARef::from_raw(inode.cast())

I would suggest making the destination type explicit for the cast.

> +            })))
> +        }
> +    }
> +}
> +
>  /// Required superblock parameters.
>  ///
>  /// This is returned by implementations of [`FileSystem::super_params`].
> @@ -215,41 +345,28 @@ impl<T: FileSystem + ?Sized> Tables<T> {
>              sb.0.s_blocksize = 1 << sb.0.s_blocksize_bits;
>              sb.0.s_flags |= bindings::SB_RDONLY;
>  
> -            // The following is scaffolding code that will be removed in a subsequent patch. It is
> -            // needed to build a root dentry, otherwise core code will BUG().
> -            // SAFETY: `sb` is the superblock being initialised, it is valid for read and write.
> -            let inode = unsafe { bindings::new_inode(&mut sb.0) };
> -            if inode.is_null() {
> -                return Err(ENOMEM);
> -            }
> -
> -            // SAFETY: `inode` is valid for write.
> -            unsafe { bindings::set_nlink(inode, 2) };
> -
> -            {
> -                // SAFETY: This is a newly-created inode. No other references to it exist, so it is
> -                // safe to mutably dereference it.
> -                let inode = unsafe { &mut *inode };
> -                inode.i_ino = 1;
> -                inode.i_mode = (bindings::S_IFDIR | 0o755) as _;
> -
> -                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
> -                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
> +            // SAFETY: The callback contract guarantees that `sb_ptr` is a unique pointer to a
> +            // newly-created (and initialised above) superblock.
> +            let sb = unsafe { &mut *sb_ptr.cast() };

Again, I would suggest an explicit destination type for the cast.

> +            let root = T::init_root(sb)?;
>  
> -                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
> -                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
> +            // Reject root inode if it belongs to a different superblock.

I am curious how this would happen?

BR Andreas
Wedson Almeida Filho Jan. 24, 2024, 4:07 a.m. UTC | #16
On Wed, Jan 03, 2024 at 02:29:33PM +0100, Andreas Hindborg (Samsung) wrote:
> 
> Wedson Almeida Filho <wedsonaf@gmail.com> writes:
> 
> [...]
> 
> >  
> > +/// An inode that is locked and hasn't been initialised yet.
> > +#[repr(transparent)]
> > +pub struct NewINode<T: FileSystem + ?Sized>(ARef<INode<T>>);
> > +
> > +impl<T: FileSystem + ?Sized> NewINode<T> {
> > +    /// Initialises the new inode with the given parameters.
> > +    pub fn init(self, params: INodeParams) -> Result<ARef<INode<T>>> {
> > +        // SAFETY: This is a new inode, so it's safe to manipulate it mutably.
> > +        let inode = unsafe { &mut *self.0 .0.get() };
> 
> Perhaps it would make sense with a `UniqueARef` that guarantees
> uniqueness, in line with `alloc::UniqueRc`?

We do have something like that in the kernel crate for Rust-allocated
ref-counted memory, namely, UniqueArc.

But in this case, this is slightly different: the ref-count may be >1, it's just
that the other holders of pointers will refrain from accessing the object (for
some unspecified reason). We do have another case like this for folios. Perhaps
it does make sense to generalise the concept with a type; I'll look into this.

> 
> [...]
> 
> >  
> > +impl<T: FileSystem + ?Sized> SuperBlock<T> {
> > +    /// Tries to get an existing inode or create a new one if it doesn't exist yet.
> > +    pub fn get_or_create_inode(&self, ino: Ino) -> Result<Either<ARef<INode<T>>, NewINode<T>>> {
> > +        // SAFETY: The only initialisation missing from the superblock is the root, and this
> > +        // function is needed to create the root, so it's safe to call it.
> > +        let inode =
> > +            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
> 
> I can't parse this safety comment properly.

Fixed in v2.

> > +
> > +        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
> > +        // an already-initialised inode), so we use `read_volatile` to read its current state.
> > +        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
> > +        if state & u64::from(bindings::I_NEW) == 0 {
> > +            // The inode is cached. Just return it.
> > +            //
> > +            // SAFETY: `inode` had its refcount incremented by `iget_locked`; this increment is now
> > +            // owned by `ARef`.
> > +            Ok(Either::Left(unsafe { ARef::from_raw(inode.cast()) }))
> > +        } else {
> > +            // SAFETY: The new inode is valid but not fully initialised yet, so it's ok to create a
> > +            // `NewINode`.
> > +            Ok(Either::Right(NewINode(unsafe {
> > +                ARef::from_raw(inode.cast())
> 
> I would suggest making the destination type explicit for the cast.

Done in v2.

> 
> > +            })))
> > +        }
> > +    }
> > +}
> > +
> >  /// Required superblock parameters.
> >  ///
> >  /// This is returned by implementations of [`FileSystem::super_params`].
> > @@ -215,41 +345,28 @@ impl<T: FileSystem + ?Sized> Tables<T> {
> >              sb.0.s_blocksize = 1 << sb.0.s_blocksize_bits;
> >              sb.0.s_flags |= bindings::SB_RDONLY;
> >  
> > -            // The following is scaffolding code that will be removed in a subsequent patch. It is
> > -            // needed to build a root dentry, otherwise core code will BUG().
> > -            // SAFETY: `sb` is the superblock being initialised, it is valid for read and write.
> > -            let inode = unsafe { bindings::new_inode(&mut sb.0) };
> > -            if inode.is_null() {
> > -                return Err(ENOMEM);
> > -            }
> > -
> > -            // SAFETY: `inode` is valid for write.
> > -            unsafe { bindings::set_nlink(inode, 2) };
> > -
> > -            {
> > -                // SAFETY: This is a newly-created inode. No other references to it exist, so it is
> > -                // safe to mutably dereference it.
> > -                let inode = unsafe { &mut *inode };
> > -                inode.i_ino = 1;
> > -                inode.i_mode = (bindings::S_IFDIR | 0o755) as _;
> > -
> > -                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
> > -                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
> > +            // SAFETY: The callback contract guarantees that `sb_ptr` is a unique pointer to a
> > +            // newly-created (and initialised above) superblock.
> > +            let sb = unsafe { &mut *sb_ptr.cast() };
> 
> Again, I would suggest an explicit destination type for the cast.

Done in v2.

> 
> > +            let root = T::init_root(sb)?;
> >  
> > -                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
> > -                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
> > +            // Reject root inode if it belongs to a different superblock.
> 
> I am curious how this would happen?

This can happen if a user mounts two instances of a file system and the
implementation allocates root inodes and swaps them before returning. The types
will match because they are the same file system, but they'll have the wrong
super-block.

Thanks,
-Wedson
diff mbox series

Patch

diff --git a/rust/helpers.c b/rust/helpers.c
index fe45f8ddb31f..c5a2bec6467d 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -145,6 +145,18 @@  struct kunit *rust_helper_kunit_get_current_test(void)
 }
 EXPORT_SYMBOL_GPL(rust_helper_kunit_get_current_test);
 
+void rust_helper_i_uid_write(struct inode *inode, uid_t uid)
+{
+	i_uid_write(inode, uid);
+}
+EXPORT_SYMBOL_GPL(rust_helper_i_uid_write);
+
+void rust_helper_i_gid_write(struct inode *inode, gid_t gid)
+{
+	i_gid_write(inode, gid);
+}
+EXPORT_SYMBOL_GPL(rust_helper_i_gid_write);
+
 off_t rust_helper_i_size_read(const struct inode *inode)
 {
 	return i_size_read(inode);
diff --git a/rust/kernel/fs.rs b/rust/kernel/fs.rs
index 30fa1f312f33..f3a41cf57502 100644
--- a/rust/kernel/fs.rs
+++ b/rust/kernel/fs.rs
@@ -7,9 +7,9 @@ 
 //! C headers: [`include/linux/fs.h`](../../include/linux/fs.h)
 
 use crate::error::{code::*, from_result, to_result, Error, Result};
-use crate::types::{AlwaysRefCounted, Opaque};
-use crate::{bindings, init::PinInit, str::CStr, try_pin_init, ThisModule};
-use core::{marker::PhantomData, marker::PhantomPinned, pin::Pin, ptr};
+use crate::types::{ARef, AlwaysRefCounted, Either, Opaque};
+use crate::{bindings, init::PinInit, str::CStr, time::Timespec, try_pin_init, ThisModule};
+use core::{marker::PhantomData, marker::PhantomPinned, mem::ManuallyDrop, pin::Pin, ptr};
 use macros::{pin_data, pinned_drop};
 
 /// Maximum size of an inode.
@@ -22,6 +22,12 @@  pub trait FileSystem {
 
     /// Returns the parameters to initialise a super block.
     fn super_params(sb: &NewSuperBlock<Self>) -> Result<SuperParams>;
+
+    /// Initialises and returns the root inode of the given superblock.
+    ///
+    /// This is called during initialisation of a superblock after [`FileSystem::super_params`] has
+    /// completed successfully.
+    fn init_root(sb: &SuperBlock<Self>) -> Result<ARef<INode<Self>>>;
 }
 
 /// A registration of a file system.
@@ -143,12 +149,136 @@  unsafe fn dec_ref(obj: ptr::NonNull<Self>) {
     }
 }
 
+/// An inode that is locked and hasn't been initialised yet.
+#[repr(transparent)]
+pub struct NewINode<T: FileSystem + ?Sized>(ARef<INode<T>>);
+
+impl<T: FileSystem + ?Sized> NewINode<T> {
+    /// Initialises the new inode with the given parameters.
+    pub fn init(self, params: INodeParams) -> Result<ARef<INode<T>>> {
+        // SAFETY: This is a new inode, so it's safe to manipulate it mutably.
+        let inode = unsafe { &mut *self.0 .0.get() };
+
+        let mode = match params.typ {
+            INodeType::Dir => {
+                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
+                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
+
+                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
+                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
+                bindings::S_IFDIR
+            }
+        };
+
+        inode.i_mode = (params.mode & 0o777) | u16::try_from(mode)?;
+        inode.i_size = params.size;
+        inode.i_blocks = params.blocks;
+
+        inode.__i_ctime = params.ctime.into();
+        inode.i_mtime = params.mtime.into();
+        inode.i_atime = params.atime.into();
+
+        // SAFETY: inode is a new inode, so it is valid for write.
+        unsafe {
+            bindings::set_nlink(inode, params.nlink);
+            bindings::i_uid_write(inode, params.uid);
+            bindings::i_gid_write(inode, params.gid);
+            bindings::unlock_new_inode(inode);
+        }
+
+        // SAFETY: We are manually destructuring `self` and preventing `drop` from being called.
+        Ok(unsafe { (&ManuallyDrop::new(self).0 as *const ARef<INode<T>>).read() })
+    }
+}
+
+impl<T: FileSystem + ?Sized> Drop for NewINode<T> {
+    fn drop(&mut self) {
+        // SAFETY: The new inode failed to be turned into an initialised inode, so it's safe (and
+        // in fact required) to call `iget_failed` on it.
+        unsafe { bindings::iget_failed(self.0 .0.get()) };
+    }
+}
+
+/// The type of the inode.
+#[derive(Copy, Clone)]
+pub enum INodeType {
+    /// Directory type.
+    Dir,
+}
+
+/// Required inode parameters.
+///
+/// This is used when creating new inodes.
+pub struct INodeParams {
+    /// The access mode. It's a mask that grants execute (1), write (2) and read (4) access to
+    /// everyone, the owner group, and the owner.
+    pub mode: u16,
+
+    /// Type of inode.
+    ///
+    /// Also carries additional per-type data.
+    pub typ: INodeType,
+
+    /// Size of the contents of the inode.
+    ///
+    /// Its maximum value is [`MAX_LFS_FILESIZE`].
+    pub size: i64,
+
+    /// Number of blocks.
+    pub blocks: u64,
+
+    /// Number of links to the inode.
+    pub nlink: u32,
+
+    /// User id.
+    pub uid: u32,
+
+    /// Group id.
+    pub gid: u32,
+
+    /// Creation time.
+    pub ctime: Timespec,
+
+    /// Last modification time.
+    pub mtime: Timespec,
+
+    /// Last access time.
+    pub atime: Timespec,
+}
+
 /// A file system super block.
 ///
 /// Wraps the kernel's `struct super_block`.
 #[repr(transparent)]
 pub struct SuperBlock<T: FileSystem + ?Sized>(Opaque<bindings::super_block>, PhantomData<T>);
 
+impl<T: FileSystem + ?Sized> SuperBlock<T> {
+    /// Tries to get an existing inode or create a new one if it doesn't exist yet.
+    pub fn get_or_create_inode(&self, ino: Ino) -> Result<Either<ARef<INode<T>>, NewINode<T>>> {
+        // SAFETY: The only initialisation missing from the superblock is the root, and this
+        // function is needed to create the root, so it's safe to call it.
+        let inode =
+            ptr::NonNull::new(unsafe { bindings::iget_locked(self.0.get(), ino) }).ok_or(ENOMEM)?;
+
+        // SAFETY: `inode` is valid for read, but there could be concurrent writers (e.g., if it's
+        // an already-initialised inode), so we use `read_volatile` to read its current state.
+        let state = unsafe { ptr::read_volatile(ptr::addr_of!((*inode.as_ptr()).i_state)) };
+        if state & u64::from(bindings::I_NEW) == 0 {
+            // The inode is cached. Just return it.
+            //
+            // SAFETY: `inode` had its refcount incremented by `iget_locked`; this increment is now
+            // owned by `ARef`.
+            Ok(Either::Left(unsafe { ARef::from_raw(inode.cast()) }))
+        } else {
+            // SAFETY: The new inode is valid but not fully initialised yet, so it's ok to create a
+            // `NewINode`.
+            Ok(Either::Right(NewINode(unsafe {
+                ARef::from_raw(inode.cast())
+            })))
+        }
+    }
+}
+
 /// Required superblock parameters.
 ///
 /// This is returned by implementations of [`FileSystem::super_params`].
@@ -215,41 +345,28 @@  impl<T: FileSystem + ?Sized> Tables<T> {
             sb.0.s_blocksize = 1 << sb.0.s_blocksize_bits;
             sb.0.s_flags |= bindings::SB_RDONLY;
 
-            // The following is scaffolding code that will be removed in a subsequent patch. It is
-            // needed to build a root dentry, otherwise core code will BUG().
-            // SAFETY: `sb` is the superblock being initialised, it is valid for read and write.
-            let inode = unsafe { bindings::new_inode(&mut sb.0) };
-            if inode.is_null() {
-                return Err(ENOMEM);
-            }
-
-            // SAFETY: `inode` is valid for write.
-            unsafe { bindings::set_nlink(inode, 2) };
-
-            {
-                // SAFETY: This is a newly-created inode. No other references to it exist, so it is
-                // safe to mutably dereference it.
-                let inode = unsafe { &mut *inode };
-                inode.i_ino = 1;
-                inode.i_mode = (bindings::S_IFDIR | 0o755) as _;
-
-                // SAFETY: `simple_dir_operations` never changes, it's safe to reference it.
-                inode.__bindgen_anon_3.i_fop = unsafe { &bindings::simple_dir_operations };
+            // SAFETY: The callback contract guarantees that `sb_ptr` is a unique pointer to a
+            // newly-created (and initialised above) superblock.
+            let sb = unsafe { &mut *sb_ptr.cast() };
+            let root = T::init_root(sb)?;
 
-                // SAFETY: `simple_dir_inode_operations` never changes, it's safe to reference it.
-                inode.i_op = unsafe { &bindings::simple_dir_inode_operations };
+            // Reject root inode if it belongs to a different superblock.
+            if !ptr::eq(root.super_block(), sb) {
+                return Err(EINVAL);
             }
 
             // SAFETY: `d_make_root` requires that `inode` be valid and referenced, which is the
             // case for this call.
             //
             // It takes over the inode, even on failure, so we don't need to clean it up.
-            let dentry = unsafe { bindings::d_make_root(inode) };
+            let dentry = unsafe { bindings::d_make_root(ManuallyDrop::new(root).0.get()) };
             if dentry.is_null() {
                 return Err(ENOMEM);
             }
 
-            sb.0.s_root = dentry;
+            // SAFETY: The callback contract guarantees that `sb_ptr` is a unique pointer to a
+            // newly-created (and initialised above) superblock.
+            unsafe { (*sb_ptr).s_root = dentry };
 
             Ok(0)
         })
@@ -314,9 +431,9 @@  fn init(module: &'static ThisModule) -> impl PinInit<Self, Error> {
 ///
 /// ```
 /// # mod module_fs_sample {
-/// use kernel::fs::{NewSuperBlock, SuperParams};
+/// use kernel::fs::{INode, NewSuperBlock, SuperBlock, SuperParams};
 /// use kernel::prelude::*;
-/// use kernel::{c_str, fs};
+/// use kernel::{c_str, fs, types::ARef};
 ///
 /// kernel::module_fs! {
 ///     type: MyFs,
@@ -332,6 +449,9 @@  fn init(module: &'static ThisModule) -> impl PinInit<Self, Error> {
 ///     fn super_params(_: &NewSuperBlock<Self>) -> Result<SuperParams> {
 ///         todo!()
 ///     }
+///     fn init_root(_sb: &SuperBlock<Self>) -> Result<ARef<INode<Self>>> {
+///         todo!()
+///     }
 /// }
 /// # }
 /// ```
diff --git a/samples/rust/rust_rofs.rs b/samples/rust/rust_rofs.rs
index 9878bf88b991..9e5f4c7d1c06 100644
--- a/samples/rust/rust_rofs.rs
+++ b/samples/rust/rust_rofs.rs
@@ -2,9 +2,9 @@ 
 
 //! Rust read-only file system sample.
 
-use kernel::fs::{NewSuperBlock, SuperParams};
+use kernel::fs::{INode, INodeParams, INodeType, NewSuperBlock, SuperBlock, SuperParams};
 use kernel::prelude::*;
-use kernel::{c_str, fs};
+use kernel::{c_str, fs, time::UNIX_EPOCH, types::ARef, types::Either};
 
 kernel::module_fs! {
     type: RoFs,
@@ -26,4 +26,22 @@  fn super_params(_sb: &NewSuperBlock<Self>) -> Result<SuperParams> {
             time_gran: 1,
         })
     }
+
+    fn init_root(sb: &SuperBlock<Self>) -> Result<ARef<INode<Self>>> {
+        match sb.get_or_create_inode(1)? {
+            Either::Left(existing) => Ok(existing),
+            Either::Right(new) => new.init(INodeParams {
+                typ: INodeType::Dir,
+                mode: 0o555,
+                size: 1,
+                blocks: 1,
+                nlink: 2,
+                uid: 0,
+                gid: 0,
+                atime: UNIX_EPOCH,
+                ctime: UNIX_EPOCH,
+                mtime: UNIX_EPOCH,
+            }),
+        }
+    }
 }