Message ID | a645646f071e7baa30ef37ea46ea1330ac2eb63f.1708709155.git.john@groves.net (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Series | Introduce the famfs shared-memory file system | expand |
On Fri, 23 Feb 2024 11:41:55 -0600 John Groves <John@Groves.net> wrote: > This commit introduces the famfs fs_context_operations and > famfs_get_inode() which is used by the context operations. > > Signed-off-by: John Groves <john@groves.net> Trivial comments inline. > --- > fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 178 insertions(+) > > diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c > index 82c861998093..f98f82962d7b 100644 > --- a/fs/famfs/famfs_inode.c > +++ b/fs/famfs/famfs_inode.c > @@ -41,6 +41,50 @@ static const struct super_operations famfs_ops; > static const struct inode_operations famfs_file_inode_operations; > static const struct inode_operations famfs_dir_inode_operations; > > +static struct inode *famfs_get_inode( > + struct super_block *sb, > + const struct inode *dir, > + umode_t mode, > + dev_t dev) > +{ > + struct inode *inode = new_inode(sb); > + > + if (inode) { reverse logic would be simpler and reduce indent. if (!inode) return NULL; > + struct timespec64 tv; > + > + inode->i_ino = get_next_ino(); > + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); > + inode->i_mapping->a_ops = &ram_aops; > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > + mapping_set_unevictable(inode->i_mapping); > + tv = inode_set_ctime_current(inode); > + inode_set_mtime_to_ts(inode, tv); > + inode_set_atime_to_ts(inode, tv); > + > + switch (mode & S_IFMT) { > + default: > + init_special_inode(inode, mode, dev); > + break; > + case S_IFREG: > + inode->i_op = &famfs_file_inode_operations; > + inode->i_fop = &famfs_file_operations; > + break; > + case S_IFDIR: > + inode->i_op = &famfs_dir_inode_operations; > + inode->i_fop = &simple_dir_operations; > + > + /* Directory inodes start off with i_nlink == 2 (for "." entry) */ > + inc_nlink(inode); > + break; > + case S_IFLNK: > + inode->i_op = &page_symlink_inode_operations; > + inode_nohighmem(inode); > + break; > + } > + } > + return inode; > +} > + > /********************************************************************************** > * famfs super_operations > * > @@ -150,6 +194,140 @@ famfs_open_device( > return 0; > } > > +/***************************************************************************************** > + * fs_context_operations > + */ > +static int > +famfs_fill_super( > + struct super_block *sb, > + struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi = sb->s_fs_info; > + struct inode *inode; > + int rc = 0; Always initialized so no need to do it here. > + > + sb->s_maxbytes = MAX_LFS_FILESIZE; > + sb->s_blocksize = PAGE_SIZE; > + sb->s_blocksize_bits = PAGE_SHIFT; > + sb->s_magic = FAMFS_MAGIC; > + sb->s_op = &famfs_ops; > + sb->s_time_gran = 1; > + > + rc = famfs_open_device(sb, fc); > + if (rc) > + goto out; return rc; //unless you need to do more in out in later patch.. > + > + inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); > + sb->s_root = d_make_root(inode); > + if (!sb->s_root) > + rc = -ENOMEM; return -ENOMEM; return 0; > + > +out: > + return rc; > +} > + > +enum famfs_param { > + Opt_mode, > + Opt_dax, Why capital O? > +}; > + ... > + > +static DEFINE_MUTEX(famfs_context_mutex); > +static LIST_HEAD(famfs_context_list); > + > +static int famfs_get_tree(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi_entry; > + struct famfs_fs_info *fsi = fc->s_fs_info; > + > + fsi->rootdev = kstrdup(fc->source, GFP_KERNEL); > + if (!fsi->rootdev) > + return -ENOMEM; > + > + /* Fail if famfs is already mounted from the same device */ > + mutex_lock(&famfs_context_mutex); New toys might be good to use from start to avoid need for explicit unlocks in error paths. scoped_guard(mutex, &famfs_context_mutex) { list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { if (strcmp(fsi_entry->rootdev, cs_source) == 0) { //could invert with a continue to reduce indent // or factor this out as a little helper. // famfs_check_not_mounted() pr_err(); return -EALREADY; } } list_add(&fsi->fs_list, &famfs_context_list); } return get_tree_nodev(... > + list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { > + if (strcmp(fsi_entry->rootdev, fc->source) == 0) { > + mutex_unlock(&famfs_context_mutex); > + pr_err("%s: already mounted from rootdev %s\n", __func__, fc->source); > + return -EALREADY; > + } > + } > + > + list_add(&fsi->fsi_list, &famfs_context_list); > + mutex_unlock(&famfs_context_mutex); > + > + return get_tree_nodev(fc, famfs_fill_super); > + > +} > > MODULE_LICENSE("GPL");
On 24/02/26 01:20PM, Jonathan Cameron wrote: > On Fri, 23 Feb 2024 11:41:55 -0600 > John Groves <John@Groves.net> wrote: > > > This commit introduces the famfs fs_context_operations and > > famfs_get_inode() which is used by the context operations. > > > > Signed-off-by: John Groves <john@groves.net> > Trivial comments inline. > > > --- > > fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 178 insertions(+) > > > > diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c > > index 82c861998093..f98f82962d7b 100644 > > --- a/fs/famfs/famfs_inode.c > > +++ b/fs/famfs/famfs_inode.c > > @@ -41,6 +41,50 @@ static const struct super_operations famfs_ops; > > static const struct inode_operations famfs_file_inode_operations; > > static const struct inode_operations famfs_dir_inode_operations; > > > > +static struct inode *famfs_get_inode( > > + struct super_block *sb, > > + const struct inode *dir, > > + umode_t mode, > > + dev_t dev) > > +{ > > + struct inode *inode = new_inode(sb); > > + > > + if (inode) { > reverse logic would be simpler and reduce indent. > > if (!inode) > return NULL; > Good one - I can be derpy this way. Although I'd bet I just copied that from ramfs... > > > + struct timespec64 tv; > > + > > + inode->i_ino = get_next_ino(); > > + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); > > + inode->i_mapping->a_ops = &ram_aops; > > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > > + mapping_set_unevictable(inode->i_mapping); > > + tv = inode_set_ctime_current(inode); > > + inode_set_mtime_to_ts(inode, tv); > > + inode_set_atime_to_ts(inode, tv); > > + > > + switch (mode & S_IFMT) { > > + default: > > + init_special_inode(inode, mode, dev); > > + break; > > + case S_IFREG: > > + inode->i_op = &famfs_file_inode_operations; > > + inode->i_fop = &famfs_file_operations; > > + break; > > + case S_IFDIR: > > + inode->i_op = &famfs_dir_inode_operations; > > + inode->i_fop = &simple_dir_operations; > > + > > + /* Directory inodes start off with i_nlink == 2 (for "." entry) */ > > + inc_nlink(inode); > > + break; > > + case S_IFLNK: > > + inode->i_op = &page_symlink_inode_operations; > > + inode_nohighmem(inode); > > + break; > > + } > > + } > > + return inode; > > +} > > + > > /********************************************************************************** > > * famfs super_operations > > * > > @@ -150,6 +194,140 @@ famfs_open_device( > > return 0; > > } > > > > +/***************************************************************************************** > > + * fs_context_operations > > + */ > > +static int > > +famfs_fill_super( > > + struct super_block *sb, > > + struct fs_context *fc) > > +{ > > + struct famfs_fs_info *fsi = sb->s_fs_info; > > + struct inode *inode; > > + int rc = 0; > Always initialized so no need to do it here. Fixed in more than one place. > > > + > > + sb->s_maxbytes = MAX_LFS_FILESIZE; > > + sb->s_blocksize = PAGE_SIZE; > > + sb->s_blocksize_bits = PAGE_SHIFT; > > + sb->s_magic = FAMFS_MAGIC; > > + sb->s_op = &famfs_ops; > > + sb->s_time_gran = 1; > > + > > + rc = famfs_open_device(sb, fc); > > + if (rc) > > + goto out; > return rc; //unless you need to do more in out in later patch.. Done > > > + > > + inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); > > + sb->s_root = d_make_root(inode); > > + if (!sb->s_root) > > + rc = -ENOMEM; > return -ENOMEM; Done > > return 0; Done > > > + > > +out: > > + return rc; > > +} > > + > > +enum famfs_param { > > + Opt_mode, > > + Opt_dax, > Why capital O? Direct copy from ramfs > > > +}; > > + > > ... > > > + > > +static DEFINE_MUTEX(famfs_context_mutex); > > +static LIST_HEAD(famfs_context_list); > > + > > +static int famfs_get_tree(struct fs_context *fc) > > +{ > > + struct famfs_fs_info *fsi_entry; > > + struct famfs_fs_info *fsi = fc->s_fs_info; > > + > > + fsi->rootdev = kstrdup(fc->source, GFP_KERNEL); > > + if (!fsi->rootdev) > > + return -ENOMEM; > > + > > + /* Fail if famfs is already mounted from the same device */ > > + mutex_lock(&famfs_context_mutex); > > New toys might be good to use from start to avoid need for explicit > unlocks in error paths. > > scoped_guard(mutex, &famfs_context_mutex) { > list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { > if (strcmp(fsi_entry->rootdev, cs_source) == 0) { > //could invert with a continue to reduce indent > // or factor this out as a little helper. > // famfs_check_not_mounted() > pr_err(); > return -EALREADY; > } > } > list_add(&fsi->fs_list, &famfs_context_list); > } > > return get_tree_nodev(... Hey, I like this one. Thanks! John
On Fri, Feb 23, 2024 at 11:41:55AM -0600, John Groves wrote: > This commit introduces the famfs fs_context_operations and > famfs_get_inode() which is used by the context operations. > > Signed-off-by: John Groves <john@groves.net> > --- > fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 178 insertions(+) > > diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c > index 82c861998093..f98f82962d7b 100644 > --- a/fs/famfs/famfs_inode.c > +++ b/fs/famfs/famfs_inode.c > @@ -41,6 +41,50 @@ static const struct super_operations famfs_ops; > static const struct inode_operations famfs_file_inode_operations; > static const struct inode_operations famfs_dir_inode_operations; > > +static struct inode *famfs_get_inode( > + struct super_block *sb, > + const struct inode *dir, > + umode_t mode, > + dev_t dev) > +{ > + struct inode *inode = new_inode(sb); > + > + if (inode) { > + struct timespec64 tv; > + > + inode->i_ino = get_next_ino(); > + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); > + inode->i_mapping->a_ops = &ram_aops; > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > + mapping_set_unevictable(inode->i_mapping); > + tv = inode_set_ctime_current(inode); > + inode_set_mtime_to_ts(inode, tv); > + inode_set_atime_to_ts(inode, tv); > + > + switch (mode & S_IFMT) { > + default: > + init_special_inode(inode, mode, dev); > + break; > + case S_IFREG: > + inode->i_op = &famfs_file_inode_operations; > + inode->i_fop = &famfs_file_operations; > + break; > + case S_IFDIR: > + inode->i_op = &famfs_dir_inode_operations; > + inode->i_fop = &simple_dir_operations; > + > + /* Directory inodes start off with i_nlink == 2 (for "." entry) */ > + inc_nlink(inode); > + break; > + case S_IFLNK: > + inode->i_op = &page_symlink_inode_operations; > + inode_nohighmem(inode); > + break; > + } > + } > + return inode; > +} > + > /********************************************************************************** > * famfs super_operations > * > @@ -150,6 +194,140 @@ famfs_open_device( > return 0; > } > > +/***************************************************************************************** > + * fs_context_operations > + */ > +static int > +famfs_fill_super( > + struct super_block *sb, > + struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi = sb->s_fs_info; > + struct inode *inode; > + int rc = 0; > + > + sb->s_maxbytes = MAX_LFS_FILESIZE; > + sb->s_blocksize = PAGE_SIZE; > + sb->s_blocksize_bits = PAGE_SHIFT; > + sb->s_magic = FAMFS_MAGIC; > + sb->s_op = &famfs_ops; > + sb->s_time_gran = 1; > + > + rc = famfs_open_device(sb, fc); > + if (rc) > + goto out; > + > + inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); > + sb->s_root = d_make_root(inode); > + if (!sb->s_root) > + rc = -ENOMEM; > + > +out: > + return rc; > +} > + > +enum famfs_param { > + Opt_mode, > + Opt_dax, > +}; > + > +const struct fs_parameter_spec famfs_fs_parameters[] = { > + fsparam_u32oct("mode", Opt_mode), > + fsparam_string("dax", Opt_dax), > + {} > +}; > + > +static int famfs_parse_param( > + struct fs_context *fc, > + struct fs_parameter *param) > +{ > + struct famfs_fs_info *fsi = fc->s_fs_info; > + struct fs_parse_result result; > + int opt; > + > + opt = fs_parse(fc, famfs_fs_parameters, param, &result); > + if (opt == -ENOPARAM) { > + opt = vfs_parse_fs_param_source(fc, param); > + if (opt != -ENOPARAM) > + return opt; I'm not sure I understand this. But in any case add, you should add Opt_source to enum famfs_param and then add fsparam_string("source", Opt_source), to famfs_fs_parameters. Then you can add: famfs_parse_source(fc, param); You might want to consider validating your devices right away. So think about: fd_fs = fsopen("famfs", ...); ret = fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/definitely/not/valid/device", ...) // succeeds ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_1", ...) // succeeds ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_2", ...) // succeeds ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_3", ...) // succeeds ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_N", ...) // succeeds ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...) // superblock creation failed So what failed exactly? Yes, you can log into the fscontext and dmesg that it's @source that's the issue but it's annoying for userspace to setup a whole mount context only to figure out that some option was wrong at the end of it. So validating famfs_parse_source(...) { if (fc->source) return invalfc(fc, "Uhm, we already have a source.... lookup_bdev(fc->source, &dev) // validate it's a device you're actually happy to use fc->source = param->string; param->string = NULL; } Your ->get_tree implementation that actually creates/finds the superblock will validate fc->source again and yes, there's a race here in so far as the path that fc->source points to could change in between validating this in famfs_parse_source() and ->get_tree() superblock creation. This is fixable even right now but then you couldn't reuse common infrastrucute so I would just accept that race for now and we should provide a nicer mechanism on the vfs layer. > + > + return 0; > + } > + if (opt < 0) > + return opt; > + > + switch (opt) { > + case Opt_mode: > + fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; > + break; > + case Opt_dax: > + if (strcmp(param->string, "always")) > + pr_notice("%s: invalid dax mode %s\n", > + __func__, param->string); > + break; > + } > + > + return 0; > +} > + > +static DEFINE_MUTEX(famfs_context_mutex); > +static LIST_HEAD(famfs_context_list); > + > +static int famfs_get_tree(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi_entry; > + struct famfs_fs_info *fsi = fc->s_fs_info; > + > + fsi->rootdev = kstrdup(fc->source, GFP_KERNEL); > + if (!fsi->rootdev) > + return -ENOMEM; > + > + /* Fail if famfs is already mounted from the same device */ > + mutex_lock(&famfs_context_mutex); > + list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { > + if (strcmp(fsi_entry->rootdev, fc->source) == 0) { > + mutex_unlock(&famfs_context_mutex); > + pr_err("%s: already mounted from rootdev %s\n", __func__, fc->source); > + return -EALREADY; What errno is EALREADY? Isn't that socket stuff. In any case, it seems you want EBUSY? But bigger picture I'm lost. And why do you keep that list based on strings? What if I do: mount -t famfs /dev/pmem1234 /mnt # succeeds mount -t famfs /dev/pmem1234 /opt # ah, fsck me, this fails.. But wait a minute.... mount --bind /dev/pmem1234 /evil-masterplan mount -t famfs /evil-masterplan /opt # succeeds. YAY I believe that would trivially defeat your check. > + } > + } > + > + list_add(&fsi->fsi_list, &famfs_context_list); > + mutex_unlock(&famfs_context_mutex); > + > + return get_tree_nodev(fc, famfs_fill_super); So why isn't this using get_tree_bdev()? Note that a while ago I added FSCONFIG_CMD_CREAT_EXCL which prevents silent superblock reuse. To implement that I added fs_context->exclusive. If you unconditionally set fc->exclusive = 1 in your famfs_init_fs_context() and use get_tree_bdev() it will give you EBUSY if fc->source is already in use - including other famfs instances. I also fail to yet understand how that function which actually opens the block device and gets the dax device figures into this. It's a bit hard to follow what's going on since you add all those unused functions and types so there's never a wider context to see that stuff in. > + > +} > + > +static void famfs_free_fc(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi = fc->s_fs_info; > + > + if (fsi && fsi->rootdev) > + kfree(fsi->rootdev); > + > + kfree(fsi); > +} > + > +static const struct fs_context_operations famfs_context_ops = { > + .free = famfs_free_fc, > + .parse_param = famfs_parse_param, > + .get_tree = famfs_get_tree, > +}; > + > +static int famfs_init_fs_context(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi; > + > + fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); > + if (!fsi) > + return -ENOMEM; > + > + fsi->mount_opts.mode = FAMFS_DEFAULT_MODE; > + fc->s_fs_info = fsi; > + fc->ops = &famfs_context_ops; > + return 0; > +} > > > MODULE_LICENSE("GPL"); > -- > 2.43.0 >
On 24/02/27 02:41PM, Christian Brauner wrote: > On Fri, Feb 23, 2024 at 11:41:55AM -0600, John Groves wrote: > > This commit introduces the famfs fs_context_operations and > > famfs_get_inode() which is used by the context operations. > > > > Signed-off-by: John Groves <john@groves.net> > > --- > > fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 178 insertions(+) > > > > diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c > > index 82c861998093..f98f82962d7b 100644 > > --- a/fs/famfs/famfs_inode.c > > +++ b/fs/famfs/famfs_inode.c <snip> > > +enum famfs_param { > > + Opt_mode, > > + Opt_dax, > > +}; > > + > > +const struct fs_parameter_spec famfs_fs_parameters[] = { > > + fsparam_u32oct("mode", Opt_mode), > > + fsparam_string("dax", Opt_dax), > > + {} > > +}; > > + > > +static int famfs_parse_param( > > + struct fs_context *fc, > > + struct fs_parameter *param) > > +{ > > + struct famfs_fs_info *fsi = fc->s_fs_info; > > + struct fs_parse_result result; > > + int opt; > > + > > + opt = fs_parse(fc, famfs_fs_parameters, param, &result); > > + if (opt == -ENOPARAM) { > > + opt = vfs_parse_fs_param_source(fc, param); > > + if (opt != -ENOPARAM) > > + return opt; > > I'm not sure I understand this. But in any case add, you should add > Opt_source to enum famfs_param and then add > > fsparam_string("source", Opt_source), > > to famfs_fs_parameters. Then you can add: > > famfs_parse_source(fc, param); > > You might want to consider validating your devices right away. So think > about: > > fd_fs = fsopen("famfs", ...); > ret = fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/definitely/not/valid/device", ...) // succeeds > ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_1", ...) // succeeds > ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_2", ...) // succeeds > ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_3", ...) // succeeds > ret = fsconfig(fd_fs, FSCONFIG_SET_FLAG, "OPTION_N", ...) // succeeds > ret = fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...) // superblock creation failed > > So what failed exactly? Yes, you can log into the fscontext and dmesg > that it's @source that's the issue but it's annoying for userspace to > setup a whole mount context only to figure out that some option was > wrong at the end of it. > > So validating > > famfs_parse_source(...) > { > if (fc->source) > return invalfc(fc, "Uhm, we already have a source.... > > lookup_bdev(fc->source, &dev) > // validate it's a device you're actually happy to use > > fc->source = param->string; > param->string = NULL; > } > > Your ->get_tree implementation that actually creates/finds the > superblock will validate fc->source again and yes, there's a race here > in so far as the path that fc->source points to could change in between > validating this in famfs_parse_source() and ->get_tree() superblock > creation. This is fixable even right now but then you couldn't reuse > common infrastrucute so I would just accept that race for now and we > should provide a nicer mechanism on the vfs layer. I wasn't aware of the new fsconfig interface. Is there documentation or a file sytsem that already uses it that I should refer to? I didn't find an obvious candidate, but it might be me. If it should be obvious from the example above, tell me and I'll try harder. My famfs code above was copied from ramfs. If you point me to documentation I might send you a ramfs fsconfig patch too :D. > > > + > > + return 0; > > + } > > + if (opt < 0) > > + return opt; > > + > > + switch (opt) { > > + case Opt_mode: > > + fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; > > + break; > > + case Opt_dax: > > + if (strcmp(param->string, "always")) > > + pr_notice("%s: invalid dax mode %s\n", > > + __func__, param->string); > > + break; > > + } > > + > > + return 0; > > +} > > + > > +static DEFINE_MUTEX(famfs_context_mutex); > > +static LIST_HEAD(famfs_context_list); > > + > > +static int famfs_get_tree(struct fs_context *fc) > > +{ > > + struct famfs_fs_info *fsi_entry; > > + struct famfs_fs_info *fsi = fc->s_fs_info; > > + > > + fsi->rootdev = kstrdup(fc->source, GFP_KERNEL); > > + if (!fsi->rootdev) > > + return -ENOMEM; > > + > > + /* Fail if famfs is already mounted from the same device */ > > + mutex_lock(&famfs_context_mutex); > > + list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { > > + if (strcmp(fsi_entry->rootdev, fc->source) == 0) { > > + mutex_unlock(&famfs_context_mutex); > > + pr_err("%s: already mounted from rootdev %s\n", __func__, fc->source); > > + return -EALREADY; > > What errno is EALREADY? Isn't that socket stuff. In any case, it seems > you want EBUSY? Thanks... That should probaby be EBUSY. But the whole famfs_context_list should probably also be removed. More below... > > But bigger picture I'm lost. And why do you keep that list based on > strings? What if I do: > > mount -t famfs /dev/pmem1234 /mnt # succeeds > > mount -t famfs /dev/pmem1234 /opt # ah, fsck me, this fails.. But wait a minute.... > > mount --bind /dev/pmem1234 /evil-masterplan > > mount -t famfs /evil-masterplan /opt # succeeds. YAY > > I believe that would trivially defeat your check. > And I suspect this is related to the get_tree issue you noticed below. This famfs code was working in 6.5 without keeping the linked list of devices, but in 6.6/6.7/6.8 it works provided you don't try to repeat a mount command that has already succeeded. I'm not sure why 6.5 protected me from that, but the later versions don't. In 6.6+ That hits a BUG_ON (have specifics on that but not handy right now). So for a while we just removed repeated mount requests from the famfs smoke tests, but eventually I implemented the list above, which - though you're right it would be easy to circumvent and therefore is not right - it did solve the problem that we were testing for. I suspect that correctly handling get_tree might solve this problem. Please assume that linked list will be removed - it was not the right solution. More below... > > + } > > + } > > + > > + list_add(&fsi->fsi_list, &famfs_context_list); > > + mutex_unlock(&famfs_context_mutex); > > + > > + return get_tree_nodev(fc, famfs_fill_super); > > So why isn't this using get_tree_bdev()? Note that a while ago I > added FSCONFIG_CMD_CREAT_EXCL which prevents silent superblock reuse. To > implement that I added fs_context->exclusive. If you unconditionally set > fc->exclusive = 1 in your famfs_init_fs_context() and use > get_tree_bdev() it will give you EBUSY if fc->source is already in use - > including other famfs instances. > > I also fail to yet understand how that function which actually opens the block > device and gets the dax device figures into this. It's a bit hard to follow > what's going on since you add all those unused functions and types so there's > never a wider context to see that stuff in. Clearly that's a bug in my code. That get_tree_nodev() is from ramfs, which was the starting point for famfs. I'm wondering if doing this correctly (get_tree_bdev() when it's pmem) would have solved my double mount problem on 6.6 onward. However, there's another wrinkle: I'm concluding (see https://lore.kernel.org/linux-fsdevel/ups6cvjw6bx5m3hotn452brbbcgemnarsasre6ep2lbe4tpjsy@ezp6oh5c72ur/) that famfs should drop block support and just work with /dev/dax. So famfs may be the first file system to be hosted on a character device? Certainly first on character dax. Given that, what variant of get_tree() should it call? Should it add get_tree_dax()? I'm not yet familiar enough with that code to have a worthy opinion on this. Please let me know what you think. Thank you for the serious review! John
On 2/27/24 16:59, John Groves wrote: > On 24/02/27 02:41PM, Christian Brauner wrote: >> On Fri, Feb 23, 2024 at 11:41:55AM -0600, John Groves wrote: >>> This commit introduces the famfs fs_context_operations and >>> famfs_get_inode() which is used by the context operations. >>> >>> Signed-off-by: John Groves <john@groves.net> >>> --- >>> fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 178 insertions(+) >>> >>> diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c >>> index 82c861998093..f98f82962d7b 100644 >>> --- a/fs/famfs/famfs_inode.c >>> +++ b/fs/famfs/famfs_inode.c > > <snip> > > > I wasn't aware of the new fsconfig interface. Is there documentation or a > file sytsem that already uses it that I should refer to? I didn't find an > obvious candidate, but it might be me. If it should be obvious from the > example above, tell me and I'll try harder. > My famfs code above was copied from ramfs. If you point me to > documentation I might send you a ramfs fsconfig patch too :D. All that I found was the commit to add fsconfig to the kernel tree: commit ecdab150fddb Author: David Howells <dhowells@redhat.com> Date: Thu Nov 1 23:36:09 2018 +0000 vfs: syscall: Add fsconfig() for configuring and managing a context and the lore archive for its discussion: https://lore.kernel.org/all/153313723557.13253.9055982745313603422.stgit@warthog.procyon.org.uk/ plus David's userspace man page addition for it: https://lore.kernel.org/linux-fsdevel/159680897140.29015.15318866561972877762.stgit@warthog.procyon.org.uk/
> plus David's userspace man page addition for it: > https://lore.kernel.org/linux-fsdevel/159680897140.29015.15318866561972877762.stgit@warthog.procyon.org.uk/ Up to date manpages are https://github.com/brauner/man-pages-md
> I wasn't aware of the new fsconfig interface. Is there documentation or a > file sytsem that already uses it that I should refer to? I didn't find an > obvious candidate, but it might be me. If it should be obvious from the > example above, tell me and I'll try harder. > > My famfs code above was copied from ramfs. If you point me to Ok, but that's the wrong filesystem to use as a model imho. Because it really doesn't deal with devices at all. That's why it uses get_tree_nodev() with "nodev" as in "no device" kinda. So ramfs doesn't have any of these issues. Whereas your filesystems is dealing with devices dax (or pmem). > documentation I might send you a ramfs fsconfig patch too :D. So the manpages are at: https://github.com/brauner/man-pages-md But really, there shouldn't be anything that needs to change for ramfs. > > What errno is EALREADY? Isn't that socket stuff. In any case, it seems > > you want EBUSY? > > Thanks... That should probaby be EBUSY. But the whole famfs_context_list > should probably also be removed. More below... > > > > > But bigger picture I'm lost. And why do you keep that list based on > > strings? What if I do: > > > > mount -t famfs /dev/pmem1234 /mnt # succeeds > > > > mount -t famfs /dev/pmem1234 /opt # ah, fsck me, this fails.. But wait a minute.... > > > > mount --bind /dev/pmem1234 /evil-masterplan > > > > mount -t famfs /evil-masterplan /opt # succeeds. YAY > > > > I believe that would trivially defeat your check. > > > > And I suspect this is related to the get_tree issue you noticed below. > > This famfs code was working in 6.5 without keeping the linked list of devices, > but in 6.6/6.7/6.8 it works provided you don't try to repeat a mount command > that has already succeeded. I'm not sure why 6.5 protected me from that, > but the later versions don't. In 6.6+ That hits a BUG_ON (have specifics on > that but not handy right now). get_tree_nodev() by default will always allocate a new superblock. This is how tmpfs and ramfs work. If you do: mount -t tmpfs tmpfs /mnt mount -t tmpfs tmpfs /opt You get two new, independent superblocks. This is what you want for these multi-instance filesystems: each new mount creates a new instance. If famfs doesn't want to allow reusing devices - which I very much think it wants to prevent - then it cannot use get_tree_nodev() directly without having a hack like you did. Because you'll get a new superblock no problem. So the fact that it did work somehow likely was a bug in your code. The reason your code causes crashes is very likely this: struct famfs_fs_info *fsi = sb->s_fs_info; handlep = bdev_open_by_path(fc->source, FAMFS_BLKDEV_MODE, fsi, &fs_holder_ops); If you look at Documentation/filesystems/porting.rst you should see that if you use @fs_holder_ops then your holder should be the struct super_block, not your personal fsinfo. > So for a while we just removed repeated mount requests from the famfs smoke > tests, but eventually I implemented the list above, which - though you're right > it would be easy to circumvent and therefore is not right - it did solve the > problem that we were testing for. > > I suspect that correctly handling get_tree might solve this problem. > > Please assume that linked list will be removed - it was not the right solution. > > More below... > > > > + } > > > + } > > > + > > > + list_add(&fsi->fsi_list, &famfs_context_list); > > > + mutex_unlock(&famfs_context_mutex); > > > + > > > + return get_tree_nodev(fc, famfs_fill_super); > > > > So why isn't this using get_tree_bdev()? Note that a while ago I > > added FSCONFIG_CMD_CREAT_EXCL which prevents silent superblock reuse. To > > implement that I added fs_context->exclusive. If you unconditionally set > > fc->exclusive = 1 in your famfs_init_fs_context() and use > > get_tree_bdev() it will give you EBUSY if fc->source is already in use - > > including other famfs instances. > > > > I also fail to yet understand how that function which actually opens the block > > device and gets the dax device figures into this. It's a bit hard to follow > > what's going on since you add all those unused functions and types so there's > > never a wider context to see that stuff in. > > Clearly that's a bug in my code. That get_tree_nodev() is from ramfs, which > was the starting point for famfs. > > I'm wondering if doing this correctly (get_tree_bdev() when it's pmem) would > have solved my double mount problem on 6.6 onward. > > However, there's another wrinkle: I'm concluding > (see https://lore.kernel.org/linux-fsdevel/ups6cvjw6bx5m3hotn452brbbcgemnarsasre6ep2lbe4tpjsy@ezp6oh5c72ur/) > that famfs should drop block support and just work with /dev/dax. So famfs > may be the first file system to be hosted on a character device? Certainly > first on character dax. Ugh, ok. I defer to others whether that makes sense or not. It would be a lot easier for you if you used pmem block devices, I guess because it would be easy to detect reuse in common infrastructure. But also, I'm looking at your code a bit closer. There's a bit of a wrinkle the way it's currently written... Say someone went a bit weird and did: mount -t xfs xfs /dev/sda /my/xfs-filesystem mknod DAX_DEVICE /my/xfs-filesystem/dax1234 and then did: mount -t famfs famfs /my/xfs-filesystem/dax1234 /mnt Internally in famfs you do: fsi->dax_filp = filp_open(fc->source, O_RDWR, 0); and you stash that file... Which means that you are pinning that xfs filesystems implicitly. IOW, if someone does: umount /my/xfs-filesystem they get EBUSY for completely opaque reasons. And if they did: umount -l /my/xfs-filesystem followed by mounting that xfs filesystem again they'd get the same superblock for that xfs filesystem. What I'm trying to say is that I think you cannot pin another filesystem like this when you open that device. IOW, you either need to stash the plain dax device or dax needs to become it's own tiny internal pseudo fs such that we can open dax devices internally just like files. Which might actually also be worth doing. But I'm not the maintainer of that. > > Given that, what variant of get_tree() should it call? Should it add > get_tree_dax()? I'm not yet familiar enough with that code to have a worthy > opinion on this. I don't think we need a common helper if famfs would be the only user of this. But maybe I'm wrong. But roughly you'd need something similar to what we do for block devices, I'd reckon. So lookup_daxdev() which is similar to lookup_bdev() and allows you to translate from path to dax device number maybe. lookup_daxdev(const char *name, struct dax_dev? *daxdev) { /* Don't actually open the dax device pointlessly */ kern_path(fc->source, LOOKUP_FOLLOW, path); if (!S_ISCHR(inode->i_mode)) // fail if (!may_open_dev(&path)) // fail // check dax device and pin // get rid of path references path_put(&path); } famfs_get_tree(/* broken broken broken */) { lookup_daxdev(fc->source, &ddev); sb = sget_fc(fc, famfs_test_super, set_anon_super_fc) if (IS_ERR(sb)) // Error here may mean (aside from memory): // * superblock incompatible bc of read-write vs read-only // * non-matching user namespace // * FSCONFIG_CMD_CREATE_EXCL requested by mounter if (!sb->s_root) { // fill_super; new sb; dax device currently not used. } else { // A superblock for that dax device already exists and // may be reused. Any additional rejection reasons for // such an sb are up to the filesystem. } // Now really open or claim the dax device. // If you fail get rid of the superblock // (deactivate_locked_super()). All handwavy, I know and probably I forgot details. But for you to fill that in. ;)
On Wed, Feb 28, 2024 at 11:07:20AM +0100, Christian Brauner wrote: > > I wasn't aware of the new fsconfig interface. Is there documentation or a > > file sytsem that already uses it that I should refer to? I didn't find an > > obvious candidate, but it might be me. If it should be obvious from the > > example above, tell me and I'll try harder. > > > > My famfs code above was copied from ramfs. If you point me to > > Ok, but that's the wrong filesystem to use as a model imho. Because it > really doesn't deal with devices at all. That's why it uses > get_tree_nodev() with "nodev" as in "no device" kinda. So ramfs doesn't > have any of these issues. Whereas your filesystems is dealing with > devices dax (or pmem). > > > documentation I might send you a ramfs fsconfig patch too :D. > > So the manpages are at: > > https://github.com/brauner/man-pages-md > > But really, there shouldn't be anything that needs to change for ramfs. > > > > What errno is EALREADY? Isn't that socket stuff. In any case, it seems > > > you want EBUSY? > > > > Thanks... That should probaby be EBUSY. But the whole famfs_context_list > > should probably also be removed. More below... > > > > > > > > But bigger picture I'm lost. And why do you keep that list based on > > > strings? What if I do: > > > > > > mount -t famfs /dev/pmem1234 /mnt # succeeds > > > > > > mount -t famfs /dev/pmem1234 /opt # ah, fsck me, this fails.. But wait a minute.... > > > > > > mount --bind /dev/pmem1234 /evil-masterplan > > > > > > mount -t famfs /evil-masterplan /opt # succeeds. YAY > > > > > > I believe that would trivially defeat your check. > > > > > > > And I suspect this is related to the get_tree issue you noticed below. > > > > This famfs code was working in 6.5 without keeping the linked list of devices, > > but in 6.6/6.7/6.8 it works provided you don't try to repeat a mount command > > that has already succeeded. I'm not sure why 6.5 protected me from that, > > but the later versions don't. In 6.6+ That hits a BUG_ON (have specifics on > > that but not handy right now). > > get_tree_nodev() by default will always allocate a new superblock. This > is how tmpfs and ramfs work. If you do: > > mount -t tmpfs tmpfs /mnt > mount -t tmpfs tmpfs /opt > > You get two new, independent superblocks. This is what you want for > these multi-instance filesystems: each new mount creates a new instance. > > If famfs doesn't want to allow reusing devices - which I very much think > it wants to prevent - then it cannot use get_tree_nodev() directly > without having a hack like you did. Because you'll get a new superblock > no problem. So the fact that it did work somehow likely was a bug in > your code. > > The reason your code causes crashes is very likely this: > > struct famfs_fs_info *fsi = sb->s_fs_info; > handlep = bdev_open_by_path(fc->source, FAMFS_BLKDEV_MODE, fsi, &fs_holder_ops); > > If you look at Documentation/filesystems/porting.rst you should see that > if you use @fs_holder_ops then your holder should be the struct > super_block, not your personal fsinfo. > > > So for a while we just removed repeated mount requests from the famfs smoke > > tests, but eventually I implemented the list above, which - though you're right > > it would be easy to circumvent and therefore is not right - it did solve the > > problem that we were testing for. > > > > I suspect that correctly handling get_tree might solve this problem. > > > > Please assume that linked list will be removed - it was not the right solution. > > > > More below... > > > > > > + } > > > > + } > > > > + > > > > + list_add(&fsi->fsi_list, &famfs_context_list); > > > > + mutex_unlock(&famfs_context_mutex); > > > > + > > > > + return get_tree_nodev(fc, famfs_fill_super); > > > > > > So why isn't this using get_tree_bdev()? Note that a while ago I > > > added FSCONFIG_CMD_CREAT_EXCL which prevents silent superblock reuse. To > > > implement that I added fs_context->exclusive. If you unconditionally set > > > fc->exclusive = 1 in your famfs_init_fs_context() and use > > > get_tree_bdev() it will give you EBUSY if fc->source is already in use - > > > including other famfs instances. > > > > > > I also fail to yet understand how that function which actually opens the block > > > device and gets the dax device figures into this. It's a bit hard to follow > > > what's going on since you add all those unused functions and types so there's > > > never a wider context to see that stuff in. > > > > Clearly that's a bug in my code. That get_tree_nodev() is from ramfs, which > > was the starting point for famfs. > > > > I'm wondering if doing this correctly (get_tree_bdev() when it's pmem) would > > have solved my double mount problem on 6.6 onward. > > > > However, there's another wrinkle: I'm concluding > > (see https://lore.kernel.org/linux-fsdevel/ups6cvjw6bx5m3hotn452brbbcgemnarsasre6ep2lbe4tpjsy@ezp6oh5c72ur/) > > that famfs should drop block support and just work with /dev/dax. So famfs > > may be the first file system to be hosted on a character device? Certainly > > first on character dax. > > Ugh, ok. I defer to others whether that makes sense or not. It would be > a lot easier for you if you used pmem block devices, I guess because it > would be easy to detect reuse in common infrastructure. > > But also, I'm looking at your code a bit closer. There's a bit of a > wrinkle the way it's currently written... > > Say someone went a bit weird and did: > > mount -t xfs xfs /dev/sda /my/xfs-filesystem > mknod DAX_DEVICE /my/xfs-filesystem/dax1234 > > and then did: > > mount -t famfs famfs /my/xfs-filesystem/dax1234 /mnt > > Internally in famfs you do: > > fsi->dax_filp = filp_open(fc->source, O_RDWR, 0); > > and you stash that file... Which means that you are pinning that xfs > filesystems implicitly. IOW, if someone does: > > umount /my/xfs-filesystem > > they get EBUSY for completely opaque reasons. And if they did: > > umount -l /my/xfs-filesystem > > followed by mounting that xfs filesystem again they'd get the same > superblock for that xfs filesystem. > > What I'm trying to say is that I think you cannot pin another filesystem > like this when you open that device. > > IOW, you either need to stash the plain dax device or dax needs to > become it's own tiny internal pseudo fs such that we can open dax > devices internally just like files. Which might actually also be worth > doing. But I'm not the maintainer of that. Ah, I see it's already like that and I was looking at the wrong file. Great! So in that case you could add helper to open dax devices as files: struct file *dax_file_open(struct dax_device *dev, int flags, /* other stuff */) { /* open that thing */ dax_file = alloc_file_pseudo(dax_inode, dax_vfsmnt, "", flags | O_LARGEFILE, &something_fops); } and then you can treat them as regular files without running into the issues I pointed out.
diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c index 82c861998093..f98f82962d7b 100644 --- a/fs/famfs/famfs_inode.c +++ b/fs/famfs/famfs_inode.c @@ -41,6 +41,50 @@ static const struct super_operations famfs_ops; static const struct inode_operations famfs_file_inode_operations; static const struct inode_operations famfs_dir_inode_operations; +static struct inode *famfs_get_inode( + struct super_block *sb, + const struct inode *dir, + umode_t mode, + dev_t dev) +{ + struct inode *inode = new_inode(sb); + + if (inode) { + struct timespec64 tv; + + inode->i_ino = get_next_ino(); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + inode->i_mapping->a_ops = &ram_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + tv = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, tv); + inode_set_atime_to_ts(inode, tv); + + switch (mode & S_IFMT) { + default: + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_op = &famfs_file_inode_operations; + inode->i_fop = &famfs_file_operations; + break; + case S_IFDIR: + inode->i_op = &famfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* Directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + break; + case S_IFLNK: + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); + break; + } + } + return inode; +} + /********************************************************************************** * famfs super_operations * @@ -150,6 +194,140 @@ famfs_open_device( return 0; } +/***************************************************************************************** + * fs_context_operations + */ +static int +famfs_fill_super( + struct super_block *sb, + struct fs_context *fc) +{ + struct famfs_fs_info *fsi = sb->s_fs_info; + struct inode *inode; + int rc = 0; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = FAMFS_MAGIC; + sb->s_op = &famfs_ops; + sb->s_time_gran = 1; + + rc = famfs_open_device(sb, fc); + if (rc) + goto out; + + inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); + sb->s_root = d_make_root(inode); + if (!sb->s_root) + rc = -ENOMEM; + +out: + return rc; +} + +enum famfs_param { + Opt_mode, + Opt_dax, +}; + +const struct fs_parameter_spec famfs_fs_parameters[] = { + fsparam_u32oct("mode", Opt_mode), + fsparam_string("dax", Opt_dax), + {} +}; + +static int famfs_parse_param( + struct fs_context *fc, + struct fs_parameter *param) +{ + struct famfs_fs_info *fsi = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, famfs_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_mode: + fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; + break; + case Opt_dax: + if (strcmp(param->string, "always")) + pr_notice("%s: invalid dax mode %s\n", + __func__, param->string); + break; + } + + return 0; +} + +static DEFINE_MUTEX(famfs_context_mutex); +static LIST_HEAD(famfs_context_list); + +static int famfs_get_tree(struct fs_context *fc) +{ + struct famfs_fs_info *fsi_entry; + struct famfs_fs_info *fsi = fc->s_fs_info; + + fsi->rootdev = kstrdup(fc->source, GFP_KERNEL); + if (!fsi->rootdev) + return -ENOMEM; + + /* Fail if famfs is already mounted from the same device */ + mutex_lock(&famfs_context_mutex); + list_for_each_entry(fsi_entry, &famfs_context_list, fsi_list) { + if (strcmp(fsi_entry->rootdev, fc->source) == 0) { + mutex_unlock(&famfs_context_mutex); + pr_err("%s: already mounted from rootdev %s\n", __func__, fc->source); + return -EALREADY; + } + } + + list_add(&fsi->fsi_list, &famfs_context_list); + mutex_unlock(&famfs_context_mutex); + + return get_tree_nodev(fc, famfs_fill_super); + +} + +static void famfs_free_fc(struct fs_context *fc) +{ + struct famfs_fs_info *fsi = fc->s_fs_info; + + if (fsi && fsi->rootdev) + kfree(fsi->rootdev); + + kfree(fsi); +} + +static const struct fs_context_operations famfs_context_ops = { + .free = famfs_free_fc, + .parse_param = famfs_parse_param, + .get_tree = famfs_get_tree, +}; + +static int famfs_init_fs_context(struct fs_context *fc) +{ + struct famfs_fs_info *fsi; + + fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + fsi->mount_opts.mode = FAMFS_DEFAULT_MODE; + fc->s_fs_info = fsi; + fc->ops = &famfs_context_ops; + return 0; +} MODULE_LICENSE("GPL");
This commit introduces the famfs fs_context_operations and famfs_get_inode() which is used by the context operations. Signed-off-by: John Groves <john@groves.net> --- fs/famfs/famfs_inode.c | 178 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+)