diff mbox series

[v2,2/8] fwctl: Basic ioctl dispatch for the character device

Message ID 2-v2-940e479ceba9+3821-fwctl_jgg@nvidia.com
State Superseded
Headers show
Series Introduce fwctl subystem | expand

Commit Message

Jason Gunthorpe June 24, 2024, 10:47 p.m. UTC
Each file descriptor gets a chunk of per-FD driver specific context that
allows the driver to attach a device specific struct to. The core code
takes care of the memory lifetime for this structure.

The ioctl dispatch and design is based on what was built for iommufd. The
ioctls have a struct which has a combined in/out behavior with a typical
'zero pad' scheme for future extension and backwards compatibility.

Like iommufd some shared logic does most of the ioctl marshalling and
compatibility work and tables diatches to some function pointers for
each unique iotcl.

This approach has proven to work quite well in the iommufd and rdma
subsystems.

Allocate an ioctl number space for the subsystem.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../userspace-api/ioctl/ioctl-number.rst      |   1 +
 MAINTAINERS                                   |   1 +
 drivers/fwctl/main.c                          | 124 +++++++++++++++++-
 include/linux/fwctl.h                         |  31 +++++
 include/uapi/fwctl/fwctl.h                    |  41 ++++++
 5 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 include/uapi/fwctl/fwctl.h

Comments

Jonathan Cameron July 26, 2024, 3:01 p.m. UTC | #1
On Mon, 24 Jun 2024 19:47:26 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> Each file descriptor gets a chunk of per-FD driver specific context that
> allows the driver to attach a device specific struct to. The core code
> takes care of the memory lifetime for this structure.
> 
> The ioctl dispatch and design is based on what was built for iommufd. The
> ioctls have a struct which has a combined in/out behavior with a typical
> 'zero pad' scheme for future extension and backwards compatibility.
> 
> Like iommufd some shared logic does most of the ioctl marshalling and
> compatibility work and tables diatches to some function pointers for
> each unique iotcl.
> 
> This approach has proven to work quite well in the iommufd and rdma
> subsystems.
> 
> Allocate an ioctl number space for the subsystem.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Minor stuff inline.

>  M:	Sebastian Reichel <sre@kernel.org>
> diff --git a/drivers/fwctl/main.c b/drivers/fwctl/main.c
> index 6e9bf15c743b5c..6872c01d5c62e8 100644
> --- a/drivers/fwctl/main.c
> +++ b/drivers/fwctl/main.c

> +
> +struct fwctl_ioctl_op {
> +	unsigned int size;
> +	unsigned int min_size;
> +	unsigned int ioctl_num;
> +	int (*execute)(struct fwctl_ucmd *ucmd);
> +};
> +
> +#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
> +	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \

If this is always zero indexed, maybe just drop the - FWCTL_CMD_BASE here
and elsewhere?  Maybe through in a BUILD_BUG to confirm it is always 0.


> +		.size = sizeof(_struct) +                             \
> +			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
> +					  sizeof(_struct)),           \
> +		.min_size = offsetofend(_struct, _last),              \
> +		.ioctl_num = _ioctl,                                  \
> +		.execute = _fn,                                       \
> +	}
> +static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
> +};
> +
> +static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
> +			       unsigned long arg)
> +{
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	const struct fwctl_ioctl_op *op;
> +	struct fwctl_ucmd ucmd = {};
> +	union ucmd_buffer buf;
> +	unsigned int nr;
> +	int ret;
> +
> +	nr = _IOC_NR(cmd);
> +	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
> +		return -ENOIOCTLCMD;

I'd add a blank line here as two unconnected set and error check
blocks.

> +	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
> +	if (op->ioctl_num != cmd)
> +		return -ENOIOCTLCMD;
> +
> +	ucmd.uctx = uctx;
> +	ucmd.cmd = &buf;
> +	ucmd.ubuffer = (void __user *)arg;
> +	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
> +	if (ret)
> +		return ret;
> +
> +	if (ucmd.user_size < op->min_size)
> +		return -EINVAL;
> +
> +	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
> +				    ucmd.user_size);
> +	if (ret)
> +		return ret;
> +
> +	guard(rwsem_read)(&uctx->fwctl->registration_lock);
> +	if (!uctx->fwctl->ops)
> +		return -ENODEV;
> +	return op->execute(&ucmd);
> +}
> +
>  static int fwctl_fops_open(struct inode *inode, struct file *filp)
>  {
>  	struct fwctl_device *fwctl =
>  		container_of(inode->i_cdev, struct fwctl_device, cdev);
> +	int ret;
> +
> +	guard(rwsem_read)(&fwctl->registration_lock);
> +	if (!fwctl->ops)
> +		return -ENODEV;
> +
> +	struct fwctl_uctx *uctx __free(kfree) =
> +		kzalloc(fwctl->ops->uctx_size, GFP_KERNEL | GFP_KERNEL_ACCOUNT);

GFP_KERNEL_ACCOUNT seems to include GFP_KERNEL already.
Did I miss some racing change?

> +	if (!uctx)
> +		return -ENOMEM;
> +
> +	uctx->fwctl = fwctl;
> +	ret = fwctl->ops->open_uctx(uctx);
> +	if (ret)
> +		return ret;
> +
> +	scoped_guard(mutex, &fwctl->uctx_list_lock) {
> +		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
> +	}

I guess more may come later but do we need {}?


>  
>  	get_device(&fwctl->dev);
> -	filp->private_data = fwctl;
> +	filp->private_data = no_free_ptr(uctx);
>  	return 0;
>  }
>  
> +static void fwctl_destroy_uctx(struct fwctl_uctx *uctx)
> +{
> +	lockdep_assert_held(&uctx->fwctl->uctx_list_lock);
> +	list_del(&uctx->uctx_list_entry);
> +	uctx->fwctl->ops->close_uctx(uctx);
> +}
> +
>  static int fwctl_fops_release(struct inode *inode, struct file *filp)
>  {
> -	struct fwctl_device *fwctl = filp->private_data;
> +	struct fwctl_uctx *uctx = filp->private_data;
> +	struct fwctl_device *fwctl = uctx->fwctl;
>  
> +	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> +		if (fwctl->ops) {

Maybe a comment on when this path happens to help the reader
along. (when the file is closed and device is still alive).
Otherwise was cleaned up already in fwctl_unregister()

> +			guard(mutex)(&fwctl->uctx_list_lock);
> +			fwctl_destroy_uctx(uctx);
> +		}
> +	}
> +
> +	kfree(uctx);
>  	fwctl_put(fwctl);
>  	return 0;
>  }
> @@ -37,6 +142,7 @@ static const struct file_operations fwctl_fops = {
>  	.owner = THIS_MODULE,
>  	.open = fwctl_fops_open,
>  	.release = fwctl_fops_release,
> +	.unlocked_ioctl = fwctl_fops_ioctl,
>  };

>  
>  	devnum = ida_alloc_max(&fwctl_ida, FWCTL_MAX_DEVICES - 1, GFP_KERNEL);
>  	if (devnum < 0)
> @@ -137,8 +247,18 @@ EXPORT_SYMBOL_NS_GPL(fwctl_register, FWCTL);
>   */
>  void fwctl_unregister(struct fwctl_device *fwctl)
>  {
> +	struct fwctl_uctx *uctx;
> +
>  	cdev_device_del(&fwctl->cdev, &fwctl->dev);
>  
> +	/* Disable and free the driver's resources for any still open FDs. */
> +	guard(rwsem_write)(&fwctl->registration_lock);
> +	guard(mutex)(&fwctl->uctx_list_lock);
> +	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
> +						struct fwctl_uctx,
> +						uctx_list_entry)))
> +		fwctl_destroy_uctx(uctx);
> +

Obviously it's a little more heavy weight but I'd just use
list_for_each_entry_safe()

Less effort for reviewers than consider the custom iteration
you are doing instead.


>  	/*
>  	 * The driver module may unload after this returns, the op pointer will
>  	 * not be valid.
> diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
> index ef4eaa87c945e4..1d9651de92fc19 100644
> --- a/include/linux/fwctl.h
> +++ b/include/linux/fwctl.h
> @@ -11,7 +11,20 @@
>  struct fwctl_device;
>  struct fwctl_uctx;
>  
> +/**
> + * struct fwctl_ops - Driver provided operations
> + * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
> + *	bytes of this memory will be a fwctl_uctx. The driver can use the
> + *	remaining bytes as its private memory.
> + * @open_uctx: Called when a file descriptor is opened before the uctx is ever
> + *	used.
> + * @close_uctx: Called when the uctx is destroyed, usually when the FD is
> + *	closed.
> + */
>  struct fwctl_ops {
> +	size_t uctx_size;
> +	int (*open_uctx)(struct fwctl_uctx *uctx);
> +	void (*close_uctx)(struct fwctl_uctx *uctx);
>  };
>  
>  /**
> @@ -26,6 +39,10 @@ struct fwctl_device {
>  	struct device dev;
>  	/* private: */
>  	struct cdev cdev;
> +
> +	struct rw_semaphore registration_lock;
> +	struct mutex uctx_list_lock;

Even for private locks, a scope statement would
be good to have.

> +	struct list_head uctx_list;
>  	const struct fwctl_ops *ops;
>  };

>  #endif
> diff --git a/include/uapi/fwctl/fwctl.h b/include/uapi/fwctl/fwctl.h
> new file mode 100644
> index 00000000000000..0bdce95b6d69d9
> --- /dev/null
> +++ b/include/uapi/fwctl/fwctl.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#ifndef _UAPI_FWCTL_H
> +#define _UAPI_FWCTL_H
> +
> +#include <linux/types.h>

Not used yet.

> +#include <linux/ioctl.h>

Arguably nor is this, but at least this related to the code
here.

> +
> +#define FWCTL_TYPE 0x9A
> +
> +/**
> + * DOC: General ioctl format
> + *
> + * The ioctl interface follows a general format to allow for extensibility. Each
> + * ioctl is passed in a structure pointer as the argument providing the size of
> + * the structure in the first u32. The kernel checks that any structure space
> + * beyond what it understands is 0. This allows userspace to use the backward
> + * compatible portion while consistently using the newer, larger, structures.

Is that particularly helpful?  Userspace needs to know not to put anything in
those fields, not hard for it to also know what the size it should send is?
The two will change together.

> + *
> + * ioctls use a standard meaning for common errnos:
> + *
> + *  - ENOTTY: The IOCTL number itself is not supported at all
> + *  - E2BIG: The IOCTL number is supported, but the provided structure has
> + *    non-zero in a part the kernel does not understand.
> + *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
> + *    understood, however a known field has a value the kernel does not
> + *    understand or support.
> + *  - EINVAL: Everything about the IOCTL was understood, but a field is not
> + *    correct.
> + *  - ENOMEM: Out of memory.
> + *  - ENODEV: The underlying device has been hot-unplugged and the FD is
> + *            orphaned.
> + *
> + * As well as additional errnos, within specific ioctls.
> + */
> +enum {
> +	FWCTL_CMD_BASE = 0,
> +};
> +
> +#endif
Jason Gunthorpe July 29, 2024, 5:05 p.m. UTC | #2
On Fri, Jul 26, 2024 at 04:01:57PM +0100, Jonathan Cameron wrote:

> > +struct fwctl_ioctl_op {
> > +	unsigned int size;
> > +	unsigned int min_size;
> > +	unsigned int ioctl_num;
> > +	int (*execute)(struct fwctl_ucmd *ucmd);
> > +};
> > +
> > +#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
> > +	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \
> 
> If this is always zero indexed, maybe just drop the - FWCTL_CMD_BASE here
> and elsewhere?  Maybe through in a BUILD_BUG to confirm it is always 0.

I left it like this in case someone had different ideas for the number
space (iommufd uses a non 0 base also). I think either is fine, and I
slightly prefer keeping it rather than a static_assert.

> > +static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
> > +			       unsigned long arg)
> > +{
> > +	struct fwctl_uctx *uctx = filp->private_data;
> > +	const struct fwctl_ioctl_op *op;
> > +	struct fwctl_ucmd ucmd = {};
> > +	union ucmd_buffer buf;
> > +	unsigned int nr;
> > +	int ret;
> > +
> > +	nr = _IOC_NR(cmd);
> > +	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
> > +		return -ENOIOCTLCMD;
> 
> I'd add a blank line here as two unconnected set and error check
> blocks.

Done

> >  static int fwctl_fops_open(struct inode *inode, struct file *filp)
> >  {
> >  	struct fwctl_device *fwctl =
> >  		container_of(inode->i_cdev, struct fwctl_device, cdev);
> > +	int ret;
> > +
> > +	guard(rwsem_read)(&fwctl->registration_lock);
> > +	if (!fwctl->ops)
> > +		return -ENODEV;
> > +
> > +	struct fwctl_uctx *uctx __free(kfree) =
> > +		kzalloc(fwctl->ops->uctx_size, GFP_KERNEL | GFP_KERNEL_ACCOUNT);
> 
> GFP_KERNEL_ACCOUNT seems to include GFP_KERNEL already.
> Did I miss some racing change?

I'm sure I copy and pasted this carelessly from someplace else
 
> > +	if (!uctx)
> > +		return -ENOMEM;
> > +
> > +	uctx->fwctl = fwctl;
> > +	ret = fwctl->ops->open_uctx(uctx);
> > +	if (ret)
> > +		return ret;
> > +
> > +	scoped_guard(mutex, &fwctl->uctx_list_lock) {
> > +		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
> > +	}
> 
> I guess more may come later but do we need {}?

I guessed the extra {} would be style guide for this construct?

> >  static int fwctl_fops_release(struct inode *inode, struct file *filp)
> >  {
> > -	struct fwctl_device *fwctl = filp->private_data;
> > +	struct fwctl_uctx *uctx = filp->private_data;
> > +	struct fwctl_device *fwctl = uctx->fwctl;
> >  
> > +	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> > +		if (fwctl->ops) {
> 
> Maybe a comment on when this path happens to help the reader
> along. (when the file is closed and device is still alive).
> Otherwise was cleaned up already in fwctl_unregister()

	scoped_guard(rwsem_read, &fwctl->registration_lock) {
		/*
		 * fwctl_unregister() has already removed the driver and
		 * destroyed the uctx.
		 */
		if (fwctl->ops) {

> >  void fwctl_unregister(struct fwctl_device *fwctl)
> >  {
> > +	struct fwctl_uctx *uctx;
> > +
> >  	cdev_device_del(&fwctl->cdev, &fwctl->dev);
> >  
> > +	/* Disable and free the driver's resources for any still open FDs. */
> > +	guard(rwsem_write)(&fwctl->registration_lock);
> > +	guard(mutex)(&fwctl->uctx_list_lock);
> > +	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
> > +						struct fwctl_uctx,
> > +						uctx_list_entry)))
> > +		fwctl_destroy_uctx(uctx);
> > +
> 
> Obviously it's a little more heavy weight but I'd just use
> list_for_each_entry_safe()
> 
> Less effort for reviewers than consider the custom iteration
> you are doing instead.

For these constructs the goal is the make the list empty, it is a
tinsy bit safer/clearer to drive the list to empty purposefully rather
than iterate over it and hope it is empty once done.

However there is no possible way that list_for_each_entry_safe() would
be an unsafe construct here. I can change it if you feel strongly

> > @@ -26,6 +39,10 @@ struct fwctl_device {
> >  	struct device dev;
> >  	/* private: */
> >  	struct cdev cdev;
> > +
> > +	struct rw_semaphore registration_lock;
> > +	struct mutex uctx_list_lock;
> 
> Even for private locks, a scope statement would
> be good to have.

Like so?

	/*
	 * Protect ops, held for write when ops becomes NULL during unregister,
	 * held for read whenver ops is loaded or an ops function is running.
	 */
	struct rw_semaphore registration_lock;
	/* Protect uctx_list */
	struct mutex uctx_list_lock;

> > +#ifndef _UAPI_FWCTL_H
> > +#define _UAPI_FWCTL_H
> > +
> > +#include <linux/types.h>
> 
> Not used yet.
> 
> > +#include <linux/ioctl.h>
> 
> Arguably nor is this, but at least this related to the code
> here.

Sure, lets move them

> > +/**
> > + * DOC: General ioctl format
> > + *
> > + * The ioctl interface follows a general format to allow for extensibility. Each
> > + * ioctl is passed in a structure pointer as the argument providing the size of
> > + * the structure in the first u32. The kernel checks that any structure space
> > + * beyond what it understands is 0. This allows userspace to use the backward
> > + * compatible portion while consistently using the newer, larger, structures.
> 
> Is that particularly helpful?  Userspace needs to know not to put anything in
> those fields, not hard for it to also know what the size it should send is?
> The two will change together.

It is very helpful for a practical userspace.

Lets say we have an ioctl struct:

struct fwctl_info {
	__u32 size;
	__u32 flags;
	__u32 out_device_type;
	__u32 device_data_len;
	__aligned_u64 out_device_data;
};

And the basic userspace pattern is:

  struct fwctl_info info = {.size = sizeof(info), ...);
  ioctl(fd, FWCTL_INFO, &info);

This works today and generates the 24 byte command.

Tomorrow the kernel adds a new member:

struct fwctl_info {
	__u32 size;
	__u32 flags;
	__u32 out_device_type;
	__u32 device_data_len;
	__aligned_u64 out_device_data;
	__aligned_u64 new_thing;
};

Current builds of the userpace use a 24 byte command. A new kernel
will see the 24 bytes and behave as before.

When I recompile the userspace with the updated header it will issue a
32 byte command with no source change.

Old kernel will see a 32 byte command with the trailing bytes it
doesn't understand as 0 and keep working.

The new kernel will see the new_thing bytes are zero and behave the
same as before.

If then the userspace decides to set new_thing the old kernel will
stop working. Userspace can use some 'try and fail' approach to try
again with new_thing = 0.

It gives a whole bunch of easy paths for userspace, otherwise
userspace has to be very careful to match the size of the struct to
the ABI it is targetting. Realistically nobody will do that right.

Thanks,
Jason
Jonathan Cameron July 30, 2024, 5:28 p.m. UTC | #3
On Mon, 29 Jul 2024 14:05:53 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Jul 26, 2024 at 04:01:57PM +0100, Jonathan Cameron wrote:
> 
> > > +struct fwctl_ioctl_op {
> > > +	unsigned int size;
> > > +	unsigned int min_size;
> > > +	unsigned int ioctl_num;
> > > +	int (*execute)(struct fwctl_ucmd *ucmd);
> > > +};
> > > +
> > > +#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
> > > +	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \  
> > 
> > If this is always zero indexed, maybe just drop the - FWCTL_CMD_BASE here
> > and elsewhere?  Maybe through in a BUILD_BUG to confirm it is always 0.  
> 
> I left it like this in case someone had different ideas for the number
> space (iommufd uses a non 0 base also). I think either is fine, and I
> slightly prefer keeping it rather than a static_assert.

Ok. Feels a little messy to me. But fair enough I guess.


> > > +	if (!uctx)
> > > +		return -ENOMEM;
> > > +
> > > +	uctx->fwctl = fwctl;
> > > +	ret = fwctl->ops->open_uctx(uctx);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	scoped_guard(mutex, &fwctl->uctx_list_lock) {
> > > +		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
> > > +	}  
> > 
> > I guess more may come later but do we need {}?  
> 
> I guessed the extra {} would be style guide for this construct?

Maybe. Not seen any statements on that yet, and it's becoming
quite common.

> 
> > >  static int fwctl_fops_release(struct inode *inode, struct file *filp)
> > >  {
> > > -	struct fwctl_device *fwctl = filp->private_data;
> > > +	struct fwctl_uctx *uctx = filp->private_data;
> > > +	struct fwctl_device *fwctl = uctx->fwctl;
> > >  
> > > +	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> > > +		if (fwctl->ops) {  
> > 
> > Maybe a comment on when this path happens to help the reader
> > along. (when the file is closed and device is still alive).
> > Otherwise was cleaned up already in fwctl_unregister()  
> 
> 	scoped_guard(rwsem_read, &fwctl->registration_lock) {
> 		/*
> 		 * fwctl_unregister() has already removed the driver and
> 		 * destroyed the uctx.
> 		 */
> 		if (fwctl->ops) {
> 

Good.

> > >  void fwctl_unregister(struct fwctl_device *fwctl)
> > >  {
> > > +	struct fwctl_uctx *uctx;
> > > +
> > >  	cdev_device_del(&fwctl->cdev, &fwctl->dev);
> > >  
> > > +	/* Disable and free the driver's resources for any still open FDs. */
> > > +	guard(rwsem_write)(&fwctl->registration_lock);
> > > +	guard(mutex)(&fwctl->uctx_list_lock);
> > > +	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
> > > +						struct fwctl_uctx,
> > > +						uctx_list_entry)))
> > > +		fwctl_destroy_uctx(uctx);
> > > +  
> > 
> > Obviously it's a little more heavy weight but I'd just use
> > list_for_each_entry_safe()
> > 
> > Less effort for reviewers than consider the custom iteration
> > you are doing instead.  
> 
> For these constructs the goal is the make the list empty, it is a
> tinsy bit safer/clearer to drive the list to empty purposefully rather
> than iterate over it and hope it is empty once done.
> 
> However there is no possible way that list_for_each_entry_safe() would
> be an unsafe construct here. I can change it if you feel strongly

Meh. You get to maintain this if it flies, so your choice.

> 
> > > @@ -26,6 +39,10 @@ struct fwctl_device {
> > >  	struct device dev;
> > >  	/* private: */
> > >  	struct cdev cdev;
> > > +
> > > +	struct rw_semaphore registration_lock;
> > > +	struct mutex uctx_list_lock;  
> > 
> > Even for private locks, a scope statement would
> > be good to have.  
> 
> Like so?
> 
> 	/*
> 	 * Protect ops, held for write when ops becomes NULL during unregister,
> 	 * held for read whenver ops is loaded or an ops function is running.

That does the job nicely.

> 	 */
> 	struct rw_semaphore registration_lock;
> 	/* Protect uctx_list */
> 	struct mutex uctx_list_lock;

> 
> > > +/**
> > > + * DOC: General ioctl format
> > > + *
> > > + * The ioctl interface follows a general format to allow for extensibility. Each
> > > + * ioctl is passed in a structure pointer as the argument providing the size of
> > > + * the structure in the first u32. The kernel checks that any structure space
> > > + * beyond what it understands is 0. This allows userspace to use the backward
> > > + * compatible portion while consistently using the newer, larger, structures.  
> > 
> > Is that particularly helpful?  Userspace needs to know not to put anything in
> > those fields, not hard for it to also know what the size it should send is?
> > The two will change together.  
> 
> It is very helpful for a practical userspace.
> 
> Lets say we have an ioctl struct:
> 
> struct fwctl_info {
> 	__u32 size;
> 	__u32 flags;
> 	__u32 out_device_type;
> 	__u32 device_data_len;
> 	__aligned_u64 out_device_data;
> };
> 
> And the basic userspace pattern is:
> 
>   struct fwctl_info info = {.size = sizeof(info), ...);
>   ioctl(fd, FWCTL_INFO, &info);
> 
> This works today and generates the 24 byte command.
> 
> Tomorrow the kernel adds a new member:
> 
> struct fwctl_info {
> 	__u32 size;
> 	__u32 flags;
> 	__u32 out_device_type;
> 	__u32 device_data_len;
> 	__aligned_u64 out_device_data;
> 	__aligned_u64 new_thing;
> };
> 
> Current builds of the userpace use a 24 byte command. A new kernel
> will see the 24 bytes and behave as before.
> 
> When I recompile the userspace with the updated header it will issue a
> 32 byte command with no source change.
> 
> Old kernel will see a 32 byte command with the trailing bytes it
> doesn't understand as 0 and keep working.
> 
> The new kernel will see the new_thing bytes are zero and behave the
> same as before.
> 
> If then the userspace decides to set new_thing the old kernel will
> stop working. Userspace can use some 'try and fail' approach to try
> again with new_thing = 0.

I'm not keen on try and fail interfaces because they become messy
if this has potentially be extended multiple times. Rest
of argument is fair enough. Thanks for the explanation.

> 
> It gives a whole bunch of easy paths for userspace, otherwise
> userspace has to be very careful to match the size of the struct to
> the ABI it is targetting. Realistically nobody will do that right.
> 
> Thanks,
> Jason
Jason Gunthorpe Aug. 1, 2024, 1:05 p.m. UTC | #4
On Tue, Jul 30, 2024 at 06:28:08PM +0100, Jonathan Cameron wrote:
> > And the basic userspace pattern is:
> > 
> >   struct fwctl_info info = {.size = sizeof(info), ...);
> >   ioctl(fd, FWCTL_INFO, &info);
> > 
> > This works today and generates the 24 byte command.
> > 
> > Tomorrow the kernel adds a new member:
> > 
> > struct fwctl_info {
> > 	__u32 size;
> > 	__u32 flags;
> > 	__u32 out_device_type;
> > 	__u32 device_data_len;
> > 	__aligned_u64 out_device_data;
> > 	__aligned_u64 new_thing;
> > };
> > 
> > Current builds of the userpace use a 24 byte command. A new kernel
> > will see the 24 bytes and behave as before.
> > 
> > When I recompile the userspace with the updated header it will issue a
> > 32 byte command with no source change.
> > 
> > Old kernel will see a 32 byte command with the trailing bytes it
> > doesn't understand as 0 and keep working.
> > 
> > The new kernel will see the new_thing bytes are zero and behave the
> > same as before.
> > 
> > If then the userspace decides to set new_thing the old kernel will
> > stop working. Userspace can use some 'try and fail' approach to try
> > again with new_thing = 0.
> 
> I'm not keen on try and fail interfaces because they become messy
> if this has potentially be extended multiple times. Rest
> of argument is fair enough. Thanks for the explanation.

I'd say try-and-fail is just the universal option, if there is merit
we can put cap bits and other things to positively indicate increased
kernel capability.

We have quite a deep experiance on this topic now in RDMA, and there
we've been doing both options, depending on the situation.

For instance you might introduce a new API that returns FOO and extend
a prior API to optionally accept FOO as well. A cap flag that the new
API exists is useful [1], but it is not for the prior API. The
userspace can just blindly pass FOO to the prior API, and if it
happened to get a non-zero FOO somehow then the kernel must also
support it..

[1] "try and fail" works well here too you can invoke the IOCTL with a
0 size and you get ENOTTY if the IOCTL is not understood, and another
error code if it is.

Jason
Daniel Vetter Aug. 6, 2024, 7:36 a.m. UTC | #5
On Mon, Jun 24, 2024 at 07:47:26PM -0300, Jason Gunthorpe wrote:
> diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
> index ef4eaa87c945e4..1d9651de92fc19 100644
> --- a/include/linux/fwctl.h
> +++ b/include/linux/fwctl.h
> @@ -11,7 +11,20 @@
>  struct fwctl_device;
>  struct fwctl_uctx;
>  
> +/**
> + * struct fwctl_ops - Driver provided operations
> + * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
> + *	bytes of this memory will be a fwctl_uctx. The driver can use the
> + *	remaining bytes as its private memory.
> + * @open_uctx: Called when a file descriptor is opened before the uctx is ever
> + *	used.
> + * @close_uctx: Called when the uctx is destroyed, usually when the FD is
> + *	closed.
> + */
>  struct fwctl_ops {
> +	size_t uctx_size;
> +	int (*open_uctx)(struct fwctl_uctx *uctx);
> +	void (*close_uctx)(struct fwctl_uctx *uctx);

Just a small bikeshed, I much prefer the inline kerneldoc style for ops
structs. It allows you to be appropriately verbose and document details
like error handling (more important for later additions) or that e.g. they
all must finish timely for otherwise fwctl_unregister hangs, without the
comment becoming an eyesore.
-Sima
Jason Gunthorpe Aug. 8, 2024, 12:34 p.m. UTC | #6
On Tue, Aug 06, 2024 at 09:36:33AM +0200, Daniel Vetter wrote:
> On Mon, Jun 24, 2024 at 07:47:26PM -0300, Jason Gunthorpe wrote:
> > diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
> > index ef4eaa87c945e4..1d9651de92fc19 100644
> > --- a/include/linux/fwctl.h
> > +++ b/include/linux/fwctl.h
> > @@ -11,7 +11,20 @@
> >  struct fwctl_device;
> >  struct fwctl_uctx;
> >  
> > +/**
> > + * struct fwctl_ops - Driver provided operations
> > + * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
> > + *	bytes of this memory will be a fwctl_uctx. The driver can use the
> > + *	remaining bytes as its private memory.
> > + * @open_uctx: Called when a file descriptor is opened before the uctx is ever
> > + *	used.
> > + * @close_uctx: Called when the uctx is destroyed, usually when the FD is
> > + *	closed.
> > + */
> >  struct fwctl_ops {
> > +	size_t uctx_size;
> > +	int (*open_uctx)(struct fwctl_uctx *uctx);
> > +	void (*close_uctx)(struct fwctl_uctx *uctx);
> 
> Just a small bikeshed, I much prefer the inline kerneldoc style for ops
> structs. It allows you to be appropriately verbose and document details
> like error handling (more important for later additions) or that e.g. they
> all must finish timely for otherwise fwctl_unregister hangs, without the
> comment becoming an eyesore.

Done

Jason
diff mbox series

Patch

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a141e8e65c5d3a..4d91c5a20b98c8 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -324,6 +324,7 @@  Code  Seq#    Include File                                           Comments
 0x97  00-7F  fs/ceph/ioctl.h                                         Ceph file system
 0x99  00-0F                                                          537-Addinboard driver
                                                                      <mailto:buk@buks.ipn.de>
+0x9A  00-0F  include/uapi/fwctl/fwctl.h
 0xA0  all    linux/sdp/sdp.h                                         Industrial Device Project
                                                                      <mailto:kenji@bitgate.com>
 0xA1  0      linux/vtpm_proxy.h                                      TPM Emulator Proxy Driver
diff --git a/MAINTAINERS b/MAINTAINERS
index aa7a760d12f8ef..2090009a6ae98a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9083,6 +9083,7 @@  S:	Maintained
 F:	Documentation/userspace-api/fwctl.rst
 F:	drivers/fwctl/
 F:	include/linux/fwctl.h
+F:	include/uapi/fwctl/
 
 GALAXYCORE GC0308 CAMERA SENSOR DRIVER
 M:	Sebastian Reichel <sre@kernel.org>
diff --git a/drivers/fwctl/main.c b/drivers/fwctl/main.c
index 6e9bf15c743b5c..6872c01d5c62e8 100644
--- a/drivers/fwctl/main.c
+++ b/drivers/fwctl/main.c
@@ -9,26 +9,131 @@ 
 #include <linux/container_of.h>
 #include <linux/fs.h>
 
+#include <uapi/fwctl/fwctl.h>
+
 enum {
 	FWCTL_MAX_DEVICES = 256,
 };
 static dev_t fwctl_dev;
 static DEFINE_IDA(fwctl_ida);
 
+struct fwctl_ucmd {
+	struct fwctl_uctx *uctx;
+	void __user *ubuffer;
+	void *cmd;
+	u32 user_size;
+};
+
+/* On stack memory for the ioctl structs */
+union ucmd_buffer {
+};
+
+struct fwctl_ioctl_op {
+	unsigned int size;
+	unsigned int min_size;
+	unsigned int ioctl_num;
+	int (*execute)(struct fwctl_ucmd *ucmd);
+};
+
+#define IOCTL_OP(_ioctl, _fn, _struct, _last)                         \
+	[_IOC_NR(_ioctl) - FWCTL_CMD_BASE] = {                        \
+		.size = sizeof(_struct) +                             \
+			BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \
+					  sizeof(_struct)),           \
+		.min_size = offsetofend(_struct, _last),              \
+		.ioctl_num = _ioctl,                                  \
+		.execute = _fn,                                       \
+	}
+static const struct fwctl_ioctl_op fwctl_ioctl_ops[] = {
+};
+
+static long fwctl_fops_ioctl(struct file *filp, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct fwctl_uctx *uctx = filp->private_data;
+	const struct fwctl_ioctl_op *op;
+	struct fwctl_ucmd ucmd = {};
+	union ucmd_buffer buf;
+	unsigned int nr;
+	int ret;
+
+	nr = _IOC_NR(cmd);
+	if ((nr - FWCTL_CMD_BASE) >= ARRAY_SIZE(fwctl_ioctl_ops))
+		return -ENOIOCTLCMD;
+	op = &fwctl_ioctl_ops[nr - FWCTL_CMD_BASE];
+	if (op->ioctl_num != cmd)
+		return -ENOIOCTLCMD;
+
+	ucmd.uctx = uctx;
+	ucmd.cmd = &buf;
+	ucmd.ubuffer = (void __user *)arg;
+	ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer);
+	if (ret)
+		return ret;
+
+	if (ucmd.user_size < op->min_size)
+		return -EINVAL;
+
+	ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer,
+				    ucmd.user_size);
+	if (ret)
+		return ret;
+
+	guard(rwsem_read)(&uctx->fwctl->registration_lock);
+	if (!uctx->fwctl->ops)
+		return -ENODEV;
+	return op->execute(&ucmd);
+}
+
 static int fwctl_fops_open(struct inode *inode, struct file *filp)
 {
 	struct fwctl_device *fwctl =
 		container_of(inode->i_cdev, struct fwctl_device, cdev);
+	int ret;
+
+	guard(rwsem_read)(&fwctl->registration_lock);
+	if (!fwctl->ops)
+		return -ENODEV;
+
+	struct fwctl_uctx *uctx __free(kfree) =
+		kzalloc(fwctl->ops->uctx_size, GFP_KERNEL | GFP_KERNEL_ACCOUNT);
+	if (!uctx)
+		return -ENOMEM;
+
+	uctx->fwctl = fwctl;
+	ret = fwctl->ops->open_uctx(uctx);
+	if (ret)
+		return ret;
+
+	scoped_guard(mutex, &fwctl->uctx_list_lock) {
+		list_add_tail(&uctx->uctx_list_entry, &fwctl->uctx_list);
+	}
 
 	get_device(&fwctl->dev);
-	filp->private_data = fwctl;
+	filp->private_data = no_free_ptr(uctx);
 	return 0;
 }
 
+static void fwctl_destroy_uctx(struct fwctl_uctx *uctx)
+{
+	lockdep_assert_held(&uctx->fwctl->uctx_list_lock);
+	list_del(&uctx->uctx_list_entry);
+	uctx->fwctl->ops->close_uctx(uctx);
+}
+
 static int fwctl_fops_release(struct inode *inode, struct file *filp)
 {
-	struct fwctl_device *fwctl = filp->private_data;
+	struct fwctl_uctx *uctx = filp->private_data;
+	struct fwctl_device *fwctl = uctx->fwctl;
 
+	scoped_guard(rwsem_read, &fwctl->registration_lock) {
+		if (fwctl->ops) {
+			guard(mutex)(&fwctl->uctx_list_lock);
+			fwctl_destroy_uctx(uctx);
+		}
+	}
+
+	kfree(uctx);
 	fwctl_put(fwctl);
 	return 0;
 }
@@ -37,6 +142,7 @@  static const struct file_operations fwctl_fops = {
 	.owner = THIS_MODULE,
 	.open = fwctl_fops_open,
 	.release = fwctl_fops_release,
+	.unlocked_ioctl = fwctl_fops_ioctl,
 };
 
 static void fwctl_device_release(struct device *device)
@@ -45,6 +151,7 @@  static void fwctl_device_release(struct device *device)
 		container_of(device, struct fwctl_device, dev);
 
 	ida_free(&fwctl_ida, fwctl->dev.devt - fwctl_dev);
+	mutex_destroy(&fwctl->uctx_list_lock);
 	kfree(fwctl);
 }
 
@@ -69,6 +176,9 @@  _alloc_device(struct device *parent, const struct fwctl_ops *ops, size_t size)
 		return NULL;
 	fwctl->dev.class = &fwctl_class;
 	fwctl->dev.parent = parent;
+	init_rwsem(&fwctl->registration_lock);
+	mutex_init(&fwctl->uctx_list_lock);
+	INIT_LIST_HEAD(&fwctl->uctx_list);
 
 	devnum = ida_alloc_max(&fwctl_ida, FWCTL_MAX_DEVICES - 1, GFP_KERNEL);
 	if (devnum < 0)
@@ -137,8 +247,18 @@  EXPORT_SYMBOL_NS_GPL(fwctl_register, FWCTL);
  */
 void fwctl_unregister(struct fwctl_device *fwctl)
 {
+	struct fwctl_uctx *uctx;
+
 	cdev_device_del(&fwctl->cdev, &fwctl->dev);
 
+	/* Disable and free the driver's resources for any still open FDs. */
+	guard(rwsem_write)(&fwctl->registration_lock);
+	guard(mutex)(&fwctl->uctx_list_lock);
+	while ((uctx = list_first_entry_or_null(&fwctl->uctx_list,
+						struct fwctl_uctx,
+						uctx_list_entry)))
+		fwctl_destroy_uctx(uctx);
+
 	/*
 	 * The driver module may unload after this returns, the op pointer will
 	 * not be valid.
diff --git a/include/linux/fwctl.h b/include/linux/fwctl.h
index ef4eaa87c945e4..1d9651de92fc19 100644
--- a/include/linux/fwctl.h
+++ b/include/linux/fwctl.h
@@ -11,7 +11,20 @@ 
 struct fwctl_device;
 struct fwctl_uctx;
 
+/**
+ * struct fwctl_ops - Driver provided operations
+ * @uctx_size: The size of the fwctl_uctx struct to allocate. The first
+ *	bytes of this memory will be a fwctl_uctx. The driver can use the
+ *	remaining bytes as its private memory.
+ * @open_uctx: Called when a file descriptor is opened before the uctx is ever
+ *	used.
+ * @close_uctx: Called when the uctx is destroyed, usually when the FD is
+ *	closed.
+ */
 struct fwctl_ops {
+	size_t uctx_size;
+	int (*open_uctx)(struct fwctl_uctx *uctx);
+	void (*close_uctx)(struct fwctl_uctx *uctx);
 };
 
 /**
@@ -26,6 +39,10 @@  struct fwctl_device {
 	struct device dev;
 	/* private: */
 	struct cdev cdev;
+
+	struct rw_semaphore registration_lock;
+	struct mutex uctx_list_lock;
+	struct list_head uctx_list;
 	const struct fwctl_ops *ops;
 };
 
@@ -65,4 +82,18 @@  DEFINE_FREE(fwctl, struct fwctl_device *, if (_T) fwctl_put(_T));
 int fwctl_register(struct fwctl_device *fwctl);
 void fwctl_unregister(struct fwctl_device *fwctl);
 
+/**
+ * struct fwctl_uctx - Per user FD context
+ * @fwctl: fwctl instance that owns the context
+ *
+ * Every FD opened by userspace will get a unique context allocation. Any driver
+ * private data will follow immediately after.
+ */
+struct fwctl_uctx {
+	struct fwctl_device *fwctl;
+	/* private: */
+	/* Head at fwctl_device::uctx_list */
+	struct list_head uctx_list_entry;
+};
+
 #endif
diff --git a/include/uapi/fwctl/fwctl.h b/include/uapi/fwctl/fwctl.h
new file mode 100644
index 00000000000000..0bdce95b6d69d9
--- /dev/null
+++ b/include/uapi/fwctl/fwctl.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _UAPI_FWCTL_H
+#define _UAPI_FWCTL_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define FWCTL_TYPE 0x9A
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ *  - ENOTTY: The IOCTL number itself is not supported at all
+ *  - E2BIG: The IOCTL number is supported, but the provided structure has
+ *    non-zero in a part the kernel does not understand.
+ *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ *    understood, however a known field has a value the kernel does not
+ *    understand or support.
+ *  - EINVAL: Everything about the IOCTL was understood, but a field is not
+ *    correct.
+ *  - ENOMEM: Out of memory.
+ *  - ENODEV: The underlying device has been hot-unplugged and the FD is
+ *            orphaned.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+enum {
+	FWCTL_CMD_BASE = 0,
+};
+
+#endif