diff mbox series

[RFC,3/6] char: fastrpc: Add support for context Invoke method

Message ID 20181130104657.14875-4-srinivas.kandagatla@linaro.org (mailing list archive)
State Not Applicable, archived
Headers show
Series char: Add support to Qualcomm FastRPC driver | expand

Commit Message

Srinivas Kandagatla Nov. 30, 2018, 10:46 a.m. UTC
This patch adds support to compute context invoke method
on the remote processor (DSP).
This involves setting up the functions input and output arguments,
input and output handles and mapping the dmabuf fd for the
argument/handle buffers.

Most of the work is derived from various downstream Qualcomm kernels.
Credits to various Qualcomm authors who have contributed to this code.
Specially Tharun Kumar Merugu <mtharu@codeaurora.org>

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 drivers/char/fastrpc.c       | 790 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/fastrpc.h |  56 +++
 2 files changed, 846 insertions(+)
 create mode 100644 include/uapi/linux/fastrpc.h

Comments

Arnd Bergmann Nov. 30, 2018, 1:41 p.m. UTC | #1
On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
<srinivas.kandagatla@linaro.org> wrote:
>
> This patch adds support to compute context invoke method
> on the remote processor (DSP).
> This involves setting up the functions input and output arguments,
> input and output handles and mapping the dmabuf fd for the
> argument/handle buffers.
>
> Most of the work is derived from various downstream Qualcomm kernels.
> Credits to various Qualcomm authors who have contributed to this code.
> Specially Tharun Kumar Merugu <mtharu@codeaurora.org>
>
> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>

> +
> +       INIT_LIST_HEAD(&ctx->node);
> +       ctx->fl = user;
> +       ctx->maps = (struct fastrpc_map **)(&ctx[1]);
> +       ctx->lpra = (remote_arg_t *)(&ctx->maps[bufs]);
> +       ctx->fds = (int *)(&ctx->lpra[bufs]);
> +       ctx->attrs = (unsigned int *)(&ctx->fds[bufs]);
> +
> +       if (!kernel) {
> +               if (copy_from_user(ctx->lpra,
> +                                    (void const __user *)inv->pra,
> +                                    bufs * sizeof(*ctx->lpra))) {
> +                       err = -EFAULT;
> +                       goto err;
> +               }
> +
> +               if (inv->fds) {
> +                       if (copy_from_user(ctx->fds,
> +                                            (void const __user *)inv->fds,
> +                                            bufs * sizeof(*ctx->fds))) {
> +                               err = -EFAULT;
> +                               goto err;
> +                       }
> +               }
> +               if (inv->attrs) {
> +                       if (copy_from_user(
> +                                       ctx->attrs,
> +                                       (void const __user *)inv->attrs,
> +                                       bufs * sizeof(*ctx->attrs))) {
> +                               err = -EFAULT;
> +                               goto err;
> +                       }
> +               }
> +       } else {
> +               memcpy(ctx->lpra, inv->pra, bufs * sizeof(*ctx->lpra));
> +               if (inv->fds)
> +                       memcpy(ctx->fds, inv->fds,
> +                              bufs * sizeof(*ctx->fds));
> +               if (inv->attrs)
> +                       memcpy(ctx->attrs, inv->attrs,
> +                              bufs * sizeof(*ctx->attrs));
> +       }

I'd split this function into multiple pieces: the internal one that
just takes kernel pointers, and a wrapper for the ioctl
that copies the user space data into the kernel before calling
the second one.

> +static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx,
> +                           uint32_t kernel, remote_arg_t *upra)
> +{
> +       remote_arg64_t *rpra = ctx->rpra;
> +       int i, inbufs, outbufs, handles;
> +       struct fastrpc_invoke_buf *list;
> +       struct fastrpc_phy_page *pages;
> +       struct fastrpc_map *mmap;
> +       uint32_t sc = ctx->sc;
> +       uint64_t *fdlist;
> +       uint32_t *crclist;
> +       int err = 0;
> +
> +       inbufs = REMOTE_SCALARS_INBUFS(sc);
> +       outbufs = REMOTE_SCALARS_OUTBUFS(sc);
> +       handles = REMOTE_SCALARS_INHANDLES(sc) + REMOTE_SCALARS_OUTHANDLES(sc);
> +       list = fastrpc_invoke_buf_start(ctx->rpra, sc);
> +       pages = fastrpc_phy_page_start(sc, list);
> +       fdlist = (uint64_t *)(pages + inbufs + outbufs + handles);
> +       crclist = (uint32_t *)(fdlist + FASTRPC_MAX_FDLIST);
> +
> +       for (i = inbufs; i < inbufs + outbufs; ++i) {
> +               if (!ctx->maps[i]) {
> +                       if (!kernel)
> +                               err =
> +                               copy_to_user((void __user *)ctx->lpra[i].buf.pv,
> +                                      (void *)rpra[i].buf.pv, rpra[i].buf.len);
> +                       else
> +                               memcpy(ctx->lpra[i].buf.pv,
> +                                      (void *)rpra[i].buf.pv, rpra[i].buf.len);
> +
> +                       if (err)
> +                               goto bail;
> +               } else {
> +                       fastrpc_map_put(ctx->maps[i]);
> +                       ctx->maps[i] = NULL;
> +               }
> +       }

Same here.

> +static int fastrpc_internal_invoke(struct fastrpc_user *fl,
> +                                  uint32_t kernel,
> +                                  struct fastrpc_ioctl_invoke *inv)
> +{
> +       struct fastrpc_invoke_ctx *ctx = NULL;
> +       int err = 0;
> +
> +       if (!fl->sctx)
> +               return -EINVAL;
> +
> +       ctx = fastrpc_context_alloc(fl, kernel, inv);
> +       if (IS_ERR(ctx))
> +               return PTR_ERR(ctx);
> +
> +       if (REMOTE_SCALARS_LENGTH(ctx->sc)) {
> +               err = fastrpc_get_args(kernel, ctx);
> +               if (err)
> +                       goto bail;
> +       }
> +
> +       err = fastrpc_invoke_send(fl->sctx, ctx, kernel, inv->handle);
> +       if (err)
> +               goto bail;
> +
> +       err = wait_for_completion_interruptible(&ctx->work);
> +       if (err)
> +               goto bail;

Can you add comments here to explain the control flow?
What exactly are we waiting for here? Does the completion
indicate that the remote side is done executing the code
and ready to do something else?

> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
> +                                unsigned long arg)
> +{
> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
> +       char __user *argp = (char __user *)arg;
> +       int err;
> +
> +       if (!fl->sctx) {
> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
> +               if (!fl->sctx)
> +                       return -ENOENT;
> +       }

Shouldn't that session be allocated during open()?

> +static void fastrpc_notify_users(struct fastrpc_user *user)
> +{
> +       struct fastrpc_invoke_ctx *ctx, *n;
> +
> +       spin_lock(&user->lock);
> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
> +               complete(&ctx->work);
> +       spin_unlock(&user->lock);
> +}

Can you explain here what it means to have multiple 'users' for
a 'fastrpc_user' structure? Why are they all done at the same time?

> +struct remote_dma_handle64 {
> +       int fd;
> +       uint32_t offset;
> +       uint32_t len;
> +};

Maybe always make the offset/len fields and others 64 bit?

> +union remote_arg64 {
> +       struct remote_buf64     buf;
> +       struct remote_dma_handle64 dma;
> +       uint32_t h;
> +};
> +
> +#define remote_arg_t    union remote_arg
> +
> +struct remote_buf {
> +       void *pv;               /* buffer pointer */
> +       size_t len;             /* length of buffer */
> +};
> +
> +struct remote_dma_handle {
> +       int fd;
> +       uint32_t offset;
> +};
> +
> +union remote_arg {
> +       struct remote_buf buf;  /* buffer info */
> +       struct remote_dma_handle dma;
> +       uint32_t h;             /* remote handle */
> +};

Try to avoid the padding at the end of the structure,
if you can't, then add a __reserved member.

I'd also recommend avoiding nested structures and
unions. Add more commands if necessary.

> +struct fastrpc_ioctl_invoke {
> +       uint32_t handle;        /* remote handle */
> +       uint32_t sc;            /* scalars describing the data */
> +       remote_arg_t *pra;      /* remote arguments list */
> +       int *fds;               /* fd list */
> +       unsigned int *attrs;    /* attribute list */
> +       unsigned int *crc;
> +};

This seems too complex for an ioctl argument, with
multiple levels of pointer indirections. I'd normally
try to make each ioctl argument either a scalar, or a
structure with only fixed-length members.

The way we did this in spufs was to set up a context
first with all the information it needed, and make the
actual context switch from host CPU to remote a very
simple operation that took as few arguments as possible,
in case of spu_run() only the instruction pointer and
the location of the return status.

      Arnd
Srinivas Kandagatla Nov. 30, 2018, 3:01 p.m. UTC | #2
Thanks Arnd for the review comments!

On 30/11/18 13:41, Arnd Bergmann wrote:
> On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
> <srinivas.kandagatla@linaro.org> wrote:
>>
>> This patch adds support to compute context invoke method
>> on the remote processor (DSP).
>> This involves setting up the functions input and output arguments,
>> input and output handles and mapping the dmabuf fd for the
>> argument/handle buffers.
>>
>> Most of the work is derived from various downstream Qualcomm kernels.
>> Credits to various Qualcomm authors who have contributed to this code.
>> Specially Tharun Kumar Merugu <mtharu@codeaurora.org>
>>
>> Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
> 
>> +
>> +       INIT_LIST_HEAD(&ctx->node);
>> +       ctx->fl = user;
>> +       ctx->maps = (struct fastrpc_map **)(&ctx[1]);
>> +       ctx->lpra = (remote_arg_t *)(&ctx->maps[bufs]);
>> +       ctx->fds = (int *)(&ctx->lpra[bufs]);
>> +       ctx->attrs = (unsigned int *)(&ctx->fds[bufs]);
>> +
>> +       if (!kernel) {
>> +               if (copy_from_user(ctx->lpra,
>> +                                    (void const __user *)inv->pra,
>> +                                    bufs * sizeof(*ctx->lpra))) {
>> +                       err = -EFAULT;
>> +                       goto err;
>> +               }
>> +
>> +               if (inv->fds) {
>> +                       if (copy_from_user(ctx->fds,
>> +                                            (void const __user *)inv->fds,
>> +                                            bufs * sizeof(*ctx->fds))) {
>> +                               err = -EFAULT;
>> +                               goto err;
>> +                       }
>> +               }
>> +               if (inv->attrs) {
>> +                       if (copy_from_user(
>> +                                       ctx->attrs,
>> +                                       (void const __user *)inv->attrs,
>> +                                       bufs * sizeof(*ctx->attrs))) {
>> +                               err = -EFAULT;
>> +                               goto err;
>> +                       }
>> +               }
>> +       } else {
>> +               memcpy(ctx->lpra, inv->pra, bufs * sizeof(*ctx->lpra));
>> +               if (inv->fds)
>> +                       memcpy(ctx->fds, inv->fds,
>> +                              bufs * sizeof(*ctx->fds));
>> +               if (inv->attrs)
>> +                       memcpy(ctx->attrs, inv->attrs,
>> +                              bufs * sizeof(*ctx->attrs));
>> +       }
> 
> I'd split this function into multiple pieces: the internal one that
> just takes kernel pointers, and a wrapper for the ioctl
> that copies the user space data into the kernel before calling
> the second one.

Sure, will be done in next version!
> 
>> +static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx,
>> +                           uint32_t kernel, remote_arg_t *upra)
>> +{
>> +       remote_arg64_t *rpra = ctx->rpra;
>> +       int i, inbufs, outbufs, handles;
>> +       struct fastrpc_invoke_buf *list;
>> +       struct fastrpc_phy_page *pages;
>> +       struct fastrpc_map *mmap;
>> +       uint32_t sc = ctx->sc;
>> +       uint64_t *fdlist;
>> +       uint32_t *crclist;
>> +       int err = 0;
>> +
>> +       inbufs = REMOTE_SCALARS_INBUFS(sc);
>> +       outbufs = REMOTE_SCALARS_OUTBUFS(sc);
>> +       handles = REMOTE_SCALARS_INHANDLES(sc) + REMOTE_SCALARS_OUTHANDLES(sc);
>> +       list = fastrpc_invoke_buf_start(ctx->rpra, sc);
>> +       pages = fastrpc_phy_page_start(sc, list);
>> +       fdlist = (uint64_t *)(pages + inbufs + outbufs + handles);
>> +       crclist = (uint32_t *)(fdlist + FASTRPC_MAX_FDLIST);
>> +
>> +       for (i = inbufs; i < inbufs + outbufs; ++i) {
>> +               if (!ctx->maps[i]) {
>> +                       if (!kernel)
>> +                               err =
>> +                               copy_to_user((void __user *)ctx->lpra[i].buf.pv,
>> +                                      (void *)rpra[i].buf.pv, rpra[i].buf.len);
>> +                       else
>> +                               memcpy(ctx->lpra[i].buf.pv,
>> +                                      (void *)rpra[i].buf.pv, rpra[i].buf.len);
>> +
>> +                       if (err)
>> +                               goto bail;
>> +               } else {
>> +                       fastrpc_map_put(ctx->maps[i]);
>> +                       ctx->maps[i] = NULL;
>> +               }
>> +       }
> 
> Same here.
> 
>> +static int fastrpc_internal_invoke(struct fastrpc_user *fl,
>> +                                  uint32_t kernel,
>> +                                  struct fastrpc_ioctl_invoke *inv)
>> +{
>> +       struct fastrpc_invoke_ctx *ctx = NULL;
>> +       int err = 0;
>> +
>> +       if (!fl->sctx)
>> +               return -EINVAL;
>> +
>> +       ctx = fastrpc_context_alloc(fl, kernel, inv);
>> +       if (IS_ERR(ctx))
>> +               return PTR_ERR(ctx);
>> +
>> +       if (REMOTE_SCALARS_LENGTH(ctx->sc)) {
>> +               err = fastrpc_get_args(kernel, ctx);
>> +               if (err)
>> +                       goto bail;
>> +       }
>> +
>> +       err = fastrpc_invoke_send(fl->sctx, ctx, kernel, inv->handle);
>> +       if (err)
>> +               goto bail;
>> +
>> +       err = wait_for_completion_interruptible(&ctx->work);
>> +       if (err)
>> +               goto bail;
> 
> Can you add comments here to explain the control flow?
> What exactly are we waiting for here? Does the completion
> indicate that the remote side is done executing the code
> and ready to do something else?

Sure I will add some detailed comment here, completion here means that 
the remote side has finished with the execution of that particular context.

> 
>> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
>> +                                unsigned long arg)
>> +{
>> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
>> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
>> +       char __user *argp = (char __user *)arg;
>> +       int err;
>> +
>> +       if (!fl->sctx) {
>> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
>> +               if (!fl->sctx)
>> +                       return -ENOENT;
>> +       }
> 
> Shouldn't that session be allocated during open()?
> 
Yes, and no, we do not need context in all the cases. In cases like we 
just want to allocate dmabuf.

>> +static void fastrpc_notify_users(struct fastrpc_user *user)
>> +{
>> +       struct fastrpc_invoke_ctx *ctx, *n;
>> +
>> +       spin_lock(&user->lock);
>> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
>> +               complete(&ctx->work);
>> +       spin_unlock(&user->lock);
>> +}
> 
> Can you explain here what it means to have multiple 'users' for
> a 'fastrpc_user' structure? Why are they all done at the same time?
> 
This is the case where users need to be notified if the dsp goes down 
due to crash or shut down!

>> +struct remote_dma_handle64 {
>> +       int fd;
>> +       uint32_t offset;
>> +       uint32_t len;
>> +};
> 
> Maybe always make the offset/len fields and others 64 bit?
> 
yes, I will do that.
>> +union remote_arg64 {
>> +       struct remote_buf64     buf;
>> +       struct remote_dma_handle64 dma;
>> +       uint32_t h;
>> +};
>> +
>> +#define remote_arg_t    union remote_arg
>> +
>> +struct remote_buf {
>> +       void *pv;               /* buffer pointer */
>> +       size_t len;             /* length of buffer */
>> +};
>> +
>> +struct remote_dma_handle {
>> +       int fd;
>> +       uint32_t offset;
>> +};
>> +
>> +union remote_arg {
>> +       struct remote_buf buf;  /* buffer info */
>> +       struct remote_dma_handle dma;
>> +       uint32_t h;             /* remote handle */
>> +};
> 
> Try to avoid the padding at the end of the structure,
> if you can't, then add a __reserved member.
> 
> I'd also recommend avoiding nested structures and
> unions. Add more commands if necessary.
I will revisit all the data structures and make sure we do not leave any 
holes in the structure!
> 
>> +struct fastrpc_ioctl_invoke {
>> +       uint32_t handle;        /* remote handle */
>> +       uint32_t sc;            /* scalars describing the data */
>> +       remote_arg_t *pra;      /* remote arguments list */
>> +       int *fds;               /* fd list */
>> +       unsigned int *attrs;    /* attribute list */
>> +       unsigned int *crc;
>> +};
> 
> This seems too complex for an ioctl argument, with
> multiple levels of pointer indirections. I'd normally
> try to make each ioctl argument either a scalar, or a
> structure with only fixed-length members.
> 
I totally agree with you and many thanks for your expert inputs,
May be something like below with fixed length members would work?

struct fastrpc_remote_arg {
	__u64 ptr;	/* buffer ptr */
	__u64 len;	/* length */
	__u32 fd;	/* dmabuf fd */
	__u32 reserved1
	__u64 reserved2
};

struct fastrpc_remote_fd {
	__u64 fd;
	__u64 reserved1
	__u64 reserved2
	__u64 reserved3
};

struct fastrpc_remote_attr {
	__u64 attr;
	__u64 reserved1
	__u64 reserved2
	__u64 reserved3
};

struct fastrpc_remote_crc {
	__u64 crc;
	__u64 reserved1
	__u64 reserved2
	__u64 reserved3
};

struct fastrpc_ioctl_invoke {
	__u32 handle;
	__u32 sc;
	/* The minimum size is scalar_length * 32 */
	struct fastrpc_remote_args *rargs;
	struct fastrpc_remote_fd *fds;
	struct fastrpc_remote_attr *attrs;
	struct fastrpc_remote_crc *crc;
};

> The way we did this in spufs was to set up a context
> first with all the information it needed, and make the
> actual context switch from host CPU to remote a very
> simple operation that took as few arguments as possible,
> in case of spu_run() only the instruction pointer and
> the location of the return status.

thanks,
srini
> 
>        Arnd
>
Arnd Bergmann Nov. 30, 2018, 3:08 p.m. UTC | #3
On Fri, Nov 30, 2018 at 4:01 PM Srinivas Kandagatla
<srinivas.kandagatla@linaro.org> wrote:
> Thanks Arnd for the review comments!
> On 30/11/18 13:41, Arnd Bergmann wrote:
> > On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
> > <srinivas.kandagatla@linaro.org> wrote:

> >> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
> >> +                                unsigned long arg)
> >> +{
> >> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
> >> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
> >> +       char __user *argp = (char __user *)arg;
> >> +       int err;
> >> +
> >> +       if (!fl->sctx) {
> >> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
> >> +               if (!fl->sctx)
> >> +                       return -ENOENT;
> >> +       }
> >
> > Shouldn't that session be allocated during open()?
> >
> Yes, and no, we do not need context in all the cases. In cases like we
> just want to allocate dmabuf.

Can you give an example what that would be good for?

>
> >> +static void fastrpc_notify_users(struct fastrpc_user *user)
> >> +{
> >> +       struct fastrpc_invoke_ctx *ctx, *n;
> >> +
> >> +       spin_lock(&user->lock);
> >> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
> >> +               complete(&ctx->work);
> >> +       spin_unlock(&user->lock);
> >> +}
> >
> > Can you explain here what it means to have multiple 'users' for
> > a 'fastrpc_user' structure? Why are they all done at the same time?
> >
> This is the case where users need to be notified if the dsp goes down
> due to crash or shut down!

What is a 'user' then? My point is that it seems to refer to two
different things here. I assume 'fastrpc_user' is whoever
has opened the file descriptor.

> >
> >> +struct fastrpc_ioctl_invoke {
> >> +       uint32_t handle;        /* remote handle */
> >> +       uint32_t sc;            /* scalars describing the data */
> >> +       remote_arg_t *pra;      /* remote arguments list */
> >> +       int *fds;               /* fd list */
> >> +       unsigned int *attrs;    /* attribute list */
> >> +       unsigned int *crc;
> >> +};
> >
> > This seems too complex for an ioctl argument, with
> > multiple levels of pointer indirections. I'd normally
> > try to make each ioctl argument either a scalar, or a
> > structure with only fixed-length members.
> >
> I totally agree with you and many thanks for your expert inputs,
> May be something like below with fixed length members would work?
>
> struct fastrpc_remote_arg {
>         __u64 ptr;      /* buffer ptr */
>         __u64 len;      /* length */
>         __u32 fd;       /* dmabuf fd */
>         __u32 reserved1
>         __u64 reserved2
> };
>
> struct fastrpc_remote_fd {
>         __u64 fd;
>         __u64 reserved1
>         __u64 reserved2
>         __u64 reserved3
> };
>
> struct fastrpc_remote_attr {
>         __u64 attr;
>         __u64 reserved1
>         __u64 reserved2
>         __u64 reserved3
> };
>
> struct fastrpc_remote_crc {
>         __u64 crc;
>         __u64 reserved1
>         __u64 reserved2
>         __u64 reserved3
> };

I don't see a need to add extra reserved fields for structures
that are already naturally aligned here, e.g. in
fastrpc_remote_arg we need the 'reserved1' but not
the 'reserved2'.

>
> struct fastrpc_ioctl_invoke {
>         __u32 handle;
>         __u32 sc;
>         /* The minimum size is scalar_length * 32 */
>         struct fastrpc_remote_args *rargs;
>         struct fastrpc_remote_fd *fds;
>         struct fastrpc_remote_attr *attrs;
>         struct fastrpc_remote_crc *crc;
> };

Do these really have to be indirect then? Are they all
lists of variable length? How do you know how long?

      Arnd
Srinivas Kandagatla Nov. 30, 2018, 4:03 p.m. UTC | #4
On 30/11/18 15:08, Arnd Bergmann wrote:
> On Fri, Nov 30, 2018 at 4:01 PM Srinivas Kandagatla
> <srinivas.kandagatla@linaro.org> wrote:
>> Thanks Arnd for the review comments!
>> On 30/11/18 13:41, Arnd Bergmann wrote:
>>> On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
>>> <srinivas.kandagatla@linaro.org> wrote:
> 
>>>> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
>>>> +                                unsigned long arg)
>>>> +{
>>>> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
>>>> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
>>>> +       char __user *argp = (char __user *)arg;
>>>> +       int err;
>>>> +
>>>> +       if (!fl->sctx) {
>>>> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
>>>> +               if (!fl->sctx)
>>>> +                       return -ENOENT;
>>>> +       }
>>>
>>> Shouldn't that session be allocated during open()?
>>>
>> Yes, and no, we do not need context in all the cases. In cases like we
>> just want to allocate dmabuf.
> 
> Can you give an example what that would be good for?
> 

Currently the instance which does not need session is used as simple 
memory allocator (rpcmem), TBH, this is the side effect of trying to fit 
in with downstream application infrastructure which uses ion for android
usecases.

>>
>>>> +static void fastrpc_notify_users(struct fastrpc_user *user)
>>>> +{
>>>> +       struct fastrpc_invoke_ctx *ctx, *n;
>>>> +
>>>> +       spin_lock(&user->lock);
>>>> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
>>>> +               complete(&ctx->work);
>>>> +       spin_unlock(&user->lock);
>>>> +}
>>>
>>> Can you explain here what it means to have multiple 'users' for
>>> a 'fastrpc_user' structure? Why are they all done at the same time?

user is allocated on every open(). Having multiple users means that 
there is more than one compute session running on a given dsp.

The reason why all the users are notified here is because the dsp is 
going down, so all the compute sessions associated with it will not see 
any response from dsp, so any pending/waiting compute contexts are 
explicitly notified.

>>>
>> This is the case where users need to be notified if the dsp goes down
>> due to crash or shut down!
> 
> What is a 'user' then? My point is that it seems to refer to two
> different things here. I assume 'fastrpc_user' is whoever
> has opened the file descriptor.
> 
>>>
>>>> +struct fastrpc_ioctl_invoke {
>>>> +       uint32_t handle;        /* remote handle */
>>>> +       uint32_t sc;            /* scalars describing the data */
>>>> +       remote_arg_t *pra;      /* remote arguments list */
>>>> +       int *fds;               /* fd list */
>>>> +       unsigned int *attrs;    /* attribute list */
>>>> +       unsigned int *crc;
>>>> +};
>>>
>>> This seems too complex for an ioctl argument, with
>>> multiple levels of pointer indirections. I'd normally
>>> try to make each ioctl argument either a scalar, or a
>>> structure with only fixed-length members.
>>>
>> I totally agree with you and many thanks for your expert inputs,
>> May be something like below with fixed length members would work?
>>
>> struct fastrpc_remote_arg {
>>          __u64 ptr;      /* buffer ptr */
>>          __u64 len;      /* length */
>>          __u32 fd;       /* dmabuf fd */
>>          __u32 reserved1
>>          __u64 reserved2
>> };
>>
>> struct fastrpc_remote_fd {
>>          __u64 fd;
>>          __u64 reserved1
>>          __u64 reserved2
>>          __u64 reserved3
>> };
>>
>> struct fastrpc_remote_attr {
>>          __u64 attr;
>>          __u64 reserved1
>>          __u64 reserved2
>>          __u64 reserved3
>> };
>>
>> struct fastrpc_remote_crc {
>>          __u64 crc;
>>          __u64 reserved1
>>          __u64 reserved2
>>          __u64 reserved3
>> };
> 
> I don't see a need to add extra reserved fields for structures
> that are already naturally aligned here, e.g. in
> fastrpc_remote_arg we need the 'reserved1' but not
> the 'reserved2'.
Yes, I see, I overdone it!
Other idea, is, may be I can try to combine these into single structure 
something like:

struct fastrpc_invoke_arg {
	__u64 ptr;
	__u64 len;
	__u32 fd;
	__u32 reserved1;
	__u64 attr;
	__u64 crc;
};

struct fastrpc_ioctl_invoke {
	__u32 handle;
	__u32 sc;
	/* The minimum size is scalar_length * 32 */
	struct fastrpc_invoke_arg *args;
};

> 
>>
>> struct fastrpc_ioctl_invoke {
>>          __u32 handle;
>>          __u32 sc;
>>          /* The minimum size is scalar_length * 32 */
>>          struct fastrpc_remote_args *rargs;
>>          struct fastrpc_remote_fd *fds;
>>          struct fastrpc_remote_attr *attrs;
>>          struct fastrpc_remote_crc *crc;
>> };
> 
> Do these really have to be indirect then? Are they all
> lists of variable length? How do you know how long?
Yes, they are variable length and will be scalar length long.
Scalar length is derived from sc variable in this structure.

--srini


> 
>        Arnd
>
Arnd Bergmann Nov. 30, 2018, 4:19 p.m. UTC | #5
On Fri, Nov 30, 2018 at 5:03 PM Srinivas Kandagatla
<srinivas.kandagatla@linaro.org> wrote:
> On 30/11/18 15:08, Arnd Bergmann wrote:
> > On Fri, Nov 30, 2018 at 4:01 PM Srinivas Kandagatla
> > <srinivas.kandagatla@linaro.org> wrote:
> >> Thanks Arnd for the review comments!
> >> On 30/11/18 13:41, Arnd Bergmann wrote:
> >>> On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
> >>> <srinivas.kandagatla@linaro.org> wrote:
> >
> >>>> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
> >>>> +                                unsigned long arg)
> >>>> +{
> >>>> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
> >>>> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
> >>>> +       char __user *argp = (char __user *)arg;
> >>>> +       int err;
> >>>> +
> >>>> +       if (!fl->sctx) {
> >>>> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
> >>>> +               if (!fl->sctx)
> >>>> +                       return -ENOENT;
> >>>> +       }
> >>>
> >>> Shouldn't that session be allocated during open()?
> >>>
> >> Yes, and no, we do not need context in all the cases. In cases like we
> >> just want to allocate dmabuf.
> >
> > Can you give an example what that would be good for?
> >
>
> Currently the instance which does not need session is used as simple
> memory allocator (rpcmem), TBH, this is the side effect of trying to fit
> in with downstream application infrastructure which uses ion for andriod
usecases for android.

That does not sound like enough of a reason then, user space is
easy to change here to just allocate the memory from the device itself.
The only reason that I can see for needing a dmabuf would be if
you have to share a buffer between two instances, and then you
can use either of them.

> >>>> +static void fastrpc_notify_users(struct fastrpc_user *user)
> >>>> +{
> >>>> +       struct fastrpc_invoke_ctx *ctx, *n;
> >>>> +
> >>>> +       spin_lock(&user->lock);
> >>>> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
> >>>> +               complete(&ctx->work);
> >>>> +       spin_unlock(&user->lock);
> >>>> +}
> >>>
> >>> Can you explain here what it means to have multiple 'users'
> >>> a 'fastrpc_user' structure? Why are they all done at the same time?
>
> user is allocated on every open(). Having multiple users means that
> there are more than one compute sessions running on a given dsp.
>
> They reason why all the users are notified here is because the dsp is
> going down, so all the compute sessions associated with it will not see
> any response from dsp, so any pending/waiting compute contexts are
> explicitly notified.

I don't get it yet. What are 'compute sessions'? Do you have
multiple threads running on a single instance at the same time?
I would have expected to only ever see one thread in the
'wait_for_completion()' above, and others possibly waiting
for a chance to get to but not already running.

> >> struct fastrpc_remote_crc {
> >>          __u64 crc;
> >>          __u64 reserved1
> >>          __u64 reserved2
> >>          __u64 reserved3
> >> };
> >
> > I don't see a need to add extra served fields for structures
> > that are already naturally aligned here, e.g. in
> > fastrpc_remote_arg we need the 'reserved1' but not
> > the 'reserved2'.
> Yes, I see, I overdone it!
> Other idea, is, may be I can try to combine these into single structure
> something like:
>
> struct fastrpc_invoke_arg {
>         __u64 ptr;
>         __u64 len;
>         __u32 fd;
>         __u32 reserved1
>         __u64 attr;
>         __u64 crc;
> };
>
> struct fastrpc_ioctl_invoke {
>         __u32 handle;
>         __u32 sc;
>         /* The minimum size is scalar_length * 32*/
>         struct fastrpc_invoke_args *args;
> };

That is still two structure, not one ;-)

> >> struct fastrpc_ioctl_invoke {
> >>          __u32 handle;
> >>          __u32 sc;
> >>          /* The minimum size is scalar_length * 32 */
> >>          struct fastrpc_remote_args *rargs;
> >>          struct fastrpc_remote_fd *fds;
> >>          struct fastrpc_remote_attr *attrs;
> >>          struct fastrpc_remote_crc *crc;
> >> };
> >
> > Do these really have to be indirect then? Are they all
> > lists of variable length? How do you know how long?
> Yes, they are variable length and will be scalar length long.
> Scalar length is derived from sc variable in this structure.

Do you mean we have a variable number 'sc', but each array
always has the same length as the other ones? In that
case: yes, combining them seems sensible.

The other question this raises is: what is 'handle'?
Why is the file descriptor not enough to identify the
instance we want to talk to?

      Arnd
Srinivas Kandagatla Nov. 30, 2018, 4:40 p.m. UTC | #6
On 30/11/18 16:19, Arnd Bergmann wrote:
> On Fri, Nov 30, 2018 at 5:03 PM Srinivas Kandagatla
> <srinivas.kandagatla@linaro.org> wrote:
>> On 30/11/18 15:08, Arnd Bergmann wrote:
>>> On Fri, Nov 30, 2018 at 4:01 PM Srinivas Kandagatla
>>> <srinivas.kandagatla@linaro.org> wrote:
>>>> Thanks Arnd for the review comments!
>>>> On 30/11/18 13:41, Arnd Bergmann wrote:
>>>>> On Fri, Nov 30, 2018 at 11:48 AM Srinivas Kandagatla
>>>>> <srinivas.kandagatla@linaro.org> wrote:
>>>
>>>>>> +static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
>>>>>> +                                unsigned long arg)
>>>>>> +{
>>>>>> +       struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
>>>>>> +       struct fastrpc_channel_ctx *cctx = fl->cctx;
>>>>>> +       char __user *argp = (char __user *)arg;
>>>>>> +       int err;
>>>>>> +
>>>>>> +       if (!fl->sctx) {
>>>>>> +               fl->sctx = fastrpc_session_alloc(cctx, 0);
>>>>>> +               if (!fl->sctx)
>>>>>> +                       return -ENOENT;
>>>>>> +       }
>>>>>
>>>>> Shouldn't that session be allocated during open()?
>>>>>
>>>> Yes, and no, we do not need context in all the cases. In cases like we
>>>> just want to allocate dmabuf.
>>>
>>> Can you give an example what that would be good for?
>>>
>>
>> Currently the instance which does not need session is used as simple
>> memory allocator (rpcmem), TBH, this is the side effect of trying to fit
>> in with downstream application infrastructure which uses ion for andriod
>> usecases.
> 
> That does not sound like enough of a reason then, user space is
> easy to change here to just allocate the memory from the device itself.
> The only reason that I can see for needing a dmabuf would be if
> you have to share a buffer between two instances, and then you
> can use either of them.

I agree, I will try rework this and remove the instances that does not 
require sessions!

Sharing buffer is also a reason for dmabuf here.

> 
>>>>>> +static void fastrpc_notify_users(struct fastrpc_user *user)
>>>>>> +{
>>>>>> +       struct fastrpc_invoke_ctx *ctx, *n;
>>>>>> +
>>>>>> +       spin_lock(&user->lock);
>>>>>> +       list_for_each_entry_safe(ctx, n, &user->pending, node)
>>>>>> +               complete(&ctx->work);
>>>>>> +       spin_unlock(&user->lock);
>>>>>> +}
>>>>>
>>>>> Can you explain here what it means to have multiple 'users'
>>>>> a 'fastrpc_user' structure? Why are they all done at the same time?
>>
>> user is allocated on every open(). Having multiple users means that
>> there are more than one compute sessions running on a given dsp.
>>
>> They reason why all the users are notified here is because the dsp is
>> going down, so all the compute sessions associated with it will not see
>> any response from dsp, so any pending/waiting compute contexts are
>> explicitly notified.
> 
> I don't get it yet. What are 'compute sessions'? Do you have
> multiple threads running on a single instance at the same time?

compute sessions are "compute context-banks" instances in DSP.

DSP supports multiple compute banks, Ideally 12 context banks, 4 which 
are reserved for other purposes and 8 of them are used for compute, one 
for each process. So ideally we can run 8 parallel computes.


> I would have expected to only ever see one thread in the
> 'wait_for_completion()' above, and others possibly waiting
> for a chance to get to but not already running.
> 
>>>> struct fastrpc_remote_crc {
>>>>           __u64 crc;
>>>>           __u64 reserved1
>>>>           __u64 reserved2
>>>>           __u64 reserved3
>>>> };
>>>
>>> I don't see a need to add extra served fields for structures
>>> that are already naturally aligned here, e.g. in
>>> fastrpc_remote_arg we need the 'reserved1' but not
>>> the 'reserved2'.
>> Yes, I see, I overdone it!
>> Other idea, is, may be I can try to combine these into single structure
>> something like:
>>
>> struct fastrpc_invoke_arg {
>>          __u64 ptr;
>>          __u64 len;
>>          __u32 fd;
>>          __u32 reserved1
>>          __u64 attr;
>>          __u64 crc;
>> };
>>
>> struct fastrpc_ioctl_invoke {
>>          __u32 handle;
>>          __u32 sc;
>>          /* The minimum size is scalar_length * 32*/
>>          struct fastrpc_invoke_args *args;
>> };
> 
> That is still two structure, not one ;-)
> 
>>>> struct fastrpc_ioctl_invoke {
>>>>           __u32 handle;
>>>>           __u32 sc;
>>>>           /* The minimum size is scalar_length * 32 */
>>>>           struct fastrpc_remote_args *rargs;
>>>>           struct fastrpc_remote_fd *fds;
>>>>           struct fastrpc_remote_attr *attrs;
>>>>           struct fastrpc_remote_crc *crc;
>>>> };
>>>
>>> Do these really have to be indirect then? Are they all
>>> lists of variable length? How do you know how long?
>> Yes, they are variable length and will be scalar length long.
>> Scalar length is derived from sc variable in this structure.
> 
> Do you mean we have a variable number 'sc', but each array
> always has the same length as the other ones? In that
> case: yes, combining them seems sensible.
Yes thats what I meant!

> 
> The other question this raises is: what is 'handle'?
> Why is the file descriptor not enough to identify the
> instance we want to talk to?
This is remote handle to opened interface on which this method has to be 
invoked.
For example we are running a calculator application, calculator will 
have a unique handle on which calculate() method needs to be invoked.


thanks,
srini
> 
>        Arnd
>
diff mbox series

Patch

diff --git a/drivers/char/fastrpc.c b/drivers/char/fastrpc.c
index 97d8062eb3e1..5bb224adc24f 100644
--- a/drivers/char/fastrpc.c
+++ b/drivers/char/fastrpc.c
@@ -3,7 +3,9 @@ 
 // Copyright (c) 2018, Linaro Limited
 
 #include <linux/cdev.h>
+#include <linux/completion.h>
 #include <linux/device.h>
+#include <linux/dma-buf.h>
 #include <linux/dma-mapping.h>
 #include <linux/idr.h>
 #include <linux/list.h>
@@ -14,6 +16,7 @@ 
 #include <linux/rpmsg.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <uapi/linux/fastrpc.h>
 
 #define ADSP_DOMAIN_ID (0)
 #define MDSP_DOMAIN_ID (1)
@@ -21,10 +24,41 @@ 
 #define CDSP_DOMAIN_ID (3)
 #define FASTRPC_DEV_MAX		4 /* adsp, mdsp, slpi, cdsp*/
 #define FASTRPC_MAX_SESSIONS	9 /*8 compute, 1 cpz*/
+#define FASTRPC_ALIGN		128
+#define FASTRPC_MAX_FDLIST	16
+#define FASTRPC_MAX_CRCLIST	64
+#define FASTRPC_PHYS(p)	(p & 0xffffffff)
 #define FASTRPC_CTX_MAX (256)
 #define FASTRPC_CTXID_MASK (0xFF0)
 #define FASTRPC_DEVICE_NAME	"fastrpc"
 
+/* Retrieves number of input buffers from the scalars parameter */
+#define REMOTE_SCALARS_INBUFS(sc)        (((sc) >> 16) & 0x0ff)
+
+/* Retrieves number of output buffers from the scalars parameter */
+#define REMOTE_SCALARS_OUTBUFS(sc)       (((sc) >> 8) & 0x0ff)
+
+/* Retrieves number of input handles from the scalars parameter */
+#define REMOTE_SCALARS_INHANDLES(sc)     (((sc) >> 4) & 0x0f)
+
+/* Retrieves number of output handles from the scalars parameter */
+#define REMOTE_SCALARS_OUTHANDLES(sc)    ((sc) & 0x0f)
+
+#define REMOTE_SCALARS_LENGTH(sc)	(REMOTE_SCALARS_INBUFS(sc) +\
+					REMOTE_SCALARS_OUTBUFS(sc) +\
+					REMOTE_SCALARS_INHANDLES(sc) +\
+					REMOTE_SCALARS_OUTHANDLES(sc))
+
+#define FASTRPC_BUILD_SCALARS(attr, method, in, out, oin, oout) \
+		((((uint32_t)   (attr) & 0x7) << 29) | \
+		(((uint32_t) (method) & 0x1f) << 24) | \
+		(((uint32_t)     (in) & 0xff) << 16) | \
+		(((uint32_t)    (out) & 0xff) <<  8) | \
+		(((uint32_t)    (oin) & 0x0f) <<  4) | \
+		((uint32_t)   (oout) & 0x0f))
+
+#define FASTRPC_SCALARS(method, in, out) \
+		FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0)
 #define cdev_to_cctx(d) container_of(d, struct fastrpc_channel_ctx, cdev)
 
 static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp",
@@ -32,6 +66,82 @@  static const char *domains[FASTRPC_DEV_MAX] = { "adsp", "mdsp",
 static dev_t fastrpc_major;
 static struct class *fastrpc_class;
 
+struct fastrpc_invoke_header {
+	uint64_t ctx;		/* invoke caller context */
+	uint32_t handle;	/* handle to invoke */
+	uint32_t sc;		/* scalars structure describing the data */
+};
+
+struct fastrpc_phy_page {
+	uint64_t addr;		/* physical address */
+	uint64_t size;		/* size of contiguous region */
+};
+
+struct fastrpc_invoke_buf {
+	int num;		/* number of contiguous regions */
+	int pgidx;		/* index to start of contiguous region */
+};
+
+struct fastrpc_invoke {
+	struct fastrpc_invoke_header header;
+	struct fastrpc_phy_page page; /* list of pages address */
+};
+
+struct fastrpc_msg {
+	uint32_t pid;		/* process group id */
+	uint32_t tid;		/* thread id */
+	struct fastrpc_invoke invoke;
+};
+
+struct fastrpc_invoke_rsp {
+	uint64_t ctx;		/* invoke caller context */
+	int retval;		/* invoke return value */
+};
+
+struct fastrpc_buf {
+	struct fastrpc_user *fl;
+	struct device *dev;
+	void *virt;
+	uint64_t phys;
+	size_t size;
+};
+
+struct fastrpc_map {
+	struct list_head node;
+	struct fastrpc_user *fl;
+	int fd;
+	struct dma_buf *buf;
+	struct sg_table *table;
+	struct dma_buf_attachment *attach;
+	uint64_t phys;
+	size_t size;
+	uintptr_t va;
+	size_t len;
+	struct kref refcount;
+};
+
+struct fastrpc_invoke_ctx {
+	struct fastrpc_user *fl;
+	struct list_head node; /* list of ctxs */
+	struct completion work;
+	int retval;
+	int pid;
+	int tgid;
+	uint32_t sc;
+	struct fastrpc_msg msg;
+	uint64_t ctxid;
+	size_t used_sz;
+
+	remote_arg_t *lpra;
+	unsigned int *attrs;
+	int *fds;
+	uint32_t *crc;
+
+	remote_arg64_t *rpra;
+	struct fastrpc_map **maps;
+	struct fastrpc_buf *buf;
+};
+
 struct fastrpc_session_ctx {
 	struct device *dev;
 	int sid;
@@ -59,6 +169,7 @@  struct fastrpc_user {
 
 	struct fastrpc_channel_ctx *cctx;
 	struct fastrpc_session_ctx *sctx;
+	struct fastrpc_buf *init_mem;
 
 	int tgid;
 	int pd;
@@ -69,6 +180,590 @@  struct fastrpc_user {
 	struct device *dev;
 };
 
+static void fastrpc_free_map(struct kref *ref)
+{
+	struct fastrpc_map *map;
+
+	map = container_of(ref, struct fastrpc_map, refcount);
+
+	list_del(&map->node);
+
+	if (map->table) {
+		dma_buf_unmap_attachment(map->attach, map->table,
+				DMA_BIDIRECTIONAL);
+		dma_buf_detach(map->buf, map->attach);
+		dma_buf_put(map->buf);
+	}
+
+	kfree(map);
+}
+
+static void fastrpc_map_put(struct fastrpc_map *map)
+{
+	struct fastrpc_user *fl;
+
+	if (map) {
+		fl = map->fl;
+		mutex_lock(&fl->mutex);
+		kref_put(&map->refcount, fastrpc_free_map);
+		mutex_unlock(&fl->mutex);
+	}
+}
+
+static int fastrpc_map_get(struct fastrpc_user *fl, int fd,
+			     uintptr_t va, size_t len,
+			     struct fastrpc_map **ppmap)
+{
+	struct fastrpc_map *map = NULL, *n;
+
+	mutex_lock(&fl->mutex);
+	list_for_each_entry_safe(map, n, &fl->maps, node) {
+		if (map->fd == fd) {
+			kref_get(&map->refcount);
+			*ppmap = map;
+			mutex_unlock(&fl->mutex);
+			return 0;
+		}
+	}
+	mutex_unlock(&fl->mutex);
+
+	return -ENOENT;
+}
+
+static void fastrpc_buf_free(struct fastrpc_buf *buf)
+{
+	dma_free_coherent(buf->dev, buf->size, buf->virt,
+				FASTRPC_PHYS(buf->phys));
+	kfree(buf);
+}
+
+static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev,
+			     size_t size, struct fastrpc_buf **obuf)
+{
+	struct fastrpc_buf *buf;
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	buf->fl = fl;
+	buf->virt = NULL;
+	buf->phys = 0;
+	buf->size = size;
+	buf->dev = dev;
+
+	buf->virt = dma_alloc_coherent(dev, buf->size, (dma_addr_t *)&buf->phys,
+				       GFP_KERNEL);
+	if (!buf->virt)
+		return -ENOMEM;
+
+	if (fl->sctx && fl->sctx->sid)
+		buf->phys += ((uint64_t)fl->sctx->sid << 32);
+
+	*obuf = buf;
+
+	return 0;
+}
+
+static void fastrpc_context_free(struct fastrpc_invoke_ctx *ctx)
+{
+	struct fastrpc_channel_ctx *cctx = ctx->fl->cctx;
+	struct fastrpc_user *user = ctx->fl;
+	int scalars = REMOTE_SCALARS_LENGTH(ctx->sc);
+	int i;
+
+	spin_lock(&user->lock);
+	list_del(&ctx->node);
+	spin_unlock(&user->lock);
+
+	for (i = 0; i < scalars; i++) {
+		if (ctx->maps[i])
+			fastrpc_map_put(ctx->maps[i]);
+	}
+
+	if (ctx->buf)
+		fastrpc_buf_free(ctx->buf);
+
+	spin_lock(&cctx->lock);
+	idr_remove(&cctx->ctx_idr, ctx->ctxid >> 4);
+	spin_unlock(&cctx->lock);
+
+	kfree(ctx);
+}
+
+static struct fastrpc_invoke_ctx *fastrpc_context_alloc(
+					struct fastrpc_user *user,
+					uint32_t kernel,
+					struct fastrpc_ioctl_invoke *inv)
+{
+	struct fastrpc_channel_ctx *cctx = user->cctx;
+	struct fastrpc_invoke_ctx *ctx = NULL;
+	int bufs, size, ret;
+	int err = 0;
+
+	bufs = REMOTE_SCALARS_LENGTH(inv->sc);
+	size = (sizeof(*ctx->lpra) + sizeof(*ctx->maps) +
+		sizeof(*ctx->fds)  + sizeof(*ctx->attrs)) * bufs;
+
+	ctx = kzalloc(sizeof(*ctx) + size, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ctx->node);
+	ctx->fl = user;
+	ctx->maps = (struct fastrpc_map **)(&ctx[1]);
+	ctx->lpra = (remote_arg_t *)(&ctx->maps[bufs]);
+	ctx->fds = (int *)(&ctx->lpra[bufs]);
+	ctx->attrs = (unsigned int *)(&ctx->fds[bufs]);
+
+	if (!kernel) {
+		if (copy_from_user(ctx->lpra,
+				     (void const __user *)inv->pra,
+				     bufs * sizeof(*ctx->lpra))) {
+			err = -EFAULT;
+			goto err;
+		}
+
+		if (inv->fds) {
+			if (copy_from_user(ctx->fds,
+					     (void const __user *)inv->fds,
+					     bufs * sizeof(*ctx->fds))) {
+				err = -EFAULT;
+				goto err;
+			}
+		}
+		if (inv->attrs) {
+			if (copy_from_user(
+					ctx->attrs,
+					(void const __user *)inv->attrs,
+					bufs * sizeof(*ctx->attrs))) {
+				err = -EFAULT;
+				goto err;
+			}
+		}
+	} else {
+		memcpy(ctx->lpra, inv->pra, bufs * sizeof(*ctx->lpra));
+		if (inv->fds)
+			memcpy(ctx->fds, inv->fds,
+			       bufs * sizeof(*ctx->fds));
+		if (inv->attrs)
+			memcpy(ctx->attrs, inv->attrs,
+			       bufs * sizeof(*ctx->attrs));
+	}
+
+	ctx->crc = (uint32_t *)inv->crc;
+	ctx->sc = inv->sc;
+	ctx->retval = -1;
+	ctx->pid = current->pid;
+	ctx->tgid = user->tgid;
+	init_completion(&ctx->work);
+
+	spin_lock(&user->lock);
+	list_add_tail(&ctx->node, &user->pending);
+	spin_unlock(&user->lock);
+
+	spin_lock(&cctx->lock);
+	ret = idr_alloc_cyclic(&cctx->ctx_idr, ctx, 1,
+				FASTRPC_CTX_MAX, GFP_ATOMIC);
+	if (ret < 0) {
+		spin_unlock(&cctx->lock);
+		err = ret;
+		goto err_idr;
+	}
+	ctx->ctxid = ret << 4;
+	spin_unlock(&cctx->lock);
+
+	return ctx;
+err_idr:
+	spin_lock(&user->lock);
+	list_del(&ctx->node);
+	spin_unlock(&user->lock);
+err:
+	kfree(ctx);
+
+	return ERR_PTR(err);
+}
+
+static int fastrpc_map_create(struct fastrpc_user *fl, int fd, uintptr_t va,
+			       size_t len, struct fastrpc_map **ppmap)
+{
+	struct fastrpc_session_ctx *sess = fl->sctx;
+	struct fastrpc_map *map = NULL;
+	int err = 0;
+
+	if (!fastrpc_map_get(fl, fd, va, len, ppmap))
+		return 0;
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&map->node);
+	map->fl = fl;
+	map->fd = fd;
+	map->buf = dma_buf_get(fd);
+	if (!map->buf) {
+		err = -EINVAL;
+		goto get_err;
+	}
+
+	map->attach = dma_buf_attach(map->buf, sess->dev);
+	if (IS_ERR(map->attach)) {
+		dev_err(sess->dev, "Failed to attach dmabuf\n");
+		err = PTR_ERR(map->attach);
+		goto attach_err;
+	}
+
+	map->table = dma_buf_map_attachment(map->attach,
+					    DMA_BIDIRECTIONAL);
+	if (IS_ERR(map->table)) {
+		err = PTR_ERR(map->table);
+		goto map_err;
+	}
+
+	map->phys = sg_dma_address(map->table->sgl);
+	map->phys += ((uint64_t)fl->sctx->sid << 32);
+	map->size = len;
+	map->va = (uintptr_t)sg_virt(map->table->sgl);
+	map->len = len;
+	kref_init(&map->refcount);
+
+	spin_lock(&fl->lock);
+	list_add_tail(&map->node, &fl->maps);
+	spin_unlock(&fl->lock);
+	*ppmap = map;
+
+	return 0;
+
+map_err:
+	dma_buf_detach(map->buf, map->attach);
+attach_err:
+	dma_buf_put(map->buf);
+get_err:
+	kfree(map);
+
+	return err;
+}
+
+static inline struct fastrpc_invoke_buf *fastrpc_invoke_buf_start(
+							remote_arg64_t *pra,
+							uint32_t sc)
+{
+	return (struct fastrpc_invoke_buf *)(&pra[REMOTE_SCALARS_LENGTH(sc)]);
+}
+
+static inline struct fastrpc_phy_page *fastrpc_phy_page_start(uint32_t sc,
+						struct fastrpc_invoke_buf *buf)
+{
+	return (struct fastrpc_phy_page *)(&buf[REMOTE_SCALARS_LENGTH(sc)]);
+}
+
+static int fastrpc_get_args(uint32_t kernel, struct fastrpc_invoke_ctx *ctx)
+{
+	remote_arg64_t *rpra;
+	remote_arg_t *lpra = ctx->lpra;
+	struct fastrpc_invoke_buf *list;
+	struct fastrpc_phy_page *pages;
+	uint32_t sc = ctx->sc;
+	uintptr_t args;
+	size_t rlen = 0, copylen = 0, metalen = 0;
+	int inbufs, handles, bufs, i, err = 0;
+	uint64_t *fdlist;
+	uint32_t *crclist;
+
+	inbufs = REMOTE_SCALARS_INBUFS(sc);
+	bufs = inbufs + REMOTE_SCALARS_OUTBUFS(sc);
+	handles = REMOTE_SCALARS_INHANDLES(sc) + REMOTE_SCALARS_OUTHANDLES(sc);
+	metalen = (bufs + handles) * (sizeof(remote_arg64_t) +
+		  sizeof(struct fastrpc_invoke_buf) +
+		  sizeof(struct fastrpc_phy_page)) +
+		  sizeof(uint64_t) * FASTRPC_MAX_FDLIST +
+		  sizeof(uint32_t) * FASTRPC_MAX_CRCLIST;
+
+	copylen = metalen;
+
+	for (i = 0; i < bufs + handles; ++i) {
+		uintptr_t buf = (uintptr_t)lpra[i].buf.pv;
+		size_t len = lpra[i].buf.len;
+
+		if (i < bufs) {
+			if (ctx->fds[i] && (ctx->fds[i] != -1))
+				fastrpc_map_create(ctx->fl, ctx->fds[i], buf,
+						len, &ctx->maps[i]);
+
+			if (!len)
+				continue;
+
+			if (ctx->maps[i])
+				continue;
+
+			copylen = ALIGN(copylen, FASTRPC_ALIGN);
+			copylen += len;
+		} else {
+			err = fastrpc_map_create(ctx->fl, ctx->fds[i], 0,
+						  0, &ctx->maps[i]);
+			if (err)
+				goto bail;
+		}
+	}
+	ctx->used_sz = copylen;
+
+	/* allocate new buffer */
+	if (copylen) {
+		err = fastrpc_buf_alloc(ctx->fl, ctx->fl->sctx->dev,
+					copylen, &ctx->buf);
+		if (err)
+			goto bail;
+	}
+
+	/* copy metadata */
+	rpra = ctx->buf->virt;
+	ctx->rpra = rpra;
+	list = fastrpc_invoke_buf_start(rpra, sc);
+	pages = fastrpc_phy_page_start(sc, list);
+	args = (uintptr_t)ctx->buf->virt + metalen;
+	fdlist = (uint64_t *)&pages[bufs + handles];
+	memset(fdlist, 0, sizeof(uint32_t)*FASTRPC_MAX_FDLIST);
+	crclist = (uint32_t *)&fdlist[FASTRPC_MAX_FDLIST];
+	memset(crclist, 0, sizeof(uint32_t)*FASTRPC_MAX_CRCLIST);
+	rlen = copylen - metalen;
+
+	for (i = 0; i < bufs; ++i) {
+		struct fastrpc_map *map = ctx->maps[i];
+		size_t len = lpra[i].buf.len;
+		size_t mlen;
+
+		if (len)
+			list[i].num = 1;
+		else
+			list[i].num = 0;
+
+		list[i].pgidx = i;
+
+		rpra[i].buf.pv = 0;
+		rpra[i].buf.len = len;
+		if (!len)
+			continue;
+		if (map) {
+			uintptr_t offset = 0;
+			uint64_t num = roundup(len,
+					       PAGE_SIZE) / PAGE_SIZE;
+			int idx = list[i].pgidx;
+
+			pages[idx].addr = map->phys + offset;
+			pages[idx].size = num << PAGE_SHIFT;
+			rpra[i].buf.pv =
+				(uint64_t)((uintptr_t)lpra[i].buf.pv);
+		} else {
+			rlen -= ALIGN(args, FASTRPC_ALIGN) - args;
+			args = ALIGN(args, FASTRPC_ALIGN);
+			mlen = len;
+			if (rlen < mlen)
+				goto bail;
+
+			rpra[i].buf.pv = (args);
+			pages[list[i].pgidx].addr = ctx->buf->phys +
+							(copylen - rlen);
+			pages[list[i].pgidx].addr = pages[list[i].pgidx].addr &
+							PAGE_MASK;
+			pages[list[i].pgidx].size = roundup(len, PAGE_SIZE);
+
+			if (i < inbufs) {
+				if (!kernel) {
+					err = copy_from_user(
+					(void *)rpra[i].buf.pv,
+					(void const __user *)lpra[i].buf.pv,
+					len);
+					if (err)
+						goto bail;
+				} else {
+					memcpy((void *)rpra[i].buf.pv,
+					       lpra[i].buf.pv, len);
+				}
+			}
+			args = args + mlen;
+			rlen -= mlen;
+		}
+	}
+
+	for (i = bufs; i < handles; ++i) {
+		struct fastrpc_map *map = ctx->maps[i];
+		size_t len = lpra[i].buf.len;
+
+		if (len)
+			list[i].num = 1;
+		else
+			list[i].num = 0;
+
+		list[i].pgidx = i;
+
+		pages[i].addr = map->phys;
+		pages[i].size = map->size;
+		rpra[i].dma.fd = ctx->fds[i];
+		rpra[i].dma.len = len;
+		rpra[i].dma.offset = (uint32_t)(uintptr_t)lpra[i].buf.pv;
+	}
+
+bail:
+	return err;
+}
+
+static int fastrpc_put_args(struct fastrpc_invoke_ctx *ctx,
+			    uint32_t kernel, remote_arg_t *upra)
+{
+	remote_arg64_t *rpra = ctx->rpra;
+	int i, inbufs, outbufs, handles;
+	struct fastrpc_invoke_buf *list;
+	struct fastrpc_phy_page *pages;
+	struct fastrpc_map *mmap;
+	uint32_t sc = ctx->sc;
+	uint64_t *fdlist;
+	uint32_t *crclist;
+	int err = 0;
+
+	inbufs = REMOTE_SCALARS_INBUFS(sc);
+	outbufs = REMOTE_SCALARS_OUTBUFS(sc);
+	handles = REMOTE_SCALARS_INHANDLES(sc) + REMOTE_SCALARS_OUTHANDLES(sc);
+	list = fastrpc_invoke_buf_start(ctx->rpra, sc);
+	pages = fastrpc_phy_page_start(sc, list);
+	fdlist = (uint64_t *)(pages + inbufs + outbufs + handles);
+	crclist = (uint32_t *)(fdlist + FASTRPC_MAX_FDLIST);
+
+	for (i = inbufs; i < inbufs + outbufs; ++i) {
+		if (!ctx->maps[i]) {
+			if (!kernel)
+				err =
+				copy_to_user((void __user *)ctx->lpra[i].buf.pv,
+				       (void *)rpra[i].buf.pv, rpra[i].buf.len);
+			else
+				memcpy(ctx->lpra[i].buf.pv,
+				       (void *)rpra[i].buf.pv, rpra[i].buf.len);
+
+			if (err)
+				goto bail;
+		} else {
+			fastrpc_map_put(ctx->maps[i]);
+			ctx->maps[i] = NULL;
+		}
+	}
+
+	if (inbufs + outbufs + handles) {
+		for (i = 0; i < FASTRPC_MAX_FDLIST; i++) {
+			if (!fdlist[i])
+				break;
+			if (!fastrpc_map_get(ctx->fl, (int)fdlist[i], 0,
+					       0, &mmap))
+				fastrpc_map_put(mmap);
+		}
+	}
+
+	if (ctx->crc && crclist) {
+		if (!kernel)
+			err = copy_to_user((void __user *)ctx->crc, crclist,
+					FASTRPC_MAX_CRCLIST*sizeof(uint32_t));
+		else
+			memcpy(ctx->crc, crclist,
+					FASTRPC_MAX_CRCLIST*sizeof(uint32_t));
+	}
+
+bail:
+	return err;
+}
+
+static int fastrpc_invoke_send(struct fastrpc_session_ctx *sctx,
+			       struct fastrpc_invoke_ctx *ctx,
+			       uint32_t kernel, uint32_t handle)
+{
+	struct fastrpc_channel_ctx *cctx;
+	struct fastrpc_user *fl = ctx->fl;
+	struct fastrpc_msg *msg = &ctx->msg;
+
+	cctx = fl->cctx;
+	msg->pid = fl->tgid;
+	msg->tid = current->pid;
+
+	if (kernel)
+		msg->pid = 0;
+
+	msg->invoke.header.ctx = ctx->ctxid | fl->pd;
+	msg->invoke.header.handle = handle;
+	msg->invoke.header.sc = ctx->sc;
+	msg->invoke.page.addr = ctx->buf ? ctx->buf->phys : 0;
+	msg->invoke.page.size = roundup(ctx->used_sz, PAGE_SIZE);
+
+	return rpmsg_send(cctx->rpdev->ept, (void *)msg, sizeof(*msg));
+}
+
+static int fastrpc_internal_invoke(struct fastrpc_user *fl,
+				   uint32_t kernel,
+				   struct fastrpc_ioctl_invoke *inv)
+{
+	struct fastrpc_invoke_ctx *ctx = NULL;
+	int err = 0;
+
+	if (!fl->sctx)
+		return -EINVAL;
+
+	ctx = fastrpc_context_alloc(fl, kernel, inv);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (REMOTE_SCALARS_LENGTH(ctx->sc)) {
+		err = fastrpc_get_args(kernel, ctx);
+		if (err)
+			goto bail;
+	}
+
+	err = fastrpc_invoke_send(fl->sctx, ctx, kernel, inv->handle);
+	if (err)
+		goto bail;
+
+	err = wait_for_completion_interruptible(&ctx->work);
+	if (err)
+		goto bail;
+
+	err = ctx->retval;
+	if (err)
+		goto bail;
+
+	err = fastrpc_put_args(ctx, kernel, inv->pra);
+	if (err)
+		goto bail;
+bail:
+	if (ctx)
+		fastrpc_context_free(ctx);
+
+	return err;
+}
+static struct fastrpc_session_ctx *fastrpc_session_alloc(
+					struct fastrpc_channel_ctx *cctx,
+					int secure)
+{
+	struct fastrpc_session_ctx *session = NULL;
+	int i;
+
+	spin_lock(&cctx->lock);
+	for (i = 0; i < cctx->sesscount; i++) {
+		if (!cctx->session[i].used && cctx->session[i].valid &&
+				cctx->session[i].secure == secure) {
+			cctx->session[i].used = true;
+			session = &cctx->session[i];
+			break;
+		}
+	}
+	spin_unlock(&cctx->lock);
+
+	return session;
+}
+
+static void fastrpc_session_free(struct fastrpc_channel_ctx *cctx,
+				 struct fastrpc_session_ctx *session)
+{
+	spin_lock(&cctx->lock);
+	session->used = false;
+	spin_unlock(&cctx->lock);
+}
+
 static const struct of_device_id fastrpc_match_table[] = {
 	{ .compatible = "qcom,fastrpc-compute-cb", },
 	{}
@@ -78,11 +773,26 @@  static int fastrpc_device_release(struct inode *inode, struct file *file)
 {
 	struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
 	struct fastrpc_channel_ctx *cctx = cdev_to_cctx(inode->i_cdev);
+	struct fastrpc_invoke_ctx *ctx, *n;
+	struct fastrpc_map *map, *m;
 
 	spin_lock(&cctx->lock);
 	list_del(&fl->user);
 	spin_unlock(&cctx->lock);
 
+	if (fl->init_mem)
+		fastrpc_buf_free(fl->init_mem);
+
+	list_for_each_entry_safe(ctx, n, &fl->pending, node)
+		fastrpc_context_free(ctx);
+
+	list_for_each_entry_safe(map, m, &fl->maps, node)
+		fastrpc_map_put(map);
+
+	if (fl->sctx)
+		fastrpc_session_free(fl->cctx, fl->sctx);
+
+	mutex_destroy(&fl->mutex);
 	kfree(fl);
 	file->private_data = NULL;
 
@@ -116,9 +826,48 @@  static int fastrpc_device_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static long fastrpc_device_ioctl(struct file *file, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct fastrpc_user *fl = (struct fastrpc_user *)file->private_data;
+	struct fastrpc_channel_ctx *cctx = fl->cctx;
+	char __user *argp = (char __user *)arg;
+	int err;
+
+	if (!fl->sctx) {
+		fl->sctx = fastrpc_session_alloc(cctx, 0);
+		if (!fl->sctx)
+			return -ENOENT;
+	}
+
+	switch (cmd) {
+	case FASTRPC_IOCTL_INVOKE: {
+		struct fastrpc_ioctl_invoke inv;
+
+		inv.fds = NULL;
+		inv.attrs = NULL;
+		inv.crc = NULL;
+		err = copy_from_user(&inv, argp, sizeof(inv));
+		if (err)
+			goto bail;
+		err = fastrpc_internal_invoke(fl, 0, &inv);
+		if (err)
+			goto bail;
+		break;
+		}
+default:
+		err = -ENOTTY;
+		pr_info("bad ioctl: %d\n", cmd);
+		break;
+	}
+bail:
+	return err;
+}
+
 static const struct file_operations fastrpc_fops = {
 	.open = fastrpc_device_open,
 	.release = fastrpc_device_release,
+	.unlocked_ioctl = fastrpc_device_ioctl,
 };
 
 static int fastrpc_cb_probe(struct platform_device *pdev)
@@ -251,9 +1000,25 @@  static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
 	return err;
 }
 
+static void fastrpc_notify_users(struct fastrpc_user *user)
+{
+	struct fastrpc_invoke_ctx *ctx, *n;
+
+	spin_lock(&user->lock);
+	list_for_each_entry_safe(ctx, n, &user->pending, node)
+		complete(&ctx->work);
+	spin_unlock(&user->lock);
+}
+
 static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 {
 	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
+	struct fastrpc_user *user, *n;
+
+	spin_lock(&cctx->lock);
+	list_for_each_entry_safe(user, n, &cctx->users, user)
+		fastrpc_notify_users(user);
+	spin_unlock(&cctx->lock);
 
 	device_del(&cctx->dev);
 	put_device(&cctx->dev);
@@ -264,6 +1029,31 @@  static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev)
 static int fastrpc_rpmsg_callback(struct rpmsg_device *rpdev, void *data,
 				  int len, void *priv, u32 addr)
 {
+	struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev);
+	struct fastrpc_invoke_rsp *rsp = data;
+	struct fastrpc_invoke_ctx *ctx;
+	unsigned long flags;
+	int ctxid;
+
+	if (rsp && len < sizeof(*rsp)) {
+		dev_err(&rpdev->dev, "invalid response or context\n");
+		return -EINVAL;
+	}
+
+	ctxid = (uint32_t)((rsp->ctx & FASTRPC_CTXID_MASK) >> 4);
+
+	spin_lock_irqsave(&cctx->lock, flags);
+	ctx = idr_find(&cctx->ctx_idr, ctxid);
+	spin_unlock_irqrestore(&cctx->lock, flags);
+
+	if (!ctx) {
+		dev_err(&rpdev->dev, "No context ID matches response\n");
+		return -ENOENT;
+	}
+
+	ctx->retval = rsp->retval;
+	complete(&ctx->work);
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/fastrpc.h b/include/uapi/linux/fastrpc.h
new file mode 100644
index 000000000000..8fec66601337
--- /dev/null
+++ b/include/uapi/linux/fastrpc.h
@@ -0,0 +1,56 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __QCOM_FASTRPC_H__
+#define __QCOM_FASTRPC_H__
+
+#include <linux/types.h>
+
+#define FASTRPC_IOCTL_INVOKE	_IOWR('R', 3, struct fastrpc_ioctl_invoke)
+
+#define remote_arg64_t    union remote_arg64
+
+struct remote_buf64 {
+	uint64_t pv;
+	uint64_t len;
+};
+
+struct remote_dma_handle64 {
+	int fd;
+	uint32_t offset;
+	uint32_t len;
+};
+
+union remote_arg64 {
+	struct remote_buf64	buf;
+	struct remote_dma_handle64 dma;
+	uint32_t h;
+};
+
+#define remote_arg_t    union remote_arg
+
+struct remote_buf {
+	void *pv;		/* buffer pointer */
+	size_t len;		/* length of buffer */
+};
+
+struct remote_dma_handle {
+	int fd;
+	uint32_t offset;
+};
+
+union remote_arg {
+	struct remote_buf buf;	/* buffer info */
+	struct remote_dma_handle dma;
+	uint32_t h;		/* remote handle */
+};
+
+struct fastrpc_ioctl_invoke {
+	uint32_t handle;	/* remote handle */
+	uint32_t sc;		/* scalars describing the data */
+	remote_arg_t *pra;	/* remote arguments list */
+	int *fds;		/* fd list */
+	unsigned int *attrs;	/* attribute list */
+	unsigned int *crc;
+};
+
+#endif /* __QCOM_FASTRPC_H__ */