
[v3,4/4] io_uring: add support for zone-append

Message ID 1593974870-18919-5-git-send-email-joshi.k@samsung.com (mailing list archive)
State New, archived
Series: zone-append support in io-uring and aio

Commit Message

Kanchan Joshi July 5, 2020, 6:47 p.m. UTC
From: Selvakumar S <selvakuma.s1@samsung.com>

For zone-append, the block layer will return the zone-relative offset via
the ret2 argument of the ki_complete interface. Make changes to collect it
and send it to user-space using cqe->flags.

Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
---
 fs/io_uring.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

Comments

Jens Axboe July 5, 2020, 9 p.m. UTC | #1
On 7/5/20 12:47 PM, Kanchan Joshi wrote:
> From: Selvakumar S <selvakuma.s1@samsung.com>
> 
> For zone-append, block-layer will return zone-relative offset via ret2
> of ki_complete interface. Make changes to collect it, and send to
> user-space using cqe->flags.
> 
> Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> ---
>  fs/io_uring.c | 21 +++++++++++++++++++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 155f3d8..cbde4df 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -402,6 +402,8 @@ struct io_rw {
>  	struct kiocb			kiocb;
>  	u64				addr;
>  	u64				len;
> +	/* zone-relative offset for append, in sectors */
> +	u32			append_offset;
>  };

I don't like this very much at all. As it stands, the first cacheline
of io_kiocb is set aside for request-private data. io_rw is already
exactly 64 bytes, which means that you're now growing io_rw beyond
a cacheline and increasing the size of io_kiocb as a whole.

Maybe you can reuse io_rw->len for this, as that is only used on the
submission side of things.
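
A minimal sketch of the reuse being suggested here, assuming the 5.8-era
fs/io_uring.c layout (io_complete_rw_zone_append is a hypothetical helper,
not the posted patch):

static void io_complete_rw_zone_append(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	/*
	 * rw.len was consumed when the iovec was imported at submission,
	 * so by completion time it is dead space and can park the
	 * zone-relative offset (res2) until the CQE is posted with it
	 * in cqe->flags.
	 */
	req->rw.len = res2;
}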
Matthew Wilcox July 5, 2020, 9:09 p.m. UTC | #2
On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
> > From: Selvakumar S <selvakuma.s1@samsung.com>
> > 
> > For zone-append, block-layer will return zone-relative offset via ret2
> > of ki_complete interface. Make changes to collect it, and send to
> > user-space using cqe->flags.
> > 
> > Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
> > ---
> >  fs/io_uring.c | 21 +++++++++++++++++++--
> >  1 file changed, 19 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 155f3d8..cbde4df 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -402,6 +402,8 @@ struct io_rw {
> >  	struct kiocb			kiocb;
> >  	u64				addr;
> >  	u64				len;
> > +	/* zone-relative offset for append, in sectors */
> > +	u32			append_offset;
> >  };
> 
> I don't like this very much at all. As it stands, the first cacheline
> of io_kiocb is set aside for request-private data. io_rw is already
> exactly 64 bytes, which means that you're now growing io_rw beyond
> a cacheline and increasing the size of io_kiocb as a whole.
> 
> Maybe you can reuse io_rw->len for this, as that is only used on the
> submission side of things.

I'm surprised you aren't more upset by the abuse of cqe->flags for the
address.

What do you think to my idea of interpreting the user_data as being a
pointer to somewhere to store the address?  Obviously other things
can be stored after the address in the user_data.

Or we could have a separate flag to indicate that is how to interpret
the user_data.
Jens Axboe July 5, 2020, 9:12 p.m. UTC | #3
On 7/5/20 3:09 PM, Matthew Wilcox wrote:
> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>
>>> For zone-append, block-layer will return zone-relative offset via ret2
>>> of ki_complete interface. Make changes to collect it, and send to
>>> user-space using cqe->flags.
>>>
>>> Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
>>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>>> ---
>>>  fs/io_uring.c | 21 +++++++++++++++++++--
>>>  1 file changed, 19 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 155f3d8..cbde4df 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -402,6 +402,8 @@ struct io_rw {
>>>  	struct kiocb			kiocb;
>>>  	u64				addr;
>>>  	u64				len;
>>> +	/* zone-relative offset for append, in sectors */
>>> +	u32			append_offset;
>>>  };
>>
>> I don't like this very much at all. As it stands, the first cacheline
>> of io_kiocb is set aside for request-private data. io_rw is already
>> exactly 64 bytes, which means that you're now growing io_rw beyond
>> a cacheline and increasing the size of io_kiocb as a whole.
>>
>> Maybe you can reuse io_rw->len for this, as that is only used on the
>> submission side of things.
> 
> I'm surprised you aren't more upset by the abuse of cqe->flags for the
> address.

Yeah, it's not great either, but we have less leeway there in terms of
how much space is available to pass back extra data.

> What do you think to my idea of interpreting the user_data as being a
> pointer to somewhere to store the address?  Obviously other things
> can be stored after the address in the user_data.

I don't like that at all, as all other commands just pass user_data
through. This means the application would have to treat this very
differently, and potentially not have a way to store any data for
locating the original command on the user side.

> Or we could have a separate flag to indicate that is how to interpret
> the user_data.

I'd be vehemently against changing user_data in any shape or form.
It's to be passed through from sqe to cqe, that's how the command flow
works. It's never kernel generated, and it's also used as a key for
command lookup.
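
For reference, the pass-through contract described above is what lets an
application use user_data as its lookup key; a liburing-flavored sketch
(struct my_req, alloc_my_req(), and the ring setup are assumed):

struct my_req {
	int fd;			/* whatever the app needs to find its state */
	struct iovec iov;
};

struct my_req *req = alloc_my_req();		/* app-side allocator, assumed */
struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

io_uring_prep_readv(sqe, req->fd, &req->iov, 1, 0);
io_uring_sqe_set_data(sqe, req);		/* opaque cookie in */
io_uring_submit(&ring);

struct io_uring_cqe *cqe;
io_uring_wait_cqe(&ring, &cqe);
struct my_req *done = io_uring_cqe_get_data(cqe);	/* same cookie out */
io_uring_cqe_seen(&ring, cqe);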
Kanchan Joshi July 6, 2020, 1:58 p.m. UTC | #4
On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>
>> For zone-append, block-layer will return zone-relative offset via ret2
>> of ki_complete interface. Make changes to collect it, and send to
>> user-space using cqe->flags.
>>
>> Signed-off-by: Selvakumar S <selvakuma.s1@samsung.com>
>> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
>> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
>> Signed-off-by: Javier Gonzalez <javier.gonz@samsung.com>
>> ---
>>  fs/io_uring.c | 21 +++++++++++++++++++--
>>  1 file changed, 19 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index 155f3d8..cbde4df 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -402,6 +402,8 @@ struct io_rw {
>>  	struct kiocb			kiocb;
>>  	u64				addr;
>>  	u64				len;
>> +	/* zone-relative offset for append, in sectors */
>> +	u32			append_offset;
>>  };
>
>I don't like this very much at all. As it stands, the first cacheline
>of io_kiocb is set aside for request-private data. io_rw is already
>exactly 64 bytes, which means that you're now growing io_rw beyond
>a cacheline and increasing the size of io_kiocb as a whole.
>
>Maybe you can reuse io_rw->len for this, as that is only used on the
>submission side of things.

Yes, this will be good. Thanks.
Matthew Wilcox July 6, 2020, 2:10 p.m. UTC | #5
On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
> > On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
> >> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
> >>> From: Selvakumar S <selvakuma.s1@samsung.com>
> >>>
> >>> For zone-append, block-layer will return zone-relative offset via ret2
> >>> of ki_complete interface. Make changes to collect it, and send to
> >>> user-space using cqe->flags.

> > I'm surprised you aren't more upset by the abuse of cqe->flags for the
> > address.
> 
> Yeah, it's not great either, but we have less leeway there in terms of
> how much space is available to pass back extra data.
> 
> > What do you think to my idea of interpreting the user_data as being a
> > pointer to somewhere to store the address?  Obviously other things
> > can be stored after the address in the user_data.
> 
> I don't like that at all, as all other commands just pass user_data
> through. This means the application would have to treat this very
> differently, and potentially not have a way to store any data for
> locating the original command on the user side.

I think you misunderstood me.  You seem to have thought I meant
"use the user_data field to return the address" when I actually meant
"interpret the user_data field as a pointer to where userspace
wants the address stored".
Jens Axboe July 6, 2020, 2:27 p.m. UTC | #6
On 7/6/20 8:10 AM, Matthew Wilcox wrote:
> On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
>> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
>>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>
>>>>> For zone-append, block-layer will return zone-relative offset via ret2
>>>>> of ki_complete interface. Make changes to collect it, and send to
>>>>> user-space using cqe->flags.
> 
>>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
>>> address.
>>
>> Yeah, it's not great either, but we have less leeway there in terms of
>> how much space is available to pass back extra data.
>>
>>> What do you think to my idea of interpreting the user_data as being a
>>> pointer to somewhere to store the address?  Obviously other things
>>> can be stored after the address in the user_data.
>>
>> I don't like that at all, as all other commands just pass user_data
>> through. This means the application would have to treat this very
>> differently, and potentially not have a way to store any data for
>> locating the original command on the user side.
> 
> I think you misunderstood me.  You seem to have thought I meant
> "use the user_data field to return the address" when I actually meant
> "interpret the user_data field as a pointer to where userspace
> wants the address stored".

It's still somewhat weird to have user_data carry special meaning: you're
now having the kernel interpret it, while for every other command it's
just an opaque value that is passed through.

But it could of course work, and the app could embed the necessary
u32/u64 in some other structure that's persistent across IO. If it
doesn't have that, then it'd need to now have one allocated and freed
across the lifetime of the IO.

If we're going that route, it'd be better to define the write such that
you're passing in the necessary information upfront. In syscall terms,
then that'd be something ala:

ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
			off_t *offset, int flags);

where *offset is copied out when the write completes. That removes the
need to abuse user_data, with just providing the storage pointer for the
offset upfront.
Matthew Wilcox July 6, 2020, 2:32 p.m. UTC | #7
On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
> On 7/6/20 8:10 AM, Matthew Wilcox wrote:
> > On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
> >> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
> >>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
> >>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
> >>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
> >>>>>
> >>>>> For zone-append, block-layer will return zone-relative offset via ret2
> >>>>> of ki_complete interface. Make changes to collect it, and send to
> >>>>> user-space using cqe->flags.
> > 
> >>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
> >>> address.
> >>
> >> Yeah, it's not great either, but we have less leeway there in terms of
> >> how much space is available to pass back extra data.
> >>
> >>> What do you think to my idea of interpreting the user_data as being a
> >>> pointer to somewhere to store the address?  Obviously other things
> >>> can be stored after the address in the user_data.
> >>
> >> I don't like that at all, as all other commands just pass user_data
> >> through. This means the application would have to treat this very
> >> differently, and potentially not have a way to store any data for
> >> locating the original command on the user side.
> > 
> > I think you misunderstood me.  You seem to have thought I meant
> > "use the user_data field to return the address" when I actually meant
> > "interpret the user_data field as a pointer to where userspace
> > wants the address stored".
> 
> It's still somewhat weird to have user_data have special meaning, you're
> now having the kernel interpret it while every other command it's just
> an opaque that is passed through.
> 
> But it could of course work, and the app could embed the necessary
> u32/u64 in some other structure that's persistent across IO. If it
> doesn't have that, then it'd need to now have one allocated and freed
> across the lifetime of the IO.
> 
> If we're going that route, it'd be better to define the write such that
> you're passing in the necessary information upfront. In syscall terms,
> then that'd be something ala:
> 
> ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
> 			off_t *offset, int flags);
> 
> where *offset is copied out when the write completes. That removes the
> need to abuse user_data, with just providing the storage pointer for the
> offset upfront.

That works for me!  In io_uring terms, would you like to see that done
as adding:

        union {
                __u64   off;    /* offset into file */
+		__u64   *offp;	/* appending writes */
                __u64   addr2;
        };
Jens Axboe July 6, 2020, 2:33 p.m. UTC | #8
On 7/6/20 8:32 AM, Matthew Wilcox wrote:
> On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
>> On 7/6/20 8:10 AM, Matthew Wilcox wrote:
>>> On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
>>>> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
>>>>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>>>>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>
>>>>>>> For zone-append, block-layer will return zone-relative offset via ret2
>>>>>>> of ki_complete interface. Make changes to collect it, and send to
>>>>>>> user-space using cqe->flags.
>>>
>>>>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
>>>>> address.
>>>>
>>>> Yeah, it's not great either, but we have less leeway there in terms of
>>>> how much space is available to pass back extra data.
>>>>
>>>>> What do you think to my idea of interpreting the user_data as being a
>>>>> pointer to somewhere to store the address?  Obviously other things
>>>>> can be stored after the address in the user_data.
>>>>
>>>> I don't like that at all, as all other commands just pass user_data
>>>> through. This means the application would have to treat this very
>>>> differently, and potentially not have a way to store any data for
>>>> locating the original command on the user side.
>>>
>>> I think you misunderstood me.  You seem to have thought I meant
>>> "use the user_data field to return the address" when I actually meant
>>> "interpret the user_data field as a pointer to where userspace
>>> wants the address stored".
>>
>> It's still somewhat weird to have user_data have special meaning, you're
>> now having the kernel interpret it while every other command it's just
>> an opaque that is passed through.
>>
>> But it could of course work, and the app could embed the necessary
>> u32/u64 in some other structure that's persistent across IO. If it
>> doesn't have that, then it'd need to now have one allocated and freed
>> across the lifetime of the IO.
>>
>> If we're going that route, it'd be better to define the write such that
>> you're passing in the necessary information upfront. In syscall terms,
>> then that'd be something ala:
>>
>> ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
>> 			off_t *offset, int flags);
>>
>> where *offset is copied out when the write completes. That removes the
>> need to abuse user_data, with just providing the storage pointer for the
>> offset upfront.
> 
> That works for me!  In io_uring terms, would you like to see that done
> as adding:
> 
>         union {
>                 __u64   off;    /* offset into file */
> +		__u64   *offp;	/* appending writes */
>                 __u64   addr2;
>         };
> 

Either that, or just use addr2 for it directly. I consider the appending
writes a marginal enough use case that it doesn't really warrant adding
a specially named field for that.
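
From the application side, the addr2 variant under discussion might look
like the liburing-flavored sketch below (proposed ABI only; zone_start,
cookie, and the ring/iovec setup are assumed). Since addr2 aliases off in
the sqe, the starting offset travels through the pointed-to location
instead:

loff_t append_off = zone_start;	/* in: intended offset, out: where it landed */
struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

io_uring_prep_writev(sqe, fd, &iov, 1, 0);
sqe->rw_flags = RWF_APPEND;			/* or a new flag, discussed later */
sqe->addr2 = (__u64)(uintptr_t)&append_off;	/* result storage, upfront */
io_uring_sqe_set_data(sqe, cookie);		/* user_data stays opaque */
io_uring_submit(&ring);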
Kanchan Joshi July 7, 2020, 3:11 p.m. UTC | #9
On Mon, Jul 06, 2020 at 03:32:08PM +0100, Matthew Wilcox wrote:
>On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
>> On 7/6/20 8:10 AM, Matthew Wilcox wrote:
>> > On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
>> >> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
>> >>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>> >>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>> >>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>> >>>>>
>> >>>>> For zone-append, block-layer will return zone-relative offset via ret2
>> >>>>> of ki_complete interface. Make changes to collect it, and send to
>> >>>>> user-space using cqe->flags.
>> >
>> >>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
>> >>> address.

The documentation (https://kernel.dk/io_uring.pdf) mentions that cqe->flags
can carry metadata for the operation. I wonder if this should be called abuse.

>> >> Yeah, it's not great either, but we have less leeway there in terms of
>> >> how much space is available to pass back extra data.
>> >>
>> >>> What do you think to my idea of interpreting the user_data as being a
>> >>> pointer to somewhere to store the address?  Obviously other things
>> >>> can be stored after the address in the user_data.
>> >>
>> >> I don't like that at all, as all other commands just pass user_data
>> >> through. This means the application would have to treat this very
>> >> differently, and potentially not have a way to store any data for
>> >> locating the original command on the user side.
>> >
>> > I think you misunderstood me.  You seem to have thought I meant
>> > "use the user_data field to return the address" when I actually meant
>> > "interpret the user_data field as a pointer to where userspace
>> > wants the address stored".
>>
>> It's still somewhat weird to have user_data have special meaning, you're
>> now having the kernel interpret it while every other command it's just
>> an opaque that is passed through.
>>
>> But it could of course work, and the app could embed the necessary
>> u32/u64 in some other structure that's persistent across IO. If it
>> doesn't have that, then it'd need to now have one allocated and freed
>> across the lifetime of the IO.
>>
>> If we're going that route, it'd be better to define the write such that
>> you're passing in the necessary information upfront. In syscall terms,
>> then that'd be something ala:
>>
>> ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
>> 			off_t *offset, int flags);
>>
>> where *offset is copied out when the write completes. That removes the
>> need to abuse user_data, with just providing the storage pointer for the
>> offset upfront.
>
>That works for me!  In io_uring terms, would you like to see that done
>as adding:
>
>        union {
>                __u64   off;    /* offset into file */
>+		__u64   *offp;	/* appending writes */
>                __u64   addr2;
>        };
But are there performance implications to this approach?
If I got it right, the workflow is:
- Application allocates 64 bits of space, writes "off" into it, and passes
  it in sqe->addr2
- Kernel first reads sqe->addr2, reads the value to know the intended
  write-location, and stores the address somewhere (?) to be used during
  completion. Storing this address seems tricky, as it may add one more
  cacheline (in io_kiocb->rw)?
- During completion, cqe res/flags are written as before, plus an extra
  step to copy the append-completion result into that user-space address.

The extra steps are due to the pointer indirection.
And it seems the application needs to be careful about managing these 64
bits of space for a cluster of writes, especially if it wants to reuse the
sqe before the completion.
The new scheme can handle a 64-bit result cleanly, but seems slower than
the current one.
It would be good to have the trade-off cleared up before we take things to v4.
Matthew Wilcox July 7, 2020, 3:52 p.m. UTC | #10
On Tue, Jul 07, 2020 at 08:41:05PM +0530, Kanchan Joshi wrote:
> On Mon, Jul 06, 2020 at 03:32:08PM +0100, Matthew Wilcox wrote:
> > On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
> > > On 7/6/20 8:10 AM, Matthew Wilcox wrote:
> > > > On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
> > > >> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
> > > >>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
> > > >>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
> > > >>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
> > > >>>>>
> > > >>>>> For zone-append, block-layer will return zone-relative offset via ret2
> > > >>>>> of ki_complete interface. Make changes to collect it, and send to
> > > >>>>> user-space using cqe->flags.
> > > >
> > > >>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
> > > >>> address.
> 
> Documentation (https://kernel.dk/io_uring.pdf) mentioned cqe->flags can carry
> the metadata for the operation. I wonder if this should be called abuse.
> 
> > > >> Yeah, it's not great either, but we have less leeway there in terms of
> > > >> how much space is available to pass back extra data.
> > > >>
> > > >>> What do you think to my idea of interpreting the user_data as being a
> > > >>> pointer to somewhere to store the address?  Obviously other things
> > > >>> can be stored after the address in the user_data.
> > > >>
> > > >> I don't like that at all, as all other commands just pass user_data
> > > >> through. This means the application would have to treat this very
> > > >> differently, and potentially not have a way to store any data for
> > > >> locating the original command on the user side.
> > > >
> > > > I think you misunderstood me.  You seem to have thought I meant
> > > > "use the user_data field to return the address" when I actually meant
> > > > "interpret the user_data field as a pointer to where userspace
> > > > wants the address stored".
> > > 
> > > It's still somewhat weird to have user_data have special meaning, you're
> > > now having the kernel interpret it while every other command it's just
> > > an opaque that is passed through.
> > > 
> > > But it could of course work, and the app could embed the necessary
> > > u32/u64 in some other structure that's persistent across IO. If it
> > > doesn't have that, then it'd need to now have one allocated and freed
> > > across the lifetime of the IO.
> > > 
> > > If we're going that route, it'd be better to define the write such that
> > > you're passing in the necessary information upfront. In syscall terms,
> > > then that'd be something ala:
> > > 
> > > ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
> > > 			off_t *offset, int flags);
> > > 
> > > where *offset is copied out when the write completes. That removes the
> > > need to abuse user_data, with just providing the storage pointer for the
> > > offset upfront.
> > 
> > That works for me!  In io_uring terms, would you like to see that done
> > as adding:
> > 
> >        union {
> >                __u64   off;    /* offset into file */
> > +		__u64   *offp;	/* appending writes */
> >                __u64   addr2;
> >        };
> But are there performance implications to this approach?
> If I got it right, the workflow is:
> - Application allocates 64 bits of space, writes "off" into it, and passes
>   it in sqe->addr2
> - Kernel first reads sqe->addr2, reads the value to know the intended
>   write-location, and stores the address somewhere (?) to be used during
>   completion. Storing this address seems tricky, as it may add one more
>   cacheline (in io_kiocb->rw)?

io_kiocb is:
        /* size: 232, cachelines: 4, members: 19 */
        /* forced alignments: 1 */
        /* last cacheline: 40 bytes */
so we have another 24 bytes before io_kiocb takes up another cacheline.
If that's a serious problem, I have an idea about how to shrink struct
kiocb by 8 bytes so struct io_rw would have space to store another
pointer.

> - During completion cqe res/flags are written as before, but extra step
>  to copy the append-completion-result into that user-space address.
> Extra steps are due to the pointer indirection.

... we've just done an I/O.  Concern about an extra pointer access
seems misplaced?

> And it seems application needs to be careful about managing this 64bit of
> space for a cluster of writes, especially if it wants to reuse the sqe
> before the completion.
> New one can handle 64bit result cleanly, but seems slower than current
> one.

But userspace has to _do_ something with that information anyway.  So
it must already have somewhere to put that information.

I do think that interpretation of that field should be a separate flag
from WRITE_APPEND so apps which genuinely don't care about where the I/O
ended up don't have to allocate some temporary storage.  eg a logging
application which just needs to know that it managed to append to the
end of the log and doesn't need to do anything if it's successful.
Christoph Hellwig July 7, 2020, 4 p.m. UTC | #11
On Tue, Jul 07, 2020 at 04:52:37PM +0100, Matthew Wilcox wrote:
> But userspace has to _do_ something with that information anyway.  So
> it must already have somewhere to put that information.
> 
> I do think that interpretation of that field should be a separate flag
> from WRITE_APPEND so apps which genuinely don't care about where the I/O
> ended up don't have to allocate some temporary storage.  eg a logging
> application which just needs to know that it managed to append to the
> end of the log and doesn't need to do anything if it's successful.

I agree with the concept of a flag for just returning the write
location.  Because if you don't need that, O_APPEND is all you need
anyway.
Kanchan Joshi July 7, 2020, 8:23 p.m. UTC | #12
On Tue, Jul 07, 2020 at 04:52:37PM +0100, Matthew Wilcox wrote:
>On Tue, Jul 07, 2020 at 08:41:05PM +0530, Kanchan Joshi wrote:
>> On Mon, Jul 06, 2020 at 03:32:08PM +0100, Matthew Wilcox wrote:
>> > On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
>> > > On 7/6/20 8:10 AM, Matthew Wilcox wrote:
>> > > > On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
>> > > >> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
>> > > >>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>> > > >>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>> > > >>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>> > > >>>>>
>> > > >>>>> For zone-append, block-layer will return zone-relative offset via ret2
>> > > >>>>> of ki_complete interface. Make changes to collect it, and send to
>> > > >>>>> user-space using cqe->flags.
>> > > >
>> > > >>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
>> > > >>> address.
>>
>> Documentation (https://kernel.dk/io_uring.pdf) mentioned cqe->flags can carry
>> the metadata for the operation. I wonder if this should be called abuse.
>>
>> > > >> Yeah, it's not great either, but we have less leeway there in terms of
>> > > >> how much space is available to pass back extra data.
>> > > >>
>> > > >>> What do you think to my idea of interpreting the user_data as being a
>> > > >>> pointer to somewhere to store the address?  Obviously other things
>> > > >>> can be stored after the address in the user_data.
>> > > >>
>> > > >> I don't like that at all, as all other commands just pass user_data
>> > > >> through. This means the application would have to treat this very
>> > > >> differently, and potentially not have a way to store any data for
>> > > >> locating the original command on the user side.
>> > > >
>> > > > I think you misunderstood me.  You seem to have thought I meant
>> > > > "use the user_data field to return the address" when I actually meant
>> > > > "interpret the user_data field as a pointer to where userspace
>> > > > wants the address stored".
>> > >
>> > > It's still somewhat weird to have user_data have special meaning, you're
>> > > now having the kernel interpret it while every other command it's just
>> > > an opaque that is passed through.
>> > >
>> > > But it could of course work, and the app could embed the necessary
>> > > u32/u64 in some other structure that's persistent across IO. If it
>> > > doesn't have that, then it'd need to now have one allocated and freed
>> > > across the lifetime of the IO.
>> > >
>> > > If we're going that route, it'd be better to define the write such that
>> > > you're passing in the necessary information upfront. In syscall terms,
>> > > then that'd be something ala:
>> > >
>> > > ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
>> > > 			off_t *offset, int flags);
>> > >
>> > > where *offset is copied out when the write completes. That removes the
>> > > need to abuse user_data, with just providing the storage pointer for the
>> > > offset upfront.
>> >
>> > That works for me!  In io_uring terms, would you like to see that done
>> > as adding:
>> >
>> >        union {
>> >                __u64   off;    /* offset into file */
>> > +		__u64   *offp;	/* appending writes */
>> >                __u64   addr2;
>> >        };
>> But are there performance implications to this approach?
>> If I got it right, the workflow is:
>> - Application allocates 64 bits of space, writes "off" into it, and passes
>>   it in sqe->addr2
>> - Kernel first reads sqe->addr2, reads the value to know the intended
>>   write-location, and stores the address somewhere (?) to be used during
>>   completion. Storing this address seems tricky, as it may add one more
>>   cacheline (in io_kiocb->rw)?
>
>io_kiocb is:
>        /* size: 232, cachelines: 4, members: 19 */
>        /* forced alignments: 1 */
>        /* last cacheline: 40 bytes */
>so we have another 24 bytes before io_kiocb takes up another cacheline.
>If that's a serious problem, I have an idea about how to shrink struct
>kiocb by 8 bytes so struct io_rw would have space to store another
>pointer.
Yes, io_kiocb has room. Whether that is fine cache-locality-wise, or
whether it must be placed within io_rw, is something I'll find out once I
get to implementing this. Please share the idea you have; it could come in
handy.

>> - During completion cqe res/flags are written as before, but extra step
>>  to copy the append-completion-result into that user-space address.
>> Extra steps are due to the pointer indirection.
>
>... we've just done an I/O.  Concern about an extra pointer access
>seems misplaced?

I was thinking about both the read-from (submission) and the write-to
(completion) of the user-space pointer, and all those checks APIs
(get_user, copy_from_user) perform... but when seen against an I/O (a
direct one at that), it does look small. Down the line it may matter for
cached I/O, but I get your point.

>> And it seems application needs to be careful about managing this 64bit of
>> space for a cluster of writes, especially if it wants to reuse the sqe
>> before the completion.
>> New one can handle 64bit result cleanly, but seems slower than current
>> one.
>
>But userspace has to _do_ something with that information anyway.  So
>it must already have somewhere to put that information.

Right. But it is part of the SQE/CQE currently, and in the new scheme it
gets decoupled and needs to be managed separately.
>I do think that interpretation of that field should be a separate flag
>from WRITE_APPEND so apps which genuinely don't care about where the I/O
>ended up don't have to allocate some temporary storage.  eg a logging
>application which just needs to know that it managed to append to the
>end of the log and doesn't need to do anything if it's successful.
Would you want that new flag to do what RWF_APPEND does as well?
In v2, we had a separate flag RWF_ZONE_APPEND and did not use the
file-append infrastructure at all. The thought was: apps using the new
flag will always look at where the write landed.

In v3, I've been looking at this as:
zone-append = file-append + something-extra-to-know-where-the-write-landed.
We see to it that the something-extra does not get executed for regular
file/block-device append (ref: FMODE_ZONE_APPEND, patch 1)... and the
existing behavior (the one you described for the logging application) is
retained. But on a zoned device/file, file-append becomes zone-append, all
the time.
Jens Axboe July 7, 2020, 8:40 p.m. UTC | #13
On 7/7/20 2:23 PM, Kanchan Joshi wrote:
> On Tue, Jul 07, 2020 at 04:52:37PM +0100, Matthew Wilcox wrote:
>> On Tue, Jul 07, 2020 at 08:41:05PM +0530, Kanchan Joshi wrote:
>>> On Mon, Jul 06, 2020 at 03:32:08PM +0100, Matthew Wilcox wrote:
>>>> On Mon, Jul 06, 2020 at 08:27:17AM -0600, Jens Axboe wrote:
>>>>> On 7/6/20 8:10 AM, Matthew Wilcox wrote:
>>>>>> On Sun, Jul 05, 2020 at 03:12:50PM -0600, Jens Axboe wrote:
>>>>>>> On 7/5/20 3:09 PM, Matthew Wilcox wrote:
>>>>>>>> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>>>>>>>>> On 7/5/20 12:47 PM, Kanchan Joshi wrote:
>>>>>>>>>> From: Selvakumar S <selvakuma.s1@samsung.com>
>>>>>>>>>>
>>>>>>>>>> For zone-append, block-layer will return zone-relative offset via ret2
>>>>>>>>>> of ki_complete interface. Make changes to collect it, and send to
>>>>>>>>>> user-space using cqe->flags.
>>>>>>
>>>>>>>> I'm surprised you aren't more upset by the abuse of cqe->flags for the
>>>>>>>> address.
>>>
>>> Documentation (https://kernel.dk/io_uring.pdf) mentioned cqe->flags can carry
>>> the metadata for the operation. I wonder if this should be called abuse.
>>>
>>>>>>> Yeah, it's not great either, but we have less leeway there in terms of
>>>>>>> how much space is available to pass back extra data.
>>>>>>>
>>>>>>>> What do you think to my idea of interpreting the user_data as being a
>>>>>>>> pointer to somewhere to store the address?  Obviously other things
>>>>>>>> can be stored after the address in the user_data.
>>>>>>>
>>>>>>> I don't like that at all, as all other commands just pass user_data
>>>>>>> through. This means the application would have to treat this very
>>>>>>> differently, and potentially not have a way to store any data for
>>>>>>> locating the original command on the user side.
>>>>>>
>>>>>> I think you misunderstood me.  You seem to have thought I meant
>>>>>> "use the user_data field to return the address" when I actually meant
>>>>>> "interpret the user_data field as a pointer to where userspace
>>>>>> wants the address stored".
>>>>>
>>>>> It's still somewhat weird to have user_data have special meaning, you're
>>>>> now having the kernel interpret it while every other command it's just
>>>>> an opaque that is passed through.
>>>>>
>>>>> But it could of course work, and the app could embed the necessary
>>>>> u32/u64 in some other structure that's persistent across IO. If it
>>>>> doesn't have that, then it'd need to now have one allocated and freed
>>>>> across the lifetime of the IO.
>>>>>
>>>>> If we're going that route, it'd be better to define the write such that
>>>>> you're passing in the necessary information upfront. In syscall terms,
>>>>> then that'd be something ala:
>>>>>
>>>>> ssize_t my_append_write(int fd, const struct iovec *iov, int iovcnt,
>>>>> 			off_t *offset, int flags);
>>>>>
>>>>> where *offset is copied out when the write completes. That removes the
>>>>> need to abuse user_data, with just providing the storage pointer for the
>>>>> offset upfront.
>>>>
>>>> That works for me!  In io_uring terms, would you like to see that done
>>>> as adding:
>>>>
>>>>        union {
>>>>                __u64   off;    /* offset into file */
>>>> +		__u64   *offp;	/* appending writes */
>>>>                __u64   addr2;
>>>>        };
>>> But are there performance implications to this approach?
>>> If I got it right, the workflow is:
>>> - Application allocates 64 bits of space, writes "off" into it, and passes
>>>   it in sqe->addr2
>>> - Kernel first reads sqe->addr2, reads the value to know the intended
>>>   write-location, and stores the address somewhere (?) to be used during
>>>   completion. Storing this address seems tricky, as it may add one more
>>>   cacheline (in io_kiocb->rw)?
>>
>> io_kiocb is:
>>        /* size: 232, cachelines: 4, members: 19 */
>>        /* forced alignments: 1 */
>>        /* last cacheline: 40 bytes */
>> so we have another 24 bytes before io_kiocb takes up another cacheline.
>> If that's a serious problem, I have an idea about how to shrink struct
>> kiocb by 8 bytes so struct io_rw would have space to store another
>> pointer.
> Yes, io_kiocb has room. Cache-locality wise whether that is fine or
> it must be placed within io_rw - I'll come to know once I get to
> implement this. Please share the idea you have, it can come handy.

Except it doesn't, I'm not interested in adding per-request type fields
to the generic part of it. Before we know it, we'll blow past the next
cacheline.

If we can find space in the kiocb, that'd be much better. Note that once
the async buffered bits go in for 5.9, then there's no longer a 4-byte
hole in struct kiocb.

>> ... we've just done an I/O.  Concern about an extra pointer access
>> seems misplaced?
> 
> I was thinking about both read-from (submission) and write-to
> (completion) from user-space pointer, and all those checks APIs
> (get_user, copy_from_user) perform.....but when seen against I/O (that
> too direct), it does look small. Down the line it may matter for cached-IO
> but I get your point. 

Really don't think that matters at all.
Matthew Wilcox July 7, 2020, 10:18 p.m. UTC | #14
On Tue, Jul 07, 2020 at 02:40:06PM -0600, Jens Axboe wrote:
> >> so we have another 24 bytes before io_kiocb takes up another cacheline.
> >> If that's a serious problem, I have an idea about how to shrink struct
> >> kiocb by 8 bytes so struct io_rw would have space to store another
> >> pointer.
> > Yes, io_kiocb has room. Cache-locality wise whether that is fine or
> > it must be placed within io_rw - I'll come to know once I get to
> > implement this. Please share the idea you have, it can come handy.
> 
> Except it doesn't, I'm not interested in adding per-request type fields
> to the generic part of it. Before we know it, we'll blow past the next
> cacheline.
> 
> If we can find space in the kiocb, that'd be much better. Note that once
> the async buffered bits go in for 5.9, then there's no longer a 4-byte
> hole in struct kiocb.

Well, poot, I was planning on using that.  OK, how about this:

+#define IOCB_NO_CMPL		(15 << 28)

 struct kiocb {
[...]
-	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
+	loff_t __user *ki_uposp;
-	int			ki_flags;
+	unsigned int		ki_flags;

+typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
+static ki_cmpl *ki_cmpls[15];	/* not const: slots are claimed at runtime */

+void ki_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	unsigned int id = iocb->ki_flags >> 28;
+
+	if (id < 15)
+		ki_cmpls[id](iocb, ret, ret2);
+}

+int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
+{
+	int i;
+
+	for (i = 0; i < 15; i++) {
+		if (ki_cmpls[i])
+			continue;
+		ki_cmpls[i] = cb;
+		return i;
+	}
+	WARN_ON(1);
+	return -1;
+}

... etc, also need an unregister.
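
The unregister half might look like this, under the same assumptions as the
sketch above (no locking shown, matching the original; the caller must
ensure no in-flight kiocb still carries the id):

+void kiocb_cmpl_unregister(int id)
+{
+	if (WARN_ON(id < 0 || id >= 15 || !ki_cmpls[id]))
+		return;
+	ki_cmpls[id] = NULL;
+}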
Jens Axboe July 7, 2020, 10:37 p.m. UTC | #15
On 7/7/20 4:18 PM, Matthew Wilcox wrote:
> On Tue, Jul 07, 2020 at 02:40:06PM -0600, Jens Axboe wrote:
>>>> so we have another 24 bytes before io_kiocb takes up another cacheline.
>>>> If that's a serious problem, I have an idea about how to shrink struct
>>>> kiocb by 8 bytes so struct io_rw would have space to store another
>>>> pointer.
>>> Yes, io_kiocb has room. Cache-locality wise whether that is fine or
>>> it must be placed within io_rw - I'll come to know once I get to
>>> implement this. Please share the idea you have, it can come handy.
>>
>> Except it doesn't, I'm not interested in adding per-request type fields
>> to the generic part of it. Before we know it, we'll blow past the next
>> cacheline.
>>
>> If we can find space in the kiocb, that'd be much better. Note that once
>> the async buffered bits go in for 5.9, then there's no longer a 4-byte
>> hole in struct kiocb.
> 
> Well, poot, I was planning on using that.  OK, how about this:

Figured you might have had your sights set on that one, which is why I
wanted to bring it up upfront :-)

> +#define IOCB_NO_CMPL		(15 << 28)
> 
>  struct kiocb {
> [...]
> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
> +	loff_t __user *ki_uposp;
> -	int			ki_flags;
> +	unsigned int		ki_flags;
> 
> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
> +static ki_cmpl * const ki_cmpls[15];
> 
> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> +	unsigned int id = iocb->ki_flags >> 28;
> +
> +	if (id < 15)
> +		ki_cmpls[id](iocb, ret, ret2);
> +}
> 
> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
> +{
> +	for (i = 0; i < 15; i++) {
> +		if (ki_cmpls[id])
> +			continue;
> +		ki_cmpls[id] = cb;
> +		return id;
> +	}
> +	WARN();
> +	return -1;
> +}

That could work, we don't really have a lot of different completion
types in the kernel.
Kanchan Joshi July 8, 2020, 12:58 p.m. UTC | #16
On Tue, Jul 07, 2020 at 04:37:55PM -0600, Jens Axboe wrote:
>On 7/7/20 4:18 PM, Matthew Wilcox wrote:
>> On Tue, Jul 07, 2020 at 02:40:06PM -0600, Jens Axboe wrote:
>>>>> so we have another 24 bytes before io_kiocb takes up another cacheline.
>>>>> If that's a serious problem, I have an idea about how to shrink struct
>>>>> kiocb by 8 bytes so struct io_rw would have space to store another
>>>>> pointer.
>>>> Yes, io_kiocb has room. Cache-locality wise whether that is fine or
>>>> it must be placed within io_rw - I'll come to know once I get to
>>>> implement this. Please share the idea you have, it can come handy.
>>>
>>> Except it doesn't, I'm not interested in adding per-request type fields
>>> to the generic part of it. Before we know it, we'll blow past the next
>>> cacheline.
>>>
>>> If we can find space in the kiocb, that'd be much better. Note that once
>>> the async buffered bits go in for 5.9, then there's no longer a 4-byte
>>> hole in struct kiocb.
>>
>> Well, poot, I was planning on using that.  OK, how about this:
>
>Figured you might have had your sights set on that one, which is why I
>wanted to bring it up upfront :-)
>
>> +#define IOCB_NO_CMPL		(15 << 28)
>>
>>  struct kiocb {
>> [...]
>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
>> +	loff_t __user *ki_uposp;
>> -	int			ki_flags;
>> +	unsigned int		ki_flags;
>>
>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
>> +static ki_cmpl * const ki_cmpls[15];
>>
>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
>> +{
>> +	unsigned int id = iocb->ki_flags >> 28;
>> +
>> +	if (id < 15)
>> +		ki_cmpls[id](iocb, ret, ret2);
>> +}
>>
>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
>> +{
>> +	for (i = 0; i < 15; i++) {
>> +		if (ki_cmpls[id])
>> +			continue;
>> +		ki_cmpls[id] = cb;
>> +		return id;
>> +	}
>> +	WARN();
>> +	return -1;
>> +}
>
>That could work, we don't really have a lot of different completion
>types in the kernel.

Thanks, this looks sorted.
The last thing is about the flag used to trigger this processing.
Will it be fine to introduce a new flag (RWF_APPEND2 or RWF_APPEND_OFFSET)
instead of using RWF_APPEND?

The new flag will do what RWF_APPEND does and will also return the
written location (and therefore expects the pointer setup in the
application).
Matthew Wilcox July 8, 2020, 2:22 p.m. UTC | #17
On Wed, Jul 08, 2020 at 06:28:05PM +0530, Kanchan Joshi wrote:
> The last thing is about the flag used to trigger this processing. Will it be
> fine to introduce a new flag (RWF_APPEND2 or RWF_APPEND_OFFSET)
> instead of using RWF_APPEND?
> 
> New flag will do what RWF_APPEND does and will also return the
> written-location (and therefore expects pointer setup in application).

I think it's simpler to understand if it's called RWF_INDIRECT_OFFSET.
Then it'd look like:

+	rwf_t rwf = READ_ONCE(sqe->rw_flags);
...
-	iocb->ki_pos = READ_ONCE(sqe->off);
+	if (rwf & RWF_INDIRECT_OFFSET) {
+		loff_t __user *loffp = u64_to_user_ptr(sqe->addr2);
+
+		if (get_user(iocb->ki_pos, loffp))
+			return -EFAULT;
+		iocb->ki_loffp = loffp;
+	} else {
+		iocb->ki_pos = READ_ONCE(sqe->off);
+	}
...
-	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
+	ret = kiocb_set_rw_flags(kiocb, rwf);
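
The completion side would then need the mirror-image copy-out, sketched
under the same assumptions (ki_loffp is the field introduced above; the
error plumbing is a guess):

+	/* on completion, before posting the cqe */
+	if (iocb->ki_loffp && put_user(iocb->ki_pos, iocb->ki_loffp))
+		ret = -EFAULT;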
Jens Axboe July 8, 2020, 2:54 p.m. UTC | #18
On 7/8/20 6:58 AM, Kanchan Joshi wrote:
> On Tue, Jul 07, 2020 at 04:37:55PM -0600, Jens Axboe wrote:
>> On 7/7/20 4:18 PM, Matthew Wilcox wrote:
>>> On Tue, Jul 07, 2020 at 02:40:06PM -0600, Jens Axboe wrote:
>>>>>> so we have another 24 bytes before io_kiocb takes up another cacheline.
>>>>>> If that's a serious problem, I have an idea about how to shrink struct
>>>>>> kiocb by 8 bytes so struct io_rw would have space to store another
>>>>>> pointer.
>>>>> Yes, io_kiocb has room. Cache-locality wise whether that is fine or
>>>>> it must be placed within io_rw - I'll come to know once I get to
>>>>> implement this. Please share the idea you have, it can come handy.
>>>>
>>>> Except it doesn't, I'm not interested in adding per-request type fields
>>>> to the generic part of it. Before we know it, we'll blow past the next
>>>> cacheline.
>>>>
>>>> If we can find space in the kiocb, that'd be much better. Note that once
>>>> the async buffered bits go in for 5.9, then there's no longer a 4-byte
>>>> hole in struct kiocb.
>>>
>>> Well, poot, I was planning on using that.  OK, how about this:
>>
>> Figured you might have had your sights set on that one, which is why I
>> wanted to bring it up upfront :-)
>>
>>> +#define IOCB_NO_CMPL		(15 << 28)
>>>
>>>  struct kiocb {
>>> [...]
>>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
>>> +	loff_t __user *ki_uposp;
>>> -	int			ki_flags;
>>> +	unsigned int		ki_flags;
>>>
>>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
>>> +static ki_cmpl * const ki_cmpls[15];
>>>
>>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
>>> +{
>>> +	unsigned int id = iocb->ki_flags >> 28;
>>> +
>>> +	if (id < 15)
>>> +		ki_cmpls[id](iocb, ret, ret2);
>>> +}
>>>
>>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
>>> +{
>>> +	for (i = 0; i < 15; i++) {
>>> +		if (ki_cmpls[id])
>>> +			continue;
>>> +		ki_cmpls[id] = cb;
>>> +		return id;
>>> +	}
>>> +	WARN();
>>> +	return -1;
>>> +}
>>
>> That could work, we don't really have a lot of different completion
>> types in the kernel.
> 
> Thanks, this looks sorted.

Not really, someone still needs to do that work. I took a quick look, and
most of it looks straightforward. The only potential complication is
ocfs2, which does a swap of the completion for the kiocb. That would just
turn into an upper-flag swap. There is also the potential sync kiocb with
a NULL ki_complete. The latter should be fine; I think we just need to
reserve completion nr 0 for being that.
Matthew Wilcox July 8, 2020, 2:58 p.m. UTC | #19
On Wed, Jul 08, 2020 at 08:54:07AM -0600, Jens Axboe wrote:
> On 7/8/20 6:58 AM, Kanchan Joshi wrote:
> >>> +#define IOCB_NO_CMPL		(15 << 28)
> >>>
> >>>  struct kiocb {
> >>> [...]
> >>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
> >>> +	loff_t __user *ki_uposp;
> >>> -	int			ki_flags;
> >>> +	unsigned int		ki_flags;
> >>>
> >>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
> >>> +static ki_cmpl * const ki_cmpls[15];
> >>>
> >>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
> >>> +{
> >>> +	unsigned int id = iocb->ki_flags >> 28;
> >>> +
> >>> +	if (id < 15)
> >>> +		ki_cmpls[id](iocb, ret, ret2);
> >>> +}
> >>>
> >>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
> >>> +{
> >>> +	for (i = 0; i < 15; i++) {
> >>> +		if (ki_cmpls[id])
> >>> +			continue;
> >>> +		ki_cmpls[id] = cb;
> >>> +		return id;
> >>> +	}
> >>> +	WARN();
> >>> +	return -1;
> >>> +}
> >>
> >> That could work, we don't really have a lot of different completion
> >> types in the kernel.
> > 
> > Thanks, this looks sorted.
> 
> Not really, someone still needs to do that work. I took a quick look, and
> most of it looks straight forward. The only potential complication is
> ocfs2, which does a swap of the completion for the kiocb. That would just
> turn into an upper flag swap. And potential sync kiocb with NULL
> ki_complete. The latter should be fine, I think we just need to reserve
> completion nr 0 for being that.

I was reserving completion 15 for that ;-)

+#define IOCB_NO_CMPL		(15 << 28)
...
+	if (id < 15)
+		ki_cmpls[id](iocb, ret, ret2);

Saves us one pointer in the array ...
Jens Axboe July 8, 2020, 2:59 p.m. UTC | #20
On 7/8/20 8:58 AM, Matthew Wilcox wrote:
> On Wed, Jul 08, 2020 at 08:54:07AM -0600, Jens Axboe wrote:
>> On 7/8/20 6:58 AM, Kanchan Joshi wrote:
>>>>> +#define IOCB_NO_CMPL		(15 << 28)
>>>>>
>>>>>  struct kiocb {
>>>>> [...]
>>>>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
>>>>> +	loff_t __user *ki_uposp;
>>>>> -	int			ki_flags;
>>>>> +	unsigned int		ki_flags;
>>>>>
>>>>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
>>>>> +static ki_cmpl * const ki_cmpls[15];
>>>>>
>>>>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
>>>>> +{
>>>>> +	unsigned int id = iocb->ki_flags >> 28;
>>>>> +
>>>>> +	if (id < 15)
>>>>> +		ki_cmpls[id](iocb, ret, ret2);
>>>>> +}
>>>>>
>>>>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
>>>>> +{
>>>>> +	for (i = 0; i < 15; i++) {
>>>>> +		if (ki_cmpls[id])
>>>>> +			continue;
>>>>> +		ki_cmpls[id] = cb;
>>>>> +		return id;
>>>>> +	}
>>>>> +	WARN();
>>>>> +	return -1;
>>>>> +}
>>>>
>>>> That could work, we don't really have a lot of different completion
>>>> types in the kernel.
>>>
>>> Thanks, this looks sorted.
>>
>> Not really, someone still needs to do that work. I took a quick look, and
>> most of it looks straight forward. The only potential complication is
>> ocfs2, which does a swap of the completion for the kiocb. That would just
>> turn into an upper flag swap. And potential sync kiocb with NULL
>> ki_complete. The latter should be fine, I think we just need to reserve
>> completion nr 0 for being that.
> 
> I was reserving completion 15 for that ;-)
> 
> +#define IOCB_NO_CMPL		(15 << 28)
> ...
> +	if (id < 15)
> +		ki_cmpls[id](iocb, ret, ret2);
> 
> Saves us one pointer in the array ...

That works. Are you going to turn this into an actual series of patches,
adding the functionality and converting users?
Matthew Wilcox July 8, 2020, 3:02 p.m. UTC | #21
On Wed, Jul 08, 2020 at 08:59:50AM -0600, Jens Axboe wrote:
> On 7/8/20 8:58 AM, Matthew Wilcox wrote:
> > On Wed, Jul 08, 2020 at 08:54:07AM -0600, Jens Axboe wrote:
> >> On 7/8/20 6:58 AM, Kanchan Joshi wrote:
> >>>>> +#define IOCB_NO_CMPL		(15 << 28)
> >>>>>
> >>>>>  struct kiocb {
> >>>>> [...]
> >>>>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
> >>>>> +	loff_t __user *ki_uposp;
> >>>>> -	int			ki_flags;
> >>>>> +	unsigned int		ki_flags;
> >>>>>
> >>>>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
> >>>>> +static ki_cmpl * const ki_cmpls[15];
> >>>>>
> >>>>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
> >>>>> +{
> >>>>> +	unsigned int id = iocb->ki_flags >> 28;
> >>>>> +
> >>>>> +	if (id < 15)
> >>>>> +		ki_cmpls[id](iocb, ret, ret2);
> >>>>> +}
> >>>>>
> >>>>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
> >>>>> +{
> >>>>> +	for (i = 0; i < 15; i++) {
> >>>>> +		if (ki_cmpls[id])
> >>>>> +			continue;
> >>>>> +		ki_cmpls[id] = cb;
> >>>>> +		return id;
> >>>>> +	}
> >>>>> +	WARN();
> >>>>> +	return -1;
> >>>>> +}
> >>>>
> >>>> That could work, we don't really have a lot of different completion
> >>>> types in the kernel.
> >>>
> >>> Thanks, this looks sorted.
> >>
> >> Not really, someone still needs to do that work. I took a quick look, and
> >> most of it looks straight forward. The only potential complication is
> >> ocfs2, which does a swap of the completion for the kiocb. That would just
> >> turn into an upper flag swap. And potential sync kiocb with NULL
> >> ki_complete. The latter should be fine, I think we just need to reserve
> >> completion nr 0 for being that.
> > 
> > I was reserving completion 15 for that ;-)
> > 
> > +#define IOCB_NO_CMPL		(15 << 28)
> > ...
> > +	if (id < 15)
> > +		ki_cmpls[id](iocb, ret, ret2);
> > 
> > Saves us one pointer in the array ...
> 
> That works. Are you going to turn this into an actual series of patches,
> adding the functionality and converting users?

I was under the impression Kanchan was going to do that, but I can run it
off quickly ...
Jens Axboe July 8, 2020, 3:06 p.m. UTC | #22
On 7/8/20 9:02 AM, Matthew Wilcox wrote:
> On Wed, Jul 08, 2020 at 08:59:50AM -0600, Jens Axboe wrote:
>> On 7/8/20 8:58 AM, Matthew Wilcox wrote:
>>> On Wed, Jul 08, 2020 at 08:54:07AM -0600, Jens Axboe wrote:
>>>> On 7/8/20 6:58 AM, Kanchan Joshi wrote:
>>>>>>> +#define IOCB_NO_CMPL		(15 << 28)
>>>>>>>
>>>>>>>  struct kiocb {
>>>>>>> [...]
>>>>>>> -	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
>>>>>>> +	loff_t __user *ki_uposp;
>>>>>>> -	int			ki_flags;
>>>>>>> +	unsigned int		ki_flags;
>>>>>>>
>>>>>>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
>>>>>>> +static ki_cmpl * const ki_cmpls[15];
>>>>>>>
>>>>>>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
>>>>>>> +{
>>>>>>> +	unsigned int id = iocb->ki_flags >> 28;
>>>>>>> +
>>>>>>> +	if (id < 15)
>>>>>>> +		ki_cmpls[id](iocb, ret, ret2);
>>>>>>> +}
>>>>>>>
>>>>>>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
>>>>>>> +{
>>>>>>> +	for (i = 0; i < 15; i++) {
>>>>>>> +		if (ki_cmpls[id])
>>>>>>> +			continue;
>>>>>>> +		ki_cmpls[id] = cb;
>>>>>>> +		return id;
>>>>>>> +	}
>>>>>>> +	WARN();
>>>>>>> +	return -1;
>>>>>>> +}
>>>>>>
>>>>>> That could work, we don't really have a lot of different completion
>>>>>> types in the kernel.
>>>>>
>>>>> Thanks, this looks sorted.
>>>>
>>>> Not really, someone still needs to do that work. I took a quick look, and
>>>> most of it looks straight forward. The only potential complication is
>>>> ocfs2, which does a swap of the completion for the kiocb. That would just
>>>> turn into an upper flag swap. And potential sync kiocb with NULL
>>>> ki_complete. The latter should be fine, I think we just need to reserve
>>>> completion nr 0 for being that.
>>>
>>> I was reserving completion 15 for that ;-)
>>>
>>> +#define IOCB_NO_CMPL		(15 << 28)
>>> ...
>>> +	if (id < 15)
>>> +		ki_cmpls[id](iocb, ret, ret2);
>>>
>>> Saves us one pointer in the array ...
>>
>> That works. Are you going to turn this into an actual series of patches,
>> adding the functionality and converting users?
> 
> I was under the impression Kanchan was going to do that, but I can run it
> off quickly ...

I just wanted to get clarification there, because to me it sounded like
you expected Kanchan to do it, while Kanchan assumed it "was sorted". I'd
consider that a prerequisite for the append series as far as io_uring is
concerned, hence _someone_ needs to actually do it ;-)
Javier González July 8, 2020, 4:08 p.m. UTC | #23
> On 8 Jul 2020, at 17.06, Jens Axboe <axboe@kernel.dk> wrote:
> 
> On 7/8/20 9:02 AM, Matthew Wilcox wrote:
>>> On Wed, Jul 08, 2020 at 08:59:50AM -0600, Jens Axboe wrote:
>>> On 7/8/20 8:58 AM, Matthew Wilcox wrote:
>>>> On Wed, Jul 08, 2020 at 08:54:07AM -0600, Jens Axboe wrote:
>>>>> On 7/8/20 6:58 AM, Kanchan Joshi wrote:
>>>>>>>> +#define IOCB_NO_CMPL        (15 << 28)
>>>>>>>> 
>>>>>>>> struct kiocb {
>>>>>>>> [...]
>>>>>>>> -    void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
>>>>>>>> +    loff_t __user *ki_uposp;
>>>>>>>> -    int            ki_flags;
>>>>>>>> +    unsigned int        ki_flags;
>>>>>>>> 
>>>>>>>> +typedef void ki_cmpl(struct kiocb *, long ret, long ret2);
>>>>>>>> +static ki_cmpl * const ki_cmpls[15];
>>>>>>>> 
>>>>>>>> +void ki_complete(struct kiocb *iocb, long ret, long ret2)
>>>>>>>> +{
>>>>>>>> +    unsigned int id = iocb->ki_flags >> 28;
>>>>>>>> +
>>>>>>>> +    if (id < 15)
>>>>>>>> +        ki_cmpls[id](iocb, ret, ret2);
>>>>>>>> +}
>>>>>>>> 
>>>>>>>> +int kiocb_cmpl_register(void (*cb)(struct kiocb *, long, long))
>>>>>>>> +{
>>>>>>>> +    for (i = 0; i < 15; i++) {
>>>>>>>> +        if (ki_cmpls[i])
>>>>>>>> +            continue;
>>>>>>>> +        ki_cmpls[i] = cb;
>>>>>>>> +        return i;
>>>>>>>> +    }
>>>>>>>> +    WARN_ON(1);
>>>>>>>> +    return -1;
>>>>>>>> +}
>>>>>>> 
>>>>>>> That could work, we don't really have a lot of different completion
>>>>>>> types in the kernel.
>>>>>> 
>>>>>> Thanks, this looks sorted.
>>>>> 
>>>>> Not really, someone still needs to do that work. I took a quick look, and
>>>>> most of it looks straight forward. The only potential complication is
>>>>> ocfs2, which does a swap of the completion for the kiocb. That would just
>>>>> turn into an upper flag swap. And potential sync kiocb with NULL
>>>>> ki_complete. The latter should be fine, I think we just need to reserve
>>>>> completion nr 0 for being that.
>>>> 
>>>> I was reserving completion 15 for that ;-)
>>>> 
>>>> +#define IOCB_NO_CMPL        (15 << 28)
>>>> ...
>>>> +    if (id < 15)
>>>> +        ki_cmpls[id](iocb, ret, ret2);
>>>> 
>>>> Saves us one pointer in the array ...
>>> 
>>> That works. Are you going to turn this into an actual series of patches,
>>> adding the functionality and converting users?
>> 
>> I was under the impression Kanchan was going to do that, but I can run it
>> off quickly ...
> 
> I just wanted to get clarification there, because to me it sounded like
> you expected Kanchan to do it, and Kanchan assuming it "was sorted". I'd
> consider that a prerequisite for the append series as far as io_uring is
> concerned, hence _someone_ needs to actually do it ;-)
> 

I believe Kanchan meant that the trade-off we were asked to clear up is now sorted.

We will send a new version shortly for the current functionality - we can see what we are missing once the uring interface is clear.

We really want this to be stable, as a lot of other things depend on it (e.g., the fio patches).

Javier
Matthew Wilcox July 8, 2020, 4:33 p.m. UTC | #24
On Wed, Jul 08, 2020 at 06:08:12PM +0200, Javier González wrote:
> > I just wanted to get clarification there, because to me it sounded like
> > you expected Kanchan to do it, and Kanchan assuming it "was sorted". I'd
> > consider that a prerequisite for the append series as far as io_uring is
> > concerned, hence _someone_ needs to actually do it ;-)

I don't know that it's a prerequisite in terms of the patches actually
depending on it.  I appreciate that you want it first to ensure we don't
bloat the kiocb.

> I believe Kanchan meant that now the trade-off we were asking to clear out is sorted. 
> 
> We will send a new version shortly for the current functionality - we can see what we are missing on when the uring interface is clear. 

I've started work on a patch series for this.  Mostly just waiting for
compilation now ... should be done in the next few hours.
Jens Axboe July 8, 2020, 4:38 p.m. UTC | #25
On 7/8/20 10:33 AM, Matthew Wilcox wrote:
> On Wed, Jul 08, 2020 at 06:08:12PM +0200, Javier González wrote:
>>> I just wanted to get clarification there, because to me it sounded like
>>> you expected Kanchan to do it, and Kanchan assuming it "was sorted". I'd
>>> consider that a prerequisite for the append series as far as io_uring is
>>> concerned, hence _someone_ needs to actually do it ;-)
> 
> I don't know that it's a prerequisite in terms of the patches actually
> depend on it.  I appreciate you want it first to ensure that we don't bloat
> the kiocb.

Maybe not for the series, but for the io_uring addition it is.

>> I believe Kanchan meant that now the trade-off we were asking to
>> clear out is sorted. 
>>
>> We will send a new version shortly for the current functionality - we
>> can see what we are missing on when the uring interface is clear. 
> 
> I've started work on a patch series for this.  Mostly just waiting for
> compilation now ... should be done in the next few hours.

Great!
Kanchan Joshi July 8, 2020, 4:41 p.m. UTC | #26
On Wed, Jul 08, 2020 at 03:22:51PM +0100, Matthew Wilcox wrote:
>On Wed, Jul 08, 2020 at 06:28:05PM +0530, Kanchan Joshi wrote:
>> The last thing is about the flag used to trigger this processing. Will it be
>> fine to intoduce new flag (RWF_APPEND2 or RWF_APPEND_OFFSET)
>> instead of using RWF_APPEND?
>>
>> New flag will do what RWF_APPEND does and will also return the
>> written-location (and therefore expects pointer setup in application).
>
>I think it's simpler to understand if it's called RWF_INDIRECT_OFFSET
>Then it'd look like:
>
>+	rwf_t rwf = READ_ONCE(sqe->rw_flags);
>...
>-	iocb->ki_pos = READ_ONCE(sqe->off);
>+	if (rwf & RWF_INDIRECT_OFFSET) {
>+		loff_t __user *loffp = u64_to_user_ptr(sqe->addr2);
>+
>+		if (get_user(iocb->ki_pos, loffp))
>+			return -EFAULT;
>+		iocb->ki_loffp = loffp;
>+	} else {
>+		iocb->ki_pos = READ_ONCE(sqe->off);
>+	}
>...
>-	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
>+	ret = kiocb_set_rw_flags(kiocb, rwf);

It will surely go like this in io_uring, except I was thinking of using
io_kiocb rather than iocb for "loffp".
I am fine with RWF_INDIRECT_OFFSET, but am wondering whether to build
this over the base behavior offered by RWF_APPEND.
This is what I mean in code (I used RWF_APPEND2 here) -

static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
        ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
        if (flags & RWF_APPEND)
                ki->ki_flags |= IOCB_APPEND;
+       if (flags & RWF_APPEND2) {
+               /*
+                * RWF_APPEND2 is "file-append + return write-location"
+                * Use IOCB_APPEND for file-append, and new IOCB_ZONE_APPEND
+                * to return where write landed
+                */
+               ki->ki_flags |= IOCB_APPEND;
+               if (ki->ki_filp->f_mode & FMODE_ZONE_APPEND) /*revisit the need*/
+                       ki->ki_flags |= IOCB_ZONE_APPEND;
+       }
+
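As a minimal user-space sketch of the indirect-offset idea - assuming the
hypothetical RWF_APPEND2 flag above, liburing, and the sqe->addr2 convention
from the earlier snippet (none of this is a merged API):

#include <errno.h>
#include <liburing.h>

/* RWF_APPEND2 and the addr2 convention are assumptions from this thread */
static int submit_append(struct io_uring *ring, int fd,
			 const void *buf, unsigned len, loff_t *where)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EAGAIN;
	io_uring_prep_write(sqe, fd, buf, len, 0);
	sqe->rw_flags = RWF_APPEND2;		/* hypothetical flag */
	sqe->addr2 = (unsigned long)where;	/* indirect offset pointer */
	/* once the CQE arrives, *where holds where the write landed */
	return io_uring_submit(ring);
}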
Javier González July 8, 2020, 4:43 p.m. UTC | #27
> On 8 Jul 2020, at 18.34, Matthew Wilcox <willy@infradead.org> wrote:
> 
> On Wed, Jul 08, 2020 at 06:08:12PM +0200, Javier González wrote:
>>> I just wanted to get clarification there, because to me it sounded like
>>> you expected Kanchan to do it, and Kanchan assuming it "was sorted". I'd
>>> consider that a prerequisite for the append series as far as io_uring is
>>> concerned, hence _someone_ needs to actually do it ;-)
> 
> I don't know that it's a prerequisite in terms of the patches actually
> depend on it.  I appreciate you want it first to ensure that we don't bloat
> the kiocb.
> 
>> I believe Kanchan meant that now the trade-off we were asking to clear out is sorted. 
>> 
>> We will send a new version shortly for the current functionality - we can see what we are missing on when the uring interface is clear. 
> 
> I've started work on a patch series for this.  Mostly just waiting for
> compilation now ... should be done in the next few hours.


Awesome!
Kanchan Joshi July 8, 2020, 5:13 p.m. UTC | #28
On Wed, Jul 08, 2020 at 10:38:44AM -0600, Jens Axboe wrote:
>On 7/8/20 10:33 AM, Matthew Wilcox wrote:
>> On Wed, Jul 08, 2020 at 06:08:12PM +0200, Javier González wrote:
>>>> I just wanted to get clarification there, because to me it sounded like
>>>> you expected Kanchan to do it, and Kanchan assuming it "was sorted". I'd
>>>> consider that a prerequisite for the append series as far as io_uring is
>>>> concerned, hence _someone_ needs to actually do it ;-)
>>
>> I don't know that it's a prerequisite in terms of the patches actually
>> depend on it.  I appreciate you want it first to ensure that we don't bloat
>> the kiocb.
>
>Maybe not for the series, but for the io_uring addition it is.
>
>>> I believe Kanchan meant that now the trade-off we were asking to
>>> clear out is sorted.
>>>
>>> We will send a new version shortly for the current functionality - we
>>> can see what we are missing on when the uring interface is clear.
>>
>> I've started work on a patch series for this.  Mostly just waiting for
>> compilation now ... should be done in the next few hours.
>
>Great!

Jens, Matthew - I'm sorry for creating the confusion. By "looks sorted"
I meant the performance implications and the room for the pointer. For the
latter I was thinking of going by your suggestion not to bloat the kiocb,
and using io_kiocb instead.
If we keep it in the kiocb, there will be two paths updating that pointer,
one via ki_complete(....,ret2) and another directly - which does not seem good.

On a different note: trimming the kiocb by decoupling the ki_complete work
looks too good to be done by me :-)
Christoph Hellwig July 9, 2020, 10:15 a.m. UTC | #29
On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 155f3d8..cbde4df 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -402,6 +402,8 @@ struct io_rw {
> >  	struct kiocb			kiocb;
> >  	u64				addr;
> >  	u64				len;
> > +	/* zone-relative offset for append, in sectors */
> > +	u32			append_offset;
> >  };
> 
> I don't like this very much at all. As it stands, the first cacheline
> of io_kiocb is set aside for request-private data. io_rw is already
> exactly 64 bytes, which means that you're now growing io_rw beyond
> a cacheline and increasing the size of io_kiocb as a whole.
> 
> Maybe you can reuse io_rw->len for this, as that is only used on the
> submission side of things.

We don't actually need any new field at all.  By the time the write has
returned, ki_pos contains the offset after the write, and the res
argument to ->ki_complete contains the number of bytes written, which
allows us to trivially derive the starting position.
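
To make that concrete, the derivation is just this (a sketch; the helper
name is made up):

/* ki_pos was advanced past the write by 'res' bytes at completion time */
static inline loff_t kiocb_start_pos(const struct kiocb *kiocb, long res)
{
	return kiocb->ki_pos - res;
}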
Jens Axboe July 9, 2020, 1:58 p.m. UTC | #30
On 7/9/20 4:15 AM, Christoph Hellwig wrote:
> On Sun, Jul 05, 2020 at 03:00:47PM -0600, Jens Axboe wrote:
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index 155f3d8..cbde4df 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -402,6 +402,8 @@ struct io_rw {
>>>  	struct kiocb			kiocb;
>>>  	u64				addr;
>>>  	u64				len;
>>> +	/* zone-relative offset for append, in sectors */
>>> +	u32			append_offset;
>>>  };
>>
>> I don't like this very much at all. As it stands, the first cacheline
>> of io_kiocb is set aside for request-private data. io_rw is already
>> exactly 64 bytes, which means that you're now growing io_rw beyond
>> a cacheline and increasing the size of io_kiocb as a whole.
>>
>> Maybe you can reuse io_rw->len for this, as that is only used on the
>> submission side of things.
> 
> We don't actually need any new field at all.  By the time the write
> returned ki_pos contains the offset after the write, and the res
> argument to ->ki_complete contains the amount of bytes written, which
> allow us to trivially derive the starting position.

Then let's just do that instead of jumping through hoops either
justifying growing io_rw/io_kiocb or turning kiocb into a global
completion thing.
Christoph Hellwig July 9, 2020, 2 p.m. UTC | #31
On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
> > We don't actually need any new field at all.  By the time the write
> > returned ki_pos contains the offset after the write, and the res
> > argument to ->ki_complete contains the amount of bytes written, which
> > allow us to trivially derive the starting position.
> 
> Then let's just do that instead of jumping through hoops either
> justifying growing io_rw/io_kiocb or turning kiocb into a global
> completion thing.

Unfortunately that is a totally separate issue - the in-kernel offset
can be trivially calculated.  But we still need to figure out a way to
pass it on to userspace.  The current patchset does that by abusing
the flags, which doesn't really work as the flags are way too small.
So we somewhere need to have an address to do the put_user to.
Jens Axboe July 9, 2020, 2:05 p.m. UTC | #32
On 7/9/20 8:00 AM, Christoph Hellwig wrote:
> On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
>>> We don't actually need any new field at all.  By the time the write
>>> returned ki_pos contains the offset after the write, and the res
>>> argument to ->ki_complete contains the amount of bytes written, which
>>> allow us to trivially derive the starting position.
>>
>> Then let's just do that instead of jumping through hoops either
>> justifying growing io_rw/io_kiocb or turning kiocb into a global
>> completion thing.
> 
> Unfortunately that is a totally separate issue - the in-kernel offset
> can be trivially calculated.  But we still need to figure out a way to
> pass it on to userspace.  The current patchset does that by abusing
> the flags, which doesn't really work as the flags are way too small.
> So we somewhere need to have an address to do the put_user to.

Right, we're just trading the 'append_offset' for a 'copy_offset_here'
pointer, which are stored in the same spot...
Kanchan Joshi July 9, 2020, 6:36 p.m. UTC | #33
On Thu, Jul 9, 2020 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 7/9/20 8:00 AM, Christoph Hellwig wrote:
> > On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
> >>> We don't actually need any new field at all.  By the time the write
> >>> returned ki_pos contains the offset after the write, and the res
> >>> argument to ->ki_complete contains the amount of bytes written, which
> >>> allow us to trivially derive the starting position.

Deriving the starting position was not the purpose at all.
But yes, append_offset is not needed, for a different reason.
It was kept for uring-specific handling. The completion result from the
lower layer was always coming to uring in ret2 via ki_complete(....,ret2),
and ret2 goes to the CQE (and user-space) without any conversion in between.
For polled completion, there is a short window when we get ret2 but cannot
write into the CQE immediately, so I thought of storing it in append_offset
(but should not have; solving this was possible without it).

FWIW, if we move to the indirect-offset approach, append_offset gets
eliminated automatically, because there is no need to write to the CQE
itself.

> >> Then let's just do that instead of jumping through hoops either
> >> justifying growing io_rw/io_kiocb or turning kiocb into a global
> >> completion thing.
> >
> > Unfortunately that is a totally separate issue - the in-kernel offset
> > can be trivially calculated.  But we still need to figure out a way to
> > pass it on to userspace.  The current patchset does that by abusing
> > the flags, which doesn't really work as the flags are way too small.
> > So we somewhere need to have an address to do the put_user to.
>
> Right, we're just trading the 'append_offset' for a 'copy_offset_here'
> pointer, which are stored in the same spot...

The address needs to be stored somewhere, and there does not seem to be
any option other than io_kiocb?
The bigger problem with the address/indirect-offset approach is being able
to write to it during completion, as the process context is different. Will
that require entering the task_work_add() world, and might that make it a
costly affair?

Using flags has not been liked here, but given the upheaval involved so
far I have begun to feel it was keeping things simple. Should it be
reconsidered?


--
Joshi
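
For reference, the task_work route being weighed above would look roughly
like this - a sketch only, with hypothetical io_kiocb fields (loffp,
append_offset) and the task_work_add() notify argument as in kernels of
this era:

/* runs in the submitting task's context, so put_user() is safe */
static void io_append_offset_work(struct callback_head *cb)
{
	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);

	if (put_user(req->append_offset, req->loffp))
		req_set_fail_links(req);
}

/* called from the (possibly IRQ) completion path */
static void io_queue_append_offset(struct io_kiocb *req)
{
	init_task_work(&req->task_work, io_append_offset_work);
	task_work_add(req->task, &req->task_work, true);
}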
Pavel Begunkov July 9, 2020, 6:50 p.m. UTC | #34
On 09/07/2020 21:36, Kanchan Joshi wrote:
> On Thu, Jul 9, 2020 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 7/9/20 8:00 AM, Christoph Hellwig wrote:
>>> On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
>>>>> We don't actually need any new field at all.  By the time the write
>>>>> returned ki_pos contains the offset after the write, and the res
>>>>> argument to ->ki_complete contains the amount of bytes written, which
>>>>> allow us to trivially derive the starting position.
> 
> Deriving starting position was not the purpose at all.
> But yes, append-offset is not needed, for a different reason.
> It was kept for uring specific handling. Completion-result from lower
> layer was always coming to uring in ret2 via ki_complete(....,ret2).
> And ret2 goes to CQE (and user-space) without any conversion in between.
> For polled-completion, there is a short window when we get ret2 but cannot
> write into CQE immediately, so thought of storing that in append_offset
> (but should not have done, solving was possible without it).

fwiw, there are more cases when it's postponed.

> FWIW, if we move to indirect-offset approach, append_offset gets
> eliminated automatically, because there is no need to write to CQE
> itself.

Right, for the indirect approach we can write the offset right after getting it.
If not, then it's somehow stored in a CQE, so it may be placed into the
existing req->{result,cflags}, which mimic the CQE's fields.

> 
>>>> Then let's just do that instead of jumping through hoops either
>>>> justifying growing io_rw/io_kiocb or turning kiocb into a global
>>>> completion thing.
>>>
>>> Unfortunately that is a totally separate issue - the in-kernel offset
>>> can be trivially calculated.  But we still need to figure out a way to
>>> pass it on to userspace.  The current patchset does that by abusing
>>> the flags, which doesn't really work as the flags are way too small.
>>> So we somewhere need to have an address to do the put_user to.
>>
>> Right, we're just trading the 'append_offset' for a 'copy_offset_here'
>> pointer, which are stored in the same spot...
> 
> The address needs to be stored somewhere. And there does not seem
> other option but to use io_kiocb?
> The bigger problem with address/indirect-offset is to be able to write to it
> during completion as process-context is different. Will that require entering
> into task_work_add() world, and may make it costly affair?
> 
> Using flags have not been liked here, but given the upheaval involved so
> far I have begun to feel - it was keeping things simple. Should it be
> reconsidered?
> 
> 
> --
> Joshi
>
Jens Axboe July 9, 2020, 6:50 p.m. UTC | #35
On 7/9/20 12:36 PM, Kanchan Joshi wrote:
> On Thu, Jul 9, 2020 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 7/9/20 8:00 AM, Christoph Hellwig wrote:
>>> On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
>>>>> We don't actually need any new field at all.  By the time the write
>>>>> returned ki_pos contains the offset after the write, and the res
>>>>> argument to ->ki_complete contains the amount of bytes written, which
>>>>> allow us to trivially derive the starting position.
> 
> Deriving starting position was not the purpose at all.
> But yes, append-offset is not needed, for a different reason.
> It was kept for uring specific handling. Completion-result from lower
> layer was always coming to uring in ret2 via ki_complete(....,ret2).
> And ret2 goes to CQE (and user-space) without any conversion in between.
> For polled-completion, there is a short window when we get ret2 but cannot
> write into CQE immediately, so thought of storing that in append_offset
> (but should not have done, solving was possible without it).
> 
> FWIW, if we move to indirect-offset approach, append_offset gets
> eliminated automatically, because there is no need to write to CQE
> itself.
> 
>>>> Then let's just do that instead of jumping through hoops either
>>>> justifying growing io_rw/io_kiocb or turning kiocb into a global
>>>> completion thing.
>>>
>>> Unfortunately that is a totally separate issue - the in-kernel offset
>>> can be trivially calculated.  But we still need to figure out a way to
>>> pass it on to userspace.  The current patchset does that by abusing
>>> the flags, which doesn't really work as the flags are way too small.
>>> So we somewhere need to have an address to do the put_user to.
>>
>> Right, we're just trading the 'append_offset' for a 'copy_offset_here'
>> pointer, which are stored in the same spot...
> 
> The address needs to be stored somewhere. And there does not seem
> other option but to use io_kiocb?

That is where it belongs, not sure this was ever questioned. And inside
io_rw at that.

> The bigger problem with address/indirect-offset is to be able to write
> to it during completion as process-context is different. Will that
> require entering into task_work_add() world, and may make it costly
> affair?

It might, if you have IRQ context for the completion. task_work isn't
expensive, however. It's not like a thread offload.

> Using flags have not been liked here, but given the upheaval involved so
> far I have begun to feel - it was keeping things simple. Should it be
> reconsidered?

It's definitely worth considering, especially since we can use cflags
like Pavel suggested upfront and not need any extra storage. But it
brings us back to the 32-bit vs 64-bit discussion, and then using blocks
instead of bytes. Which isn't exactly super pretty.
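
For illustration, that cflags encoding would be the following - an
assumption from this discussion, relying on zone-append results being
sector-aligned; a 32-bit count of 512B sectors covers offsets up to 2TB:

static inline u32 append_offset_to_cflags(u64 offset_bytes)
{
	return (u32)(offset_bytes >> SECTOR_SHIFT);	/* bytes -> sectors */
}

static inline u64 cflags_to_append_offset(u32 cflags)
{
	return (u64)cflags << SECTOR_SHIFT;		/* sectors -> bytes */
}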
Pavel Begunkov July 9, 2020, 6:53 p.m. UTC | #36
On 09/07/2020 21:50, Pavel Begunkov wrote:
> On 09/07/2020 21:36, Kanchan Joshi wrote:
>> On Thu, Jul 9, 2020 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>> On 7/9/20 8:00 AM, Christoph Hellwig wrote:
>>>> On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
>>>>>> We don't actually need any new field at all.  By the time the write
>>>>>> returned ki_pos contains the offset after the write, and the res
>>>>>> argument to ->ki_complete contains the amount of bytes written, which
>>>>>> allow us to trivially derive the starting position.
>>
>> Deriving starting position was not the purpose at all.
>> But yes, append-offset is not needed, for a different reason.
>> It was kept for uring specific handling. Completion-result from lower
>> layer was always coming to uring in ret2 via ki_complete(....,ret2).
>> And ret2 goes to CQE (and user-space) without any conversion in between.
>> For polled-completion, there is a short window when we get ret2 but cannot
>> write into CQE immediately, so thought of storing that in append_offset
>> (but should not have done, solving was possible without it).
> 
> fwiw, there are more cases when it's postponed.
> 
>> FWIW, if we move to indirect-offset approach, append_offset gets
>> eliminated automatically, because there is no need to write to CQE
>> itself.
> 
> Right, for the indirect approach we can write offset right after getting it.

I take it back; as you mentioned, with task_work we may need the right context.

BTW, there is one more way to get space for it, and it would also shed 8 bytes
from io_kiocb, but that would need some work.

> If not, then it's somehow stored in an CQE, so may be placed into
> existing req->{result,cflags}, which mimic CQE's fields.
> 
>>
>>>>> Then let's just do that instead of jumping through hoops either
>>>>> justifying growing io_rw/io_kiocb or turning kiocb into a global
>>>>> completion thing.
>>>>
>>>> Unfortunately that is a totally separate issue - the in-kernel offset
>>>> can be trivially calculated.  But we still need to figure out a way to
>>>> pass it on to userspace.  The current patchset does that by abusing
>>>> the flags, which doesn't really work as the flags are way too small.
>>>> So we somewhere need to have an address to do the put_user to.
>>>
>>> Right, we're just trading the 'append_offset' for a 'copy_offset_here'
>>> pointer, which are stored in the same spot...
>>
>> The address needs to be stored somewhere. And there does not seem
>> other option but to use io_kiocb?
>> The bigger problem with address/indirect-offset is to be able to write to it
>> during completion as process-context is different. Will that require entering
>> into task_work_add() world, and may make it costly affair?
>>
>> Using flags have not been liked here, but given the upheaval involved so
>> far I have begun to feel - it was keeping things simple. Should it be
>> reconsidered?
>>
>>
>> --
>> Joshi
>>
>
Kanchan Joshi July 9, 2020, 7:05 p.m. UTC | #37
On Fri, Jul 10, 2020 at 12:20 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 7/9/20 12:36 PM, Kanchan Joshi wrote:
> > On Thu, Jul 9, 2020 at 7:36 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 7/9/20 8:00 AM, Christoph Hellwig wrote:
> >>> On Thu, Jul 09, 2020 at 07:58:04AM -0600, Jens Axboe wrote:
> >>>>> We don't actually need any new field at all.  By the time the write
> >>>>> returned ki_pos contains the offset after the write, and the res
> >>>>> argument to ->ki_complete contains the amount of bytes written, which
> >>>>> allow us to trivially derive the starting position.
> >
> > Deriving starting position was not the purpose at all.
> > But yes, append-offset is not needed, for a different reason.
> > It was kept for uring specific handling. Completion-result from lower
> > layer was always coming to uring in ret2 via ki_complete(....,ret2).
> > And ret2 goes to CQE (and user-space) without any conversion in between.
> > For polled-completion, there is a short window when we get ret2 but cannot
> > write into CQE immediately, so thought of storing that in append_offset
> > (but should not have done, solving was possible without it).
> >
> > FWIW, if we move to indirect-offset approach, append_offset gets
> > eliminated automatically, because there is no need to write to CQE
> > itself.
> >
> >>>> Then let's just do that instead of jumping through hoops either
> >>>> justifying growing io_rw/io_kiocb or turning kiocb into a global
> >>>> completion thing.
> >>>
> >>> Unfortunately that is a totally separate issue - the in-kernel offset
> >>> can be trivially calculated.  But we still need to figure out a way to
> >>> pass it on to userspace.  The current patchset does that by abusing
> >>> the flags, which doesn't really work as the flags are way too small.
> >>> So we somewhere need to have an address to do the put_user to.
> >>
> >> Right, we're just trading the 'append_offset' for a 'copy_offset_here'
> >> pointer, which are stored in the same spot...
> >
> > The address needs to be stored somewhere. And there does not seem
> > other option but to use io_kiocb?
>
> That is where it belongs, not sure this was ever questioned. And inside
> io_rw at that.
>
> > The bigger problem with address/indirect-offset is to be able to write
> > to it during completion as process-context is different. Will that
> > require entering into task_work_add() world, and may make it costly
> > affair?
>
> It might, if you have IRQ context for the completion. task_work isn't
> expensive, however. It's not like a thread offload.
>
> > Using flags have not been liked here, but given the upheaval involved so
> > far I have begun to feel - it was keeping things simple. Should it be
> > reconsidered?
>
> It's definitely worth considering, especially since we can use cflags
> like Pavel suggested upfront and not need any extra storage. But it
> brings us back to the 32-bit vs 64-bit discussion, and then using blocks
> instead of bytes. Which isn't exactly super pretty.
>
I agree that what we had was not great.
Append required special treatment (conversion from sectors to bytes) for io_uring,
and we were planning a user-space wrapper to abstract that.

But the good part (as it seems now) was: the append result went along with
cflags at virtually no additional cost, and the uring code changes became
super clean/minimal with further revisions.
The indirect-offset approach, in contrast, requires allocation/management in
the application, in io_uring submission, and in the completion path (which
seems trickier), and those CQE flags still get written to user-space while
serving no purpose for append-write.
Christoph Hellwig July 10, 2020, 1:09 p.m. UTC | #38
On Thu, Jul 09, 2020 at 12:50:27PM -0600, Jens Axboe wrote:
> It might, if you have IRQ context for the completion. task_work isn't
> expensive, however. It's not like a thread offload.
> 
> > Using flags have not been liked here, but given the upheaval involved so
> > far I have begun to feel - it was keeping things simple. Should it be
> > reconsidered?
> 
> It's definitely worth considering, especially since we can use cflags
> like Pavel suggested upfront and not need any extra storage. But it
> brings us back to the 32-bit vs 64-bit discussion, and then using blocks
> instead of bytes. Which isn't exactly super pretty.

block doesn't work for the case of writes to files that don't have
to be aligned in any way.  And that I think is the more broadly
applicable use case than zone append on block devices.
Christoph Hellwig July 10, 2020, 1:10 p.m. UTC | #39
On Fri, Jul 10, 2020 at 12:35:43AM +0530, Kanchan Joshi wrote:
> Append required special treatment (conversion for sector to bytes) for io_uring.
> And we were planning a user-space wrapper to abstract that.
> 
> But good part (as it seems now) was: append result went along with cflags at
> virtually no additional cost. And uring code changes became super clean/minimal
> with further revisions.
> While indirect-offset requires doing allocation/mgmt in application,
> io-uring submission
> and in completion path (which seems trickier), and those CQE flags
> still get written
> user-space and serve no purpose for append-write.

I have to say that storing the results in the CQE generally makes
so much more sense.  I wonder if we need a per-fd "large CQE" flag
that adds two extra u64s to the CQE, and some ops just require this
version.
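
Such a per-fd "large CQE" might look like the sketch below (purely
illustrative; neither the struct nor a setup flag for it exists at this
point):

struct io_uring_cqe_big {
	__u64	user_data;	/* first half keeps the existing layout */
	__s32	res;
	__u32	flags;
	__u64	extra1;		/* e.g. written offset in bytes */
	__u64	extra2;		/* reserved */
};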
Kanchan Joshi July 10, 2020, 1:29 p.m. UTC | #40
On Fri, Jul 10, 2020 at 6:39 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Jul 09, 2020 at 12:50:27PM -0600, Jens Axboe wrote:
> > It might, if you have IRQ context for the completion. task_work isn't
> > expensive, however. It's not like a thread offload.
> >
> > > Using flags have not been liked here, but given the upheaval involved so
> > > far I have begun to feel - it was keeping things simple. Should it be
> > > reconsidered?
> >
> > It's definitely worth considering, especially since we can use cflags
> > like Pavel suggested upfront and not need any extra storage. But it
> > brings us back to the 32-bit vs 64-bit discussion, and then using blocks
> > instead of bytes. Which isn't exactly super pretty.
>
> block doesn't work for the case of writes to files that don't have
> to be aligned in any way.  And that I think is the more broadly
> applicable use case than zone append on block devices.

But when can it happen that we do zone-append on a file (zonefs, I
assume), and the device returns a location (the write-pointer, essentially)
which is not a multiple of 512b?
Christoph Hellwig July 10, 2020, 1:43 p.m. UTC | #41
On Fri, Jul 10, 2020 at 06:59:45PM +0530, Kanchan Joshi wrote:
> > block doesn't work for the case of writes to files that don't have
> > to be aligned in any way.  And that I think is the more broadly
> > applicable use case than zone append on block devices.
> 
> But when can it happen that we do zone-append on a file (zonefs I
> asssume), and device returns a location (write-pointer essentially)
> which is not in multiple of 512b?

All the time.  You open a file with O_APPEND.  You write a record to
it of any kind of size, then the next write will return the position
it got written at, which can be anything.
Matthew Wilcox July 10, 2020, 1:48 p.m. UTC | #42
On Fri, Jul 10, 2020 at 02:10:54PM +0100, Christoph Hellwig wrote:
> On Fri, Jul 10, 2020 at 12:35:43AM +0530, Kanchan Joshi wrote:
> > Append required special treatment (conversion for sector to bytes) for io_uring.
> > And we were planning a user-space wrapper to abstract that.
> > 
> > But good part (as it seems now) was: append result went along with cflags at
> > virtually no additional cost. And uring code changes became super clean/minimal
> > with further revisions.
> > While indirect-offset requires doing allocation/mgmt in application,
> > io-uring submission
> > and in completion path (which seems trickier), and those CQE flags
> > still get written
> > user-space and serve no purpose for append-write.
> 
> I have to say that storing the results in the CQE generally make
> so much more sense.  I wonder if we need a per-fd "large CGE" flag
> that adds two extra u64s to the CQE, and some ops just require this
> version.

If we're going to go the route of changing the CQE, how about:

 struct io_uring_cqe {
         __u64   user_data;      /* sqe->data submission passed back */
-        __s32   res;            /* result code for this event */
-        __u32   flags;
+	union {
+		struct {
+		        __s32   res;            /* result code for this event */
+		        __u32   flags;
+		};
+		__s64	res64;
+	};
 };

then we don't need to change the CQE size and it just depends on the SQE
whether the CQE for it uses res+flags or res64.
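
On the consumer side that union would be used per-op, e.g. (a sketch;
is_append() and handle_result() stand in for the application's own tagging
and handling):

#include <stdio.h>
#include <string.h>

static void handle_cqe(struct io_uring_cqe *cqe)
{
	if (is_append(cqe->user_data)) {	/* app-level tagging */
		if (cqe->res64 < 0)
			fprintf(stderr, "append: %s\n",
				strerror((int)-cqe->res64));
		else
			printf("written at byte %lld\n",
			       (long long)cqe->res64);
	} else {
		handle_result(cqe->res, cqe->flags);	/* classic layout */
	}
}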
Christoph Hellwig July 10, 2020, 1:49 p.m. UTC | #43
On Fri, Jul 10, 2020 at 02:48:24PM +0100, Matthew Wilcox wrote:
> If we're going to go the route of changing the CQE, how about:
> 
>  struct io_uring_cqe {
>          __u64   user_data;      /* sqe->data submission passed back */
> -        __s32   res;            /* result code for this event */
> -        __u32   flags;
> +	union {
> +		struct {
> +		        __s32   res;            /* result code for this event */
> +		        __u32   flags;
> +		};
> +		__s64	res64;
> +	};
>  };
> 
> then we don't need to change the CQE size and it just depends on the SQE
> whether the CQE for it uses res+flags or res64.

How do you return a status code or short write when you just have
a u64 that is needed for the offset?
Matthew Wilcox July 10, 2020, 1:51 p.m. UTC | #44
On Fri, Jul 10, 2020 at 02:49:32PM +0100, Christoph Hellwig wrote:
> On Fri, Jul 10, 2020 at 02:48:24PM +0100, Matthew Wilcox wrote:
> > If we're going to go the route of changing the CQE, how about:
> > 
> >  struct io_uring_cqe {
> >          __u64   user_data;      /* sqe->data submission passed back */
> > -        __s32   res;            /* result code for this event */
> > -        __u32   flags;
> > +	union {
> > +		struct {
> > +		        __s32   res;            /* result code for this event */
> > +		        __u32   flags;
> > +		};
> > +		__s64	res64;
> > +	};
> >  };
> > 
> > then we don't need to change the CQE size and it just depends on the SQE
> > whether the CQE for it uses res+flags or res64.
> 
> How do you return a status code or short write when you just have
> a u64 that is needed for the offset?

It's an s64 not a u64, so you can return a negative errno.  I didn't
think we allowed short writes for objects-which-have-a-pos.
Kanchan Joshi July 10, 2020, 1:57 p.m. UTC | #45
On Fri, Jul 10, 2020 at 6:59 PM Kanchan Joshi <joshiiitr@gmail.com> wrote:
>
> On Fri, Jul 10, 2020 at 6:39 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Thu, Jul 09, 2020 at 12:50:27PM -0600, Jens Axboe wrote:
> > > It might, if you have IRQ context for the completion. task_work isn't
> > > expensive, however. It's not like a thread offload.

Not sure about polled completion, but we have IRQ context for regular completion.
If I've got it right, I need to store the task_struct during submission,
and use that to register a task_work during completion. At some point
when this task_work gets called it will update the user-space pointer
with the result.
It can be the case that we get N completions in parallel, but they would
all get serialized, because all N task-works need to be executed in the
context of a single task/process?

> > > > Using flags have not been liked here, but given the upheaval involved so
> > > > far I have begun to feel - it was keeping things simple. Should it be
> > > > reconsidered?
> > >
> > > It's definitely worth considering, especially since we can use cflags
> > > like Pavel suggested upfront and not need any extra storage. But it
> > > brings us back to the 32-bit vs 64-bit discussion, and then using blocks
> > > instead of bytes. Which isn't exactly super pretty.
> >
> > block doesn't work for the case of writes to files that don't have
> > to be aligned in any way.  And that I think is the more broadly
> > applicable use case than zone append on block devices.
>
> But when can it happen that we do zone-append on a file (zonefs I
> asssume), and device returns a location (write-pointer essentially)
> which is not in multiple of 512b?
>
>
> --
> Joshi
Jens Axboe July 10, 2020, 2:09 p.m. UTC | #46
On 7/10/20 7:10 AM, Christoph Hellwig wrote:
> On Fri, Jul 10, 2020 at 12:35:43AM +0530, Kanchan Joshi wrote:
>> Append required special treatment (conversion for sector to bytes) for io_uring.
>> And we were planning a user-space wrapper to abstract that.
>>
>> But good part (as it seems now) was: append result went along with cflags at
>> virtually no additional cost. And uring code changes became super clean/minimal
>> with further revisions.
>> While indirect-offset requires doing allocation/mgmt in application,
>> io-uring submission
>> and in completion path (which seems trickier), and those CQE flags
>> still get written
>> user-space and serve no purpose for append-write.
> 
> I have to say that storing the results in the CQE generally make
> so much more sense.  I wonder if we need a per-fd "large CGE" flag
> that adds two extra u64s to the CQE, and some ops just require this
> version.

I have been pondering the same thing, we could make certain ops consume
two CQEs if it makes sense. It's a bit ugly on the app side with two
different CQEs for a request, though. We can't just treat it as a large
CQE, as they might not be sequential if we happen to wrap. But maybe
it's not too bad.
Kanchan Joshi July 10, 2020, 2:11 p.m. UTC | #47
On Fri, Jul 10, 2020 at 7:21 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Jul 10, 2020 at 02:49:32PM +0100, Christoph Hellwig wrote:
> > On Fri, Jul 10, 2020 at 02:48:24PM +0100, Matthew Wilcox wrote:
> > > If we're going to go the route of changing the CQE, how about:
> > >
> > >  struct io_uring_cqe {
> > >          __u64   user_data;      /* sqe->data submission passed back */
> > > -        __s32   res;            /* result code for this event */
> > > -        __u32   flags;
> > > +   union {
> > > +           struct {
> > > +                   __s32   res;            /* result code for this event */
> > > +                   __u32   flags;
> > > +           };
> > > +           __s64   res64;
> > > +   };
> > >  };
> > >
> > > then we don't need to change the CQE size and it just depends on the SQE
> > > whether the CQE for it uses res+flags or res64.
> >
> > How do you return a status code or short write when you just have
> > a u64 that is needed for the offset?
>
> it's an s64 not a u64 so you can return a negative errno.  i didn't
> think we allowed short writes for objects-which-have-a-pos.

If we are doing this for zone-append (and not general cases), "__s64
res64" should work:
64 bits = 1 (sign) + 23 (bytes-copied: cqe->res) + 40
(written-location: chunk_sectors byte limit)
Kanchan Joshi July 20, 2020, 4:46 p.m. UTC | #48
On Fri, Jul 10, 2020 at 7:39 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 7/10/20 7:10 AM, Christoph Hellwig wrote:
> > On Fri, Jul 10, 2020 at 12:35:43AM +0530, Kanchan Joshi wrote:
> >> Append required special treatment (conversion for sector to bytes) for io_uring.
> >> And we were planning a user-space wrapper to abstract that.
> >>
> >> But good part (as it seems now) was: append result went along with cflags at
> >> virtually no additional cost. And uring code changes became super clean/minimal
> >> with further revisions.
> >> While indirect-offset requires doing allocation/mgmt in application,
> >> io-uring submission
> >> and in completion path (which seems trickier), and those CQE flags
> >> still get written
> >> user-space and serve no purpose for append-write.
> >
> > I have to say that storing the results in the CQE generally make
> > so much more sense.  I wonder if we need a per-fd "large CGE" flag
> > that adds two extra u64s to the CQE, and some ops just require this
> > version.
>
> I have been pondering the same thing, we could make certain ops consume
> two CQEs if it makes sense. It's a bit ugly on the app side with two
> different CQEs for a request, though. We can't just treat it as a large
> CQE, as they might not be sequential if we happen to wrap. But maybe
> it's not too bad.

Did some work on the two-CQE scheme for zone-append.
The first CQE is the same (as before), while the second CQE does not keep
res/flags and instead has a 64-bit result to report the append location.
It would look like this -

struct io_uring_cqe {
        __u64   user_data;      /* sqe->data submission passed back */
-       __s32   res;            /* result code for this event */
-       __u32   flags;
+       union {
+               struct {
+                       __s32   res;            /* result code for this event */
+                       __u32   flags;
+               };
+               __u64   append_res;     /* only used for append, in secondary cqe */
+       };

And kernel will produce two CQEs for append completion-

static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
-       struct io_uring_cqe *cqe;
+       struct io_uring_cqe *cqe, *cqe2 = NULL;

-       cqe = io_get_cqring(ctx);
+       /* obtain two CQEs for append; NULL if two CQEs are not available */
+       if (unlikely(req->flags & REQ_F_ZONE_APPEND))
+               cqe = io_get_two_cqring(ctx, &cqe2);
+       else
+               cqe = io_get_cqring(ctx);
+
        if (likely(cqe)) {
                WRITE_ONCE(cqe->user_data, req->user_data);
                WRITE_ONCE(cqe->res, res);
                WRITE_ONCE(cqe->flags, cflags);
+               /* update secondary cqe for zone-append */
+               if (req->flags & REQ_F_ZONE_APPEND) {
+                       WRITE_ONCE(cqe2->append_res,
+                               (u64)req->append_offset << SECTOR_SHIFT);
+                       WRITE_ONCE(cqe2->user_data, req->user_data);
+               }
  mutex_unlock(&ctx->uring_lock);


This seems to go fine in Kernel.
But the application will have few differences such as:

- When it submits N appends and decides to wait for all completions,
it needs to specify min_complete as 2*N (or at least 2N-1).
Two appends will produce 4 completion events, and if the application
decides to wait for both it must specify 4 (or 3).

io_uring_enter(unsigned int fd, unsigned int to_submit,
                   unsigned int min_complete, unsigned int flags,
                   sigset_t *sig);

- Completion-processing sequence for mixed workloads (a few reads + a few
appends on the same ring).
Currently there is a one-to-one relationship. The application looks at N
CQE entries, and treats each as a distinct IO completion - a for loop
does the work.
With the two-CQE scheme, extracting, from a bunch of completions, the ones
for reads (one CQE) and appends (two CQEs) makes the flow somewhat
non-linear.

Perhaps this is not too bad, but I felt it must be put here upfront.
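
For completeness, reaping under this two-CQE scheme might look like the
liburing-style sketch below (is_append() is app-level tagging; append_res
is the field proposed above; the handlers are placeholders):

static void reap_one(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	io_uring_wait_cqe(ring, &cqe);
	if (is_append(cqe->user_data)) {
		long res = cqe->res;

		io_uring_cqe_seen(ring, cqe);
		/* secondary CQE is adjacent and carries the same user_data */
		io_uring_wait_cqe(ring, &cqe);
		handle_append(res, cqe->append_res);
	} else {
		handle_result(cqe->res, cqe->flags);
	}
	io_uring_cqe_seen(ring, cqe);
}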
Kanchan Joshi July 20, 2020, 4:49 p.m. UTC | #49
On Fri, Jul 10, 2020 at 7:41 PM Kanchan Joshi <joshiiitr@gmail.com> wrote:
>
> On Fri, Jul 10, 2020 at 7:21 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Fri, Jul 10, 2020 at 02:49:32PM +0100, Christoph Hellwig wrote:
> > > On Fri, Jul 10, 2020 at 02:48:24PM +0100, Matthew Wilcox wrote:
> > > > If we're going to go the route of changing the CQE, how about:
> > > >
> > > >  struct io_uring_cqe {
> > > >          __u64   user_data;      /* sqe->data submission passed back */
> > > > -        __s32   res;            /* result code for this event */
> > > > -        __u32   flags;
> > > > +   union {
> > > > +           struct {
> > > > +                   __s32   res;            /* result code for this event */
> > > > +                   __u32   flags;
> > > > +           };
> > > > +           __s64   res64;
> > > > +   };
> > > >  };
> > > >
> > > > then we don't need to change the CQE size and it just depends on the SQE
> > > > whether the CQE for it uses res+flags or res64.
> > >
> > > How do you return a status code or short write when you just have
> > > a u64 that is needed for the offset?
> >
> > it's an s64 not a u64 so you can return a negative errno.  i didn't
> > think we allowed short writes for objects-which-have-a-pos.
>
> If we are doing this for zone-append (and not general cases), "__s64
> res64" should work -.
> 64 bits = 1 (sign) + 23 (bytes-copied: cqe->res) + 40
> (written-location: chunk_sector bytes limit)

And this is for the scheme where a single CQE is used, with its bits
refactored into "__s64 res64" instead of res/flags.

41 bits for the zone-append offset = in bytes, sufficient to cover a
chunk_sectors-sized zone
1+22 bits for the zone-append bytes-copied = can cover 4MB copied
(a single I/O is capped at 4MB in NVMe)

+/* zone-append specific flags */
+#define APPEND_OFFSET_BITS     (41)
+#define APPEND_RES_BITS                (23)
+
+/*
  * IO completion data structure (Completion Queue Entry)
  */
 struct io_uring_cqe {
-       __u64   user_data;      /* sqe->data submission passed back */
-       __s32   res;            /* result code for this event */
-       __u32   flags;
+       __u64   user_data;      /* sqe->data submission passed back */
+       union {
+               struct {
+                       __s32   res;    /* result code for this event */
+                       __u32   flags;
+               };
+               /* Alternate for zone-append */
+               struct {
+                       union {
+                               /*
+                                * kernel uses this to store append result:
+                                * most significant 23 bits to return number of
+                                * bytes or error, and least significant 41 bits
+                                * to return zone-relative offset in bytes
+                                */
+                               __s64 res64;
+                               /* for user-space ease, kernel does not use */
+                               struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+                                       __u64 append_offset : APPEND_OFFSET_BITS;
+                                       __s32 append_res : APPEND_RES_BITS;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+                                       __s32 append_res : APPEND_RES_BITS;
+                                       __u64 append_offset : APPEND_OFFSET_BITS;
+#endif
+                               } __attribute__((__packed__));
+                       };
+               };
+       };
 };
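
A sketch of the kernel-side packing for this layout (the helper name is
made up), just to show that the bitfield view and the shift/mask view
agree:

static inline __s64 append_pack_res64(__s32 res, __u64 offset)
{
	/* offset in the low 41 bits, signed result in the top 23 */
	return ((__s64)res << APPEND_OFFSET_BITS) |
	       (offset & ((1ULL << APPEND_OFFSET_BITS) - 1));
}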
Kanchan Joshi July 20, 2020, 5:02 p.m. UTC | #50
On Fri, Jul 10, 2020 at 7:13 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Fri, Jul 10, 2020 at 06:59:45PM +0530, Kanchan Joshi wrote:
> > > block doesn't work for the case of writes to files that don't have
> > > to be aligned in any way.  And that I think is the more broadly
> > > applicable use case than zone append on block devices.
> >
> > But when can it happen that we do zone-append on a file (zonefs I
> > asssume), and device returns a location (write-pointer essentially)
> > which is not in multiple of 512b?
>
> All the time.  You open a file with O_APPEND.  You write a record to
> it of any kind of size, then the next write will return the position
> it got written at, which can be anything.
I understand if this is about cached writes and we are talking about
O_APPEND in general.
But for direct block I/O writes and ZoneFS writes, the page-cache is not
used, so the write (and the zone-append result) will be aligned to the
underlying block size.
Even though this patchset uses O_APPEND, it filters out regular files and
non-zoned block devices by using the new FMODE_ZONE_APPEND flag.
Matthew Wilcox July 20, 2020, 5:14 p.m. UTC | #51
On Mon, Jul 20, 2020 at 10:19:57PM +0530, Kanchan Joshi wrote:
> On Fri, Jul 10, 2020 at 7:41 PM Kanchan Joshi <joshiiitr@gmail.com> wrote:
> > If we are doing this for zone-append (and not general cases), "__s64
> > res64" should work -.
> > 64 bits = 1 (sign) + 23 (bytes-copied: cqe->res) + 40
> > (written-location: chunk_sector bytes limit)

No, don't do this.

 struct io_uring_cqe {
	__u64   user_data;      /* sqe->data submission passed back */
-	__s32   res;            /* result code for this event */
-	__u32   flags;
+	union {
+		struct {
+			__s32   res;    /* result code for this event */
+			__u32   flags;
+		};
+		__s64		res64;
+	};
 };

Return the value in bytes in res64, or a negative errno.  Done.
			
>   * IO completion data structure (Completion Queue Entry)
>   */
>  struct io_uring_cqe {
> -       __u64   user_data;      /* sqe->data submission passed back */
> -       __s32   res;            /* result code for this event */
> -       __u32   flags;
> +       __u64   user_data;      /* sqe->data submission passed back */
> +        union {
> +                struct {
> +                        __s32   res;            /* result code for
> this event */
> +                        __u32   flags;
> +                };
> +               /* Alternate for zone-append */
> +               struct {
> +                       union {
> +                               /*
> +                                * kernel uses this to store append result
> +                                * Most significant 23 bits to return number of
> +                                * bytes or error, and least significant 41 bits
> +                                * to return zone-relative offset in bytes
> +                                * */
> +                               __s64 res64;
> +                               /*for user-space ease, kernel does not use*/
> +                               struct {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +                                       __u64 append_offset :
> APPEND_OFFSET_BITS;
> +                                       __s32 append_res : APPEND_RES_BITS;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +                                       __s32 append_res : APPEND_RES_BITS;
> +                                       __u64 append_offset :
> APPEND_OFFSET_BITS;
> +#endif
> +                               }__attribute__ ((__packed__));
> +                       };
> +                };
> +        };
>  };
> 
> -- 
> Joshi
Kanchan Joshi July 20, 2020, 8:17 p.m. UTC | #52
On Mon, Jul 20, 2020 at 10:44 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Jul 20, 2020 at 10:19:57PM +0530, Kanchan Joshi wrote:
> > On Fri, Jul 10, 2020 at 7:41 PM Kanchan Joshi <joshiiitr@gmail.com> wrote:
> > > If we are doing this for zone-append (and not general cases), "__s64
> > > res64" should work -.
> > > 64 bits = 1 (sign) + 23 (bytes-copied: cqe->res) + 40
> > > (written-location: chunk_sector bytes limit)
>
> No, don't do this.
>
>  struct io_uring_cqe {
>         __u64   user_data;      /* sqe->data submission passed back */
> -       __s32   res;            /* result code for this event */
> -       __u32   flags;
> +       union {
> +               struct {
> +                       __s32   res;    /* result code for this event */
> +                       __u32   flags;
> +               };
> +               __s64           res64;
> +       };
>  };
>
> Return the value in bytes in res64, or a negative errno.  Done.

I concur. Can do away with bytes-copied. It's either in its entirety
or not at all.
Damien Le Moal July 21, 2020, 12:59 a.m. UTC | #53
On 2020/07/21 5:17, Kanchan Joshi wrote:
> On Mon, Jul 20, 2020 at 10:44 PM Matthew Wilcox <willy@infradead.org> wrote:
>>
>> On Mon, Jul 20, 2020 at 10:19:57PM +0530, Kanchan Joshi wrote:
>>> On Fri, Jul 10, 2020 at 7:41 PM Kanchan Joshi <joshiiitr@gmail.com> wrote:
>>>> If we are doing this for zone-append (and not general cases), "__s64
>>>> res64" should work -.
>>>> 64 bits = 1 (sign) + 23 (bytes-copied: cqe->res) + 40
>>>> (written-location: chunk_sector bytes limit)
>>
>> No, don't do this.
>>
>>  struct io_uring_cqe {
>>         __u64   user_data;      /* sqe->data submission passed back */
>> -       __s32   res;            /* result code for this event */
>> -       __u32   flags;
>> +       union {
>> +               struct {
>> +                       __s32   res;    /* result code for this event */
>> +                       __u32   flags;
>> +               };
>> +               __s64           res64;
>> +       };
>>  };
>>
>> Return the value in bytes in res64, or a negative errno.  Done.
> 
> I concur. Can do away with bytes-copied. It's either in its entirety
> or not at all.
> 

SAS SMR drives may return a partial completion. So the size written may be less
than requested, but not necessarily 0, which would be an error anyway since any
condition that would lead to 0B being written will cause the drive to fail the
command with an error.

Also, the completed size should be in res in the first cqe to follow
io_uring's current interface, no? The second cqe would use the res64 field
to return the written offset. Wasn't that the plan?
Matthew Wilcox July 21, 2020, 1:15 a.m. UTC | #54
On Tue, Jul 21, 2020 at 12:59:59AM +0000, Damien Le Moal wrote:
> On 2020/07/21 5:17, Kanchan Joshi wrote:
> > On Mon, Jul 20, 2020 at 10:44 PM Matthew Wilcox <willy@infradead.org> wrote:
> >>  struct io_uring_cqe {
> >>         __u64   user_data;      /* sqe->data submission passed back */
> >> -       __s32   res;            /* result code for this event */
> >> -       __u32   flags;
> >> +       union {
> >> +               struct {
> >> +                       __s32   res;    /* result code for this event */
> >> +                       __u32   flags;
> >> +               };
> >> +               __s64           res64;
> >> +       };
> >>  };
> >>
> >> Return the value in bytes in res64, or a negative errno.  Done.
> > 
> > I concur. Can do away with bytes-copied. It's either in its entirety
> > or not at all.
> > 
> 
> SAS SMR drives may return a partial completion. So the size written may be less
> than requested, but not necessarily 0, which would be an error anyway since any
> condition that would lead to 0B being written will cause the drive to fail the
> command with an error.

Why might it return a short write?  And, given how assiduous programmers
are about checking for exceptional conditions, is it useful to tell
userspace "only the first 512 bytes of your 2kB write made it to storage"?
Or would we rather just tell userspace "you got an error" and _not_
tell them that the first N bytes made it to storage?

> Also, the completed size should be in res in the first cqe to follow io_uring
> current interface, no ?. The second cqe would use the res64 field to return the
> written offset. Wasn't that the plan ?

two cqes for one sqe seems like a bad idea to me.
Jens Axboe July 21, 2020, 1:29 a.m. UTC | #55
On 7/20/20 7:15 PM, Matthew Wilcox wrote:
>> Also, the completed size should be in res in the first cqe to follow
>> io_uring current interface, no ?. The second cqe would use the res64
>> field to return the written offset. Wasn't that the plan ?
> 
> two cqes for one sqe seems like a bad idea to me.

I have to agree with that, it's counter to everything else. The app will
then have to wait for two CQEs when it issues that one "special" SQE,
which is really iffy. And we'd have to promise that they are adjacent in
the ring. This isn't necessarily a problem right now, but I've been
playing with un-serialized completions and this would then become an
issue. The io_uring interface is clearly defined as "any sqe will either
return an error on submit (if the error is not specific to the sqe
contents), or post a completion event". Not two events, one.

And imho, zoned device append isn't an interesting enough use case to
warrant doing something special. If there was a super strong (and
generic) use case for passing back more information in the cqe then
maybe it would be considered. But it'd have to be a killer application.
If that's not the case, then the use case should work within the
constraints of the existing API.
Damien Le Moal July 21, 2020, 2:19 a.m. UTC | #56
On 2020/07/21 10:15, Matthew Wilcox wrote:
> On Tue, Jul 21, 2020 at 12:59:59AM +0000, Damien Le Moal wrote:
>> On 2020/07/21 5:17, Kanchan Joshi wrote:
>>> On Mon, Jul 20, 2020 at 10:44 PM Matthew Wilcox <willy@infradead.org> wrote:
>>>>  struct io_uring_cqe {
>>>>         __u64   user_data;      /* sqe->data submission passed back */
>>>> -       __s32   res;            /* result code for this event */
>>>> -       __u32   flags;
>>>> +       union {
>>>> +               struct {
>>>> +                       __s32   res;    /* result code for this event */
>>>> +                       __u32   flags;
>>>> +               };
>>>> +               __s64           res64;
>>>> +       };
>>>>  };
>>>>
>>>> Return the value in bytes in res64, or a negative errno.  Done.
>>>
>>> I concur. Can do away with bytes-copied. It's either in its entirety
>>> or not at all.
>>>
>>
>> SAS SMR drives may return a partial completion, so the size written may be
>> less than requested but not necessarily 0. A 0-byte result would be an error
>> anyway, since any condition that would lead to nothing being written will
>> cause the drive to fail the command with an error.
> 
> Why might it return a short write?  And, given how assiduous programmers
> are about checking for exceptional conditions, is it useful to tell
> userspace "only the first 512 bytes of your 2kB write made it to storage"?
> Or would we rather just tell userspace "you got an error" and _not_
> tell them that the first N bytes made it to storage?

If the write hits a bad sector on disk half-way through, a SAS drive may
return a short write with a non-zero residual. SATA drives will fail the
entire command, and libata will retry it. That said, if the drive fails to
remap a bad sector and returns an error to the host, it is generally an
indicator that one should go to the store to get a new drive :)

Yes, you have a good point. Returning an error for the entire write would be
fine. The typical error handling for a failed write to a zone is for the user
to first do a zone report to inspect the zone condition and WP position,
resync its view of the zone state, and restart the write in the same zone or
somewhere else. So failing the entire write is OK.
I am actually not 100% sure what the bio interface does if the "restart
remaining" of a partially failed request fails again after all retry
attempts. I think the entire BIO is failed, but I need to check. So the
high-level user would not see the partial failure, as that stays within the
SCSI layer.
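
Roughly, that recovery step (report the zone, resync the WP, retry) looks
like this with the BLKREPORTZONE ioctl; this is a sketch with error
handling trimmed:

#include <linux/blkzoned.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Report a single zone to learn its condition and write pointer before
 * deciding where to restart the failed write. */
static int report_zone(int fd, unsigned long long zone_start_sector)
{
	struct blk_zone_report *rep;
	int ret;

	rep = calloc(1, sizeof(*rep) + sizeof(struct blk_zone));
	if (!rep)
		return -1;
	rep->sector = zone_start_sector;
	rep->nr_zones = 1;

	ret = ioctl(fd, BLKREPORTZONE, rep);
	if (!ret && rep->nr_zones)
		printf("zone %llu: wp=%llu cond=%u\n",
		       (unsigned long long)rep->zones[0].start,
		       (unsigned long long)rep->zones[0].wp,
		       rep->zones[0].cond);
	free(rep);
	return ret;
}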

>> Also, the completed size should be in res in the first cqe to follow
>> io_uring's current interface, no? The second cqe would use the res64 field
>> to return the written offset. Wasn't that the plan?
> 
> Two cqes for one sqe seems like a bad idea to me.

Yes, this is not very nice. I got lost in the thread. I thought that was the plan.
Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 155f3d8..cbde4df 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -402,6 +402,8 @@  struct io_rw {
 	struct kiocb			kiocb;
 	u64				addr;
 	u64				len;
+	/* zone-relative offset for append, in sectors */
+	u32			append_offset;
 };
 
 struct io_connect {
@@ -541,6 +543,7 @@  enum {
 	REQ_F_NO_FILE_TABLE_BIT,
 	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
+	REQ_F_ZONE_APPEND_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -598,6 +601,8 @@  enum {
 	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
+	/* to return zone-relative offset for zone append */
+	REQ_F_ZONE_APPEND	= BIT(REQ_F_ZONE_APPEND_BIT),
 };
 
 struct async_poll {
@@ -1745,6 +1750,8 @@  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			cflags = io_put_kbuf(req);
+		if (req->flags & REQ_F_ZONE_APPEND)
+			cflags = req->rw.append_offset;
 
 		__io_cqring_fill_event(req, req->result, cflags);
 		(*nr_events)++;
@@ -1943,7 +1950,7 @@  static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -1955,6 +1962,10 @@  static void io_complete_rw_common(struct kiocb *kiocb, long res)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_kbuf(req);
+	/* use cflags to return zone append completion result */
+	if (req->flags & REQ_F_ZONE_APPEND)
+		cflags = res2;
+
 	__io_cqring_add_event(req, res, cflags);
 }
 
@@ -1962,7 +1973,7 @@  static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	io_complete_rw_common(kiocb, res);
+	io_complete_rw_common(kiocb, res, res2);
 	io_put_req(req);
 }
 
@@ -1975,6 +1986,9 @@  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 
 	if (res != req->result)
 		req_set_fail_links(req);
+	if (req->flags & REQ_F_ZONE_APPEND)
+		req->rw.append_offset = res2;
+
 	req->result = res;
 	if (res != -EAGAIN)
 		WRITE_ONCE(req->iopoll_completed, 1);
@@ -2739,6 +2753,9 @@  static int io_write(struct io_kiocb *req, bool force_nonblock)
 						SB_FREEZE_WRITE);
 		}
 		kiocb->ki_flags |= IOCB_WRITE;
+		/* zone-append requires a few extra steps during completion */
+		if (kiocb->ki_flags & IOCB_ZONE_APPEND)
+			req->flags |= REQ_F_ZONE_APPEND;
 
 		if (!force_nonblock)
 			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
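
As posted, the completion path stores the zone-relative append offset (in
512B sectors) in cqe->flags. A consumer that already knows the zone start,
e.g. from a prior zone report, could compute the absolute location along
these lines (sketch; the helper name is made up):

/* Convert the zone-relative offset returned in cqe->flags into an
 * absolute byte offset; zone_start_sector is assumed to come from a
 * prior zone report. */
static inline unsigned long long
append_location(unsigned long long zone_start_sector, unsigned int cqe_flags)
{
	return (zone_start_sector + cqe_flags) << 9; /* sectors -> bytes */
}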