diff mbox series

[5/5] iomap: support IOCB_DIO_DEFER

Message ID 20230718194920.1472184-7-axboe@kernel.dk (mailing list archive)
State Superseded
Headers show
Series Improve async iomap DIO performance | expand

Commit Message

Jens Axboe July 18, 2023, 7:49 p.m. UTC
If IOCB_DIO_DEFER is set, utilize that to set kiocb->dio_complete handler
and data for that callback. Rather than punt the completion to a
workqueue, we pass back the handler and data to the issuer and will get a
callback from a safe task context.

Using the following fio job to randomly dio write 4k blocks at
queue depths of 1..16:

fio --name=dio-write --filename=/data1/file --time_based=1 \
--runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
--cpus_allowed=4 --ioengine=io_uring --iodepth=16

shows the following results before and after this patch:

	Stock	Patched		Diff
=======================================
QD1	155K	162K		+ 4.5%
QD2	290K	313K		+ 7.9%
QD4	533K	597K		+12.0%
QD8	604K	827K		+36.9%
QD16	615K	845K		+37.4%

which shows nice wins all around. If we factored in per-IOP efficiency,
the wins look even nicer. This becomes apparent as queue depth rises,
as the offloaded workqueue completions run out of steam.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/iomap/direct-io.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

Comments

Dave Chinner July 18, 2023, 11:50 p.m. UTC | #1
On Tue, Jul 18, 2023 at 01:49:20PM -0600, Jens Axboe wrote:
> If IOCB_DIO_DEFER is set, utilize that to set kiocb->dio_complete handler
> and data for that callback. Rather than punt the completion to a
> workqueue, we pass back the handler and data to the issuer and will get a
> callback from a safe task context.
> 
> Using the following fio job to randomly dio write 4k blocks at
> queue depths of 1..16:
> 
> fio --name=dio-write --filename=/data1/file --time_based=1 \
> --runtime=10 --bs=4096 --rw=randwrite --norandommap --buffered=0 \
> --cpus_allowed=4 --ioengine=io_uring --iodepth=16
> 
> shows the following results before and after this patch:
> 
> 	Stock	Patched		Diff
> =======================================
> QD1	155K	162K		+ 4.5%
> QD2	290K	313K		+ 7.9%
> QD4	533K	597K		+12.0%
> QD8	604K	827K		+36.9%
> QD16	615K	845K		+37.4%

Nice.

> which shows nice wins all around. If we factored in per-IOP efficiency,
> the wins look even nicer. This becomes apparent as queue depth rises,
> as the offloaded workqueue completions runs out of steam.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  fs/iomap/direct-io.c | 24 ++++++++++++++++++++++++
>  1 file changed, 24 insertions(+)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 92b9b9db8b67..ed615177e1f6 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -131,6 +131,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_complete);
>  
> +static ssize_t iomap_dio_deferred_complete(void *data)
> +{
> +	return iomap_dio_complete(data);
> +}
> +
>  static void iomap_dio_complete_work(struct work_struct *work)
>  {
>  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> @@ -167,6 +172,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
>  		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
>  			WRITE_ONCE(dio->iocb->private, NULL);
>  			iomap_dio_complete_work(&dio->aio.work);
> +		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) &&
> +			   (iocb->ki_flags & IOCB_DIO_DEFER)) {
> +			/* only polled IO cares about private cleared */
> +			iocb->private = dio;
> +			iocb->dio_complete = iomap_dio_deferred_complete;
> +			/*
> +			 * Invoke ->ki_complete() directly. We've assigned
> +			 * out dio_complete callback handler, and since the
> +			 * issuer set IOCB_DIO_DEFER, we know their
> +			 * ki_complete handler will notice ->dio_complete
> +			 * being set and will defer calling that handler
> +			 * until it can be done from a safe task context.
> +			 *
> +			 * Note that the 'res' being passed in here is
> +			 * not important for this case. The actual completion
> +			 * value of the request will be gotten from dio_complete
> +			 * when that is run by the issuer.
> +			 */
> +			iocb->ki_complete(iocb, 0);
>  		} else {
>  			struct inode *inode = file_inode(iocb->ki_filp);
>  

Hmmm. No problems with the change, but all the special cases are
making the completion function a bit of a mess.

Given that all read DIOs use inline completions, we can largely
simplify the completion down to just looking at
dio->wait_for_completion and IOMAP_DIO_COMPLETE_INLINE, and not
caring about what type of IO is being completed at all.

Hence I think that at the end of this series, the completion
function should look something like this:

void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	struct kiocb *iocb = dio->iocb;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
	ssize_t result = 0;

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (!atomic_dec_and_test(&dio->ref))
		goto release_bio;

	/* Synchronous IO completion. */
	if (dio->wait_for_completion) {
		struct task_struct *waiter = dio->submit.waiter;
		WRITE_ONCE(dio->submit.waiter, NULL);
		blk_wake_io_task(waiter);
		goto release_bio;
	}

	/*
	 * Async DIO completion that requires filesystem level
	 * completion work gets punted to a work queue to complete
	 * as the operation may require more IO to be issued to
	 * finalise filesystem metadata changes or guarantee data
	 * integrity.
	 */
	if (!(dio->flags & IOMAP_DIO_COMPLETE_INLINE)) {
		struct inode *inode = file_inode(iocb->ki_filp);

		WRITE_ONCE(iocb->private, NULL);
		INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
		queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		goto release_bio;
	}

	/*
	 * Inline completion for async DIO.
	 *
	 * If the IO submitter is running DIO completions directly
	 * itself, set up the callback it needs. The value we pass
	 * to .ki_complete in this case does not matter, the defered
	 * completion will pull the result from the completion
	 * callback we provide.
	 *
	 * Otherwise, run the dio completion directly, then pass the
	 * result to the iocb completion function to finish the IO.
	 */
	if (iocb->ki_flags & IOCB_DEFER_DIO) {
		WRITE_ONCE(iocb->private, dio);
		iocb->dio_complete = iomap_dio_deferred_complete;
	} else {
		WRITE_ONCE(dio->iocb->private, NULL);
		result = iomap_dio_complete(dio);
	}
	iocb->ki_complete(iocb, result);

release_bio:
	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

-Dave.
Jens Axboe July 19, 2023, 7:55 p.m. UTC | #2
On 7/18/23 5:50 PM, Dave Chinner wrote:
>> @@ -167,6 +172,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
>>  		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
>>  			WRITE_ONCE(dio->iocb->private, NULL);
>>  			iomap_dio_complete_work(&dio->aio.work);
>> +		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) &&
>> +			   (iocb->ki_flags & IOCB_DIO_DEFER)) {
>> +			/* only polled IO cares about private cleared */
>> +			iocb->private = dio;
>> +			iocb->dio_complete = iomap_dio_deferred_complete;
>> +			/*
>> +			 * Invoke ->ki_complete() directly. We've assigned
>> +			 * out dio_complete callback handler, and since the
>> +			 * issuer set IOCB_DIO_DEFER, we know their
>> +			 * ki_complete handler will notice ->dio_complete
>> +			 * being set and will defer calling that handler
>> +			 * until it can be done from a safe task context.
>> +			 *
>> +			 * Note that the 'res' being passed in here is
>> +			 * not important for this case. The actual completion
>> +			 * value of the request will be gotten from dio_complete
>> +			 * when that is run by the issuer.
>> +			 */
>> +			iocb->ki_complete(iocb, 0);
>>  		} else {
>>  			struct inode *inode = file_inode(iocb->ki_filp);
>>  
> 
> Hmmm. No problems with the change, but all the special cases is
> making the completion function a bit of a mess.
> 
> Given that all read DIOs use inline completions, we can largely
> simplify the completion down to just looking at
> dio->wait_for_completion and IOMAP_DIO_COMPLETE_INLINE, and not
> caring about what type of IO is being completed at all.
> 
> Hence I think that at the end of this series, the completion
> function should look something like this:

I took inspiration from this as I think it's a good idea, and did a few
cleanups and introduced things like the above as we go. It's in v3 I
just posted.
diff mbox series

Patch

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 92b9b9db8b67..ed615177e1f6 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -131,6 +131,11 @@  ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+	return iomap_dio_complete(data);
+}
+
 static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -167,6 +172,25 @@  void iomap_dio_bio_end_io(struct bio *bio)
 		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) && in_task()) {
 			WRITE_ONCE(dio->iocb->private, NULL);
 			iomap_dio_complete_work(&dio->aio.work);
+		} else if ((dio->flags & IOMAP_DIO_INLINE_COMP) &&
+			   (iocb->ki_flags & IOCB_DIO_DEFER)) {
+			/* only polled IO cares about private cleared */
+			iocb->private = dio;
+			iocb->dio_complete = iomap_dio_deferred_complete;
+			/*
+			 * Invoke ->ki_complete() directly. We've assigned
+			 * our dio_complete callback handler, and since the
+			 * issuer set IOCB_DIO_DEFER, we know their
+			 * ki_complete handler will notice ->dio_complete
+			 * being set and will defer calling that handler
+			 * until it can be done from a safe task context.
+			 *
+			 * Note that the 'res' being passed in here is
+			 * not important for this case. The actual completion
+			 * value of the request will be gotten from dio_complete
+			 * when that is run by the issuer.
+			 */
+			iocb->ki_complete(iocb, 0);
 		} else {
 			struct inode *inode = file_inode(iocb->ki_filp);