[2/2] scsi-disk: Add native FUA support

Message ID 20250304155232.1325581-3-afaria@redhat.com (mailing list archive)
State New, archived
Series scsi-disk: Add FUA write support

Commit Message

Alberto Faria March 4, 2025, 3:52 p.m. UTC
Avoid emulating FUA when the driver supports it natively. This should
provide better performance than a full flush after the write.

Signed-off-by: Alberto Faria <afaria@redhat.com>
---
 hw/scsi/scsi-disk.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

Comments

Kevin Wolf March 6, 2025, 10:33 a.m. UTC | #1
On 04.03.2025 at 16:52, Alberto Faria wrote:
> Avoid emulating FUA when the driver supports it natively. This should
> provide better performance than a full flush after the write.
> 
> Signed-off-by: Alberto Faria <afaria@redhat.com>

Did you try it out to see whether there are performance improvements
in practice? It's always nice to have numbers in the commit message for
patches that promise performance improvements.

>  hw/scsi/scsi-disk.c | 17 +++++++++++------
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 8cf50845ab..ce48e20ee6 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -43,6 +43,7 @@
>  #include "qemu/cutils.h"
>  #include "trace.h"
>  #include "qom/object.h"
> +#include "block/block_int-common.h"
>  
>  #ifdef __linux
>  #include <scsi/sg.h>
> @@ -75,7 +76,7 @@ struct SCSIDiskClass {
>       */
>      DMAIOFunc       *dma_readv;
>      DMAIOFunc       *dma_writev;
> -    bool            (*need_fua_emulation)(SCSICommand *cmd);
> +    bool            (*need_fua)(SCSICommand *cmd);
>      void            (*update_sense)(SCSIRequest *r);
>  };
>  
> @@ -86,6 +87,7 @@ typedef struct SCSIDiskReq {
>      uint32_t sector_count;
>      uint32_t buflen;
>      bool started;
> +    bool need_fua;
>      bool need_fua_emulation;
>      struct iovec iov;
>      QEMUIOVector qiov;
> @@ -553,7 +555,7 @@ static void scsi_read_data(SCSIRequest *req)
>  
>      first = !r->started;
>      r->started = true;
> -    if (first && r->need_fua_emulation) {
> +    if (first && r->need_fua) {
>          block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
>                           BLOCK_ACCT_FLUSH);
>          r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
> @@ -2384,7 +2386,9 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
>          scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
>          return 0;
>      }
> -    r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
> +    r->need_fua = sdc->need_fua(&r->req.cmd);
> +    r->need_fua_emulation = r->need_fua &&
> +        (blk_bs(s->qdev.conf.blk)->supported_write_flags & BDRV_REQ_FUA) == 0;

You can just use BDRV_REQ_FUA unconditionally. If the driver doesn't
support it directly, the block layer already emulates it internally. We
don't have to duplicate this here. If scsi_write_data() does a flush
directly for VERIFY (like scsi_read_data() already does),
scsi_write_do_fua() can go away completely.

However, we can only apply this to write requests. For reads we still
need to know that FUA has to be emulated: scsi_read_data() issues a
flush for FUA requests, and your patch would break that whenever writes
support BDRV_REQ_FUA.
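
Something like this untested sketch is what I have in mind for the
write path (reusing the r->need_fua field from your patch):

/* Untested sketch: always pass BDRV_REQ_FUA for FUA writes. The block
 * layer falls back to write + flush internally when the driver has no
 * native FUA support, so scsi-disk doesn't need to check
 * supported_write_flags at all. */
BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
                            BlockCompletionFunc *cb, void *cb_opaque,
                            void *opaque)
{
    SCSIDiskReq *r = opaque;
    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
    int flags = r->need_fua ? BDRV_REQ_FUA : 0;

    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags,
                           cb, cb_opaque);
}

r->need_fua would then still make scsi_read_data() issue its explicit
flush for FUA reads.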

Kevin
Kevin Wolf March 25, 2025, 12:48 p.m. UTC | #2
On 06.03.2025 at 11:33, Kevin Wolf wrote:
> On 04.03.2025 at 16:52, Alberto Faria wrote:
> > Avoid emulating FUA when the driver supports it natively. This should
> > provide better performance than a full flush after the write.
> > 
> > Signed-off-by: Alberto Faria <afaria@redhat.com>
> 
> Did you try it out to see whether there are performance improvements
> in practice? It's always nice to have numbers in the commit message for
> patches that promise performance improvements.

I was curious enough to see how this and the recent series by Stefan
(virtio-scsi multiqueue) and myself (FUA on the backend + polling
improvements) play out with virtio-scsi, so I just ran some fio
benchmarks with sync=1 to compare:

iops bs=4k cache=none           |    virtio-scsi    |     virtio-blk    |
O_SYNC workload                 |   qd 1  |  qd 16  |   qd 1  |  qd 16  |
--------------------------------+---------+---------+---------+---------+
master                          |   21296 |  109747 |   25762 |  130576 |
+ virtio-scsi multiqueue        |   28798 |  121170 |       - |       - |
+ FUA in scsi-disk              |   51893 |  204199 |       - |       - |
--------------------------------+---------+---------+---------+---------+
Total change                    | +143.7% |  +86.1% |       - |       - |

(No new numbers for virtio-blk because virtio-scsi patches obviously
don't change anything about it. Also no numbers for FUA in file-posix
because it's unused with cache=none.)
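
For reference, the fio jobs were along these lines (from memory, so
treat the device path, runtime and ioengine as illustrative; only
bs=4k, sync=1 and the iodepth values are known to match the tables):

# qd 1 configuration; the qd 16 columns use --iodepth=16 instead
fio --name=sync-writes --filename=/dev/sdb --direct=1 \
    --ioengine=libaio --rw=randwrite --bs=4k --sync=1 \
    --iodepth=1 --runtime=30 --time_based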

iops bs=4k cache=directsync     |    virtio-scsi    |     virtio-blk    |
O_SYNC workload                 |   qd 1  |  qd 16  |   qd 1  |  qd 16  |
--------------------------------+---------+---------+---------+---------+
master                          |   32223 |  109748 |   45583 |  258416 |
+ FUA in file-posix + polling   |   32148 |  198665 |   58601 |  320190 |
+ virtio-scsi multiqueue        |   51739 |  225031 |       - |       - |
+ FUA in scsi-disk              |   56061 |  227535 |       - |       - |
--------------------------------+---------+---------+---------+---------+
Total change                    |  +74.0% | +107.3% |  +28.6% |  +23.9% |

Of course, the huge improvements on the virtio-scsi side only show how
bad it was before. On most numbers it is still behind virtio-blk even
after all three patch series (apart from cache=none, where the
availability of FUA on the device side makes a big difference; I expect
virtio-blk to improve similarly once we implement it there).

Also note that when testing the virtio-scsi multiqueue patches, this
was still a single iothread, i.e. I wasn't even making use of the new
feature per se. I assume much of the gain comes from enabling polling:
the series moved the event queue handling to the main loop, and having
it in the iothread is what had prevented polling for virtio-scsi
before. The series also got rid of an extra coroutine per request for
the blk_is_available() call in virtio_scsi_ctx_check(), which might
play a role, too.

Anyway, I like these numbers for FUA in scsi-disk. It makes write back
cache modes almost catch up to write through with O_SYNC workloads. We
should definitely get this merged and do the same for virtio-blk.

Kevin
Stefan Hajnoczi March 25, 2025, 3:54 p.m. UTC | #3
On Tue, Mar 25, 2025 at 01:48:35PM +0100, Kevin Wolf wrote:
> [...]
> Anyway, I like these numbers for FUA in scsi-disk. It makes write back
> cache modes almost catch up to write through with O_SYNC workloads. We
> should definitely get this merged and do the same for virtio-blk.

Thanks for sharing! Nice IOPS improvements across the board.

Stefan

Patch

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 8cf50845ab..ce48e20ee6 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -43,6 +43,7 @@ 
 #include "qemu/cutils.h"
 #include "trace.h"
 #include "qom/object.h"
+#include "block/block_int-common.h"
 
 #ifdef __linux
 #include <scsi/sg.h>
@@ -75,7 +76,7 @@  struct SCSIDiskClass {
      */
     DMAIOFunc       *dma_readv;
     DMAIOFunc       *dma_writev;
-    bool            (*need_fua_emulation)(SCSICommand *cmd);
+    bool            (*need_fua)(SCSICommand *cmd);
     void            (*update_sense)(SCSIRequest *r);
 };
 
@@ -86,6 +87,7 @@  typedef struct SCSIDiskReq {
     uint32_t sector_count;
     uint32_t buflen;
     bool started;
+    bool need_fua;
     bool need_fua_emulation;
     struct iovec iov;
     QEMUIOVector qiov;
@@ -553,7 +555,7 @@  static void scsi_read_data(SCSIRequest *req)
 
     first = !r->started;
     r->started = true;
-    if (first && r->need_fua_emulation) {
+    if (first && r->need_fua) {
         block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0,
                          BLOCK_ACCT_FLUSH);
         r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r);
@@ -2384,7 +2386,9 @@  static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf)
         scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE));
         return 0;
     }
-    r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd);
+    r->need_fua = sdc->need_fua(&r->req.cmd);
+    r->need_fua_emulation = r->need_fua &&
+        (blk_bs(s->qdev.conf.blk)->supported_write_flags & BDRV_REQ_FUA) == 0;
     if (r->sector_count == 0) {
         scsi_req_complete(&r->req, GOOD);
     }
@@ -3134,7 +3138,8 @@  BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov,
 {
     SCSIDiskReq *r = opaque;
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
-    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque);
+    int flags = r->need_fua && !r->need_fua_emulation ? BDRV_REQ_FUA : 0;
+    return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, flags, cb, cb_opaque);
 }
 
 static char *scsi_property_get_loadparm(Object *obj, Error **errp)
@@ -3183,7 +3188,7 @@  static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data)
     device_class_set_legacy_reset(dc, scsi_disk_reset);
     sdc->dma_readv = scsi_dma_readv;
     sdc->dma_writev = scsi_dma_writev;
-    sdc->need_fua_emulation = scsi_is_cmd_fua;
+    sdc->need_fua = scsi_is_cmd_fua;
 }
 
 static const TypeInfo scsi_disk_base_info = {
@@ -3335,7 +3340,7 @@  static void scsi_block_class_initfn(ObjectClass *klass, void *data)
     sdc->dma_readv   = scsi_block_dma_readv;
     sdc->dma_writev  = scsi_block_dma_writev;
     sdc->update_sense = scsi_block_update_sense;
-    sdc->need_fua_emulation = scsi_block_no_fua;
+    sdc->need_fua = scsi_block_no_fua;
     dc->desc = "SCSI block device passthrough";
     device_class_set_props(dc, scsi_block_properties);
     dc->vmsd  = &vmstate_scsi_disk_state;