diff mbox series

[v2,2/4] virtio-scsi: default num_queues to -smp N

Message ID 20200124100159.736209-3-stefanha@redhat.com (mailing list archive)
State New, archived
Headers show
Series virtio-pci: enable blk and scsi multi-queue by default | expand

Commit Message

Stefan Hajnoczi Jan. 24, 2020, 10:01 a.m. UTC
Automatically size the number of request virtqueues to match the number
of vCPUs.  This ensures that completion interrupts are handled on the
same vCPU that submitted the request.  No IPI is necessary to complete
an I/O request and performance is improved.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/core/machine.c               |  3 +++
 hw/scsi/vhost-scsi.c            |  3 ++-
 hw/scsi/vhost-user-scsi.c       |  3 ++-
 hw/scsi/virtio-scsi.c           |  6 +++++-
 hw/virtio/vhost-scsi-pci.c      | 10 ++++++++--
 hw/virtio/vhost-user-scsi-pci.c | 10 ++++++++--
 hw/virtio/virtio-scsi-pci.c     | 10 ++++++++--
 include/hw/virtio/virtio-scsi.h |  2 ++
 8 files changed, 38 insertions(+), 9 deletions(-)

Comments

Cornelia Huck Jan. 27, 2020, 1:10 p.m. UTC | #1
On Fri, 24 Jan 2020 10:01:57 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> Automatically size the number of request virtqueues to match the number

"If the pci transport is used, ..." ?

> of vCPUs.  This ensures that completion interrupts are handled on the
> same vCPU that submitted the request.  No IPI is necessary to complete
> an I/O request and performance is improved.

"For other transports, the number of request queues continues to
default to 1." ?

> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  hw/core/machine.c               |  3 +++
>  hw/scsi/vhost-scsi.c            |  3 ++-
>  hw/scsi/vhost-user-scsi.c       |  3 ++-
>  hw/scsi/virtio-scsi.c           |  6 +++++-
>  hw/virtio/vhost-scsi-pci.c      | 10 ++++++++--
>  hw/virtio/vhost-user-scsi-pci.c | 10 ++++++++--
>  hw/virtio/virtio-scsi-pci.c     | 10 ++++++++--
>  include/hw/virtio/virtio-scsi.h |  2 ++
>  8 files changed, 38 insertions(+), 9 deletions(-)
> 
(...)
> diff --git a/hw/virtio/vhost-scsi-pci.c b/hw/virtio/vhost-scsi-pci.c
> index e8dfbfc60f..38a8f0c3ef 100644
> --- a/hw/virtio/vhost-scsi-pci.c
> +++ b/hw/virtio/vhost-scsi-pci.c
> @@ -17,6 +17,7 @@
>  #include "qemu/osdep.h"
>  
>  #include "standard-headers/linux/virtio_pci.h"
> +#include "hw/boards.h"
>  #include "hw/qdev-properties.h"
>  #include "hw/virtio/vhost-scsi.h"
>  #include "qapi/error.h"
> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
>  {
>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
>      DeviceState *vdev = DEVICE(&dev->vdev);
> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> +
> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> +        conf->num_queues = current_machine->smp.cpus;

This now maps the request vqs 1:1 to the vcpus. What about the fixed
vqs? If they don't really matter, amend the comment to explain that?

> +    }
>  
>      if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
> -        vpci_dev->nvectors = vs->conf.num_queues + 3;
> +        vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
>      }
>  
>      qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
Stefan Hajnoczi Jan. 29, 2020, 3:44 p.m. UTC | #2
On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> On Fri, 24 Jan 2020 10:01:57 +0000
> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> >  {
> >      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> >      DeviceState *vdev = DEVICE(&dev->vdev);
> > -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > +
> > +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > +        conf->num_queues = current_machine->smp.cpus;
> 
> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> vqs? If they don't really matter, amend the comment to explain that?

The fixed vqs don't matter.  They are typically not involved in the data
path, only the control path where performance doesn't matter.

Stefan
Paolo Bonzini Jan. 30, 2020, 12:29 a.m. UTC | #3
On 29/01/20 16:44, Stefan Hajnoczi wrote:
> On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
>> On Fri, 24 Jan 2020 10:01:57 +0000
>> Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
>>>  {
>>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
>>>      DeviceState *vdev = DEVICE(&dev->vdev);
>>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
>>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
>>> +
>>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
>>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
>>> +        conf->num_queues = current_machine->smp.cpus;
>> This now maps the request vqs 1:1 to the vcpus. What about the fixed
>> vqs? If they don't really matter, amend the comment to explain that?
> The fixed vqs don't matter.  They are typically not involved in the data
> path, only the control path where performance doesn't matter.

Should we put a limit on the number of vCPUs?  For anything above ~128
the guest is probably not going to be disk or network bound.

Paolo
Stefan Hajnoczi Jan. 30, 2020, 10:52 a.m. UTC | #4
On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> >> On Fri, 24 Jan 2020 10:01:57 +0000
> >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> >>>  {
> >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> >>> +
> >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> >>> +        conf->num_queues = current_machine->smp.cpus;
> >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> >> vqs? If they don't really matter, amend the comment to explain that?
> > The fixed vqs don't matter.  They are typically not involved in the data
> > path, only the control path where performance doesn't matter.
> 
> Should we put a limit on the number of vCPUs?  For anything above ~128
> the guest is probably not going to be disk or network bound.

Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
(1024).  We need to at least stay under that limit.

Should the guest have >128 virtqueues?  Each virtqueue requires guest
RAM and 2 host eventfds.  Eventually these resource requirements will
become a scalability problem, but how do we choose a hard limit and what
happens to guest performance above that limit?

Stefan
Cornelia Huck Jan. 30, 2020, 11:03 a.m. UTC | #5
On Thu, 30 Jan 2020 10:52:35 +0000
Stefan Hajnoczi <stefanha@redhat.com> wrote:

> On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > On 29/01/20 16:44, Stefan Hajnoczi wrote:  
> > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:  
> > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:  
> > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > >>>  {
> > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > >>> +
> > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > >>> +        conf->num_queues = current_machine->smp.cpus;  
> > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > >> vqs? If they don't really matter, amend the comment to explain that?  
> > > The fixed vqs don't matter.  They are typically not involved in the data
> > > path, only the control path where performance doesn't matter.  
> > 
> > Should we put a limit on the number of vCPUs?  For anything above ~128
> > the guest is probably not going to be disk or network bound.  
> 
> Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> (1024).  We need to at least stay under that limit.
> 
> Should the guest have >128 virtqueues?  Each virtqueue requires guest
> RAM and 2 host eventfds.  Eventually these resource requirements will
> become a scalability problem, but how do we choose a hard limit and what
> happens to guest performance above that limit?

There are probably two kinds of limits involved here:

- a hard limit (we cannot do more), which should be checked even for
  user-specified values, and
- a soft limit (it does not make sense to go beyond this for the
  default case), which can be overridden if explicitly specified.

VIRTIO_QUEUE_MAX (and two less for virtio-scsi) sounds like a hard
limit, maybe 128 is a reasonable candidate for a soft limit.

(I would expect systems that give 128 vcpus to the guest to also be
generously sized in other respects.)
Sergio Lopez Feb. 3, 2020, 10:25 a.m. UTC | #6
On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > >>>  {
> > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > >>> +
> > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > >>> +        conf->num_queues = current_machine->smp.cpus;
> > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > >> vqs? If they don't really matter, amend the comment to explain that?
> > > The fixed vqs don't matter.  They are typically not involved in the data
> > > path, only the control path where performance doesn't matter.
> > 
> > Should we put a limit on the number of vCPUs?  For anything above ~128
> > the guest is probably not going to be disk or network bound.
> 
> Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> (1024).  We need to at least stay under that limit.
> 
> Should the guest have >128 virtqueues?  Each virtqueue requires guest
> RAM and 2 host eventfds.  Eventually these resource requirements will
> become a scalability problem, but how do we choose a hard limit and what
> happens to guest performance above that limit?

From the UX perspective, I think it's safer to use a rather low upper
limit for the automatic configuration.

Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
already facing the need of manually tuning (or relying on a software
to do that for them) other aspects of it, like vNUMA, IOThreads and
CPU pinning, so I don't think we should focus on this group.

On the other hand, the increase in host resource requirements may have
unforeseen consequences in some environments, especially for virtio-blk
users with multiple disks.

All in all, I don't have data that would justify setting the limit to
one value or the other. The only argument I can put on the table is
that, so far, we only had one VQ per device, so perhaps a conservative
value (4? 8?) would make sense from a safety and compatibility point
of view.

Thanks,
Sergio.
Michael S. Tsirkin Feb. 3, 2020, 10:35 a.m. UTC | #7
On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > >>>  {
> > > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > > >>> +
> > > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > > >>> +        conf->num_queues = current_machine->smp.cpus;
> > > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > > >> vqs? If they don't really matter, amend the comment to explain that?
> > > > The fixed vqs don't matter.  They are typically not involved in the data
> > > > path, only the control path where performance doesn't matter.
> > > 
> > > Should we put a limit on the number of vCPUs?  For anything above ~128
> > > the guest is probably not going to be disk or network bound.
> > 
> > Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> > (1024).  We need to at least stay under that limit.
> > 
> > Should the guest have >128 virtqueues?  Each virtqueue requires guest
> > RAM and 2 host eventfds.  Eventually these resource requirements will
> > become a scalability problem, but how do we choose a hard limit and what
> > happens to guest performance above that limit?
> 
> From the UX perspective, I think it's safer to use a rather low upper
> limit for the automatic configuration.
> 
> Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
> already facing the need of manually tuning (or relying on a software
> to do that for them) other aspects of it, like vNUMA, IOThreads and
> CPU pinning, so I don't think we should focus on this group.
> 
> On the other hand, the increase in host resource requirements may have
> unforeseen in some environments, specially to virtio-blk users with
> multiple disks.
> 
> All in all, I don't have data that would justify setting the limit to
> one value or the other. The only argument I can put on the table is
> that, so far, we only had one VQ per device, so perhaps a conservative
> value (4? 8?) would make sense from a safety and compatibility point
> of view.
> 
> Thanks,
> Sergio.
> 

A bit more testing with different vcpu values can't hurt here ...
Stefan?
Cornelia Huck Feb. 3, 2020, 10:51 a.m. UTC | #8
On Mon, 3 Feb 2020 11:25:29 +0100
Sergio Lopez <slp@redhat.com> wrote:

> On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:  
> > > On 29/01/20 16:44, Stefan Hajnoczi wrote:  
> > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:  
> > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:  
> > > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > >>>  {
> > > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > > >>> +
> > > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > > >>> +        conf->num_queues = current_machine->smp.cpus;  
> > > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > > >> vqs? If they don't really matter, amend the comment to explain that?  
> > > > The fixed vqs don't matter.  They are typically not involved in the data
> > > > path, only the control path where performance doesn't matter.  
> > > 
> > > Should we put a limit on the number of vCPUs?  For anything above ~128
> > > the guest is probably not going to be disk or network bound.  
> > 
> > Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> > (1024).  We need to at least stay under that limit.
> > 
> > Should the guest have >128 virtqueues?  Each virtqueue requires guest
> > RAM and 2 host eventfds.  Eventually these resource requirements will
> > become a scalability problem, but how do we choose a hard limit and what
> > happens to guest performance above that limit?  
> 
> From the UX perspective, I think it's safer to use a rather low upper
> limit for the automatic configuration.
> 
> Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
> already facing the need of manually tuning (or relying on a software
> to do that for them) other aspects of it, like vNUMA, IOThreads and
> CPU pinning, so I don't think we should focus on this group.
> 
> On the other hand, the increase in host resource requirements may have
> unforeseen in some environments, specially to virtio-blk users with
> multiple disks.

Yes... what happens on systems that have both a lot of vcpus and a lot
of disks? We don't know how many other disks are there in the
configuration, and they might be hotplugged later, anyway.

> 
> All in all, I don't have data that would justify setting the limit to
> one value or the other. The only argument I can put on the table is
> that, so far, we only had one VQ per device, so perhaps a conservative
> value (4? 8?) would make sense from a safety and compatibility point
> of view.

The more I think about it, the more I agree. Aiming a bit lower will
hopefully give more performance with less opportunity for unforeseen
breakage due to resource exhaustion.
Daniel P. Berrangé Feb. 3, 2020, 10:57 a.m. UTC | #9
On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > >>>  {
> > > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > > >>> +
> > > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > > >>> +        conf->num_queues = current_machine->smp.cpus;
> > > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > > >> vqs? If they don't really matter, amend the comment to explain that?
> > > > The fixed vqs don't matter.  They are typically not involved in the data
> > > > path, only the control path where performance doesn't matter.
> > > 
> > > Should we put a limit on the number of vCPUs?  For anything above ~128
> > > the guest is probably not going to be disk or network bound.
> > 
> > Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> > (1024).  We need to at least stay under that limit.
> > 
> > Should the guest have >128 virtqueues?  Each virtqueue requires guest
> > RAM and 2 host eventfds.  Eventually these resource requirements will
> > become a scalability problem, but how do we choose a hard limit and what
> > happens to guest performance above that limit?
> 
> From the UX perspective, I think it's safer to use a rather low upper
> limit for the automatic configuration.
> 
> Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
> already facing the need of manually tuning (or relying on a software
> to do that for them) other aspects of it, like vNUMA, IOThreads and
> CPU pinning, so I don't think we should focus on this group.

Whether they're tuning manually, or relying on software to tune for
them, we (QEMU maintainers) still need to provide credible guidance
on what to do with tuning for large CPU counts. Without clear info
from QEMU, it just descends into hearsay and guesswork, both of which
approaches leave QEMU looking bad.

So I think we need to, at the very least, make a clear statement here
about what tuning approach should be applied when the vCPU count gets
high, and probably even apply that as a default out of the box approach.

Regards,
Daniel
Sergio Lopez Feb. 3, 2020, 11:39 a.m. UTC | #10
On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > > >>>  {
> > > > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > > > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > > > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > > > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > > > >>> +
> > > > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > > > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > > > >>> +        conf->num_queues = current_machine->smp.cpus;
> > > > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > > > >> vqs? If they don't really matter, amend the comment to explain that?
> > > > > The fixed vqs don't matter.  They are typically not involved in the data
> > > > > path, only the control path where performance doesn't matter.
> > > > 
> > > > Should we put a limit on the number of vCPUs?  For anything above ~128
> > > > the guest is probably not going to be disk or network bound.
> > > 
> > > Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> > > (1024).  We need to at least stay under that limit.
> > > 
> > > Should the guest have >128 virtqueues?  Each virtqueue requires guest
> > > RAM and 2 host eventfds.  Eventually these resource requirements will
> > > become a scalability problem, but how do we choose a hard limit and what
> > > happens to guest performance above that limit?
> > 
> > From the UX perspective, I think it's safer to use a rather low upper
> > limit for the automatic configuration.
> > 
> > Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
> > already facing the need of manually tuning (or relying on a software
> > to do that for them) other aspects of it, like vNUMA, IOThreads and
> > CPU pinning, so I don't think we should focus on this group.
> 
> Whether they're runing manually, or relying on software to tune for
> them, we (QEMU maintainers) still need to provide credible guidance
> on what todo with tuning for large CPU counts. Without clear info
> from QEMU, it just descends into hearsay and guesswork, both of which
> approaches leave QEMU looking bad.

I agree. Good documentation, ideally with some benchmarks, and safe
defaults sound like a good approach to me.

> So I think we need to, at the very least, make a clear statement here
> about what tuning approach should be applied vCPU count gets high,
> and probably even apply that  as a default out of the box approach.

In general, I would agree, but in this particular case the
optimization has an impact on something outside QEMU's control (the
host's resources), so we lack the information needed to make a proper guess.

My main concern here is users upgrading QEMU to hit some kind of crash
or performance issue, without having touched their VM config. And
let's not forget that Stefan said in the cover that this amounts to a
1-4% improvement on 4k operations on an SSD, and I guess that's with
iodepth=1. I suspect with a larger block size and/or higher iodepth
the improvement will be barely noticeable, which means it'll only have
a positive impact on users running DB/OLTP or similar workloads on
dedicated, directly attached, low-latency storage.

But don't get me wrong, this is a *good* optimization. It's just I
think we should play safe here.

Sergio.
Michael S. Tsirkin Feb. 3, 2020, 12:53 p.m. UTC | #11
On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > > > >>> @@ -47,10 +48,15 @@ static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> > > > > >>>  {
> > > > > >>>      VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
> > > > > >>>      DeviceState *vdev = DEVICE(&dev->vdev);
> > > > > >>> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
> > > > > >>> +    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
> > > > > >>> +
> > > > > >>> +    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
> > > > > >>> +    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
> > > > > >>> +        conf->num_queues = current_machine->smp.cpus;
> > > > > >> This now maps the request vqs 1:1 to the vcpus. What about the fixed
> > > > > >> vqs? If they don't really matter, amend the comment to explain that?
> > > > > > The fixed vqs don't matter.  They are typically not involved in the data
> > > > > > path, only the control path where performance doesn't matter.
> > > > > 
> > > > > Should we put a limit on the number of vCPUs?  For anything above ~128
> > > > > the guest is probably not going to be disk or network bound.
> > > > 
> > > > Michael Tsirkin pointed out there's a hard limit of VIRTIO_QUEUE_MAX
> > > > (1024).  We need to at least stay under that limit.
> > > > 
> > > > Should the guest have >128 virtqueues?  Each virtqueue requires guest
> > > > RAM and 2 host eventfds.  Eventually these resource requirements will
> > > > become a scalability problem, but how do we choose a hard limit and what
> > > > happens to guest performance above that limit?
> > > 
> > > From the UX perspective, I think it's safer to use a rather low upper
> > > limit for the automatic configuration.
> > > 
> > > Users of large VMs (>=32 vCPUs) aiming for the optimal performance are
> > > already facing the need of manually tuning (or relying on a software
> > > to do that for them) other aspects of it, like vNUMA, IOThreads and
> > > CPU pinning, so I don't think we should focus on this group.
> > 
> > Whether they're runing manually, or relying on software to tune for
> > them, we (QEMU maintainers) still need to provide credible guidance
> > on what todo with tuning for large CPU counts. Without clear info
> > from QEMU, it just descends into hearsay and guesswork, both of which
> > approaches leave QEMU looking bad.
> 
> I agree. Good documentation, ideally with some benchmarks, and safe
> defaults sound like a good approach to me.
> 
> > So I think we need to, at the very least, make a clear statement here
> > about what tuning approach should be applied vCPU count gets high,
> > and probably even apply that  as a default out of the box approach.
> 
> In general, I would agree, but in this particular case the
> optimization has an impact on something outside's QEMU control (host's
> resources), so we lack the information needed to make a proper guess.
> 
> My main concern here is users upgrading QEMU to hit some kind of crash
> or performance issue, without having touched their VM config. And
> let's not forget that Stefan said in the cover that this amounts to a
> 1-4% improvement on 4k operations on an SSD, and I guess that's with
> iodepth=1. I suspect with a larger block size and/or higher iodepth
> the improvement will be barely noticeable, which means it'll only have
> a positive impact on users running DB/OLTP or similar workloads on
> dedicated, directly attached, low-latency storage.
> 
> But don't get me wrong, this is a *good* optimization. It's just I
> think we should play safe here.
> 
> Sergio.

Yeah, I think a bit more benchmarking than just with 4 vcpus, so that
we can at least see the trend, can't hurt.
Stefan Hajnoczi Feb. 11, 2020, 4:20 p.m. UTC | #12
On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > So I think we need to, at the very least, make a clear statement here
> > about what tuning approach should be applied vCPU count gets high,
> > and probably even apply that  as a default out of the box approach.
> 
> In general, I would agree, but in this particular case the
> optimization has an impact on something outside's QEMU control (host's
> resources), so we lack the information needed to make a proper guess.
> 
> My main concern here is users upgrading QEMU to hit some kind of crash
> or performance issue, without having touched their VM config. And

I don't think this is an issue since only newly created guests are
affected.  Existing machine types are unchanged.

> let's not forget that Stefan said in the cover that this amounts to a
> 1-4% improvement on 4k operations on an SSD, and I guess that's with
> iodepth=1. I suspect with a larger block size and/or higher iodepth
> the improvement will be barely noticeable, which means it'll only have
> a positive impact on users running DB/OLTP or similar workloads on
> dedicated, directly attached, low-latency storage.
> 
> But don't get me wrong, this is a *good* optimization. It's just I
> think we should play safe here.

The NVMe card I've been testing has 64 queues.  Let's keep the virtio
limit roughly the same as real hardware.  That way, multi-queue block
layer support in QEMU will be able to fully exploit the hardware
(similar to how we size request queues to be larger than the common 64
/sys/block/FOO/queue/nr_requests).

The point of this change is to improve performance on SMP guests.
Setting the limit to 4-8 is too low, since it leaves guests that most
need this optimization with a sub-optimal configuration.

I will create a 32 vCPU guest with 100 virtio-blk devices and verify
that enabling multi-queue is successful.

Stefan
Michael S. Tsirkin Feb. 11, 2020, 4:31 p.m. UTC | #13
On Tue, Feb 11, 2020 at 04:20:41PM +0000, Stefan Hajnoczi wrote:
> On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> > On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> > > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > So I think we need to, at the very least, make a clear statement here
> > > about what tuning approach should be applied when vCPU count gets high,
> > > and probably even apply that as a default out of the box approach.
> > 
> > In general, I would agree, but in this particular case the
> > optimization has an impact on something outside QEMU's control (host's
> > resources), so we lack the information needed to make a proper guess.
> > 
> > My main concern here is users upgrading QEMU to hit some kind of crash
> > or performance issue, without having touched their VM config. And
> 
> I don't think this is an issue since only newly created guests are
> affected.  Existing machine types are unchanged.
> 
> > let's not forget that Stefan said in the cover that this amounts to a
> > 1-4% improvement on 4k operations on an SSD, and I guess that's with
> > iodepth=1. I suspect with a larger block size and/or higher iodepth
> > the improvement will be barely noticeable, which means it'll only have
> > a positive impact on users running DB/OLTP or similar workloads on
> > dedicated, directly attached, low-latency storage.
> > 
> > But don't get me wrong, this is a *good* optimization. It's just I
> > think we should play safe here.
> 
> The NVMe card I've been testing has 64 queues.  Let's keep the virtio
> limit roughly the same as real hardware.  That way, multi-queue block
> layer support in QEMU will be able to fully exploit the hardware
> (similar to how we size request queues to be larger than the common 64
> /sys/block/FOO/queue/nr_requests).
> 
> The point of this change is to improve performance on SMP guests.
> Setting the limit to 4-8 is too low, since it leaves guests that most
> need this optimization with a sub-optimal configuration.
> 
> I will create a 32 vCPU guest with 100 virtio-blk devices and verify
> that enabling multi-queue is successful.
> 
> Stefan


and that it's helpful for performance?
Stefan Hajnoczi Feb. 12, 2020, 11:18 a.m. UTC | #14
On Tue, Feb 11, 2020 at 11:31:17AM -0500, Michael S. Tsirkin wrote:
> On Tue, Feb 11, 2020 at 04:20:41PM +0000, Stefan Hajnoczi wrote:
> > On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> > > On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> > > > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > > > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > I will create a 32 vCPU guest with 100 virtio-blk devices and verify
> > that enabling multi-queue is successful.
> 
> and that it's helpful for performance?

It may be a little while before the next revision of this patch series.
Testing reveals scalability problems when creating so many virtqueues
:).

I've measured boot time, memory consumption, and random read IOPS.  They
are all significantly worse (32 vCPUs, 24 GB RAM, 101 virtio-blk
devices, 32 queues/device).

Time to see what's going on and whether some general scalability
improvements are possible here before we enable multi-queue by default.

Stefan
Stefan Hajnoczi Feb. 21, 2020, 10:55 a.m. UTC | #15
On Wed, Feb 12, 2020 at 11:18:32AM +0000, Stefan Hajnoczi wrote:
> On Tue, Feb 11, 2020 at 11:31:17AM -0500, Michael S. Tsirkin wrote:
> > On Tue, Feb 11, 2020 at 04:20:41PM +0000, Stefan Hajnoczi wrote:
> > > On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> > > > On Mon, Feb 03, 2020 at 10:57:44AM +0000, Daniel P. Berrangé wrote:
> > > > > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > > > > On Thu, Jan 30, 2020 at 10:52:35AM +0000, Stefan Hajnoczi wrote:
> > > > > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > > > > >> On Fri, 24 Jan 2020 10:01:57 +0000
> > > > > > > > >> Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > > I will create a 32 vCPU guest with 100 virtio-blk devices and verify
> > > that enabling multi-queue is successful.
> > 
> > and that it's helpful for performance?
> 
> It may be a little while before the next revision of this patch series.
> Testing reveals scalability problems when creating so many virtqueues
> :).
> 
> I've measured boot time, memory consumption, and random read IOPS.  They
> are all significantly worse (32 vCPUs, 24 GB RAM, 101 virtio-blk
> devices, 32 queues/device).
> 
> Time to see what's going on and whether some general scalability
> improvements are possible here before we enable multi-queue by default.

Update:

Boot time has improved with "[PATCH] memory: batch allocate ioeventfds[]
in address_space_update_ioeventfds()".

IOPS looks a lot better with the O(1) QEMU event loop patches that I've
posted.  This work is not complete yet, I still need to make AioContext
polling O(1) too (it consumes too much CPU with many idle devices).

After this work is complete I'll measure boot time, memory consumption,
and IOPS again.  Then we can decide whether multiqueue by default is a
good idea.

Stefan
diff mbox series

Patch

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 3e288bfceb..d6e2370c77 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -30,8 +30,11 @@ 
 GlobalProperty hw_compat_4_2[] = {
     { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
     { "virtio-blk-device", "seg-max-adjust", "off"},
+    { "virtio-scsi-device", "num_queues", "1"},
     { "virtio-scsi-device", "seg_max_adjust", "off"},
     { "vhost-blk-device", "seg_max_adjust", "off"},
+    { "vhost-scsi", "num_queues", "1"},
+    { "vhost-user-scsi", "num_queues", "1"},
     { "usb-host", "suppress-remote-wake", "off" },
     { "usb-redir", "suppress-remote-wake", "off" },
 };
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 26f710d3ec..80fe5d999a 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -272,7 +272,8 @@  static Property vhost_scsi_properties[] = {
     DEFINE_PROP_STRING("vhostfd", VirtIOSCSICommon, conf.vhostfd),
     DEFINE_PROP_STRING("wwpn", VirtIOSCSICommon, conf.wwpn),
     DEFINE_PROP_UINT32("boot_tpgt", VirtIOSCSICommon, conf.boot_tpgt, 0),
-    DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1),
+    DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues,
+                       VIRTIO_SCSI_AUTO_NUM_QUEUES),
     DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size,
                        128),
     DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSICommon, conf.seg_max_adjust,
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index eb37733bd0..655d300875 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -163,7 +163,8 @@  static void vhost_user_scsi_unrealize(DeviceState *dev, Error **errp)
 static Property vhost_user_scsi_properties[] = {
     DEFINE_PROP_CHR("chardev", VirtIOSCSICommon, conf.chardev),
     DEFINE_PROP_UINT32("boot_tpgt", VirtIOSCSICommon, conf.boot_tpgt, 0),
-    DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues, 1),
+    DEFINE_PROP_UINT32("num_queues", VirtIOSCSICommon, conf.num_queues,
+                       VIRTIO_SCSI_AUTO_NUM_QUEUES),
     DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSICommon, conf.virtqueue_size,
                        128),
     DEFINE_PROP_UINT32("max_sectors", VirtIOSCSICommon, conf.max_sectors,
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 224a290498..c9342004ef 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -891,6 +891,9 @@  void virtio_scsi_common_realize(DeviceState *dev,
     virtio_init(vdev, "virtio-scsi", VIRTIO_ID_SCSI,
                 sizeof(VirtIOSCSIConfig));
 
+    if (s->conf.num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+        s->conf.num_queues = 1;
+    }
     if (s->conf.num_queues == 0 ||
             s->conf.num_queues > VIRTIO_QUEUE_MAX - VIRTIO_SCSI_VQ_NUM_FIXED) {
         error_setg(errp, "Invalid number of queues (= %" PRIu32 "), "
@@ -964,7 +967,8 @@  static void virtio_scsi_device_unrealize(DeviceState *dev, Error **errp)
 }
 
 static Property virtio_scsi_properties[] = {
-    DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 1),
+    DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues,
+                       VIRTIO_SCSI_AUTO_NUM_QUEUES),
     DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
                                          parent_obj.conf.virtqueue_size, 128),
     DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
diff --git a/hw/virtio/vhost-scsi-pci.c b/hw/virtio/vhost-scsi-pci.c
index e8dfbfc60f..38a8f0c3ef 100644
--- a/hw/virtio/vhost-scsi-pci.c
+++ b/hw/virtio/vhost-scsi-pci.c
@@ -17,6 +17,7 @@ 
 #include "qemu/osdep.h"
 
 #include "standard-headers/linux/virtio_pci.h"
+#include "hw/boards.h"
 #include "hw/qdev-properties.h"
 #include "hw/virtio/vhost-scsi.h"
 #include "qapi/error.h"
@@ -47,10 +48,15 @@  static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
 {
     VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
     DeviceState *vdev = DEVICE(&dev->vdev);
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
+    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
+
+    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
+    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+        conf->num_queues = current_machine->smp.cpus;
+    }
 
     if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
-        vpci_dev->nvectors = vs->conf.num_queues + 3;
+        vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
     }
 
     qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
diff --git a/hw/virtio/vhost-user-scsi-pci.c b/hw/virtio/vhost-user-scsi-pci.c
index ff13af7030..0cad29eb67 100644
--- a/hw/virtio/vhost-user-scsi-pci.c
+++ b/hw/virtio/vhost-user-scsi-pci.c
@@ -18,6 +18,7 @@ 
 #include "qemu/osdep.h"
 
 #include "standard-headers/linux/virtio_pci.h"
+#include "hw/boards.h"
 #include "hw/virtio/vhost-user-scsi.h"
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-scsi.h"
@@ -53,10 +54,15 @@  static void vhost_user_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
 {
     VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(vpci_dev);
     DeviceState *vdev = DEVICE(&dev->vdev);
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
+    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
+
+    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
+    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+        conf->num_queues = current_machine->smp.cpus;
+    }
 
     if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
-        vpci_dev->nvectors = vs->conf.num_queues + 3;
+        vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
     }
 
     qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c
index 3c55dc19a1..b22c8b79e2 100644
--- a/hw/virtio/virtio-scsi-pci.c
+++ b/hw/virtio/virtio-scsi-pci.c
@@ -15,6 +15,7 @@ 
 
 #include "qemu/osdep.h"
 
+#include "hw/boards.h"
 #include "hw/qdev-properties.h"
 #include "hw/virtio/virtio-scsi.h"
 #include "qemu/module.h"
@@ -46,12 +47,17 @@  static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
 {
     VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(vpci_dev);
     DeviceState *vdev = DEVICE(&dev->vdev);
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
     DeviceState *proxy = DEVICE(vpci_dev);
+    VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf;
     char *bus_name;
 
+    /* 1:1 vq to vcpu mapping is ideal because it avoids IPIs */
+    if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+        conf->num_queues = current_machine->smp.cpus;
+    }
+
     if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
-        vpci_dev->nvectors = vs->conf.num_queues + 3;
+        vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
     }
 
     /*
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 9f293bcb80..c0b8e4dd7e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -39,6 +39,8 @@ 
 /* Number of virtqueues that are always present */
 #define VIRTIO_SCSI_VQ_NUM_FIXED    2
 
+#define VIRTIO_SCSI_AUTO_NUM_QUEUES UINT32_MAX
+
 typedef struct virtio_scsi_cmd_req VirtIOSCSICmdReq;
 typedef struct virtio_scsi_cmd_resp VirtIOSCSICmdResp;
 typedef struct virtio_scsi_ctrl_tmf_req VirtIOSCSICtrlTMFReq;