
[v2,1/3] virtio: turn VIRTQUEUE_MAX_SIZE into a variable

Message ID c9dea2e27ae19b2b9a51e8f08687067bf3e47bd5.1633376313.git.qemu_oss@crudebyte.com (mailing list archive)
State New, archived
Series virtio: increase VIRTQUEUE_MAX_SIZE to 32k

Commit Message

Christian Schoenebeck Oct. 4, 2021, 7:38 p.m. UTC
Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
variable per virtio user.

Reasons:

(1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
    maximum queue size possible, which is the maximum queue size
    allowed by the virtio protocol. The appropriate value for
    VIRTQUEUE_MAX_SIZE would therefore be 32768:

    https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006

    Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
    more or less arbitrary value of 1024 in the past, which
    limits the maximum transfer size with virtio to 4M
    (more precisely: 1024 * PAGE_SIZE, with the latter typically
    being 4k).

(2) Additionally the current value of 1024 poses a hidden limit,
    invisible to the guest, which causes a system hang with the
    following QEMU error if the guest tries to exceed it:

    virtio: too many write descriptors in indirect table

(3) Unfortunately not all virtio users in QEMU would currently
    work correctly with the new value of 32768.

So let's turn this hard-coded global value into a runtime
variable as a first step in this commit, configurable for each
virtio user by passing a corresponding value with the
virtio_init() call.

Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
---
 hw/9pfs/virtio-9p-device.c     |  3 ++-
 hw/block/vhost-user-blk.c      |  2 +-
 hw/block/virtio-blk.c          |  3 ++-
 hw/char/virtio-serial-bus.c    |  2 +-
 hw/display/virtio-gpu-base.c   |  2 +-
 hw/input/virtio-input.c        |  2 +-
 hw/net/virtio-net.c            | 15 ++++++++-------
 hw/scsi/virtio-scsi.c          |  2 +-
 hw/virtio/vhost-user-fs.c      |  2 +-
 hw/virtio/vhost-user-i2c.c     |  3 ++-
 hw/virtio/vhost-vsock-common.c |  2 +-
 hw/virtio/virtio-balloon.c     |  4 ++--
 hw/virtio/virtio-crypto.c      |  3 ++-
 hw/virtio/virtio-iommu.c       |  2 +-
 hw/virtio/virtio-mem.c         |  2 +-
 hw/virtio/virtio-pmem.c        |  2 +-
 hw/virtio/virtio-rng.c         |  2 +-
 hw/virtio/virtio.c             | 35 +++++++++++++++++++++++-----------
 include/hw/virtio/virtio.h     |  5 ++++-
 19 files changed, 57 insertions(+), 36 deletions(-)

Comments

Greg Kurz Oct. 5, 2021, 7:36 a.m. UTC | #1
On Mon, 4 Oct 2021 21:38:04 +0200
Christian Schoenebeck <qemu_oss@crudebyte.com> wrote:

> Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> variable per virtio user.
> 
> Reasons:
> 
> (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
>     maximum queue size possible. Which is actually the maximum
>     queue size allowed by the virtio protocol. The appropriate
>     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> 
>     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> 
>     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
>     more or less arbitrary value of 1024 in the past, which
>     limits the maximum transfer size with virtio to 4M
>     (more precise: 1024 * PAGE_SIZE, with the latter typically
>     being 4k).
> 
> (2) Additionally the current value of 1024 poses a hidden limit,
>     invisible to guest, which causes a system hang with the
>     following QEMU error if guest tries to exceed it:
> 
>     virtio: too many write descriptors in indirect table
> 
> (3) Unfortunately not all virtio users in QEMU would currently
>     work correctly with the new value of 32768.
> 
> So let's turn this hard coded global value into a runtime
> variable as a first step in this commit, configurable for each
> virtio user by passing a corresponding value with virtio_init()
> call.
> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---

Reviewed-by: Greg Kurz <groug@kaod.org>

>  hw/9pfs/virtio-9p-device.c     |  3 ++-
>  hw/block/vhost-user-blk.c      |  2 +-
>  hw/block/virtio-blk.c          |  3 ++-
>  hw/char/virtio-serial-bus.c    |  2 +-
>  hw/display/virtio-gpu-base.c   |  2 +-
>  hw/input/virtio-input.c        |  2 +-
>  hw/net/virtio-net.c            | 15 ++++++++-------
>  hw/scsi/virtio-scsi.c          |  2 +-
>  hw/virtio/vhost-user-fs.c      |  2 +-
>  hw/virtio/vhost-user-i2c.c     |  3 ++-
>  hw/virtio/vhost-vsock-common.c |  2 +-
>  hw/virtio/virtio-balloon.c     |  4 ++--
>  hw/virtio/virtio-crypto.c      |  3 ++-
>  hw/virtio/virtio-iommu.c       |  2 +-
>  hw/virtio/virtio-mem.c         |  2 +-
>  hw/virtio/virtio-pmem.c        |  2 +-
>  hw/virtio/virtio-rng.c         |  2 +-
>  hw/virtio/virtio.c             | 35 +++++++++++++++++++++++-----------
>  include/hw/virtio/virtio.h     |  5 ++++-
>  19 files changed, 57 insertions(+), 36 deletions(-)
> 
> diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> index 54ee93b71f..cd5d95dd51 100644
> --- a/hw/9pfs/virtio-9p-device.c
> +++ b/hw/9pfs/virtio-9p-device.c
> @@ -216,7 +216,8 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
> -    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size);
> +    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>      v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
>  }
>  
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index ba13cb87e5..336f56705c 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -491,7 +491,7 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
> -                sizeof(struct virtio_blk_config));
> +                sizeof(struct virtio_blk_config), VIRTQUEUE_MAX_SIZE);
>  
>      s->virtqs = g_new(VirtQueue *, s->num_queues);
>      for (i = 0; i < s->num_queues; i++) {
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index f139cd7cc9..9c0f46815c 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -1213,7 +1213,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>  
>      virtio_blk_set_config_size(s, s->host_features);
>  
> -    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size);
> +    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>  
>      s->blk = conf->conf.blk;
>      s->rq = NULL;
> diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
> index f01ec2137c..9ad9111115 100644
> --- a/hw/char/virtio-serial-bus.c
> +++ b/hw/char/virtio-serial-bus.c
> @@ -1045,7 +1045,7 @@ static void virtio_serial_device_realize(DeviceState *dev, Error **errp)
>          config_size = offsetof(struct virtio_console_config, emerg_wr);
>      }
>      virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
> -                config_size);
> +                config_size, VIRTQUEUE_MAX_SIZE);
>  
>      /* Spawn a new virtio-serial bus on which the ports will ride as devices */
>      qbus_init(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS,
> diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
> index c8da4806e0..20b06a7adf 100644
> --- a/hw/display/virtio-gpu-base.c
> +++ b/hw/display/virtio-gpu-base.c
> @@ -171,7 +171,7 @@ virtio_gpu_base_device_realize(DeviceState *qdev,
>  
>      g->virtio_config.num_scanouts = cpu_to_le32(g->conf.max_outputs);
>      virtio_init(VIRTIO_DEVICE(g), "virtio-gpu", VIRTIO_ID_GPU,
> -                sizeof(struct virtio_gpu_config));
> +                sizeof(struct virtio_gpu_config), VIRTQUEUE_MAX_SIZE);
>  
>      if (virtio_gpu_virgl_enabled(g->conf)) {
>          /* use larger control queue in 3d mode */
> diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c
> index 54bcb46c74..345eb2cce7 100644
> --- a/hw/input/virtio-input.c
> +++ b/hw/input/virtio-input.c
> @@ -258,7 +258,7 @@ static void virtio_input_device_realize(DeviceState *dev, Error **errp)
>      assert(vinput->cfg_size <= sizeof(virtio_input_config));
>  
>      virtio_init(vdev, "virtio-input", VIRTIO_ID_INPUT,
> -                vinput->cfg_size);
> +                vinput->cfg_size, VIRTQUEUE_MAX_SIZE);
>      vinput->evt = virtio_add_queue(vdev, 64, virtio_input_handle_evt);
>      vinput->sts = virtio_add_queue(vdev, 64, virtio_input_handle_sts);
>  }
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index f205331dcf..f74b5f6268 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -1746,9 +1746,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
>      VirtIONet *n = qemu_get_nic_opaque(nc);
>      VirtIONetQueue *q = virtio_net_get_subqueue(nc);
>      VirtIODevice *vdev = VIRTIO_DEVICE(n);
> -    VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
> -    size_t lens[VIRTQUEUE_MAX_SIZE];
> -    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
> +    VirtQueueElement *elems[vdev->queue_max_size];
> +    size_t lens[vdev->queue_max_size];
> +    struct iovec mhdr_sg[vdev->queue_max_size];
>      struct virtio_net_hdr_mrg_rxbuf mhdr;
>      unsigned mhdr_cnt = 0;
>      size_t offset, i, guest_offset, j;
> @@ -1783,7 +1783,7 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
>  
>          total = 0;
>  
> -        if (i == VIRTQUEUE_MAX_SIZE) {
> +        if (i == vdev->queue_max_size) {
>              virtio_error(vdev, "virtio-net unexpected long buffer chain");
>              err = size;
>              goto err;
> @@ -2532,7 +2532,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>      for (;;) {
>          ssize_t ret;
>          unsigned int out_num;
> -        struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
> +        struct iovec sg[vdev->queue_max_size], sg2[vdev->queue_max_size + 1], *out_sg;
>          struct virtio_net_hdr_mrg_rxbuf mhdr;
>  
>          elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
> @@ -2564,7 +2564,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>                  out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
>                                     out_sg, out_num,
>                                     n->guest_hdr_len, -1);
> -                if (out_num == VIRTQUEUE_MAX_SIZE) {
> +                if (out_num == vdev->queue_max_size) {
>                      goto drop;
>                  }
>                  out_num += 1;
> @@ -3364,7 +3364,8 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      virtio_net_set_config_size(n, n->host_features);
> -    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
> +    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>  
>      /*
>       * We set a lower limit on RX queue size to what it always was.
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index 51fd09522a..5e5e657e1d 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -973,7 +973,7 @@ void virtio_scsi_common_realize(DeviceState *dev,
>      int i;
>  
>      virtio_init(vdev, "virtio-scsi", VIRTIO_ID_SCSI,
> -                sizeof(VirtIOSCSIConfig));
> +                sizeof(VirtIOSCSIConfig), VIRTQUEUE_MAX_SIZE);
>  
>      if (s->conf.num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
>          s->conf.num_queues = 1;
> diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
> index c595957983..ae1672d667 100644
> --- a/hw/virtio/vhost-user-fs.c
> +++ b/hw/virtio/vhost-user-fs.c
> @@ -220,7 +220,7 @@ static void vuf_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      virtio_init(vdev, "vhost-user-fs", VIRTIO_ID_FS,
> -                sizeof(struct virtio_fs_config));
> +                sizeof(struct virtio_fs_config), VIRTQUEUE_MAX_SIZE);
>  
>      /* Hiprio queue */
>      fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
> diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
> index d172632bb0..eeb1d8853a 100644
> --- a/hw/virtio/vhost-user-i2c.c
> +++ b/hw/virtio/vhost-user-i2c.c
> @@ -220,7 +220,8 @@ static void vu_i2c_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> -    virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0);
> +    virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0,
> +                VIRTQUEUE_MAX_SIZE);
>  
>      i2c->vhost_dev.nvqs = 1;
>      i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output);
> diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c
> index 4ad6e234ad..a81fa884a8 100644
> --- a/hw/virtio/vhost-vsock-common.c
> +++ b/hw/virtio/vhost-vsock-common.c
> @@ -201,7 +201,7 @@ void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name)
>      VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
>  
>      virtio_init(vdev, name, VIRTIO_ID_VSOCK,
> -                sizeof(struct virtio_vsock_config));
> +                sizeof(struct virtio_vsock_config), VIRTQUEUE_MAX_SIZE);
>  
>      /* Receive and transmit queues belong to vhost */
>      vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index 5a69dce35d..067c73223d 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -886,7 +886,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>      int ret;
>  
>      virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
> -                virtio_balloon_config_size(s));
> +                virtio_balloon_config_size(s), VIRTQUEUE_MAX_SIZE);
>  
>      ret = qemu_add_balloon_handler(virtio_balloon_to_target,
>                                     virtio_balloon_stat, s);
> @@ -909,7 +909,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
>  
>      if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> -        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
> +        s->free_page_vq = virtio_add_queue(vdev, vdev->queue_max_size,
>                                             virtio_balloon_handle_free_page_vq);
>          precopy_add_notifier(&s->free_page_hint_notify);
>  
> diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
> index 54f9bbb789..1e70d4d2a8 100644
> --- a/hw/virtio/virtio-crypto.c
> +++ b/hw/virtio/virtio-crypto.c
> @@ -810,7 +810,8 @@ static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> -    virtio_init(vdev, "virtio-crypto", VIRTIO_ID_CRYPTO, vcrypto->config_size);
> +    virtio_init(vdev, "virtio-crypto", VIRTIO_ID_CRYPTO, vcrypto->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>      vcrypto->curr_queues = 1;
>      vcrypto->vqs = g_malloc0(sizeof(VirtIOCryptoQueue) * vcrypto->max_queues);
>      for (i = 0; i < vcrypto->max_queues; i++) {
> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
> index 1b23e8e18c..ca360e74eb 100644
> --- a/hw/virtio/virtio-iommu.c
> +++ b/hw/virtio/virtio-iommu.c
> @@ -974,7 +974,7 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
>      VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
>  
>      virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU,
> -                sizeof(struct virtio_iommu_config));
> +                sizeof(struct virtio_iommu_config), VIRTQUEUE_MAX_SIZE);
>  
>      memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
>  
> diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
> index df91e454b2..1d9d01b871 100644
> --- a/hw/virtio/virtio-mem.c
> +++ b/hw/virtio/virtio-mem.c
> @@ -738,7 +738,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
>      vmem->bitmap = bitmap_new(vmem->bitmap_size);
>  
>      virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM,
> -                sizeof(struct virtio_mem_config));
> +                sizeof(struct virtio_mem_config), VIRTQUEUE_MAX_SIZE);
>      vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
>  
>      host_memory_backend_set_mapped(vmem->memdev, true);
> diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
> index d1aeb90a31..82b54b00c5 100644
> --- a/hw/virtio/virtio-pmem.c
> +++ b/hw/virtio/virtio-pmem.c
> @@ -124,7 +124,7 @@ static void virtio_pmem_realize(DeviceState *dev, Error **errp)
>  
>      host_memory_backend_set_mapped(pmem->memdev, true);
>      virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
> -                sizeof(struct virtio_pmem_config));
> +                sizeof(struct virtio_pmem_config), VIRTQUEUE_MAX_SIZE);
>      pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
>  }
>  
> diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c
> index cc8e9f775d..0e91d60106 100644
> --- a/hw/virtio/virtio-rng.c
> +++ b/hw/virtio/virtio-rng.c
> @@ -215,7 +215,7 @@ static void virtio_rng_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> -    virtio_init(vdev, "virtio-rng", VIRTIO_ID_RNG, 0);
> +    virtio_init(vdev, "virtio-rng", VIRTIO_ID_RNG, 0, VIRTQUEUE_MAX_SIZE);
>  
>      vrng->vq = virtio_add_queue(vdev, 8, handle_input);
>      vrng->quota_remaining = vrng->conf.max_bytes;
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 240759ff0b..60e094d96a 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -1419,8 +1419,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
>      VirtIODevice *vdev = vq->vdev;
>      VirtQueueElement *elem = NULL;
>      unsigned out_num, in_num, elem_entries;
> -    hwaddr addr[VIRTQUEUE_MAX_SIZE];
> -    struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    hwaddr addr[vdev->queue_max_size];
> +    struct iovec iov[vdev->queue_max_size];
>      VRingDesc desc;
>      int rc;
>  
> @@ -1492,7 +1492,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
>          if (desc.flags & VRING_DESC_F_WRITE) {
>              map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
>                                          iov + out_num,
> -                                        VIRTQUEUE_MAX_SIZE - out_num, true,
> +                                        vdev->queue_max_size - out_num, true,
>                                          desc.addr, desc.len);
>          } else {
>              if (in_num) {
> @@ -1500,7 +1500,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
>                  goto err_undo_map;
>              }
>              map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
> -                                        VIRTQUEUE_MAX_SIZE, false,
> +                                        vdev->queue_max_size, false,
>                                          desc.addr, desc.len);
>          }
>          if (!map_ok) {
> @@ -1556,8 +1556,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
>      VirtIODevice *vdev = vq->vdev;
>      VirtQueueElement *elem = NULL;
>      unsigned out_num, in_num, elem_entries;
> -    hwaddr addr[VIRTQUEUE_MAX_SIZE];
> -    struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    hwaddr addr[vdev->queue_max_size];
> +    struct iovec iov[vdev->queue_max_size];
>      VRingPackedDesc desc;
>      uint16_t id;
>      int rc;
> @@ -1620,7 +1620,7 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
>          if (desc.flags & VRING_DESC_F_WRITE) {
>              map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
>                                          iov + out_num,
> -                                        VIRTQUEUE_MAX_SIZE - out_num, true,
> +                                        vdev->queue_max_size - out_num, true,
>                                          desc.addr, desc.len);
>          } else {
>              if (in_num) {
> @@ -1628,7 +1628,7 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
>                  goto err_undo_map;
>              }
>              map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
> -                                        VIRTQUEUE_MAX_SIZE, false,
> +                                        vdev->queue_max_size, false,
>                                          desc.addr, desc.len);
>          }
>          if (!map_ok) {
> @@ -2249,7 +2249,7 @@ void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
>       * nonexistent states, or to set it to an invalid size.
>       */
>      if (!!num != !!vdev->vq[n].vring.num ||
> -        num > VIRTQUEUE_MAX_SIZE ||
> +        num > vdev->queue_max_size ||
>          num < 0) {
>          return;
>      }
> @@ -2400,7 +2400,7 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>              break;
>      }
>  
> -    if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE)
> +    if (i == VIRTIO_QUEUE_MAX || queue_size > vdev->queue_max_size)
>          abort();
>  
>      vdev->vq[i].vring.num = queue_size;
> @@ -3239,13 +3239,25 @@ void virtio_instance_init_common(Object *proxy_obj, void *data,
>  }
>  
>  void virtio_init(VirtIODevice *vdev, const char *name,
> -                 uint16_t device_id, size_t config_size)
> +                 uint16_t device_id, size_t config_size,
> +                 uint16_t queue_max_size)
>  {
>      BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
>      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>      int i;
>      int nvectors = k->query_nvectors ? k->query_nvectors(qbus->parent) : 0;
>  
> +    if (queue_max_size > VIRTQUEUE_MAX_SIZE ||
> +        !is_power_of_2(queue_max_size))
> +    {
> +        error_report(
> +            "virtio: invalid queue_max_size (= %" PRIu16 "), must be a "
> +            "power of 2 and between 1 ... %d.",
> +            queue_max_size, VIRTQUEUE_MAX_SIZE
> +        );
> +        abort();
> +    }
> +
>      if (nvectors) {
>          vdev->vector_queues =
>              g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
> @@ -3258,6 +3270,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
>      qatomic_set(&vdev->isr, 0);
>      vdev->queue_sel = 0;
>      vdev->config_vector = VIRTIO_NO_VECTOR;
> +    vdev->queue_max_size = queue_max_size;
>      vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
>      vdev->vm_running = runstate_is_running();
>      vdev->broken = false;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 8bab9cfb75..a37d1f7d52 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -89,6 +89,7 @@ struct VirtIODevice
>      size_t config_len;
>      void *config;
>      uint16_t config_vector;
> +    uint16_t queue_max_size;
>      uint32_t generation;
>      int nvectors;
>      VirtQueue *vq;
> @@ -166,7 +167,9 @@ void virtio_instance_init_common(Object *proxy_obj, void *data,
>                                   size_t vdev_size, const char *vdev_name);
>  
>  void virtio_init(VirtIODevice *vdev, const char *name,
> -                         uint16_t device_id, size_t config_size);
> +                 uint16_t device_id, size_t config_size,
> +                 uint16_t queue_max_size);
> +
>  void virtio_cleanup(VirtIODevice *vdev);
>  
>  void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
Stefan Hajnoczi Oct. 5, 2021, 12:45 p.m. UTC | #2
On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck wrote:
> Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> variable per virtio user.

virtio user == virtio device model?

> 
> Reasons:
> 
> (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
>     maximum queue size possible. Which is actually the maximum
>     queue size allowed by the virtio protocol. The appropriate
>     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> 
>     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> 
>     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
>     more or less arbitrary value of 1024 in the past, which
>     limits the maximum transfer size with virtio to 4M
>     (more precise: 1024 * PAGE_SIZE, with the latter typically
>     being 4k).

Being equal to IOV_MAX is a likely reason. Buffers with more iovecs than
that cannot be passed to host system calls (sendmsg(2), pwritev(2),
etc).
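
For illustration, a minimal standalone sketch of that host-side constraint
(an assumption-laden test program, not QEMU code; /dev/null and the exact
errno are host specifics):

    /* Illustrative sketch only (not QEMU code): a single writev() with more
     * than IOV_MAX segments may fail (Linux returns EINVAL), so a device
     * implementation that turns each descriptor into one iovec effectively
     * inherits that ceiling, typically 1024. */
    #include <errno.h>
    #include <fcntl.h>
    #include <limits.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #ifndef IOV_MAX
    #define IOV_MAX 1024
    #endif

    int main(void)
    {
        static char byte;
        static struct iovec iov[IOV_MAX + 1];
        int fd = open("/dev/null", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        for (int i = 0; i < IOV_MAX + 1; i++) {
            iov[i].iov_base = &byte;
            iov[i].iov_len = 1;
        }
        if (writev(fd, iov, IOV_MAX + 1) < 0) {
            fprintf(stderr, "writev with %d iovecs: %s\n",
                    IOV_MAX + 1, strerror(errno));
        }
        close(fd);
        return 0;
    }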

> (2) Additionally the current value of 1024 poses a hidden limit,
>     invisible to guest, which causes a system hang with the
>     following QEMU error if guest tries to exceed it:
> 
>     virtio: too many write descriptors in indirect table

I don't understand this point. 2.6.5 The Virtqueue Descriptor Table says:

  The number of descriptors in the table is defined by the queue size for this virtqueue: this is the maximum possible descriptor chain length.

and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:

  A driver MUST NOT create a descriptor chain longer than the Queue Size of the device.

Do you mean a broken/malicious guest driver that is violating the spec?
That's not a hidden limit, it's defined by the spec.

> (3) Unfortunately not all virtio users in QEMU would currently
>     work correctly with the new value of 32768.
> 
> So let's turn this hard coded global value into a runtime
> variable as a first step in this commit, configurable for each
> virtio user by passing a corresponding value with virtio_init()
> call.

virtio_add_queue() already has an int queue_size argument, why isn't
that enough to deal with the maximum queue size? There's probably a good
reason for it, but please include it in the commit description.
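
For context, the two values being discussed, as they appear in the virtio-9p
hunk of this patch (excerpt from above, comments added here for orientation):

    v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size,
                VIRTQUEUE_MAX_SIZE);             /* proposed per-device maximum */
    v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
                                                 /* existing per-queue size     */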

> 
> Signed-off-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
> ---
>  hw/9pfs/virtio-9p-device.c     |  3 ++-
>  hw/block/vhost-user-blk.c      |  2 +-
>  hw/block/virtio-blk.c          |  3 ++-
>  hw/char/virtio-serial-bus.c    |  2 +-
>  hw/display/virtio-gpu-base.c   |  2 +-
>  hw/input/virtio-input.c        |  2 +-
>  hw/net/virtio-net.c            | 15 ++++++++-------
>  hw/scsi/virtio-scsi.c          |  2 +-
>  hw/virtio/vhost-user-fs.c      |  2 +-
>  hw/virtio/vhost-user-i2c.c     |  3 ++-
>  hw/virtio/vhost-vsock-common.c |  2 +-
>  hw/virtio/virtio-balloon.c     |  4 ++--
>  hw/virtio/virtio-crypto.c      |  3 ++-
>  hw/virtio/virtio-iommu.c       |  2 +-
>  hw/virtio/virtio-mem.c         |  2 +-
>  hw/virtio/virtio-pmem.c        |  2 +-
>  hw/virtio/virtio-rng.c         |  2 +-
>  hw/virtio/virtio.c             | 35 +++++++++++++++++++++++-----------
>  include/hw/virtio/virtio.h     |  5 ++++-
>  19 files changed, 57 insertions(+), 36 deletions(-)
> 
> diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> index 54ee93b71f..cd5d95dd51 100644
> --- a/hw/9pfs/virtio-9p-device.c
> +++ b/hw/9pfs/virtio-9p-device.c
> @@ -216,7 +216,8 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
> -    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size);
> +    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>      v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
>  }
>  
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index ba13cb87e5..336f56705c 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -491,7 +491,7 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>      }
>  
>      virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
> -                sizeof(struct virtio_blk_config));
> +                sizeof(struct virtio_blk_config), VIRTQUEUE_MAX_SIZE);
>  
>      s->virtqs = g_new(VirtQueue *, s->num_queues);
>      for (i = 0; i < s->num_queues; i++) {
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index f139cd7cc9..9c0f46815c 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -1213,7 +1213,8 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>  
>      virtio_blk_set_config_size(s, s->host_features);
>  
> -    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size);
> +    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size,
> +                VIRTQUEUE_MAX_SIZE);
>  
>      s->blk = conf->conf.blk;
>      s->rq = NULL;
> diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
> index f01ec2137c..9ad9111115 100644
> --- a/hw/char/virtio-serial-bus.c
> +++ b/hw/char/virtio-serial-bus.c
> @@ -1045,7 +1045,7 @@ static void virtio_serial_device_realize(DeviceState *dev, Error **errp)
>          config_size = offsetof(struct virtio_console_config, emerg_wr);
>      }
>      virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
> -                config_size);
> +                config_size, VIRTQUEUE_MAX_SIZE);
>  
>      /* Spawn a new virtio-serial bus on which the ports will ride as devices */
>      qbus_init(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS,
> diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
> index c8da4806e0..20b06a7adf 100644
> --- a/hw/display/virtio-gpu-base.c
> +++ b/hw/display/virtio-gpu-base.c
> @@ -171,7 +171,7 @@ virtio_gpu_base_device_realize(DeviceState *qdev,
>  
>      g->virtio_config.num_scanouts = cpu_to_le32(g->conf.max_outputs);
>      virtio_init(VIRTIO_DEVICE(g), "virtio-gpu", VIRTIO_ID_GPU,
> -                sizeof(struct virtio_gpu_config));
> +                sizeof(struct virtio_gpu_config), VIRTQUEUE_MAX_SIZE);
>  
>      if (virtio_gpu_virgl_enabled(g->conf)) {
>          /* use larger control queue in 3d mode */
> diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c
> index 54bcb46c74..345eb2cce7 100644
> --- a/hw/input/virtio-input.c
> +++ b/hw/input/virtio-input.c
> @@ -258,7 +258,7 @@ static void virtio_input_device_realize(DeviceState *dev, Error **errp)
>      assert(vinput->cfg_size <= sizeof(virtio_input_config));
>  
>      virtio_init(vdev, "virtio-input", VIRTIO_ID_INPUT,
> -                vinput->cfg_size);
> +                vinput->cfg_size, VIRTQUEUE_MAX_SIZE);
>      vinput->evt = virtio_add_queue(vdev, 64, virtio_input_handle_evt);
>      vinput->sts = virtio_add_queue(vdev, 64, virtio_input_handle_sts);
>  }
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index f205331dcf..f74b5f6268 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -1746,9 +1746,9 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
>      VirtIONet *n = qemu_get_nic_opaque(nc);
>      VirtIONetQueue *q = virtio_net_get_subqueue(nc);
>      VirtIODevice *vdev = VIRTIO_DEVICE(n);
> -    VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
> -    size_t lens[VIRTQUEUE_MAX_SIZE];
> -    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
> +    VirtQueueElement *elems[vdev->queue_max_size];
> +    size_t lens[vdev->queue_max_size];
> +    struct iovec mhdr_sg[vdev->queue_max_size];

Can you make this value per-vq instead of per-vdev since virtqueues can
have different queue sizes?

The same applies to the rest of this patch. Anything using
vdev->queue_max_size should probably use vq->vring.num instead.
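
A rough sketch of that direction, applied to one of the on-stack arrays this
patch touches (illustrative only, not a complete hunk):

    /* e.g. in virtqueue_split_pop(): size the arrays by the queue's own
     * size instead of a per-device maximum (sketch, not a complete change) */
    hwaddr addr[vq->vring.num];
    struct iovec iov[vq->vring.num];
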
Christian Schoenebeck Oct. 5, 2021, 1:15 p.m. UTC | #3
On Tuesday, 5 October 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck wrote:
> > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > variable per virtio user.
> 
> virtio user == virtio device model?

Yes

> > Reasons:
> > 
> > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > 
> >     maximum queue size possible. Which is actually the maximum
> >     queue size allowed by the virtio protocol. The appropriate
> >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> >     
> >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> >     
> >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> >     more or less arbitrary value of 1024 in the past, which
> >     limits the maximum transfer size with virtio to 4M
> >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> >     being 4k).
> 
> Being equal to IOV_MAX is a likely reason. Buffers with more iovecs than
> that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> etc).

Yes, that's use-case dependent. Hence the solution to opt in where it is
desired and feasible.

> > (2) Additionally the current value of 1024 poses a hidden limit,
> > 
> >     invisible to guest, which causes a system hang with the
> >     following QEMU error if guest tries to exceed it:
> >     
> >     virtio: too many write descriptors in indirect table
> 
> I don't understand this point. 2.6.5 The Virtqueue Descriptor Table says:
> 
>   The number of descriptors in the table is defined by the queue size for
> this virtqueue: this is the maximum possible descriptor chain length.
> 
> and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> 
>   A driver MUST NOT create a descriptor chain longer than the Queue Size of
> the device.
> 
> Do you mean a broken/malicious guest driver that is violating the spec?
> That's not a hidden limit, it's defined by the spec.

https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html

You can already go beyond that queue size at runtime with the indirection 
table. The only actual limit is the currently hard coded value of 1k pages. 
Hence the suggestion to turn that into a variable.
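
For orientation, the check that emits that error message sits in the
descriptor-mapping path; roughly (a paraphrased sketch of virtqueue_map_desc()
in hw/virtio/virtio.c, not an exact excerpt):

    /* Paraphrased sketch, not an exact excerpt: the chain walk stops once
     * the caller-provided array limit is reached, which today derives from
     * the hard-coded 1024, and with this patch from vdev->queue_max_size. */
    while (sz) {
        if (num_sg == max_num_sg) {
            virtio_error(vdev,
                         "virtio: too many write descriptors in indirect table");
            goto out;
        }
        /* ... map the next segment of the descriptor into addr[]/iov[] ... */
    }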

> > (3) Unfortunately not all virtio users in QEMU would currently
> > 
> >     work correctly with the new value of 32768.
> > 
> > So let's turn this hard coded global value into a runtime
> > variable as a first step in this commit, configurable for each
> > virtio user by passing a corresponding value with virtio_init()
> > call.
> 
> virtio_add_queue() already has an int queue_size argument, why isn't
> that enough to deal with the maximum queue size? There's probably a good
> reason for it, but please include it in the commit description.
[...]
> Can you make this value per-vq instead of per-vdev since virtqueues can
> have different queue sizes?
> 
> The same applies to the rest of this patch. Anything using
> vdev->queue_max_size should probably use vq->vring.num instead.

I would like to avoid that and keep it per device. The maximum size stored
there is the maximum size supported by the virtio user (or virtio device
model, however you want to call it). So that's really a per-device limit, not
a per-queue one, as no queue of the device would ever exceed that limit.

Plus a lot more code would need to be refactored, which I think is 
unnecessary.

Best regards,
Christian Schoenebeck
Stefan Hajnoczi Oct. 5, 2021, 3:10 p.m. UTC | #4
On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck wrote:
> On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck wrote:
> > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > variable per virtio user.
> > 
> > virtio user == virtio device model?
> 
> Yes
> 
> > > Reasons:
> > > 
> > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > 
> > >     maximum queue size possible. Which is actually the maximum
> > >     queue size allowed by the virtio protocol. The appropriate
> > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > >     
> > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> > >     
> > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > >     more or less arbitrary value of 1024 in the past, which
> > >     limits the maximum transfer size with virtio to 4M
> > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > >     being 4k).
> > 
> > Being equal to IOV_MAX is a likely reason. Buffers with more iovecs than
> > that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> > etc).
> 
> Yes, that's use case dependent. Hence the solution to opt-in if it is desired 
> and feasible.
> 
> > > (2) Additionally the current value of 1024 poses a hidden limit,
> > > 
> > >     invisible to guest, which causes a system hang with the
> > >     following QEMU error if guest tries to exceed it:
> > >     
> > >     virtio: too many write descriptors in indirect table
> > 
> > I don't understand this point. 2.6.5 The Virtqueue Descriptor Table says:
> > 
> >   The number of descriptors in the table is defined by the queue size for
> > this virtqueue: this is the maximum possible descriptor chain length.
> > 
> > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > 
> >   A driver MUST NOT create a descriptor chain longer than the Queue Size of
> > the device.
> > 
> > Do you mean a broken/malicious guest driver that is violating the spec?
> > That's not a hidden limit, it's defined by the spec.
> 
> https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
> https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html
> 
> You can already go beyond that queue size at runtime with the indirection 
> table. The only actual limit is the currently hard coded value of 1k pages. 
> Hence the suggestion to turn that into a variable.

Exceeding Queue Size is a VIRTIO spec violation. Drivers that operate
outside the spec do so at their own risk. They may not be compatible
with all device implementations.

The limit is not hidden, it's Queue Size as defined by the spec :).

If you have a driver that is exceeding the limit, then please fix the
driver.

> > > (3) Unfortunately not all virtio users in QEMU would currently
> > > 
> > >     work correctly with the new value of 32768.
> > > 
> > > So let's turn this hard coded global value into a runtime
> > > variable as a first step in this commit, configurable for each
> > > virtio user by passing a corresponding value with virtio_init()
> > > call.
> > 
> > virtio_add_queue() already has an int queue_size argument, why isn't
> > that enough to deal with the maximum queue size? There's probably a good
> > reason for it, but please include it in the commit description.
> [...]
> > Can you make this value per-vq instead of per-vdev since virtqueues can
> > have different queue sizes?
> > 
> > The same applies to the rest of this patch. Anything using
> > vdev->queue_max_size should probably use vq->vring.num instead.
> 
> I would like to avoid that and keep it per device. The maximum size stored 
> there is the maximum size supported by virtio user (or vortio device model, 
> however you want to call it). So that's really a limit per device, not per 
> queue, as no queue of the device would ever exceed that limit.
>
> Plus a lot more code would need to be refactored, which I think is 
> unnecessary.

I'm against a per-device limit because it's a concept that cannot
accurately describe reality. Some devices have multiple classes of
virtqueues and they are sized differently, so a per-device limit is
insufficient. virtio-net has separate rx_queue_size and tx_queue_size
parameters (plus a control vq hardcoded to 64 descriptors).

The specification already gives us Queue Size (vring.num in QEMU). The
variable exists in QEMU and just needs to be used.

If per-vq limits require a lot of work, please describe why. I think
replacing the variable from this patch with virtio_queue_get_num()
should be fairly straightforward, but maybe I'm missing something? (If
you prefer VirtQueue *vq instead of the index-based
virtio_queue_get_num() API, you can introduce a virtqueue_get_num()
API.)
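
A minimal sketch of such an accessor and its use (hypothetical name as
suggested above; assumes it sits next to the other helpers in
hw/virtio/virtio.c where VirtQueue's fields are visible):

    /* Hypothetical helper along the lines suggested above; QEMU already has
     * the index-based virtio_queue_get_num(vdev, n), this variant would take
     * the VirtQueue pointer directly. Sketch only, not actual QEMU code. */
    unsigned int virtqueue_get_num(VirtQueue *vq)
    {
        return vq->vring.num;
    }

    /* call sites could then size their buffers per queue, e.g.: */
    struct iovec iov[virtqueue_get_num(vq)];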

Stefan
Christian Schoenebeck Oct. 5, 2021, 4:32 p.m. UTC | #5
On Tuesday, 5 October 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck wrote:
> > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck wrote:
> > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > variable per virtio user.
> > > 
> > > virtio user == virtio device model?
> > 
> > Yes
> > 
> > > > Reasons:
> > > > 
> > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > 
> > > >     maximum queue size possible. Which is actually the maximum
> > > >     queue size allowed by the virtio protocol. The appropriate
> > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > >     
> > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> > > >     
> > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > >     more or less arbitrary value of 1024 in the past, which
> > > >     limits the maximum transfer size with virtio to 4M
> > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > >     being 4k).
> > > 
> > > Being equal to IOV_MAX is a likely reason. Buffers with more iovecs than
> > > that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> > > etc).
> > 
> > Yes, that's use case dependent. Hence the solution to opt-in if it is
> > desired and feasible.
> > 
> > > > (2) Additionally the current value of 1024 poses a hidden limit,
> > > > 
> > > >     invisible to guest, which causes a system hang with the
> > > >     following QEMU error if guest tries to exceed it:
> > > >     
> > > >     virtio: too many write descriptors in indirect table
> > > 
> > > I don't understand this point. 2.6.5 The Virtqueue Descriptor Table says:
> > > 
> > >   The number of descriptors in the table is defined by the queue size for
> > >   this virtqueue: this is the maximum possible descriptor chain length.
> > > 
> > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > 
> > >   A driver MUST NOT create a descriptor chain longer than the Queue Size of
> > >   the device.
> > > 
> > > Do you mean a broken/malicious guest driver that is violating the spec?
> > > That's not a hidden limit, it's defined by the spec.
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
> > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html
> > 
> > You can already go beyond that queue size at runtime with the indirection
> > table. The only actual limit is the currently hard coded value of 1k
> > pages.
> > Hence the suggestion to turn that into a variable.
> 
> Exceeding Queue Size is a VIRTIO spec violation. Drivers that operate
> outsided the spec do so at their own risk. They may not be compatible
> with all device implementations.

Yes, I am aware of that. And still, this practice already exists, and
apparently is not limited to 9pfs.

> The limit is not hidden, it's Queue Size as defined by the spec :).
> 
> If you have a driver that is exceeding the limit, then please fix the
> driver.

I absolutely understand your position, but I hope you also understand that
this violation of the spec is a theoretical issue; it is not a real-life
problem right now, and due to lack of manpower I unfortunately have to
prioritize real-life problems over theoretical ones ATM. Keep in mind that
right now I am the only person actively working on 9pfs, I do this voluntarily
whenever I find a free time slice, and I am not paid for it either.

I don't see any reasonable way with reasonable effort to do what you are
asking for here in 9pfs, and Greg may correct me here if I am saying anything
wrong. If you see any specific real-life issue here, then please tell me
which one, otherwise I have to postpone that "spec violation" issue.

There is still a long list of real problems that I need to hunt down in 9pfs, 
afterwards I can continue with theoretical ones if you want, but right now I 
simply can't, sorry.

> > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > 
> > > >     work correctly with the new value of 32768.
> > > > 
> > > > So let's turn this hard coded global value into a runtime
> > > > variable as a first step in this commit, configurable for each
> > > > virtio user by passing a corresponding value with virtio_init()
> > > > call.
> > > 
> > > virtio_add_queue() already has an int queue_size argument, why isn't
> > > that enough to deal with the maximum queue size? There's probably a good
> > > reason for it, but please include it in the commit description.
> > 
> > [...]
> > 
> > > Can you make this value per-vq instead of per-vdev since virtqueues can
> > > have different queue sizes?
> > > 
> > > The same applies to the rest of this patch. Anything using
> > > vdev->queue_max_size should probably use vq->vring.num instead.
> > 
> > I would like to avoid that and keep it per device. The maximum size stored
> > there is the maximum size supported by virtio user (or vortio device
> > model,
> > however you want to call it). So that's really a limit per device, not per
> > queue, as no queue of the device would ever exceed that limit.
> > 
> > Plus a lot more code would need to be refactored, which I think is
> > unnecessary.
> 
> I'm against a per-device limit because it's a concept that cannot
> accurately describe reality. Some devices have multiple classes of

It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is not per 
queue either ATM, and nobody ever cared.

All this series does is allow overriding what is currently a project-wide
compile-time constant with a per-driver-model compile-time constant. Which
makes sense, because that's what it is: some drivers could cope with any
transfer size, while others are constrained to a certain maximum
application-specific transfer size (e.g. IOV_MAX).

> virtqueues and they are sized differently, so a per-device limit is
> insufficient. virtio-net has separate rx_queue_size and tx_queue_size
> parameters (plus a control vq hardcoded to 64 descriptors).

I simply find this overkill. This value semantically means "my driver model
supports, at any time and under any circumstances, at the very most
x * PAGE_SIZE = max_transfer_size". Do you see any driver that might want more
fine-grained control over this?

As far as I can see, no other driver maintainer even seems to care to
transition to 32k. So I simply doubt that anybody would even want more
fine-grained control over this in practice, but anyway ...

> The specification already gives us Queue Size (vring.num in QEMU). The
> variable exists in QEMU and just needs to be used.
> 
> If per-vq limits require a lot of work, please describe why. I think
> replacing the variable from this patch with virtio_queue_get_num()
> should be fairly straightforward, but maybe I'm missing something? (If
> you prefer VirtQueue *vq instead of the index-based
> virtio_queue_get_num() API, you can introduce a virtqueue_get_num()
> API.)
> 
> Stefan

... I leave that up to Michael or whoever might be in charge to decide. I 
still find this overkill, but I will adapt this to whatever the decision 
eventually will be in v3.

But then please tell me the precise representation that you find appropriate, 
i.e. whether you want a new function for that, or rather an additional 
argument to virtio_add_queue(). Your call.

Best regards,
Christian Schoenebeck
Stefan Hajnoczi Oct. 6, 2021, 11:06 a.m. UTC | #6
On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck wrote:
> On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck wrote:
> > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck wrote:
> > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > > variable per virtio user.
> > > > 
> > > > virtio user == virtio device model?
> > > 
> > > Yes
> > > 
> > > > > Reasons:
> > > > > 
> > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > > 
> > > > >     maximum queue size possible. Which is actually the maximum
> > > > >     queue size allowed by the virtio protocol. The appropriate
> > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > >     
> > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-240006
> > > > >     
> > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > > >     more or less arbitrary value of 1024 in the past, which
> > > > >     limits the maximum transfer size with virtio to 4M
> > > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > > >     being 4k).
> > > > 
> > > > Being equal to IOV_MAX is a likely reason. Buffers with more iovecs than
> > > > that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> > > > etc).
> > > 
> > > Yes, that's use case dependent. Hence the solution to opt-in if it is
> > > desired and feasible.
> > > 
> > > > > (2) Additionally the current value of 1024 poses a hidden limit,
> > > > > 
> > > > >     invisible to guest, which causes a system hang with the
> > > > >     following QEMU error if guest tries to exceed it:
> > > > >     
> > > > >     virtio: too many write descriptors in indirect table
> > > > 
> > > > I don't understand this point. 2.6.5 The Virtqueue Descriptor Table says:
> > > > 
> > > >   The number of descriptors in the table is defined by the queue size for
> > > >   this virtqueue: this is the maximum possible descriptor chain length.
> > > > 
> > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > > 
> > > >   A driver MUST NOT create a descriptor chain longer than the Queue Size of
> > > >   the device.
> > > > 
> > > > Do you mean a broken/malicious guest driver that is violating the spec?
> > > > That's not a hidden limit, it's defined by the spec.
> > > 
> > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
> > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html
> > > 
> > > You can already go beyond that queue size at runtime with the indirection
> > > table. The only actual limit is the currently hard coded value of 1k
> > > pages.
> > > Hence the suggestion to turn that into a variable.
> > 
> > Exceeding Queue Size is a VIRTIO spec violation. Drivers that operate
> > outsided the spec do so at their own risk. They may not be compatible
> > with all device implementations.
> 
> Yes, I am ware about that. And still, this practice is already done, which 
> apparently is not limited to 9pfs.
> 
> > The limit is not hidden, it's Queue Size as defined by the spec :).
> > 
> > If you have a driver that is exceeding the limit, then please fix the
> > driver.
> 
> I absolutely understand your position, but I hope you also understand that 
> this violation of the specs is a theoretical issue, it is not a real-life 
> problem right now, and due to lack of man power unfortunately I have to 
> prioritize real-life problems over theoretical ones ATM. Keep in mind that 
> right now I am the only person working on 9pfs actively, I do this voluntarily 
> whenever I find a free time slice, and I am not paid for it either.
> 
> I don't see any reasonable way with reasonable effort to do what you are 
> asking for here in 9pfs, and Greg may correct me here if I am saying anything 
> wrong. If you are seeing any specific real-life issue here, then please tell 
> me which one, otherwise I have to postpone that "specs violation" issue.
> 
> There is still a long list of real problems that I need to hunt down in 9pfs, 
> afterwards I can continue with theoretical ones if you want, but right now I 
> simply can't, sorry.

I understand. If you don't have time to fix the Linux virtio-9p driver
then that's fine.

I still wanted us to agree on the spec position because the commit
description says it's a "hidden limit", which is incorrect. It might
seem pedantic, but my concern is that misconceptions can spread if we
let them. That could cause people to write incorrect code later on.
Please update the commit description either by dropping 2) or by
replacing it with something else. For example:

  2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
     Size value and can submit descriptor chains that exceed it. That is
     a spec violation but is accepted by QEMU's device implementation.

     When the guest creates a descriptor chain larger than 1024 the
     following QEMU error is printed and the guest hangs:

     virtio: too many write descriptors in indirect table

> > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > 
> > > > >     work correctly with the new value of 32768.
> > > > > 
> > > > > So let's turn this hard coded global value into a runtime
> > > > > variable as a first step in this commit, configurable for each
> > > > > virtio user by passing a corresponding value with virtio_init()
> > > > > call.
> > > > 
> > > > virtio_add_queue() already has an int queue_size argument, why isn't
> > > > that enough to deal with the maximum queue size? There's probably a good
> > > > reason for it, but please include it in the commit description.
> > > 
> > > [...]
> > > 
> > > > Can you make this value per-vq instead of per-vdev since virtqueues can
> > > > have different queue sizes?
> > > > 
> > > > The same applies to the rest of this patch. Anything using
> > > > vdev->queue_max_size should probably use vq->vring.num instead.
> > > 
> > > I would like to avoid that and keep it per device. The maximum size stored
> > > there is the maximum size supported by virtio user (or vortio device
> > > model,
> > > however you want to call it). So that's really a limit per device, not per
> > > queue, as no queue of the device would ever exceed that limit.
> > > 
> > > Plus a lot more code would need to be refactored, which I think is
> > > unnecessary.
> > 
> > I'm against a per-device limit because it's a concept that cannot
> > accurately describe reality. Some devices have multiple classes of
> 
> It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is not per 
> queue either ATM, and nobody ever cared.
> 
> All this series does, is allowing to override that currently project-wide 
> compile-time constant to a per-driver-model compile-time constant. Which makes 
> sense, because that's what it is: some drivers could cope with any transfer 
> size, and some drivers are constrained to a certain maximum application 
> specific transfer size (e.g. IOV_MAX).
> 
> > virtqueues and they are sized differently, so a per-device limit is
> > insufficient. virtio-net has separate rx_queue_size and tx_queue_size
> > parameters (plus a control vq hardcoded to 64 descriptors).
> 
> I simply find this overkill. This value semantically means "my driver model 
> supports at any time and at any coincidence at the very most x * PAGE_SIZE = 
> max_transfer_size". Do you see any driver that might want a more fine graded 
> control over this?

One reason why per-vq limits could make sense is that the maximum
possible number of struct elements is allocated upfront in some code
paths, and those code paths may need to differentiate between per-vq
limits for performance or memory utilization reasons. Today some of
them allocate 1024 elements on the stack, but maybe that's not
acceptable when the per-device limit is 32k. This can matter when a
device has vqs with very different sizes.
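
As a rough, self-contained illustration of the numbers involved (assumes a
64-bit host; the hwaddr typedef here only mirrors QEMU's for the sake of the
example):

    /* Standalone sketch of the stack-usage concern: the on-stack arrays in
     * e.g. virtqueue_split_pop() scale with whatever limit sizes them. */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/uio.h>

    typedef uint64_t hwaddr;   /* QEMU defines hwaddr as a 64-bit address */

    int main(void)
    {
        const unsigned int limits[] = { 1024, 32768 };

        for (unsigned int i = 0; i < 2; i++) {
            unsigned int n = limits[i];
            printf("limit %5u: addr[] = %7zu bytes, iov[] = %7zu bytes\n",
                   n, n * sizeof(hwaddr), n * sizeof(struct iovec));
        }
        return 0;
    }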

> As far as I can see, no other driver maintainer even seems to care to 
> transition to 32k. So I simply doubt that anybody would even want a more 
> fained graded control over this in practice, but anyway ...
> 
> > The specification already gives us Queue Size (vring.num in QEMU). The
> > variable exists in QEMU and just needs to be used.
> > 
> > If per-vq limits require a lot of work, please describe why. I think
> > replacing the variable from this patch with virtio_queue_get_num()
> > should be fairly straightforward, but maybe I'm missing something? (If
> > you prefer VirtQueue *vq instead of the index-based
> > virtio_queue_get_num() API, you can introduce a virtqueue_get_num()
> > API.)
> > 
> > Stefan
> 
> ... I leave that up to Michael or whoever might be in charge to decide. I 
> still find this overkill, but I will adapt this to whatever the decision 
> eventually will be in v3.
> 
> But then please tell me the precise representation that you find appropriate, 
> i.e. whether you want a new function for that, or rather an additional 
> argument to virtio_add_queue(). Your call.

virtio_add_queue() already takes an int queue_size argument. I think the
necessary information is already there.

This patch just needs to be tweaked to use the virtio_queue_get_num()
(or a new virtqueue_get_num() API if that's easier because only a
VirtQueue *vq pointer is available) instead of introducing a new
per-device limit.

The patch will probably become smaller.
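
For reference, a minimal sketch of such a helper, assuming (as elsewhere
in this thread) that the per-queue size lives in vq->vring.num; it would
presumably sit in hw/virtio/virtio.c, where the VirtQueue struct is
visible:

  /* Sketch of the suggested virtqueue_get_num() accessor: return the
   * per-queue size when only a VirtQueue pointer is at hand.
   */
  int virtqueue_get_num(VirtQueue *vq)
  {
      return vq->vring.num;
  }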

Stefan
Christian Schoenebeck Oct. 6, 2021, 12:50 p.m. UTC | #7
On Mittwoch, 6. Oktober 2021 13:06:55 CEST Stefan Hajnoczi wrote:
> On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck wrote:
> > On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck wrote:
> > > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck 
wrote:
> > > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > > > variable per virtio user.
> > > > > 
> > > > > virtio user == virtio device model?
> > > > 
> > > > Yes
> > > > 
> > > > > > Reasons:
> > > > > > 
> > > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > > > 
> > > > > >     maximum queue size possible. Which is actually the maximum
> > > > > >     queue size allowed by the virtio protocol. The appropriate
> > > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > > >     
> > > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.
> > > > > >     1-cs
> > > > > >     01.h
> > > > > >     tml#x1-240006
> > > > > >     
> > > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > > > >     more or less arbitrary value of 1024 in the past, which
> > > > > >     limits the maximum transfer size with virtio to 4M
> > > > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > > > >     being 4k).
> > > > > 
> > > > > Being equal to IOV_MAX is a likely reason. Buffers with more iovecs
> > > > > than
> > > > > that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> > > > > etc).
> > > > 
> > > > Yes, that's use case dependent. Hence the solution to opt-in if it is
> > > > desired and feasible.
> > > > 
> > > > > > (2) Additionally the current value of 1024 poses a hidden limit,
> > > > > > 
> > > > > >     invisible to guest, which causes a system hang with the
> > > > > >     following QEMU error if guest tries to exceed it:
> > > > > >     
> > > > > >     virtio: too many write descriptors in indirect table
> > > > > 
> > > > > I don't understand this point. 2.6.5 The Virtqueue Descriptor Table
> > 
> > says:
> > > > >   The number of descriptors in the table is defined by the queue
> > > > >   size
> > > > >   for
> > > > > 
> > > > > this virtqueue: this is the maximum possible descriptor chain
> > > > > length.
> > > > > 
> > > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > > >   A driver MUST NOT create a descriptor chain longer than the Queue
> > > > >   Size
> > > > >   of
> > > > > 
> > > > > the device.
> > > > > 
> > > > > Do you mean a broken/malicious guest driver that is violating the
> > > > > spec?
> > > > > That's not a hidden limit, it's defined by the spec.
> > > > 
> > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
> > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html
> > > > 
> > > > You can already go beyond that queue size at runtime with the
> > > > indirection
> > > > table. The only actual limit is the currently hard coded value of 1k
> > > > pages.
> > > > Hence the suggestion to turn that into a variable.
> > > 
> > > Exceeding Queue Size is a VIRTIO spec violation. Drivers that operate
> > > outsided the spec do so at their own risk. They may not be compatible
> > > with all device implementations.
> > 
> > Yes, I am ware about that. And still, this practice is already done, which
> > apparently is not limited to 9pfs.
> > 
> > > The limit is not hidden, it's Queue Size as defined by the spec :).
> > > 
> > > If you have a driver that is exceeding the limit, then please fix the
> > > driver.
> > 
> > I absolutely understand your position, but I hope you also understand that
> > this violation of the specs is a theoretical issue, it is not a real-life
> > problem right now, and due to lack of man power unfortunately I have to
> > prioritize real-life problems over theoretical ones ATM. Keep in mind that
> > right now I am the only person working on 9pfs actively, I do this
> > voluntarily whenever I find a free time slice, and I am not paid for it
> > either.
> > 
> > I don't see any reasonable way with reasonable effort to do what you are
> > asking for here in 9pfs, and Greg may correct me here if I am saying
> > anything wrong. If you are seeing any specific real-life issue here, then
> > please tell me which one, otherwise I have to postpone that "specs
> > violation" issue.
> > 
> > There is still a long list of real problems that I need to hunt down in
> > 9pfs, afterwards I can continue with theoretical ones if you want, but
> > right now I simply can't, sorry.
> 
> I understand. If you don't have time to fix the Linux virtio-9p driver
> then that's fine.

I will look at this again, but it might be tricky. If in doubt I'll postpone it.

> I still wanted us to agree on the spec position because the commit
> description says it's a "hidden limit", which is incorrect. It might
> seem pedantic, but my concern is that misconceptions can spread if we
> let them. That could cause people to write incorrect code later on.
> Please update the commit description either by dropping 2) or by
> replacing it with something else. For example:
> 
>   2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
>      Size value and can submit descriptor chains that exceed it. That is
>      a spec violation but is accepted by QEMU's device implementation.
> 
>      When the guest creates a descriptor chain larger than 1024 the
>      following QEMU error is printed and the guest hangs:
> 
>      virtio: too many write descriptors in indirect table

I am fine with both, though I would prefer the text block above over silently 
dropping the reason, just for clarity.

But keep in mind that this might not be limited to virtio-9p, even though your 
proposed text suggests so; see below.

> > > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > > 
> > > > > >     work correctly with the new value of 32768.
> > > > > > 
> > > > > > So let's turn this hard coded global value into a runtime
> > > > > > variable as a first step in this commit, configurable for each
> > > > > > virtio user by passing a corresponding value with virtio_init()
> > > > > > call.
> > > > > 
> > > > > virtio_add_queue() already has an int queue_size argument, why isn't
> > > > > that enough to deal with the maximum queue size? There's probably a
> > > > > good
> > > > > reason for it, but please include it in the commit description.
> > > > 
> > > > [...]
> > > > 
> > > > > Can you make this value per-vq instead of per-vdev since virtqueues
> > > > > can
> > > > > have different queue sizes?
> > > > > 
> > > > > The same applies to the rest of this patch. Anything using
> > > > > vdev->queue_max_size should probably use vq->vring.num instead.
> > > > 
> > > > I would like to avoid that and keep it per device. The maximum size
> > > > stored
> > > > there is the maximum size supported by virtio user (or vortio device
> > > > model,
> > > > however you want to call it). So that's really a limit per device, not
> > > > per
> > > > queue, as no queue of the device would ever exceed that limit.
> > > > 
> > > > Plus a lot more code would need to be refactored, which I think is
> > > > unnecessary.
> > > 
> > > I'm against a per-device limit because it's a concept that cannot
> > > accurately describe reality. Some devices have multiple classes of
> > 
> > It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is not
> > per queue either ATM, and nobody ever cared.
> > 
> > All this series does, is allowing to override that currently project-wide
> > compile-time constant to a per-driver-model compile-time constant. Which
> > makes sense, because that's what it is: some drivers could cope with any
> > transfer size, and some drivers are constrained to a certain maximum
> > application specific transfer size (e.g. IOV_MAX).
> > 
> > > virtqueues and they are sized differently, so a per-device limit is
> > > insufficient. virtio-net has separate rx_queue_size and tx_queue_size
> > > parameters (plus a control vq hardcoded to 64 descriptors).
> > 
> > I simply find this overkill. This value semantically means "my driver
> > model
> > supports at any time and at any coincidence at the very most x * PAGE_SIZE
> > = max_transfer_size". Do you see any driver that might want a more fine
> > graded control over this?
> 
> One reason why per-vq limits could make sense is that the maximum
> possible number of struct elements is allocated upfront in some code
> paths. Those code paths may need to differentiate between per-vq limits
> for performance or memory utilization reasons. Today some places
> allocate 1024 elements on the stack in some code paths, but maybe that's
> not acceptable when the per-device limit is 32k. This can matter when a
> device has vqs with very different sizes.
> 
[...]
> > ... I leave that up to Michael or whoever might be in charge to decide. I
> > still find this overkill, but I will adapt this to whatever the decision
> > eventually will be in v3.
> > 
> > But then please tell me the precise representation that you find
> > appropriate, i.e. whether you want a new function for that, or rather an
> > additional argument to virtio_add_queue(). Your call.
> 
> virtio_add_queue() already takes an int queue_size argument. I think the
> necessary information is already there.
> 
> This patch just needs to be tweaked to use the virtio_queue_get_num()
> (or a new virtqueue_get_num() API if that's easier because only a
> VirtQueue *vq pointer is available) instead of introducing a new
> per-device limit.

My understanding is that both the original 9p virtio device authors, as well as 
other virtio device authors in QEMU, have been and still are using this as a 
default value (i.e. to allocate some upfront, and the rest on demand).

So yes, I know your argument about the spec, but AFAICS if I just took this 
existing numeric argument as the limit, it would probably break those other 
QEMU devices as well.

Best regards,
Christian Schoenebeck
Stefan Hajnoczi Oct. 6, 2021, 2:42 p.m. UTC | #8
On Wed, Oct 06, 2021 at 02:50:07PM +0200, Christian Schoenebeck wrote:
> On Mittwoch, 6. Oktober 2021 13:06:55 CEST Stefan Hajnoczi wrote:
> > On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck wrote:
> > > On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > > > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck wrote:
> > > > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck 
> wrote:
> > > > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > > > > variable per virtio user.
> > > > > > 
> > > > > > virtio user == virtio device model?
> > > > > 
> > > > > Yes
> > > > > 
> > > > > > > Reasons:
> > > > > > > 
> > > > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > > > > 
> > > > > > >     maximum queue size possible. Which is actually the maximum
> > > > > > >     queue size allowed by the virtio protocol. The appropriate
> > > > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > > > >     
> > > > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.
> > > > > > >     1-cs
> > > > > > >     01.h
> > > > > > >     tml#x1-240006
> > > > > > >     
> > > > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > > > > >     more or less arbitrary value of 1024 in the past, which
> > > > > > >     limits the maximum transfer size with virtio to 4M
> > > > > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > > > > >     being 4k).
> > > > > > 
> > > > > > Being equal to IOV_MAX is a likely reason. Buffers with more iovecs
> > > > > > than
> > > > > > that cannot be passed to host system calls (sendmsg(2), pwritev(2),
> > > > > > etc).
> > > > > 
> > > > > Yes, that's use case dependent. Hence the solution to opt-in if it is
> > > > > desired and feasible.
> > > > > 
> > > > > > > (2) Additionally the current value of 1024 poses a hidden limit,
> > > > > > > 
> > > > > > >     invisible to guest, which causes a system hang with the
> > > > > > >     following QEMU error if guest tries to exceed it:
> > > > > > >     
> > > > > > >     virtio: too many write descriptors in indirect table
> > > > > > 
> > > > > > I don't understand this point. 2.6.5 The Virtqueue Descriptor Table
> > > 
> > > says:
> > > > > >   The number of descriptors in the table is defined by the queue
> > > > > >   size
> > > > > >   for
> > > > > > 
> > > > > > this virtqueue: this is the maximum possible descriptor chain
> > > > > > length.
> > > > > > 
> > > > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > > > >   A driver MUST NOT create a descriptor chain longer than the Queue
> > > > > >   Size
> > > > > >   of
> > > > > > 
> > > > > > the device.
> > > > > > 
> > > > > > Do you mean a broken/malicious guest driver that is violating the
> > > > > > spec?
> > > > > > That's not a hidden limit, it's defined by the spec.
> > > > > 
> > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.html
> > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.html
> > > > > 
> > > > > You can already go beyond that queue size at runtime with the
> > > > > indirection
> > > > > table. The only actual limit is the currently hard coded value of 1k
> > > > > pages.
> > > > > Hence the suggestion to turn that into a variable.
> > > > 
> > > > Exceeding Queue Size is a VIRTIO spec violation. Drivers that operate
> > > > outsided the spec do so at their own risk. They may not be compatible
> > > > with all device implementations.
> > > 
> > > Yes, I am ware about that. And still, this practice is already done, which
> > > apparently is not limited to 9pfs.
> > > 
> > > > The limit is not hidden, it's Queue Size as defined by the spec :).
> > > > 
> > > > If you have a driver that is exceeding the limit, then please fix the
> > > > driver.
> > > 
> > > I absolutely understand your position, but I hope you also understand that
> > > this violation of the specs is a theoretical issue, it is not a real-life
> > > problem right now, and due to lack of man power unfortunately I have to
> > > prioritize real-life problems over theoretical ones ATM. Keep in mind that
> > > right now I am the only person working on 9pfs actively, I do this
> > > voluntarily whenever I find a free time slice, and I am not paid for it
> > > either.
> > > 
> > > I don't see any reasonable way with reasonable effort to do what you are
> > > asking for here in 9pfs, and Greg may correct me here if I am saying
> > > anything wrong. If you are seeing any specific real-life issue here, then
> > > please tell me which one, otherwise I have to postpone that "specs
> > > violation" issue.
> > > 
> > > There is still a long list of real problems that I need to hunt down in
> > > 9pfs, afterwards I can continue with theoretical ones if you want, but
> > > right now I simply can't, sorry.
> > 
> > I understand. If you don't have time to fix the Linux virtio-9p driver
> > then that's fine.
> 
> I will look at this again, but it might be tricky. On doubt I'll postpone it.
> 
> > I still wanted us to agree on the spec position because the commit
> > description says it's a "hidden limit", which is incorrect. It might
> > seem pedantic, but my concern is that misconceptions can spread if we
> > let them. That could cause people to write incorrect code later on.
> > Please update the commit description either by dropping 2) or by
> > replacing it with something else. For example:
> > 
> >   2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
> >      Size value and can submit descriptor chains that exceed it. That is
> >      a spec violation but is accepted by QEMU's device implementation.
> > 
> >      When the guest creates a descriptor chain larger than 1024 the
> >      following QEMU error is printed and the guest hangs:
> > 
> >      virtio: too many write descriptors in indirect table
> 
> I am fine with both, probably preferring the text block above instead of 
> silently dropping the reason, just for clarity.
> 
> But keep in mind that this might not be limited to virtio-9p as your text 
> would suggest, see below.
> 
> > > > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > > > 
> > > > > > >     work correctly with the new value of 32768.
> > > > > > > 
> > > > > > > So let's turn this hard coded global value into a runtime
> > > > > > > variable as a first step in this commit, configurable for each
> > > > > > > virtio user by passing a corresponding value with virtio_init()
> > > > > > > call.
> > > > > > 
> > > > > > virtio_add_queue() already has an int queue_size argument, why isn't
> > > > > > that enough to deal with the maximum queue size? There's probably a
> > > > > > good
> > > > > > reason for it, but please include it in the commit description.
> > > > > 
> > > > > [...]
> > > > > 
> > > > > > Can you make this value per-vq instead of per-vdev since virtqueues
> > > > > > can
> > > > > > have different queue sizes?
> > > > > > 
> > > > > > The same applies to the rest of this patch. Anything using
> > > > > > vdev->queue_max_size should probably use vq->vring.num instead.
> > > > > 
> > > > > I would like to avoid that and keep it per device. The maximum size
> > > > > stored
> > > > > there is the maximum size supported by virtio user (or vortio device
> > > > > model,
> > > > > however you want to call it). So that's really a limit per device, not
> > > > > per
> > > > > queue, as no queue of the device would ever exceed that limit.
> > > > > 
> > > > > Plus a lot more code would need to be refactored, which I think is
> > > > > unnecessary.
> > > > 
> > > > I'm against a per-device limit because it's a concept that cannot
> > > > accurately describe reality. Some devices have multiple classes of
> > > 
> > > It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is not
> > > per queue either ATM, and nobody ever cared.
> > > 
> > > All this series does, is allowing to override that currently project-wide
> > > compile-time constant to a per-driver-model compile-time constant. Which
> > > makes sense, because that's what it is: some drivers could cope with any
> > > transfer size, and some drivers are constrained to a certain maximum
> > > application specific transfer size (e.g. IOV_MAX).
> > > 
> > > > virtqueues and they are sized differently, so a per-device limit is
> > > > insufficient. virtio-net has separate rx_queue_size and tx_queue_size
> > > > parameters (plus a control vq hardcoded to 64 descriptors).
> > > 
> > > I simply find this overkill. This value semantically means "my driver
> > > model
> > > supports at any time and at any coincidence at the very most x * PAGE_SIZE
> > > = max_transfer_size". Do you see any driver that might want a more fine
> > > graded control over this?
> > 
> > One reason why per-vq limits could make sense is that the maximum
> > possible number of struct elements is allocated upfront in some code
> > paths. Those code paths may need to differentiate between per-vq limits
> > for performance or memory utilization reasons. Today some places
> > allocate 1024 elements on the stack in some code paths, but maybe that's
> > not acceptable when the per-device limit is 32k. This can matter when a
> > device has vqs with very different sizes.
> > 
> [...]
> > > ... I leave that up to Michael or whoever might be in charge to decide. I
> > > still find this overkill, but I will adapt this to whatever the decision
> > > eventually will be in v3.
> > > 
> > > But then please tell me the precise representation that you find
> > > appropriate, i.e. whether you want a new function for that, or rather an
> > > additional argument to virtio_add_queue(). Your call.
> > 
> > virtio_add_queue() already takes an int queue_size argument. I think the
> > necessary information is already there.
> > 
> > This patch just needs to be tweaked to use the virtio_queue_get_num()
> > (or a new virtqueue_get_num() API if that's easier because only a
> > VirtQueue *vq pointer is available) instead of introducing a new
> > per-device limit.
> 
> My understanding is that both the original 9p virtio device authors, as well 
> as other virtio device authors in QEMU have been and are still using this as a 
> default value (i.e. to allocate some upfront, and the rest on demand).
> 
> So yes, I know your argument about the specs, but AFAICS if I would just take 
> this existing numeric argument for the limit, then it would probably break 
> those other QEMU devices as well.

This is a good point that I didn't consider. If guest drivers currently
violate the spec, then restricting descriptor chain length to vring.num
will introduce regressions.

We can't use virtio_queue_get_num() directly. A backwards-compatible
limit is required:

  int virtio_queue_get_desc_chain_max(VirtIODevice *vdev, int n)
  {
      /*
       * QEMU historically allowed 1024 descriptors even if the
       * descriptor table was smaller.
       */
      return MAX(virtio_queue_get_num(vdev, n), 1024);
  }

Device models should call virtio_queue_get_desc_chain_max(). It
preserves the 1024 descriptor chain length but also allows larger values
if the virtqueue was configured appropriately.
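
A hypothetical call site (illustration only, assuming QEMU's virtio
headers; the helper name alloc_request_iov is made up) could then size
its upfront allocation like this:

  /* Hypothetical call site: size an upfront iovec allocation by the
   * backwards-compatible per-queue maximum instead of the old global
   * VIRTQUEUE_MAX_SIZE constant.
   */
  static struct iovec *alloc_request_iov(VirtIODevice *vdev, int qidx)
  {
      int max_chain = virtio_queue_get_desc_chain_max(vdev, qidx);

      return g_new(struct iovec, max_chain);
  }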

Does this address the breakage you were thinking about?

Stefan
Christian Schoenebeck Oct. 7, 2021, 1:09 p.m. UTC | #9
On Mittwoch, 6. Oktober 2021 16:42:34 CEST Stefan Hajnoczi wrote:
> On Wed, Oct 06, 2021 at 02:50:07PM +0200, Christian Schoenebeck wrote:
> > On Mittwoch, 6. Oktober 2021 13:06:55 CEST Stefan Hajnoczi wrote:
> > > On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck wrote:
> > > > On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > > > > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck 
wrote:
> > > > > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > > > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck
> > 
> > wrote:
> > > > > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > > > > > variable per virtio user.
> > > > > > > 
> > > > > > > virtio user == virtio device model?
> > > > > > 
> > > > > > Yes
> > > > > > 
> > > > > > > > Reasons:
> > > > > > > > 
> > > > > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > > > > > 
> > > > > > > >     maximum queue size possible. Which is actually the maximum
> > > > > > > >     queue size allowed by the virtio protocol. The appropriate
> > > > > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > > > > >     
> > > > > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio
> > > > > > > >     -v1.
> > > > > > > >     1-cs
> > > > > > > >     01.h
> > > > > > > >     tml#x1-240006
> > > > > > > >     
> > > > > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > > > > > >     more or less arbitrary value of 1024 in the past, which
> > > > > > > >     limits the maximum transfer size with virtio to 4M
> > > > > > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > > > > > >     being 4k).
> > > > > > > 
> > > > > > > Being equal to IOV_MAX is a likely reason. Buffers with more
> > > > > > > iovecs
> > > > > > > than
> > > > > > > that cannot be passed to host system calls (sendmsg(2),
> > > > > > > pwritev(2),
> > > > > > > etc).
> > > > > > 
> > > > > > Yes, that's use case dependent. Hence the solution to opt-in if it
> > > > > > is
> > > > > > desired and feasible.
> > > > > > 
> > > > > > > > (2) Additionally the current value of 1024 poses a hidden
> > > > > > > > limit,
> > > > > > > > 
> > > > > > > >     invisible to guest, which causes a system hang with the
> > > > > > > >     following QEMU error if guest tries to exceed it:
> > > > > > > >     
> > > > > > > >     virtio: too many write descriptors in indirect table
> > > > > > > 
> > > > > > > I don't understand this point. 2.6.5 The Virtqueue Descriptor
> > > > > > > Table
> > > > 
> > > > says:
> > > > > > >   The number of descriptors in the table is defined by the queue
> > > > > > >   size
> > > > > > >   for
> > > > > > > 
> > > > > > > this virtqueue: this is the maximum possible descriptor chain
> > > > > > > length.
> > > > > > > 
> > > > > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > > > > >   A driver MUST NOT create a descriptor chain longer than the
> > > > > > >   Queue
> > > > > > >   Size
> > > > > > >   of
> > > > > > > 
> > > > > > > the device.
> > > > > > > 
> > > > > > > Do you mean a broken/malicious guest driver that is violating
> > > > > > > the
> > > > > > > spec?
> > > > > > > That's not a hidden limit, it's defined by the spec.
> > > > > > 
> > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.htm
> > > > > > l
> > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.htm
> > > > > > l
> > > > > > 
> > > > > > You can already go beyond that queue size at runtime with the
> > > > > > indirection
> > > > > > table. The only actual limit is the currently hard coded value of
> > > > > > 1k
> > > > > > pages.
> > > > > > Hence the suggestion to turn that into a variable.
> > > > > 
> > > > > Exceeding Queue Size is a VIRTIO spec violation. Drivers that
> > > > > operate
> > > > > outsided the spec do so at their own risk. They may not be
> > > > > compatible
> > > > > with all device implementations.
> > > > 
> > > > Yes, I am ware about that. And still, this practice is already done,
> > > > which
> > > > apparently is not limited to 9pfs.
> > > > 
> > > > > The limit is not hidden, it's Queue Size as defined by the spec :).
> > > > > 
> > > > > If you have a driver that is exceeding the limit, then please fix
> > > > > the
> > > > > driver.
> > > > 
> > > > I absolutely understand your position, but I hope you also understand
> > > > that
> > > > this violation of the specs is a theoretical issue, it is not a
> > > > real-life
> > > > problem right now, and due to lack of man power unfortunately I have
> > > > to
> > > > prioritize real-life problems over theoretical ones ATM. Keep in mind
> > > > that
> > > > right now I am the only person working on 9pfs actively, I do this
> > > > voluntarily whenever I find a free time slice, and I am not paid for
> > > > it
> > > > either.
> > > > 
> > > > I don't see any reasonable way with reasonable effort to do what you
> > > > are
> > > > asking for here in 9pfs, and Greg may correct me here if I am saying
> > > > anything wrong. If you are seeing any specific real-life issue here,
> > > > then
> > > > please tell me which one, otherwise I have to postpone that "specs
> > > > violation" issue.
> > > > 
> > > > There is still a long list of real problems that I need to hunt down
> > > > in
> > > > 9pfs, afterwards I can continue with theoretical ones if you want, but
> > > > right now I simply can't, sorry.
> > > 
> > > I understand. If you don't have time to fix the Linux virtio-9p driver
> > > then that's fine.
> > 
> > I will look at this again, but it might be tricky. On doubt I'll postpone
> > it.> 
> > > I still wanted us to agree on the spec position because the commit
> > > description says it's a "hidden limit", which is incorrect. It might
> > > seem pedantic, but my concern is that misconceptions can spread if we
> > > let them. That could cause people to write incorrect code later on.
> > > Please update the commit description either by dropping 2) or by
> > > 
> > > replacing it with something else. For example:
> > >   2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
> > >   
> > >      Size value and can submit descriptor chains that exceed it. That is
> > >      a spec violation but is accepted by QEMU's device implementation.
> > >      
> > >      When the guest creates a descriptor chain larger than 1024 the
> > >      following QEMU error is printed and the guest hangs:
> > >      
> > >      virtio: too many write descriptors in indirect table
> > 
> > I am fine with both, probably preferring the text block above instead of
> > silently dropping the reason, just for clarity.
> > 
> > But keep in mind that this might not be limited to virtio-9p as your text
> > would suggest, see below.
> > 
> > > > > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > > > > 
> > > > > > > >     work correctly with the new value of 32768.
> > > > > > > > 
> > > > > > > > So let's turn this hard coded global value into a runtime
> > > > > > > > variable as a first step in this commit, configurable for each
> > > > > > > > virtio user by passing a corresponding value with
> > > > > > > > virtio_init()
> > > > > > > > call.
> > > > > > > 
> > > > > > > virtio_add_queue() already has an int queue_size argument, why
> > > > > > > isn't
> > > > > > > that enough to deal with the maximum queue size? There's
> > > > > > > probably a
> > > > > > > good
> > > > > > > reason for it, but please include it in the commit description.
> > > > > > 
> > > > > > [...]
> > > > > > 
> > > > > > > Can you make this value per-vq instead of per-vdev since
> > > > > > > virtqueues
> > > > > > > can
> > > > > > > have different queue sizes?
> > > > > > > 
> > > > > > > The same applies to the rest of this patch. Anything using
> > > > > > > vdev->queue_max_size should probably use vq->vring.num instead.
> > > > > > 
> > > > > > I would like to avoid that and keep it per device. The maximum
> > > > > > size
> > > > > > stored
> > > > > > there is the maximum size supported by virtio user (or vortio
> > > > > > device
> > > > > > model,
> > > > > > however you want to call it). So that's really a limit per device,
> > > > > > not
> > > > > > per
> > > > > > queue, as no queue of the device would ever exceed that limit.
> > > > > > 
> > > > > > Plus a lot more code would need to be refactored, which I think is
> > > > > > unnecessary.
> > > > > 
> > > > > I'm against a per-device limit because it's a concept that cannot
> > > > > accurately describe reality. Some devices have multiple classes of
> > > > 
> > > > It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is
> > > > not
> > > > per queue either ATM, and nobody ever cared.
> > > > 
> > > > All this series does, is allowing to override that currently
> > > > project-wide
> > > > compile-time constant to a per-driver-model compile-time constant.
> > > > Which
> > > > makes sense, because that's what it is: some drivers could cope with
> > > > any
> > > > transfer size, and some drivers are constrained to a certain maximum
> > > > application specific transfer size (e.g. IOV_MAX).
> > > > 
> > > > > virtqueues and they are sized differently, so a per-device limit is
> > > > > insufficient. virtio-net has separate rx_queue_size and
> > > > > tx_queue_size
> > > > > parameters (plus a control vq hardcoded to 64 descriptors).
> > > > 
> > > > I simply find this overkill. This value semantically means "my driver
> > > > model
> > > > supports at any time and at any coincidence at the very most x *
> > > > PAGE_SIZE
> > > > = max_transfer_size". Do you see any driver that might want a more
> > > > fine
> > > > graded control over this?
> > > 
> > > One reason why per-vq limits could make sense is that the maximum
> > > possible number of struct elements is allocated upfront in some code
> > > paths. Those code paths may need to differentiate between per-vq limits
> > > for performance or memory utilization reasons. Today some places
> > > allocate 1024 elements on the stack in some code paths, but maybe that's
> > > not acceptable when the per-device limit is 32k. This can matter when a
> > > device has vqs with very different sizes.
> > 
> > [...]
> > 
> > > > ... I leave that up to Michael or whoever might be in charge to
> > > > decide. I
> > > > still find this overkill, but I will adapt this to whatever the
> > > > decision
> > > > eventually will be in v3.
> > > > 
> > > > But then please tell me the precise representation that you find
> > > > appropriate, i.e. whether you want a new function for that, or rather
> > > > an
> > > > additional argument to virtio_add_queue(). Your call.
> > > 
> > > virtio_add_queue() already takes an int queue_size argument. I think the
> > > necessary information is already there.
> > > 
> > > This patch just needs to be tweaked to use the virtio_queue_get_num()
> > > (or a new virtqueue_get_num() API if that's easier because only a
> > > VirtQueue *vq pointer is available) instead of introducing a new
> > > per-device limit.
> > 
> > My understanding is that both the original 9p virtio device authors, as
> > well as other virtio device authors in QEMU have been and are still using
> > this as a default value (i.e. to allocate some upfront, and the rest on
> > demand).
> > 
> > So yes, I know your argument about the specs, but AFAICS if I would just
> > take this existing numeric argument for the limit, then it would probably
> > break those other QEMU devices as well.
> 
> This is a good point that I didn't consider. If guest drivers currently
> violate the spec, then restricting descriptor chain length to vring.num
> will introduce regressions.
> 
> We can't use virtio_queue_get_num() directly. A backwards-compatible
> limit is required:
> 
>   int virtio_queue_get_desc_chain_max(VirtIODevice *vdev, int n)
>   {
>       /*
>        * QEMU historically allowed 1024 descriptors even if the
>        * descriptor table was smaller.
>        */
>       return MAX(virtio_queue_get_num(vdev, qidx), 1024);
>   }

That was an alternative that I thought about as well, but decided against. It 
would require devices (that want to support large transmission sizes) to 
create the virtio queue(s) with the maximum possible size, i.e.:

  virtio_add_queue(32k);

And that's the point where my current lack of knowledge, about what this would 
precisely mean for the resulting allocations, made me decide against it. I 
mean, would QEMU's virtio implementation then just a) allocate 32k 
scatter-gather list entries? Or would it rather b) additionally also allocate 
the destination memory pages as well?

If you know the answer to that, that would be appreciated. Otherwise I will 
check this when I find some time.

If it is b), then I guess many people would not be happy about this, because 
that change would mean allocating 128M for every 9p mount point, even if 
people don't care about large transmission sizes.

> Device models should call virtio_queue_get_desc_chain_max(). It
> preserves the 1024 descriptor chain length but also allows larger values
> if the virtqueue was configured appropriately.
> 
> Does this address the breakage you were thinking about?
> 
> Stefan

Best regards,
Christian Schoenebeck
Stefan Hajnoczi Oct. 7, 2021, 3:18 p.m. UTC | #10
On Thu, Oct 07, 2021 at 03:09:16PM +0200, Christian Schoenebeck wrote:
> On Mittwoch, 6. Oktober 2021 16:42:34 CEST Stefan Hajnoczi wrote:
> > On Wed, Oct 06, 2021 at 02:50:07PM +0200, Christian Schoenebeck wrote:
> > > On Mittwoch, 6. Oktober 2021 13:06:55 CEST Stefan Hajnoczi wrote:
> > > > On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck wrote:
> > > > > On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > > > > > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck 
> wrote:
> > > > > > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi wrote:
> > > > > > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian Schoenebeck
> > > 
> > > wrote:
> > > > > > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a runtime
> > > > > > > > > variable per virtio user.
> > > > > > > > 
> > > > > > > > virtio user == virtio device model?
> > > > > > > 
> > > > > > > Yes
> > > > > > > 
> > > > > > > > > Reasons:
> > > > > > > > > 
> > > > > > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute theoretical
> > > > > > > > > 
> > > > > > > > >     maximum queue size possible. Which is actually the maximum
> > > > > > > > >     queue size allowed by the virtio protocol. The appropriate
> > > > > > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > > > > > >     
> > > > > > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio
> > > > > > > > >     -v1.
> > > > > > > > >     1-cs
> > > > > > > > >     01.h
> > > > > > > > >     tml#x1-240006
> > > > > > > > >     
> > > > > > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with a
> > > > > > > > >     more or less arbitrary value of 1024 in the past, which
> > > > > > > > >     limits the maximum transfer size with virtio to 4M
> > > > > > > > >     (more precise: 1024 * PAGE_SIZE, with the latter typically
> > > > > > > > >     being 4k).
> > > > > > > > 
> > > > > > > > Being equal to IOV_MAX is a likely reason. Buffers with more
> > > > > > > > iovecs
> > > > > > > > than
> > > > > > > > that cannot be passed to host system calls (sendmsg(2),
> > > > > > > > pwritev(2),
> > > > > > > > etc).
> > > > > > > 
> > > > > > > Yes, that's use case dependent. Hence the solution to opt-in if it
> > > > > > > is
> > > > > > > desired and feasible.
> > > > > > > 
> > > > > > > > > (2) Additionally the current value of 1024 poses a hidden
> > > > > > > > > limit,
> > > > > > > > > 
> > > > > > > > >     invisible to guest, which causes a system hang with the
> > > > > > > > >     following QEMU error if guest tries to exceed it:
> > > > > > > > >     
> > > > > > > > >     virtio: too many write descriptors in indirect table
> > > > > > > > 
> > > > > > > > I don't understand this point. 2.6.5 The Virtqueue Descriptor
> > > > > > > > Table
> > > > > 
> > > > > says:
> > > > > > > >   The number of descriptors in the table is defined by the queue
> > > > > > > >   size
> > > > > > > >   for
> > > > > > > > 
> > > > > > > > this virtqueue: this is the maximum possible descriptor chain
> > > > > > > > length.
> > > > > > > > 
> > > > > > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors says:
> > > > > > > >   A driver MUST NOT create a descriptor chain longer than the
> > > > > > > >   Queue
> > > > > > > >   Size
> > > > > > > >   of
> > > > > > > > 
> > > > > > > > the device.
> > > > > > > > 
> > > > > > > > Do you mean a broken/malicious guest driver that is violating
> > > > > > > > the
> > > > > > > > spec?
> > > > > > > > That's not a hidden limit, it's defined by the spec.
> > > > > > > 
> > > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781.htm
> > > > > > > l
> > > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788.htm
> > > > > > > l
> > > > > > > 
> > > > > > > You can already go beyond that queue size at runtime with the
> > > > > > > indirection
> > > > > > > table. The only actual limit is the currently hard coded value of
> > > > > > > 1k
> > > > > > > pages.
> > > > > > > Hence the suggestion to turn that into a variable.
> > > > > > 
> > > > > > Exceeding Queue Size is a VIRTIO spec violation. Drivers that
> > > > > > operate
> > > > > > outsided the spec do so at their own risk. They may not be
> > > > > > compatible
> > > > > > with all device implementations.
> > > > > 
> > > > > Yes, I am ware about that. And still, this practice is already done,
> > > > > which
> > > > > apparently is not limited to 9pfs.
> > > > > 
> > > > > > The limit is not hidden, it's Queue Size as defined by the spec :).
> > > > > > 
> > > > > > If you have a driver that is exceeding the limit, then please fix
> > > > > > the
> > > > > > driver.
> > > > > 
> > > > > I absolutely understand your position, but I hope you also understand
> > > > > that
> > > > > this violation of the specs is a theoretical issue, it is not a
> > > > > real-life
> > > > > problem right now, and due to lack of man power unfortunately I have
> > > > > to
> > > > > prioritize real-life problems over theoretical ones ATM. Keep in mind
> > > > > that
> > > > > right now I am the only person working on 9pfs actively, I do this
> > > > > voluntarily whenever I find a free time slice, and I am not paid for
> > > > > it
> > > > > either.
> > > > > 
> > > > > I don't see any reasonable way with reasonable effort to do what you
> > > > > are
> > > > > asking for here in 9pfs, and Greg may correct me here if I am saying
> > > > > anything wrong. If you are seeing any specific real-life issue here,
> > > > > then
> > > > > please tell me which one, otherwise I have to postpone that "specs
> > > > > violation" issue.
> > > > > 
> > > > > There is still a long list of real problems that I need to hunt down
> > > > > in
> > > > > 9pfs, afterwards I can continue with theoretical ones if you want, but
> > > > > right now I simply can't, sorry.
> > > > 
> > > > I understand. If you don't have time to fix the Linux virtio-9p driver
> > > > then that's fine.
> > > 
> > > I will look at this again, but it might be tricky. On doubt I'll postpone
> > > it.> 
> > > > I still wanted us to agree on the spec position because the commit
> > > > description says it's a "hidden limit", which is incorrect. It might
> > > > seem pedantic, but my concern is that misconceptions can spread if we
> > > > let them. That could cause people to write incorrect code later on.
> > > > Please update the commit description either by dropping 2) or by
> > > > 
> > > > replacing it with something else. For example:
> > > >   2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
> > > >   
> > > >      Size value and can submit descriptor chains that exceed it. That is
> > > >      a spec violation but is accepted by QEMU's device implementation.
> > > >      
> > > >      When the guest creates a descriptor chain larger than 1024 the
> > > >      following QEMU error is printed and the guest hangs:
> > > >      
> > > >      virtio: too many write descriptors in indirect table
> > > 
> > > I am fine with both, probably preferring the text block above instead of
> > > silently dropping the reason, just for clarity.
> > > 
> > > But keep in mind that this might not be limited to virtio-9p as your text
> > > would suggest, see below.
> > > 
> > > > > > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > > > > > 
> > > > > > > > >     work correctly with the new value of 32768.
> > > > > > > > > 
> > > > > > > > > So let's turn this hard coded global value into a runtime
> > > > > > > > > variable as a first step in this commit, configurable for each
> > > > > > > > > virtio user by passing a corresponding value with
> > > > > > > > > virtio_init()
> > > > > > > > > call.
> > > > > > > > 
> > > > > > > > virtio_add_queue() already has an int queue_size argument, why
> > > > > > > > isn't
> > > > > > > > that enough to deal with the maximum queue size? There's
> > > > > > > > probably a
> > > > > > > > good
> > > > > > > > reason for it, but please include it in the commit description.
> > > > > > > 
> > > > > > > [...]
> > > > > > > 
> > > > > > > > Can you make this value per-vq instead of per-vdev since
> > > > > > > > virtqueues
> > > > > > > > can
> > > > > > > > have different queue sizes?
> > > > > > > > 
> > > > > > > > The same applies to the rest of this patch. Anything using
> > > > > > > > vdev->queue_max_size should probably use vq->vring.num instead.
> > > > > > > 
> > > > > > > I would like to avoid that and keep it per device. The maximum
> > > > > > > size
> > > > > > > stored
> > > > > > > there is the maximum size supported by virtio user (or vortio
> > > > > > > device
> > > > > > > model,
> > > > > > > however you want to call it). So that's really a limit per device,
> > > > > > > not
> > > > > > > per
> > > > > > > queue, as no queue of the device would ever exceed that limit.
> > > > > > > 
> > > > > > > Plus a lot more code would need to be refactored, which I think is
> > > > > > > unnecessary.
> > > > > > 
> > > > > > I'm against a per-device limit because it's a concept that cannot
> > > > > > accurately describe reality. Some devices have multiple classes of
> > > > > 
> > > > > It describes current reality, because VIRTQUEUE_MAX_SIZE obviously is
> > > > > not
> > > > > per queue either ATM, and nobody ever cared.
> > > > > 
> > > > > All this series does, is allowing to override that currently
> > > > > project-wide
> > > > > compile-time constant to a per-driver-model compile-time constant.
> > > > > Which
> > > > > makes sense, because that's what it is: some drivers could cope with
> > > > > any
> > > > > transfer size, and some drivers are constrained to a certain maximum
> > > > > application specific transfer size (e.g. IOV_MAX).
> > > > > 
> > > > > > virtqueues and they are sized differently, so a per-device limit is
> > > > > > insufficient. virtio-net has separate rx_queue_size and
> > > > > > tx_queue_size
> > > > > > parameters (plus a control vq hardcoded to 64 descriptors).
> > > > > 
> > > > > I simply find this overkill. This value semantically means "my driver
> > > > > model
> > > > > supports at any time and at any coincidence at the very most x *
> > > > > PAGE_SIZE
> > > > > = max_transfer_size". Do you see any driver that might want a more
> > > > > fine
> > > > > graded control over this?
> > > > 
> > > > One reason why per-vq limits could make sense is that the maximum
> > > > possible number of struct elements is allocated upfront in some code
> > > > paths. Those code paths may need to differentiate between per-vq limits
> > > > for performance or memory utilization reasons. Today some places
> > > > allocate 1024 elements on the stack in some code paths, but maybe that's
> > > > not acceptable when the per-device limit is 32k. This can matter when a
> > > > device has vqs with very different sizes.
> > > 
> > > [...]
> > > 
> > > > > ... I leave that up to Michael or whoever might be in charge to
> > > > > decide. I
> > > > > still find this overkill, but I will adapt this to whatever the
> > > > > decision
> > > > > eventually will be in v3.
> > > > > 
> > > > > But then please tell me the precise representation that you find
> > > > > appropriate, i.e. whether you want a new function for that, or rather
> > > > > an
> > > > > additional argument to virtio_add_queue(). Your call.
> > > > 
> > > > virtio_add_queue() already takes an int queue_size argument. I think the
> > > > necessary information is already there.
> > > > 
> > > > This patch just needs to be tweaked to use the virtio_queue_get_num()
> > > > (or a new virtqueue_get_num() API if that's easier because only a
> > > > VirtQueue *vq pointer is available) instead of introducing a new
> > > > per-device limit.
> > > 
> > > My understanding is that both the original 9p virtio device authors, as
> > > well as other virtio device authors in QEMU have been and are still using
> > > this as a default value (i.e. to allocate some upfront, and the rest on
> > > demand).
> > > 
> > > So yes, I know your argument about the specs, but AFAICS if I would just
> > > take this existing numeric argument for the limit, then it would probably
> > > break those other QEMU devices as well.
> > 
> > This is a good point that I didn't consider. If guest drivers currently
> > violate the spec, then restricting descriptor chain length to vring.num
> > will introduce regressions.
> > 
> > We can't use virtio_queue_get_num() directly. A backwards-compatible
> > limit is required:
> > 
> >   int virtio_queue_get_desc_chain_max(VirtIODevice *vdev, int n)
> >   {
> >       /*
> >        * QEMU historically allowed 1024 descriptors even if the
> >        * descriptor table was smaller.
> >        */
> >       return MAX(virtio_queue_get_num(vdev, n), 1024);
> >   }
> 
> That was an alternative that I thought about as well, but decided against. It 
> would require devices (that would want to support large transmissions sizes) 
> to create the virtio queue(s) with the maximum possible size, i.e:
> 
>   virtio_add_queue(32k);

The spec allows drivers to set the size of the vring as long as they do
not exceed Queue Size.

The Linux drivers accept the device's default size, so you're right that
this would cause large vrings to be allocated if the device sets the
virtqueue size to 32k.

> And that's the point where my current lack of knowledge, of what this would 
> precisely mean to the resulting allocation set, decided against it. I mean 
> would that mean would QEMU's virtio implementation would just a) allocate 32k 
> scatter gather list entries? Or would it rather b) additionally also allocate 
> the destination memory pages as well?

The vring consumes guest RAM but it just consists of descriptors, not
the buffer memory pages. The guest RAM requirements are:
- split layout: 32k * 16 + 6 + 32k * 2 + 6 + 8 * 32k = 851,980 bytes
- packed layout: 32k * 16 + 4 + 4 = 524,296 bytes

That's still quite large!
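
For clarity, that split-ring figure is just the sum of the three vring
areas (16 bytes per descriptor, 2 bytes per avail ring entry plus a
6 byte header, 8 bytes per used ring entry plus a 6 byte header):

  descriptor table: 32768 * 16     = 524,288 bytes
  avail ring:       6 + 32768 * 2  =  65,542 bytes
  used ring:        6 + 32768 * 8  = 262,150 bytes
                             total = 851,980 bytes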

By the way, virtio-blk currently uses a virtqueue size of 256
descriptors and this has been found reasonable for disk I/O performance.
The Linux block layer splits requests at around 1.25 MB for virtio-blk.
The virtio-blk queue limits are reported by the device and the guest
Linux block layer uses them to size/split requests appropriately. I'm
not sure 9p really needs 32k, although you're right that fragmented
physical memory requires 32k descriptors to describe 128 MB of buffers.

Going back to the original problem, a vring feature bit could be added
to the VIRTIO specification indicating that indirect descriptor tables
are limited to the maximum (32k) instead of Queue Size. This way the
device's default vring size could be small but drivers could allocate
indirect descriptor tables that are large when necessary. Then the Linux
virtio driver API would need to report the maximum supported sglist
length for a given virtqueue so drivers can take advantage of this
information.
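
Purely as a thought experiment (no such feature bit exists in the spec
today; the flag name below is made up and only stands in for the proposal
above), the driver-side computation of the supported sglist length could
boil down to:

  #include <stdbool.h>

  /* Thought experiment only: if a (hypothetical) "indirect tables may be
   * larger than Queue Size" feature were negotiated, the sglist limit
   * would be the protocol maximum of 32768, otherwise the vring size.
   */
  static unsigned int max_sg_entries(bool indirect_max_negotiated,
                                     unsigned int vring_num)
  {
      return indirect_max_negotiated ? 32768u : vring_num;
  }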

Stefan
Christian Schoenebeck Oct. 8, 2021, 2:48 p.m. UTC | #11
On Donnerstag, 7. Oktober 2021 17:18:03 CEST Stefan Hajnoczi wrote:
> On Thu, Oct 07, 2021 at 03:09:16PM +0200, Christian Schoenebeck wrote:
> > On Mittwoch, 6. Oktober 2021 16:42:34 CEST Stefan Hajnoczi wrote:
> > > On Wed, Oct 06, 2021 at 02:50:07PM +0200, Christian Schoenebeck wrote:
> > > > On Mittwoch, 6. Oktober 2021 13:06:55 CEST Stefan Hajnoczi wrote:
> > > > > On Tue, Oct 05, 2021 at 06:32:46PM +0200, Christian Schoenebeck 
wrote:
> > > > > > On Dienstag, 5. Oktober 2021 17:10:40 CEST Stefan Hajnoczi wrote:
> > > > > > > On Tue, Oct 05, 2021 at 03:15:26PM +0200, Christian Schoenebeck
> > 
> > wrote:
> > > > > > > > On Dienstag, 5. Oktober 2021 14:45:56 CEST Stefan Hajnoczi 
wrote:
> > > > > > > > > On Mon, Oct 04, 2021 at 09:38:04PM +0200, Christian
> > > > > > > > > Schoenebeck
> > > > 
> > > > wrote:
> > > > > > > > > > Refactor VIRTQUEUE_MAX_SIZE to effectively become a
> > > > > > > > > > runtime
> > > > > > > > > > variable per virtio user.
> > > > > > > > > 
> > > > > > > > > virtio user == virtio device model?
> > > > > > > > 
> > > > > > > > Yes
> > > > > > > > 
> > > > > > > > > > Reasons:
> > > > > > > > > > 
> > > > > > > > > > (1) VIRTQUEUE_MAX_SIZE should reflect the absolute
> > > > > > > > > > theoretical
> > > > > > > > > > 
> > > > > > > > > >     maximum queue size possible. Which is actually the
> > > > > > > > > >     maximum
> > > > > > > > > >     queue size allowed by the virtio protocol. The
> > > > > > > > > >     appropriate
> > > > > > > > > >     value for VIRTQUEUE_MAX_SIZE would therefore be 32768:
> > > > > > > > > >     
> > > > > > > > > >     https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/vi
> > > > > > > > > >     rtio
> > > > > > > > > >     -v1.
> > > > > > > > > >     1-cs
> > > > > > > > > >     01.h
> > > > > > > > > >     tml#x1-240006
> > > > > > > > > >     
> > > > > > > > > >     Apparently VIRTQUEUE_MAX_SIZE was instead defined with
> > > > > > > > > >     a
> > > > > > > > > >     more or less arbitrary value of 1024 in the past,
> > > > > > > > > >     which
> > > > > > > > > >     limits the maximum transfer size with virtio to 4M
> > > > > > > > > >     (more precise: 1024 * PAGE_SIZE, with the latter
> > > > > > > > > >     typically
> > > > > > > > > >     being 4k).
> > > > > > > > > 
> > > > > > > > > Being equal to IOV_MAX is a likely reason. Buffers with more
> > > > > > > > > iovecs
> > > > > > > > > than
> > > > > > > > > that cannot be passed to host system calls (sendmsg(2),
> > > > > > > > > pwritev(2),
> > > > > > > > > etc).
> > > > > > > > 
> > > > > > > > Yes, that's use case dependent. Hence the solution to opt-in
> > > > > > > > if it
> > > > > > > > is
> > > > > > > > desired and feasible.
> > > > > > > > 
> > > > > > > > > > (2) Additionally the current value of 1024 poses a hidden
> > > > > > > > > > limit,
> > > > > > > > > > 
> > > > > > > > > >     invisible to guest, which causes a system hang with
> > > > > > > > > >     the
> > > > > > > > > >     following QEMU error if guest tries to exceed it:
> > > > > > > > > >     
> > > > > > > > > >     virtio: too many write descriptors in indirect table
> > > > > > > > > 
> > > > > > > > > I don't understand this point. 2.6.5 The Virtqueue
> > > > > > > > > Descriptor
> > > > > > > > > Table
> > > > > > 
> > > > > > says:
> > > > > > > > >   The number of descriptors in the table is defined by the
> > > > > > > > >   queue
> > > > > > > > >   size
> > > > > > > > >   for
> > > > > > > > > 
> > > > > > > > > this virtqueue: this is the maximum possible descriptor
> > > > > > > > > chain
> > > > > > > > > length.
> > > > > > > > > 
> > > > > > > > > and 2.6.5.3.1 Driver Requirements: Indirect Descriptors 
says:
> > > > > > > > >   A driver MUST NOT create a descriptor chain longer than
> > > > > > > > >   the
> > > > > > > > >   Queue
> > > > > > > > >   Size
> > > > > > > > >   of
> > > > > > > > > 
> > > > > > > > > the device.
> > > > > > > > > 
> > > > > > > > > Do you mean a broken/malicious guest driver that is
> > > > > > > > > violating
> > > > > > > > > the
> > > > > > > > > spec?
> > > > > > > > > That's not a hidden limit, it's defined by the spec.
> > > > > > > > 
> > > > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00781
> > > > > > > > .htm
> > > > > > > > l
> > > > > > > > https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg00788
> > > > > > > > .htm
> > > > > > > > l
> > > > > > > > 
> > > > > > > > You can already go beyond that queue size at runtime with the
> > > > > > > > indirection
> > > > > > > > table. The only actual limit is the currently hard coded value
> > > > > > > > of
> > > > > > > > 1k
> > > > > > > > pages.
> > > > > > > > Hence the suggestion to turn that into a variable.
> > > > > > > 
> > > > > > > Exceeding Queue Size is a VIRTIO spec violation. Drivers that
> > > > > > > operate
> > > > > > > outsided the spec do so at their own risk. They may not be
> > > > > > > compatible
> > > > > > > with all device implementations.
> > > > > > 
> > > > > > Yes, I am ware about that. And still, this practice is already
> > > > > > done,
> > > > > > which
> > > > > > apparently is not limited to 9pfs.
> > > > > > 
> > > > > > > The limit is not hidden, it's Queue Size as defined by the spec
> > > > > > > :).
> > > > > > > 
> > > > > > > If you have a driver that is exceeding the limit, then please
> > > > > > > fix
> > > > > > > the
> > > > > > > driver.
> > > > > > 
> > > > > > I absolutely understand your position, but I hope you also
> > > > > > understand
> > > > > > that
> > > > > > this violation of the specs is a theoretical issue, it is not a
> > > > > > real-life
> > > > > > problem right now, and due to lack of man power unfortunately I
> > > > > > have
> > > > > > to
> > > > > > prioritize real-life problems over theoretical ones ATM. Keep in
> > > > > > mind
> > > > > > that
> > > > > > right now I am the only person working on 9pfs actively, I do this
> > > > > > voluntarily whenever I find a free time slice, and I am not paid
> > > > > > for
> > > > > > it
> > > > > > either.
> > > > > > 
> > > > > > I don't see any reasonable way with reasonable effort to do what
> > > > > > you
> > > > > > are
> > > > > > asking for here in 9pfs, and Greg may correct me here if I am
> > > > > > saying
> > > > > > anything wrong. If you are seeing any specific real-life issue
> > > > > > here,
> > > > > > then
> > > > > > please tell me which one, otherwise I have to postpone that "specs
> > > > > > violation" issue.
> > > > > > 
> > > > > > There is still a long list of real problems that I need to hunt
> > > > > > down
> > > > > > in
> > > > > > 9pfs, afterwards I can continue with theoretical ones if you want,
> > > > > > but
> > > > > > right now I simply can't, sorry.
> > > > > 
> > > > > I understand. If you don't have time to fix the Linux virtio-9p
> > > > > driver
> > > > > then that's fine.
> > > > 
> > > > I will look at this again, but it might be tricky. If in doubt I'll
> > > > postpone it.
> > > > 
> > > > > I still wanted us to agree on the spec position because the commit
> > > > > description says it's a "hidden limit", which is incorrect. It might
> > > > > seem pedantic, but my concern is that misconceptions can spread if we
> > > > > let them. That could cause people to write incorrect code later on.
> > > > > Please update the commit description either by dropping 2) or by
> > > > > replacing it with something else. For example:
> > > > > 
> > > > >   2) The Linux virtio-9p guest driver does not honor the VIRTIO Queue
> > > > >      Size value and can submit descriptor chains that exceed it. That is
> > > > >      a spec violation but is accepted by QEMU's device implementation.
> > > > > 
> > > > >      When the guest creates a descriptor chain larger than 1024 the
> > > > >      following QEMU error is printed and the guest hangs:
> > > > > 
> > > > >      virtio: too many write descriptors in indirect table
> > > > 
> > > > I am fine with both, probably preferring the text block above instead of
> > > > silently dropping the reason, just for clarity.
> > > > 
> > > > But keep in mind that this might not be limited to virtio-9p as your text
> > > > would suggest, see below.
> > > > 
> > > > > > > > > > (3) Unfortunately not all virtio users in QEMU would currently
> > > > > > > > > >     work correctly with the new value of 32768.
> > > > > > > > > > 
> > > > > > > > > > So let's turn this hard coded global value into a runtime
> > > > > > > > > > variable as a first step in this commit, configurable for each
> > > > > > > > > > virtio user by passing a corresponding value with virtio_init()
> > > > > > > > > > call.
> > > > > > > > > 
> > > > > > > > > virtio_add_queue() already has an int queue_size argument, why
> > > > > > > > > isn't that enough to deal with the maximum queue size? There's
> > > > > > > > > probably a good reason for it, but please include it in the
> > > > > > > > > commit description.
> > > > > > > > 
> > > > > > > > [...]
> > > > > > > > 
> > > > > > > > > Can you make this value per-vq instead of per-vdev since
> > > > > > > > > virtqueues can have different queue sizes?
> > > > > > > > > 
> > > > > > > > > The same applies to the rest of this patch. Anything using
> > > > > > > > > vdev->queue_max_size should probably use vq->vring.num
> > > > > > > > > instead.
> > > > > > > > 
> > > > > > > > I would like to avoid that and keep it per device. The maximum
> > > > > > > > size stored there is the maximum size supported by the virtio user
> > > > > > > > (or virtio device model, whatever you want to call it). So that's
> > > > > > > > really a limit per device, not per queue, as no queue of the device
> > > > > > > > would ever exceed that limit.
> > > > > > > > 
> > > > > > > > Plus a lot more code would need to be refactored, which I
> > > > > > > > think is
> > > > > > > > unnecessary.
> > > > > > > 
> > > > > > > I'm against a per-device limit because it's a concept that
> > > > > > > cannot
> > > > > > > accurately describe reality. Some devices have multiple classes
> > > > > > > of
> > > > > > 
> > > > > > It describes current reality, because VIRTQUEUE_MAX_SIZE obviously
> > > > > > is
> > > > > > not
> > > > > > per queue either ATM, and nobody ever cared.
> > > > > > 
> > > > > > All this series does is allow overriding that currently project-wide
> > > > > > compile-time constant with a per-driver-model compile-time constant.
> > > > > > Which makes sense, because that's what it is: some drivers could cope
> > > > > > with any transfer size, and some drivers are constrained to a certain
> > > > > > maximum application-specific transfer size (e.g. IOV_MAX).
> > > > > > 
> > > > > > > virtqueues and they are sized differently, so a per-device limit
> > > > > > > is
> > > > > > > insufficient. virtio-net has separate rx_queue_size and
> > > > > > > tx_queue_size
> > > > > > > parameters (plus a control vq hardcoded to 64 descriptors).
> > > > > > 
> > > > > > I simply find this overkill. This value semantically means "my driver
> > > > > > model supports at any time and under any circumstances at the very
> > > > > > most x * PAGE_SIZE = max_transfer_size". Do you see any driver that
> > > > > > might want more fine-grained control over this?
> > > > > 
> > > > > One reason why per-vq limits could make sense is that the maximum
> > > > > possible number of struct elements is allocated upfront in some code
> > > > > paths. Those code paths may need to differentiate between per-vq
> > > > > limits
> > > > > for performance or memory utilization reasons. Today some places
> > > > > allocate 1024 elements on the stack in some code paths, but maybe
> > > > > that's
> > > > > not acceptable when the per-device limit is 32k. This can matter
> > > > > when a
> > > > > device has vqs with very different sizes.
> > > > 
> > > > [...]
> > > > 
> > > > > > ... I leave that up to Michael or whoever might be in charge to
> > > > > > decide. I
> > > > > > still find this overkill, but I will adapt this to whatever the
> > > > > > decision
> > > > > > eventually will be in v3.
> > > > > > 
> > > > > > But then please tell me the precise representation that you find
> > > > > > appropriate, i.e. whether you want a new function for that, or
> > > > > > rather
> > > > > > an
> > > > > > additional argument to virtio_add_queue(). Your call.
> > > > > 
> > > > > virtio_add_queue() already takes an int queue_size argument. I think
> > > > > the
> > > > > necessary information is already there.
> > > > > 
> > > > > This patch just needs to be tweaked to use the
> > > > > virtio_queue_get_num()
> > > > > (or a new virtqueue_get_num() API if that's easier because only a
> > > > > VirtQueue *vq pointer is available) instead of introducing a new
> > > > > per-device limit.
> > > > 
> > > > My understanding is that both the original 9p virtio device authors,
> > > > as
> > > > well as other virtio device authors in QEMU have been and are still
> > > > using
> > > > this as a default value (i.e. to allocate some upfront, and the rest
> > > > on
> > > > demand).
> > > > 
> > > > So yes, I know your argument about the specs, but AFAICS if I would
> > > > just
> > > > take this existing numeric argument for the limit, then it would
> > > > probably
> > > > break those other QEMU devices as well.
> > > 
> > > This is a good point that I didn't consider. If guest drivers currently
> > > violate the spec, then restricting descriptor chain length to vring.num
> > > will introduce regressions.
> > > 
> > > We can't use virtio_queue_get_num() directly. A backwards-compatible
> > > limit is required:
> > > 
> > >   int virtio_queue_get_desc_chain_max(VirtIODevice *vdev, int qidx)
> > >   {
> > >       /*
> > >        * QEMU historically allowed 1024 descriptors even if the
> > >        * descriptor table was smaller.
> > >        */
> > >       return MAX(virtio_queue_get_num(vdev, qidx), 1024);
> > >   }
> > 
> > That was an alternative that I thought about as well, but decided against.
> > It would require devices (that would want to support large transmission
> > sizes) to create the virtio queue(s) with the maximum possible size, i.e.:
> >   virtio_add_queue(32k);
> 
> The spec allows drivers to set the size of the vring as long as they do
> not exceed Queue Size.
> 
> The Linux drivers accept the device's default size, so you're right that
> this would cause large vrings to be allocated if the device sets the
> virtqueue size to 32k.
> 
> > And that's the point where my current lack of knowledge about what this
> > would precisely mean for the resulting allocations decided against it. I
> > mean, would QEMU's virtio implementation then just a) allocate 32k scatter
> > gather list entries? Or would it rather b) additionally also allocate the
> > destination memory pages as well?
> 
> The vring consumes guest RAM but it just consists of descriptors, not
> the buffer memory pages. The guest RAM requirements are:
> - split layout: 32k * 16 + 6 + 32k * 2 + 6 + 8 * 32k = 851,980 bytes
> - packed layout: 32k * 16 + 4 + 4 = 524,296 bytes
> 
> That's still quite large!
> 
> By the way, virtio-blk currently uses a virtqueue size of 256
> descriptors and this has been found reasonable for disk I/O performance.
> The Linux block layer splits requests at around 1.25 MB for virtio-blk.
> The virtio-blk queue limits are reported by the device and the guest
> Linux block layer uses them to size/split requests appropriately. I'm
> not sure 9p really needs 32k, although you're right that fragmented
> physical memory requires 32k descriptors to describe 128 MB of buffers.
> 
> Going back to the original problem, a vring feature bit could be added
> to the VIRTIO specification indicating that indirect descriptor tables
> are limited to the maximum (32k) instead of Queue Size. This way the
> device's default vring size could be small but drivers could allocate
> indirect descriptor tables that are large when necessary. Then the Linux
> virtio driver API would need to report the maximum supported sglist
> length for a given virtqueue so drivers can take advantage of this
> information.

Due to forced pragmatism, ~1M of unused/wasted space would be acceptable IMHO.

But if that could really go up to 128M+ when physical RAM is highly fragmented,
as you said, then a cleaner solution would definitely make sense, yes, if
that's possible.

But as changing the spec etc. is probably a long process, it would make sense
to first do some more tests with the kernel patches to find out whether there
is some show stopper like IOV_MAX anyway.

As for whether large transfer sizes make sense at all: well, from the
benchmarks I ran so far I "think" it does make sense to go beyond 4M. It might
be something specific to 9p (it's a full file server); I guess it has a higher
latency than raw virtio block devices. OTOH, with M.2 SSDs we now have several
thousand MB/s, so I'm not sure whether the old common transfer size of 1M for
block devices is still reasonable today.

Best regards,
Christian Schoenebeck
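
For reference, the split- and packed-ring sizes quoted earlier in this thread
can be reproduced with the small standalone sketch below. It merely restates
the VIRTIO 1.1 ring layout arithmetic (16-byte descriptors, 2-byte avail and
8-byte used ring entries plus their 6-byte headers, and two 4-byte event
suppression structures for the packed layout), ignoring alignment padding as
in the numbers above; the queue size of 32768 is the maximum discussed in this
thread, and the helper names are made up for illustration only.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Guest RAM used by one split-layout vring (VIRTIO 1.1, sec. 2.6):
     * descriptor table + avail ring + used ring. */
    static uint64_t split_vring_bytes(uint64_t qsize)
    {
        return qsize * 16          /* descriptor table               */
             + 6 + qsize * 2       /* avail: flags, idx, ring, event */
             + 6 + qsize * 8;      /* used:  flags, idx, ring, event */
    }

    /* Guest RAM used by one packed-layout vring (VIRTIO 1.1, sec. 2.7):
     * descriptor ring + driver/device event suppression areas. */
    static uint64_t packed_vring_bytes(uint64_t qsize)
    {
        return qsize * 16 + 4 + 4;
    }

    int main(void)
    {
        printf("split:  %" PRIu64 " bytes\n", split_vring_bytes(32768));  /* 851980 */
        printf("packed: %" PRIu64 " bytes\n", packed_vring_bytes(32768)); /* 524296 */
        return 0;
    }

Compiled and run as-is, this prints the 851,980 and 524,296 byte figures from
the discussion.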
diff mbox series

Patch

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..cd5d95dd51 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -216,7 +216,8 @@  static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
     }
 
     v->config_size = sizeof(struct virtio_9p_config) + strlen(s->fsconf.tag);
-    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size);
+    virtio_init(vdev, "virtio-9p", VIRTIO_ID_9P, v->config_size,
+                VIRTQUEUE_MAX_SIZE);
     v->vq = virtio_add_queue(vdev, MAX_REQ, handle_9p_output);
 }
 
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index ba13cb87e5..336f56705c 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -491,7 +491,7 @@  static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
     }
 
     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
-                sizeof(struct virtio_blk_config));
+                sizeof(struct virtio_blk_config), VIRTQUEUE_MAX_SIZE);
 
     s->virtqs = g_new(VirtQueue *, s->num_queues);
     for (i = 0; i < s->num_queues; i++) {
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f139cd7cc9..9c0f46815c 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1213,7 +1213,8 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
 
     virtio_blk_set_config_size(s, s->host_features);
 
-    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size);
+    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK, s->config_size,
+                VIRTQUEUE_MAX_SIZE);
 
     s->blk = conf->conf.blk;
     s->rq = NULL;
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index f01ec2137c..9ad9111115 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -1045,7 +1045,7 @@  static void virtio_serial_device_realize(DeviceState *dev, Error **errp)
         config_size = offsetof(struct virtio_console_config, emerg_wr);
     }
     virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
-                config_size);
+                config_size, VIRTQUEUE_MAX_SIZE);
 
     /* Spawn a new virtio-serial bus on which the ports will ride as devices */
     qbus_init(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS,
diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
index c8da4806e0..20b06a7adf 100644
--- a/hw/display/virtio-gpu-base.c
+++ b/hw/display/virtio-gpu-base.c
@@ -171,7 +171,7 @@  virtio_gpu_base_device_realize(DeviceState *qdev,
 
     g->virtio_config.num_scanouts = cpu_to_le32(g->conf.max_outputs);
     virtio_init(VIRTIO_DEVICE(g), "virtio-gpu", VIRTIO_ID_GPU,
-                sizeof(struct virtio_gpu_config));
+                sizeof(struct virtio_gpu_config), VIRTQUEUE_MAX_SIZE);
 
     if (virtio_gpu_virgl_enabled(g->conf)) {
         /* use larger control queue in 3d mode */
diff --git a/hw/input/virtio-input.c b/hw/input/virtio-input.c
index 54bcb46c74..345eb2cce7 100644
--- a/hw/input/virtio-input.c
+++ b/hw/input/virtio-input.c
@@ -258,7 +258,7 @@  static void virtio_input_device_realize(DeviceState *dev, Error **errp)
     assert(vinput->cfg_size <= sizeof(virtio_input_config));
 
     virtio_init(vdev, "virtio-input", VIRTIO_ID_INPUT,
-                vinput->cfg_size);
+                vinput->cfg_size, VIRTQUEUE_MAX_SIZE);
     vinput->evt = virtio_add_queue(vdev, 64, virtio_input_handle_evt);
     vinput->sts = virtio_add_queue(vdev, 64, virtio_input_handle_sts);
 }
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index f205331dcf..f74b5f6268 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1746,9 +1746,9 @@  static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
     VirtIONet *n = qemu_get_nic_opaque(nc);
     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
     VirtIODevice *vdev = VIRTIO_DEVICE(n);
-    VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
-    size_t lens[VIRTQUEUE_MAX_SIZE];
-    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
+    VirtQueueElement *elems[vdev->queue_max_size];
+    size_t lens[vdev->queue_max_size];
+    struct iovec mhdr_sg[vdev->queue_max_size];
     struct virtio_net_hdr_mrg_rxbuf mhdr;
     unsigned mhdr_cnt = 0;
     size_t offset, i, guest_offset, j;
@@ -1783,7 +1783,7 @@  static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
 
         total = 0;
 
-        if (i == VIRTQUEUE_MAX_SIZE) {
+        if (i == vdev->queue_max_size) {
             virtio_error(vdev, "virtio-net unexpected long buffer chain");
             err = size;
             goto err;
@@ -2532,7 +2532,7 @@  static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
     for (;;) {
         ssize_t ret;
         unsigned int out_num;
-        struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
+        struct iovec sg[vdev->queue_max_size], sg2[vdev->queue_max_size + 1], *out_sg;
         struct virtio_net_hdr_mrg_rxbuf mhdr;
 
         elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
@@ -2564,7 +2564,7 @@  static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
                 out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
                                    out_sg, out_num,
                                    n->guest_hdr_len, -1);
-                if (out_num == VIRTQUEUE_MAX_SIZE) {
+                if (out_num == vdev->queue_max_size) {
                     goto drop;
                 }
                 out_num += 1;
@@ -3364,7 +3364,8 @@  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
     }
 
     virtio_net_set_config_size(n, n->host_features);
-    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
+    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size,
+                VIRTQUEUE_MAX_SIZE);
 
     /*
      * We set a lower limit on RX queue size to what it always was.
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 51fd09522a..5e5e657e1d 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -973,7 +973,7 @@  void virtio_scsi_common_realize(DeviceState *dev,
     int i;
 
     virtio_init(vdev, "virtio-scsi", VIRTIO_ID_SCSI,
-                sizeof(VirtIOSCSIConfig));
+                sizeof(VirtIOSCSIConfig), VIRTQUEUE_MAX_SIZE);
 
     if (s->conf.num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
         s->conf.num_queues = 1;
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index c595957983..ae1672d667 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -220,7 +220,7 @@  static void vuf_device_realize(DeviceState *dev, Error **errp)
     }
 
     virtio_init(vdev, "vhost-user-fs", VIRTIO_ID_FS,
-                sizeof(struct virtio_fs_config));
+                sizeof(struct virtio_fs_config), VIRTQUEUE_MAX_SIZE);
 
     /* Hiprio queue */
     fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
index d172632bb0..eeb1d8853a 100644
--- a/hw/virtio/vhost-user-i2c.c
+++ b/hw/virtio/vhost-user-i2c.c
@@ -220,7 +220,8 @@  static void vu_i2c_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0);
+    virtio_init(vdev, "vhost-user-i2c", VIRTIO_ID_I2C_ADAPTER, 0,
+                VIRTQUEUE_MAX_SIZE);
 
     i2c->vhost_dev.nvqs = 1;
     i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output);
diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c
index 4ad6e234ad..a81fa884a8 100644
--- a/hw/virtio/vhost-vsock-common.c
+++ b/hw/virtio/vhost-vsock-common.c
@@ -201,7 +201,7 @@  void vhost_vsock_common_realize(VirtIODevice *vdev, const char *name)
     VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
 
     virtio_init(vdev, name, VIRTIO_ID_VSOCK,
-                sizeof(struct virtio_vsock_config));
+                sizeof(struct virtio_vsock_config), VIRTQUEUE_MAX_SIZE);
 
     /* Receive and transmit queues belong to vhost */
     vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 5a69dce35d..067c73223d 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -886,7 +886,7 @@  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
     int ret;
 
     virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
-                virtio_balloon_config_size(s));
+                virtio_balloon_config_size(s), VIRTQUEUE_MAX_SIZE);
 
     ret = qemu_add_balloon_handler(virtio_balloon_to_target,
                                    virtio_balloon_stat, s);
@@ -909,7 +909,7 @@  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
 
     if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
-        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
+        s->free_page_vq = virtio_add_queue(vdev, vdev->queue_max_size,
                                            virtio_balloon_handle_free_page_vq);
         precopy_add_notifier(&s->free_page_hint_notify);
 
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 54f9bbb789..1e70d4d2a8 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -810,7 +810,8 @@  static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    virtio_init(vdev, "virtio-crypto", VIRTIO_ID_CRYPTO, vcrypto->config_size);
+    virtio_init(vdev, "virtio-crypto", VIRTIO_ID_CRYPTO, vcrypto->config_size,
+                VIRTQUEUE_MAX_SIZE);
     vcrypto->curr_queues = 1;
     vcrypto->vqs = g_malloc0(sizeof(VirtIOCryptoQueue) * vcrypto->max_queues);
     for (i = 0; i < vcrypto->max_queues; i++) {
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 1b23e8e18c..ca360e74eb 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -974,7 +974,7 @@  static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
     VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
 
     virtio_init(vdev, "virtio-iommu", VIRTIO_ID_IOMMU,
-                sizeof(struct virtio_iommu_config));
+                sizeof(struct virtio_iommu_config), VIRTQUEUE_MAX_SIZE);
 
     memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
 
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
index df91e454b2..1d9d01b871 100644
--- a/hw/virtio/virtio-mem.c
+++ b/hw/virtio/virtio-mem.c
@@ -738,7 +738,7 @@  static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
     vmem->bitmap = bitmap_new(vmem->bitmap_size);
 
     virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM,
-                sizeof(struct virtio_mem_config));
+                sizeof(struct virtio_mem_config), VIRTQUEUE_MAX_SIZE);
     vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
 
     host_memory_backend_set_mapped(vmem->memdev, true);
diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
index d1aeb90a31..82b54b00c5 100644
--- a/hw/virtio/virtio-pmem.c
+++ b/hw/virtio/virtio-pmem.c
@@ -124,7 +124,7 @@  static void virtio_pmem_realize(DeviceState *dev, Error **errp)
 
     host_memory_backend_set_mapped(pmem->memdev, true);
     virtio_init(vdev, TYPE_VIRTIO_PMEM, VIRTIO_ID_PMEM,
-                sizeof(struct virtio_pmem_config));
+                sizeof(struct virtio_pmem_config), VIRTQUEUE_MAX_SIZE);
     pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
 }
 
diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c
index cc8e9f775d..0e91d60106 100644
--- a/hw/virtio/virtio-rng.c
+++ b/hw/virtio/virtio-rng.c
@@ -215,7 +215,7 @@  static void virtio_rng_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    virtio_init(vdev, "virtio-rng", VIRTIO_ID_RNG, 0);
+    virtio_init(vdev, "virtio-rng", VIRTIO_ID_RNG, 0, VIRTQUEUE_MAX_SIZE);
 
     vrng->vq = virtio_add_queue(vdev, 8, handle_input);
     vrng->quota_remaining = vrng->conf.max_bytes;
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 240759ff0b..60e094d96a 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1419,8 +1419,8 @@  static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
     VirtIODevice *vdev = vq->vdev;
     VirtQueueElement *elem = NULL;
     unsigned out_num, in_num, elem_entries;
-    hwaddr addr[VIRTQUEUE_MAX_SIZE];
-    struct iovec iov[VIRTQUEUE_MAX_SIZE];
+    hwaddr addr[vdev->queue_max_size];
+    struct iovec iov[vdev->queue_max_size];
     VRingDesc desc;
     int rc;
 
@@ -1492,7 +1492,7 @@  static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
         if (desc.flags & VRING_DESC_F_WRITE) {
             map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
                                         iov + out_num,
-                                        VIRTQUEUE_MAX_SIZE - out_num, true,
+                                        vdev->queue_max_size - out_num, true,
                                         desc.addr, desc.len);
         } else {
             if (in_num) {
@@ -1500,7 +1500,7 @@  static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
                 goto err_undo_map;
             }
             map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
-                                        VIRTQUEUE_MAX_SIZE, false,
+                                        vdev->queue_max_size, false,
                                         desc.addr, desc.len);
         }
         if (!map_ok) {
@@ -1556,8 +1556,8 @@  static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
     VirtIODevice *vdev = vq->vdev;
     VirtQueueElement *elem = NULL;
     unsigned out_num, in_num, elem_entries;
-    hwaddr addr[VIRTQUEUE_MAX_SIZE];
-    struct iovec iov[VIRTQUEUE_MAX_SIZE];
+    hwaddr addr[vdev->queue_max_size];
+    struct iovec iov[vdev->queue_max_size];
     VRingPackedDesc desc;
     uint16_t id;
     int rc;
@@ -1620,7 +1620,7 @@  static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
         if (desc.flags & VRING_DESC_F_WRITE) {
             map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
                                         iov + out_num,
-                                        VIRTQUEUE_MAX_SIZE - out_num, true,
+                                        vdev->queue_max_size - out_num, true,
                                         desc.addr, desc.len);
         } else {
             if (in_num) {
@@ -1628,7 +1628,7 @@  static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
                 goto err_undo_map;
             }
             map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
-                                        VIRTQUEUE_MAX_SIZE, false,
+                                        vdev->queue_max_size, false,
                                         desc.addr, desc.len);
         }
         if (!map_ok) {
@@ -2249,7 +2249,7 @@  void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
      * nonexistent states, or to set it to an invalid size.
      */
     if (!!num != !!vdev->vq[n].vring.num ||
-        num > VIRTQUEUE_MAX_SIZE ||
+        num > vdev->queue_max_size ||
         num < 0) {
         return;
     }
@@ -2400,7 +2400,7 @@  VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
             break;
     }
 
-    if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE)
+    if (i == VIRTIO_QUEUE_MAX || queue_size > vdev->queue_max_size)
         abort();
 
     vdev->vq[i].vring.num = queue_size;
@@ -3239,13 +3239,25 @@  void virtio_instance_init_common(Object *proxy_obj, void *data,
 }
 
 void virtio_init(VirtIODevice *vdev, const char *name,
-                 uint16_t device_id, size_t config_size)
+                 uint16_t device_id, size_t config_size,
+                 uint16_t queue_max_size)
 {
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
     int i;
     int nvectors = k->query_nvectors ? k->query_nvectors(qbus->parent) : 0;
 
+    if (queue_max_size > VIRTQUEUE_MAX_SIZE ||
+        !is_power_of_2(queue_max_size))
+    {
+        error_report(
+            "virtio: invalid queue_max_size (= %" PRIu16 "), must be a "
+            "power of 2 and between 1 ... %d.",
+            queue_max_size, VIRTQUEUE_MAX_SIZE
+        );
+        abort();
+    }
+
     if (nvectors) {
         vdev->vector_queues =
             g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
@@ -3258,6 +3270,7 @@  void virtio_init(VirtIODevice *vdev, const char *name,
     qatomic_set(&vdev->isr, 0);
     vdev->queue_sel = 0;
     vdev->config_vector = VIRTIO_NO_VECTOR;
+    vdev->queue_max_size = queue_max_size;
     vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
     vdev->vm_running = runstate_is_running();
     vdev->broken = false;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 8bab9cfb75..a37d1f7d52 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -89,6 +89,7 @@  struct VirtIODevice
     size_t config_len;
     void *config;
     uint16_t config_vector;
+    uint16_t queue_max_size;
     uint32_t generation;
     int nvectors;
     VirtQueue *vq;
@@ -166,7 +167,9 @@  void virtio_instance_init_common(Object *proxy_obj, void *data,
                                  size_t vdev_size, const char *vdev_name);
 
 void virtio_init(VirtIODevice *vdev, const char *name,
-                         uint16_t device_id, size_t config_size);
+                 uint16_t device_id, size_t config_size,
+                 uint16_t queue_max_size);
+
 void virtio_cleanup(VirtIODevice *vdev);
 
 void virtio_error(VirtIODevice *vdev, const char *fmt, ...) GCC_FMT_ATTR(2, 3);
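
To illustrate how a virtio user is expected to consume the new parameter, here
is a hypothetical realize function that is not part of this series: VirtIOFoo,
VIRTIO_FOO, VIRTIO_ID_FOO, virtio_foo_handle_output and the 1024 limit are
made-up placeholders standing in for a device whose host backend is bounded by
something like IOV_MAX, whereas the devices touched by this patch simply keep
passing VIRTQUEUE_MAX_SIZE.

    /* Hypothetical example, not part of this patch. The per-device maximum
     * must be a power of 2 and <= VIRTQUEUE_MAX_SIZE, otherwise the new
     * check in virtio_init() aborts. */
    static void virtio_foo_device_realize(DeviceState *dev, Error **errp)
    {
        VirtIODevice *vdev = VIRTIO_DEVICE(dev);
        VirtIOFoo *foo = VIRTIO_FOO(dev);

        virtio_init(vdev, "virtio-foo", VIRTIO_ID_FOO,
                    sizeof(struct virtio_foo_config),
                    1024 /* per-device maximum queue size */);

        /* Individual queues may still be created smaller than the
         * per-device maximum passed to virtio_init(). */
        foo->vq = virtio_add_queue(vdev, 256, virtio_foo_handle_output);
    }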