
[5/5] virtio-scsi: introduce multiqueue support

Message ID 1346154857-12487-6-git-send-email-pbonzini@redhat.com (mailing list archive)
State New, archived

Commit Message

Paolo Bonzini Aug. 28, 2012, 11:54 a.m. UTC
This patch adds queue steering to virtio-scsi.  When a target is sent
multiple requests, we always drive them to the same queue so that FIFO
processing order is kept.  However, if a target was idle, we can choose
a queue arbitrarily.  In this case the queue is chosen according to the
current VCPU, so the driver expects the number of request queues to be
equal to the number of VCPUs.  This makes it easy and fast to select
the queue, and also lets the driver optimize the IRQ affinity for the
virtqueues (each virtqueue's affinity is set to the CPU that "owns"
the queue).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 drivers/scsi/virtio_scsi.c |  162 +++++++++++++++++++++++++++++++++++---------
 1 files changed, 130 insertions(+), 32 deletions(-)

Comments

Nicholas A. Bellinger Sept. 4, 2012, 2:21 a.m. UTC | #1
On Tue, 2012-08-28 at 13:54 +0200, Paolo Bonzini wrote:
> This patch adds queue steering to virtio-scsi.  When a target is sent
> multiple requests, we always drive them to the same queue so that FIFO
> processing order is kept.  However, if a target was idle, we can choose
> a queue arbitrarily.  In this case the queue is chosen according to the
> current VCPU, so the driver expects the number of request queues to be
> equal to the number of VCPUs.  This makes it easy and fast to select
> the queue, and also lets the driver optimize the IRQ affinity for the
> virtqueues (each virtqueue's affinity is set to the CPU that "owns"
> the queue).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---

Hey Paolo & Co,

I've not had a chance to try this with tcm_vhost just yet, but noticed
one thing wrt assumptions about virtio_scsi_target_state->reqs access
below..

>  drivers/scsi/virtio_scsi.c |  162 +++++++++++++++++++++++++++++++++++---------
>  1 files changed, 130 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> index 6414ea0..0c4b096 100644
> --- a/drivers/scsi/virtio_scsi.c
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -26,6 +26,7 @@
>  
>  #define VIRTIO_SCSI_MEMPOOL_SZ 64
>  #define VIRTIO_SCSI_EVENT_LEN 8
> +#define VIRTIO_SCSI_VQ_BASE 2
>  
>  /* Command queue element */
>  struct virtio_scsi_cmd {
> @@ -59,9 +60,13 @@ struct virtio_scsi_vq {
>  
>  /* Per-target queue state */
>  struct virtio_scsi_target_state {
> -	/* Protects sg.  Lock hierarchy is tgt_lock -> vq_lock.  */
> +	/* Protects sg, req_vq.  Lock hierarchy is tgt_lock -> vq_lock.  */
>  	spinlock_t tgt_lock;
>  
> +	struct virtio_scsi_vq *req_vq;
> +
> +	atomic_t reqs;
> +
>  	/* For sglist construction when adding commands to the virtqueue.  */
>  	struct scatterlist sg[];
>  };
> @@ -70,14 +75,15 @@ struct virtio_scsi_target_state {
>  struct virtio_scsi {
>  	struct virtio_device *vdev;
>  
> -	struct virtio_scsi_vq ctrl_vq;
> -	struct virtio_scsi_vq event_vq;
> -	struct virtio_scsi_vq req_vq;
> -
>  	/* Get some buffers ready for event vq */
>  	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
>  
> +	u32 num_queues;
>  	struct virtio_scsi_target_state **tgt;
> +
> +	struct virtio_scsi_vq ctrl_vq;
> +	struct virtio_scsi_vq event_vq;
> +	struct virtio_scsi_vq req_vqs[];
>  };
>  
>  static struct kmem_cache *virtscsi_cmd_cache;
> @@ -112,6 +118,9 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
>  	struct virtio_scsi_cmd *cmd = buf;
>  	struct scsi_cmnd *sc = cmd->sc;
>  	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> +
> +	atomic_dec(&tgt->reqs);
>  

As tgt->tgt_lock is taken in virtscsi_queuecommand_multi() before the
atomic_inc_return(tgt->reqs) check, it seems like using atomic_dec() w/o
smp_mb__after_atomic_dec or tgt_lock access here is not using atomic.h
accessors properly, no..?

>  	dev_dbg(&sc->device->sdev_gendev,
>  		"cmd %p response %u status %#02x sense_len %u\n",
> @@ -185,11 +194,13 @@ static void virtscsi_req_done(struct virtqueue *vq)
>  {
>  	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
>  	struct virtio_scsi *vscsi = shost_priv(sh);
> +	int index = virtqueue_get_queue_index(vq) - VIRTIO_SCSI_VQ_BASE;
> +	struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
> +	spin_lock_irqsave(&req_vq->vq_lock, flags);
>  	virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
> -	spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
> +	spin_unlock_irqrestore(&req_vq->vq_lock, flags);
>  };
>  
>  static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
> @@ -429,10 +440,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
>  	return ret;
>  }
>  
> -static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
> +static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
> +				 struct virtio_scsi_target_state *tgt,
> +				 struct scsi_cmnd *sc)
>  {
> -	struct virtio_scsi *vscsi = shost_priv(sh);
> -	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
>  	struct virtio_scsi_cmd *cmd;
>  	int ret;
>  
> @@ -466,7 +477,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
>  	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
>  	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
>  
> -	if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
> +	if (virtscsi_kick_cmd(tgt, tgt->req_vq, cmd,
>  			      sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
>  			      GFP_ATOMIC) >= 0)
>  		ret = 0;
> @@ -475,6 +486,38 @@ out:
>  	return ret;
>  }
>  
> +static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
> +					struct scsi_cmnd *sc)
> +{
> +	struct virtio_scsi *vscsi = shost_priv(sh);
> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> +
> +	atomic_inc(&tgt->reqs);
> +	return virtscsi_queuecommand(vscsi, tgt, sc);
> +}
> +

...

> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
> +				       struct scsi_cmnd *sc)
> +{
> +	struct virtio_scsi *vscsi = shost_priv(sh);
> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> +	unsigned long flags;
> +	u32 queue_num;
> +
> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
> +	 * decrement it without taking the spinlock.
> +	 */
> +	spin_lock_irqsave(&tgt->tgt_lock, flags);
> +	if (atomic_inc_return(&tgt->reqs) == 1) {
> +		queue_num = smp_processor_id();
> +		while (unlikely(queue_num >= vscsi->num_queues))
> +			queue_num -= vscsi->num_queues;
> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
> +	}
> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
> +	return virtscsi_queuecommand(vscsi, tgt, sc);
> +}
> +

The extra memory barriers to get this right for the current approach are
just going to slow things down even more for virtio-scsi-mq..

After hearing Jen's blk-mq talk last week in San Diego + having a look
at the new code in linux-block.git/new-queue, the approach of using a
per-cpu lock-less list hw -> sw queue that uses IPI + numa_node hints
to make smart decisions for the completion path is making a lot of sense.

Jen's approach is what we will ultimately need to re-architect in SCSI
core if we're ever going to move beyond the issues of legacy host_lock,
so I'm wondering if maybe this is the direction that virtio-scsi-mq
needs to go in as well..?

--nab

Paolo Bonzini Sept. 4, 2012, 6:46 a.m. UTC | #2
On 04/09/2012 04:21, Nicholas A. Bellinger wrote:
>> @@ -112,6 +118,9 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
>>  	struct virtio_scsi_cmd *cmd = buf;
>>  	struct scsi_cmnd *sc = cmd->sc;
>>  	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
>> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
>> +
>> +	atomic_dec(&tgt->reqs);
>>  
> 
> As tgt->tgt_lock is taken in virtscsi_queuecommand_multi() before the
> atomic_inc_return(tgt->reqs) check, it seems like using atomic_dec() w/o
> smp_mb__after_atomic_dec or tgt_lock access here is not using atomic.h
> accessors properly, no..?

No, only a single "thing" is being accessed, and there is no need to
order the decrement with respect to preceding or subsequent accesses to
other locations.

In other words, tgt->reqs is already synchronized with itself, and that
is enough.

(Besides, on x86 smp_mb__after_atomic_dec is a nop).
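For contrast, a minimal sketch of the kind of situation where the barrier
pairing would matter (hypothetical code, not from this driver; the struct
and field names are made up):

#include <linux/atomic.h>

/*
 * Here the counter publishes data kept at another location, so the store
 * must be ordered before the decrement that makes it observable.
 * (smp_mb__after_atomic_dec() covers the mirror-image case, where a later
 * access must be ordered after the decrement.  Readers that observe
 * pending reach zero need a matching barrier before reading result.)
 */
struct foo {
	int result;		/* read by whoever sees pending hit zero */
	atomic_t pending;
};

static void foo_finish(struct foo *f, int result)
{
	f->result = result;		/* store to a second location ...    */
	smp_mb__before_atomic_dec();	/* ... ordered before ...            */
	atomic_dec(&f->pending);	/* ... the decrement other CPUs poll */
}

tgt->reqs is not in that situation: the atomic_dec() in
virtscsi_complete_cmd() publishes nothing beyond the counter itself, so no
barrier pairing is needed.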

>> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
>> +				       struct scsi_cmnd *sc)
>> +{
>> +	struct virtio_scsi *vscsi = shost_priv(sh);
>> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
>> +	unsigned long flags;
>> +	u32 queue_num;
>> +
>> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
>> +	 * decrement it without taking the spinlock.
>> +	 */
>> +	spin_lock_irqsave(&tgt->tgt_lock, flags);
>> +	if (atomic_inc_return(&tgt->reqs) == 1) {
>> +		queue_num = smp_processor_id();
>> +		while (unlikely(queue_num >= vscsi->num_queues))
>> +			queue_num -= vscsi->num_queues;
>> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
>> +	}
>> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
>> +	return virtscsi_queuecommand(vscsi, tgt, sc);
>> +}
>> +
> 
> The extra memory barriers to get this right for the current approach are
> just going to slow things down even more for virtio-scsi-mq..

virtio-scsi multiqueue has a performance benefit up to 20% (for a single
LUN) or 40% (on overall bandwidth across multiple LUNs).  I doubt that a
single memory barrier can have that much impact. :)

The way to go to improve performance even more is to add new virtio APIs
for finer control of the usage of the ring.  These should let us avoid
copying the sg list and almost get rid of the tgt_lock; even though the
locking is quite efficient in virtio-scsi (see how tgt_lock and vq_lock
are "pipelined" so as to overlap the preparation of two requests), it
should give a nice improvement and especially avoid a kmalloc with small
requests.  I may have some time for it next month.
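For readers who have not looked at the kick path, a rough sketch of the
tgt_lock/vq_lock "pipelining" mentioned above (schematic only; the real
virtscsi_kick_cmd() differs in its details and parameters):

/*
 * tgt_lock protects the per-target scatterlist, vq_lock protects the
 * virtqueue.  tgt_lock is dropped as soon as the buffers have been handed
 * to the virtqueue, so a second request for the same target can start
 * being mapped while the first one is still kicking the queue.
 */
static int kick_cmd_sketch(struct virtio_scsi_target_state *tgt,
			   struct virtio_scsi_vq *vq,
			   struct virtio_scsi_cmd *cmd,
			   unsigned int out_num, unsigned int in_num)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&tgt->tgt_lock, flags);	/* protects tgt->sg */
	/* ... fill tgt->sg for this command ... */

	spin_lock(&vq->vq_lock);			/* protects the vq  */
	ret = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd,
				GFP_ATOMIC);
	spin_unlock(&tgt->tgt_lock);	/* sg consumed: next request may map */
	if (ret >= 0)
		virtqueue_kick(vq->vq);
	spin_unlock_irqrestore(&vq->vq_lock, flags);

	return ret;
}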

> Jen's approach is what we will ultimately need to re-architect in SCSI
> core if we're ever going to move beyond the issues of legacy host_lock,
> so I'm wondering if maybe this is the direction that virtio-scsi-mq
> needs to go in as well..?

We can see after the block layer multiqueue work goes in...  I also need
to look more closely at Jens's changes.

Have you measured the host_lock to be a bottleneck in high-iops
benchmarks, even for a modern driver that does not hold it in
queuecommand?  (Certainly it will become more important as the
virtio-scsi queuecommand becomes thinner and thinner).  If so, we can
start looking at limiting host_lock usage in the fast path.

BTW, supporting this in tcm-vhost should be quite trivial, as all the
request queues are the same and all serialization is done in the
virtio-scsi driver.

Paolo
Michael S. Tsirkin Sept. 4, 2012, 8:46 a.m. UTC | #3
On Tue, Sep 04, 2012 at 08:46:12AM +0200, Paolo Bonzini wrote:
> On 04/09/2012 04:21, Nicholas A. Bellinger wrote:
> >> @@ -112,6 +118,9 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
> >>  	struct virtio_scsi_cmd *cmd = buf;
> >>  	struct scsi_cmnd *sc = cmd->sc;
> >>  	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
> >> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >> +
> >> +	atomic_dec(&tgt->reqs);
> >>  
> > 
> > As tgt->tgt_lock is taken in virtscsi_queuecommand_multi() before the
> > atomic_inc_return(tgt->reqs) check, it seems like using atomic_dec() w/o
> > smp_mb__after_atomic_dec or tgt_lock access here is not using atomic.h
> > accessors properly, no..?
> 
> No, only a single "thing" is being accessed, and there is no need to
> order the decrement with respect to preceding or subsequent accesses to
> other locations.
>
> In other words, tgt->reqs is already synchronized with itself, and that
> is enough.

I think your logic is correct and barrier is not needed,
but this needs better documentation.

> (Besides, on x86 smp_mb__after_atomic_dec is a nop).
> >> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
> >> +				       struct scsi_cmnd *sc)
> >> +{
> >> +	struct virtio_scsi *vscsi = shost_priv(sh);
> >> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >> +	unsigned long flags;
> >> +	u32 queue_num;
> >> +
> >> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
> >> +	 * decrement it without taking the spinlock.
> >> +	 */

Above comment is not really helpful - reader can be safely assumed to
know what atomic_t is.

Please delete, and replace with the text from commit log
that explains the heuristic used to select req_vq.

Also please add a comment near 'reqs' definition.
Something like "number of outstanding requests - used to detect idle
target".


> >> +	spin_lock_irqsave(&tgt->tgt_lock, flags);

Looks like this lock can be removed - req_vq is only
modified when target is idle and only used when it is
not idle.

> >> +	if (atomic_inc_return(&tgt->reqs) == 1) {
> >> +		queue_num = smp_processor_id();
> >> +		while (unlikely(queue_num >= vscsi->num_queues))
> >> +			queue_num -= vscsi->num_queues;
> >> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
> >> +	}
> >> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
> >> +	return virtscsi_queuecommand(vscsi, tgt, sc);
> >> +}
> >> +
> >> +

.....

> >> +static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
> >> +                                       struct scsi_cmnd *sc)
> >> +{
> >> +       struct virtio_scsi *vscsi = shost_priv(sh);
> >> +       struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >> +
> >> +       atomic_inc(&tgt->reqs);
> >> +       return virtscsi_queuecommand(vscsi, tgt, sc);
> >> +}
> >> +

Here, reqs is unused - why bother incrementing it?
A branch on completion would be cheaper IMHO.


>virtio-scsi multiqueue has a performance benefit up to 20%

To be fair, you could be running in single queue mode.
In that case extra atomics and indirection that this code
brings will just add overhead without benefits.
I don't know how significant that would be.
Paolo Bonzini Sept. 4, 2012, 10:25 a.m. UTC | #4
On 04/09/2012 10:46, Michael S. Tsirkin wrote:
>>>> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
>>>> +				       struct scsi_cmnd *sc)
>>>> +{
>>>> +	struct virtio_scsi *vscsi = shost_priv(sh);
>>>> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
>>>> +	unsigned long flags;
>>>> +	u32 queue_num;
>>>> +
>>>> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
>>>> +	 * decrement it without taking the spinlock.
>>>> +	 */
> 
> Above comment is not really helpful - reader can be safely assumed to
> know what atomic_t is.

Sure, the comment explains that we use an atomic because _elsewhere_ the
tgt_lock is not held while modifying reqs.

> Please delete, and replace with the text from commit log
> that explains the heuristic used to select req_vq.

Ok.

> Also please add a comment near 'reqs' definition.
> Something like "number of outstanding requests - used to detect idle
> target".

Ok.

> 
>>>> +	spin_lock_irqsave(&tgt->tgt_lock, flags);
> 
> Looks like this lock can be removed - req_vq is only
> modified when target is idle and only used when it is
> not idle.

If you have two incoming requests at the same time, req_vq is also
modified when the target is not idle; that's the point of the lock.

Suppose tgt->reqs = 0 initially, and you have two processors/queues.
Initially tgt->req_vq is queue #1.  If you have this:

    queuecommand on CPU #0         queuecommand #2 on CPU #1
  --------------------------------------------------------------
    atomic_inc_return(...) == 1
                                   atomic_inc_return(...) == 2
                                   virtscsi_queuecommand to queue #1
    tgt->req_vq = queue #0
    virtscsi_queuecommand to queue #0

then two requests are issued to different queues without a quiescent
point in the middle.

>>>> +	if (atomic_inc_return(&tgt->reqs) == 1) {
>>>> +		queue_num = smp_processor_id();
>>>> +		while (unlikely(queue_num >= vscsi->num_queues))
>>>> +			queue_num -= vscsi->num_queues;
>>>> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
>>>> +	}
>>>> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
>>>> +	return virtscsi_queuecommand(vscsi, tgt, sc);
>>>> +}
>>>> +
>>>> +
> 
> .....
> 
>>>> +static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
>>>> +                                       struct scsi_cmnd *sc)
>>>> +{
>>>> +       struct virtio_scsi *vscsi = shost_priv(sh);
>>>> +       struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
>>>> +
>>>> +       atomic_inc(&tgt->reqs);
>>>> +       return virtscsi_queuecommand(vscsi, tgt, sc);
>>>> +}
>>>> +
> 
> Here, reqs is unused - why bother incrementing it?
> A branch on completion would be cheaper IMHO.

Well, I could also let tgt->reqs go negative, but it would be a bit untidy.

Another alternative is to access the target's target_busy field with
ACCESS_ONCE, and drop reqs altogether.  Too tricky to do this kind of
micro-optimization so early, though.

>> virtio-scsi multiqueue has a performance benefit up to 20%
> 
> To be fair, you could be running in single queue mode.
> In that case extra atomics and indirection that this code
> brings will just add overhead without benefits.
> I don't know how significant that would be.

Not measurable in my experiments.

Paolo
Michael S. Tsirkin Sept. 4, 2012, 11:09 a.m. UTC | #5
On Tue, Sep 04, 2012 at 12:25:03PM +0200, Paolo Bonzini wrote:
> On 04/09/2012 10:46, Michael S. Tsirkin wrote:
> >>>> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
> >>>> +				       struct scsi_cmnd *sc)
> >>>> +{
> >>>> +	struct virtio_scsi *vscsi = shost_priv(sh);
> >>>> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >>>> +	unsigned long flags;
> >>>> +	u32 queue_num;
> >>>> +
> >>>> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
> >>>> +	 * decrement it without taking the spinlock.
> >>>> +	 */
> > 
> > Above comment is not really helpful - reader can be safely assumed to
> > know what atomic_t is.
> 
> Sure, the comment explains that we use an atomic because _elsewhere_ the
> tgt_lock is not held while modifying reqs.
> 
> > Please delete, and replace with the text from commit log
> > that explains the heuristic used to select req_vq.
> 
> Ok.
> 
> > Also please add a comment near 'reqs' definition.
> > Something like "number of outstanding requests - used to detect idle
> > target".
> 
> Ok.
> 
> > 
> >>>> +	spin_lock_irqsave(&tgt->tgt_lock, flags);
> > 
> > Looks like this lock can be removed - req_vq is only
> > modified when target is idle and only used when it is
> > not idle.
> 
> If you have two incoming requests at the same time, req_vq is also
> modified when the target is not idle; that's the point of the lock.
> 
> Suppose tgt->reqs = 0 initially, and you have two processors/queues.
> Initially tgt->req_vq is queue #1.  If you have this:
> 
>     queuecommand on CPU #0         queuecommand #2 on CPU #1
>   --------------------------------------------------------------
>     atomic_inc_return(...) == 1
>                                    atomic_inc_return(...) == 2
>                                    virtscsi_queuecommand to queue #1
>     tgt->req_vq = queue #0
>     virtscsi_queuecommand to queue #0
> 
> then two requests are issued to different queues without a quiescent
> point in the middle.

What happens then? Does this break correctness?

> >>>> +	if (atomic_inc_return(&tgt->reqs) == 1) {
> >>>> +		queue_num = smp_processor_id();
> >>>> +		while (unlikely(queue_num >= vscsi->num_queues))
> >>>> +			queue_num -= vscsi->num_queues;
> >>>> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
> >>>> +	}
> >>>> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
> >>>> +	return virtscsi_queuecommand(vscsi, tgt, sc);
> >>>> +}
> >>>> +
> >>>> +
> > 
> > .....
> > 
> >>>> +static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
> >>>> +                                       struct scsi_cmnd *sc)
> >>>> +{
> >>>> +       struct virtio_scsi *vscsi = shost_priv(sh);
> >>>> +       struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >>>> +
> >>>> +       atomic_inc(&tgt->reqs);
> >>>> +       return virtscsi_queuecommand(vscsi, tgt, sc);
> >>>> +}
> >>>> +
> > 
> > Here, reqs is unused - why bother incrementing it?
> > A branch on completion would be cheaper IMHO.
> 
> Well, I could also let tgt->reqs go negative, but it would be a bit untidy.
> 
> Another alternative is to access the target's target_busy field with
> ACCESS_ONCE, and drop reqs altogether.  Too tricky to do this kind of
> micro-optimization so early, though.

So keep it simple and just check a flag.

> >> virtio-scsi multiqueue has a performance benefit up to 20%
> > 
> > To be fair, you could be running in single queue mode.
> > In that case extra atomics and indirection that this code
> > brings will just add overhead without benefits.
> > I don't know how significant that would be.
> 
> Not measurable in my experiments.
> 
> Paolo
Paolo Bonzini Sept. 4, 2012, 11:18 a.m. UTC | #6
On 04/09/2012 13:09, Michael S. Tsirkin wrote:
>> >     queuecommand on CPU #0         queuecommand #2 on CPU #1
>> >   --------------------------------------------------------------
>> >     atomic_inc_return(...) == 1
>> >                                    atomic_inc_return(...) == 2
>> >                                    virtscsi_queuecommand to queue #1
>> >     tgt->req_vq = queue #0
>> >     virtscsi_queuecommand to queue #0
>> > 
>> > then two requests are issued to different queues without a quiescent
>> > point in the middle.
> What happens then? Does this break correctness?

Yes, requests to the same target should be processed in FIFO order, or
you have things like a flush issued before the write it was supposed to
flush.  This is why I can only change the queue when there is no request
pending.

Paolo
Michael S. Tsirkin Sept. 4, 2012, 12:48 p.m. UTC | #7
On Tue, Aug 28, 2012 at 01:54:17PM +0200, Paolo Bonzini wrote:
> This patch adds queue steering to virtio-scsi.  When a target is sent
> multiple requests, we always drive them to the same queue so that FIFO
> processing order is kept.  However, if a target was idle, we can choose
> a queue arbitrarily.  In this case the queue is chosen according to the
> current VCPU, so the driver expects the number of request queues to be
> equal to the number of VCPUs.  This makes it easy and fast to select
> the queue, and also lets the driver optimize the IRQ affinity for the
> virtqueues (each virtqueue's affinity is set to the CPU that "owns"
> the queue).
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

I guess an alternative is a per-target vq.
Is the reason you avoid this that you expect more targets
than cpus? If yes this is something you might want to
mention in the log.
Michael S. Tsirkin Sept. 4, 2012, 1:35 p.m. UTC | #8
On Tue, Sep 04, 2012 at 01:18:31PM +0200, Paolo Bonzini wrote:
> On 04/09/2012 13:09, Michael S. Tsirkin wrote:
> >> >     queuecommand on CPU #0         queuecommand #2 on CPU #1
> >> >   --------------------------------------------------------------
> >> >     atomic_inc_return(...) == 1
> >> >                                    atomic_inc_return(...) == 2
> >> >                                    virtscsi_queuecommand to queue #1
> >> >     tgt->req_vq = queue #0
> >> >     virtscsi_queuecommand to queue #0
> >> > 
> >> > then two requests are issued to different queues without a quiescent
> >> > point in the middle.
> > What happens then? Does this break correctness?
> 
> Yes, requests to the same target should be processed in FIFO order, or
> you have things like a flush issued before the write it was supposed to
> flush.  This is why I can only change the queue when there is no request
> pending.
> 
> Paolo

I see.  I guess you can rewrite this as:
atomic_inc
if (atomic_read() == 1)
which is a bit cheaper, and make the fact
that you do not need increment and return to be atomic,
explicit.

Another simple idea: store last processor id in target,
if it is unchanged no need to play with req_vq
and take spinlock.

Also - some kind of comment explaining why a similar race can not happen
with this lock in place would be nice: I see why this specific race can
not trigger but since lock is dropped later before you submit command, I
have a hard time convincing myself what exactly guarantees that vq is never
switched before or even while command is submitted.
Paolo Bonzini Sept. 4, 2012, 1:45 p.m. UTC | #9
On 04/09/2012 15:35, Michael S. Tsirkin wrote:
> I see.  I guess you can rewrite this as:
> atomic_inc
> if (atomic_read() == 1)
> which is a bit cheaper, and make the fact
> that you do not need increment and return to be atomic,
> explicit.

It seems more complicated to me for hardly any reason.  (Besides, is it
cheaper?  It has one less memory barrier on some architectures I frankly
do not care much about---not on x86---but it also has two memory
accesses instead of one on all architectures).

> Another simple idea: store last processor id in target,
> if it is unchanged no need to play with req_vq
> and take spinlock.

Not so sure, consider the previous example with last_processor_id equal
to 1.

    queuecommand on CPU #0         queuecommand #2 on CPU #1
  --------------------------------------------------------------
    atomic_inc_return(...) == 1
                                   atomic_inc_return(...) == 2
                                   virtscsi_queuecommand to queue #1
    last_processor_id == 0? no
    spin_lock
    tgt->req_vq = queue #0
    spin_unlock
    virtscsi_queuecommand to queue #0

This is not a network driver, there are still a lot of locks around.
This micro-optimization doesn't pay enough for the pain.

> Also - some kind of comment explaining why a similar race can not happen
> with this lock in place would be nice: I see why this specific race can
> not trigger but since lock is dropped later before you submit command, I
> have a hard time convincing myself what exactly guarantees that vq is never
> switched before or even while command is submitted.

Because tgt->reqs will never become zero (which is a necessary condition
for tgt->req_vq to change), as long as one request is executing
virtscsi_queuecommand.

Paolo
Paolo Bonzini Sept. 4, 2012, 1:49 p.m. UTC | #10
On 04/09/2012 14:48, Michael S. Tsirkin wrote:
>> > This patch adds queue steering to virtio-scsi.  When a target is sent
>> > multiple requests, we always drive them to the same queue so that FIFO
>> > processing order is kept.  However, if a target was idle, we can choose
>> > a queue arbitrarily.  In this case the queue is chosen according to the
>> > current VCPU, so the driver expects the number of request queues to be
>> > equal to the number of VCPUs.  This makes it easy and fast to select
>> > the queue, and also lets the driver optimize the IRQ affinity for the
>> > virtqueues (each virtqueue's affinity is set to the CPU that "owns"
>> > the queue).
>> > 
>> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> I guess an alternative is a per-target vq.
> Is the reason you avoid this that you expect more targets
> than cpus? If yes this is something you might want to
> mention in the log.

One reason is that, even though in practice I expect roughly the same
number of targets and VCPUs, hotplug means the number of targets is
difficult to predict and is usually fixed to 256.

The other reason is that per-target vq didn't give any performance
advantage.  The bonus comes from cache locality and less process
migrations, more than from the independent virtqueues.

Paolo
Michael S. Tsirkin Sept. 4, 2012, 2:19 p.m. UTC | #11
On Tue, Sep 04, 2012 at 03:45:57PM +0200, Paolo Bonzini wrote:
> > Also - some kind of comment explaining why a similar race can not happen
> > with this lock in place would be nice: I see why this specific race can
> > not trigger but since lock is dropped later before you submit command, I
> > have a hard time convincing myself what exactly guarantees that vq is never
> > switched before or even while command is submitted.
> 
> Because tgt->reqs will never become zero (which is a necessary condition
> for tgt->req_vq to change), as long as one request is executing
> virtscsi_queuecommand.
> 
> Paolo

Yes but this logic would apparently imply the lock is not necessary, and
it actually is. I am not saying anything is wrong just that it
looks scary.
Michael S. Tsirkin Sept. 4, 2012, 2:21 p.m. UTC | #12
On Tue, Sep 04, 2012 at 03:49:42PM +0200, Paolo Bonzini wrote:
> On 04/09/2012 14:48, Michael S. Tsirkin wrote:
> >> > This patch adds queue steering to virtio-scsi.  When a target is sent
> >> > multiple requests, we always drive them to the same queue so that FIFO
> >> > processing order is kept.  However, if a target was idle, we can choose
> >> > a queue arbitrarily.  In this case the queue is chosen according to the
> >> > current VCPU, so the driver expects the number of request queues to be
> >> > equal to the number of VCPUs.  This makes it easy and fast to select
> >> > the queue, and also lets the driver optimize the IRQ affinity for the
> >> > virtqueues (each virtqueue's affinity is set to the CPU that "owns"
> >> > the queue).
> >> > 
> >> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > I guess an alternative is a per-target vq.
> > Is the reason you avoid this that you expect more targets
> > than cpus? If yes this is something you might want to
> > mention in the log.
> 
> One reason is that, even though in practice I expect roughly the same
> number of targets and VCPUs, hotplug means the number of targets is
> difficult to predict and is usually fixed to 256.
> 
> The other reason is that per-target vq didn't give any performance
> advantage.  The bonus comes from cache locality and less process
> migrations, more than from the independent virtqueues.
> 
> Paolo

Okay, and why is per-target worse for cache locality?
Paolo Bonzini Sept. 4, 2012, 2:25 p.m. UTC | #13
On 04/09/2012 16:19, Michael S. Tsirkin wrote:
> > > Also - some kind of comment explaining why a similar race can not happen
> > > with this lock in place would be nice: I see why this specific race can
> > > not trigger but since lock is dropped later before you submit command, I
> > > have a hard time convincing myself what exactly guarantees that vq is never
> > > switched before or even while command is submitted.
> > 
> > Because tgt->reqs will never become zero (which is a necessary condition
> > for tgt->req_vq to change), as long as one request is executing
> > virtscsi_queuecommand.
> 
> Yes but this logic would apparently imply the lock is not necessary, and
> it actually is. I am not saying anything is wrong just that it
> looks scary.

Ok, I get the misunderstanding.  For the logic to hold, you need a
serialization point after which tgt->req_vq is not changed.  The lock
provides one such serialization point: after you unlock tgt->tgt_lock,
nothing else will change tgt->req_vq until your request completes.

Without the lock, there could always be a thread that is in the "then"
branch but has been scheduled out, and when rescheduled it will change
tgt->req_vq.

Perhaps the confusion comes from the atomic_inc_return, and that was
what my "why is this atomic" question wanted to clear up.  **tgt->reqs is only
atomic to avoid taking a spinlock in the ISR**.  If you read the code
with the lock, but with tgt->reqs as a regular non-atomic int, it should
be much easier to reason about the code.  I can split the patch if needed.
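To make that mental model concrete, here is a sketch of the submission and
completion paths with tgt->reqs as a plain int and tgt_lock taken on both
sides.  This is not the submitted code (the modulo stands in for the
patch's subtract loop); it only shows what the atomic_t is buying:

static int queuecommand_multi_locked(struct virtio_scsi *vscsi,
				     struct virtio_scsi_target_state *tgt,
				     struct scsi_cmnd *sc)
{
	unsigned long flags;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (tgt->reqs++ == 0)		/* target was idle: safe to resteer */
		tgt->req_vq = &vscsi->req_vqs[smp_processor_id() %
					      vscsi->num_queues];
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);

	/*
	 * After the unlock, req_vq cannot change until reqs drops back to
	 * zero, and it cannot drop to zero before this request completes.
	 */
	return virtscsi_queuecommand(vscsi, tgt, sc);
}

static void complete_cmd_locked(struct virtio_scsi_target_state *tgt)
{
	unsigned long flags;

	/* This is the lock acquisition the atomic_t lets the ISR avoid. */
	spin_lock_irqsave(&tgt->tgt_lock, flags);
	tgt->reqs--;
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
}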

Paolo
Paolo Bonzini Sept. 4, 2012, 2:30 p.m. UTC | #14
On 04/09/2012 16:21, Michael S. Tsirkin wrote:
> > One reason is that, even though in practice I expect roughly the same
> > number of targets and VCPUs, hotplug means the number of targets is
> > difficult to predict and is usually fixed to 256.
> > 
> > The other reason is that per-target vq didn't give any performance
> > advantage.  The bonus comes from cache locality and less process
> > migrations, more than from the independent virtqueues.
> 
> Okay, and why is per-target worse for cache locality?

Because per-target doesn't have IRQ affinity for a particular CPU.

Assuming that the thread that is sending requests to the device is
I/O-bound, it is likely to be sleeping at the time the ISR is executed,
and thus executing the ISR on the same processor that sent the requests
is cheap.

But if you have many such I/O-bound processes, the kernel will execute
the ISR on a random processor, rather than the one that is sending
requests to the device.

Paolo
Michael S. Tsirkin Sept. 4, 2012, 2:41 p.m. UTC | #15
On Tue, Sep 04, 2012 at 04:30:35PM +0200, Paolo Bonzini wrote:
> On 04/09/2012 16:21, Michael S. Tsirkin wrote:
> > > One reason is that, even though in practice I expect roughly the same
> > > number of targets and VCPUs, hotplug means the number of targets is
> > > difficult to predict and is usually fixed to 256.
> > > 
> > > The other reason is that per-target vq didn't give any performance
> > > advantage.  The bonus comes from cache locality and less process
> > > migrations, more than from the independent virtqueues.
> > 
> > Okay, and why is per-target worse for cache locality?
> 
> Because per-target doesn't have IRQ affinity for a particular CPU.
> 
> Assuming that the thread that is sending requests to the device is
> I/O-bound, it is likely to be sleeping at the time the ISR is executed,
> and thus executing the ISR on the same processor that sent the requests
> is cheap.
> 
> But if you have many such I/O-bound processes, the kernel will execute
> the ISR on a random processor, rather than the one that is sending
> requests to the device.
> 
> Paolo

I see, another case where our irq balancing makes bad decisions.
You could do it differently - pin irq to the cpu of the last task that
executed, tweak irq affinity when that changes.
Still, if you want to support 256 targets, a vector per target
is not going to work.

Would be nice to add this motivation to commit log I think.
Michael S. Tsirkin Sept. 4, 2012, 2:47 p.m. UTC | #16
On Tue, Aug 28, 2012 at 01:54:17PM +0200, Paolo Bonzini wrote:
> @@ -575,15 +630,19 @@ static struct scsi_host_template virtscsi_host_template = {
>  				  &__val, sizeof(__val)); \
>  	})
>  
> +

Pls don't add empty lines.

>  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
> -			     struct virtqueue *vq)
> +			     struct virtqueue *vq, bool affinity)
>  {
>  	spin_lock_init(&virtscsi_vq->vq_lock);
>  	virtscsi_vq->vq = vq;
> +	if (affinity)
> +		virtqueue_set_affinity(vq, virtqueue_get_queue_index(vq) -
> +				       VIRTIO_SCSI_VQ_BASE);
>  }
>  

This means in practice if you have less virtqueues than CPUs,
things are not going to work well, will they?

Any idea what to do?
Paolo Bonzini Sept. 4, 2012, 2:55 p.m. UTC | #17
On 04/09/2012 16:47, Michael S. Tsirkin wrote:
>> >  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
>> > -			     struct virtqueue *vq)
>> > +			     struct virtqueue *vq, bool affinity)
>> >  {
>> >  	spin_lock_init(&virtscsi_vq->vq_lock);
>> >  	virtscsi_vq->vq = vq;
>> > +	if (affinity)
>> > +		virtqueue_set_affinity(vq, virtqueue_get_queue_index(vq) -
>> > +				       VIRTIO_SCSI_VQ_BASE);
>> >  }
>> >  
> This means in practice if you have less virtqueues than CPUs,
> things are not going to work well, will they?

Not particularly.  It could be better or worse than single queue
depending on the workload.

> Any idea what to do?

Two possibilities:

1) Add a stride argument to virtqueue_set_affinity, and make it equal to
the number of queues.

2) Make multiqueue the default in QEMU, and make the default number of
queues equal to the number of VCPUs.

I was going for (2).
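For illustration, one possible reading of option (1); the stride-aware
affinity call does not exist, so the helper below is hypothetical, but the
cpu-to-queue mapping matches the modulo steering already done in
virtscsi_queuecommand_multi():

/* CPU mask a request vq would serve if vqs were striped across CPUs with
 * stride == num_queues (sketch only, not an existing API). */
static void virtscsi_vq_cpumask_sketch(int vq_index, int num_queues,
				       struct cpumask *mask)
{
	int cpu;

	cpumask_clear(mask);
	for_each_possible_cpu(cpu)
		if (cpu % num_queues == vq_index)
			cpumask_set_cpu(cpu, mask);

	/* A stride-aware virtqueue_set_affinity() would then pin the vq's
	 * interrupt to this mask rather than to a single CPU. */
}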

Paolo
Michael S. Tsirkin Sept. 4, 2012, 3:03 p.m. UTC | #18
On Tue, Sep 04, 2012 at 04:55:56PM +0200, Paolo Bonzini wrote:
> On 04/09/2012 16:47, Michael S. Tsirkin wrote:
> >> >  static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
> >> > -			     struct virtqueue *vq)
> >> > +			     struct virtqueue *vq, bool affinity)
> >> >  {
> >> >  	spin_lock_init(&virtscsi_vq->vq_lock);
> >> >  	virtscsi_vq->vq = vq;
> >> > +	if (affinity)
> >> > +		virtqueue_set_affinity(vq, virtqueue_get_queue_index(vq) -
> >> > +				       VIRTIO_SCSI_VQ_BASE);
> >> >  }
> >> >  
> > This means in practice if you have less virtqueues than CPUs,
> > things are not going to work well, will they?
> 
> Not particularly.  It could be better or worse than single queue
> depending on the workload.

Well, interrupts will go to a CPU different from the one
that sends commands, so ...

> > Any idea what to do?
> 
> Two possibilities:
> 
> 1) Add a stride argument to virtqueue_set_affinity, and make it equal to
> the number of queues.
> 
> 2) Make multiqueue the default in QEMU, and make the default number of
> queues equal to the number of VCPUs.
> 
> I was going for (2).
> 
> Paolo

3. use a per-target queue if there are fewer targets than cpus?
Nicholas A. Bellinger Sept. 4, 2012, 8:11 p.m. UTC | #19
On Tue, 2012-09-04 at 08:46 +0200, Paolo Bonzini wrote:
> On 04/09/2012 04:21, Nicholas A. Bellinger wrote:
> >> @@ -112,6 +118,9 @@ static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
> >>  	struct virtio_scsi_cmd *cmd = buf;
> >>  	struct scsi_cmnd *sc = cmd->sc;
> >>  	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
> >> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >> +
> >> +	atomic_dec(&tgt->reqs);
> >>  
> > 
> > As tgt->tgt_lock is taken in virtscsi_queuecommand_multi() before the
> > atomic_inc_return(tgt->reqs) check, it seems like using atomic_dec() w/o
> > smp_mb__after_atomic_dec or tgt_lock access here is not using atomic.h
> > accessors properly, no..?
> 
> No, only a single "thing" is being accessed, and there is no need to
> order the decrement with respect to preceding or subsequent accesses to
> other locations.
> 
> In other words, tgt->reqs is already synchronized with itself, and that
> is enough.
> 
> (Besides, on x86 smp_mb__after_atomic_dec is a nop).
> 

So the implementation detail wrt requests to the same target being
processed in FIFO ordering + only being able to change the queue when no
requests are pending helps understand this code more.  Thanks for the
explanation on that bit..

However, it's still my understanding that the use of atomic_dec() in the
completion path means that smp_mb__after_atomic_dec() is a requirement to
be proper portable atomic.h code, no..?  Otherwise tgt->reqs should be
using something other than an atomic_t, right..?

> >> +static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
> >> +				       struct scsi_cmnd *sc)
> >> +{
> >> +	struct virtio_scsi *vscsi = shost_priv(sh);
> >> +	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
> >> +	unsigned long flags;
> >> +	u32 queue_num;
> >> +
> >> +	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
> >> +	 * decrement it without taking the spinlock.
> >> +	 */
> >> +	spin_lock_irqsave(&tgt->tgt_lock, flags);
> >> +	if (atomic_inc_return(&tgt->reqs) == 1) {
> >> +		queue_num = smp_processor_id();
> >> +		while (unlikely(queue_num >= vscsi->num_queues))
> >> +			queue_num -= vscsi->num_queues;
> >> +		tgt->req_vq = &vscsi->req_vqs[queue_num];
> >> +	}
> >> +	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
> >> +	return virtscsi_queuecommand(vscsi, tgt, sc);
> >> +}
> >> +
> > 
> > The extra memory barriers to get this right for the current approach are
> > just going to slow things down even more for virtio-scsi-mq..
> 
> virtio-scsi multiqueue has a performance benefit up to 20% (for a single
> LUN) or 40% (on overall bandwidth across multiple LUNs).  I doubt that a
> single memory barrier can have that much impact. :)
> 

I've no doubt that this series increases the large block high bandwidth
for virtio-scsi, but historically that has always been the easier
workload to scale.  ;)

> The way to go to improve performance even more is to add new virtio APIs
> for finer control of the usage of the ring.  These should let us avoid
> copying the sg list and almost get rid of the tgt_lock; even though the
> locking is quite efficient in virtio-scsi (see how tgt_lock and vq_lock
> are "pipelined" so as to overlap the preparation of two requests), it
> should give a nice improvement and especially avoid a kmalloc with small
> requests.  I may have some time for it next month.
> 
> > Jen's approach is what we will ultimately need to re-architect in SCSI
> > core if we're ever going to move beyond the issues of legacy host_lock,
> > so I'm wondering if maybe this is the direction that virtio-scsi-mq
> > needs to go in as well..?
> 
> We can see after the block layer multiqueue work goes in...  I also need
> to look more closely at Jens's changes.
> 

Yes, I think Jen's new approach is providing some pretty significant
gains for raw block drivers with extremely high packet (small block
random I/O) workloads, esp with hw block drivers that support genuine mq
with hw num_queues > 1.

He also has virtio-blk converted to run in num_queues=1 mode.

> Have you measured the host_lock to be a bottleneck in high-iops
> benchmarks, even for a modern driver that does not hold it in
> queuecommand?  (Certainly it will become more important as the
> virtio-scsi queuecommand becomes thinner and thinner).

This is exactly why it would make such a good vehicle to re-architect
SCSI core.  I'm thinking it can be the first sw LLD we attempt to get
running on an (currently) future scsi-mq prototype.

>   If so, we can
> start looking at limiting host_lock usage in the fast path.
> 

That would be a good incremental step for SCSI core, but I'm not sure
that we'll be able to scale compared to blk-mq without a
new approach for sw/hw LLDs along the lines of what Jens is doing.

> BTW, supporting this in tcm-vhost should be quite trivial, as all the
> request queues are the same and all serialization is done in the
> virtio-scsi driver.
> 

Looking forward to that too..  ;)

Paolo Bonzini Sept. 5, 2012, 7:03 a.m. UTC | #20
On 04/09/2012 22:11, Nicholas A. Bellinger wrote:
>>> As tgt->tgt_lock is taken in virtscsi_queuecommand_multi() before the
>>> atomic_inc_return(tgt->reqs) check, it seems like using atomic_dec() w/o
>>> smp_mb__after_atomic_dec or tgt_lock access here is not using atomic.h
>>> accessors properly, no..?
>>
>> No, only a single "thing" is being accessed, and there is no need to
>> order the decrement with respect to preceding or subsequent accesses to
>> other locations.
>>
>> In other words, tgt->reqs is already synchronized with itself, and that
>> is enough.
>>
> 
> However, it's still my understanding that the use of atomic_dec() in the
> completion path means that smp_mb__after_atomic_dec() is a requirement to
> be proper portable atomic.h code, no..?  Otherwise tgt->reqs should be
> using something other than an atomic_t, right..?

Memory barriers aren't _always_ required, only when you need to order
accesses to multiple locations.

In this case, there is no other location that the
queuecommand/completion handlers need to synchronize against, so no
barrier is required.  You can see plenty of atomic_inc/atomic_dec in the
code without a barrier afterwards (the typical case is the opposite of
this patch: a refcount increment needs no barrier, a refcount
decrement uses atomic_dec_return).
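For comparison, the usual refcount shape referred to above, as a small
sketch with made-up names:

struct bar {
	atomic_t refcount;
	/* ... payload ... */
};

static void bar_get(struct bar *b)
{
	atomic_inc(&b->refcount);	/* taking a reference needs no barrier */
}

static void bar_put(struct bar *b)
{
	/*
	 * The value-returning decrement is fully ordered, so all earlier
	 * stores to *b are visible before anyone can observe zero and
	 * free the object.
	 */
	if (atomic_dec_return(&b->refcount) == 0)
		kfree(b);
}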

>> virtio-scsi multiqueue has a performance benefit up to 20% (for a single
>> LUN) or 40% (on overall bandwidth across multiple LUNs).  I doubt that a
>> single memory barrier can have that much impact. :)
>>
> 
> I've no doubt that this series increases the large block high bandwidth
> for virtio-scsi, but historically that has always been the easier
> workload to scale.  ;)

This is with a mixed workload (random 4k-64k) and tmpfs backend on the host.

> Yes, I think Jen's new approach is providing some pretty significant
> gains for raw block drivers with extremely high packet (small block
> random I/O) workloads, esp with hw block drivers that support genuine mq
> with hw num_queues > 1.

I need to look into it, to understand how the queue steering here can be
adapted to his code.

>> Have you measured the host_lock to be a bottleneck in high-iops
>> benchmarks, even for a modern driver that does not hold it in
>> queuecommand?  (Certainly it will become more important as the
>> virtio-scsi queuecommand becomes thinner and thinner).
> 
> This is exactly why it would make such a good vehicle to re-architect
> SCSI core.  I'm thinking it can be the first sw LLD we attempt to get
> running on an (currently) future scsi-mq prototype.

Agreed.

Paolo

Patch

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 6414ea0..0c4b096 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -26,6 +26,7 @@ 
 
 #define VIRTIO_SCSI_MEMPOOL_SZ 64
 #define VIRTIO_SCSI_EVENT_LEN 8
+#define VIRTIO_SCSI_VQ_BASE 2
 
 /* Command queue element */
 struct virtio_scsi_cmd {
@@ -59,9 +60,13 @@  struct virtio_scsi_vq {
 
 /* Per-target queue state */
 struct virtio_scsi_target_state {
-	/* Protects sg.  Lock hierarchy is tgt_lock -> vq_lock.  */
+	/* Protects sg, req_vq.  Lock hierarchy is tgt_lock -> vq_lock.  */
 	spinlock_t tgt_lock;
 
+	struct virtio_scsi_vq *req_vq;
+
+	atomic_t reqs;
+
 	/* For sglist construction when adding commands to the virtqueue.  */
 	struct scatterlist sg[];
 };
@@ -70,14 +75,15 @@  struct virtio_scsi_target_state {
 struct virtio_scsi {
 	struct virtio_device *vdev;
 
-	struct virtio_scsi_vq ctrl_vq;
-	struct virtio_scsi_vq event_vq;
-	struct virtio_scsi_vq req_vq;
-
 	/* Get some buffers ready for event vq */
 	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
 
+	u32 num_queues;
 	struct virtio_scsi_target_state **tgt;
+
+	struct virtio_scsi_vq ctrl_vq;
+	struct virtio_scsi_vq event_vq;
+	struct virtio_scsi_vq req_vqs[];
 };
 
 static struct kmem_cache *virtscsi_cmd_cache;
@@ -112,6 +118,9 @@  static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
 	struct virtio_scsi_cmd *cmd = buf;
 	struct scsi_cmnd *sc = cmd->sc;
 	struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
+
+	atomic_dec(&tgt->reqs);
 
 	dev_dbg(&sc->device->sdev_gendev,
 		"cmd %p response %u status %#02x sense_len %u\n",
@@ -185,11 +194,13 @@  static void virtscsi_req_done(struct virtqueue *vq)
 {
 	struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
 	struct virtio_scsi *vscsi = shost_priv(sh);
+	int index = virtqueue_get_queue_index(vq) - VIRTIO_SCSI_VQ_BASE;
+	struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
 	unsigned long flags;
 
-	spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
+	spin_lock_irqsave(&req_vq->vq_lock, flags);
 	virtscsi_vq_done(vscsi, vq, virtscsi_complete_cmd);
-	spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
+	spin_unlock_irqrestore(&req_vq->vq_lock, flags);
 };
 
 static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
@@ -429,10 +440,10 @@  static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
 	return ret;
 }
 
-static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
+				 struct virtio_scsi_target_state *tgt,
+				 struct scsi_cmnd *sc)
 {
-	struct virtio_scsi *vscsi = shost_priv(sh);
-	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
 	struct virtio_scsi_cmd *cmd;
 	int ret;
 
@@ -466,7 +477,7 @@  static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 	BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
 	memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
 
-	if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
+	if (virtscsi_kick_cmd(tgt, tgt->req_vq, cmd,
 			      sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
 			      GFP_ATOMIC) >= 0)
 		ret = 0;
@@ -475,6 +486,38 @@  out:
 	return ret;
 }
 
+static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
+					struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
+
+	atomic_inc(&tgt->reqs);
+	return virtscsi_queuecommand(vscsi, tgt, sc);
+}
+
+static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
+				       struct scsi_cmnd *sc)
+{
+	struct virtio_scsi *vscsi = shost_priv(sh);
+	struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
+	unsigned long flags;
+	u32 queue_num;
+
+	/* Using an atomic_t for tgt->reqs lets the virtqueue handler
+	 * decrement it without taking the spinlock.
+	 */
+	spin_lock_irqsave(&tgt->tgt_lock, flags);
+	if (atomic_inc_return(&tgt->reqs) == 1) {
+		queue_num = smp_processor_id();
+		while (unlikely(queue_num >= vscsi->num_queues))
+			queue_num -= vscsi->num_queues;
+		tgt->req_vq = &vscsi->req_vqs[queue_num];
+	}
+	spin_unlock_irqrestore(&tgt->tgt_lock, flags);
+	return virtscsi_queuecommand(vscsi, tgt, sc);
+}
+
 static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 {
 	DECLARE_COMPLETION_ONSTACK(comp);
@@ -544,12 +585,26 @@  static int virtscsi_abort(struct scsi_cmnd *sc)
 	return virtscsi_tmf(vscsi, cmd);
 }
 
-static struct scsi_host_template virtscsi_host_template = {
+static struct scsi_host_template virtscsi_host_template_single = {
 	.module = THIS_MODULE,
 	.name = "Virtio SCSI HBA",
 	.proc_name = "virtio_scsi",
-	.queuecommand = virtscsi_queuecommand,
 	.this_id = -1,
+	.queuecommand = virtscsi_queuecommand_single,
+	.eh_abort_handler = virtscsi_abort,
+	.eh_device_reset_handler = virtscsi_device_reset,
+
+	.can_queue = 1024,
+	.dma_boundary = UINT_MAX,
+	.use_clustering = ENABLE_CLUSTERING,
+};
+
+static struct scsi_host_template virtscsi_host_template_multi = {
+	.module = THIS_MODULE,
+	.name = "Virtio SCSI HBA",
+	.proc_name = "virtio_scsi",
+	.this_id = -1,
+	.queuecommand = virtscsi_queuecommand_multi,
 	.eh_abort_handler = virtscsi_abort,
 	.eh_device_reset_handler = virtscsi_device_reset,
 
@@ -575,15 +630,19 @@  static struct scsi_host_template virtscsi_host_template = {
 				  &__val, sizeof(__val)); \
 	})
 
+
 static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
-			     struct virtqueue *vq)
+			     struct virtqueue *vq, bool affinity)
 {
 	spin_lock_init(&virtscsi_vq->vq_lock);
 	virtscsi_vq->vq = vq;
+	if (affinity)
+		virtqueue_set_affinity(vq, virtqueue_get_queue_index(vq) -
+				       VIRTIO_SCSI_VQ_BASE);
 }
 
 static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
-	struct virtio_device *vdev, int sg_elems)
+	struct virtio_scsi *vscsi, u32 sg_elems)
 {
 	struct virtio_scsi_target_state *tgt;
 	gfp_t gfp_mask = GFP_KERNEL;
@@ -597,6 +656,13 @@  static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
 
 	spin_lock_init(&tgt->tgt_lock);
 	sg_init_table(tgt->sg, sg_elems + 2);
+	atomic_set(&tgt->reqs, 0);
+
+	/*
+	 * The default is unused for multiqueue, but with a single queue
+	 * or target we use it in virtscsi_queuecommand.
+	 */
+	tgt->req_vq = &vscsi->req_vqs[0];
 	return tgt;
 }
 
@@ -632,28 +698,41 @@  static int virtscsi_init(struct virtio_device *vdev,
 			 struct virtio_scsi *vscsi, int num_targets)
 {
 	int err;
-	struct virtqueue *vqs[3];
 	u32 i, sg_elems;
+	u32 num_vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
 
-	vq_callback_t *callbacks[] = {
-		virtscsi_ctrl_done,
-		virtscsi_event_done,
-		virtscsi_req_done
-	};
-	const char *names[] = {
-		"control",
-		"event",
-		"request"
-	};
+	num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
+	vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
+	callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
+	names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
+
+	if (!callbacks || !vqs || !names) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	callbacks[0] = virtscsi_ctrl_done;
+	callbacks[1] = virtscsi_event_done;
+	names[0] = "control";
+	names[1] = "event";
+	for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
+		callbacks[i] = virtscsi_req_done;
+		names[i] = "request";
+	}
 
 	/* Discover virtqueues and write information to configuration.  */
-	err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
 	if (err)
 		return err;
 
-	virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
-	virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
-	virtscsi_init_vq(&vscsi->req_vq, vqs[2]);
+	virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0], false);
+	virtscsi_init_vq(&vscsi->event_vq, vqs[1], false);
+	for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
+		virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
+				 vqs[i], vscsi->num_queues > 1);
 
 	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
 	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -671,7 +750,7 @@  static int virtscsi_init(struct virtio_device *vdev,
 		goto out;
 	}
 	for (i = 0; i < num_targets; i++) {
-		vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
+		vscsi->tgt[i] = virtscsi_alloc_tgt(vscsi, sg_elems);
 		if (!vscsi->tgt[i]) {
 			err = -ENOMEM;
 			goto out;
@@ -680,6 +759,9 @@  static int virtscsi_init(struct virtio_device *vdev,
 	err = 0;
 
 out:
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
 	if (err)
 		virtscsi_remove_vqs(vdev);
 	return err;
@@ -692,11 +774,26 @@  static int __devinit virtscsi_probe(struct virtio_device *vdev)
 	int err;
 	u32 sg_elems, num_targets;
 	u32 cmd_per_lun;
+	u32 num_queues;
+	struct scsi_host_template *hostt;
+
+	/* We need to know how many queues before we allocate.  */
+	num_queues = virtscsi_config_get(vdev, num_queues) ?: 1;
 
 	/* Allocate memory and link the structs together.  */
 	num_targets = virtscsi_config_get(vdev, max_target) + 1;
-	shost = scsi_host_alloc(&virtscsi_host_template, sizeof(*vscsi));
 
+	/* Multiqueue is not beneficial with a single target.  */
+	if (num_targets == 1)
+		num_queues = 1;
+
+	if (num_queues == 1)
+		hostt = &virtscsi_host_template_single;
+	else
+		hostt = &virtscsi_host_template_multi;
+
+	shost = scsi_host_alloc(hostt,
+		sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
 	if (!shost)
 		return -ENOMEM;
 
@@ -704,6 +801,7 @@  static int __devinit virtscsi_probe(struct virtio_device *vdev)
 	shost->sg_tablesize = sg_elems;
 	vscsi = shost_priv(shost);
 	vscsi->vdev = vdev;
+	vscsi->num_queues = num_queues;
 	vdev->priv = shost;
 
 	err = virtscsi_init(vdev, vscsi, num_targets);