Message ID | 871uigj747.fsf@rustcorp.com.au (mailing list archive) |
---|---|
State | New, archived |
On Thu, Sep 06, 2012 at 10:32:48AM +0930, Rusty Russell wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
> >> On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
> >>> I've also re-ran it on a IBM server type host instead of my laptop. Here are the
> >>> results:
> >>>
> >>> Vanilla kernel:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>> 87380  16384  16384    10.00    7922.72
> >>>
> >>> Patch 1, with threshold=16:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>> 87380  16384  16384    10.00    8415.07
> >>>
> >>> Patch 2:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>> 87380  16384  16384    10.00    8931.05
> >>>
> >>> Note that these are simple tests with netperf listening on one end and a simple
> >>> 'netperf -H [host]' within the guest. If there are other tests which may be
> >>> interesting please let me know.
>
> It might be worth just unconditionally having a cache for the 2
> descriptor case.  This is what I get with qemu tap, though for some
> reason the device features don't have guest or host CSUM, so my setup is
> probably screwed:

Yes without checksum net core always linearizes packets, so yes it is
screwed.

For -net, skb always allocates space for 17 frags + linear part so
it seems sane to do same in virtio core, and allocate, for -net,
up to max_frags + 1 from cache.
We can adjust it: no _SG -> 2 otherwise 18.

Not sure about other drivers, maybe really use 2 there for now.

> Queue histogram for virtio0:
> Size distribution for input (max=128427):
>   1: 128427 ################################################################
> Size distribution for output (max=256485):
>   2: 256485 ################################################################
> Size distribution for control (max=10):
>   3: 10 ################################################################
>   4: 5  ################################
>
> Here's a patch, what do you get (run ifconfig to trigger the dump; yeah,
> it's a hack!)
>
> Hack: histogram of buffer sizes for virtio devices.
>
> Currently triggered by a stats query (eg ifconfig) on a net device.
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -727,6 +727,8 @@ static struct rtnl_link_stats64 *virtnet
>          tot->rx_length_errors = dev->stats.rx_length_errors;
>          tot->rx_frame_errors = dev->stats.rx_frame_errors;
>
> +        virtio_dev_dump_histogram(vi->vdev);
> +
>          return tot;
>  }
>
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -108,6 +108,16 @@ void virtio_check_driver_offered_feature
>  }
>  EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);
>
> +void virtio_dev_dump_histogram(const struct virtio_device *vdev)
> +{
> +        const struct virtqueue *vq;
> +
> +        printk("Queue histogram for %s:\n", dev_name(&vdev->dev));
> +        list_for_each_entry(vq, &vdev->vqs, list)
> +                virtqueue_dump_histogram(vq);
> +}
> +EXPORT_SYMBOL_GPL(virtio_dev_dump_histogram);
> +
>  static int virtio_dev_probe(struct device *_d)
>  {
>          int err, i;
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -120,6 +120,8 @@ struct vring_virtqueue
>          ktime_t last_add_time;
>  #endif
>
> +        unsigned int *histo;
> +
>          /* Tokens for callbacks. */
>          void *data[];
>  };
> @@ -259,6 +261,8 @@ int virtqueue_add_buf(struct virtqueue *
>          BUG_ON(out + in > vq->vring.num);
>          BUG_ON(out + in == 0);
>
> +        vq->histo[out+in]++;
> +
>          /* If the host supports indirect descriptor tables, consider it. */
>          if (vq->indirect) {
>                  bool try_indirect;
> @@ -726,6 +730,7 @@ struct virtqueue *vring_new_virtqueue(un
>          }
>          vq->data[i] = NULL;
>
> +        vq->histo = kzalloc(num * sizeof(vq->histo[0]), GFP_KERNEL);
>          return &vq->vq;
>  }
>  EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> @@ -772,4 +777,33 @@ unsigned int virtqueue_get_vring_size(st
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>
> +void virtqueue_dump_histogram(const struct virtqueue *_vq)
> +{
> +        const struct vring_virtqueue *vq = to_vvq(_vq);
> +        int i, j, start = 0, end = 0, max = 1;
> +        char line[120];
> +
> +        for (i = 0; i < vq->vring.num; i++) {
> +                if (!vq->histo[i])
> +                        continue;
> +
> +                end = i;
> +                if (!vq->histo[start])
> +                        start = i;
> +
> +                if (vq->histo[i] > max)
> +                        max = vq->histo[i];
> +        }
> +
> +        printk("Size distribution for %s (max=%u):\n", _vq->name, max);
> +        for (i = start; i <= end; i++) {
> +                unsigned int off;
> +                off = sprintf(line, "%3u: %-7u ", i, vq->histo[i]);
> +                for (j = 0; j < vq->histo[i] * 64 / max; j++)
> +                        line[off++] = '#';
> +                line[off] = '\0';
> +                printk("%s\n", line);
> +        }
> +}
> +
>  MODULE_LICENSE("GPL");
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -52,6 +52,9 @@ unsigned int virtqueue_get_vring_size(st
>
>  int virtqueue_get_queue_index(struct virtqueue *vq);
>
> +void virtio_dev_dump_histogram(const struct virtio_device *vdev);
> +void virtqueue_dump_histogram(const struct virtqueue *vq);
> +
>  /**
>   * virtio_device - representation of a device using virtio
>   * @index: unique position on the virtio bus
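For reference, a minimal sketch of the sizing rule Michael describes above ("no _SG -> 2, otherwise max_frags + 1"), written as it might sit in virtio_net.c. It takes "no _SG" to mean the netdev ended up without NETIF_F_SG, and the virtqueue_set_indirect_cache() helper (plus the vi->svq hook on the pre-multiqueue driver) is a hypothetical name used only for illustration, not an existing virtio API:

/* Sketch only: pick an indirect-descriptor cache size for the xmit queue
 * based on whether the device can do scatter-gather.
 * virtqueue_set_indirect_cache() is hypothetical.
 */
static void virtnet_size_indirect_cache(struct virtnet_info *vi,
                                        struct net_device *dev)
{
        unsigned int want;

        if (dev->features & NETIF_F_SG)
                /* linear part plus up to MAX_SKB_FRAGS (17) fragments */
                want = MAX_SKB_FRAGS + 1;
        else
                /* no SG: skbs are linearized, so two descriptors suffice */
                want = 2;

        virtqueue_set_indirect_cache(vi->svq, want);
}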
"Michael S. Tsirkin" <mst@redhat.com> writes: > Yes without checksum net core always linearizes packets, so yes it is > screwed. > For -net, skb always allocates space for 17 frags + linear part so > it seems sane to do same in virtio core, and allocate, for -net, > up to max_frags + 1 from cache. > We can adjust it: no _SG -> 2 otherwise 18. But I thought it used individual buffers these days? Cheers, Rusty. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > Yes without checksum net core always linearizes packets, so yes it is
> > screwed.
> > For -net, skb always allocates space for 17 frags + linear part so
> > it seems sane to do same in virtio core, and allocate, for -net,
> > up to max_frags + 1 from cache.
> > We can adjust it: no _SG -> 2 otherwise 18.
>
> But I thought it used individual buffers these days?

Yes for receive, no for transmit. That's probably why
we should have the threshold per vq, not per device, BTW.
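A rough sketch of what a per-virtqueue threshold could look like; the field and setter names below are made up for illustration (Sasha's actual patches are not shown in this excerpt):

/* Sketch: keep the indirect threshold in the vring itself rather than in
 * struct virtio_device, so RX and TX queues of one device can differ.
 */
struct vring_virtqueue {
        struct virtqueue vq;

        /* ... existing fields ... */

        /* Use an indirect descriptor table only for requests with more
         * than this many descriptors; 0 disables indirect entirely. */
        unsigned int indirect_threshold;

        /* Tokens for callbacks. */
        void *data[];
};

/* Hypothetical per-queue setter a driver could call after finding vqs. */
void virtqueue_set_indirect_threshold(struct virtqueue *_vq, unsigned int thresh)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        vq->indirect_threshold = thresh;
}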
"Michael S. Tsirkin" <mst@redhat.com> writes: > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote: >> "Michael S. Tsirkin" <mst@redhat.com> writes: >> > Yes without checksum net core always linearizes packets, so yes it is >> > screwed. >> > For -net, skb always allocates space for 17 frags + linear part so >> > it seems sane to do same in virtio core, and allocate, for -net, >> > up to max_frags + 1 from cache. >> > We can adjust it: no _SG -> 2 otherwise 18. >> >> But I thought it used individual buffers these days? > > Yes for receive, no for transmit. That's probably why > we should have the threshold per vq, not per device, BTW. Can someone actually run with my histogram patch and see what the real numbers are? I'm not convinced that the ideal 17-buffer case actually happens as much as we think. And if it's not happening with this netperf test, we're testing the wrong thing. Thanks, Rusty. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 07, 2012 at 09:19:04AM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
>
> > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > Yes without checksum net core always linearizes packets, so yes it is
> >> > screwed.
> >> > For -net, skb always allocates space for 17 frags + linear part so
> >> > it seems sane to do same in virtio core, and allocate, for -net,
> >> > up to max_frags + 1 from cache.
> >> > We can adjust it: no _SG -> 2 otherwise 18.
> >>
> >> But I thought it used individual buffers these days?
> >
> > Yes for receive, no for transmit. That's probably why
> > we should have the threshold per vq, not per device, BTW.
>
> Can someone actually run with my histogram patch and see what the real
> numbers are?
>
> I'm not convinced that the ideal 17-buffer case actually happens as much
> as we think.  And if it's not happening with this netperf test, we're
> testing the wrong thing.
>
> Thanks,
> Rusty.

hope to play with it next week
On 06/09/2012 07:02, Michael S. Tsirkin wrote:
>> > It might be worth just unconditionally having a cache for the 2
>> > descriptor case.  This is what I get with qemu tap, though for some
>> > reason the device features don't have guest or host CSUM, so my setup is
>> > probably screwed:
> Yes without checksum net core always linearizes packets, so yes it is
> screwed.
> For -net, skb always allocates space for 17 frags + linear part so
> it seems sane to do same in virtio core, and allocate, for -net,
> up to max_frags + 1 from cache.
> We can adjust it: no _SG -> 2 otherwise 18.
>
> Not sure about other drivers, maybe really use 2 there for now.

2 should also be good for virtio-blk and virtio-scsi 4KB random rw
workloads.

Paolo
On Friday, September 07, 2012 09:19:04 AM Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > Yes without checksum net core always linearizes packets, so yes it is
> >> > screwed.
> >> > For -net, skb always allocates space for 17 frags + linear part so
> >> > it seems sane to do same in virtio core, and allocate, for -net,
> >> > up to max_frags + 1 from cache.
> >> > We can adjust it: no _SG -> 2 otherwise 18.
> >>
> >> But I thought it used individual buffers these days?
> >
> > Yes for receive, no for transmit. That's probably why
> > we should have the threshold per vq, not per device, BTW.
>
> Can someone actually run with my histogram patch and see what the real
> numbers are?

Somehow some HTML got in my first reply, resending...

I ran some TCP_RR and TCP_STREAM sessions, both host-to-guest and
guest-to-host, with a form of the histogram patch applied against a
RHEL6.3 kernel.  The histogram values were reset after each test.

Here are the results:

60 session TCP_RR from host-to-guest with 256 byte request and 256 byte
response for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=7818456):
  1: 7818456 ################################################################
Size distribution for output (max=7816698):
  2: 149
  3: 7816698 ################################################################
  4: 2
  5: 1
Size distribution for control (max=1):
  0: 0

4 session TCP_STREAM from host-to-guest with 4K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=16050941):
  1: 16050941 ################################################################
Size distribution for output (max=1877796):
  2: 1877796 ################################################################
  3: 5
Size distribution for control (max=1):
  0: 0

4 session TCP_STREAM from host-to-guest with 16K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=16831151):
  1: 16831151 ################################################################
Size distribution for output (max=1923965):
  2: 1923965 ################################################################
  3: 5
Size distribution for control (max=1):
  0: 0

4 session TCP_STREAM from guest-to-host with 4K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=1316069):
  1: 1316069 ################################################################
Size distribution for output (max=879213):
  2: 24
  3: 24097   #
  4: 23176   #
  5: 3412
  6: 4446
  7: 4663
  8: 4195
  9: 3772
 10: 3388
 11: 3666
 12: 2885
 13: 2759
 14: 2997
 15: 3060
 16: 2651
 17: 2235
 18: 92721   ######
 19: 879213  ################################################################
Size distribution for control (max=1):
  0: 0

4 session TCP_STREAM from guest-to-host with 16K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=1428590):
  1: 1428590 ################################################################
Size distribution for output (max=957774):
  2: 20
  3: 54955   ###
  4: 34281   ##
  5: 2967
  6: 3394
  7: 9400
  8: 3061
  9: 3397
 10: 3258
 11: 3275
 12: 3147
 13: 2876
 14: 2747
 15: 2832
 16: 2013
 17: 1670
 18: 100369  ######
 19: 957774  ################################################################
Size distribution for control (max=1):
  0: 0

Thanks,
Tom

> I'm not convinced that the ideal 17-buffer case actually happens as much
> as we think.  And if it's not happening with this netperf test, we're
> testing the wrong thing.
>
> Thanks,
> Rusty.
Thomas Lendacky <tahm@linux.vnet.ibm.com> writes:
> I ran some TCP_RR and TCP_STREAM sessions, both host-to-guest and
> guest-to-host, with a form of the histogram patch applied against a
> RHEL6.3 kernel.  The histogram values were reset after each test.

Hey, thanks!  This is exactly what I wanted to see...

> 60 session TCP_RR from host-to-guest with 256 byte request and 256 byte
> response for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=7818456):
>   1: 7818456 ################################################################

These are always 1, so we don't indirect them anyway, so no cache
required.

> Size distribution for output (max=7816698):
>   2: 149
>   3: 7816698 ################################################################
>   4: 2
>   5: 1
> Size distribution for control (max=1):
>   0: 0

OK, tiny TCP data, but latency sensitive.

> Queue histogram for virtio1:
> Size distribution for input (max=16050941):
>   1: 16050941 ################################################################
> Size distribution for output (max=1877796):
>   2: 1877796 ################################################################
>   3: 5
> Size distribution for control (max=1):
>   0: 0

Acks.  Not that many, not that latency sensitive.

> 4 session TCP_STREAM from guest-to-host with 4K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=1316069):
>   1: 1316069 ################################################################
> Size distribution for output (max=879213):
>   2: 24
>   3: 24097   #
>   4: 23176   #
>   5: 3412
>   6: 4446
>   7: 4663
>   8: 4195
>   9: 3772
>  10: 3388
>  11: 3666
>  12: 2885
>  13: 2759
>  14: 2997
>  15: 3060
>  16: 2651
>  17: 2235
>  18: 92721   ######
>  19: 879213  ################################################################

Hey, that +1 is there in MAX_SKB_FRAGS for a reason!  Who knew?

This looks like we could really use a:

        int vq_set_indirect_cache(struct virtqueue *vq, unsigned num);

Which networking would set on the xmit queue(s) if we have GSO.

The real question is now whether we'd want a separate indirect cache for
the 3 case (so num above should be a bitmap?), or reuse the same one, or
not use it at all?

Benchmarking will tell...

Thanks,
Rusty.
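One possible shape for the API Rusty floats above, again only a sketch: a per-virtqueue kmem_cache sized for num descriptors, plus the GSO-conditional call from virtio-net. The indirect_cache / indirect_cache_size fields and the virtnet hook are assumptions for illustration, not code from the thread.

/* Sketch of the proposed API: back indirect tables of up to 'num' entries
 * with a per-virtqueue kmem_cache.  The fields used here are hypothetical.
 */
int vq_set_indirect_cache(struct virtqueue *_vq, unsigned num)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        vq->indirect_cache = kmem_cache_create(_vq->name,
                                               num * sizeof(struct vring_desc),
                                               0, 0, NULL);
        if (!vq->indirect_cache)
                return -ENOMEM;
        vq->indirect_cache_size = num;
        return 0;
}

/* ... which virtio-net would call on its xmit queue when GSO is on,
 * matching the 19-descriptor peak in the histograms above
 * (virtio_net_hdr + linear part + MAX_SKB_FRAGS fragments):
 */
static void virtnet_setup_indirect_cache(struct virtnet_info *vi)
{
        if (vi->dev->features & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO))
                vq_set_indirect_cache(vi->svq, MAX_SKB_FRAGS + 2);
}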
On 09/12/2012 08:13 AM, Rusty Russell wrote:
> The real question is now whether we'd want a separate indirect cache for
> the 3 case (so num above should be a bitmap?), or reuse the same one, or
> not use it at all?
>
> Benchmarking will tell...

Since there are no specific decisions about actual values, I'll just
modify the code to use cache per-vq instead of per-device.

Thanks,
Sasha
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -727,6 +727,8 @@ static struct rtnl_link_stats64 *virtnet
         tot->rx_length_errors = dev->stats.rx_length_errors;
         tot->rx_frame_errors = dev->stats.rx_frame_errors;
 
+        virtio_dev_dump_histogram(vi->vdev);
+
         return tot;
 }
 
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -108,6 +108,16 @@ void virtio_check_driver_offered_feature
 }
 EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);
 
+void virtio_dev_dump_histogram(const struct virtio_device *vdev)
+{
+        const struct virtqueue *vq;
+
+        printk("Queue histogram for %s:\n", dev_name(&vdev->dev));
+        list_for_each_entry(vq, &vdev->vqs, list)
+                virtqueue_dump_histogram(vq);
+}
+EXPORT_SYMBOL_GPL(virtio_dev_dump_histogram);
+
 static int virtio_dev_probe(struct device *_d)
 {
         int err, i;
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -120,6 +120,8 @@ struct vring_virtqueue
         ktime_t last_add_time;
 #endif
 
+        unsigned int *histo;
+
         /* Tokens for callbacks. */
         void *data[];
 };
@@ -259,6 +261,8 @@ int virtqueue_add_buf(struct virtqueue *
         BUG_ON(out + in > vq->vring.num);
         BUG_ON(out + in == 0);
 
+        vq->histo[out+in]++;
+
         /* If the host supports indirect descriptor tables, consider it. */
         if (vq->indirect) {
                 bool try_indirect;
@@ -726,6 +730,7 @@ struct virtqueue *vring_new_virtqueue(un
         }
         vq->data[i] = NULL;
 
+        vq->histo = kzalloc(num * sizeof(vq->histo[0]), GFP_KERNEL);
         return &vq->vq;
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
@@ -772,4 +777,33 @@ unsigned int virtqueue_get_vring_size(st
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
+void virtqueue_dump_histogram(const struct virtqueue *_vq)
+{
+        const struct vring_virtqueue *vq = to_vvq(_vq);
+        int i, j, start = 0, end = 0, max = 1;
+        char line[120];
+
+        for (i = 0; i < vq->vring.num; i++) {
+                if (!vq->histo[i])
+                        continue;
+
+                end = i;
+                if (!vq->histo[start])
+                        start = i;
+
+                if (vq->histo[i] > max)
+                        max = vq->histo[i];
+        }
+
+        printk("Size distribution for %s (max=%u):\n", _vq->name, max);
+        for (i = start; i <= end; i++) {
+                unsigned int off;
+                off = sprintf(line, "%3u: %-7u ", i, vq->histo[i]);
+                for (j = 0; j < vq->histo[i] * 64 / max; j++)
+                        line[off++] = '#';
+                line[off] = '\0';
+                printk("%s\n", line);
+        }
+}
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -52,6 +52,9 @@ unsigned int virtqueue_get_vring_size(st
 
 int virtqueue_get_queue_index(struct virtqueue *vq);
 
+void virtio_dev_dump_histogram(const struct virtio_device *vdev);
+void virtqueue_dump_histogram(const struct virtqueue *vq);
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus
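One small caveat if you try the hack as posted: virtqueue_add_buf() only BUG()s when out + in exceeds vq->vring.num, so a request that exactly fills the ring increments histo[num], one past the allocated array. A possible tweak (not part of the patch above) is to size the array with one spare counter and scan that extra slot when dumping:

        /* out + in can equal vring.num, so allocate num + 1 counters ... */
        vq->histo = kzalloc((num + 1) * sizeof(vq->histo[0]), GFP_KERNEL);

        /* ... and include that last slot in the dump loop */
        for (i = 0; i <= vq->vring.num; i++) {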