kvm: irqchip: Break up high order allocations of kvm_irq_routing_table
diff mbox

Message ID 1431088304-11365-1-git-send-email-joro@8bytes.org
State New
Headers show

Commit Message

Joerg Roedel May 8, 2015, 12:31 p.m. UTC
From: Joerg Roedel <jroedel@suse.de>

The allocation size of the kvm_irq_routing_table depends on
the number of irq routing entries because they are all
allocated with one kzalloc call.

When the irq routing table gets bigger this requires high
order allocations which fail from time to time:

	qemu-kvm: page allocation failure: order:4, mode:0xd0

This patch fixes this issue by breaking up the allocation of
the table and its entries into individual kzalloc calls.
These could all be satisfied with order-0 allocations, which
are less likely to fail.

The downside of this change is the lower performance, because
of more calls to kzalloc. But given how often kvm_set_irq_routing
is called in the lifetime of a guest, it doesn't really
matter much.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 virt/kvm/irqchip.c | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

Comments

Paolo Bonzini May 8, 2015, 4:26 p.m. UTC | #1
On 08/05/2015 14:31, Joerg Roedel wrote:
> From: Joerg Roedel <jroedel@suse.de>
> 
> The allocation size of the kvm_irq_routing_table depends on
> the number of irq routing entries because they are all
> allocated with one kzalloc call.
> 
> When the irq routing table gets bigger this requires high
> order allocations which fail from time to time:
> 
> 	qemu-kvm: page allocation failure: order:4, mode:0xd0
> 
> This patch fixes this issue by breaking up the allocation of
> the table and its entries into individual kzalloc calls.
> These could all be satisfied with order-0 allocations, which
> are less likely to fail.
> 
> The downside of this change is the lower performance, because
> of more calls to kzalloc. But given how often kvm_set_irq_routing
> is called in the lifetime of a guest, it doesn't really
> matter much.

It probably doesn't matter much indeed, but can you time the difference?
 kvm_set_irq_routing is not too frequent, but happens enough often that
we had to use a separate SRCU instance just to speed it up (see commit
719d93cd5f5, kvm/irqchip: Speed up KVM_SET_GSI_ROUTING, 2014-01-16).

Paolo

> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  virt/kvm/irqchip.c | 40 ++++++++++++++++++++++++++++++++--------
>  1 file changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 1d56a90..b56168f 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -33,7 +33,6 @@
>  
>  struct kvm_irq_routing_table {
>  	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
> -	struct kvm_kernel_irq_routing_entry *rt_entries;
>  	u32 nr_rt_entries;
>  	/*
>  	 * Array indexed by gsi. Each entry contains list of irq chips
> @@ -118,11 +117,31 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
>  	return ret;
>  }
>  
> +static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
> +{
> +	int i;
> +
> +	if (!rt)
> +		return;
> +
> +	for (i = 0; i < rt->nr_rt_entries; ++i) {
> +		struct kvm_kernel_irq_routing_entry *e;
> +		struct hlist_node *n;
> +
> +		hlist_for_each_entry_safe(e, n, &rt->map[i], link) {
> +			hlist_del(&e->link);
> +			kfree(e);
> +		}
> +	}
> +
> +	kfree(rt);
> +}
> +
>  void kvm_free_irq_routing(struct kvm *kvm)
>  {
>  	/* Called only during vm destruction. Nobody can use the pointer
>  	   at this stage */
> -	kfree(kvm->irq_routing);
> +	free_irq_routing_table(kvm->irq_routing);
>  }
>  
>  static int setup_routing_entry(struct kvm_irq_routing_table *rt,
> @@ -173,25 +192,29 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  
>  	nr_rt_entries += 1;
>  
> -	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
> -		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
> +	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
>  		      GFP_KERNEL);
>  
>  	if (!new)
>  		return -ENOMEM;
>  
> -	new->rt_entries = (void *)&new->map[nr_rt_entries];
> -
>  	new->nr_rt_entries = nr_rt_entries;
>  	for (i = 0; i < KVM_NR_IRQCHIPS; i++)
>  		for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
>  			new->chip[i][j] = -1;
>  
>  	for (i = 0; i < nr; ++i) {
> +		struct kvm_kernel_irq_routing_entry *e;
> +
> +		r = -ENOMEM;
> +		e = kzalloc(sizeof(*e), GFP_KERNEL);
> +		if (!e)
> +			goto out;
> +
>  		r = -EINVAL;
>  		if (ue->flags)
>  			goto out;
> -		r = setup_routing_entry(new, &new->rt_entries[i], ue);
> +		r = setup_routing_entry(new, e, ue);
>  		if (r)
>  			goto out;
>  		++ue;
> @@ -209,6 +232,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
>  	r = 0;
>  
>  out:
> -	kfree(new);
> +	free_irq_routing_table(new);
> +
>  	return r;
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joerg Roedel May 11, 2015, 11:25 a.m. UTC | #2
Hi Paolo,

On Fri, May 08, 2015 at 06:26:13PM +0200, Paolo Bonzini wrote:
> It probably doesn't matter much indeed, but can you time the difference?
>  kvm_set_irq_routing is not too frequent, but happens enough often that
> we had to use a separate SRCU instance just to speed it up (see commit
> 719d93cd5f5, kvm/irqchip: Speed up KVM_SET_GSI_ROUTING, 2014-01-16).

The results vary a lot, but what I can say for sure is that the
kvm_set_irq_routing function takes at least twice as long (~10.000 vs
~22.000 cycles) as before on my AMD Kaveri machine (maximum was between
3-4 times as long).

On the other side this function is only called 2 times at boot in my
test, so I couldn't detect a noticable effect on the overall boot time
of the guest (37 disks were attached).


	Joerg

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 11, 2015, 11:45 a.m. UTC | #3
On 11/05/2015 13:25, Joerg Roedel wrote:
>> > It probably doesn't matter much indeed, but can you time the difference?
>> >  kvm_set_irq_routing is not too frequent, but happens enough often that
>> > we had to use a separate SRCU instance just to speed it up (see commit
>> > 719d93cd5f5, kvm/irqchip: Speed up KVM_SET_GSI_ROUTING, 2014-01-16).
> The results vary a lot, but what I can say for sure is that the
> kvm_set_irq_routing function takes at least twice as long (~10.000 vs
> ~22.000 cycles) as before on my AMD Kaveri machine (maximum was between
> 3-4 times as long).
> 
> On the other side this function is only called 2 times at boot in my
> test, so I couldn't detect a noticable effect on the overall boot time
> of the guest (37 disks were attached).

Christian, can you test this?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christian Borntraeger May 11, 2015, 12:50 p.m. UTC | #4
Am 11.05.2015 um 13:45 schrieb Paolo Bonzini:
> 
> 
> On 11/05/2015 13:25, Joerg Roedel wrote:
>>>> It probably doesn't matter much indeed, but can you time the difference?
>>>>  kvm_set_irq_routing is not too frequent, but happens enough often that
>>>> we had to use a separate SRCU instance just to speed it up (see commit
>>>> 719d93cd5f5, kvm/irqchip: Speed up KVM_SET_GSI_ROUTING, 2014-01-16).
>> The results vary a lot, but what I can say for sure is that the
>> kvm_set_irq_routing function takes at least twice as long (~10.000 vs
>> ~22.000 cycles) as before on my AMD Kaveri machine (maximum was between
>> 3-4 times as long).
>>
>> On the other side this function is only called 2 times at boot in my
>> test, so I couldn't detect a noticable effect on the overall boot time
>> of the guest (37 disks were attached).

x86 probably has only some irq lines for this, (or Joerg is using virtio-scsi)

s390 has a route per device, but with 100 virtio-blk devices the difference seem
pretty much on the "dont care" side. qemu aio-poll/drain code seems to cause
much more delay since we elimited the kernel delays by using 
synchronize_srcu_expedited.


> Christian, can you test this?


guest comes up and performance is ok.
I did not do any additional thing (lockdep, kmemleak) but I think the
generic approach is good.
in case the host is overcommited and paging, order-0 allocations might
be much faster and much more reliable than one big order-2, 3 or 4.

Bonus points for the future: We might be able to rework this to re-use
the old  allocations for struct kvm_kernel_irq_routing_entry (bascially
replacing only chip, mr_rt_entries and hlist)

Christian

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christian Borntraeger May 11, 2015, 12:53 p.m. UTC | #5
Am 11.05.2015 um 14:50 schrieb Christian Borntraeger:

> s390 has a route per device, but with 100 virtio-blk devices the difference seem
> pretty much on the "dont care" side. qemu aio-poll/drain code seems to cause
> much more delay since we elimited the kernel delays by using 
> synchronize_srcu_expedited.

This is ambiguous:
My point is: qemu did not get slower, it was already slow and the kernel got fast
enough that it is no longer the slowest part.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini May 11, 2015, 1:27 p.m. UTC | #6
On 11/05/2015 14:53, Christian Borntraeger wrote:
> Am 11.05.2015 um 14:50 schrieb Christian Borntraeger:
> 
>> s390 has a route per device, but with 100 virtio-blk devices the difference seem
>> pretty much on the "dont care" side. qemu aio-poll/drain code seems to cause
>> much more delay since we elimited the kernel delays by using 
>> synchronize_srcu_expedited.
> 
> This is ambiguous:
> My point is: qemu did not get slower, it was already slow and the kernel got fast
> enough that it is no longer the slowest part.

Great, I'll apply the patch.

Paolo

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joerg Roedel June 5, 2015, 10:50 a.m. UTC | #7
Hi Paolo,

On Mon, May 11, 2015 at 03:27:26PM +0200, Paolo Bonzini wrote:
> Great, I'll apply the patch.

Gentle ping. I don't see the patch in the queue or next branches of the
KVM tree yet. Do you plan to apply it for v4.2?


	Joerg

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini June 5, 2015, 11:39 a.m. UTC | #8
On 05/06/2015 12:50, Joerg Roedel wrote:
> > Great, I'll apply the patch.
> 
> Gentle ping. I don't see the patch in the queue or next branches of the
> KVM tree yet. Do you plan to apply it for v4.2?

Fell through the cracks, sorry.  I will apply it today.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 1d56a90..b56168f 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -33,7 +33,6 @@ 
 
 struct kvm_irq_routing_table {
 	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
-	struct kvm_kernel_irq_routing_entry *rt_entries;
 	u32 nr_rt_entries;
 	/*
 	 * Array indexed by gsi. Each entry contains list of irq chips
@@ -118,11 +117,31 @@  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 	return ret;
 }
 
+static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
+{
+	int i;
+
+	if (!rt)
+		return;
+
+	for (i = 0; i < rt->nr_rt_entries; ++i) {
+		struct kvm_kernel_irq_routing_entry *e;
+		struct hlist_node *n;
+
+		hlist_for_each_entry_safe(e, n, &rt->map[i], link) {
+			hlist_del(&e->link);
+			kfree(e);
+		}
+	}
+
+	kfree(rt);
+}
+
 void kvm_free_irq_routing(struct kvm *kvm)
 {
 	/* Called only during vm destruction. Nobody can use the pointer
 	   at this stage */
-	kfree(kvm->irq_routing);
+	free_irq_routing_table(kvm->irq_routing);
 }
 
 static int setup_routing_entry(struct kvm_irq_routing_table *rt,
@@ -173,25 +192,29 @@  int kvm_set_irq_routing(struct kvm *kvm,
 
 	nr_rt_entries += 1;
 
-	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
-		      + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
 		      GFP_KERNEL);
 
 	if (!new)
 		return -ENOMEM;
 
-	new->rt_entries = (void *)&new->map[nr_rt_entries];
-
 	new->nr_rt_entries = nr_rt_entries;
 	for (i = 0; i < KVM_NR_IRQCHIPS; i++)
 		for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
 			new->chip[i][j] = -1;
 
 	for (i = 0; i < nr; ++i) {
+		struct kvm_kernel_irq_routing_entry *e;
+
+		r = -ENOMEM;
+		e = kzalloc(sizeof(*e), GFP_KERNEL);
+		if (!e)
+			goto out;
+
 		r = -EINVAL;
 		if (ue->flags)
 			goto out;
-		r = setup_routing_entry(new, &new->rt_entries[i], ue);
+		r = setup_routing_entry(new, e, ue);
 		if (r)
 			goto out;
 		++ue;
@@ -209,6 +232,7 @@  int kvm_set_irq_routing(struct kvm *kvm,
 	r = 0;
 
 out:
-	kfree(new);
+	free_irq_routing_table(new);
+
 	return r;
 }