
[4/7] kvm mmu: implement necessary data structures for second huge page accounting

Message ID 1238164319-16092-5-git-send-email-joerg.roedel@amd.com (mailing list archive)
State New, archived

Commit Message

Joerg Roedel March 27, 2009, 2:31 p.m. UTC
This patch adds the data structures necessary to track write-protected
pages within a second, larger huge page size.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   10 ++++++++++
 arch/x86/kvm/mmu.c              |   33 +++++++++++++++++++++++++++++++++
 include/linux/kvm_host.h        |    2 +-
 virt/kvm/kvm_main.c             |   26 ++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 1 deletions(-)

Comments

Avi Kivity March 29, 2009, 11:45 a.m. UTC | #1
Joerg Roedel wrote:
> This patch adds the data structures necessary to track write-protected
> pages within a second, larger huge page size.
>
>  
>  struct kvm_vcpu_arch {
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9936b45..7d4162d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -390,6 +390,15 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
>  	return &slot->lpage_info[idx].write_count;
>  }
>  
> +static int *slot_hugepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +	unsigned long idx;
> +
> +	idx = (gfn / KVM_PAGES_PER_1G_PAGE) -
> +	      (slot->base_gfn / KVM_PAGES_PER_1G_PAGE);
> +	return &slot->hpage_info[idx].write_count;
> +}
>   

A page level argument would remove the need for this duplication, as 
well as all the constants.

>  
> +static int has_wrprotected_largepage(struct kvm *kvm, gfn_t gfn)
> +{
> +	struct kvm_memory_slot *slot;
> +	int *hugepage_idx;
> +
> +	gfn = unalias_gfn(kvm, gfn);
> +	slot = gfn_to_memslot_unaliased(kvm, gfn);
> +	if (slot) {
> +		hugepage_idx = slot_hugepage_idx(gfn, slot);
>   

slot_largepage_idx() here?

I don't think we ever write protect large pages, so why is this needed?

> +		return *hugepage_idx;
> +	}
> +
> +	return 1;
> +}
>   


> +
>  static enum kvm_page_size host_page_size(struct kvm *kvm, gfn_t gfn)
>  {
>  	struct vm_area_struct *vma;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 095ebb6..2f05d48 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -103,7 +103,7 @@ struct kvm_memory_slot {
>  	struct {
>  		unsigned long rmap_pde;
>  		int write_count;
> -	} *lpage_info;
> +	} *lpage_info, *hpage_info;
>   

} * lpage_info[KVM_NR_PAGE_LEVELS];
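
For illustration, a minimal sketch of that suggestion combined with the page level argument mentioned above (KVM_NR_PAGE_LEVELS, kvm_pages_per_hpage[] and slot_hpage_idx() are assumed names for this sketch, not code from the patch):

/* sketch only: per-level page counts, e.g. { 2M/4k, 1G/4k } */
static const unsigned long kvm_pages_per_hpage[KVM_NR_PAGE_LEVELS] = {
	KVM_PAGES_PER_2M_PAGE,
	KVM_PAGES_PER_1G_PAGE,
};

static int *slot_hpage_idx(gfn_t gfn, struct kvm_memory_slot *slot, int level)
{
	unsigned long n = kvm_pages_per_hpage[level];
	unsigned long idx;

	/* index of gfn's huge page relative to the slot's first one */
	idx = (gfn / n) - (slot->base_gfn / n);
	return &slot->lpage_info[level][idx].write_count;
}

One helper then serves both the 2M and the 1G case.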

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 8aa3b95..c4842f4 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1001,10 +1001,14 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
>  	if (!dont || free->lpage_info != dont->lpage_info)
>  		vfree(free->lpage_info);
>  
> +	if (!dont || free->hpage_info != dont->hpage_info)
> +		vfree(free->hpage_info);
>   

loop
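
A minimal sketch of that loop, assuming the per-level lpage_info array sketched above:

	int i;

	for (i = 0; i < KVM_NR_PAGE_LEVELS; ++i) {
		if (!dont || free->lpage_info[i] != dont->lpage_info[i])
			vfree(free->lpage_info[i]);
		free->lpage_info[i] = NULL;
	}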

>  void kvm_free_physmem(struct kvm *kvm)
> @@ -1170,6 +1174,28 @@ int __kvm_set_memory_region(struct kvm *kvm,
>  			new.lpage_info[largepages-1].write_count = 1;
>  	}
>  
> +#ifdef KVM_PAGES_PER_LHPAGE
> +	if (npages && !new.hpage_info) {
> +		int hugepages = npages / KVM_PAGES_PER_LHPAGE;
> +		if (npages % KVM_PAGES_PER_LHPAGE)
> +			hugepages++;
> +		if (base_gfn % KVM_PAGES_PER_LHPAGE)
> +			hugepages++;
> +
> +		new.hpage_info = vmalloc(hugepages * sizeof(*new.hpage_info));
> +
> +		if (!new.hpage_info)
> +			goto out_free;
> +
> +		memset(new.hpage_info, 0, hugepages * sizeof(*new.hpage_info));
> +
> +		if (base_gfn % KVM_PAGES_PER_LHPAGE)
> +			new.hpage_info[0].write_count = 1;
> +		if ((base_gfn+npages) % KVM_PAGES_PER_LHPAGE)
> +			new.hpage_info[hugepages-1].write_count = 1;
> +	}
> +#endif
>   

Loop, KVM_NR_PAGE_LEVELS defined per arch.
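
A sketch of the allocation loop this points at, again with the assumed per-level names, and already using the corrected index calculation Avi gives below:

	int i;

	for (i = 0; i < KVM_NR_PAGE_LEVELS; ++i) {
		unsigned long n = kvm_pages_per_hpage[i];
		unsigned long pages;

		if (!npages || new.lpage_info[i])
			continue;

		/* index of the last huge page, minus the first, plus one */
		pages = ((base_gfn + npages - 1) / n) - (base_gfn / n) + 1;

		new.lpage_info[i] = vmalloc(pages * sizeof(*new.lpage_info[i]));
		if (!new.lpage_info[i])
			goto out_free;
		memset(new.lpage_info[i], 0, pages * sizeof(*new.lpage_info[i]));

		/* partial huge pages at the edges can never be mapped large */
		if (base_gfn % n)
			new.lpage_info[i][0].write_count = 1;
		if ((base_gfn + npages) % n)
			new.lpage_info[i][pages - 1].write_count = 1;
	}
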
Joerg Roedel March 29, 2009, 1:03 p.m. UTC | #2
On Sun, Mar 29, 2009 at 02:45:44PM +0300, Avi Kivity wrote:
> Joerg Roedel wrote:
>> This patch adds the data structures necessary to track write-protected
>> pages within a second, larger huge page size.
>>
>>   struct kvm_vcpu_arch {
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 9936b45..7d4162d 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -390,6 +390,15 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
>>  	return &slot->lpage_info[idx].write_count;
>>  }
>>
>> +static int *slot_hugepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
>> +{
>> +	unsigned long idx;
>> +
>> +	idx = (gfn / KVM_PAGES_PER_1G_PAGE) -
>> +	      (slot->base_gfn / KVM_PAGES_PER_1G_PAGE);
>> +	return &slot->hpage_info[idx].write_count;
>> +}
>>   
>
> A page level argument would remove the need for this duplication, as  
> well as all the constants.
>
>>  +static int has_wrprotected_largepage(struct kvm *kvm, gfn_t gfn)
>> +{
>> +	struct kvm_memory_slot *slot;
>> +	int *hugepage_idx;
>> +
>> +	gfn = unalias_gfn(kvm, gfn);
>> +	slot = gfn_to_memslot_unaliased(kvm, gfn);
>> +	if (slot) {
>> +		hugepage_idx = slot_hugepage_idx(gfn, slot);
>>   
>
> slot_largepage_idx() here?
>
> I don't think we ever write protect large pages, so why is this needed?

Before we map a 2MB page for writing we need to check whether it
contains a write-protected 4k page. If any 4k page within a 2MB area is
write-protected, that 2MB page is considered write-protected as well.
These 'write-protected' 2MB pages are accounted in the
account_shadowed() function. This information is taken into account
when we decide whether a guest 1GB page can be mapped as a 1GB page on
the host too.

	Joerg
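
A simplified, hypothetical sketch of the decision Joerg describes, using only names that appear in the patch (the surrounding page-size decision function is assumed, not part of this series):

	/* map 1GB only if no page inside it is write-protected */
	if (host_page_size(kvm, gfn) == KVM_PAGE_SIZE_1G &&
	    !has_wrprotected_largepage(kvm, gfn))
		return KVM_PAGE_SIZE_1G;
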
Avi Kivity March 29, 2009, 1:15 p.m. UTC | #3
Joerg Roedel wrote:
>>     
>>>  +static int has_wrprotected_largepage(struct kvm *kvm, gfn_t gfn)
>>> +{
>>> +	struct kvm_memory_slot *slot;
>>> +	int *hugepage_idx;
>>> +
>>> +	gfn = unalias_gfn(kvm, gfn);
>>> +	slot = gfn_to_memslot_unaliased(kvm, gfn);
>>> +	if (slot) {
>>> +		hugepage_idx = slot_hugepage_idx(gfn, slot);
>>>   
>>>       
>> slot_largepage_idx() here?
>>
>> I don't think we ever write protect large pages, so why is this needed?
>>     
>
> Before we map a 2MB page for writing we need to check whether it
> contains a write-protected 4k page. If any 4k page within a 2MB area is
> write-protected, that 2MB page is considered write-protected as well.
> These 'write-protected' 2MB pages are accounted in the
> account_shadowed() function. This information is taken into account
> when we decide whether a guest 1GB page can be mapped as a 1GB page on
> the host too.
>   

account_shadowed() actually increments a hugepage write_count by 1 for
every shadowed 4K page, not once per 2M page, if I read the code
correctly. The code I commented on is right, though.

The naming is confusing. I suggest
has_wrprotected_page_in_{large,huge}page(), although with a level
parameter we can keep has_wrprotected_page().

btw, if we implement account_shadowed() as you describe it (only account
hugepages on a largepage write_count transition 0->1 or 1->0), we save a
potential cacheline bounce on the hugepage write_count accounting array.
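
A sketch of that variant (hypothetical, not what the patch does); unaccount_shadowed() would mirror it on the 1->0 transition:

static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int *write_count;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);

	write_count = slot_largepage_idx(gfn, slot);
	*write_count += 1;

	/* only the 0->1 transition of the 2M count has to touch the
	 * 1G array, so the common case avoids bouncing its cacheline */
	if (*write_count == 1)
		*slot_hugepage_idx(gfn, slot) += 1;
}
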
Avi Kivity March 29, 2009, 1:26 p.m. UTC | #4
Joerg Roedel wrote:
> This patch adds the data structures necessary to track write-protected
> pages within a second, larger huge page size.
>
>
> +#ifdef KVM_PAGES_PER_LHPAGE
> +	if (npages && !new.hpage_info) {
> +		int hugepages = npages / KVM_PAGES_PER_LHPAGE;
> +		if (npages % KVM_PAGES_PER_LHPAGE)
> +			hugepages++;
> +		if (base_gfn % KVM_PAGES_PER_LHPAGE)
> +			hugepages++;
>   

Consider a slot with base_gfn == 1 and npages == 1.  This will have 
hugepages == 2, which is wrong.

I think the right calculation is

  ((base_gfn + npages - 1) / N) - (base_gfn / N) + 1

i.e. index of last page, plus one so we can store it.
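
For the slot above (base_gfn == 1, npages == 1) this gives
((1 + 1 - 1) / N) - (1 / N) + 1 == 1, as expected, where the original
calculation comes out at 0 + 1 + 1 == 2.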

The existing calculation for the smaller (2M) huge pages is off as well.

> +
> +		new.hpage_info = vmalloc(hugepages * sizeof(*new.hpage_info));
> +
> +		if (!new.hpage_info)
> +			goto out_free;
> +
> +		memset(new.hpage_info, 0, hugepages * sizeof(*new.hpage_info));
> +
> +		if (base_gfn % KVM_PAGES_PER_LHPAGE)
> +			new.hpage_info[0].write_count = 1;
> +		if ((base_gfn+npages) % KVM_PAGES_PER_LHPAGE)
> +			new.hpage_info[hugepages-1].write_count = 1;
> +	}
> +#endif
> +
>
Joerg Roedel March 29, 2009, 1:32 p.m. UTC | #5
On Sun, Mar 29, 2009 at 04:15:07PM +0300, Avi Kivity wrote:
> Joerg Roedel wrote:
>>>     
>>>>  +static int has_wrprotected_largepage(struct kvm *kvm, gfn_t gfn)
>>>> +{
>>>> +	struct kvm_memory_slot *slot;
>>>> +	int *hugepage_idx;
>>>> +
>>>> +	gfn = unalias_gfn(kvm, gfn);
>>>> +	slot = gfn_to_memslot_unaliased(kvm, gfn);
>>>> +	if (slot) {
>>>> +		hugepage_idx = slot_hugepage_idx(gfn, slot);
>>>>         
>>> slot_largepage_idx() here?
>>>
>>> I don't think we ever write protect large pages, so why is this needed?
>>>     
>>
>> Before we map a 2MB page for writing we need to check whether it
>> contains a write-protected 4k page. If any 4k page within a 2MB area is
>> write-protected, that 2MB page is considered write-protected as well.
>> These 'write-protected' 2MB pages are accounted in the
>> account_shadowed() function. This information is taken into account
>> when we decide whether a guest 1GB page can be mapped as a 1GB page on
>> the host too.
>>   
>
> account_shadowed() actually increments a hugepage write_count by 1 for
> every shadowed 4K page, not once per 2M page, if I read the code
> correctly. The code I commented on is right, though.
>
> The naming is confusing. I suggest
> has_wrprotected_page_in_{large,huge}page(), although with a level
> parameter we can keep has_wrprotected_page().

Yeah true, the name is a bit confusing. I think a level parameter for
has_wrprotected_page() is the best solution.

	Joerg
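
A sketch of that agreed direction, built on the level-indexed helper assumed in the sketches above (illustration only, not code from this series):

static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn, int level)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	if (slot)
		return *slot_hpage_idx(gfn, slot, level);

	return 1;	/* no slot: treat as write-protected */
}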

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f268f99..a1df2a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -58,7 +58,16 @@ 
 
 #define KVM_PAGES_PER_2M_PAGE (KVM_2M_PAGE_SIZE / PAGE_SIZE)
 
+#define KVM_1G_PAGE_SHIFT 30
+#define KVM_1G_PAGE_SIZE (1UL << KVM_1G_PAGE_SHIFT)
+#define KVM_1G_PAGE_MASK (~(KVM_1G_PAGE_SIZE - 1))
+
+#define KVM_PAGES_PER_1G_PAGE (KVM_1G_PAGE_SIZE / PAGE_SIZE)
+
+/* Huge Page */
 #define KVM_PAGES_PER_HPAGE	KVM_PAGES_PER_2M_PAGE
+/* Large Huge Page ;-) */
+#define KVM_PAGES_PER_LHPAGE	KVM_PAGES_PER_1G_PAGE
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
@@ -268,6 +277,7 @@  struct kvm_mmu {
 enum kvm_page_size {
 	KVM_PAGE_SIZE_4k = (1 << 12),
 	KVM_PAGE_SIZE_2M = (1 << 21),
+	KVM_PAGE_SIZE_1G = (1 << 30),
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9936b45..7d4162d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -390,6 +390,15 @@  static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
 	return &slot->lpage_info[idx].write_count;
 }
 
+static int *slot_hugepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+	unsigned long idx;
+
+	idx = (gfn / KVM_PAGES_PER_1G_PAGE) -
+	      (slot->base_gfn / KVM_PAGES_PER_1G_PAGE);
+	return &slot->hpage_info[idx].write_count;
+}
+
 static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	int *write_count;
@@ -398,6 +407,10 @@  static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 	write_count = slot_largepage_idx(gfn,
 					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count += 1;
+
+	write_count = slot_hugepage_idx(gfn,
+					gfn_to_memslot_unaliased(kvm, gfn));
+	*write_count += 1;
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -409,6 +422,11 @@  static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count -= 1;
 	WARN_ON(*write_count < 0);
+
+	write_count = slot_hugepage_idx(gfn,
+					gfn_to_memslot_unaliased(kvm, gfn));
+	*write_count -= 1;
+	WARN_ON(*write_count < 0);
 }
 
 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
@@ -426,6 +444,21 @@  static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 	return 1;
 }
 
+static int has_wrprotected_largepage(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot;
+	int *hugepage_idx;
+
+	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	if (slot) {
+		hugepage_idx = slot_hugepage_idx(gfn, slot);
+		return *hugepage_idx;
+	}
+
+	return 1;
+}
+
 static enum kvm_page_size host_page_size(struct kvm *kvm, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 095ebb6..2f05d48 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -103,7 +103,7 @@  struct kvm_memory_slot {
 	struct {
 		unsigned long rmap_pde;
 		int write_count;
-	} *lpage_info;
+	} *lpage_info, *hpage_info;
 	unsigned long userspace_addr;
 	int user_alloc;
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8aa3b95..c4842f4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1001,10 +1001,14 @@  static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 	if (!dont || free->lpage_info != dont->lpage_info)
 		vfree(free->lpage_info);
 
+	if (!dont || free->hpage_info != dont->hpage_info)
+		vfree(free->hpage_info);
+
 	free->npages = 0;
 	free->dirty_bitmap = NULL;
 	free->rmap = NULL;
 	free->lpage_info = NULL;
+	free->hpage_info = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -1170,6 +1174,28 @@  int __kvm_set_memory_region(struct kvm *kvm,
 			new.lpage_info[largepages-1].write_count = 1;
 	}
 
+#ifdef KVM_PAGES_PER_LHPAGE
+	if (npages && !new.hpage_info) {
+		int hugepages = npages / KVM_PAGES_PER_LHPAGE;
+		if (npages % KVM_PAGES_PER_LHPAGE)
+			hugepages++;
+		if (base_gfn % KVM_PAGES_PER_LHPAGE)
+			hugepages++;
+
+		new.hpage_info = vmalloc(hugepages * sizeof(*new.hpage_info));
+
+		if (!new.hpage_info)
+			goto out_free;
+
+		memset(new.hpage_info, 0, hugepages * sizeof(*new.hpage_info));
+
+		if (base_gfn % KVM_PAGES_PER_LHPAGE)
+			new.hpage_info[0].write_count = 1;
+		if ((base_gfn+npages) % KVM_PAGES_PER_LHPAGE)
+			new.hpage_info[hugepages-1].write_count = 1;
+	}
+#endif
+
 	/* Allocate page dirty bitmap if needed */
 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
 		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;