diff mbox

[1/4] KVM: MMU audit: update count_writable_mappings / count_rmaps

Message ID 20090602214226.820226306@localhost.localdomain (mailing list archive)
State New, archived
Headers show

Commit Message

Marcelo Tosatti June 2, 2009, 9:36 p.m. UTC
Under testing, count_writable_mappings returns a value that is 2 integers
larger than what count_rmaps returns.

Suspicion is that either of the two functions is counting a duplicate (either
positively or negatively). 

Modifying check_writable_mappings_rmap to check for rmap existence on
all present MMU pages fails to trigger an error, which should keep Avi
happy.

Also introduce mmu_spte_walk to invoke a callback on all present sptes visible
to the current vcpu, might be useful in the future.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Comments

Avi Kivity June 8, 2009, 9:24 a.m. UTC | #1
Marcelo Tosatti wrote:
> Under testing, count_writable_mappings returns a value that is 2 integers
> larger than what count_rmaps returns.
>
> Suspicion is that either of the two functions is counting a duplicate (either
> positively or negatively). 
>
> Modifying check_writable_mappings_rmap to check for rmap existance on
> all present MMU pages fails to trigger an error, which should keep Avi
> happy.
>
> Also introduce mmu_spte_walk to invoke a callback on all present sptes visible
> to the current vcpu, might be useful in the future.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> Index: kvm/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.orig/arch/x86/kvm/mmu.c
> +++ kvm/arch/x86/kvm/mmu.c
> @@ -3017,6 +3017,55 @@ static gva_t canonicalize(gva_t gva)
>  	return gva;
>  }
>  
> +
> +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
> +				 u64 *sptep);
> +
> +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
> +			    inspect_spte_fn fn)
> +{
> +	int i;
> +
> +	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> +		u64 ent = sp->spt[i];
> +
> +		if (is_shadow_present_pte(ent)) {
> +			if (sp->role.level > 1) {
>   

I think this is broken wrt large pages.  We should recurse if
role.level > 1 or the G bit is set.

> +	if (*sptep & PT_WRITABLE_MASK) {
> +		rev_sp = page_header(__pa(sptep));
> +		gfn = rev_sp->gfns[sptep - rev_sp->spt];
> +
> +		if (!gfn_to_memslot(kvm, gfn)) {
> +			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
> +					 audit_msg, gfn);
> +			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
> +					audit_msg, sptep - rev_sp->spt,
> +					rev_sp->gfn);
> +			dump_stack();
> +			return;
> +		}
> +
> +		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 0);
> +		if (!*rmapp) {
> +			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
> +					 audit_msg, *sptep);
> +			dump_stack();
> +		}
> +	}
>   

Semi-related: we should set up a new exit code to halt the VM so it can 
be inspected, otherwise all those printks and dump_stack()s will quickly 
overwhelm the logging facilities.
Marcelo Tosatti June 9, 2009, 12:33 p.m. UTC | #2
On Mon, Jun 08, 2009 at 12:24:08PM +0300, Avi Kivity wrote:
>> +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
>> +			    inspect_spte_fn fn)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>> +		u64 ent = sp->spt[i];
>> +
>> +		if (is_shadow_present_pte(ent)) {
>> +			if (sp->role.level > 1) {
>>   
>
> I think this is broken wrt large pages.  We should recurse if role.level  
> > 1 or the G bit is set.

Yes, fixed. Plan to add largepages validity checks later.

> Semi-related: we should set up a new exit code to halt the VM so it can  
> be inspected, otherwise all those printks and dump_stack()s will quickly  
> overwhelm the logging facilities.

Can you clarify on the halt exit code?

Because for other exit codes for which similar behaviour is wanted, say,
unhandled vm exit, the policy can be handled in userspace (and the
decision to halt or not seems better suited to happen there). So perhaps
KVM_EXIT_MMU_AUDIT_FAILED?

I wondered before whether it would be good to stop auditing on the first
error, but gave up on the idea.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity June 9, 2009, 12:40 p.m. UTC | #3
Marcelo Tosatti wrote:
>> Semi-related: we should set up a new exit code to halt the VM so it can  
>> be inspected, otherwise all those printks and dump_stack()s will quickly  
>> overwhelm the logging facilities.
>>     
>
> Can you clarify on the halt exit code?
>   

set a bit that tells KVM_RUN to quit (like KVM_EXIT_UNKNOWN); when 
userspace sees it, it vm_stop()s, waiting for someone to dig in.

Clean logs, clean state.

> Because for other exit codes which similar behaviour is wanted, say,
> unhandled vm exit, the policy can be handled in userspace (and the
> decision to halt or not seems better suited to happen there). So perhaps
> KVM_EXIT_MMU_AUDIT_FAILED?
>   

Yes, the name was bad.  We can do KVM_EXIT_INTERNAL_ERROR and have a 
maintainer_name field in the union to choose who will handle it.

> I wondered before whether it would be good to stop auditing on the first
> error, but gave up on the idea.
>   

It's harder without exceptions.
diff mbox

Patch

Index: kvm/arch/x86/kvm/mmu.c
===================================================================
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -3017,6 +3017,55 @@  static gva_t canonicalize(gva_t gva)
 	return gva;
 }
 
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+				 u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+			    inspect_spte_fn fn)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			if (sp->role.level > 1) {
+				struct kvm_mmu_page *child;
+				child = page_header(ent & PT64_BASE_ADDR_MASK);
+				__mmu_spte_walk(kvm, child, fn);
+			}
+			if (sp->role.level == 1)
+				fn(kvm, sp, &sp->spt[i]);
+		}
+	}
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		__mmu_spte_walk(vcpu->kvm, sp, fn);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root && VALID_PAGE(root)) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			__mmu_spte_walk(vcpu->kvm, sp, fn);
+		}
+	}
+	return;
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				gva_t va, int level)
 {
@@ -3109,9 +3158,43 @@  static int count_rmaps(struct kvm_vcpu *
 	return nmaps;
 }
 
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+{
+	unsigned long *rmapp;
+	struct kvm_mmu_page *rev_sp;
+	gfn_t gfn;
+
+	if (*sptep & PT_WRITABLE_MASK) {
+		rev_sp = page_header(__pa(sptep));
+		gfn = rev_sp->gfns[sptep - rev_sp->spt];
+
+		if (!gfn_to_memslot(kvm, gfn)) {
+			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
+					 audit_msg, gfn);
+			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
+					audit_msg, sptep - rev_sp->spt,
+					rev_sp->gfn);
+			dump_stack();
+			return;
+		}
+
+		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 0);
+		if (!*rmapp) {
+			printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+					 audit_msg, *sptep);
+			dump_stack();
+		}
+	}
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+	mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 {
-	int nmaps = 0;
 	struct kvm_mmu_page *sp;
 	int i;
 
@@ -3128,20 +3211,16 @@  static int count_writable_mappings(struc
 				continue;
 			if (!(ent & PT_WRITABLE_MASK))
 				continue;
-			++nmaps;
+			inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
 		}
 	}
-	return nmaps;
+	return;
 }
 
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
-	int n_rmap = count_rmaps(vcpu);
-	int n_actual = count_writable_mappings(vcpu);
-
-	if (n_rmap != n_actual)
-		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
-		       __func__, audit_msg, n_rmap, n_actual);
+	check_writable_mappings_rmap(vcpu);
+	count_rmaps(vcpu);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3175,6 +3254,7 @@  static void kvm_mmu_audit(struct kvm_vcp
 	audit_rmap(vcpu);
 	audit_write_protection(vcpu);
 	audit_mappings(vcpu);
+	audit_writable_sptes_have_rmaps(vcpu);
 	dbg = olddbg;
 }