diff mbox

[RFC,v4,02/18] add memory map/unmap support for VM introspection on the guest side

Message ID 20171218190642.7790-3-alazar@bitdefender.com (mailing list archive)
State New, archived
Headers show

Commit Message

Adalbert Lazăr Dec. 18, 2017, 7:06 p.m. UTC
From: Adalbert Lazar <alazar@bitdefender.com>

An introspection tool running in a dedicated VM can use the new device
(/dev/kvmmem) to map memory from other introspected VM-s.

Two ioctl operations are supported:
  - KVM_INTRO_MEM_MAP/struct kvmi_mem_map
  - KVM_INTRO_MEM_UNMAP/unsigned long

In order to map an introspected gpa to the local gva, the process using
this device needs to obtain a token from the host KVMI subsystem (see
Documentation/virtual/kvm/kvmi.rst - KVMI_GET_MAP_TOKEN).

Both operations use hypercalls (KVM_HC_MEM_MAP, KVM_HC_MEM_UNMAP)
to pass the requests to the host kernel/KVMI (see hypercalls.txt).

Signed-off-by: Mircea Cîrjaliu <mcirjaliu@bitdefender.com>
---
 arch/x86/Kconfig                  |   9 +
 arch/x86/include/asm/kvmi_guest.h |  10 +
 arch/x86/kernel/Makefile          |   1 +
 arch/x86/kernel/kvmi_mem_guest.c  |  26 +++
 virt/kvm/kvmi_mem_guest.c         | 379 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 425 insertions(+)
 create mode 100644 arch/x86/include/asm/kvmi_guest.h
 create mode 100644 arch/x86/kernel/kvmi_mem_guest.c
 create mode 100644 virt/kvm/kvmi_mem_guest.c
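
For context, below is a minimal userspace sketch of how an introspection tool might drive the two ioctls. It assumes that uapi/linux/kvmi.h (introduced elsewhere in this series) provides the KVM_INTRO_MEM_MAP/KVM_INTRO_MEM_UNMAP definitions and a struct kvmi_mem_map with token, gpa and gva members, and that the token was already obtained from the host via KVMI_GET_MAP_TOKEN. It is an illustration only, not part of the patch.

/*
 * Hypothetical usage sketch (not part of this patch). Assumes <linux/kvmi.h>
 * provides KVM_INTRO_MEM_MAP, KVM_INTRO_MEM_UNMAP, struct kvmi_map_mem_token
 * and a struct kvmi_mem_map with token/gpa/gva members.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/types.h>
#include <linux/kvmi.h>

static void *map_remote_gpa(int fd, const struct kvmi_map_mem_token *token,
			    __u64 req_gpa)
{
	struct kvmi_mem_map map;
	void *gva;

	/* anonymous page whose mapping will be replaced by the remote one */
	gva = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (gva == MAP_FAILED)
		return NULL;
	/* touch it so the kernel side finds a real (non zero-pfn) page */
	memset(gva, 0, 4096);

	map.token = *token;			/* from KVMI_GET_MAP_TOKEN */
	map.gpa = req_gpa;			/* gpa in the introspected VM */
	map.gva = (__u64)(unsigned long)gva;	/* local gva to back with it */

	if (ioctl(fd, KVM_INTRO_MEM_MAP, &map) < 0) {
		munmap(gva, 4096);
		return NULL;
	}
	return gva;
}

static void unmap_remote_gpa(int fd, void *gva)
{
	/* KVM_INTRO_MEM_UNMAP takes the local gva itself as the argument */
	ioctl(fd, KVM_INTRO_MEM_UNMAP, (unsigned long)gva);
	munmap(gva, 4096);
}

The fd here would come from open("/dev/kvmmem", O_RDWR); error handling and the KVMI_GET_MAP_TOKEN exchange with the host are left out.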

Comments

Patrick Colp Dec. 21, 2017, 9:17 p.m. UTC | #1
On 2017-12-18 02:06 PM, Adalbert Lazăr wrote:
> From: Adalbert Lazar <alazar@bitdefender.com>
> 
> An introspection tool running in a dedicated VM can use the new device
> (/dev/kvmmem) to map memory from other introspected VM-s.
> 
> Two ioctl operations are supported:
>    - KVM_INTRO_MEM_MAP/struct kvmi_mem_map
>    - KVM_INTRO_MEM_UNMAP/unsigned long
> 
> In order to map an introspected gpa to the local gva, the process using
> this device needs to obtain a token from the host KVMI subsystem (see
> Documentation/virtual/kvm/kvmi.rst - KVMI_GET_MAP_TOKEN).
> 
> Both operations use hypercalls (KVM_HC_MEM_MAP, KVM_HC_MEM_UNMAP)
> to pass the requests to the host kernel/KVMI (see hypercalls.txt).
> 
> Signed-off-by: Mircea Cîrjaliu <mcirjaliu@bitdefender.com>
> ---
>   arch/x86/Kconfig                  |   9 +
>   arch/x86/include/asm/kvmi_guest.h |  10 +
>   arch/x86/kernel/Makefile          |   1 +
>   arch/x86/kernel/kvmi_mem_guest.c  |  26 +++
>   virt/kvm/kvmi_mem_guest.c         | 379 ++++++++++++++++++++++++++++++++++++++
>   5 files changed, 425 insertions(+)
>   create mode 100644 arch/x86/include/asm/kvmi_guest.h
>   create mode 100644 arch/x86/kernel/kvmi_mem_guest.c
>   create mode 100644 virt/kvm/kvmi_mem_guest.c
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 8eed3f94bfc7..6e2548f4d44c 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -782,6 +782,15 @@ config KVM_DEBUG_FS
>   	  Statistics are displayed in debugfs filesystem. Enabling this option
>   	  may incur significant overhead.
>   
> +config KVMI_MEM_GUEST
> +	bool "KVM Memory Introspection support on Guest"
> +	depends on KVM_GUEST
> +	default n
> +	---help---
> +	  This option enables functions and hypercalls for security applications
> +	  running in a separate VM to control the execution of other VM-s, query
> +	  the state of the vCPU-s (GPR-s, MSR-s etc.).
> +
>   config PARAVIRT_TIME_ACCOUNTING
>   	bool "Paravirtual steal time accounting"
>   	depends on PARAVIRT
> diff --git a/arch/x86/include/asm/kvmi_guest.h b/arch/x86/include/asm/kvmi_guest.h
> new file mode 100644
> index 000000000000..c7ed53a938e0
> --- /dev/null
> +++ b/arch/x86/include/asm/kvmi_guest.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __KVMI_GUEST_H__
> +#define __KVMI_GUEST_H__
> +
> +long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
> +	gpa_t req_gpa, gpa_t map_gpa);
> +long kvmi_arch_unmap_hc(gpa_t map_gpa);
> +
> +
> +#endif
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 81bb565f4497..fdb54b65e46e 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
>   obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
>   obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
>   obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
> +obj-$(CONFIG_KVMI_MEM_GUEST)	+= kvmi_mem_guest.o ../../../virt/kvm/kvmi_mem_guest.o
>   
>   obj-$(CONFIG_EISA)		+= eisa.o
>   obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
> diff --git a/arch/x86/kernel/kvmi_mem_guest.c b/arch/x86/kernel/kvmi_mem_guest.c
> new file mode 100644
> index 000000000000..c4e2613f90f3
> --- /dev/null
> +++ b/arch/x86/kernel/kvmi_mem_guest.c
> @@ -0,0 +1,26 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * KVM introspection guest implementation
> + *
> + * Copyright (C) 2017 Bitdefender S.R.L.
> + *
> + * Author:
> + *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>
> + */
> +
> +#include <uapi/linux/kvmi.h>
> +#include <uapi/linux/kvm_para.h>
> +#include <linux/kvm_types.h>
> +#include <asm/kvm_para.h>
> +
> +long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
> +		       gpa_t req_gpa, gpa_t map_gpa)
> +{
> +	return kvm_hypercall3(KVM_HC_MEM_MAP, (unsigned long)tknp,
> +			      req_gpa, map_gpa);
> +}
> +
> +long kvmi_arch_unmap_hc(gpa_t map_gpa)
> +{
> +	return kvm_hypercall1(KVM_HC_MEM_UNMAP, map_gpa);
> +}
> diff --git a/virt/kvm/kvmi_mem_guest.c b/virt/kvm/kvmi_mem_guest.c
> new file mode 100644
> index 000000000000..118c22ca47c5
> --- /dev/null
> +++ b/virt/kvm/kvmi_mem_guest.c
> @@ -0,0 +1,379 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * KVM introspection guest implementation
> + *
> + * Copyright (C) 2017 Bitdefender S.R.L.
> + *
> + * Author:
> + *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>
> + */
> +
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/miscdevice.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/types.h>
> +#include <linux/kvm_host.h>
> +#include <linux/kvm_para.h>
> +#include <linux/uaccess.h>
> +#include <linux/slab.h>
> +#include <linux/rmap.h>
> +#include <linux/sched.h>
> +#include <linux/list.h>
> +#include <linux/spinlock.h>
> +#include <linux/mutex.h>
> +#include <uapi/linux/kvmi.h>
> +#include <asm/kvmi_guest.h>
> +
> +#define ASSERT(exp) BUG_ON(!(exp))
> +
> +
> +static struct list_head file_list;
> +static spinlock_t file_lock;
> +
> +struct file_map {
> +	struct list_head file_list;
> +	struct file *file;
> +	struct list_head map_list;
> +	struct mutex lock;
> +	int active;	/* for tearing down */
> +};
> +
> +struct page_map {
> +	struct list_head map_list;
> +	__u64 gpa;
> +	unsigned long vaddr;
> +	unsigned long paddr;
> +};
> +
> +
> +static int kvm_dev_open(struct inode *inodep, struct file *filp)
> +{
> +	struct file_map *fmp;
> +
> +	pr_debug("kvmi: file %016lx opened by process %s\n",
> +		 (unsigned long) filp, current->comm);
> +
> +	/* link the file 1:1 with such a structure */
> +	fmp = kmalloc(sizeof(struct file_map), GFP_KERNEL);

I think this is supposed to be "kmalloc(sizeof(*fmp), GFP_KERNEL)".

> +	if (fmp == NULL)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&fmp->file_list);
> +	fmp->file = filp;
> +	filp->private_data = fmp;
> +	INIT_LIST_HEAD(&fmp->map_list);
> +	mutex_init(&fmp->lock);
> +	fmp->active = 1;
> +
> +	/* add the entry to the global list */
> +	spin_lock(&file_lock);
> +	list_add_tail(&fmp->file_list, &file_list);
> +	spin_unlock(&file_lock);
> +
> +	return 0;
> +}
> +
> +/* actually does the mapping of a page */
> +static long _do_mapping(struct kvmi_mem_map *map_req, struct page_map *pmap)

Here you have a "struct page_map" and call it pmap. However, in the rest 
of the code, whenever there's a "struct page_map" it's called pmp. It 
seems that it would be good to stay consistent with the naming, so 
perhaps rename it here to pmp as well?


> +{
> +	unsigned long paddr;
> +	struct vm_area_struct *vma = NULL;
> +	struct page *page;

Out of curiosity, why do you set "*vma = NULL" but not "*page = NULL"?

> +	long result;
> +
> +	pr_debug("kvmi: mapping remote GPA %016llx into %016llx\n",
> +		 map_req->gpa, map_req->gva);
> +
> +	/* check access to memory location */
> +	if (!access_ok(VERIFY_READ, map_req->gva, PAGE_SIZE)) {
> +		pr_err("kvmi: invalid virtual address for mapping\n");
> +		return -EINVAL;
> +	}
> +
> +	down_read(&current->mm->mmap_sem);
> +
> +	/* find the page to be replaced */
> +	vma = find_vma(current->mm, map_req->gva);
> +	if (IS_ERR_OR_NULL(vma)) {
> +		result = PTR_ERR(vma);
> +		pr_err("kvmi: find_vma() failed with result %ld\n", result);
> +		goto out;
> +	}
> +
> +	page = follow_page(vma, map_req->gva, 0);
> +	if (IS_ERR_OR_NULL(page)) {
> +		result = PTR_ERR(page);
> +		pr_err("kvmi: follow_page() failed with result %ld\n", result);
> +		goto out;
> +	}
> +
> +	if (IS_ENABLED(CONFIG_DEBUG_VM))
> +		dump_page(page, "page to map_req into");
> +
> +	WARN(is_zero_pfn(page_to_pfn(page)), "zero-page still mapped");
> +
> +	/* get the physical address and store it in page_map */
> +	paddr = page_to_phys(page);
> +	pr_debug("kvmi: page phys addr %016lx\n", paddr);
> +	pmap->paddr = paddr;
> +
> +	/* last thing to do is host mapping */
> +	result = kvmi_arch_map_hc(&map_req->token, map_req->gpa, paddr);
> +	if (IS_ERR_VALUE(result)) {
> +		pr_err("kvmi: HC failed with result %ld\n", result);
> +		goto out;
> +	}
> +
> +out:
> +	up_read(&current->mm->mmap_sem);
> +
> +	return result;
> +}
> +
> +/* actually does the unmapping of a page */
> +static long _do_unmapping(unsigned long paddr)
> +{
> +	long result;
> +
> +	pr_debug("kvmi: unmapping request for phys addr %016lx\n", paddr);
> +
> +	/* local GPA uniquely identifies the mapping on the host */
> +	result = kvmi_arch_unmap_hc(paddr);
> +	if (IS_ERR_VALUE(result))
> +		pr_warn("kvmi: HC failed with result %ld\n", result);
> +
> +	return result;
> +}
> +
> +static long kvm_dev_ioctl_map(struct file_map *fmp, struct kvmi_mem_map *map)
> +{
> +	struct page_map *pmp;
> +	long result = 0;

Out of curiosity again, why do you set "result = 0" here when it's 
always set before used (and, for e.g., _do_unmapping() doesn't do 
"result = 0")?

> +
> +	if (!access_ok(VERIFY_READ, map->gva, PAGE_SIZE))
> +		return -EINVAL;
> +	if (!access_ok(VERIFY_WRITE, map->gva, PAGE_SIZE))
> +		return -EINVAL;
> +
> +	/* prepare list entry */
> +	pmp = kmalloc(sizeof(struct page_map), GFP_KERNEL);

This should also probably be "kmalloc(sizeof(*pmp), GFP_KERNEL)".

> +	if (pmp == NULL)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&pmp->map_list);
> +	pmp->gpa = map->gpa;
> +	pmp->vaddr = map->gva;
> +
> +	/* acquire the file mapping */
> +	mutex_lock(&fmp->lock);
> +
> +	/* check if other thread is closing the file */
> +	if (!fmp->active) {
> +		result = -ENODEV;
> +		pr_warn("kvmi: unable to map, file is being closed\n");
> +		goto out_err;
> +	}
> +
> +	/* do the actual mapping */
> +	result = _do_mapping(map, pmp);
> +	if (IS_ERR_VALUE(result))
> +		goto out_err;
> +
> +	/* link to list */
> +	list_add_tail(&pmp->map_list, &fmp->map_list);
> +
> +	/* all fine */
> +	result = 0;
> +	goto out_finalize;
> +
> +out_err:
> +	kfree(pmp);
> +
> +out_finalize:
> +	mutex_unlock(&fmp->lock);
> +
> +	return result;
> +}
> +
> +static long kvm_dev_ioctl_unmap(struct file_map *fmp, unsigned long vaddr)
> +{
> +	struct list_head *cur;
> +	struct page_map *pmp;
> +	bool found = false;
> +
> +	/* acquire the file */
> +	mutex_lock(&fmp->lock);
> +
> +	/* check if other thread is closing the file */
> +	if (!fmp->active) {
> +		mutex_unlock(&fmp->lock);

Wouldn't this be better replaced with a "goto out_err" like in 
kvm_dev_ioctl_map()?

> +		pr_warn("kvmi: unable to unmap, file is being closed\n");
> +		return -ENODEV;
> +	}
> +
> +	/* check that this address belongs to us */
> +	list_for_each(cur, &fmp->map_list) {
> +		pmp = list_entry(cur, struct page_map, map_list);
> +
> +		/* found */
> +		if (pmp->vaddr == vaddr) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	/* not found ? */
> +	if (!found) {
> +		mutex_unlock(&fmp->lock);

Here too: "goto out_err".

> +		pr_err("kvmi: address %016lx not mapped\n", vaddr);
> +		return -ENOENT;
> +	}
> +
> +	/* decouple guest mapping */
> +	list_del(&pmp->map_list);
> +	mutex_unlock(&fmp->lock);

In kvm_dev_ioctl_map(), the fmp mutex is held across the _do_mapping() 
call. Is there any particular reason why here the mutex doesn't need to 
be held across the _do_unmapping() call? Or was that more an artifact of 
having a common "out_err" exit in kvm_dev_ioctl_map()?

> +
> +	/* unmap & ignore result */
> +	_do_unmapping(pmp->paddr);
> +
> +	/* free guest mapping */
> +	kfree(pmp);
> +
> +	return 0;
> +}
> +
> +static long kvm_dev_ioctl(struct file *filp,
> +			  unsigned int ioctl, unsigned long arg)
> +{
> +	void __user *argp = (void __user *) arg;
> +	struct file_map *fmp;
> +	long result;
> +
> +	/* minor check */
> +	fmp = filp->private_data;
> +	ASSERT(fmp->file == filp);
> +
> +	switch (ioctl) {
> +	case KVM_INTRO_MEM_MAP: {
> +		struct kvmi_mem_map map;
> +
> +		result = -EFAULT;
> +		if (copy_from_user(&map, argp, sizeof(map)))
> +			break;
> +
> +		result = kvm_dev_ioctl_map(fmp, &map);
> +		if (IS_ERR_VALUE(result))
> +			break;
> +
> +		result = 0;
> +		break;
> +	}

Since kvm_dev_ioctl_map() either returns an error or 0, couldn't this 
just be reduced to:
		result = kvm_dev_ioctl_map(fmp, &map);
		break;
	}

> +	case KVM_INTRO_MEM_UNMAP: {
> +		unsigned long vaddr = (unsigned long) arg;
> +
> +		result = kvm_dev_ioctl_unmap(fmp, vaddr);
> +		if (IS_ERR_VALUE(result))
> +			break;
> +
> +		result = 0;
> +		break;
> +	}

Ditto here.

> +	default:
> +		pr_err("kvmi: ioctl %d not implemented\n", ioctl);
> +		result = -ENOTTY;
> +	}
> +
> +	return result;
> +}
> +
> +static int kvm_dev_release(struct inode *inodep, struct file *filp)
> +{
> +	int result = 0;

You set "result = 0" here, but result isn't used until the end, and just 
to return it.

> +	struct file_map *fmp;
> +	struct list_head *cur, *next;
> +	struct page_map *pmp;
> +
> +	pr_debug("kvmi: file %016lx closed by process %s\n",
> +		 (unsigned long) filp, current->comm);
> +
> +	/* acquire the file */
> +	fmp = filp->private_data;
> +	mutex_lock(&fmp->lock);
> +
> +	/* mark for teardown */
> +	fmp->active = 0;
> +
> +	/* release mappings taken on this instance of the file */
> +	list_for_each_safe(cur, next, &fmp->map_list) {
> +		pmp = list_entry(cur, struct page_map, map_list);
> +
> +		/* unmap address */
> +		_do_unmapping(pmp->paddr);
> +
> +		/* decouple & free guest mapping */
> +		list_del(&pmp->map_list);
> +		kfree(pmp);
> +	}
> +
> +	/* done processing this file mapping */
> +	mutex_unlock(&fmp->lock);
> +
> +	/* decouple file mapping */
> +	spin_lock(&file_lock);
> +	list_del(&fmp->file_list);
> +	spin_unlock(&file_lock);
> +
> +	/* free it */
> +	kfree(fmp);
> +
> +	return result;

This is the first time result is used. Couldn't this just be replaced 
with "return 0"?

> +}
> +
> +
> +static const struct file_operations kvmmem_ops = {
> +	.open		= kvm_dev_open,
> +	.unlocked_ioctl = kvm_dev_ioctl,
> +	.compat_ioctl   = kvm_dev_ioctl,
> +	.release	= kvm_dev_release,
> +};

Here you have all the rvals aligned...

> +
> +static struct miscdevice kvm_mem_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "kvmmem",
> +	.fops = &kvmmem_ops,
> +};

...but here you don't. I'm not sure what the "proper" style is, but I 
think it should at least just be consistent.

> +
> +int __init kvm_intro_guest_init(void)
> +{
> +	int result;
> +
> +	if (!kvm_para_available()) {
> +		pr_err("kvmi: paravirt not available\n");
> +		return -EPERM;
> +	}
> +
> +	result = misc_register(&kvm_mem_dev);
> +	if (result) {
> +		pr_err("kvmi: misc device register failed: %d\n", result);
> +		return result;
> +	}
> +
> +	INIT_LIST_HEAD(&file_list);
> +	spin_lock_init(&file_lock);
> +
> +	pr_info("kvmi: guest introspection device created\n");
> +
> +	return 0;
> +}
> +
> +void kvm_intro_guest_exit(void)
> +{
> +	misc_deregister(&kvm_mem_dev);
> +}
> +
> +module_init(kvm_intro_guest_init)
> +module_exit(kvm_intro_guest_exit)
> 


Patrick
Mircea CIRJALIU - MELIU Dec. 22, 2017, 10:44 a.m. UTC | #2
Fixed all the issues addressed in the original comments + more.
Please see inline answers.

-----Original Message-----
From: Patrick Colp [mailto:patrick.colp@oracle.com]
Sent: Thursday, 21 December 2017 23:17
To: Adalbert Lazăr <alazar@bitdefender.com>; kvm@vger.kernel.org
Cc: linux-mm@kvack.org; Paolo Bonzini <pbonzini@redhat.com>; Radim Krčmář <rkrcmar@redhat.com>; Xiao Guangrong <guangrong.xiao@linux.intel.com>; Mihai Donțu <mdontu@bitdefender.com>; Mircea CIRJALIU-MELIU <mcirjaliu@bitdefender.com>
Subject: Re: [RFC PATCH v4 02/18] add memory map/unmap support for VM introspection on the guest side

On 2017-12-18 02:06 PM, Adalbert Lazăr wrote:
> From: Adalbert Lazar <alazar@bitdefender.com>

> 

> An introspection tool running in a dedicated VM can use the new device

> (/dev/kvmmem) to map memory from other introspected VM-s.

> 

> Two ioctl operations are supported:

>    - KVM_INTRO_MEM_MAP/struct kvmi_mem_map

>    - KVM_INTRO_MEM_UNMAP/unsigned long

> 

> In order to map an introspected gpa to the local gva, the process 

> using this device needs to obtain a token from the host KVMI subsystem 

> (see Documentation/virtual/kvm/kvmi.rst - KVMI_GET_MAP_TOKEN).

> 

> Both operations use hypercalls (KVM_HC_MEM_MAP, KVM_HC_MEM_UNMAP)

> to pass the requests to the host kernel/KVMI (see hypercalls.txt).

> 

> Signed-off-by: Mircea Cîrjaliu <mcirjaliu@bitdefender.com>

> ---

>   arch/x86/Kconfig                  |   9 +

>   arch/x86/include/asm/kvmi_guest.h |  10 +

>   arch/x86/kernel/Makefile          |   1 +

>   arch/x86/kernel/kvmi_mem_guest.c  |  26 +++

>   virt/kvm/kvmi_mem_guest.c         | 379 ++++++++++++++++++++++++++++++++++++++

>   5 files changed, 425 insertions(+)

>   create mode 100644 arch/x86/include/asm/kvmi_guest.h

>   create mode 100644 arch/x86/kernel/kvmi_mem_guest.c

>   create mode 100644 virt/kvm/kvmi_mem_guest.c

> 

> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 

> 8eed3f94bfc7..6e2548f4d44c 100644

> --- a/arch/x86/Kconfig

> +++ b/arch/x86/Kconfig

> @@ -782,6 +782,15 @@ config KVM_DEBUG_FS

>   	  Statistics are displayed in debugfs filesystem. Enabling this option

>   	  may incur significant overhead.

>   

> +config KVMI_MEM_GUEST

> +	bool "KVM Memory Introspection support on Guest"

> +	depends on KVM_GUEST

> +	default n

> +	---help---

> +	  This option enables functions and hypercalls for security applications

> +	  running in a separate VM to control the execution of other VM-s, query

> +	  the state of the vCPU-s (GPR-s, MSR-s etc.).

> +

>   config PARAVIRT_TIME_ACCOUNTING

>   	bool "Paravirtual steal time accounting"

>   	depends on PARAVIRT

> diff --git a/arch/x86/include/asm/kvmi_guest.h 

> b/arch/x86/include/asm/kvmi_guest.h

> new file mode 100644

> index 000000000000..c7ed53a938e0

> --- /dev/null

> +++ b/arch/x86/include/asm/kvmi_guest.h

> @@ -0,0 +1,10 @@

> +/* SPDX-License-Identifier: GPL-2.0 */

> +#ifndef __KVMI_GUEST_H__

> +#define __KVMI_GUEST_H__

> +

> +long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,

> +	gpa_t req_gpa, gpa_t map_gpa);

> +long kvmi_arch_unmap_hc(gpa_t map_gpa);

> +

> +

> +#endif

> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 

> 81bb565f4497..fdb54b65e46e 100644

> --- a/arch/x86/kernel/Makefile

> +++ b/arch/x86/kernel/Makefile

> @@ -111,6 +111,7 @@ obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o

>   obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o

>   obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o

>   obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o

> +obj-$(CONFIG_KVMI_MEM_GUEST)	+= kvmi_mem_guest.o ../../../virt/kvm/kvmi_mem_guest.o

>   

>   obj-$(CONFIG_EISA)		+= eisa.o

>   obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o

> diff --git a/arch/x86/kernel/kvmi_mem_guest.c 

> b/arch/x86/kernel/kvmi_mem_guest.c

> new file mode 100644

> index 000000000000..c4e2613f90f3

> --- /dev/null

> +++ b/arch/x86/kernel/kvmi_mem_guest.c

> @@ -0,0 +1,26 @@

> +// SPDX-License-Identifier: GPL-2.0

> +/*

> + * KVM introspection guest implementation

> + *

> + * Copyright (C) 2017 Bitdefender S.R.L.

> + *

> + * Author:

> + *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>

> + */

> +

> +#include <uapi/linux/kvmi.h>

> +#include <uapi/linux/kvm_para.h>

> +#include <linux/kvm_types.h>

> +#include <asm/kvm_para.h>

> +

> +long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,

> +		       gpa_t req_gpa, gpa_t map_gpa)

> +{

> +	return kvm_hypercall3(KVM_HC_MEM_MAP, (unsigned long)tknp,

> +			      req_gpa, map_gpa);

> +}

> +

> +long kvmi_arch_unmap_hc(gpa_t map_gpa)

> +{

> +	return kvm_hypercall1(KVM_HC_MEM_UNMAP, map_gpa);

> +}

> diff --git a/virt/kvm/kvmi_mem_guest.c b/virt/kvm/kvmi_mem_guest.c new 

> file mode 100644 index 000000000000..118c22ca47c5

> --- /dev/null

> +++ b/virt/kvm/kvmi_mem_guest.c

> @@ -0,0 +1,379 @@

> +// SPDX-License-Identifier: GPL-2.0

> +/*

> + * KVM introspection guest implementation

> + *

> + * Copyright (C) 2017 Bitdefender S.R.L.

> + *

> + * Author:

> + *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>

> + */

> +

> +#include <linux/module.h>

> +#include <linux/init.h>

> +#include <linux/kernel.h>

> +#include <linux/miscdevice.h>

> +#include <linux/fs.h>

> +#include <linux/mm.h>

> +#include <linux/mman.h>

> +#include <linux/types.h>

> +#include <linux/kvm_host.h>

> +#include <linux/kvm_para.h>

> +#include <linux/uaccess.h>

> +#include <linux/slab.h>

> +#include <linux/rmap.h>

> +#include <linux/sched.h>

> +#include <linux/list.h>

> +#include <linux/spinlock.h>

> +#include <linux/mutex.h>

> +#include <uapi/linux/kvmi.h>

> +#include <asm/kvmi_guest.h>

> +

> +#define ASSERT(exp) BUG_ON(!(exp))

> +

> +

> +static struct list_head file_list;

> +static spinlock_t file_lock;

> +

> +struct file_map {

> +	struct list_head file_list;

> +	struct file *file;

> +	struct list_head map_list;

> +	struct mutex lock;

> +	int active;	/* for tearing down */

> +};

> +

> +struct page_map {

> +	struct list_head map_list;

> +	__u64 gpa;

> +	unsigned long vaddr;

> +	unsigned long paddr;

> +};

> +

> +

> +static int kvm_dev_open(struct inode *inodep, struct file *filp)

> +{

> +	struct file_map *fmp;

> +

> +	pr_debug("kvmi: file %016lx opened by process %s\n",

> +		 (unsigned long) filp, current->comm);

> +

> +	/* link the file 1:1 with such a structure */

> +	fmp = kmalloc(sizeof(struct file_map), GFP_KERNEL);


I think this is supposed to be "kmalloc(sizeof(*fmp), GFP_KERNEL)".

Fixed all the kmalloc()s in kvmi_mem* files.

> +	if (fmp == NULL)

> +		return -ENOMEM;

> +

> +	INIT_LIST_HEAD(&fmp->file_list);

> +	fmp->file = filp;

> +	filp->private_data = fmp;

> +	INIT_LIST_HEAD(&fmp->map_list);

> +	mutex_init(&fmp->lock);

> +	fmp->active = 1;

> +

> +	/* add the entry to the global list */

> +	spin_lock(&file_lock);

> +	list_add_tail(&fmp->file_list, &file_list);

> +	spin_unlock(&file_lock);

> +

> +	return 0;

> +}

> +

> +/* actually does the mapping of a page */

> +static long _do_mapping(struct kvmi_mem_map *map_req, struct page_map *pmap)


Here you have a "struct page_map" and call it pmap. However, in the rest of the code, whenever there's a "struct page_map" it's called pmp. It seems that it would be good to stay consistent with the naming, so perhaps rename it here to pmp as well?

Done.

> +{

> +	unsigned long paddr;

> +	struct vm_area_struct *vma = NULL;

> +	struct page *page;


Out of curiosity, why do you set "*vma = NULL" but not "*page = NULL"?

Leftover from older code. Removed.

> +	long result;

> +

> +	pr_debug("kvmi: mapping remote GPA %016llx into %016llx\n",

> +		 map_req->gpa, map_req->gva);

> +

> +	/* check access to memory location */

> +	if (!access_ok(VERIFY_READ, map_req->gva, PAGE_SIZE)) {

> +		pr_err("kvmi: invalid virtual address for mapping\n");

> +		return -EINVAL;

> +	}

> +

> +	down_read(&current->mm->mmap_sem);

> +

> +	/* find the page to be replaced */

> +	vma = find_vma(current->mm, map_req->gva);

> +	if (IS_ERR_OR_NULL(vma)) {

> +		result = PTR_ERR(vma);

> +		pr_err("kvmi: find_vma() failed with result %ld\n", result);

> +		goto out;

> +	}

> +

> +	page = follow_page(vma, map_req->gva, 0);

> +	if (IS_ERR_OR_NULL(page)) {

> +		result = PTR_ERR(page);

> +		pr_err("kvmi: follow_page() failed with result %ld\n", result);

> +		goto out;

> +	}

> +

> +	if (IS_ENABLED(CONFIG_DEBUG_VM))

> +		dump_page(page, "page to map_req into");

> +

> +	WARN(is_zero_pfn(page_to_pfn(page)), "zero-page still mapped");

> +

> +	/* get the physical address and store it in page_map */

> +	paddr = page_to_phys(page);

> +	pr_debug("kvmi: page phys addr %016lx\n", paddr);

> +	pmap->paddr = paddr;

> +

> +	/* last thing to do is host mapping */

> +	result = kvmi_arch_map_hc(&map_req->token, map_req->gpa, paddr);

> +	if (IS_ERR_VALUE(result)) {

> +		pr_err("kvmi: HC failed with result %ld\n", result);

> +		goto out;

> +	}

> +

> +out:

> +	up_read(&current->mm->mmap_sem);

> +

> +	return result;

> +}

> +

> +/* actually does the unmapping of a page */

> +static long _do_unmapping(unsigned long paddr)

> +{

> +	long result;

> +

> +	pr_debug("kvmi: unmapping request for phys addr %016lx\n", paddr);

> +

> +	/* local GPA uniquely identifies the mapping on the host */

> +	result = kvmi_arch_unmap_hc(paddr);

> +	if (IS_ERR_VALUE(result))

> +		pr_warn("kvmi: HC failed with result %ld\n", result);

> +

> +	return result;

> +}

> +

> +static long kvm_dev_ioctl_map(struct file_map *fmp, struct kvmi_mem_map *map)

> +{

> +	struct page_map *pmp;

> +	long result = 0;


Out of curiosity again, why do you set "result = 0" here when it's always set before used (and, for e.g., _do_unmapping() doesn't do "result = 0")?

Same as above. Cleaned up unnecessary assignments to result in the rest of the code.

> +

> +	if (!access_ok(VERIFY_READ, map->gva, PAGE_SIZE))

> +		return -EINVAL;

> +	if (!access_ok(VERIFY_WRITE, map->gva, PAGE_SIZE))

> +		return -EINVAL;

> +

> +	/* prepare list entry */

> +	pmp = kmalloc(sizeof(struct page_map), GFP_KERNEL);


This should also probably be "kmalloc(sizeof(*pmp), GFP_KERNEL)".

Fixed.

> +	if (pmp == NULL)

> +		return -ENOMEM;

> +

> +	INIT_LIST_HEAD(&pmp->map_list);

> +	pmp->gpa = map->gpa;

> +	pmp->vaddr = map->gva;

> +

> +	/* acquire the file mapping */

> +	mutex_lock(&fmp->lock);

> +

> +	/* check if other thread is closing the file */

> +	if (!fmp->active) {

> +		result = -ENODEV;

> +		pr_warn("kvmi: unable to map, file is being closed\n");

> +		goto out_err;

> +	}

> +

> +	/* do the actual mapping */

> +	result = _do_mapping(map, pmp);

> +	if (IS_ERR_VALUE(result))

> +		goto out_err;

> +

> +	/* link to list */

> +	list_add_tail(&pmp->map_list, &fmp->map_list);

> +

> +	/* all fine */

> +	result = 0;

> +	goto out_finalize;

> +

> +out_err:

> +	kfree(pmp);

> +

> +out_finalize:

> +	mutex_unlock(&fmp->lock);

> +

> +	return result;

> +}

> +

> +static long kvm_dev_ioctl_unmap(struct file_map *fmp, unsigned long vaddr)

> +{

> +	struct list_head *cur;

> +	struct page_map *pmp;

> +	bool found = false;

> +

> +	/* acquire the file */

> +	mutex_lock(&fmp->lock);

> +

> +	/* check if other thread is closing the file */

> +	if (!fmp->active) {

> +		mutex_unlock(&fmp->lock);


Wouldn't this be better replaced with a "goto out_err" like in kvm_dev_ioctl_map()?

Not really. I used the out_err/out_finalize recovery model for cases where all the action happens inside a critical section and the lock has to be released at the end.
In this case, more of the work can be done outside the lock.
But I refactored it anyway for the sake of consistency.

> +		pr_warn("kvmi: unable to unmap, file is being closed\n");

> +		return -ENODEV;

> +	}

> +

> +	/* check that this address belongs to us */

> +	list_for_each(cur, &fmp->map_list) {

> +		pmp = list_entry(cur, struct page_map, map_list);

> +

> +		/* found */

> +		if (pmp->vaddr == vaddr) {

> +			found = true;

> +			break;

> +		}

> +	}

> +

> +	/* not found ? */

> +	if (!found) {

> +		mutex_unlock(&fmp->lock);


Here too: "goto out_err".

Fixed for both cases. Only 1 call to mutex_unlock() done now.

> +		pr_err("kvmi: address %016lx not mapped\n", vaddr);

> +		return -ENOENT;

> +	}

> +

> +	/* decouple guest mapping */

> +	list_del(&pmp->map_list);

> +	mutex_unlock(&fmp->lock);


In kvm_dev_ioctl_map(), the fmp mutex is held across the _do_mapping() call. Is there any particular reason why here the mutex doesn't need to be held across the _do_unmapping() call? Or was that more an artifact of having a common "out_err" exit in kvm_dev_ioctl_map()?

The fmp mutex:
1. protects the fmp list against concurrent access.
2. protects against teardown (one thread tries to do a mapping while another closes the file).
The call to _do_mapping(), which can fail, must be done inside the critical section before we add a valid pmp entry to the list.
On the other hand, inside kvm_dev_ioctl_unmap() we must extract a valid pmp entry from the list before calling _do_unmapping().
There is no real reason for protecting the _do_mapping() call, but I chose not to revert the mapping in case I hit the teardown case.
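
For reference, here is a sketch of the refactored kvm_dev_ioctl_unmap() implied by the answers above: a single mutex_unlock() on the exit path, with the hypercall and kfree() still done outside the critical section. This illustrates the reviewer-suggested shape, not necessarily the exact code that was respun.

static long kvm_dev_ioctl_unmap(struct file_map *fmp, unsigned long vaddr)
{
	struct page_map *pmp;
	bool found = false;
	long result = 0;

	/* acquire the file */
	mutex_lock(&fmp->lock);

	/* check if other thread is closing the file */
	if (!fmp->active) {
		pr_warn("kvmi: unable to unmap, file is being closed\n");
		result = -ENODEV;
		goto out_unlock;
	}

	/* check that this address belongs to us */
	list_for_each_entry(pmp, &fmp->map_list, map_list) {
		if (pmp->vaddr == vaddr) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_err("kvmi: address %016lx not mapped\n", vaddr);
		result = -ENOENT;
		goto out_unlock;
	}

	/* decouple guest mapping while still holding the lock */
	list_del(&pmp->map_list);

out_unlock:
	mutex_unlock(&fmp->lock);

	if (result)
		return result;

	/* unmap & free outside the critical section, as before */
	_do_unmapping(pmp->paddr);
	kfree(pmp);

	return 0;
}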

> +

> +	/* unmap & ignore result */

> +	_do_unmapping(pmp->paddr);

> +

> +	/* free guest mapping */

> +	kfree(pmp);

> +

> +	return 0;

> +}

> +

> +static long kvm_dev_ioctl(struct file *filp,

> +			  unsigned int ioctl, unsigned long arg)

> +{

> +	void __user *argp = (void __user *) arg;

> +	struct file_map *fmp;

> +	long result;

> +

> +	/* minor check */

> +	fmp = filp->private_data;

> +	ASSERT(fmp->file == filp);

> +

> +	switch (ioctl) {

> +	case KVM_INTRO_MEM_MAP: {

> +		struct kvmi_mem_map map;

> +

> +		result = -EFAULT;

> +		if (copy_from_user(&map, argp, sizeof(map)))

> +			break;

> +

> +		result = kvm_dev_ioctl_map(fmp, &map);

> +		if (IS_ERR_VALUE(result))

> +			break;

> +

> +		result = 0;

> +		break;

> +	}


Since kvm_dev_ioctl_map() either returns an error or 0, couldn't this just be reduced to:
		result = kvm_dev_ioctl_map(fmp, &map);
		break;
	}

Again, leftovers from older code. Also fixed that.

> +	case KVM_INTRO_MEM_UNMAP: {

> +		unsigned long vaddr = (unsigned long) arg;

> +

> +		result = kvm_dev_ioctl_unmap(fmp, vaddr);

> +		if (IS_ERR_VALUE(result))

> +			break;

> +

> +		result = 0;

> +		break;

> +	}


Ditto here.

Fixed.
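
Presumably the dispatch then collapses to something along these lines (a sketch based on the suggestion above, using fmp as the function actually names its variable):

	switch (ioctl) {
	case KVM_INTRO_MEM_MAP: {
		struct kvmi_mem_map map;

		result = -EFAULT;
		if (copy_from_user(&map, argp, sizeof(map)))
			break;

		result = kvm_dev_ioctl_map(fmp, &map);
		break;
	}
	case KVM_INTRO_MEM_UNMAP:
		result = kvm_dev_ioctl_unmap(fmp, (unsigned long)arg);
		break;
	default:
		pr_err("kvmi: ioctl %d not implemented\n", ioctl);
		result = -ENOTTY;
	}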

> +	default:

> +		pr_err("kvmi: ioctl %d not implemented\n", ioctl);

> +		result = -ENOTTY;

> +	}

> +

> +	return result;

> +}

> +

> +static int kvm_dev_release(struct inode *inodep, struct file *filp)

> +{

> +	int result = 0;


You set "result = 0" here, but result isn't used until the end, and just to return it.

Returned 0 directly.

> +	struct file_map *fmp;

> +	struct list_head *cur, *next;

> +	struct page_map *pmp;

> +

> +	pr_debug("kvmi: file %016lx closed by process %s\n",

> +		 (unsigned long) filp, current->comm);

> +

> +	/* acquire the file */

> +	fmp = filp->private_data;

> +	mutex_lock(&fmp->lock);

> +

> +	/* mark for teardown */

> +	fmp->active = 0;

> +

> +	/* release mappings taken on this instance of the file */

> +	list_for_each_safe(cur, next, &fmp->map_list) {

> +		pmp = list_entry(cur, struct page_map, map_list);

> +

> +		/* unmap address */

> +		_do_unmapping(pmp->paddr);

> +

> +		/* decouple & free guest mapping */

> +		list_del(&pmp->map_list);

> +		kfree(pmp);

> +	}

> +

> +	/* done processing this file mapping */

> +	mutex_unlock(&fmp->lock);

> +

> +	/* decouple file mapping */

> +	spin_lock(&file_lock);

> +	list_del(&fmp->file_list);

> +	spin_unlock(&file_lock);

> +

> +	/* free it */

> +	kfree(fmp);

> +

> +	return result;


This is the first time result is used. Couldn't this just be replaced with "return 0"?

Yes, it can.

> +}

> +

> +

> +static const struct file_operations kvmmem_ops = {

> +	.open		= kvm_dev_open,

> +	.unlocked_ioctl = kvm_dev_ioctl,

> +	.compat_ioctl   = kvm_dev_ioctl,

> +	.release	= kvm_dev_release,

> +};


Here you have all the rvals aligned...

> +

> +static struct miscdevice kvm_mem_dev = {

> +	.minor = MISC_DYNAMIC_MINOR,

> +	.name = "kvmmem",

> +	.fops = &kvmmem_ops,

> +};


...but here you don't. I'm not sure what the "proper" style is, but I think it should at least just be consistent.

Fixed.
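
Presumably ending up as something like:

static struct miscdevice kvm_mem_dev = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "kvmmem",
	.fops		= &kvmmem_ops,
};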

> +

> +int __init kvm_intro_guest_init(void)

> +{

> +	int result;

> +

> +	if (!kvm_para_available()) {

> +		pr_err("kvmi: paravirt not available\n");

> +		return -EPERM;

> +	}

> +

> +	result = misc_register(&kvm_mem_dev);

> +	if (result) {

> +		pr_err("kvmi: misc device register failed: %d\n", result);

> +		return result;

> +	}

> +

> +	INIT_LIST_HEAD(&file_list);

> +	spin_lock_init(&file_lock);

> +

> +	pr_info("kvmi: guest introspection device created\n");

> +

> +	return 0;

> +}

> +

> +void kvm_intro_guest_exit(void)

> +{

> +	misc_deregister(&kvm_mem_dev);

> +}

> +

> +module_init(kvm_intro_guest_init)

> +module_exit(kvm_intro_guest_exit)

> 



Patrick

Patrick Colp Dec. 22, 2017, 2:30 p.m. UTC | #3
*snip*

>> +		pr_err("kvmi: address %016lx not mapped\n", vaddr);
>> +		return -ENOENT;
>> +	}
>> +
>> +	/* decouple guest mapping */
>> +	list_del(&pmp->map_list);
>> +	mutex_unlock(&fmp->lock);
> 
> In kvm_dev_ioctl_map(), the fmp mutex is held across the _do_mapping() call. Is there any particular reason why here the mutex doesn't need to be held across the _do_unmapping() call? Or was that more an artifact of having a common "out_err" exit in kvm_dev_ioctl_map()?
> 
> The fmp mutex:
> 1. protects the fmp list against concurrent access.
> 2. protects against teardown (one thread tries to do a mapping while another closes the file).
> The call to _do_mapping() - which can fail, must be done inside the critical section before we add a valid pmp entry to the list.
> On the other hand, inside kvm_dev_ioctl_unmap() we must extract a valid pmp entry from the list before calling _do_unmapping().
> There is no real reason for protecting the _do_mapping() call, but I chose not to revert the mapping in case I hit the teardown case.
>

Gotcha. That makes sense.


Patrick
diff mbox

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..6e2548f4d44c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -782,6 +782,15 @@  config KVM_DEBUG_FS
 	  Statistics are displayed in debugfs filesystem. Enabling this option
 	  may incur significant overhead.
 
+config KVMI_MEM_GUEST
+	bool "KVM Memory Introspection support on Guest"
+	depends on KVM_GUEST
+	default n
+	---help---
+	  This option enables functions and hypercalls for security applications
+	  running in a separate VM to control the execution of other VM-s, query
+	  the state of the vCPU-s (GPR-s, MSR-s etc.).
+
 config PARAVIRT_TIME_ACCOUNTING
 	bool "Paravirtual steal time accounting"
 	depends on PARAVIRT
diff --git a/arch/x86/include/asm/kvmi_guest.h b/arch/x86/include/asm/kvmi_guest.h
new file mode 100644
index 000000000000..c7ed53a938e0
--- /dev/null
+++ b/arch/x86/include/asm/kvmi_guest.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVMI_GUEST_H__
+#define __KVMI_GUEST_H__
+
+long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
+	gpa_t req_gpa, gpa_t map_gpa);
+long kvmi_arch_unmap_hc(gpa_t map_gpa);
+
+
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 81bb565f4497..fdb54b65e46e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,6 +111,7 @@  obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_KVMI_MEM_GUEST)	+= kvmi_mem_guest.o ../../../virt/kvm/kvmi_mem_guest.o
 
 obj-$(CONFIG_EISA)		+= eisa.o
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
diff --git a/arch/x86/kernel/kvmi_mem_guest.c b/arch/x86/kernel/kvmi_mem_guest.c
new file mode 100644
index 000000000000..c4e2613f90f3
--- /dev/null
+++ b/arch/x86/kernel/kvmi_mem_guest.c
@@ -0,0 +1,26 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM introspection guest implementation
+ *
+ * Copyright (C) 2017 Bitdefender S.R.L.
+ *
+ * Author:
+ *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>
+ */
+
+#include <uapi/linux/kvmi.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/kvm_types.h>
+#include <asm/kvm_para.h>
+
+long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
+		       gpa_t req_gpa, gpa_t map_gpa)
+{
+	return kvm_hypercall3(KVM_HC_MEM_MAP, (unsigned long)tknp,
+			      req_gpa, map_gpa);
+}
+
+long kvmi_arch_unmap_hc(gpa_t map_gpa)
+{
+	return kvm_hypercall1(KVM_HC_MEM_UNMAP, map_gpa);
+}
diff --git a/virt/kvm/kvmi_mem_guest.c b/virt/kvm/kvmi_mem_guest.c
new file mode 100644
index 000000000000..118c22ca47c5
--- /dev/null
+++ b/virt/kvm/kvmi_mem_guest.c
@@ -0,0 +1,379 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM introspection guest implementation
+ *
+ * Copyright (C) 2017 Bitdefender S.R.L.
+ *
+ * Author:
+ *   Mircea Cirjaliu <mcirjaliu@bitdefender.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <uapi/linux/kvmi.h>
+#include <asm/kvmi_guest.h>
+
+#define ASSERT(exp) BUG_ON(!(exp))
+
+
+static struct list_head file_list;
+static spinlock_t file_lock;
+
+struct file_map {
+	struct list_head file_list;
+	struct file *file;
+	struct list_head map_list;
+	struct mutex lock;
+	int active;	/* for tearing down */
+};
+
+struct page_map {
+	struct list_head map_list;
+	__u64 gpa;
+	unsigned long vaddr;
+	unsigned long paddr;
+};
+
+
+static int kvm_dev_open(struct inode *inodep, struct file *filp)
+{
+	struct file_map *fmp;
+
+	pr_debug("kvmi: file %016lx opened by process %s\n",
+		 (unsigned long) filp, current->comm);
+
+	/* link the file 1:1 with such a structure */
+	fmp = kmalloc(sizeof(struct file_map), GFP_KERNEL);
+	if (fmp == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&fmp->file_list);
+	fmp->file = filp;
+	filp->private_data = fmp;
+	INIT_LIST_HEAD(&fmp->map_list);
+	mutex_init(&fmp->lock);
+	fmp->active = 1;
+
+	/* add the entry to the global list */
+	spin_lock(&file_lock);
+	list_add_tail(&fmp->file_list, &file_list);
+	spin_unlock(&file_lock);
+
+	return 0;
+}
+
+/* actually does the mapping of a page */
+static long _do_mapping(struct kvmi_mem_map *map_req, struct page_map *pmap)
+{
+	unsigned long paddr;
+	struct vm_area_struct *vma = NULL;
+	struct page *page;
+	long result;
+
+	pr_debug("kvmi: mapping remote GPA %016llx into %016llx\n",
+		 map_req->gpa, map_req->gva);
+
+	/* check access to memory location */
+	if (!access_ok(VERIFY_READ, map_req->gva, PAGE_SIZE)) {
+		pr_err("kvmi: invalid virtual address for mapping\n");
+		return -EINVAL;
+	}
+
+	down_read(&current->mm->mmap_sem);
+
+	/* find the page to be replaced */
+	vma = find_vma(current->mm, map_req->gva);
+	if (IS_ERR_OR_NULL(vma)) {
+		result = PTR_ERR(vma);
+		pr_err("kvmi: find_vma() failed with result %ld\n", result);
+		goto out;
+	}
+
+	page = follow_page(vma, map_req->gva, 0);
+	if (IS_ERR_OR_NULL(page)) {
+		result = PTR_ERR(page);
+		pr_err("kvmi: follow_page() failed with result %ld\n", result);
+		goto out;
+	}
+
+	if (IS_ENABLED(CONFIG_DEBUG_VM))
+		dump_page(page, "page to map_req into");
+
+	WARN(is_zero_pfn(page_to_pfn(page)), "zero-page still mapped");
+
+	/* get the physical address and store it in page_map */
+	paddr = page_to_phys(page);
+	pr_debug("kvmi: page phys addr %016lx\n", paddr);
+	pmap->paddr = paddr;
+
+	/* last thing to do is host mapping */
+	result = kvmi_arch_map_hc(&map_req->token, map_req->gpa, paddr);
+	if (IS_ERR_VALUE(result)) {
+		pr_err("kvmi: HC failed with result %ld\n", result);
+		goto out;
+	}
+
+out:
+	up_read(&current->mm->mmap_sem);
+
+	return result;
+}
+
+/* actually does the unmapping of a page */
+static long _do_unmapping(unsigned long paddr)
+{
+	long result;
+
+	pr_debug("kvmi: unmapping request for phys addr %016lx\n", paddr);
+
+	/* local GPA uniquely identifies the mapping on the host */
+	result = kvmi_arch_unmap_hc(paddr);
+	if (IS_ERR_VALUE(result))
+		pr_warn("kvmi: HC failed with result %ld\n", result);
+
+	return result;
+}
+
+static long kvm_dev_ioctl_map(struct file_map *fmp, struct kvmi_mem_map *map)
+{
+	struct page_map *pmp;
+	long result = 0;
+
+	if (!access_ok(VERIFY_READ, map->gva, PAGE_SIZE))
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, map->gva, PAGE_SIZE))
+		return -EINVAL;
+
+	/* prepare list entry */
+	pmp = kmalloc(sizeof(struct page_map), GFP_KERNEL);
+	if (pmp == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&pmp->map_list);
+	pmp->gpa = map->gpa;
+	pmp->vaddr = map->gva;
+
+	/* acquire the file mapping */
+	mutex_lock(&fmp->lock);
+
+	/* check if other thread is closing the file */
+	if (!fmp->active) {
+		result = -ENODEV;
+		pr_warn("kvmi: unable to map, file is being closed\n");
+		goto out_err;
+	}
+
+	/* do the actual mapping */
+	result = _do_mapping(map, pmp);
+	if (IS_ERR_VALUE(result))
+		goto out_err;
+
+	/* link to list */
+	list_add_tail(&pmp->map_list, &fmp->map_list);
+
+	/* all fine */
+	result = 0;
+	goto out_finalize;
+
+out_err:
+	kfree(pmp);
+
+out_finalize:
+	mutex_unlock(&fmp->lock);
+
+	return result;
+}
+
+static long kvm_dev_ioctl_unmap(struct file_map *fmp, unsigned long vaddr)
+{
+	struct list_head *cur;
+	struct page_map *pmp;
+	bool found = false;
+
+	/* acquire the file */
+	mutex_lock(&fmp->lock);
+
+	/* check if other thread is closing the file */
+	if (!fmp->active) {
+		mutex_unlock(&fmp->lock);
+		pr_warn("kvmi: unable to unmap, file is being closed\n");
+		return -ENODEV;
+	}
+
+	/* check that this address belongs to us */
+	list_for_each(cur, &fmp->map_list) {
+		pmp = list_entry(cur, struct page_map, map_list);
+
+		/* found */
+		if (pmp->vaddr == vaddr) {
+			found = true;
+			break;
+		}
+	}
+
+	/* not found ? */
+	if (!found) {
+		mutex_unlock(&fmp->lock);
+		pr_err("kvmi: address %016lx not mapped\n", vaddr);
+		return -ENOENT;
+	}
+
+	/* decouple guest mapping */
+	list_del(&pmp->map_list);
+	mutex_unlock(&fmp->lock);
+
+	/* unmap & ignore result */
+	_do_unmapping(pmp->paddr);
+
+	/* free guest mapping */
+	kfree(pmp);
+
+	return 0;
+}
+
+static long kvm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *) arg;
+	struct file_map *fmp;
+	long result;
+
+	/* minor check */
+	fmp = filp->private_data;
+	ASSERT(fmp->file == filp);
+
+	switch (ioctl) {
+	case KVM_INTRO_MEM_MAP: {
+		struct kvmi_mem_map map;
+
+		result = -EFAULT;
+		if (copy_from_user(&map, argp, sizeof(map)))
+			break;
+
+		result = kvm_dev_ioctl_map(fmp, &map);
+		if (IS_ERR_VALUE(result))
+			break;
+
+		result = 0;
+		break;
+	}
+	case KVM_INTRO_MEM_UNMAP: {
+		unsigned long vaddr = (unsigned long) arg;
+
+		result = kvm_dev_ioctl_unmap(fmp, vaddr);
+		if (IS_ERR_VALUE(result))
+			break;
+
+		result = 0;
+		break;
+	}
+	default:
+		pr_err("kvmi: ioctl %d not implemented\n", ioctl);
+		result = -ENOTTY;
+	}
+
+	return result;
+}
+
+static int kvm_dev_release(struct inode *inodep, struct file *filp)
+{
+	int result = 0;
+	struct file_map *fmp;
+	struct list_head *cur, *next;
+	struct page_map *pmp;
+
+	pr_debug("kvmi: file %016lx closed by process %s\n",
+		 (unsigned long) filp, current->comm);
+
+	/* acquire the file */
+	fmp = filp->private_data;
+	mutex_lock(&fmp->lock);
+
+	/* mark for teardown */
+	fmp->active = 0;
+
+	/* release mappings taken on this instance of the file */
+	list_for_each_safe(cur, next, &fmp->map_list) {
+		pmp = list_entry(cur, struct page_map, map_list);
+
+		/* unmap address */
+		_do_unmapping(pmp->paddr);
+
+		/* decouple & free guest mapping */
+		list_del(&pmp->map_list);
+		kfree(pmp);
+	}
+
+	/* done processing this file mapping */
+	mutex_unlock(&fmp->lock);
+
+	/* decouple file mapping */
+	spin_lock(&file_lock);
+	list_del(&fmp->file_list);
+	spin_unlock(&file_lock);
+
+	/* free it */
+	kfree(fmp);
+
+	return result;
+}
+
+
+static const struct file_operations kvmmem_ops = {
+	.open		= kvm_dev_open,
+	.unlocked_ioctl = kvm_dev_ioctl,
+	.compat_ioctl   = kvm_dev_ioctl,
+	.release	= kvm_dev_release,
+};
+
+static struct miscdevice kvm_mem_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "kvmmem",
+	.fops = &kvmmem_ops,
+};
+
+int __init kvm_intro_guest_init(void)
+{
+	int result;
+
+	if (!kvm_para_available()) {
+		pr_err("kvmi: paravirt not available\n");
+		return -EPERM;
+	}
+
+	result = misc_register(&kvm_mem_dev);
+	if (result) {
+		pr_err("kvmi: misc device register failed: %d\n", result);
+		return result;
+	}
+
+	INIT_LIST_HEAD(&file_list);
+	spin_lock_init(&file_lock);
+
+	pr_info("kvmi: guest introspection device created\n");
+
+	return 0;
+}
+
+void kvm_intro_guest_exit(void)
+{
+	misc_deregister(&kvm_mem_dev);
+}
+
+module_init(kvm_intro_guest_init)
+module_exit(kvm_intro_guest_exit)