diff mbox

[v6,11/12] KVM: introduce readonly memslot

Message ID 5032FA5B.5040308@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong Aug. 21, 2012, 3:02 a.m. UTC
In current code, if we map a readonly memory space from host to guest
and the page is not currently mapped in the host, we will get a fault
pfn and async is not allowed, then the vm will crash

We introduce readonly memory region to map ROM/ROMD to the guest, read access
is happy for readonly memslot, write access on readonly memslot will cause
KVM_EXIT_MMIO exit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 Documentation/virtual/kvm/api.txt |   10 +++-
 arch/x86/include/asm/kvm.h        |    1 +
 arch/x86/kvm/mmu.c                |    9 ++++
 arch/x86/kvm/x86.c                |    1 +
 include/linux/kvm.h               |    6 ++-
 include/linux/kvm_host.h          |    7 +--
 virt/kvm/kvm_main.c               |   96 ++++++++++++++++++++++++++++++-------
 7 files changed, 102 insertions(+), 28 deletions(-)

Comments

Jan Kiszka Sept. 7, 2012, 10:23 a.m. UTC | #1
On 2012-08-21 05:02, Xiao Guangrong wrote:
> In current code, if we map a readonly memory space from host to guest
> and the page is not currently mapped in the host, we will get a fault
> pfn and async is not allowed, then the vm will crash
> 
> We introduce readonly memory region to map ROM/ROMD to the guest, read access
> is happy for readonly memslot, write access on readonly memslot will cause
> KVM_EXIT_MMIO exit
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  Documentation/virtual/kvm/api.txt |   10 +++-
>  arch/x86/include/asm/kvm.h        |    1 +
>  arch/x86/kvm/mmu.c                |    9 ++++
>  arch/x86/kvm/x86.c                |    1 +
>  include/linux/kvm.h               |    6 ++-
>  include/linux/kvm_host.h          |    7 +--
>  virt/kvm/kvm_main.c               |   96 ++++++++++++++++++++++++++++++-------
>  7 files changed, 102 insertions(+), 28 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index bf33aaa..b91bfd4 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
>  };
> 
>  /* for kvm_memory_region::flags */
> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
> +#define KVM_MEM_READONLY	(1UL << 1)
> 
>  This ioctl allows the user to create or modify a guest physical memory
>  slot.  When changing an existing slot, it may be moved in the guest
> @@ -873,9 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
>  be identical.  This allows large pages in the guest to be backed by large
>  pages in the host.
> 
> -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
> +The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which
>  instructs kvm to keep track of writes to memory within the slot.  See
> -the KVM_GET_DIRTY_LOG ioctl.
> +the KVM_GET_DIRTY_LOG ioctl. Another flag is KVM_MEM_READONLY when the
> +KVM_CAP_READONLY_MEM capability, it indicates the guest memory is read-only,
> +that means, guest is only allowed to read it.

The last sentence requires some refactoring. :) How about: "The
KVM_CAP_READONLY_MEM capability indicates the availability of the
KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM
only allows read accesses."

> Writes will be posted to
> +userspace as KVM_EXIT_MMIO exits.
> 
>  When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
>  region are automatically reflected into the guest.  For example, an mmap()
> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
> index 246617e..521bf25 100644
> --- a/arch/x86/include/asm/kvm.h
> +++ b/arch/x86/include/asm/kvm.h
> @@ -25,6 +25,7 @@
>  #define __KVM_HAVE_DEBUGREGS
>  #define __KVM_HAVE_XSAVE
>  #define __KVM_HAVE_XCRS
> +#define __KVM_HAVE_READONLY_MEM
> 
>  /* Architectural interrupt line count. */
>  #define KVM_NR_INTERRUPTS 256
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 5548971..8e312a2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
> 
>  static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
>  {
> +	/*
> +	 * Do not cache the mmio info caused by writing the readonly gfn
> +	 * into the spte otherwise read access on readonly gfn also can
> +	 * caused mmio page fault and treat it as mmio access.
> +	 * Return 1 to tell kvm to emulate it.
> +	 */
> +	if (pfn == KVM_PFN_ERR_RO_FAULT)
> +		return 1;
> +
>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
>  		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
>  		return 0;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 704680d..42bbf41 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_GET_TSC_KHZ:
>  	case KVM_CAP_PCI_2_3:
>  	case KVM_CAP_KVMCLOCK_CTRL:
> +	case KVM_CAP_READONLY_MEM:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 2de335d..d808694 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -106,7 +106,8 @@ struct kvm_userspace_memory_region {
>   * other bits are reserved for kvm internal use which are defined in
>   * include/linux/kvm_host.h.
>   */
> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
> +#define KVM_MEM_READONLY	(1UL << 1)
> 
>  /* for KVM_IRQ_LINE */
>  struct kvm_irq_level {
> @@ -621,6 +622,9 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_GET_SMMU_INFO 78
>  #define KVM_CAP_S390_COW 79
>  #define KVM_CAP_PPC_ALLOC_HTAB 80
> +#ifdef __KVM_HAVE_READONLY_MEM
> +#define KVM_CAP_READONLY_MEM 81
> +#endif

See the discussion on the userspace patches: The CAP, as userspace API,
should really be defined unconditionally, kernel code should depend on
__KVM_HAVE_READONLY_MEM or a to-be-introduced CONFIG_KVM_HAVE_xxx
switch. This allows for cleaner userspace code.

Jan
Xiao Guangrong Sept. 7, 2012, 10:47 a.m. UTC | #2
On 09/07/2012 06:23 PM, Jan Kiszka wrote:
> On 2012-08-21 05:02, Xiao Guangrong wrote:
>> In current code, if we map a readonly memory space from host to guest
>> and the page is not currently mapped in the host, we will get a fault
>> pfn and async is not allowed, then the vm will crash
>>
>> We introduce readonly memory region to map ROM/ROMD to the guest, read access
>> is happy for readonly memslot, write access on readonly memslot will cause
>> KVM_EXIT_MMIO exit
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>> ---
>>  Documentation/virtual/kvm/api.txt |   10 +++-
>>  arch/x86/include/asm/kvm.h        |    1 +
>>  arch/x86/kvm/mmu.c                |    9 ++++
>>  arch/x86/kvm/x86.c                |    1 +
>>  include/linux/kvm.h               |    6 ++-
>>  include/linux/kvm_host.h          |    7 +--
>>  virt/kvm/kvm_main.c               |   96 ++++++++++++++++++++++++++++++-------
>>  7 files changed, 102 insertions(+), 28 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index bf33aaa..b91bfd4 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
>>  };
>>
>>  /* for kvm_memory_region::flags */
>> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
>> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>> +#define KVM_MEM_READONLY	(1UL << 1)
>>
>>  This ioctl allows the user to create or modify a guest physical memory
>>  slot.  When changing an existing slot, it may be moved in the guest
>> @@ -873,9 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
>>  be identical.  This allows large pages in the guest to be backed by large
>>  pages in the host.
>>
>> -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
>> +The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which
>>  instructs kvm to keep track of writes to memory within the slot.  See
>> -the KVM_GET_DIRTY_LOG ioctl.
>> +the KVM_GET_DIRTY_LOG ioctl. Another flag is KVM_MEM_READONLY when the
>> +KVM_CAP_READONLY_MEM capability, it indicates the guest memory is read-only,
>> +that means, guest is only allowed to read it.
> 
> The last sentence requires some refactoring. :) How about: "The
> KVM_CAP_READONLY_MEM capability indicates the availability of the
> KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM
> only allows read accesses."

Undoubtedly, your sentence is far better than mine. :)

By the way, this patchset has been applied on kvm -next branch, would
you mind to post a patch to do these works.

> 
>> Writes will be posted to
>> +userspace as KVM_EXIT_MMIO exits.
>>
>>  When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
>>  region are automatically reflected into the guest.  For example, an mmap()
>> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
>> index 246617e..521bf25 100644
>> --- a/arch/x86/include/asm/kvm.h
>> +++ b/arch/x86/include/asm/kvm.h
>> @@ -25,6 +25,7 @@
>>  #define __KVM_HAVE_DEBUGREGS
>>  #define __KVM_HAVE_XSAVE
>>  #define __KVM_HAVE_XCRS
>> +#define __KVM_HAVE_READONLY_MEM
>>
>>  /* Architectural interrupt line count. */
>>  #define KVM_NR_INTERRUPTS 256
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 5548971..8e312a2 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
>>
>>  static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
>>  {
>> +	/*
>> +	 * Do not cache the mmio info caused by writing the readonly gfn
>> +	 * into the spte otherwise read access on readonly gfn also can
>> +	 * caused mmio page fault and treat it as mmio access.
>> +	 * Return 1 to tell kvm to emulate it.
>> +	 */
>> +	if (pfn == KVM_PFN_ERR_RO_FAULT)
>> +		return 1;
>> +
>>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
>>  		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
>>  		return 0;
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 704680d..42bbf41 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>  	case KVM_CAP_GET_TSC_KHZ:
>>  	case KVM_CAP_PCI_2_3:
>>  	case KVM_CAP_KVMCLOCK_CTRL:
>> +	case KVM_CAP_READONLY_MEM:
>>  		r = 1;
>>  		break;
>>  	case KVM_CAP_COALESCED_MMIO:
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index 2de335d..d808694 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -106,7 +106,8 @@ struct kvm_userspace_memory_region {
>>   * other bits are reserved for kvm internal use which are defined in
>>   * include/linux/kvm_host.h.
>>   */
>> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
>> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>> +#define KVM_MEM_READONLY	(1UL << 1)
>>
>>  /* for KVM_IRQ_LINE */
>>  struct kvm_irq_level {
>> @@ -621,6 +622,9 @@ struct kvm_ppc_smmu_info {
>>  #define KVM_CAP_PPC_GET_SMMU_INFO 78
>>  #define KVM_CAP_S390_COW 79
>>  #define KVM_CAP_PPC_ALLOC_HTAB 80
>> +#ifdef __KVM_HAVE_READONLY_MEM
>> +#define KVM_CAP_READONLY_MEM 81
>> +#endif
> 
> See the discussion on the userspace patches: The CAP, as userspace API,
> should really be defined unconditionally, kernel code should depend on
> __KVM_HAVE_READONLY_MEM or a to-be-introduced CONFIG_KVM_HAVE_xxx
> switch. This allows for cleaner userspace code.

I see that using __KVM_HAVE_  around CAP in kvm.h is very popular:

$ grep __KVM_HAVE include/linux/kvm.h | wc -l
20

As your suggestion, userspace will always use the CAP even if the CAP
is not really supported. We do not need care the overload on other arches?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka Sept. 7, 2012, 11:14 a.m. UTC | #3
On 2012-09-07 12:47, Xiao Guangrong wrote:
> On 09/07/2012 06:23 PM, Jan Kiszka wrote:
>> On 2012-08-21 05:02, Xiao Guangrong wrote:
>>> In current code, if we map a readonly memory space from host to guest
>>> and the page is not currently mapped in the host, we will get a fault
>>> pfn and async is not allowed, then the vm will crash
>>>
>>> We introduce readonly memory region to map ROM/ROMD to the guest, read access
>>> is happy for readonly memslot, write access on readonly memslot will cause
>>> KVM_EXIT_MMIO exit
>>>
>>> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
>>> ---
>>>  Documentation/virtual/kvm/api.txt |   10 +++-
>>>  arch/x86/include/asm/kvm.h        |    1 +
>>>  arch/x86/kvm/mmu.c                |    9 ++++
>>>  arch/x86/kvm/x86.c                |    1 +
>>>  include/linux/kvm.h               |    6 ++-
>>>  include/linux/kvm_host.h          |    7 +--
>>>  virt/kvm/kvm_main.c               |   96 ++++++++++++++++++++++++++++++-------
>>>  7 files changed, 102 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>> index bf33aaa..b91bfd4 100644
>>> --- a/Documentation/virtual/kvm/api.txt
>>> +++ b/Documentation/virtual/kvm/api.txt
>>> @@ -857,7 +857,8 @@ struct kvm_userspace_memory_region {
>>>  };
>>>
>>>  /* for kvm_memory_region::flags */
>>> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
>>> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>>> +#define KVM_MEM_READONLY	(1UL << 1)
>>>
>>>  This ioctl allows the user to create or modify a guest physical memory
>>>  slot.  When changing an existing slot, it may be moved in the guest
>>> @@ -873,9 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
>>>  be identical.  This allows large pages in the guest to be backed by large
>>>  pages in the host.
>>>
>>> -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
>>> +The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which
>>>  instructs kvm to keep track of writes to memory within the slot.  See
>>> -the KVM_GET_DIRTY_LOG ioctl.
>>> +the KVM_GET_DIRTY_LOG ioctl. Another flag is KVM_MEM_READONLY when the
>>> +KVM_CAP_READONLY_MEM capability, it indicates the guest memory is read-only,
>>> +that means, guest is only allowed to read it.
>>
>> The last sentence requires some refactoring. :) How about: "The
>> KVM_CAP_READONLY_MEM capability indicates the availability of the
>> KVM_MEM_READONLY flag. When this flag is set for a memory region, KVM
>> only allows read accesses."
> 
> Undoubtedly, your sentence is far better than mine. :)
> 
> By the way, this patchset has been applied on kvm -next branch, would
> you mind to post a patch to do these works.

Can do, found some more improvable parts in this IOCTL anyway.

> 
>>
>>> Writes will be posted to
>>> +userspace as KVM_EXIT_MMIO exits.
>>>
>>>  When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
>>>  region are automatically reflected into the guest.  For example, an mmap()
>>> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
>>> index 246617e..521bf25 100644
>>> --- a/arch/x86/include/asm/kvm.h
>>> +++ b/arch/x86/include/asm/kvm.h
>>> @@ -25,6 +25,7 @@
>>>  #define __KVM_HAVE_DEBUGREGS
>>>  #define __KVM_HAVE_XSAVE
>>>  #define __KVM_HAVE_XCRS
>>> +#define __KVM_HAVE_READONLY_MEM
>>>
>>>  /* Architectural interrupt line count. */
>>>  #define KVM_NR_INTERRUPTS 256
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 5548971..8e312a2 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
>>>
>>>  static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
>>>  {
>>> +	/*
>>> +	 * Do not cache the mmio info caused by writing the readonly gfn
>>> +	 * into the spte otherwise read access on readonly gfn also can
>>> +	 * caused mmio page fault and treat it as mmio access.
>>> +	 * Return 1 to tell kvm to emulate it.
>>> +	 */
>>> +	if (pfn == KVM_PFN_ERR_RO_FAULT)
>>> +		return 1;
>>> +
>>>  	if (pfn == KVM_PFN_ERR_HWPOISON) {
>>>  		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
>>>  		return 0;
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 704680d..42bbf41 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>>  	case KVM_CAP_GET_TSC_KHZ:
>>>  	case KVM_CAP_PCI_2_3:
>>>  	case KVM_CAP_KVMCLOCK_CTRL:
>>> +	case KVM_CAP_READONLY_MEM:
>>>  		r = 1;
>>>  		break;
>>>  	case KVM_CAP_COALESCED_MMIO:
>>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>>> index 2de335d..d808694 100644
>>> --- a/include/linux/kvm.h
>>> +++ b/include/linux/kvm.h
>>> @@ -106,7 +106,8 @@ struct kvm_userspace_memory_region {
>>>   * other bits are reserved for kvm internal use which are defined in
>>>   * include/linux/kvm_host.h.
>>>   */
>>> -#define KVM_MEM_LOG_DIRTY_PAGES  1UL
>>> +#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
>>> +#define KVM_MEM_READONLY	(1UL << 1)
>>>
>>>  /* for KVM_IRQ_LINE */
>>>  struct kvm_irq_level {
>>> @@ -621,6 +622,9 @@ struct kvm_ppc_smmu_info {
>>>  #define KVM_CAP_PPC_GET_SMMU_INFO 78
>>>  #define KVM_CAP_S390_COW 79
>>>  #define KVM_CAP_PPC_ALLOC_HTAB 80
>>> +#ifdef __KVM_HAVE_READONLY_MEM
>>> +#define KVM_CAP_READONLY_MEM 81
>>> +#endif
>>
>> See the discussion on the userspace patches: The CAP, as userspace API,
>> should really be defined unconditionally, kernel code should depend on
>> __KVM_HAVE_READONLY_MEM or a to-be-introduced CONFIG_KVM_HAVE_xxx
>> switch. This allows for cleaner userspace code.
> 
> I see that using __KVM_HAVE_  around CAP in kvm.h is very popular:
> 
> $ grep __KVM_HAVE include/linux/kvm.h | wc -l
> 20

Yes, mistakes of the past.

> 
> As your suggestion, userspace will always use the CAP even if the CAP
> is not really supported. We do not need care the overload on other arches?

Generally, userspace can only find out if a CAP is supported by testing
during runtime. Sometimes, the CAP may also be used to switch features
that are only available on certain archs. But this particular feature
should surely become a generic one soon, and the code you could skip in
userspace is truly minimal.

Jan
Avi Kivity Sept. 9, 2012, 1:42 p.m. UTC | #4
On 09/07/2012 02:14 PM, Jan Kiszka wrote:
>> 
>> $ grep __KVM_HAVE include/linux/kvm.h | wc -l
>> 20
> 
> Yes, mistakes of the past.
> 
>> 
>> As your suggestion, userspace will always use the CAP even if the CAP
>> is not really supported. We do not need care the overload on other arches?
> 
> Generally, userspace can only find out if a CAP is supported by testing
> during runtime. Sometimes, the CAP may also be used to switch features
> that are only available on certain archs. But this particular feature
> should surely become a generic one soon, and the code you could skip in
> userspace is truly minimal.

You also need them for build time, to see if the ioctl names and
structures are defined.  qemu chose not to do this ifdeffery but we must
not make that choice for other userspace.

In this case the feature is indeed generic and the names introduced are
globally available, so the conditional is unneeded.  But for other cases
it may be needed.
Jan Kiszka Sept. 9, 2012, 1:52 p.m. UTC | #5
On 2012-09-09 15:42, Avi Kivity wrote:
> On 09/07/2012 02:14 PM, Jan Kiszka wrote:
>>>
>>> $ grep __KVM_HAVE include/linux/kvm.h | wc -l
>>> 20
>>
>> Yes, mistakes of the past.
>>
>>>
>>> As your suggestion, userspace will always use the CAP even if the CAP
>>> is not really supported. We do not need care the overload on other arches?
>>
>> Generally, userspace can only find out if a CAP is supported by testing
>> during runtime. Sometimes, the CAP may also be used to switch features
>> that are only available on certain archs. But this particular feature
>> should surely become a generic one soon, and the code you could skip in
>> userspace is truly minimal.
> 
> You also need them for build time, to see if the ioctl names and
> structures are defined.  qemu chose not to do this ifdeffery but we must
> not make that choice for other userspace.

I disagree. We switched QEMU to the pattern the kernel officially
recommends: carry your own headers if you cannot rely on a recent
distribution. The only valid use case for #ifdef KVM_CAP is to allow
disabling of generic code that only some archs can make use of.

I'm not arguing for cleaning up legacy CAPs but for applying this rule
on newly introduced ones.

Jan
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bf33aaa..b91bfd4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -857,7 +857,8 @@  struct kvm_userspace_memory_region {
 };

 /* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
+#define KVM_MEM_READONLY	(1UL << 1)

 This ioctl allows the user to create or modify a guest physical memory
 slot.  When changing an existing slot, it may be moved in the guest
@@ -873,9 +874,12 @@  It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr
 be identical.  This allows large pages in the guest to be backed by large
 pages in the host.

-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
+The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which
 instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
+the KVM_GET_DIRTY_LOG ioctl. Another flag is KVM_MEM_READONLY when the
+KVM_CAP_READONLY_MEM capability, it indicates the guest memory is read-only,
+that means, guest is only allowed to read it. Writes will be posted to
+userspace as KVM_EXIT_MMIO exits.

 When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory
 region are automatically reflected into the guest.  For example, an mmap()
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 246617e..521bf25 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -25,6 +25,7 @@ 
 #define __KVM_HAVE_DEBUGREGS
 #define __KVM_HAVE_XSAVE
 #define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM

 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5548971..8e312a2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2647,6 +2647,15 @@  static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *

 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
+	/*
+	 * Do not cache the mmio info caused by writing the readonly gfn
+	 * into the spte otherwise read access on readonly gfn also can
+	 * caused mmio page fault and treat it as mmio access.
+	 * Return 1 to tell kvm to emulate it.
+	 */
+	if (pfn == KVM_PFN_ERR_RO_FAULT)
+		return 1;
+
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
 		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
 		return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 704680d..42bbf41 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2175,6 +2175,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_GET_TSC_KHZ:
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
+	case KVM_CAP_READONLY_MEM:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2de335d..d808694 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -106,7 +106,8 @@  struct kvm_userspace_memory_region {
  * other bits are reserved for kvm internal use which are defined in
  * include/linux/kvm_host.h.
  */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
+#define KVM_MEM_LOG_DIRTY_PAGES	(1UL << 0)
+#define KVM_MEM_READONLY	(1UL << 1)

 /* for KVM_IRQ_LINE */
 struct kvm_irq_level {
@@ -621,6 +622,9 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#ifdef __KVM_HAVE_READONLY_MEM
+#define KVM_CAP_READONLY_MEM 81
+#endif

 #ifdef KVM_CAP_IRQ_ROUTING

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a913ac7..5972c98 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -465,6 +465,7 @@  int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,

 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_dirty(struct page *page);
@@ -792,12 +793,6 @@  hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
 	return slot->base_gfn + gfn_offset;
 }

-static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
-					       gfn_t gfn)
-{
-	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
 static inline gpa_t gfn_to_gpa(gfn_t gfn)
 {
 	return (gpa_t)gfn << PAGE_SHIFT;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e3e1658..3416f8a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -680,7 +680,13 @@  void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)

 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 {
-	if (mem->flags & ~KVM_MEM_LOG_DIRTY_PAGES)
+	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+	valid_flags |= KVM_MEM_READONLY;
+#endif
+
+	if (mem->flags & ~valid_flags)
 		return -EINVAL;

 	return 0;
@@ -973,18 +979,45 @@  out:
 	return size;
 }

-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
-				     gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+	return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+					  gfn_t gfn)
+{
+	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return KVM_HVA_ERR_BAD;

+	if (memslot_is_readonly(slot) && write)
+		return KVM_HVA_ERR_RO_BAD;
+
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);

-	return gfn_to_hva_memslot(slot, gfn);
+	return __gfn_to_hva_memslot(slot, gfn);
 }

+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+				     gfn_t *nr_pages)
+{
+	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+				 gfn_t gfn)
+{
+	return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
@@ -997,7 +1030,7 @@  EXPORT_SYMBOL_GPL(gfn_to_hva);
  */
 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
 {
-	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+	return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
 }

 static int kvm_read_hva(void *data, void __user *hva, int len)
@@ -1106,6 +1139,17 @@  static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 	return npages;
 }

+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+	if (unlikely(!(vma->vm_flags & VM_READ)))
+		return false;
+
+	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+		return false;
+
+	return true;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
@@ -1130,8 +1174,6 @@  static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);

-	BUG_ON(!write_fault && !writable);
-
 	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
 		return pfn;

@@ -1158,7 +1200,7 @@  static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 			vma->vm_pgoff;
 		BUG_ON(!kvm_is_mmio_pfn(pfn));
 	} else {
-		if (async && (vma->vm_flags & VM_WRITE))
+		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
 		pfn = KVM_PFN_ERR_FAULT;
 	}
@@ -1167,19 +1209,40 @@  exit:
 	return pfn;
 }

+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+		     bool *async, bool write_fault, bool *writable)
+{
+	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+	if (addr == KVM_HVA_ERR_RO_BAD)
+		return KVM_PFN_ERR_RO_FAULT;
+
+	if (kvm_is_error_hva(addr))
+		return KVM_PFN_ERR_BAD;
+
+	/* Do not map writable pfn in the readonly memslot. */
+	if (writable && memslot_is_readonly(slot)) {
+		*writable = false;
+		writable = NULL;
+	}
+
+	return hva_to_pfn(addr, atomic, async, write_fault,
+			  writable);
+}
+
 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 			  bool write_fault, bool *writable)
 {
-	unsigned long addr;
+	struct kvm_memory_slot *slot;

 	if (async)
 		*async = false;

-	addr = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(addr))
-		return KVM_PFN_ERR_BAD;
+	slot = gfn_to_memslot(kvm, gfn);

-	return hva_to_pfn(addr, atomic, async, write_fault, writable);
+	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+				    writable);
 }

 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1210,15 +1273,12 @@  EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(addr, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
 }

 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-
-	return hva_to_pfn(addr, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);