
[v3,12/21] KVM: X86: Implement ring-based dirty memory tracking

Message ID 20200109145729.32898-13-peterx@redhat.com (mailing list archive)
State New, archived
Series KVM: Dirty ring interface

Commit Message

Peter Xu Jan. 9, 2020, 2:57 p.m. UTC
This patch is heavily based on previous work from Lei Cao
<lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]

KVM currently uses large bitmaps to track dirty memory.  These bitmaps
are copied to userspace when userspace queries KVM for its dirty page
information.  The use of bitmaps is mostly sufficient for live
migration, as large parts of memory are dirtied from one log-dirty
pass to another.  However, in a checkpointing system, the number of
dirty pages is small and in fact it is often bounded---the VM is
paused when it has dirtied a pre-defined number of pages. Traversing a
large, sparsely populated bitmap to find set bits is time-consuming,
as is copying the bitmap to user-space.

A similar issue exists for live migration when the guest memory is
huge while the dirtying workload is light.  In that case, for each
dirty sync we need to pull the whole dirty bitmap to userspace and
analyse every bit even though it is mostly zeros.

The preferred data structure for the above scenarios is a dense list
of guest frame numbers (GFNs).  This patch series stores the dirty
list in
kernel memory that can be memory mapped into userspace to allow speedy
harvesting.

This patch enables dirty ring for X86 only.  However it should be
easily extended to other archs as well.
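
For illustration only (this sketch is not part of the patch): a
hypothetical userspace consumer could enable the capability and
harvest one vcpu ring roughly as below, assuming a kernel with this
series applied, the uapi structures it introduces (kvm_dirty_gfn,
kvm_dirty_ring_indices, KVM_CAP_DIRTY_LOG_RING, KVM_RESET_DIRTY_RINGS)
and a plain printf() standing in for a real consumer of the dirty
GFNs:

#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

/*
 * Enable the capability on the VM fd before any vCPU is created;
 * "bytes" is the per-vcpu ring size in bytes, a power of two.
 */
static int enable_dirty_ring(int vm_fd, uint64_t bytes)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_DIRTY_LOG_RING,
		.args[0] = bytes,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/*
 * Harvest one vcpu ring.  "gfns" is the read-only mapping of the vcpu
 * fd at KVM_DIRTY_LOG_PAGE_OFFSET * page_size, "indices" points at
 * kvm_run->vcpu_ring_indices, and "entries" is the ring size in
 * entries (a power of two, so the free running counters can be
 * masked).
 */
static uint32_t harvest_ring(int vm_fd,
			     struct kvm_dirty_ring_indices *indices,
			     const struct kvm_dirty_gfn *gfns,
			     uint32_t entries)
{
	uint32_t fetch = indices->fetch_index;
	/* Acquire: read avail_index before the entries it publishes. */
	uint32_t avail = __atomic_load_n(&indices->avail_index,
					 __ATOMIC_ACQUIRE);
	uint32_t count = 0;

	while (fetch != avail) {
		const struct kvm_dirty_gfn *e =
			&gfns[fetch & (entries - 1)];

		printf("dirty: slot %u offset 0x%llx\n", e->slot,
		       (unsigned long long)e->offset);
		fetch++;
		count++;
	}

	/*
	 * Publish how far we got, then ask KVM to reprotect the
	 * collected pages; the same sequence applies after a
	 * KVM_EXIT_DIRTY_RING_FULL exit.
	 */
	__atomic_store_n(&indices->fetch_index, fetch, __ATOMIC_RELEASE);
	ioctl(vm_fd, KVM_RESET_DIRTY_RINGS);
	return count;
}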

[1] https://patchwork.kernel.org/patch/10471409/

Signed-off-by: Lei Cao <lei.cao@stratus.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 Documentation/virt/kvm/api.txt  |  89 ++++++++++++++++++
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/include/uapi/asm/kvm.h |   1 +
 arch/x86/kvm/Makefile           |   3 +-
 arch/x86/kvm/mmu/mmu.c          |   6 ++
 arch/x86/kvm/vmx/vmx.c          |   7 ++
 arch/x86/kvm/x86.c              |   9 ++
 include/linux/kvm_dirty_ring.h  |  55 +++++++++++
 include/linux/kvm_host.h        |  26 +++++
 include/trace/events/kvm.h      |  78 +++++++++++++++
 include/uapi/linux/kvm.h        |  33 +++++++
 virt/kvm/dirty_ring.c           | 162 ++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c             | 137 ++++++++++++++++++++++++++-
 13 files changed, 606 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/kvm_dirty_ring.h
 create mode 100644 virt/kvm/dirty_ring.c

Comments

Michael S. Tsirkin Jan. 9, 2020, 4:29 p.m. UTC | #1
On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> This patch is heavily based on previous work from Lei Cao
> <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]
> 
> KVM currently uses large bitmaps to track dirty memory.  These bitmaps
> are copied to userspace when userspace queries KVM for its dirty page
> information.  The use of bitmaps is mostly sufficient for live
> migration, as large parts of memory are be dirtied from one log-dirty
> pass to another.  However, in a checkpointing system, the number of
> dirty pages is small and in fact it is often bounded---the VM is
> paused when it has dirtied a pre-defined number of pages. Traversing a
> large, sparsely populated bitmap to find set bits is time-consuming,
> as is copying the bitmap to user-space.
> 
> A similar issue will be there for live migration when the guest memory
> is huge while the page dirty procedure is trivial.  In that case for
> each dirty sync we need to pull the whole dirty bitmap to userspace
> and analyse every bit even if it's mostly zeros.
> 
> The preferred data structure for above scenarios is a dense list of
> guest frame numbers (GFN).

No longer, this uses an array of structs.

>  This patch series stores the dirty list in
> kernel memory that can be memory mapped into userspace to allow speedy
> harvesting.
> 
> This patch enables dirty ring for X86 only.  However it should be
> easily extended to other archs as well.
> 
> [1] https://patchwork.kernel.org/patch/10471409/
> 
> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  Documentation/virt/kvm/api.txt  |  89 ++++++++++++++++++
>  arch/x86/include/asm/kvm_host.h |   3 +
>  arch/x86/include/uapi/asm/kvm.h |   1 +
>  arch/x86/kvm/Makefile           |   3 +-
>  arch/x86/kvm/mmu/mmu.c          |   6 ++
>  arch/x86/kvm/vmx/vmx.c          |   7 ++
>  arch/x86/kvm/x86.c              |   9 ++
>  include/linux/kvm_dirty_ring.h  |  55 +++++++++++
>  include/linux/kvm_host.h        |  26 +++++
>  include/trace/events/kvm.h      |  78 +++++++++++++++
>  include/uapi/linux/kvm.h        |  33 +++++++
>  virt/kvm/dirty_ring.c           | 162 ++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c             | 137 ++++++++++++++++++++++++++-
>  13 files changed, 606 insertions(+), 3 deletions(-)
>  create mode 100644 include/linux/kvm_dirty_ring.h
>  create mode 100644 virt/kvm/dirty_ring.c
> 
> diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> index ebb37b34dcfc..708c3e0f7eae 100644
> --- a/Documentation/virt/kvm/api.txt
> +++ b/Documentation/virt/kvm/api.txt
> @@ -231,6 +231,7 @@ Based on their initialization different VMs may have different capabilities.
>  It is thus encouraged to use the vm ioctl to query for capabilities (available
>  with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
>  
> +
>  4.5 KVM_GET_VCPU_MMAP_SIZE
>  
>  Capability: basic
> @@ -243,6 +244,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared
>  memory region.  This ioctl returns the size of that region.  See the
>  KVM_RUN documentation for details.
>  
> +Besides the size of the KVM_RUN communication region, other areas of
> +the VCPU file descriptor can be mmap-ed, including:
> +
> +- if KVM_CAP_COALESCED_MMIO is available, a page at
> +  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
> +  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
> +  KVM_CAP_COALESCED_MMIO is not documented yet.
> +
> +- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
> +  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
> +  KVM_CAP_DIRTY_LOG_RING, see section 8.3.
> +
>  
>  4.6 KVM_SET_MEMORY_REGION
>  
> @@ -5376,6 +5389,7 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
>  AArch64, this value will be reported in the ISS field of ESR_ELx.
>  
>  See KVM_CAP_VCPU_EVENTS for more details.
> +
>  8.20 KVM_CAP_HYPERV_SEND_IPI
>  
>  Architectures: x86
> @@ -5383,6 +5397,7 @@ Architectures: x86
>  This capability indicates that KVM supports paravirtualized Hyper-V IPI send
>  hypercalls:
>  HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> +
>  8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
>  
>  Architecture: x86
> @@ -5396,3 +5411,77 @@ handling by KVM (as some KVM hypercall may be mistakenly treated as TLB
>  flush hypercalls by Hyper-V) so userspace should disable KVM identification
>  in CPUID and only exposes Hyper-V identification. In this case, guest
>  thinks it's running on Hyper-V and only use Hyper-V hypercalls.
> +
> +8.22 KVM_CAP_DIRTY_LOG_RING
> +
> +Architectures: x86
> +Parameters: args[0] - size of the dirty log ring
> +
> +KVM is capable of tracking dirty memory using ring buffers that are
> +mmaped into userspace; there is one dirty ring per vcpu.
> +
> +One dirty ring is defined as below internally:
> +
> +struct kvm_dirty_ring {
> +	u32 dirty_index;
> +	u32 reset_index;
> +	u32 size;
> +	u32 soft_limit;
> +	struct kvm_dirty_gfn *dirty_gfns;
> +	struct kvm_dirty_ring_indices *indices;
> +	int index;
> +};
> +
> +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> +For each of the dirty entry it's defined as:
> +
> +struct kvm_dirty_gfn {
> +        __u32 pad;

How about sticking a length here?
This way huge pages can be dirtied in one go.

> +        __u32 slot; /* as_id | slot_id */
> +        __u64 offset;
> +};
> +
> +Most of the ring structure is used by KVM internally, while only the
> +indices are exposed to userspace:
> +
> +struct kvm_dirty_ring_indices {
> +	__u32 avail_index; /* set by kernel */
> +	__u32 fetch_index; /* set by userspace */
> +};
> +
> +The two indices in the ring buffer are free running counters.
> +
> +Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
> +to enable this capability for the new guest and set the size of the
> +rings.  It is only allowed before creating any vCPU, and the size of
> +the ring must be a power of two.


I know index design is popular, but testing with virtio showed
that it's better to just have a flags field marking
an entry as valid. In particular this gets rid of the
running counters and power of two limitations.
It also removes the need for a separate index page, which is nice.



>  The larger the ring buffer, the less
> +likely the ring is full and the VM is forced to exit to userspace. The
> +optimal size depends on the workload, but it is recommended that it be
> +at least 64 KiB (4096 entries).

Where's this number coming from?  Given you have indices as well,
4K-entry rings are likely to cause cache contention.

> +Just like for dirty page bitmaps, the buffer tracks writes to
> +all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was
> +set in KVM_SET_USER_MEMORY_REGION.  Once a memory region is registered
> +with the flag set, userspace can start harvesting dirty pages from the
> +ring buffer.
> +
> +To harvest the dirty pages, userspace accesses the mmaped ring buffer
> +to read the dirty GFNs up to avail_index, and sets the fetch_index
> +accordingly.  This can be done when the guest is running or paused,
> +and dirty pages need not be collected all at once.  After processing
> +one or more entries in the ring buffer, userspace calls the VM ioctl
> +KVM_RESET_DIRTY_RINGS to notify the kernel that it has updated
> +fetch_index and to mark those pages clean.  Therefore, the ioctl
> +must be called *before* reading the content of the dirty pages.
> +
> +However, there is a major difference comparing to the
> +KVM_GET_DIRTY_LOG interface in that when reading the dirty ring from
> +userspace it's still possible that the kernel has not yet flushed the
> +hardware dirty buffers into the kernel buffer (which was previously
> +done by the KVM_GET_DIRTY_LOG ioctl).  To achieve that, one needs to
> +kick the vcpu out for a hardware buffer flush (vmexit) to make sure
> +all the existing dirty gfns are flushed to the dirty rings.
> +
> +If one of the ring buffers is full, the guest will exit to userspace
> +with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the KVM_RUN
> +ioctl will return to userspace with zero.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f536d139b3d2..3fe18402e6a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1181,6 +1181,7 @@ struct kvm_x86_ops {
>  					   struct kvm_memory_slot *slot,
>  					   gfn_t offset, unsigned long mask);
>  	int (*write_log_dirty)(struct kvm_vcpu *vcpu);
> +	int (*cpu_dirty_log_size)(void);
>  
>  	/* pmu operations of sub-arch */
>  	const struct kvm_pmu_ops *pmu_ops;
> @@ -1666,4 +1667,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
>  #define GET_SMSTATE(type, buf, offset)		\
>  	(*(type *)((buf) + (offset) - 0x7e00))
>  
> +int kvm_cpu_dirty_log_size(void);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 503d3f42da16..b59bf356c478 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -12,6 +12,7 @@
>  
>  #define KVM_PIO_PAGE_OFFSET 1
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 64
>  
>  #define DE_VECTOR 0
>  #define DB_VECTOR 1
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index b19ef421084d..0acee817adfb 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -5,7 +5,8 @@ ccflags-y += -Iarch/x86/kvm
>  KVM := ../../../virt/kvm
>  
>  kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
> -				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
> +				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
> +				$(KVM)/dirty_ring.o
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>  
>  kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7269130ea5e2..621b842a9b7b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1832,7 +1832,13 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
>  {
>  	if (kvm_x86_ops->write_log_dirty)
>  		return kvm_x86_ops->write_log_dirty(vcpu);
> +	return 0;
> +}
>  
> +int kvm_cpu_dirty_log_size(void)
> +{
> +	if (kvm_x86_ops->cpu_dirty_log_size)
> +		return kvm_x86_ops->cpu_dirty_log_size();
>  	return 0;
>  }
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 62175a246bcc..2151de89456d 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7689,6 +7689,7 @@ static __init int hardware_setup(void)
>  		kvm_x86_ops->slot_disable_log_dirty = NULL;
>  		kvm_x86_ops->flush_log_dirty = NULL;
>  		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
> +		kvm_x86_ops->cpu_dirty_log_size = NULL;
>  	}
>  
>  	if (!cpu_has_vmx_preemption_timer())
> @@ -7753,6 +7754,11 @@ static __exit void hardware_unsetup(void)
>  	free_kvm_area();
>  }
>  
> +static int vmx_cpu_dirty_log_size(void)
> +{
> +	return enable_pml ? PML_ENTITY_NUM : 0;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -7875,6 +7881,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.flush_log_dirty = vmx_flush_log_dirty,
>  	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
>  	.write_log_dirty = vmx_write_pml_buffer,
> +	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
>  
>  	.pre_block = vmx_pre_block,
>  	.post_block = vmx_post_block,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ff97782b3919..9c3673592826 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7998,6 +7998,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	bool req_immediate_exit = false;
>  
> +	/* Forbid vmenter if vcpu dirty ring is soft-full */
> +	if (unlikely(vcpu->kvm->dirty_ring_size &&
> +		     kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
> +		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> +		trace_kvm_dirty_ring_exit(vcpu);
> +		r = 0;
> +		goto out;
> +	}
> +
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
>  			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
> diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> new file mode 100644
> index 000000000000..d6fe9e1b7617
> --- /dev/null
> +++ b/include/linux/kvm_dirty_ring.h
> @@ -0,0 +1,55 @@
> +#ifndef KVM_DIRTY_RING_H
> +#define KVM_DIRTY_RING_H
> +
> +/**
> + * kvm_dirty_ring: KVM internal dirty ring structure
> + *
> + * @dirty_index: free running counter that points to the next slot in
> + *               dirty_ring->dirty_gfns, where a new dirty page should go
> + * @reset_index: free running counter that points to the next dirty page
> + *               in dirty_ring->dirty_gfns for which dirty trap needs to
> + *               be reenabled
> + * @size:        size of the compact list, dirty_ring->dirty_gfns
> + * @soft_limit:  when the number of dirty pages in the list reaches this
> + *               limit, vcpu that owns this ring should exit to userspace
> + *               to allow userspace to harvest all the dirty pages
> + * @dirty_gfns:  the array to keep the dirty gfns
> + * @indices:     the pointer to the @kvm_dirty_ring_indices structure
> + *               of this specific ring
> + * @index:       index of this dirty ring
> + */
> +struct kvm_dirty_ring {
> +	u32 dirty_index;
> +	u32 reset_index;
> +	u32 size;
> +	u32 soft_limit;
> +	struct kvm_dirty_gfn *dirty_gfns;
> +	struct kvm_dirty_ring_indices *indices;
> +	int index;
> +};
> +
> +u32 kvm_dirty_ring_get_rsvd_entries(void);
> +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
> +			 struct kvm_dirty_ring_indices *indices,
> +			 int index, u32 size);
> +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm);
> +
> +/*
> + * called with kvm->slots_lock held, returns the number of
> + * processed pages.
> + */
> +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
> +
> +/*
> + * returns =0: successfully pushed
> + *         <0: unable to push, need to wait
> + */
> +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset);
> +
> +/* for use in vm_operations_struct */
> +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
> +
> +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
> +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
> +
> +#endif
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index cbd633ece959..c96161c6a0c9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -34,6 +34,7 @@
>  #include <linux/kvm_types.h>
>  
>  #include <asm/kvm_host.h>
> +#include <linux/kvm_dirty_ring.h>
>  
>  #ifndef KVM_MAX_VCPU_ID
>  #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
> @@ -321,6 +322,7 @@ struct kvm_vcpu {
>  	bool ready;
>  	struct kvm_vcpu_arch arch;
>  	struct dentry *debugfs_dentry;
> +	struct kvm_dirty_ring dirty_ring;
>  };
>  
>  static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
> @@ -502,6 +504,7 @@ struct kvm {
>  	struct srcu_struct srcu;
>  	struct srcu_struct irq_srcu;
>  	pid_t userspace_pid;
> +	u32 dirty_ring_size;
>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -831,6 +834,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>  					gfn_t gfn_offset,
>  					unsigned long mask);
>  
> +void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
> +
>  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>  				struct kvm_dirty_log *log);
>  int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
> @@ -1409,4 +1414,25 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
>  				uintptr_t data, const char *name,
>  				struct task_struct **thread_ptr);
>  
> +/*
> + * This defines how many reserved entries we want to keep before we
> + * kick the vcpu to the userspace to avoid dirty ring full.  This
> + * value can be tuned to higher if e.g. PML is enabled on the host.
> + */
> +#define  KVM_DIRTY_RING_RSVD_ENTRIES  64
> +
> +/* Max number of entries allowed for each kvm dirty ring */
> +#define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> +
> +/*
> + * Arch needs to define these macro after implementing the dirty ring
> + * feature.  KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the
> + * starting page offset of the dirty ring structures, while
> + * KVM_DIRTY_RING_VERSION should be defined as >=1.  By default, this
> + * feature is off on all archs.
> + */
> +#ifndef KVM_DIRTY_LOG_PAGE_OFFSET
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 0
> +#endif
> +
>  #endif
> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
> index 2c735a3e6613..3d850997940c 100644
> --- a/include/trace/events/kvm.h
> +++ b/include/trace/events/kvm.h
> @@ -399,6 +399,84 @@ TRACE_EVENT(kvm_halt_poll_ns,
>  #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
>  	trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
>  
> +TRACE_EVENT(kvm_dirty_ring_push,
> +	TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset),
> +	TP_ARGS(ring, slot, offset),
> +
> +	TP_STRUCT__entry(
> +		__field(int, index)
> +		__field(u32, dirty_index)
> +		__field(u32, reset_index)
> +		__field(u32, slot)
> +		__field(u64, offset)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->index          = ring->index;
> +		__entry->dirty_index    = ring->dirty_index;
> +		__entry->reset_index    = ring->reset_index;
> +		__entry->slot           = slot;
> +		__entry->offset         = offset;
> +	),
> +
> +	TP_printk("ring %d: dirty 0x%x reset 0x%x "
> +		  "slot %u offset 0x%llx (used %u)",
> +		  __entry->index, __entry->dirty_index,
> +		  __entry->reset_index,  __entry->slot, __entry->offset,
> +		  __entry->dirty_index - __entry->reset_index)
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_reset,
> +	TP_PROTO(struct kvm_dirty_ring *ring),
> +	TP_ARGS(ring),
> +
> +	TP_STRUCT__entry(
> +		__field(int, index)
> +		__field(u32, dirty_index)
> +		__field(u32, reset_index)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->index          = ring->index;
> +		__entry->dirty_index    = ring->dirty_index;
> +		__entry->reset_index    = ring->reset_index;
> +	),
> +
> +	TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)",
> +		  __entry->index, __entry->dirty_index, __entry->reset_index,
> +		  __entry->dirty_index - __entry->reset_index)
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_waitqueue,
> +	TP_PROTO(bool enter),
> +	TP_ARGS(enter),
> +
> +	TP_STRUCT__entry(
> +	    __field(bool, enter)
> +	),
> +
> +	TP_fast_assign(
> +	    __entry->enter = enter;
> +	),
> +
> +	TP_printk("%s", __entry->enter ? "wait" : "awake")
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_exit,
> +	TP_PROTO(struct kvm_vcpu *vcpu),
> +	TP_ARGS(vcpu),
> +
> +	TP_STRUCT__entry(
> +	    __field(int, vcpu_id)
> +	),
> +
> +	TP_fast_assign(
> +	    __entry->vcpu_id = vcpu->vcpu_id;
> +	),
> +
> +	TP_printk("vcpu %d", __entry->vcpu_id)
> +);
> +
>  #endif /* _TRACE_KVM_MAIN_H */
>  
>  /* This part must be outside protection */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f0a16b4adbbd..df4a1700ff1e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -236,6 +236,7 @@ struct kvm_hyperv_exit {
>  #define KVM_EXIT_IOAPIC_EOI       26
>  #define KVM_EXIT_HYPERV           27
>  #define KVM_EXIT_ARM_NISV         28
> +#define KVM_EXIT_DIRTY_RING_FULL  29
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -247,6 +248,13 @@ struct kvm_hyperv_exit {
>  /* Encounter unexpected vm-exit reason */
>  #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
>  
> +struct kvm_dirty_ring_indices {
> +	__u32 avail_index; /* set by kernel */
> +	__u32 padding1;
> +	__u32 fetch_index; /* set by userspace */
> +	__u32 padding2;
> +};
> +
>  /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
>  struct kvm_run {
>  	/* in */
> @@ -421,6 +429,8 @@ struct kvm_run {
>  		struct kvm_sync_regs regs;
>  		char padding[SYNC_REGS_SIZE_BYTES];
>  	} s;
> +
> +	struct kvm_dirty_ring_indices vcpu_ring_indices;
>  };
>  
>  /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
> @@ -1009,6 +1019,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
>  #define KVM_CAP_ARM_NISV_TO_USER 177
>  #define KVM_CAP_ARM_INJECT_EXT_DABT 178
> +#define KVM_CAP_DIRTY_LOG_RING 179
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1473,6 +1484,9 @@ struct kvm_enc_region {
>  /* Available with KVM_CAP_ARM_SVE */
>  #define KVM_ARM_VCPU_FINALIZE	  _IOW(KVMIO,  0xc2, int)
>  
> +/* Available with KVM_CAP_DIRTY_LOG_RING */
> +#define KVM_RESET_DIRTY_RINGS     _IO(KVMIO, 0xc3)
> +
>  /* Secure Encrypted Virtualization command */
>  enum sev_cmd_id {
>  	/* Guest initialization commands */
> @@ -1623,4 +1637,23 @@ struct kvm_hyperv_eventfd {
>  #define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
>  #define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
>  
> +/*
> + * The following are the requirements for supporting dirty log ring
> + * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
> + *
> + * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
> + *    of kvm_write_* so that the global dirty ring is not filled up
> + *    too quickly.
> + * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
> + *    enabling dirty logging.
> + * 3. There should not be a separate step to synchronize hardware
> + *    dirty bitmap with KVM's.
> + */
> +
> +struct kvm_dirty_gfn {
> +	__u32 pad;
> +	__u32 slot;
> +	__u64 offset;
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
> new file mode 100644
> index 000000000000..67ec5bbc21c0
> --- /dev/null
> +++ b/virt/kvm/dirty_ring.c
> @@ -0,0 +1,162 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * KVM dirty ring implementation
> + *
> + * Copyright 2019 Red Hat, Inc.
> + */
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/kvm_dirty_ring.h>
> +#include <trace/events/kvm.h>
> +
> +int __weak kvm_cpu_dirty_log_size(void)
> +{
> +	return 0;
> +}
> +
> +u32 kvm_dirty_ring_get_rsvd_entries(void)
> +{
> +	return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size();
> +}
> +
> +static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
> +{
> +	return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index);
> +}
> +
> +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
> +{
> +	return kvm_dirty_ring_used(ring) >= ring->soft_limit;
> +}
> +
> +bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
> +{
> +	return kvm_dirty_ring_used(ring) >= ring->size;
> +}
> +
> +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> +
> +	WARN_ON_ONCE(vcpu->kvm != kvm);
> +
> +	return &vcpu->dirty_ring;
> +}
> +
> +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
> +			 struct kvm_dirty_ring_indices *indices,
> +			 int index, u32 size)
> +{
> +	ring->dirty_gfns = vmalloc(size);
> +	if (!ring->dirty_gfns)
> +		return -ENOMEM;
> +	memset(ring->dirty_gfns, 0, size);
> +
> +	ring->size = size / sizeof(struct kvm_dirty_gfn);
> +	ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
> +	ring->dirty_index = 0;
> +	ring->reset_index = 0;
> +	ring->index = index;
> +	ring->indices = indices;
> +
> +	return 0;
> +}
> +
> +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> +{
> +	u32 cur_slot, next_slot;
> +	u64 cur_offset, next_offset;
> +	unsigned long mask;
> +	u32 fetch;
> +	int count = 0;
> +	struct kvm_dirty_gfn *entry;
> +	struct kvm_dirty_ring_indices *indices = ring->indices;
> +	bool first_round = true;
> +
> +	fetch = READ_ONCE(indices->fetch_index);

So this does not work if the data cache is virtually tagged, which to
the best of my knowledge isn't the case on any CPU KVM supports.
However, it might not stay that way forever.  Worth at least
commenting.


> +
> +	/*
> +	 * Note that fetch_index is written by the userspace, which
> +	 * should not be trusted.  If this happens, then it's probably
> +	 * that the userspace has written a wrong fetch_index.
> +	 */
> +	if (fetch - ring->reset_index > ring->size)
> +		return -EINVAL;
> +
> +	if (fetch == ring->reset_index)
> +		return 0;
> +
> +	/* This is only needed to make compilers happy */
> +	cur_slot = cur_offset = mask = 0;
> +	while (ring->reset_index != fetch) {
> +		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
> +		next_slot = READ_ONCE(entry->slot);
> +		next_offset = READ_ONCE(entry->offset);

What is this READ_ONCE doing?  Entries are only written by the
kernel, and it's under lock.

> +		ring->reset_index++;
> +		count++;
> +		/*
> +		 * Try to coalesce the reset operations when the guest is
> +		 * scanning pages in the same slot.
> +		 */
> +		if (!first_round && next_slot == cur_slot) {
> +			s64 delta = next_offset - cur_offset;
> +
> +			if (delta >= 0 && delta < BITS_PER_LONG) {
> +				mask |= 1ull << delta;
> +				continue;
> +			}
> +
> +			/* Backwards visit, careful about overflows!  */
> +			if (delta > -BITS_PER_LONG && delta < 0 &&
> +			    (mask << -delta >> -delta) == mask) {
> +				cur_offset = next_offset;
> +				mask = (mask << -delta) | 1;
> +				continue;
> +			}
> +		}

Well how important is this logic? Because it will not be
too effective on an SMP system, so don't you need a per-cpu ring?



> +		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +		cur_slot = next_slot;
> +		cur_offset = next_offset;
> +		mask = 1;
> +		first_round = false;
> +	}
> +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +
> +	trace_kvm_dirty_ring_reset(ring);
> +
> +	return count;
> +}
> +
> +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
> +{
> +	struct kvm_dirty_gfn *entry;
> +	struct kvm_dirty_ring_indices *indices = ring->indices;
> +
> +	/* It should never get full */
> +	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
> +
> +	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
> +	entry->slot = slot;
> +	entry->offset = offset;
> +	/*
> +	 * Make sure the data is filled in before we publish this to
> +	 * the userspace program.  There's no paired kernel-side reader.
> +	 */
> +	smp_wmb();
> +	ring->dirty_index++;


Do I understand it correctly that the ring is shared between CPUs?
If so I don't understand why it's safe for SMP guests.
Don't you need atomics or locking?


> +	WRITE_ONCE(indices->avail_index, ring->dirty_index);
> +
> +	trace_kvm_dirty_ring_push(ring, slot, offset);
> +}
> +
> +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
> +{
> +	return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE);
> +}
> +
> +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
> +{
> +	vfree(ring->dirty_gfns);
> +	ring->dirty_gfns = NULL;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 5bbd8b8730fa..5e36792e15ae 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -64,6 +64,8 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
>  
> +#include <linux/kvm_dirty_ring.h>
> +
>  /* Worst case buffer size needed for holding an integer. */
>  #define ITOA_MAX_LEN 12
>  
> @@ -357,11 +359,22 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  	vcpu->preempted = false;
>  	vcpu->ready = false;
>  
> +	if (kvm->dirty_ring_size) {
> +		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
> +					 &vcpu->run->vcpu_ring_indices,
> +					 id, kvm->dirty_ring_size);
> +		if (r)
> +			goto fail_free_run;
> +	}
> +
>  	r = kvm_arch_vcpu_init(vcpu);
>  	if (r < 0)
> -		goto fail_free_run;
> +		goto fail_free_ring;
>  	return 0;
>  
> +fail_free_ring:
> +	if (kvm->dirty_ring_size)
> +		kvm_dirty_ring_free(&vcpu->dirty_ring);
>  fail_free_run:
>  	free_page((unsigned long)vcpu->run);
>  fail:
> @@ -379,6 +392,8 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	put_pid(rcu_dereference_protected(vcpu->pid, 1));
>  	kvm_arch_vcpu_uninit(vcpu);
>  	free_page((unsigned long)vcpu->run);
> +	if (vcpu->kvm->dirty_ring_size)
> +		kvm_dirty_ring_free(&vcpu->dirty_ring);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
>  
> @@ -2284,8 +2299,13 @@ static void mark_page_dirty_in_slot(struct kvm *kvm,
>  {
>  	if (memslot && memslot->dirty_bitmap) {
>  		unsigned long rel_gfn = gfn - memslot->base_gfn;
> +		u32 slot = (memslot->as_id << 16) | memslot->id;
>  
> -		set_bit_le(rel_gfn, memslot->dirty_bitmap);
> +		if (kvm->dirty_ring_size)
> +			kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
> +					    slot, rel_gfn);
> +		else
> +			set_bit_le(rel_gfn, memslot->dirty_bitmap);
>  	}
>  }
>  
> @@ -2632,6 +2652,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
>  
> +static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
> +{
> +	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
> +		return false;
> +
> +	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
> +	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
> +	     kvm->dirty_ring_size / PAGE_SIZE);
> +}
> +
>  static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
>  {
>  	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
> @@ -2647,6 +2677,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
>  	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
>  		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
>  #endif
> +	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
> +		page = kvm_dirty_ring_get_page(
> +		    &vcpu->dirty_ring,
> +		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
>  	else
>  		return kvm_arch_vcpu_fault(vcpu, vmf);
>  	get_page(page);
> @@ -2660,6 +2694,15 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
>  
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +	struct kvm_vcpu *vcpu = file->private_data;
> +	unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> +
> +	/* If to map any writable page within dirty ring, fail it */
> +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> +	    vma->vm_flags & VM_WRITE)
> +		return -EINVAL;
> +
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
>  	return 0;
>  }
> @@ -3242,12 +3285,97 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  #endif
>  	case KVM_CAP_NR_MEMSLOTS:
>  		return KVM_USER_MEM_SLOTS;
> +	case KVM_CAP_DIRTY_LOG_RING:
> +#ifdef CONFIG_X86
> +		return KVM_DIRTY_RING_MAX_ENTRIES;
> +#else
> +		return 0;
> +#endif
>  	default:
>  		break;
>  	}
>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>  }
>  
> +void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
> +{
> +	struct kvm_memory_slot *memslot;
> +	int as_id, id;
> +
> +	as_id = slot >> 16;
> +	id = (u16)slot;
> +	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
> +		return;
> +
> +	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
> +	if (offset >= memslot->npages)
> +		return;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
> +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
> +{
> +	int r;
> +
> +	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
> +		return -EINVAL;
> +
> +	/* the size should be power of 2 */
> +	if (!size || (size & (size - 1)))
> +		return -EINVAL;
> +
> +	/* Should be bigger to keep the reserved entries, or a page */
> +	if (size < kvm_dirty_ring_get_rsvd_entries() *
> +	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
> +		return -EINVAL;
> +
> +	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
> +	    sizeof(struct kvm_dirty_gfn))
> +		return -E2BIG;
> +
> +	/* We only allow it to set once */
> +	if (kvm->dirty_ring_size)
> +		return -EINVAL;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	if (kvm->created_vcpus) {
> +		/* We don't allow to change this value after vcpu created */
> +		r = -EINVAL;
> +	} else {
> +		kvm->dirty_ring_size = size;
> +		r = 0;
> +	}
> +
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> +
> +static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +	int cleared = 0;
> +
> +	if (!kvm->dirty_ring_size)
> +		return -EINVAL;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm)
> +		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return cleared;
> +}
> +
>  int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  						  struct kvm_enable_cap *cap)
>  {
> @@ -3265,6 +3393,8 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
>  		kvm->manual_dirty_log_protect = cap->args[0];
>  		return 0;
>  #endif
> +	case KVM_CAP_DIRTY_LOG_RING:
> +		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
>  	default:
>  		return kvm_vm_ioctl_enable_cap(kvm, cap);
>  	}
> @@ -3452,6 +3582,9 @@ static long kvm_vm_ioctl(struct file *filp,
>  	case KVM_CHECK_EXTENSION:
>  		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
>  		break;
> +	case KVM_RESET_DIRTY_RINGS:
> +		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
> +		break;
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  	}
> -- 
> 2.24.1
Alex Williamson Jan. 9, 2020, 4:56 p.m. UTC | #2
On Thu, 9 Jan 2020 11:29:28 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > This patch is heavily based on previous work from Lei Cao
> > <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]
> > 
> > KVM currently uses large bitmaps to track dirty memory.  These bitmaps
> > are copied to userspace when userspace queries KVM for its dirty page
> > information.  The use of bitmaps is mostly sufficient for live
> > migration, as large parts of memory are be dirtied from one log-dirty
> > pass to another.  However, in a checkpointing system, the number of
> > dirty pages is small and in fact it is often bounded---the VM is
> > paused when it has dirtied a pre-defined number of pages. Traversing a
> > large, sparsely populated bitmap to find set bits is time-consuming,
> > as is copying the bitmap to user-space.
> > 
> > A similar issue will be there for live migration when the guest memory
> > is huge while the page dirty procedure is trivial.  In that case for
> > each dirty sync we need to pull the whole dirty bitmap to userspace
> > and analyse every bit even if it's mostly zeros.
> > 
> > The preferred data structure for above scenarios is a dense list of
> > guest frame numbers (GFN).  
> 
> No longer, this uses an array of structs.
> 
> >  This patch series stores the dirty list in
> > kernel memory that can be memory mapped into userspace to allow speedy
> > harvesting.
> > 
> > This patch enables dirty ring for X86 only.  However it should be
> > easily extended to other archs as well.
> > 
> > [1] https://patchwork.kernel.org/patch/10471409/
> > 
> > Signed-off-by: Lei Cao <lei.cao@stratus.com>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  Documentation/virt/kvm/api.txt  |  89 ++++++++++++++++++
> >  arch/x86/include/asm/kvm_host.h |   3 +
> >  arch/x86/include/uapi/asm/kvm.h |   1 +
> >  arch/x86/kvm/Makefile           |   3 +-
> >  arch/x86/kvm/mmu/mmu.c          |   6 ++
> >  arch/x86/kvm/vmx/vmx.c          |   7 ++
> >  arch/x86/kvm/x86.c              |   9 ++
> >  include/linux/kvm_dirty_ring.h  |  55 +++++++++++
> >  include/linux/kvm_host.h        |  26 +++++
> >  include/trace/events/kvm.h      |  78 +++++++++++++++
> >  include/uapi/linux/kvm.h        |  33 +++++++
> >  virt/kvm/dirty_ring.c           | 162 ++++++++++++++++++++++++++++++++
> >  virt/kvm/kvm_main.c             | 137 ++++++++++++++++++++++++++-
> >  13 files changed, 606 insertions(+), 3 deletions(-)
> >  create mode 100644 include/linux/kvm_dirty_ring.h
> >  create mode 100644 virt/kvm/dirty_ring.c
> > 
> > diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> > index ebb37b34dcfc..708c3e0f7eae 100644
> > --- a/Documentation/virt/kvm/api.txt
> > +++ b/Documentation/virt/kvm/api.txt
> > @@ -231,6 +231,7 @@ Based on their initialization different VMs may have different capabilities.
> >  It is thus encouraged to use the vm ioctl to query for capabilities (available
> >  with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
> >  
> > +
> >  4.5 KVM_GET_VCPU_MMAP_SIZE
> >  
> >  Capability: basic
> > @@ -243,6 +244,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared
> >  memory region.  This ioctl returns the size of that region.  See the
> >  KVM_RUN documentation for details.
> >  
> > +Besides the size of the KVM_RUN communication region, other areas of
> > +the VCPU file descriptor can be mmap-ed, including:
> > +
> > +- if KVM_CAP_COALESCED_MMIO is available, a page at
> > +  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
> > +  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
> > +  KVM_CAP_COALESCED_MMIO is not documented yet.
> > +
> > +- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
> > +  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
> > +  KVM_CAP_DIRTY_LOG_RING, see section 8.3.
> > +
> >  
> >  4.6 KVM_SET_MEMORY_REGION
> >  
> > @@ -5376,6 +5389,7 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
> >  AArch64, this value will be reported in the ISS field of ESR_ELx.
> >  
> >  See KVM_CAP_VCPU_EVENTS for more details.
> > +
> >  8.20 KVM_CAP_HYPERV_SEND_IPI
> >  
> >  Architectures: x86
> > @@ -5383,6 +5397,7 @@ Architectures: x86
> >  This capability indicates that KVM supports paravirtualized Hyper-V IPI send
> >  hypercalls:
> >  HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> > +
> >  8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
> >  
> >  Architecture: x86
> > @@ -5396,3 +5411,77 @@ handling by KVM (as some KVM hypercall may be mistakenly treated as TLB
> >  flush hypercalls by Hyper-V) so userspace should disable KVM identification
> >  in CPUID and only exposes Hyper-V identification. In this case, guest
> >  thinks it's running on Hyper-V and only use Hyper-V hypercalls.
> > +
> > +8.22 KVM_CAP_DIRTY_LOG_RING
> > +
> > +Architectures: x86
> > +Parameters: args[0] - size of the dirty log ring
> > +
> > +KVM is capable of tracking dirty memory using ring buffers that are
> > +mmaped into userspace; there is one dirty ring per vcpu.
> > +
> > +One dirty ring is defined as below internally:
> > +
> > +struct kvm_dirty_ring {
> > +	u32 dirty_index;
> > +	u32 reset_index;
> > +	u32 size;
> > +	u32 soft_limit;
> > +	struct kvm_dirty_gfn *dirty_gfns;
> > +	struct kvm_dirty_ring_indices *indices;
> > +	int index;
> > +};
> > +
> > +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> > +For each of the dirty entry it's defined as:
> > +
> > +struct kvm_dirty_gfn {
> > +        __u32 pad;  
> 
> How about sticking a length here?
> This way huge pages can be dirtied in one go.

Not just huge pages, but any contiguous range of dirty pages could be
reported far more concisely.  Thanks,

Alex
Peter Xu Jan. 9, 2020, 7:15 p.m. UTC | #3
On Thu, Jan 09, 2020 at 11:29:28AM -0500, Michael S. Tsirkin wrote:
> On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > This patch is heavily based on previous work from Lei Cao
> > <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]
> > 
> > KVM currently uses large bitmaps to track dirty memory.  These bitmaps
> > are copied to userspace when userspace queries KVM for its dirty page
> > information.  The use of bitmaps is mostly sufficient for live
> > migration, as large parts of memory are be dirtied from one log-dirty
> > pass to another.  However, in a checkpointing system, the number of
> > dirty pages is small and in fact it is often bounded---the VM is
> > paused when it has dirtied a pre-defined number of pages. Traversing a
> > large, sparsely populated bitmap to find set bits is time-consuming,
> > as is copying the bitmap to user-space.
> > 
> > A similar issue will be there for live migration when the guest memory
> > is huge while the page dirty procedure is trivial.  In that case for
> > each dirty sync we need to pull the whole dirty bitmap to userspace
> > and analyse every bit even if it's mostly zeros.
> > 
> > The preferred data structure for above scenarios is a dense list of
> > guest frame numbers (GFN).
> 
> No longer, this uses an array of structs.

(IMHO it's more or less a wording thing, because it's still an array
 of GFNs behind it...)

[...]

> > +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> > +For each of the dirty entry it's defined as:
> > +
> > +struct kvm_dirty_gfn {
> > +        __u32 pad;
> 
> How about sticking a length here?
> This way huge pages can be dirtied in one go.

As we've discussed previously, current KVM tracks dirty pages at 4K
granularity only, so this seems to be something that is not easily
covered in this series.

We probably need to justify having KVM track huge pages first, or at
least a trend that we're going to do that; then we can properly
reserve it here.

> 
> > +        __u32 slot; /* as_id | slot_id */
> > +        __u64 offset;
> > +};
> > +
> > +Most of the ring structure is used by KVM internally, while only the
> > +indices are exposed to userspace:
> > +
> > +struct kvm_dirty_ring_indices {
> > +	__u32 avail_index; /* set by kernel */
> > +	__u32 fetch_index; /* set by userspace */
> > +};
> > +
> > +The two indices in the ring buffer are free running counters.
> > +
> > +Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
> > +to enable this capability for the new guest and set the size of the
> > +rings.  It is only allowed before creating any vCPU, and the size of
> > +the ring must be a power of two.
> 
> 
> I know index design is popular, but testing with virtio showed
> that it's better to just have a flags field marking
> an entry as valid. In particular this gets rid of the
> running counters and power of two limitations.
> It also removes the need for a separate index page, which is nice.

Firstly, note that the separate index page has already been dropped
since V2, so we don't need to worry on that.

Regarding dropping the indices: I feel like it can be done, though we
probably need two extra bits for each GFN entry, for example:

  - Bit 0 of the GFN address to show whether this is a valid publish
    of a dirty gfn

  - Bit 1 of the GFN address to show whether this has been collected
    by the user

We can also use the padding field, but I just want to show the idea
first.

Then each GFN can go through state changes like this (where "00b"
stands for the "bit1 bit0" value):

  00b (invalid GFN) ->
    01b (valid gfn published by kernel, which is dirty) ->
      10b (gfn dirty page collected by userspace) ->
        00b (gfn reset by kernel, so goes back to invalid gfn)

And we should always guarantee that both userspace and KVM walk the
GFN array in a linear manner; for example, KVM must publish a new GFN
with bit 0 set right after the previously published GFN.  Vice versa
for userspace when it collects a dirty GFN and sets bit 1.
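
To make that concrete, here is a rough sketch of what a flags-based
entry and its lifecycle could look like (purely hypothetical, names
made up, not what this series implements):

#include <linux/types.h>

/* Hypothetical flag bits, carried in the entry itself: */
#define DIRTY_GFN_F_DIRTY      (1u << 0)   /* published by KVM */
#define DIRTY_GFN_F_COLLECTED  (1u << 1)   /* collected by userspace */

struct dirty_gfn_flagged {
	__u32 flags;           /* 00b free, 01b dirty, 10b collected */
	__u32 slot;            /* as_id | slot_id */
	__u64 offset;
};

/*
 * Both sides walk the array linearly, so no indices and no
 * power-of-two requirement are needed:
 *
 *   KVM:       flags == 0            -> fill slot/offset, set F_DIRTY
 *   userspace: flags == F_DIRTY      -> consume entry, set F_COLLECTED
 *                                       (and clear F_DIRTY)
 *   KVM:       flags == F_COLLECTED  -> reprotect the page, clear flags
 */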

Michael, do you mean something like this?

I think it should work logically; however, IIUC it can expose more
security risks, say, the dirty ring is different from virtio in that
userspace is not trusted, while for virtio both sides (the hypervisor
and the guest driver) are trusted.  The above means we need to do the
following to change to the new design:

  - Allow the GFN array to be mapped as writable by userspace (so that
    userspace can publish bit 1),

  - The userspace must be trusted to follow the design (just imagine
    what happens if the userspace overwrites a GFN when it publishes
    bit 1 over a valid dirty gfn entry?  KVM could wrongly unprotect a
    page for the guest...).

While if we use the indices, we restrict userspace to being able to
write to one index only (which is the fetch_index).  That's all it can
do to mess things up (and it never could, as long as we properly
validate the fetch_index when it is read, which only happens during
KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
indices solution still has its benefits.

> 
> 
> 
> >  The larger the ring buffer, the less
> > +likely the ring is full and the VM is forced to exit to userspace. The
> > +optimal size depends on the workload, but it is recommended that it be
> > +at least 64 KiB (4096 entries).
> 
> Where's this number coming from? Given you have indices as well,
> 4K size rings is likely to cause cache contention.

I think we've had some similar discussion in previous versions on the
size of the ring.  Again, imho it's really something that may not have
a definitive answer as long as it's big enough (4K entries should be).

Regarding the cache contention: could you explain more?  Do you have a
suggested ring size instead, considering the issue?

[...]

> > +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> > +{
> > +	u32 cur_slot, next_slot;
> > +	u64 cur_offset, next_offset;
> > +	unsigned long mask;
> > +	u32 fetch;
> > +	int count = 0;
> > +	struct kvm_dirty_gfn *entry;
> > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > +	bool first_round = true;
> > +
> > +	fetch = READ_ONCE(indices->fetch_index);
> 
> So this does not work if the data cache is virtually tagged.
> Which to the best of my knowledge isn't the case on any
> CPU kvm supports. However it might not stay being the
> case forever. Worth at least commenting.

This is the read side.  IIUC even with virtually tagged archs, we
should do the flushing on the write side rather than the read side,
and that should be enough?

Also, I believe this is similar to the question that Jason asked in
V2.  Sorry, I should have mentioned this earlier, but I didn't address
it in this series because if we need to do so, we probably need to do
it KVM-wide rather than only in this series.  I feel like it's missing
probably only because all existing KVM-supported archs do not have
virtually tagged caches, as you mentioned.  If so, I would prefer to
ignore that issue until KVM starts to support such an arch.

> 
> 
> > +
> > +	/*
> > +	 * Note that fetch_index is written by the userspace, which
> > +	 * should not be trusted.  If this happens, then it's probably
> > +	 * that the userspace has written a wrong fetch_index.
> > +	 */
> > +	if (fetch - ring->reset_index > ring->size)
> > +		return -EINVAL;
> > +
> > +	if (fetch == ring->reset_index)
> > +		return 0;
> > +
> > +	/* This is only needed to make compilers happy */
> > +	cur_slot = cur_offset = mask = 0;
> > +	while (ring->reset_index != fetch) {
> > +		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
> > +		next_slot = READ_ONCE(entry->slot);
> > +		next_offset = READ_ONCE(entry->offset);
> 
> What is this READ_ONCE doing? Entries are only written by kernel
> and it's under lock.

The entries are written in kvm_dirty_ring_push(), where no lock is
held (there is one wmb() though, to guarantee the ordering of the
entry writes against the index update).

With the wmb(), the write side should guarantee the data makes it to
memory.  For the read side here, I think it's still good to have it to
make sure we read from memory?

> 
> > +		ring->reset_index++;
> > +		count++;
> > +		/*
> > +		 * Try to coalesce the reset operations when the guest is
> > +		 * scanning pages in the same slot.
> > +		 */
> > +		if (!first_round && next_slot == cur_slot) {
> > +			s64 delta = next_offset - cur_offset;
> > +
> > +			if (delta >= 0 && delta < BITS_PER_LONG) {
> > +				mask |= 1ull << delta;
> > +				continue;
> > +			}
> > +
> > +			/* Backwards visit, careful about overflows!  */
> > +			if (delta > -BITS_PER_LONG && delta < 0 &&
> > +			    (mask << -delta >> -delta) == mask) {
> > +				cur_offset = next_offset;
> > +				mask = (mask << -delta) | 1;
> > +				continue;
> > +			}
> > +		}
> 
> Well how important is this logic? Because it will not be
> too effective on an SMP system, so don't you need a per-cpu ring?

It's my fault for having omitted the high-level design in the cover
letter, but we do have per-vcpu rings now.  Actually that's all we
have (we dropped the per-vm ring already), so ring access does not
need a lock any more.

This logic is useful because kvm_reset_dirty_gfn() ends up calling
kvm_arch_mmu_enable_log_dirty_pt_masked(), which supports masks, so it
is good to do the reset for contiguous pages (or pages that are close
enough) in a single shot.
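
As an illustrative example of the coalescing: suppose the ring holds,
in order, entries for slot 1 at offsets 0x10, 0x11, 0x13, 0x12.  The
reset loop accumulates deltas against cur_offset = 0x10:

	/*
	 * 0x11: delta = 1 -> mask |= 1ull << 1
	 * 0x13: delta = 3 -> mask |= 1ull << 3
	 * 0x12: delta = 2 -> mask |= 1ull << 2
	 *
	 * Together with bit 0 for 0x10 itself, the loop ends with
	 * mask == 0xf, so a single kvm_reset_dirty_gfn(kvm, 1, 0x10, 0xf)
	 * call (hence a single kvm_arch_mmu_enable_log_dirty_pt_masked()
	 * invocation) reprotects all four pages instead of four calls.
	 */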

> 
> 
> 
> > +		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> > +		cur_slot = next_slot;
> > +		cur_offset = next_offset;
> > +		mask = 1;
> > +		first_round = false;
> > +	}
> > +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> > +
> > +	trace_kvm_dirty_ring_reset(ring);
> > +
> > +	return count;
> > +}
> > +
> > +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
> > +{
> > +	struct kvm_dirty_gfn *entry;
> > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > +
> > +	/* It should never get full */
> > +	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
> > +
> > +	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
> > +	entry->slot = slot;
> > +	entry->offset = offset;
> > +	/*
> > +	 * Make sure the data is filled in before we publish this to
> > +	 * the userspace program.  There's no paired kernel-side reader.
> > +	 */
> > +	smp_wmb();
> > +	ring->dirty_index++;
> 
> 
> Do I understand it correctly that the ring is shared between CPUs?
> If so I don't understand why it's safe for SMP guests.
> Don't you need atomics or locking?

No, it's per-vcpu.

Thanks,
Peter Xu Jan. 9, 2020, 7:21 p.m. UTC | #4
On Thu, Jan 09, 2020 at 09:56:10AM -0700, Alex Williamson wrote:

[...]

> > > +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> > > +For each of the dirty entry it's defined as:
> > > +
> > > +struct kvm_dirty_gfn {
> > > +        __u32 pad;  
> > 
> > How about sticking a length here?
> > This way huge pages can be dirtied in one go.
> 
> Not just huge pages, but any contiguous range of dirty pages could be
> reported far more concisely.  Thanks,

I replied in the other thread on why I thought KVM might not suit
that (while vfio may).

Actually we could even do that for KVM, as long as we keep a per-vcpu
last-dirtied GFN range cache (so we don't publish a dirty GFN right
after it's dirtied); then we grow that cached range as long as the
next/previous contiguous page is dirtied.  If we find that the current
dirty GFN is not contiguous with the cached range, we publish the
cached range and let the new GFN become the start of the last-dirtied
range cache.
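
A rough sketch of such a cache (hypothetical, not implemented in this
series; it assumes a kvm_dirty_ring_push_range() helper that takes a
length, as Michael suggested, and only grows forward for brevity):

#include <linux/kvm_host.h>

struct dirty_range_cache {
	u32 slot;	/* as_id | slot_id */
	u64 start;	/* first gfn offset of the cached range */
	u64 len;	/* 0 means the cache is empty */
};

static void cache_dirty_gfn(struct kvm_vcpu *vcpu,
			    struct dirty_range_cache *c,
			    u32 slot, u64 offset)
{
	if (c->len && slot == c->slot) {
		/* Already covered by the cached range: nothing to do. */
		if (offset >= c->start && offset < c->start + c->len)
			return;
		/* Contiguous with the cached range: just grow it. */
		if (offset == c->start + c->len) {
			c->len++;
			return;
		}
	}

	/* Not contiguous: publish what we have, then restart the cache. */
	if (c->len)
		kvm_dirty_ring_push_range(&vcpu->dirty_ring,
					  c->slot, c->start, c->len);
	c->slot = slot;
	c->start = offset;
	c->len = 1;
}

/* The cache would also need to be flushed (published) before each
 * dirty sync, e.g. when the vcpu exits to userspace. */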

However I am not sure how much we'll gain from it.  Maybe we can do
that when we have a real use case for it.  For now I'm not sure
whether it would be worth the effort.

Thanks,
Michael S. Tsirkin Jan. 9, 2020, 7:35 p.m. UTC | #5
On Thu, Jan 09, 2020 at 02:15:14PM -0500, Peter Xu wrote:
> On Thu, Jan 09, 2020 at 11:29:28AM -0500, Michael S. Tsirkin wrote:
> > On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > > This patch is heavily based on previous work from Lei Cao
> > > <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]
> > > 
> > > KVM currently uses large bitmaps to track dirty memory.  These bitmaps
> > > are copied to userspace when userspace queries KVM for its dirty page
> > > information.  The use of bitmaps is mostly sufficient for live
> > > migration, as large parts of memory are be dirtied from one log-dirty
> > > pass to another.  However, in a checkpointing system, the number of
> > > dirty pages is small and in fact it is often bounded---the VM is
> > > paused when it has dirtied a pre-defined number of pages. Traversing a
> > > large, sparsely populated bitmap to find set bits is time-consuming,
> > > as is copying the bitmap to user-space.
> > > 
> > > A similar issue will be there for live migration when the guest memory
> > > is huge while the page dirty procedure is trivial.  In that case for
> > > each dirty sync we need to pull the whole dirty bitmap to userspace
> > > and analyse every bit even if it's mostly zeros.
> > > 
> > > The preferred data structure for above scenarios is a dense list of
> > > guest frame numbers (GFN).
> > 
> > No longer, this uses an array of structs.
> 
> (IMHO it's more or less a wording thing, because it's still an array
>  of GFNs behind it...)
> 
> [...]
> 
> > > +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> > > +For each of the dirty entry it's defined as:
> > > +
> > > +struct kvm_dirty_gfn {
> > > +        __u32 pad;
> > 
> > How about sticking a length here?
> > This way huge pages can be dirtied in one go.
> 
> As we've discussed previously, current KVM tracks dirty in 4K page
> only, so it seems to be something that is not easily covered in this
> series.
> 
> We probably need to justify on having KVM to track huge pages first,
> or at least a trend that we're going to do that, then we can properly
> reserve it here.
> 
> > 
> > > +        __u32 slot; /* as_id | slot_id */
> > > +        __u64 offset;
> > > +};
> > > +
> > > +Most of the ring structure is used by KVM internally, while only the
> > > +indices are exposed to userspace:
> > > +
> > > +struct kvm_dirty_ring_indices {
> > > +	__u32 avail_index; /* set by kernel */
> > > +	__u32 fetch_index; /* set by userspace */
> > > +};
> > > +
> > > +The two indices in the ring buffer are free running counters.
> > > +
> > > +Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
> > > +to enable this capability for the new guest and set the size of the
> > > +rings.  It is only allowed before creating any vCPU, and the size of
> > > +the ring must be a power of two.
> > 
> > 
> > I know index design is popular, but testing with virtio showed
> > that it's better to just have a flags field marking
> > an entry as valid. In particular this gets rid of the
> > running counters and power of two limitations.
> > It also removes the need for a separate index page, which is nice.
> 
> Firstly, note that the separate index page has already been dropped
> since V2, so we don't need to worry on that.

changelog would be nice.
So now, how does userspace tell kvm it's done with the ring?

> Regarding dropping the indices: I feel like it can be done, though we
> probably need two extra bits for each GFN entry, for example:
> 
>   - Bit 0 of the GFN address to show whether this is a valid publish
>     of dirty gfn
> 
>   - Bit 1 of the GFN address to show whether this is collected by the
>     user


I wonder whether you will end up reinventing virtio.
You are already pretty close with avail/used bits in flags.



> We can also use the padding field, but just want to show the idea
> first.
> 
> Then for each GFN we can go through state changes like this (things
> like "00b" stands for "bit1 bit0" values):
> 
>   00b (invalid GFN) ->
>     01b (valid gfn published by kernel, which is dirty) ->
>       10b (gfn dirty page collected by userspace) ->
>         00b (gfn reset by kernel, so goes back to invalid gfn)
> 
> And we should always guarantee that both the userspace and KVM walks
> the GFN array in a linear manner, for example, KVM must publish a new
> GFN with bit 1 set right after the previous publish of GFN.  Vice
> versa to the userspace when it collects the dirty GFN and mark bit 2.
> 
> Michael, do you mean something like this?
> 
> I think it should work logically, however IIUC it can expose more
> security risks, say, dirty ring is different from virtio in that
> userspace is not trusted,

In what sense?

> while for virtio, both sides (hypervisor,
> and the guest driver) are trusted.

What gave you the impression guest is trusted in virtio?


>  Above means we need to do these to
> change to the new design:
> 
>   - Allow the GFN array to be mapped as writable by userspace (so that
>     userspace can publish bit 2),
> 
>   - The userspace must be trusted to follow the design (just imagine
>     what if the userspace overwrites a GFN when it publishes bit 2
>     over a valid dirty gfn entry?  KVM could wrongly unprotect a page
>     for the guest...).

You mean protect, right?  So what?

> While if we use the indices, we restrict the userspace to only be able
> to write to one index only (which is the reset_index).  That's all it
> can do to mess things up (and it could never as long as we properly
> validate the reset_index when read, which only happens during
> KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
> indices solution still has its benefits.

So if you mess up the index, how is this different?

I agree an RO page kind of feels safer generally, though.

I will have to re-read how the ring works though;
my comments were based on the old assumption of an
mmaped page with indices.



> > 
> > 
> > 
> > >  The larger the ring buffer, the less
> > > +likely the ring is full and the VM is forced to exit to userspace. The
> > > +optimal size depends on the workload, but it is recommended that it be
> > > +at least 64 KiB (4096 entries).
> > 
> > Where's this number coming from? Given you have indices as well,
> > 4K size rings is likely to cause cache contention.
> 
> I think we've had some similar discussion in previous versions on the
> size of ring.  Again imho it's really something that may not have a
> direct clue as long as it's big enough (4K should be).
> 
> Regarding to the cache contention: could you explain more?

4K is a whole cache way, 64K is 16 ways.  If anything else is on a hot
path then you are pushing everything out of the cache.  I need to
re-read how the indices work to see whether an index is on a hot path
or not.  If it is, your structure won't fit in the L1 cache, which is
not great.
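
(For scale: sizeof(struct kvm_dirty_gfn) is 4 + 4 + 8 = 16 bytes, so a
4096-entry ring is 4096 * 16 = 64 KiB of GFN entries, i.e. 1024 64-byte
cache lines, plus the indices.)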


>  Do you
> have a suggestion on the size of ring instead considering the issue?
> 
> [...]

I'll have to re-learn how things work with the indices gone
from shared memory.

> > > +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> > > +{
> > > +	u32 cur_slot, next_slot;
> > > +	u64 cur_offset, next_offset;
> > > +	unsigned long mask;
> > > +	u32 fetch;
> > > +	int count = 0;
> > > +	struct kvm_dirty_gfn *entry;
> > > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > > +	bool first_round = true;
> > > +
> > > +	fetch = READ_ONCE(indices->fetch_index);
> > 
> > So this does not work if the data cache is virtually tagged.
> > Which to the best of my knowledge isn't the case on any
> > CPU kvm supports. However it might not stay being the
> > case forever. Worth at least commenting.
> 
> This is the read side.  IIUC even if with virtually tagged archs, we
> should do the flushing on the write side rather than the read side,
> and that should be enough?

No.
See e.g.  Documentation/core-api/cachetlb.rst

  ``void flush_dcache_page(struct page *page)``

        Any time the kernel writes to a page cache page, _OR_
        the kernel is about to read from a page cache page and
        user space shared/writable mappings of this page potentially
        exist, this routine is called.


> Also, I believe this is the similar question that Jason has asked in
> V2.  Sorry I should mention this earlier, but I didn't address that in
> this series because if we need to do so we probably need to do it
> kvm-wise, rather than only in this series.

You need to document these things.

>  I feel like it's missing
> probably only because all existing KVM supported archs do not have
> virtual-tagged caches as you mentioned.

But is that a fact?  ARM has such a variety of CPUs that I can't
really tell.  Did you research this to make sure?

> If so, I would prefer if you
> can allow me to ignore that issue until KVM starts to support such an
> arch.

Document limitations pls.  Don't ignore them.

> > 
> > 
> > > +
> > > +	/*
> > > +	 * Note that fetch_index is written by the userspace, which
> > > +	 * should not be trusted.  If this happens, then it's probably
> > > +	 * that the userspace has written a wrong fetch_index.
> > > +	 */
> > > +	if (fetch - ring->reset_index > ring->size)
> > > +		return -EINVAL;
> > > +
> > > +	if (fetch == ring->reset_index)
> > > +		return 0;
> > > +
> > > +	/* This is only needed to make compilers happy */
> > > +	cur_slot = cur_offset = mask = 0;
> > > +	while (ring->reset_index != fetch) {
> > > +		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
> > > +		next_slot = READ_ONCE(entry->slot);
> > > +		next_offset = READ_ONCE(entry->offset);
> > 
> > What is this READ_ONCE doing? Entries are only written by kernel
> > and it's under lock.
> 
> The entries are written in kvm_dirty_ring_push() where there should
> have no lock (there's one wmb() though to guarantee ordering of these
> and the index update).
> 
> With the wmb(), the write side should guarantee to make it to memory.
> For the read side here, I think it's still good to have it to make
> sure we read from memory?
> 
> > 
> > > +		ring->reset_index++;
> > > +		count++;
> > > +		/*
> > > +		 * Try to coalesce the reset operations when the guest is
> > > +		 * scanning pages in the same slot.
> > > +		 */
> > > +		if (!first_round && next_slot == cur_slot) {
> > > +			s64 delta = next_offset - cur_offset;
> > > +
> > > +			if (delta >= 0 && delta < BITS_PER_LONG) {
> > > +				mask |= 1ull << delta;
> > > +				continue;
> > > +			}
> > > +
> > > +			/* Backwards visit, careful about overflows!  */
> > > +			if (delta > -BITS_PER_LONG && delta < 0 &&
> > > +			    (mask << -delta >> -delta) == mask) {
> > > +				cur_offset = next_offset;
> > > +				mask = (mask << -delta) | 1;
> > > +				continue;
> > > +			}
> > > +		}
> > 
> > Well how important is this logic? Because it will not be
> > too effective on an SMP system, so don't you need a per-cpu ring?
> 
> It's my fault to have omit the high-level design in the cover letter,
> but we do have per-vcpu ring now.  Actually that's what we only have
> (we dropped the per-vm ring already) so ring access does not need lock
> any more.
> 
> This logic is good because kvm_reset_dirty_gfn, especially inside that
> there's kvm_arch_mmu_enable_log_dirty_pt_masked() that supports masks,
> so it would be good to do the reset for continuous pages (or page
> that's close enough) in a single shot.
> 
> > 
> > 
> > 
> > > +		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> > > +		cur_slot = next_slot;
> > > +		cur_offset = next_offset;
> > > +		mask = 1;
> > > +		first_round = false;
> > > +	}
> > > +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> > > +
> > > +	trace_kvm_dirty_ring_reset(ring);
> > > +
> > > +	return count;
> > > +}
> > > +
> > > +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
> > > +{
> > > +	struct kvm_dirty_gfn *entry;
> > > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > > +
> > > +	/* It should never get full */
> > > +	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
> > > +
> > > +	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
> > > +	entry->slot = slot;
> > > +	entry->offset = offset;
> > > +	/*
> > > +	 * Make sure the data is filled in before we publish this to
> > > +	 * the userspace program.  There's no paired kernel-side reader.
> > > +	 */
> > > +	smp_wmb();
> > > +	ring->dirty_index++;
> > 
> > 
> > Do I understand it correctly that the ring is shared between CPUs?
> > If so I don't understand why it's safe for SMP guests.
> > Don't you need atomics or locking?
> 
> No, it's per-vcpu.
> 
> Thanks,
> 
> -- 
> Peter Xu
Michael S. Tsirkin Jan. 9, 2020, 7:36 p.m. UTC | #6
On Thu, Jan 09, 2020 at 02:21:16PM -0500, Peter Xu wrote:
> On Thu, Jan 09, 2020 at 09:56:10AM -0700, Alex Williamson wrote:
> 
> [...]
> 
> > > > +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> > > > +For each of the dirty entry it's defined as:
> > > > +
> > > > +struct kvm_dirty_gfn {
> > > > +        __u32 pad;  
> > > 
> > > How about sticking a length here?
> > > This way huge pages can be dirtied in one go.
> > 
> > Not just huge pages, but any contiguous range of dirty pages could be
> > reported far more concisely.  Thanks,
> 
> I replied in the other thread on why I thought KVM might not suite
> that (while vfio may).
> 
> Actually we can even do that for KVM as long as we keep a per-vcpu
> last-dirtied GFN range cache (so we don't publish a dirty GFN right
> after it's dirtied), then we grow that cached dirtied range as long as
> the continuous next/previous page is dirtied.  If we found that the
> current dirty GFN is not continuous to the cached range, we publish
> the cached range and let the new GFN be the starting of last-dirtied
> GFN range cache.
> 
> However I am not sure how much we'll gain from it.  Maybe we can do
> that when we have a real use case for it.  For now I'm not sure
> whether it would worth the effort.
> 
> Thanks,

I agree for the implementation, but I think the UAPI should support that
from the ground up so we don't need to support two kinds of formats.

> -- 
> Peter Xu
Peter Xu Jan. 9, 2020, 8:19 p.m. UTC | #7
On Thu, Jan 09, 2020 at 02:35:46PM -0500, Michael S. Tsirkin wrote:

[...]

> > > I know index design is popular, but testing with virtio showed
> > > that it's better to just have a flags field marking
> > > an entry as valid. In particular this gets rid of the
> > > running counters and power of two limitations.
> > > It also removes the need for a separate index page, which is nice.
> > 
> > Firstly, note that the separate index page has already been dropped
> > since V2, so we don't need to worry on that.
> 
> changelog would be nice.

Actually I mentioned it in V2:

https://lore.kernel.org/kvm/20191221014938.58831-1-peterx@redhat.com/

There's a section "Per-vm ring is dropped".  But that point is indeed
hidden behind the fact that the index page was bound to the per-vm
ring...  I'll try to be clearer in the cover letter in the future.

> So now, how does userspace tell kvm it's done with the ring?

It rarely needs to tell KVM at all, unless the ring reaches soft-full;
in that case the vcpu's KVM_RUN will return with KVM_EXIT_DIRTY_RING_FULL.
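
For illustration, the userspace side would then look roughly like the
sketch below (not part of this series; harvest_dirty_ring() is a
stand-in for walking the mmaped dirty_gfns array, and the exact ioctl
calling convention for KVM_RESET_DIRTY_RINGS is assumed):

  /* Sketch of a vcpu loop reacting to a soft-full dirty ring. */
  static void harvest_dirty_ring(struct kvm_run *run);  /* hypothetical */

  static void vcpu_loop(int vm_fd, int vcpu_fd, struct kvm_run *run)
  {
          for (;;) {
                  if (ioctl(vcpu_fd, KVM_RUN, 0) < 0 && errno != EINTR)
                          err(1, "KVM_RUN");

                  switch (run->exit_reason) {
                  case KVM_EXIT_DIRTY_RING_FULL:
                          /* Collect dirty GFNs, then let KVM re-protect them. */
                          harvest_dirty_ring(run);
                          ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);
                          break;
                  /* ... other exit reasons ... */
                  }
          }
  }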

> 
> > Regarding dropping the indices: I feel like it can be done, though we
> > probably need two extra bits for each GFN entry, for example:
> > 
> >   - Bit 0 of the GFN address to show whether this is a valid publish
> >     of dirty gfn
> > 
> >   - Bit 1 of the GFN address to show whether this is collected by the
> >     user
> 
> 
> I wonder whether you will end up reinventing virtio.
> You are already pretty close with avail/used bits in flags.
> 
> 
> 
> > We can also use the padding field, but just want to show the idea
> > first.
> > 
> > Then for each GFN we can go through state changes like this (things
> > like "00b" stands for "bit1 bit0" values):
> > 
> >   00b (invalid GFN) ->
> >     01b (valid gfn published by kernel, which is dirty) ->
> >       10b (gfn dirty page collected by userspace) ->
> >         00b (gfn reset by kernel, so goes back to invalid gfn)
> > 
> > And we should always guarantee that both the userspace and KVM walks
> > the GFN array in a linear manner, for example, KVM must publish a new
> > GFN with bit 1 set right after the previous publish of GFN.  Vice
> > versa to the userspace when it collects the dirty GFN and mark bit 2.
> > 
> > Michael, do you mean something like this?
> > 
> > I think it should work logically, however IIUC it can expose more
> > security risks, say, dirty ring is different from virtio in that
> > userspace is not trusted,
> 
> In what sense?

In the sense of general syscalls?  Like, we shouldn't allow the kernel
to break and go wild no matter what the userspace does?

> 
> > while for virtio, both sides (hypervisor,
> > and the guest driver) are trusted.
> 
> What gave you the impression guest is trusted in virtio?

Hmm... maybe from knowing that virtio can bypass the vIOMMU as long as
it doesn't provide the IOMMU_PLATFORM flag? :)

I think it's logical to trust a virtio guest kernel driver; could you
point out what I've missed?

> 
> 
> >  Above means we need to do these to
> > change to the new design:
> > 
> >   - Allow the GFN array to be mapped as writable by userspace (so that
> >     userspace can publish bit 2),
> > 
> >   - The userspace must be trusted to follow the design (just imagine
> >     what if the userspace overwrites a GFN when it publishes bit 2
> >     over a valid dirty gfn entry?  KVM could wrongly unprotect a page
> >     for the guest...).
> 
> You mean protect, right?  So what?

Yes, I mean that with that design, more of what userspace does becomes
uncertain.  It seems easier to me to restrict userspace to one index.

> 
> > While if we use the indices, we restrict the userspace to only be able
> > to write to one index only (which is the reset_index).  That's all it
> > can do to mess things up (and it could never as long as we properly
> > validate the reset_index when read, which only happens during
> > KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
> > indices solution still has its benefits.
> 
> So if you mess up index how is this different?

We can't mess up much with that.  We simply check fetch_index (sorry,
this is what I meant when I said reset_index; anyway it's the only
index that we expose to userspace) to make sure:

  reset_index <= fetch_index <= dirty_index

Otherwise we fail the ioctl.  With that, we're 100% safe.
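
For concreteness, with free running u32 counters that check can be
written with wrap-safe unsigned arithmetic, along the lines of the
sketch below (the patch itself bounds fetch - reset_index by
ring->size instead):

  /* fetch comes from userspace; it is valid only if it lies between
   * what the kernel has already reset and what it has produced.
   * Unsigned wrap-around keeps the comparison correct for free
   * running counters. */
  static bool fetch_index_valid(u32 reset_index, u32 dirty_index, u32 fetch)
  {
          return (fetch - reset_index) <= (dirty_index - reset_index);
  }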

> 
> I agree RO page kind of feels safer generally though.
> 
> I will have to re-read how does the ring works though,
> my comments were based on the old assumption of mmaped
> page with indices.

Yes, sorry again for a bad cover letter.

It's basically the same as before, just that we only have per-vcpu
rings now, and the indices are exposed from kvm_run so we don't need
the extra page, but all of that is still exposed via mmap.
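
For completeness, the harvesting flow documented in the patch would
then look roughly like the sketch below from userspace (illustration
only: how the dirty_gfns array and the indices are located via the
vcpu mmap is assumed, and mark_page_dirty_for_migration() is a
hypothetical consumer):

  /* Walk the mmaped ring up to avail_index, advance fetch_index, then
   * ask KVM to reset (re-protect) the collected pages. */
  static void harvest_dirty_ring(struct kvm_dirty_gfn *gfns,
                                 struct kvm_dirty_ring_indices *ind,
                                 __u32 ring_size, int vm_fd)
  {
          __u32 avail = __atomic_load_n(&ind->avail_index, __ATOMIC_ACQUIRE);
          __u32 fetch = ind->fetch_index;

          while (fetch != avail) {
                  struct kvm_dirty_gfn *e = &gfns[fetch & (ring_size - 1)];

                  mark_page_dirty_for_migration(e->slot, e->offset);
                  fetch++;
          }
          __atomic_store_n(&ind->fetch_index, fetch, __ATOMIC_RELEASE);

          ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);
  }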

> 
> 
> 
> > > 
> > > 
> > > 
> > > >  The larger the ring buffer, the less
> > > > +likely the ring is full and the VM is forced to exit to userspace. The
> > > > +optimal size depends on the workload, but it is recommended that it be
> > > > +at least 64 KiB (4096 entries).
> > > 
> > > Where's this number coming from? Given you have indices as well,
> > > 4K size rings is likely to cause cache contention.
> > 
> > I think we've had some similar discussion in previous versions on the
> > size of ring.  Again imho it's really something that may not have a
> > direct clue as long as it's big enough (4K should be).
> > 
> > Regarding to the cache contention: could you explain more?
> 
> 4K is a whole cache way. 64K 16 ways.  If there's anything else is a hot
> path then you are pushing everything out of cache.  To re-read how do
> indices work so see whether an index is on hot path or not. If yes your
> structure won't fit in L1 cache which is not great.

I'm not sure whether I understand the point correctly, but logically
we shouldn't read the whole ring buffer at once, only part of it (just
as we say the ring shouldn't even reach soft-full).  Even if we read
the whole ring, I don't see a difference here compared to reading a
huge array of data (e.g. "char buf[65536]") in any program that covers
a 64K range - I don't see a good way to fix this other than reading
the whole chunk in.  It seems to be common in programs that work on
big datasets.

[...]

> > > > +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> > > > +{
> > > > +	u32 cur_slot, next_slot;
> > > > +	u64 cur_offset, next_offset;
> > > > +	unsigned long mask;
> > > > +	u32 fetch;
> > > > +	int count = 0;
> > > > +	struct kvm_dirty_gfn *entry;
> > > > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > > > +	bool first_round = true;
> > > > +
> > > > +	fetch = READ_ONCE(indices->fetch_index);
> > > 
> > > So this does not work if the data cache is virtually tagged.
> > > Which to the best of my knowledge isn't the case on any
> > > CPU kvm supports. However it might not stay being the
> > > case forever. Worth at least commenting.
> > 
> > This is the read side.  IIUC even if with virtually tagged archs, we
> > should do the flushing on the write side rather than the read side,
> > and that should be enough?
> 
> No.
> See e.g.  Documentation/core-api/cachetlb.rst
> 
>   ``void flush_dcache_page(struct page *page)``
> 
>         Any time the kernel writes to a page cache page, _OR_
>         the kernel is about to read from a page cache page and
>         user space shared/writable mappings of this page potentially
>         exist, this routine is called.

But I don't understand why.  I feel like for such an arch even
userspace must flush the cache after publishing data onto shared
memory, otherwise if the memory is shared between two userspace
processes they'll see inconsistent state.  Given that, I'm confused
about why the read side needs to flush it again.
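
For what it's worth, applied to this series the rule quoted above would
translate to something like the sketch below on a virtually tagged
D-cache (illustration only; 'page' is assumed to be the struct page
backing the shared indices area, and per the discussion no currently
supported arch seems to need this):

  #include <asm/cacheflush.h>

  /* Before the kernel reads data that userspace may have written via
   * its own writable mapping (here: fetch_index), flush the dcache;
   * this is a no-op on physically tagged caches. */
  static u32 read_fetch_index_flushed(struct page *page,
                                      struct kvm_dirty_ring_indices *indices)
  {
          flush_dcache_page(page);
          return READ_ONCE(indices->fetch_index);
  }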

> 
> 
> > Also, I believe this is the similar question that Jason has asked in
> > V2.  Sorry I should mention this earlier, but I didn't address that in
> > this series because if we need to do so we probably need to do it
> > kvm-wise, rather than only in this series.
> 
> You need to document these things.
> 
> >  I feel like it's missing
> > probably only because all existing KVM supported archs do not have
> > virtual-tagged caches as you mentioned.
> 
> But is that a fact? ARM has such a variety of CPUs,
> I can't really tell. Did you research this to make sure?

I didn't.  I only tried to find all callers of flush_dcache_page()
across the whole Linux tree, and I couldn't see any KVM-related code.
To keep this simple, let me address the dcache flushing issue in the
next post.

Thanks,
Michael S. Tsirkin Jan. 9, 2020, 10:18 p.m. UTC | #8
On Thu, Jan 09, 2020 at 03:19:16PM -0500, Peter Xu wrote:
> > > while for virtio, both sides (hypervisor,
> > > and the guest driver) are trusted.
> > 
> > What gave you the impression guest is trusted in virtio?
> 
> Hmm... maybe when I know virtio can bypass vIOMMU as long as it
> doesn't provide IOMMU_PLATFORM flag? :)

If the guest driver does not provide IOMMU_PLATFORM and the device
does, then negotiation fails.

> I think it's logical to trust a virtio guest kernel driver, could you
> guide me on what I've missed?


The guest driver is assumed to be part of the guest kernel.  It can't
do anything the kernel can't do anyway.

> > 
> > 
> > >  Above means we need to do these to
> > > change to the new design:
> > > 
> > >   - Allow the GFN array to be mapped as writable by userspace (so that
> > >     userspace can publish bit 2),
> > > 
> > >   - The userspace must be trusted to follow the design (just imagine
> > >     what if the userspace overwrites a GFN when it publishes bit 2
> > >     over a valid dirty gfn entry?  KVM could wrongly unprotect a page
> > >     for the guest...).
> > 
> > You mean protect, right?  So what?
> 
> Yes, I mean with that, more things are uncertain from userspace.  It
> seems easier to me that we restrict the userspace with one index.

Dunno how to treat vague statements like this.  You need to be specific
with threat models.  Otherwise there's no way to tell whether the code
is secure.

> > 
> > > While if we use the indices, we restrict the userspace to only be able
> > > to write to one index only (which is the reset_index).  That's all it
> > > can do to mess things up (and it could never as long as we properly
> > > validate the reset_index when read, which only happens during
> > > KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
> > > indices solution still has its benefits.
> > 
> > So if you mess up index how is this different?
> 
> We can't mess up much with that.  We simply check fetch_index (sorry I
> meant this when I said reset_index, anyway it's the only index that we
> expose to userspace) to make sure:
> 
>   reset_index <= fetch_index <= dirty_index
> 
> Otherwise we fail the ioctl.  With that, we're 100% safe.

Safe from what?  Userspace can mess up guest memory trivially,
for example by skipping some memory or sending junk.

> > 
> > I agree RO page kind of feels safer generally though.
> > 
> > I will have to re-read how does the ring works though,
> > my comments were based on the old assumption of mmaped
> > page with indices.
> 
> Yes, sorry again for a bad cover letter.
> 
> It's basically the same as before, just that we only have per-vcpu
> ring now, and the indices are exposed from kvm_run so we don't need
> the extra page, but we still expose that via mmap.

So that's why changelogs are useful.
Can you please write a changelog for this version so I don't
need to re-read all of it? Thanks!

> > 
> > 
> > 
> > > > 
> > > > 
> > > > 
> > > > >  The larger the ring buffer, the less
> > > > > +likely the ring is full and the VM is forced to exit to userspace. The
> > > > > +optimal size depends on the workload, but it is recommended that it be
> > > > > +at least 64 KiB (4096 entries).
> > > > 
> > > > Where's this number coming from? Given you have indices as well,
> > > > 4K size rings is likely to cause cache contention.
> > > 
> > > I think we've had some similar discussion in previous versions on the
> > > size of ring.  Again imho it's really something that may not have a
> > > direct clue as long as it's big enough (4K should be).
> > > 
> > > Regarding to the cache contention: could you explain more?
> > 
> > 4K is a whole cache way. 64K 16 ways.  If there's anything else is a hot
> > path then you are pushing everything out of cache.  To re-read how do
> > indices work so see whether an index is on hot path or not. If yes your
> > structure won't fit in L1 cache which is not great.
> 
> I'm not sure whether I get the point correct, but logically we
> shouldn't read the whole ring buffer as a whole, but only partly (just
> like when we say the ring shouldn't even reach soft-full).  Even if we
> read the whole ring, I don't see a difference here comparing to when
> we read a huge array of data (e.g. "char buf[65536]") in any program
> that covers 64K range - I don't see a good way to fix this but read
> the whole chunk in.  It seems to be common in programs where we have
> big dataset.
> 
> [...]
> 
> > > > > +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> > > > > +{
> > > > > +	u32 cur_slot, next_slot;
> > > > > +	u64 cur_offset, next_offset;
> > > > > +	unsigned long mask;
> > > > > +	u32 fetch;
> > > > > +	int count = 0;
> > > > > +	struct kvm_dirty_gfn *entry;
> > > > > +	struct kvm_dirty_ring_indices *indices = ring->indices;
> > > > > +	bool first_round = true;
> > > > > +
> > > > > +	fetch = READ_ONCE(indices->fetch_index);
> > > > 
> > > > So this does not work if the data cache is virtually tagged.
> > > > Which to the best of my knowledge isn't the case on any
> > > > CPU kvm supports. However it might not stay being the
> > > > case forever. Worth at least commenting.
> > > 
> > > This is the read side.  IIUC even if with virtually tagged archs, we
> > > should do the flushing on the write side rather than the read side,
> > > and that should be enough?
> > 
> > No.
> > See e.g.  Documentation/core-api/cachetlb.rst
> > 
> >   ``void flush_dcache_page(struct page *page)``
> > 
> >         Any time the kernel writes to a page cache page, _OR_
> >         the kernel is about to read from a page cache page and
> >         user space shared/writable mappings of this page potentially
> >         exist, this routine is called.
> 
> But I don't understand why.  I feel like for such arch even the
> userspace must flush cache after publishing data onto shared memories,
> otherwise if the shared memory is between two userspace processes
> they'll get inconsistent state.  Then if with that, I'm confused on
> why the read side needs to flush it again.
> 
> > 
> > 
> > > Also, I believe this is the similar question that Jason has asked in
> > > V2.  Sorry I should mention this earlier, but I didn't address that in
> > > this series because if we need to do so we probably need to do it
> > > kvm-wise, rather than only in this series.
> > 
> > You need to document these things.
> > 
> > >  I feel like it's missing
> > > probably only because all existing KVM supported archs do not have
> > > virtual-tagged caches as you mentioned.
> > 
> > But is that a fact? ARM has such a variety of CPUs,
> > I can't really tell. Did you research this to make sure?
> 
> I didn't.  I only tried to find all callers of flush_dcache_page()
> through the whole Linux tree and I cannot see any kvm related code.
> To make this simple, let me address the dcache flushing issue in the
> next post.
> 
> Thanks,
> 
> -- 
> Peter Xu
Peter Xu Jan. 10, 2020, 3:29 p.m. UTC | #9
On Thu, Jan 09, 2020 at 05:18:24PM -0500, Michael S. Tsirkin wrote:
> On Thu, Jan 09, 2020 at 03:19:16PM -0500, Peter Xu wrote:
> > > > while for virtio, both sides (hypervisor,
> > > > and the guest driver) are trusted.
> > > 
> > > What gave you the impression guest is trusted in virtio?
> > 
> > Hmm... maybe when I know virtio can bypass vIOMMU as long as it
> > doesn't provide IOMMU_PLATFORM flag? :)
> 
> If guest driver does not provide IOMMU_PLATFORM, and device does,
> then negotiation fails.

I mean it's still possible to specify "!IOMMU_PLATFORM" for the virtio
device even if the vIOMMU is enabled in the guest (that's a
configuration choice rather than part of the negotiation procedure).
Again I think that's fair, for the same reason that we tend to make
"iommu=pt" the default for all kernel drivers: we should trust the
drivers as much as the kernel itself.  The only thing we want to
protect with the vIOMMU is the userspace driver, because there we do
have a line between userspace and the kernel, and IMHO it's the same
thing here for the new kvm interface.

> 
> > I think it's logical to trust a virtio guest kernel driver, could you
> > guide me on what I've missed?
> 
> 
> guest driver is assumed to be part of guest kernel. It can't
> do anything kernel can't do anyway.

Right, I think everything that belongs to the kernel gets the same
level of trust.  However, again, userspace should be treated
differently, and that's why I tend to prefer the index solution, where
we expose less for userspace to write (reads are far safer than writes
from userspace).

> 
> > > 
> > > 
> > > >  Above means we need to do these to
> > > > change to the new design:
> > > > 
> > > >   - Allow the GFN array to be mapped as writable by userspace (so that
> > > >     userspace can publish bit 2),
> > > > 
> > > >   - The userspace must be trusted to follow the design (just imagine
> > > >     what if the userspace overwrites a GFN when it publishes bit 2
> > > >     over a valid dirty gfn entry?  KVM could wrongly unprotect a page
> > > >     for the guest...).
> > > 
> > > You mean protect, right?  So what?
> > 
> > Yes, I mean with that, more things are uncertain from userspace.  It
> > seems easier to me that we restrict the userspace with one index.
> 
> Donnu how to treat vague statements like this.  You need to be specific
> with threat models. Otherwise there's no way to tell whether code is
> secure.
> 
> > > 
> > > > While if we use the indices, we restrict the userspace to only be able
> > > > to write to one index only (which is the reset_index).  That's all it
> > > > can do to mess things up (and it could never as long as we properly
> > > > validate the reset_index when read, which only happens during
> > > > KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
> > > > indices solution still has its benefits.
> > > 
> > > So if you mess up index how is this different?
> > 
> > We can't mess up much with that.  We simply check fetch_index (sorry I
> > meant this when I said reset_index, anyway it's the only index that we
> > expose to userspace) to make sure:
> > 
> >   reset_index <= fetch_index <= dirty_index
> > 
> > Otherwise we fail the ioctl.  With that, we're 100% safe.
> 
> safe from what? userspace can mess up guest memory trivially.
> for example skip sending some memory or send junk.

Yes, QEMU can mess the guest up, but it should never be able to mess
the host up, am I right?  Regarding QEMU as userspace, KVM should see
it as untrusted from the host's point of view as well.  However, guest
security is another thing, imho.

> 
> > > 
> > > I agree RO page kind of feels safer generally though.
> > > 
> > > I will have to re-read how does the ring works though,
> > > my comments were based on the old assumption of mmaped
> > > page with indices.
> > 
> > Yes, sorry again for a bad cover letter.
> > 
> > It's basically the same as before, just that we only have per-vcpu
> > ring now, and the indices are exposed from kvm_run so we don't need
> > the extra page, but we still expose that via mmap.
> 
> So that's why changelogs are useful.
> Can you please write a changelog for this version so I don't
> need to re-read all of it? Thanks!

Sure, actually I've got a changelog in the cover letter for this
version [1]... it's:

V3 changelog:

- fail userspace writable maps on dirty ring ranges [Jason]
- commit message fixups [Paolo]
- change __x86_set_memory_region to return hva [Paolo]
- cacheline align for indices [Paolo, Jason]
- drop waitqueue, global lock, etc., include kvmgt rework patchset
- take lock for __x86_set_memory_region() (otherwise it triggers a
  lockdep in latest kvm/queue) [Paolo]
- check KVM_DIRTY_LOG_PAGE_OFFSET in kvm_vm_ioctl_enable_dirty_log_ring
- one more patch to drop x86_set_memory_region [Paolo]
- one more patch to remove extra srcu usage in init_rmode_identity_map()
- add some r-bs for Paolo

I didn't have a detailed changelog for v2 because it would have been a
long list of trivial details that could hide the major points, but
I've got a small write-up in the cover letter trying to mention the
major changes [2].

Again, I'm very sorry for missing either a complete changelog in v2 or
a high-level overview in the v3 cover letter.  I'll make it better in
v4.

Thanks,

[1] https://lore.kernel.org/kvm/20200109145729.32898-1-peterx@redhat.com/
[2] https://lore.kernel.org/kvm/20191220211634.51231-1-peterx@redhat.com/
kernel test robot Jan. 11, 2020, 4:49 a.m. UTC | #10
Hi Peter,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kvm/linux-next]
[also build test ERROR on next-20200110]
[cannot apply to kvmarm/next vfio/next v5.5-rc5]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Peter-Xu/KVM-Dirty-ring-interface/20200110-152053
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git linux-next
config: s390-alldefconfig (attached as .config)
compiler: s390-linux-gcc (GCC) 7.5.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.5.0 make.cross ARCH=s390 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   arch/s390/../../virt/kvm/kvm_main.o: In function `mark_page_dirty_in_slot':
>> kvm_main.c:(.text+0x4d6): undefined reference to `kvm_dirty_ring_get'
>> kvm_main.c:(.text+0x4f0): undefined reference to `kvm_dirty_ring_push'
   arch/s390/../../virt/kvm/kvm_main.o: In function `kvm_vcpu_init':
>> kvm_main.c:(.text+0x1fe6): undefined reference to `kvm_dirty_ring_alloc'
>> kvm_main.c:(.text+0x204c): undefined reference to `kvm_dirty_ring_free'
   arch/s390/../../virt/kvm/kvm_main.o: In function `kvm_vcpu_uninit':
   kvm_main.c:(.text+0x20c0): undefined reference to `kvm_dirty_ring_free'
   arch/s390/../../virt/kvm/kvm_main.o: In function `kvm_reset_dirty_gfn':
>> kvm_main.c:(.text+0x6650): undefined reference to `kvm_arch_mmu_enable_log_dirty_pt_masked'
   arch/s390/../../virt/kvm/kvm_main.o: In function `kvm_vm_ioctl':
>> kvm_main.c:(.text+0x6b58): undefined reference to `kvm_dirty_ring_reset'

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
kernel test robot Jan. 11, 2020, 11:19 p.m. UTC | #11
Hi Peter,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on kvm/linux-next]
[also build test ERROR on next-20200110]
[cannot apply to kvmarm/next vfio/next v5.5-rc5]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Peter-Xu/KVM-Dirty-ring-interface/20200110-152053
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git linux-next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 7.5.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.5.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> ERROR: ".kvm_arch_mmu_enable_log_dirty_pt_masked" [arch/powerpc/kvm/kvm.ko] undefined!
>> ERROR: ".kvm_dirty_ring_push" [arch/powerpc/kvm/kvm.ko] undefined!
>> ERROR: ".kvm_dirty_ring_free" [arch/powerpc/kvm/kvm.ko] undefined!
>> ERROR: ".kvm_dirty_ring_get" [arch/powerpc/kvm/kvm.ko] undefined!
>> ERROR: ".kvm_dirty_ring_reset" [arch/powerpc/kvm/kvm.ko] undefined!
>> ERROR: ".kvm_dirty_ring_alloc" [arch/powerpc/kvm/kvm.ko] undefined!

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
Michael S. Tsirkin Jan. 12, 2020, 6:24 a.m. UTC | #12
On Fri, Jan 10, 2020 at 10:29:59AM -0500, Peter Xu wrote:
> On Thu, Jan 09, 2020 at 05:18:24PM -0500, Michael S. Tsirkin wrote:
> > On Thu, Jan 09, 2020 at 03:19:16PM -0500, Peter Xu wrote:
> > > > > while for virtio, both sides (hypervisor,
> > > > > and the guest driver) are trusted.
> > > > 
> > > > What gave you the impression guest is trusted in virtio?
> > > 
> > > Hmm... maybe when I know virtio can bypass vIOMMU as long as it
> > > doesn't provide IOMMU_PLATFORM flag? :)
> > 
> > If guest driver does not provide IOMMU_PLATFORM, and device does,
> > then negotiation fails.
> 
> I mean it's still possible to specify "!IOMMU_PLATFORM" for the virtio
> device even if vIOMMU is enabled in the guest (rather than the
> negociation procedures).  Again I think it's fair, just the same
> reason as why we tend to even make "iommu=pt" by default for all the
> kernel drivers, because we should trust all the drivers as kernel
> itself.  The only thing we want to protect using vIOMMU is the
> userspace driver because we do have a line between the userspace and
> the kernel, and IMHO it's the same thing here for the new kvm
> interface.
> 
> > 
> > > I think it's logical to trust a virtio guest kernel driver, could you
> > > guide me on what I've missed?
> > 
> > 
> > guest driver is assumed to be part of guest kernel. It can't
> > do anything kernel can't do anyway.
> 
> Right, I think all things belongs to the kernel will have the same
> level of trust.  However again, userspace should be differently
> treated, and that's why I tend to prefer the index solution that we
> expose less to userspace to write (read is far safer comparing to
> writes from userspace).

You are mixing up different kinds of userspace here.  The vIOMMU
protects the guest kernel from guest userspace.
Protecting the guest kernel against userspace hypervisors
(e.g. QEMU) is mostly futile.


> > 
> > > > 
> > > > 
> > > > >  Above means we need to do these to
> > > > > change to the new design:
> > > > > 
> > > > >   - Allow the GFN array to be mapped as writable by userspace (so that
> > > > >     userspace can publish bit 2),
> > > > > 
> > > > >   - The userspace must be trusted to follow the design (just imagine
> > > > >     what if the userspace overwrites a GFN when it publishes bit 2
> > > > >     over a valid dirty gfn entry?  KVM could wrongly unprotect a page
> > > > >     for the guest...).
> > > > 
> > > > You mean protect, right?  So what?
> > > 
> > > Yes, I mean with that, more things are uncertain from userspace.  It
> > > seems easier to me that we restrict the userspace with one index.
> > 
> > Donnu how to treat vague statements like this.  You need to be specific
> > with threat models. Otherwise there's no way to tell whether code is
> > secure.
> > 
> > > > 
> > > > > While if we use the indices, we restrict the userspace to only be able
> > > > > to write to one index only (which is the reset_index).  That's all it
> > > > > can do to mess things up (and it could never as long as we properly
> > > > > validate the reset_index when read, which only happens during
> > > > > KVM_RESET_DIRTY_RINGS and is very rare).  From that pov, it seems the
> > > > > indices solution still has its benefits.
> > > > 
> > > > So if you mess up index how is this different?
> > > 
> > > We can't mess up much with that.  We simply check fetch_index (sorry I
> > > meant this when I said reset_index, anyway it's the only index that we
> > > expose to userspace) to make sure:
> > > 
> > >   reset_index <= fetch_index <= dirty_index
> > > 
> > > Otherwise we fail the ioctl.  With that, we're 100% safe.
> > 
> > safe from what? userspace can mess up guest memory trivially.
> > for example skip sending some memory or send junk.
> 
> Yes, QEMU can mess the guest up, but it should never mess the host up,
> am I right?  Regarding to QEMU as an userspace, KVM should see it as
> untrusted as well from host-wise.  However guest security is another
> thing, imho.
> 
> > 
> > > > 
> > > > I agree RO page kind of feels safer generally though.
> > > > 
> > > > I will have to re-read how does the ring works though,
> > > > my comments were based on the old assumption of mmaped
> > > > page with indices.
> > > 
> > > Yes, sorry again for a bad cover letter.
> > > 
> > > It's basically the same as before, just that we only have per-vcpu
> > > ring now, and the indices are exposed from kvm_run so we don't need
> > > the extra page, but we still expose that via mmap.
> > 
> > So that's why changelogs are useful.
> > Can you please write a changelog for this version so I don't
> > need to re-read all of it? Thanks!
> 
> Sure, actually I've got a changelog in the cover letter for this
> version [1]... it's:
> 
> V3 changelog:
> 
> - fail userspace writable maps on dirty ring ranges [Jason]
> - commit message fixups [Paolo]
> - change __x86_set_memory_region to return hva [Paolo]
> - cacheline align for indices [Paolo, Jason]
> - drop waitqueue, global lock, etc., include kvmgt rework patchset
> - take lock for __x86_set_memory_region() (otherwise it triggers a
>   lockdep in latest kvm/queue) [Paolo]
> - check KVM_DIRTY_LOG_PAGE_OFFSET in kvm_vm_ioctl_enable_dirty_log_ring
> - one more patch to drop x86_set_memory_region [Paolo]
> - one more patch to remove extra srcu usage in init_rmode_identity_map()
> - add some r-bs for Paolo
> 
> I didn't have detailed changelog for v2 because it could be a long
> list with trivial details which can hide the major things, but I've
> got a small write-up in the cover letter trying to mention the major
> changes [2].
> 
> Again, I'm very sorry for either missing a complete changelog in v2,
> or the high-level overview of v3 in the cover letter.  I'll make it
> better in v4.
> 
> Thanks,
> 
> [1] https://lore.kernel.org/kvm/20200109145729.32898-1-peterx@redhat.com/
> [2] https://lore.kernel.org/kvm/20191220211634.51231-1-peterx@redhat.com/
> 
> -- 
> Peter Xu
Peter Xu Jan. 14, 2020, 8:01 p.m. UTC | #13
On Thu, Jan 09, 2020 at 02:35:46PM -0500, Michael S. Tsirkin wrote:
>   ``void flush_dcache_page(struct page *page)``
> 
>         Any time the kernel writes to a page cache page, _OR_
>         the kernel is about to read from a page cache page and
>         user space shared/writable mappings of this page potentially
>         exist, this routine is called.
> 
> 
> > Also, I believe this is the similar question that Jason has asked in
> > V2.  Sorry I should mention this earlier, but I didn't address that in
> > this series because if we need to do so we probably need to do it
> > kvm-wise, rather than only in this series.
> 
> You need to document these things.
> 
> >  I feel like it's missing
> > probably only because all existing KVM supported archs do not have
> > virtual-tagged caches as you mentioned.
> 
> But is that a fact? ARM has such a variety of CPUs,
> I can't really tell. Did you research this to make sure?
> 
> > If so, I would prefer if you
> > can allow me to ignore that issue until KVM starts to support such an
> > arch.
> 
> Document limitations pls.  Don't ignore them.

Hi, Michael,

I failed to find a good place to document flush_dcache_page() for
KVM.  Could you give me a suggestion?

And I don't know whether there are any ARM hosts that require
flush_dcache_page().  I think not, because again I didn't see any
caller of flush_dcache_page() in KVM code yet.  Otherwise I think we
should at least call it before the kernel reads kvm_run, or after
publishing data to kvm_run.  However, I'm also CCing Drew for this.

Thanks,
Michael S. Tsirkin Jan. 15, 2020, 6:47 a.m. UTC | #14
On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> This patch is heavily based on previous work from Lei Cao
> <lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]
> 
> KVM currently uses large bitmaps to track dirty memory.  These bitmaps
> are copied to userspace when userspace queries KVM for its dirty page
> information.  The use of bitmaps is mostly sufficient for live
> migration, as large parts of memory are be dirtied from one log-dirty
> pass to another.  However, in a checkpointing system, the number of
> dirty pages is small and in fact it is often bounded---the VM is
> paused when it has dirtied a pre-defined number of pages. Traversing a
> large, sparsely populated bitmap to find set bits is time-consuming,
> as is copying the bitmap to user-space.
> 
> A similar issue will be there for live migration when the guest memory
> is huge while the page dirty procedure is trivial.  In that case for
> each dirty sync we need to pull the whole dirty bitmap to userspace
> and analyse every bit even if it's mostly zeros.
> 
> The preferred data structure for above scenarios is a dense list of
> guest frame numbers (GFN).  This patch series stores the dirty list in
> kernel memory that can be memory mapped into userspace to allow speedy
> harvesting.
> 
> This patch enables dirty ring for X86 only.  However it should be
> easily extended to other archs as well.
> 
> [1] https://patchwork.kernel.org/patch/10471409/
> 
> Signed-off-by: Lei Cao <lei.cao@stratus.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  Documentation/virt/kvm/api.txt  |  89 ++++++++++++++++++
>  arch/x86/include/asm/kvm_host.h |   3 +
>  arch/x86/include/uapi/asm/kvm.h |   1 +
>  arch/x86/kvm/Makefile           |   3 +-
>  arch/x86/kvm/mmu/mmu.c          |   6 ++
>  arch/x86/kvm/vmx/vmx.c          |   7 ++
>  arch/x86/kvm/x86.c              |   9 ++
>  include/linux/kvm_dirty_ring.h  |  55 +++++++++++
>  include/linux/kvm_host.h        |  26 +++++
>  include/trace/events/kvm.h      |  78 +++++++++++++++
>  include/uapi/linux/kvm.h        |  33 +++++++
>  virt/kvm/dirty_ring.c           | 162 ++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c             | 137 ++++++++++++++++++++++++++-
>  13 files changed, 606 insertions(+), 3 deletions(-)
>  create mode 100644 include/linux/kvm_dirty_ring.h
>  create mode 100644 virt/kvm/dirty_ring.c
> 
> diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
> index ebb37b34dcfc..708c3e0f7eae 100644
> --- a/Documentation/virt/kvm/api.txt
> +++ b/Documentation/virt/kvm/api.txt
> @@ -231,6 +231,7 @@ Based on their initialization different VMs may have different capabilities.
>  It is thus encouraged to use the vm ioctl to query for capabilities (available
>  with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
>  
> +
>  4.5 KVM_GET_VCPU_MMAP_SIZE
>  
>  Capability: basic
> @@ -243,6 +244,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared
>  memory region.  This ioctl returns the size of that region.  See the
>  KVM_RUN documentation for details.
>  
> +Besides the size of the KVM_RUN communication region, other areas of
> +the VCPU file descriptor can be mmap-ed, including:
> +
> +- if KVM_CAP_COALESCED_MMIO is available, a page at
> +  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
> +  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
> +  KVM_CAP_COALESCED_MMIO is not documented yet.
> +
> +- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
> +  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
> +  KVM_CAP_DIRTY_LOG_RING, see section 8.3.
> +
>  
>  4.6 KVM_SET_MEMORY_REGION
>  
> @@ -5376,6 +5389,7 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using
>  AArch64, this value will be reported in the ISS field of ESR_ELx.
>  
>  See KVM_CAP_VCPU_EVENTS for more details.
> +
>  8.20 KVM_CAP_HYPERV_SEND_IPI
>  
>  Architectures: x86
> @@ -5383,6 +5397,7 @@ Architectures: x86
>  This capability indicates that KVM supports paravirtualized Hyper-V IPI send
>  hypercalls:
>  HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
> +
>  8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
>  
>  Architecture: x86
> @@ -5396,3 +5411,77 @@ handling by KVM (as some KVM hypercall may be mistakenly treated as TLB
>  flush hypercalls by Hyper-V) so userspace should disable KVM identification
>  in CPUID and only exposes Hyper-V identification. In this case, guest
>  thinks it's running on Hyper-V and only use Hyper-V hypercalls.
> +
> +8.22 KVM_CAP_DIRTY_LOG_RING
> +
> +Architectures: x86
> +Parameters: args[0] - size of the dirty log ring
> +
> +KVM is capable of tracking dirty memory using ring buffers that are
> +mmaped into userspace; there is one dirty ring per vcpu.
> +
> +One dirty ring is defined as below internally:
> +
> +struct kvm_dirty_ring {
> +	u32 dirty_index;
> +	u32 reset_index;
> +	u32 size;
> +	u32 soft_limit;
> +	struct kvm_dirty_gfn *dirty_gfns;
> +	struct kvm_dirty_ring_indices *indices;
> +	int index;
> +};
> +
> +Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
> +For each of the dirty entry it's defined as:
> +
> +struct kvm_dirty_gfn {
> +        __u32 pad;
> +        __u32 slot; /* as_id | slot_id */
> +        __u64 offset;
> +};
> +
> +Most of the ring structure is used by KVM internally, while only the
> +indices are exposed to userspace:
> +
> +struct kvm_dirty_ring_indices {
> +	__u32 avail_index; /* set by kernel */
> +	__u32 fetch_index; /* set by userspace */
> +};
> +
> +The two indices in the ring buffer are free running counters.
> +
> +Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
> +to enable this capability for the new guest and set the size of the
> +rings.  It is only allowed before creating any vCPU, and the size of
> +the ring must be a power of two.  The larger the ring buffer, the less
> +likely the ring is full and the VM is forced to exit to userspace. The
> +optimal size depends on the workload, but it is recommended that it be
> +at least 64 KiB (4096 entries).
> +
> +Just like for dirty page bitmaps, the buffer tracks writes to
> +all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was
> +set in KVM_SET_USER_MEMORY_REGION.  Once a memory region is registered
> +with the flag set, userspace can start harvesting dirty pages from the
> +ring buffer.
> +
> +To harvest the dirty pages, userspace accesses the mmaped ring buffer
> +to read the dirty GFNs up to avail_index, and sets the fetch_index
> +accordingly.  This can be done when the guest is running or paused,
> +and dirty pages need not be collected all at once.  After processing
> +one or more entries in the ring buffer, userspace calls the VM ioctl
> +KVM_RESET_DIRTY_RINGS to notify the kernel that it has updated
> +fetch_index and to mark those pages clean.  Therefore, the ioctl
> +must be called *before* reading the content of the dirty pages.
> +
> +However, there is a major difference comparing to the
> +KVM_GET_DIRTY_LOG interface in that when reading the dirty ring from
> +userspace it's still possible that the kernel has not yet flushed the
> +hardware dirty buffers into the kernel buffer (which was previously
> +done by the KVM_GET_DIRTY_LOG ioctl).  To achieve that, one needs to
> +kick the vcpu out for a hardware buffer flush (vmexit) to make sure
> +all the existing dirty gfns are flushed to the dirty rings.
> +
> +If one of the ring buffers is full, the guest will exit to userspace
> +with the exit reason set to KVM_EXIT_DIRTY_LOG_FULL, and the KVM_RUN
> +ioctl will return to userspace with zero.
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f536d139b3d2..3fe18402e6a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1181,6 +1181,7 @@ struct kvm_x86_ops {
>  					   struct kvm_memory_slot *slot,
>  					   gfn_t offset, unsigned long mask);
>  	int (*write_log_dirty)(struct kvm_vcpu *vcpu);
> +	int (*cpu_dirty_log_size)(void);
>  
>  	/* pmu operations of sub-arch */
>  	const struct kvm_pmu_ops *pmu_ops;
> @@ -1666,4 +1667,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
>  #define GET_SMSTATE(type, buf, offset)		\
>  	(*(type *)((buf) + (offset) - 0x7e00))
>  
> +int kvm_cpu_dirty_log_size(void);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 503d3f42da16..b59bf356c478 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -12,6 +12,7 @@
>  
>  #define KVM_PIO_PAGE_OFFSET 1
>  #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 64
>  
>  #define DE_VECTOR 0
>  #define DB_VECTOR 1
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index b19ef421084d..0acee817adfb 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -5,7 +5,8 @@ ccflags-y += -Iarch/x86/kvm
>  KVM := ../../../virt/kvm
>  
>  kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
> -				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
> +				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
> +				$(KVM)/dirty_ring.o
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
>  
>  kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7269130ea5e2..621b842a9b7b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -1832,7 +1832,13 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
>  {
>  	if (kvm_x86_ops->write_log_dirty)
>  		return kvm_x86_ops->write_log_dirty(vcpu);
> +	return 0;
> +}
>  
> +int kvm_cpu_dirty_log_size(void)
> +{
> +	if (kvm_x86_ops->cpu_dirty_log_size)
> +		return kvm_x86_ops->cpu_dirty_log_size();
>  	return 0;
>  }
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 62175a246bcc..2151de89456d 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -7689,6 +7689,7 @@ static __init int hardware_setup(void)
>  		kvm_x86_ops->slot_disable_log_dirty = NULL;
>  		kvm_x86_ops->flush_log_dirty = NULL;
>  		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
> +		kvm_x86_ops->cpu_dirty_log_size = NULL;
>  	}
>  
>  	if (!cpu_has_vmx_preemption_timer())
> @@ -7753,6 +7754,11 @@ static __exit void hardware_unsetup(void)
>  	free_kvm_area();
>  }
>  
> +static int vmx_cpu_dirty_log_size(void)
> +{
> +	return enable_pml ? PML_ENTITY_NUM : 0;
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.cpu_has_kvm_support = cpu_has_kvm_support,
>  	.disabled_by_bios = vmx_disabled_by_bios,
> @@ -7875,6 +7881,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>  	.flush_log_dirty = vmx_flush_log_dirty,
>  	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
>  	.write_log_dirty = vmx_write_pml_buffer,
> +	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
>  
>  	.pre_block = vmx_pre_block,
>  	.post_block = vmx_post_block,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ff97782b3919..9c3673592826 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7998,6 +7998,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  
>  	bool req_immediate_exit = false;
>  
> +	/* Forbid vmenter if vcpu dirty ring is soft-full */
> +	if (unlikely(vcpu->kvm->dirty_ring_size &&
> +		     kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
> +		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
> +		trace_kvm_dirty_ring_exit(vcpu);
> +		r = 0;
> +		goto out;
> +	}
> +
>  	if (kvm_request_pending(vcpu)) {
>  		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
>  			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
> diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> new file mode 100644
> index 000000000000..d6fe9e1b7617
> --- /dev/null
> +++ b/include/linux/kvm_dirty_ring.h
> @@ -0,0 +1,55 @@
> +#ifndef KVM_DIRTY_RING_H
> +#define KVM_DIRTY_RING_H
> +
> +/**
> + * kvm_dirty_ring: KVM internal dirty ring structure
> + *
> + * @dirty_index: free running counter that points to the next slot in
> + *               dirty_ring->dirty_gfns, where a new dirty page should go
> + * @reset_index: free running counter that points to the next dirty page
> + *               in dirty_ring->dirty_gfns for which dirty trap needs to
> + *               be reenabled
> + * @size:        size of the compact list, dirty_ring->dirty_gfns
> + * @soft_limit:  when the number of dirty pages in the list reaches this
> + *               limit, vcpu that owns this ring should exit to userspace
> + *               to allow userspace to harvest all the dirty pages
> + * @dirty_gfns:  the array to keep the dirty gfns
> + * @indices:     the pointer to the @kvm_dirty_ring_indices structure
> + *               of this specific ring
> + * @index:       index of this dirty ring
> + */
> +struct kvm_dirty_ring {
> +	u32 dirty_index;
> +	u32 reset_index;
> +	u32 size;
> +	u32 soft_limit;
> +	struct kvm_dirty_gfn *dirty_gfns;

Here would be a good place to document that accessing a shared
page like this is only safe if the architecture's caches are
physically tagged.

> +	struct kvm_dirty_ring_indices *indices;
> +	int index;
> +};
> +
> +u32 kvm_dirty_ring_get_rsvd_entries(void);
> +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
> +			 struct kvm_dirty_ring_indices *indices,
> +			 int index, u32 size);
> +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm);
> +
> +/*
> + * called with kvm->slots_lock held, returns the number of
> + * processed pages.
> + */
> +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
> +
> +/*
> + * Push a dirty gfn into the ring.  The ring must never be completely
> + * full, which is guaranteed by the soft-full check before vmenter.
> + */
> +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset);
> +
> +/* for use in vm_operations_struct */
> +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
> +
> +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
> +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
> +
> +#endif
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index cbd633ece959..c96161c6a0c9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -34,6 +34,7 @@
>  #include <linux/kvm_types.h>
>  
>  #include <asm/kvm_host.h>
> +#include <linux/kvm_dirty_ring.h>
>  
>  #ifndef KVM_MAX_VCPU_ID
>  #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
> @@ -321,6 +322,7 @@ struct kvm_vcpu {
>  	bool ready;
>  	struct kvm_vcpu_arch arch;
>  	struct dentry *debugfs_dentry;
> +	struct kvm_dirty_ring dirty_ring;
>  };
>  
>  static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
> @@ -502,6 +504,7 @@ struct kvm {
>  	struct srcu_struct srcu;
>  	struct srcu_struct irq_srcu;
>  	pid_t userspace_pid;
> +	u32 dirty_ring_size;
>  };
>  
>  #define kvm_err(fmt, ...) \
> @@ -831,6 +834,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>  					gfn_t gfn_offset,
>  					unsigned long mask);
>  
> +void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
> +
>  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
>  				struct kvm_dirty_log *log);
>  int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
> @@ -1409,4 +1414,25 @@ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
>  				uintptr_t data, const char *name,
>  				struct task_struct **thread_ptr);
>  
> +/*
> + * This defines how many reserved entries we want to keep before we
> + * kick the vcpu to the userspace to avoid dirty ring full.  This
> + * value can be tuned to higher if e.g. PML is enabled on the host.
> + */
> +#define  KVM_DIRTY_RING_RSVD_ENTRIES  64
> +
> +/* Max number of entries allowed for each kvm dirty ring */
> +#define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> +
> +/*
> + * Arch needs to define these macro after implementing the dirty ring
> + * feature.  KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the
> + * starting page offset of the dirty ring structures, while
> + * KVM_DIRTY_RING_VERSION should be defined as >=1.  By default, this
> + * feature is off on all archs.
> + */
> +#ifndef KVM_DIRTY_LOG_PAGE_OFFSET
> +#define KVM_DIRTY_LOG_PAGE_OFFSET 0
> +#endif
> +
>  #endif
> diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
> index 2c735a3e6613..3d850997940c 100644
> --- a/include/trace/events/kvm.h
> +++ b/include/trace/events/kvm.h
> @@ -399,6 +399,84 @@ TRACE_EVENT(kvm_halt_poll_ns,
>  #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
>  	trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
>  
> +TRACE_EVENT(kvm_dirty_ring_push,
> +	TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset),
> +	TP_ARGS(ring, slot, offset),
> +
> +	TP_STRUCT__entry(
> +		__field(int, index)
> +		__field(u32, dirty_index)
> +		__field(u32, reset_index)
> +		__field(u32, slot)
> +		__field(u64, offset)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->index          = ring->index;
> +		__entry->dirty_index    = ring->dirty_index;
> +		__entry->reset_index    = ring->reset_index;
> +		__entry->slot           = slot;
> +		__entry->offset         = offset;
> +	),
> +
> +	TP_printk("ring %d: dirty 0x%x reset 0x%x "
> +		  "slot %u offset 0x%llx (used %u)",
> +		  __entry->index, __entry->dirty_index,
> +		  __entry->reset_index,  __entry->slot, __entry->offset,
> +		  __entry->dirty_index - __entry->reset_index)
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_reset,
> +	TP_PROTO(struct kvm_dirty_ring *ring),
> +	TP_ARGS(ring),
> +
> +	TP_STRUCT__entry(
> +		__field(int, index)
> +		__field(u32, dirty_index)
> +		__field(u32, reset_index)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->index          = ring->index;
> +		__entry->dirty_index    = ring->dirty_index;
> +		__entry->reset_index    = ring->reset_index;
> +	),
> +
> +	TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)",
> +		  __entry->index, __entry->dirty_index, __entry->reset_index,
> +		  __entry->dirty_index - __entry->reset_index)
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_waitqueue,
> +	TP_PROTO(bool enter),
> +	TP_ARGS(enter),
> +
> +	TP_STRUCT__entry(
> +	    __field(bool, enter)
> +	),
> +
> +	TP_fast_assign(
> +	    __entry->enter = enter;
> +	),
> +
> +	TP_printk("%s", __entry->enter ? "wait" : "awake")
> +);
> +
> +TRACE_EVENT(kvm_dirty_ring_exit,
> +	TP_PROTO(struct kvm_vcpu *vcpu),
> +	TP_ARGS(vcpu),
> +
> +	TP_STRUCT__entry(
> +	    __field(int, vcpu_id)
> +	),
> +
> +	TP_fast_assign(
> +	    __entry->vcpu_id = vcpu->vcpu_id;
> +	),
> +
> +	TP_printk("vcpu %d", __entry->vcpu_id)
> +);
> +
>  #endif /* _TRACE_KVM_MAIN_H */
>  
>  /* This part must be outside protection */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f0a16b4adbbd..df4a1700ff1e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -236,6 +236,7 @@ struct kvm_hyperv_exit {
>  #define KVM_EXIT_IOAPIC_EOI       26
>  #define KVM_EXIT_HYPERV           27
>  #define KVM_EXIT_ARM_NISV         28
> +#define KVM_EXIT_DIRTY_RING_FULL  29
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -247,6 +248,13 @@ struct kvm_hyperv_exit {
>  /* Encounter unexpected vm-exit reason */
>  #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
>  
> +struct kvm_dirty_ring_indices {
> +	__u32 avail_index; /* set by kernel */
> +	__u32 padding1;
> +	__u32 fetch_index; /* set by userspace */
> +	__u32 padding2;
> +};
> +
>  /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
>  struct kvm_run {
>  	/* in */
> @@ -421,6 +429,8 @@ struct kvm_run {
>  		struct kvm_sync_regs regs;
>  		char padding[SYNC_REGS_SIZE_BYTES];
>  	} s;
> +
> +	struct kvm_dirty_ring_indices vcpu_ring_indices;
>  };
>  
>  /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
> @@ -1009,6 +1019,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
>  #define KVM_CAP_ARM_NISV_TO_USER 177
>  #define KVM_CAP_ARM_INJECT_EXT_DABT 178
> +#define KVM_CAP_DIRTY_LOG_RING 179
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -1473,6 +1484,9 @@ struct kvm_enc_region {
>  /* Available with KVM_CAP_ARM_SVE */
>  #define KVM_ARM_VCPU_FINALIZE	  _IOW(KVMIO,  0xc2, int)
>  
> +/* Available with KVM_CAP_DIRTY_LOG_RING */
> +#define KVM_RESET_DIRTY_RINGS     _IO(KVMIO, 0xc3)
> +
>  /* Secure Encrypted Virtualization command */
>  enum sev_cmd_id {
>  	/* Guest initialization commands */
> @@ -1623,4 +1637,23 @@ struct kvm_hyperv_eventfd {
>  #define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
>  #define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
>  
> +/*
> + * The following are the requirements for supporting dirty log ring
> + * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
> + *
> + * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
> + *    of kvm_write_* so that the global dirty ring is not filled up
> + *    too quickly.
> + * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
> + *    enabling dirty logging.
> + * 3. There should not be a separate step to synchronize hardware
> + *    dirty bitmap with KVM's.


Are these requirements on the architecture? If so, you want to move
this out of the UAPI header and keep only things relevant to userspace there.

> + */
> +
> +struct kvm_dirty_gfn {
> +	__u32 pad;
> +	__u32 slot;
> +	__u64 offset;
> +};
> +

Pls add comments about how kvm_dirty_gfn must be mmapped.


>  #endif /* __LINUX_KVM_H */
> diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
> new file mode 100644
> index 000000000000..67ec5bbc21c0
> --- /dev/null
> +++ b/virt/kvm/dirty_ring.c
> @@ -0,0 +1,162 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * KVM dirty ring implementation
> + *
> + * Copyright 2019 Red Hat, Inc.
> + */
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +#include <linux/vmalloc.h>
> +#include <linux/kvm_dirty_ring.h>
> +#include <trace/events/kvm.h>
> +
> +int __weak kvm_cpu_dirty_log_size(void)
> +{
> +	return 0;
> +}
> +
> +u32 kvm_dirty_ring_get_rsvd_entries(void)
> +{
> +	return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size();
> +}
> +
> +static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
> +{
> +	return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index);
> +}
> +
> +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
> +{
> +	return kvm_dirty_ring_used(ring) >= ring->soft_limit;
> +}
> +
> +bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
> +{
> +	return kvm_dirty_ring_used(ring) >= ring->size;
> +}
> +
> +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm)
> +{
> +	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> +
> +	WARN_ON_ONCE(vcpu->kvm != kvm);
> +
> +	return &vcpu->dirty_ring;
> +}
> +
> +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
> +			 struct kvm_dirty_ring_indices *indices,
> +			 int index, u32 size)
> +{
> +	ring->dirty_gfns = vmalloc(size);
> +	if (!ring->dirty_gfns)
> +		return -ENOMEM;
> +	memset(ring->dirty_gfns, 0, size);
> +
> +	ring->size = size / sizeof(struct kvm_dirty_gfn);
> +	ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
> +	ring->dirty_index = 0;
> +	ring->reset_index = 0;
> +	ring->index = index;
> +	ring->indices = indices;
> +
> +	return 0;
> +}
> +
> +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
> +{
> +	u32 cur_slot, next_slot;
> +	u64 cur_offset, next_offset;
> +	unsigned long mask;
> +	u32 fetch;
> +	int count = 0;
> +	struct kvm_dirty_gfn *entry;
> +	struct kvm_dirty_ring_indices *indices = ring->indices;
> +	bool first_round = true;
> +
> +	fetch = READ_ONCE(indices->fetch_index);
> +
> +	/*
> +	 * Note that fetch_index is written by userspace and cannot be
> +	 * trusted.  If the check below fails, userspace has most likely
> +	 * written a bogus fetch_index.
> +	 */
> +	if (fetch - ring->reset_index > ring->size)
> +		return -EINVAL;
> +
> +	if (fetch == ring->reset_index)
> +		return 0;
> +
> +	/* This is only needed to make compilers happy */
> +	cur_slot = cur_offset = mask = 0;
> +	while (ring->reset_index != fetch) {
> +		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
> +		next_slot = READ_ONCE(entry->slot);
> +		next_offset = READ_ONCE(entry->offset);
> +		ring->reset_index++;
> +		count++;
> +		/*
> +		 * Try to coalesce the reset operations when the guest is
> +		 * scanning pages in the same slot.
> +		 */
> +		if (!first_round && next_slot == cur_slot) {
> +			s64 delta = next_offset - cur_offset;
> +
> +			if (delta >= 0 && delta < BITS_PER_LONG) {
> +				mask |= 1ull << delta;
> +				continue;
> +			}
> +
> +			/* Backwards visit, careful about overflows!  */
> +			if (delta > -BITS_PER_LONG && delta < 0 &&
> +			    (mask << -delta >> -delta) == mask) {
> +				cur_offset = next_offset;
> +				mask = (mask << -delta) | 1;
> +				continue;
> +			}
> +		}
> +		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +		cur_slot = next_slot;
> +		cur_offset = next_offset;
> +		mask = 1;
> +		first_round = false;
> +	}
> +	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
> +
> +	trace_kvm_dirty_ring_reset(ring);
> +
> +	return count;
> +}
> +
> +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
> +{
> +	struct kvm_dirty_gfn *entry;
> +	struct kvm_dirty_ring_indices *indices = ring->indices;
> +
> +	/* It should never get full */
> +	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
> +
> +	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
> +	entry->slot = slot;
> +	entry->offset = offset;
> +	/*
> +	 * Make sure the data is filled in before we publish this to
> +	 * the userspace program.  There's no paired kernel-side reader.
> +	 */
> +	smp_wmb();
> +	ring->dirty_index++;
> +	WRITE_ONCE(indices->avail_index, ring->dirty_index);
> +
> +	trace_kvm_dirty_ring_push(ring, slot, offset);
> +}
> +
> +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
> +{
> +	return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE);
> +}
> +
> +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
> +{
> +	vfree(ring->dirty_gfns);
> +	ring->dirty_gfns = NULL;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 5bbd8b8730fa..5e36792e15ae 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -64,6 +64,8 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
>  
> +#include <linux/kvm_dirty_ring.h>
> +
>  /* Worst case buffer size needed for holding an integer. */
>  #define ITOA_MAX_LEN 12
>  
> @@ -357,11 +359,22 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>  	vcpu->preempted = false;
>  	vcpu->ready = false;
>  
> +	if (kvm->dirty_ring_size) {
> +		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
> +					 &vcpu->run->vcpu_ring_indices,
> +					 id, kvm->dirty_ring_size);
> +		if (r)
> +			goto fail_free_run;
> +	}
> +
>  	r = kvm_arch_vcpu_init(vcpu);
>  	if (r < 0)
> -		goto fail_free_run;
> +		goto fail_free_ring;
>  	return 0;
>  
> +fail_free_ring:
> +	if (kvm->dirty_ring_size)
> +		kvm_dirty_ring_free(&vcpu->dirty_ring);
>  fail_free_run:
>  	free_page((unsigned long)vcpu->run);
>  fail:
> @@ -379,6 +392,8 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	put_pid(rcu_dereference_protected(vcpu->pid, 1));
>  	kvm_arch_vcpu_uninit(vcpu);
>  	free_page((unsigned long)vcpu->run);
> +	if (vcpu->kvm->dirty_ring_size)
> +		kvm_dirty_ring_free(&vcpu->dirty_ring);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
>  
> @@ -2284,8 +2299,13 @@ static void mark_page_dirty_in_slot(struct kvm *kvm,
>  {
>  	if (memslot && memslot->dirty_bitmap) {
>  		unsigned long rel_gfn = gfn - memslot->base_gfn;
> +		u32 slot = (memslot->as_id << 16) | memslot->id;
>  
> -		set_bit_le(rel_gfn, memslot->dirty_bitmap);
> +		if (kvm->dirty_ring_size)
> +			kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
> +					    slot, rel_gfn);
> +		else
> +			set_bit_le(rel_gfn, memslot->dirty_bitmap);
>  	}
>  }
>  
> @@ -2632,6 +2652,16 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
>  
> +static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
> +{
> +	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
> +		return false;
> +
> +	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
> +	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
> +	     kvm->dirty_ring_size / PAGE_SIZE);
> +}
> +
>  static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
>  {
>  	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
> @@ -2647,6 +2677,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
>  	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
>  		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
>  #endif
> +	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
> +		page = kvm_dirty_ring_get_page(
> +		    &vcpu->dirty_ring,
> +		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
>  	else
>  		return kvm_arch_vcpu_fault(vcpu, vmf);
>  	get_page(page);
> @@ -2660,6 +2694,15 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
>  
>  static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +	struct kvm_vcpu *vcpu = file->private_data;
> +	unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> +
> +	/* If to map any writable page within dirty ring, fail it */
> +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> +	    vma->vm_flags & VM_WRITE)
> +		return -EINVAL;
> +
>  	vma->vm_ops = &kvm_vcpu_vm_ops;
>  	return 0;
>  }
> @@ -3242,12 +3285,97 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  #endif
>  	case KVM_CAP_NR_MEMSLOTS:
>  		return KVM_USER_MEM_SLOTS;
> +	case KVM_CAP_DIRTY_LOG_RING:
> +#ifdef CONFIG_X86
> +		return KVM_DIRTY_RING_MAX_ENTRIES;
> +#else
> +		return 0;
> +#endif
>  	default:
>  		break;
>  	}
>  	return kvm_vm_ioctl_check_extension(kvm, arg);
>  }
>  
> +void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
> +{
> +	struct kvm_memory_slot *memslot;
> +	int as_id, id;
> +
> +	as_id = slot >> 16;
> +	id = (u16)slot;
> +	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
> +		return;
> +
> +	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
> +	if (offset >= memslot->npages)
> +		return;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
> +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
> +{
> +	int r;
> +
> +	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
> +		return -EINVAL;
> +
> +	/* the size should be power of 2 */
> +	if (!size || (size & (size - 1)))
> +		return -EINVAL;
> +
> +	/* Must hold at least the reserved entries, and be at least one page */
> +	if (size < kvm_dirty_ring_get_rsvd_entries() *
> +	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
> +		return -EINVAL;
> +
> +	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
> +	    sizeof(struct kvm_dirty_gfn))
> +		return -E2BIG;
> +
> +	/* We only allow it to set once */
> +	if (kvm->dirty_ring_size)
> +		return -EINVAL;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	if (kvm->created_vcpus) {
> +		/* We don't allow to change this value after vcpu created */
> +		r = -EINVAL;
> +	} else {
> +		kvm->dirty_ring_size = size;
> +		r = 0;
> +	}
> +
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> +
> +static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +	int cleared = 0;
> +
> +	if (!kvm->dirty_ring_size)
> +		return -EINVAL;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm)
> +		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	if (cleared)
> +		kvm_flush_remote_tlbs(kvm);
> +
> +	return cleared;
> +}
> +
>  int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>  						  struct kvm_enable_cap *cap)
>  {
> @@ -3265,6 +3393,8 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
>  		kvm->manual_dirty_log_protect = cap->args[0];
>  		return 0;
>  #endif
> +	case KVM_CAP_DIRTY_LOG_RING:
> +		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
>  	default:
>  		return kvm_vm_ioctl_enable_cap(kvm, cap);
>  	}
> @@ -3452,6 +3582,9 @@ static long kvm_vm_ioctl(struct file *filp,
>  	case KVM_CHECK_EXTENSION:
>  		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
>  		break;
> +	case KVM_RESET_DIRTY_RINGS:
> +		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
> +		break;
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  	}
> -- 
> 2.24.1
Michael S. Tsirkin Jan. 15, 2020, 6:50 a.m. UTC | #15
On Tue, Jan 14, 2020 at 03:01:34PM -0500, Peter Xu wrote:
> On Thu, Jan 09, 2020 at 02:35:46PM -0500, Michael S. Tsirkin wrote:
> >   ``void flush_dcache_page(struct page *page)``
> > 
> >         Any time the kernel writes to a page cache page, _OR_
> >         the kernel is about to read from a page cache page and
> >         user space shared/writable mappings of this page potentially
> >         exist, this routine is called.
> > 
> > 
> > > Also, I believe this is the similar question that Jason has asked in
> > > V2.  Sorry I should mention this earlier, but I didn't address that in
> > > this series because if we need to do so we probably need to do it
> > > kvm-wise, rather than only in this series.
> > 
> > You need to document these things.
> > 
> > >  I feel like it's missing
> > > probably only because all existing KVM supported archs do not have
> > > virtual-tagged caches as you mentioned.
> > 
> > But is that a fact? ARM has such a variety of CPUs,
> > I can't really tell. Did you research this to make sure?
> > 
> > > If so, I would prefer if you
> > > can allow me to ignore that issue until KVM starts to support such an
> > > arch.
> > 
> > Document limitations pls.  Don't ignore them.
> 
> Hi, Michael,
> 
> I failed to find a good place to document about flush_dcache_page()
> for KVM.  Could you give me a suggestion?

Maybe where the field is introduced. I posted the suggestions to the
relevant patch.

> And I don't know about whether there's any ARM hosts that requires
> flush_dcache_page().  I think not, because again I didn't see any
> caller of flush_dcache_page() in KVM code yet.  Otherwise I think we
> should at least call it before the kernel reading kvm_run or after
> publishing data to kvm_run.

But is kvm_run ever accessed while the VCPU is running on another CPU?
I always assumed not, but maybe I'm missing something?

>  However I'm also CCing Drew for this.
> 
> Thanks,
> 
> -- 
> Peter Xu
Peter Xu Jan. 15, 2020, 3:20 p.m. UTC | #16
On Wed, Jan 15, 2020 at 01:50:08AM -0500, Michael S. Tsirkin wrote:
> On Tue, Jan 14, 2020 at 03:01:34PM -0500, Peter Xu wrote:
> > On Thu, Jan 09, 2020 at 02:35:46PM -0500, Michael S. Tsirkin wrote:
> > >   ``void flush_dcache_page(struct page *page)``
> > > 
> > >         Any time the kernel writes to a page cache page, _OR_
> > >         the kernel is about to read from a page cache page and
> > >         user space shared/writable mappings of this page potentially
> > >         exist, this routine is called.

[1]

> > > 
> > > 
> > > > Also, I believe this is the similar question that Jason has asked in
> > > > V2.  Sorry I should mention this earlier, but I didn't address that in
> > > > this series because if we need to do so we probably need to do it
> > > > kvm-wise, rather than only in this series.
> > > 
> > > You need to document these things.
> > > 
> > > >  I feel like it's missing
> > > > probably only because all existing KVM supported archs do not have
> > > > virtual-tagged caches as you mentioned.
> > > 
> > > But is that a fact? ARM has such a variety of CPUs,
> > > I can't really tell. Did you research this to make sure?
> > > 
> > > > If so, I would prefer if you
> > > > can allow me to ignore that issue until KVM starts to support such an
> > > > arch.
> > > 
> > > Document limitations pls.  Don't ignore them.
> > 
> > Hi, Michael,
> > 
> > I failed to find a good place to document about flush_dcache_page()
> > for KVM.  Could you give me a suggestion?
> 
> Maybe where the field is introduced. I posted the suggestions to the
> relevant patch.

(will reply there)

> 
> > And I don't know about whether there's any ARM hosts that requires
> > flush_dcache_page().  I think not, because again I didn't see any
> > caller of flush_dcache_page() in KVM code yet.  Otherwise I think we
> > should at least call it before the kernel reading kvm_run or after
> > publishing data to kvm_run.
> 
> But is kvm_run ever accessed while the VCPU is running on another CPU?
> I always assumed not, but maybe I'm missing something?

IMHO we need to call it even if it's running on the same CPU - please
refer to [1] above; there's no restriction on which CPU the code runs
on.  I think that makes sense, especially on systems with
virtually-tagged caches: even if the memory accesses happen on the
same CPU, the virtual addresses used to reach the same page can still
differ between the kernel and userspace mappings.
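
If we do end up needing it, I'd imagine something like the below on
the producer side (only a sketch, not part of this series; the helper
name and its placement are made up, and flush_dcache_page() degrades
to a no-op on architectures that don't need it):

#include <asm/cacheflush.h>	/* flush_dcache_page() */

/*
 * Illustration only: flush the cache lines covering a ring entry after
 * it has been published, for architectures with virtually-tagged data
 * caches.  The ring is vmalloc'ed, hence vmalloc_to_page(); kvm_run is
 * a regular page, so virt_to_page(vcpu->run) would be used there.
 */
static void kvm_dirty_ring_flush_entry(struct kvm_dirty_ring *ring, u32 index)
{
	void *entry = &ring->dirty_gfns[index & (ring->size - 1)];

	flush_dcache_page(vmalloc_to_page(entry));
}

The consumer side (reading fetch_index out of kvm_run in
kvm_dirty_ring_reset()) would need the same treatment before the read.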

Thanks,
Peter Xu Jan. 15, 2020, 3:27 p.m. UTC | #17
On Wed, Jan 15, 2020 at 01:47:15AM -0500, Michael S. Tsirkin wrote:
> > diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
> > new file mode 100644
> > index 000000000000..d6fe9e1b7617
> > --- /dev/null
> > +++ b/include/linux/kvm_dirty_ring.h
> > @@ -0,0 +1,55 @@
> > +#ifndef KVM_DIRTY_RING_H
> > +#define KVM_DIRTY_RING_H
> > +
> > +/**
> > + * kvm_dirty_ring: KVM internal dirty ring structure
> > + *
> > + * @dirty_index: free running counter that points to the next slot in
> > + *               dirty_ring->dirty_gfns, where a new dirty page should go
> > + * @reset_index: free running counter that points to the next dirty page
> > + *               in dirty_ring->dirty_gfns for which dirty trap needs to
> > + *               be reenabled
> > + * @size:        size of the compact list, dirty_ring->dirty_gfns
> > + * @soft_limit:  when the number of dirty pages in the list reaches this
> > + *               limit, vcpu that owns this ring should exit to userspace
> > + *               to allow userspace to harvest all the dirty pages
> > + * @dirty_gfns:  the array to keep the dirty gfns
> > + * @indices:     the pointer to the @kvm_dirty_ring_indices structure
> > + *               of this specific ring
> > + * @index:       index of this dirty ring
> > + */
> > +struct kvm_dirty_ring {
> > +	u32 dirty_index;
> > +	u32 reset_index;
> > +	u32 size;
> > +	u32 soft_limit;
> > +	struct kvm_dirty_gfn *dirty_gfns;
> 
> Here would be a good place to document that accessing
> shared page like this is only safe if archotecture is physically
> tagged.

Right, but more important is where to document this for kvm_run, and
for any other shared mappings across the whole of KVM that I'm not yet
aware of.

[...]

> > +/*
> > + * The following are the requirements for supporting dirty log ring
> > + * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
> > + *
> > + * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
> > + *    of kvm_write_* so that the global dirty ring is not filled up
> > + *    too quickly.
> > + * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
> > + *    enabling dirty logging.
> > + * 3. There should not be a separate step to synchronize hardware
> > + *    dirty bitmap with KVM's.
> 
> 
> Are these requirement from an architecture? Then you want to move
> this out of UAPI, keep things relevant to userspace there.

Good point, I removed it, and instead of this...

> 
> > + */
> > +
> > +struct kvm_dirty_gfn {
> > +	__u32 pad;
> > +	__u32 slot;
> > +	__u64 offset;
> > +};
> > +
> 
> Pls add comments about how kvm_dirty_gfn must be mmapped.

... I added this:

/*
 * KVM dirty rings should be mapped at KVM_DIRTY_LOG_PAGE_OFFSET of
 * per-vcpu mmaped regions as an array of struct kvm_dirty_gfn.  The
 * size of the gfn buffer is decided by the first argument when
 * enabling KVM_CAP_DIRTY_LOG_RING.
 */
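
FWIW, here is roughly how I'd expect userspace to drive it, in case it
helps to validate the comment (illustration only, not part of the
series: collect_dirty_gfn() is a made-up callback, error handling is
omitted, and note that the ring pages themselves must be mapped
read-only because of the mmap() check in this patch - only the indices
in kvm_run are written by userspace):

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

void collect_dirty_gfn(uint32_t slot, uint64_t offset);	/* user's hook */

/* Map one vcpu's dirty ring once, right after creating the vcpu. */
static struct kvm_dirty_gfn *map_dirty_ring(int vcpu_fd, uint32_t ring_bytes)
{
	return mmap(NULL, ring_bytes, PROT_READ, MAP_SHARED, vcpu_fd,
		    KVM_DIRTY_LOG_PAGE_OFFSET * getpagesize());
}

/* Harvest whatever this vcpu has published since the last call. */
static void harvest_dirty_ring(int vm_fd, struct kvm_run *run,
			       struct kvm_dirty_gfn *gfns, uint32_t nents)
{
	struct kvm_dirty_ring_indices *ind = &run->vcpu_ring_indices;
	uint32_t fetch = ind->fetch_index;
	/* Pairs with the smp_wmb() in kvm_dirty_ring_push() */
	uint32_t avail = __atomic_load_n(&ind->avail_index, __ATOMIC_ACQUIRE);

	while (fetch != avail) {
		struct kvm_dirty_gfn *e = &gfns[fetch++ & (nents - 1)];

		/* e->slot is (as_id << 16) | slot_id, e->offset is the gfn
		 * offset within that memslot. */
		collect_dirty_gfn(e->slot, e->offset);
	}

	ind->fetch_index = fetch;

	/* Have KVM reprotect the collected pages before reading them */
	ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);
}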

Thanks,
Michael S. Tsirkin Jan. 16, 2020, 8:38 a.m. UTC | #18
On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> +	/* If to map any writable page within dirty ring, fail it */
> +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> +	    vma->vm_flags & VM_WRITE)
> +		return -EINVAL;

Worth thinking about other flags. Do we want to force VM_SHARED?
Disable VM_EXEC?
Peter Xu Jan. 16, 2020, 4:27 p.m. UTC | #19
On Thu, Jan 16, 2020 at 03:38:21AM -0500, Michael S. Tsirkin wrote:
> On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > +	/* If to map any writable page within dirty ring, fail it */
> > +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> > +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> > +	    vma->vm_flags & VM_WRITE)
> > +		return -EINVAL;
> 
> Worth thinking about other flags. Do we want to force VM_SHARED?
> Disable VM_EXEC?

Makes sense to me.  I think it's worth a standalone patch, since the
checks should apply to all of the per-vcpu mmaped regions rather than
only to the dirty ring buffers.

(Should include KVM_PIO_PAGE_OFFSET, KVM_COALESCED_MMIO_PAGE_OFFSET,
 KVM_S390_SIE_PAGE_OFFSET, kvm_run, and this new one)
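
Something like this in kvm_vcpu_mmap(), I suppose (rough sketch only;
whether the VM_EXEC/VM_SHARED checks can be applied retroactively to
the existing offsets is a separate question):

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct kvm_vcpu *vcpu = file->private_data;
	unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

	/* Sketch: vcpu pages are shared kernel/user state, so refuse
	 * executable or private (copy-on-write) mappings. */
	if (vma->vm_flags & VM_EXEC)
		return -EINVAL;
	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	/* As in the patch: the dirty ring pages must not be writable */
	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
	    vma->vm_flags & VM_WRITE)
		return -EINVAL;

	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}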

Thanks,
Michael S. Tsirkin Jan. 17, 2020, 9:50 a.m. UTC | #20
On Thu, Jan 16, 2020 at 11:27:03AM -0500, Peter Xu wrote:
> On Thu, Jan 16, 2020 at 03:38:21AM -0500, Michael S. Tsirkin wrote:
> > On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > > +	/* If to map any writable page within dirty ring, fail it */
> > > +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> > > +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> > > +	    vma->vm_flags & VM_WRITE)
> > > +		return -EINVAL;
> > 
> > Worth thinking about other flags. Do we want to force VM_SHARED?
> > Disable VM_EXEC?
> 
> Makes sense to me.  I think it's worth a standalone patch, since the
> checks should apply to all of the per-vcpu mmaped regions rather than
> only to the dirty ring buffers.
> 
> (Should include KVM_PIO_PAGE_OFFSET, KVM_COALESCED_MMIO_PAGE_OFFSET,
>  KVM_S390_SIE_PAGE_OFFSET, kvm_run, and this new one)
> 
> Thanks,


I don't think we can change UAPI for existing ones.
Userspace might be setting these by mistake.

> -- 
> Peter Xu
Paolo Bonzini Jan. 19, 2020, 9:09 a.m. UTC | #21
On 09/01/20 20:15, Peter Xu wrote:
> Regarding dropping the indices: I feel like it can be done, though we
> probably need two extra bits for each GFN entry, for example:
> 
>   - Bit 0 of the GFN address to show whether this is a valid publish
>     of dirty gfn
> 
>   - Bit 1 of the GFN address to show whether this is collected by the
>     user

We can use bit 62 and 63 of the GFN.

I think this can be done in a secure way.  Later in the thread you say:

> We simply check fetch_index (sorry I
> meant this when I said reset_index, anyway it's the only index that we
> expose to userspace) to make sure:
> 
>   reset_index <= fetch_index <= dirty_index

So this means that KVM_RESET_DIRTY_RINGS should only test the "collected
by user" flag on dirty ring entries between reset_index and dirty_index.

Also I would make it

   00b (invalid GFN) ->
     01b (valid gfn published by kernel, which is dirty) ->
       1*b (gfn dirty page collected by userspace) ->
         00b (gfn reset by kernel, so goes back to invalid gfn)

That is, 10b and 11b are equivalent.  The kernel doesn't read that bit if
userspace has collected the page.
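
Concretely, it could look something like this (the flag names and
where they live are invented just for illustration, nothing is final):

/* Stored in the entry itself, e.g. in the 'pad' field or the top bits
 * of the gfn, so that no separate indices page is needed. */
#define KVM_DIRTY_GFN_F_DIRTY      (1u << 0)  /* published by kernel */
#define KVM_DIRTY_GFN_F_COLLECTED  (1u << 1)  /* harvested by userspace */

/*
 *   00b  invalid entry (never published, or already reset by kernel)
 *   01b  dirty gfn published by the kernel, waiting for userspace
 *   1xb  dirty gfn collected by userspace, waiting for
 *        KVM_RESET_DIRTY_RINGS to reprotect it and return it to 00b
 */

static inline bool dirty_gfn_is_collected(struct kvm_dirty_gfn *gfn)
{
	return READ_ONCE(gfn->pad) & KVM_DIRTY_GFN_F_COLLECTED;
}

/* What KVM_RESET_DIRTY_RINGS would do per collected entry (the mask
 * coalescing done by the current patch is left out for brevity): */
static void reset_one_entry(struct kvm *kvm, struct kvm_dirty_gfn *gfn)
{
	kvm_reset_dirty_gfn(kvm, gfn->slot, gfn->offset, 1);
	WRITE_ONCE(gfn->pad, 0);	/* back to the invalid state */
}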

Paolo
Michael S. Tsirkin Jan. 19, 2020, 10:12 a.m. UTC | #22
On Sun, Jan 19, 2020 at 10:09:53AM +0100, Paolo Bonzini wrote:
> On 09/01/20 20:15, Peter Xu wrote:
> > Regarding dropping the indices: I feel like it can be done, though we
> > probably need two extra bits for each GFN entry, for example:
> > 
> >   - Bit 0 of the GFN address to show whether this is a valid publish
> >     of dirty gfn
> > 
> >   - Bit 1 of the GFN address to show whether this is collected by the
> >     user
> 
> We can use bit 62 and 63 of the GFN.

If we are short on bits we can just use 1 bit. E.g. set if
userspace has collected the GFN.

> I think this can be done in a secure way.  Later in the thread you say:
> 
> > We simply check fetch_index (sorry I
> > meant this when I said reset_index, anyway it's the only index that we
> > expose to userspace) to make sure:
> > 
> >   reset_index <= fetch_index <= dirty_index
> 
> So this means that KVM_RESET_DIRTY_RINGS should only test the "collected
> by user" flag on dirty ring entries between reset_index and dirty_index.
> 
> Also I would make it
> 
>    00b (invalid GFN) ->
>      01b (valid gfn published by kernel, which is dirty) ->
>        1*b (gfn dirty page collected by userspace) ->
>          00b (gfn reset by kernel, so goes back to invalid gfn)
> That is 10b and 11b are equivalent.  The kernel doesn't read that bit if
> userspace has collected the page.
> 
> Paolo
Peter Xu Jan. 20, 2020, 6:48 a.m. UTC | #23
On Fri, Jan 17, 2020 at 04:50:48AM -0500, Michael S. Tsirkin wrote:
> On Thu, Jan 16, 2020 at 11:27:03AM -0500, Peter Xu wrote:
> > On Thu, Jan 16, 2020 at 03:38:21AM -0500, Michael S. Tsirkin wrote:
> > > On Thu, Jan 09, 2020 at 09:57:20AM -0500, Peter Xu wrote:
> > > > +	/* If to map any writable page within dirty ring, fail it */
> > > > +	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
> > > > +	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
> > > > +	    vma->vm_flags & VM_WRITE)
> > > > +		return -EINVAL;
> > > 
> > > Worth thinking about other flags. Do we want to force VM_SHARED?
> > > Disable VM_EXEC?
> > 
> > Makes sense to me.  I think it's worth a standalone patch, since the
> > checks should apply to all of the per-vcpu mmaped regions rather than
> > only to the dirty ring buffers.
> > 
> > (Should include KVM_PIO_PAGE_OFFSET, KVM_COALESCED_MMIO_PAGE_OFFSET,
> >  KVM_S390_SIE_PAGE_OFFSET, kvm_run, and this new one)
> > 
> > Thanks,
> 
> 
> I don't think we can change UAPI for existing ones.
> Userspace might be setting these by mistake.

Right (especially for VM_EXEC)... I'll only check that for the new
pages then.  Thanks,
Peter Xu Jan. 20, 2020, 7:29 a.m. UTC | #24
On Sun, Jan 19, 2020 at 05:12:35AM -0500, Michael S. Tsirkin wrote:
> On Sun, Jan 19, 2020 at 10:09:53AM +0100, Paolo Bonzini wrote:
> > On 09/01/20 20:15, Peter Xu wrote:
> > > Regarding dropping the indices: I feel like it can be done, though we
> > > probably need two extra bits for each GFN entry, for example:
> > > 
> > >   - Bit 0 of the GFN address to show whether this is a valid publish
> > >     of dirty gfn
> > > 
> > >   - Bit 1 of the GFN address to show whether this is collected by the
> > >     user
> > 
> > We can use bit 62 and 63 of the GFN.
> 
> If we are short on bits we can just use 1 bit. E.g. set if
> userspace has collected the GFN.

I'm still unsure whether we can use only one bit for this.
Otherwise, how does userspace know the entry is valid?  For example,
the entry with all zeros ({.slot = 0, gfn = 0}) could be recognized
as a valid dirty page on slot 0 gfn 0, even if it's actually an
unused entry.

> 
> > I think this can be done in a secure way.  Later in the thread you say:
> > 
> > > We simply check fetch_index (sorry I
> > > meant this when I said reset_index, anyway it's the only index that we
> > > expose to userspace) to make sure:
> > > 
> > >   reset_index <= fetch_index <= dirty_index
> > 
> > So this means that KVM_RESET_DIRTY_RINGS should only test the "collected
> > by user" flag on dirty ring entries between reset_index and dirty_index.
> > 
> > Also I would make it
> > 
> >    00b (invalid GFN) ->
> >      01b (valid gfn published by kernel, which is dirty) ->
> >        1*b (gfn dirty page collected by userspace) ->
> >          00b (gfn reset by kernel, so goes back to invalid gfn)
> > That is 10b and 11b are equivalent.  The kernel doesn't read that bit if
> > userspace has collected the page.

Yes "1*b" is good too (IMHO as long as we can define three states for
an entry).  However do you want me to change to that?  Note that I
still think we need to read the rest of the field (in this case,
"slot" and "gfn") besides the two bits to do re-protect.  Should we
trust that unconditionally if writable?

Thanks,
Michael S. Tsirkin Jan. 20, 2020, 7:47 a.m. UTC | #25
On Mon, Jan 20, 2020 at 03:29:15PM +0800, Peter Xu wrote:
> On Sun, Jan 19, 2020 at 05:12:35AM -0500, Michael S. Tsirkin wrote:
> > On Sun, Jan 19, 2020 at 10:09:53AM +0100, Paolo Bonzini wrote:
> > > On 09/01/20 20:15, Peter Xu wrote:
> > > > Regarding dropping the indices: I feel like it can be done, though we
> > > > probably need two extra bits for each GFN entry, for example:
> > > > 
> > > >   - Bit 0 of the GFN address to show whether this is a valid publish
> > > >     of dirty gfn
> > > > 
> > > >   - Bit 1 of the GFN address to show whether this is collected by the
> > > >     user
> > > 
> > > We can use bit 62 and 63 of the GFN.
> > 
> > If we are short on bits we can just use 1 bit. E.g. set if
> > userspace has collected the GFN.
> 
> I'm still unsure whether we can use only one bit for this.
> Otherwise, how does userspace know the entry is valid?  For example,
> the entry with all zeros ({.slot = 0, gfn = 0}) could be recognized
> as a valid dirty page on slot 0 gfn 0, even if it's actually an
> unused entry.

So I guess the reverse: valid entry has bit set, userspace sets it to
0 when it collects it?


> > 
> > > I think this can be done in a secure way.  Later in the thread you say:
> > > 
> > > > We simply check fetch_index (sorry I
> > > > meant this when I said reset_index, anyway it's the only index that we
> > > > expose to userspace) to make sure:
> > > > 
> > > >   reset_index <= fetch_index <= dirty_index
> > > 
> > > So this means that KVM_RESET_DIRTY_RINGS should only test the "collected
> > > by user" flag on dirty ring entries between reset_index and dirty_index.
> > > 
> > > Also I would make it
> > > 
> > >    00b (invalid GFN) ->
> > >      01b (valid gfn published by kernel, which is dirty) ->
> > >        1*b (gfn dirty page collected by userspace) ->
> > >          00b (gfn reset by kernel, so goes back to invalid gfn)
> > > That is 10b and 11b are equivalent.  The kernel doesn't read that bit if
> > > userspace has collected the page.
> 
> Yes "1*b" is good too (IMHO as long as we can define three states for
> an entry).  However do you want me to change to that?  Note that I
> still think we need to read the rest of the field (in this case,
> "slot" and "gfn") besides the two bits to do re-protect.  Should we
> trust that unconditionally if writable?
> 
> Thanks,
> 
> -- 
> Peter Xu
Peter Xu Jan. 21, 2020, 8:29 a.m. UTC | #26
On Mon, Jan 20, 2020 at 02:47:46AM -0500, Michael S. Tsirkin wrote:
> On Mon, Jan 20, 2020 at 03:29:15PM +0800, Peter Xu wrote:
> > On Sun, Jan 19, 2020 at 05:12:35AM -0500, Michael S. Tsirkin wrote:
> > > On Sun, Jan 19, 2020 at 10:09:53AM +0100, Paolo Bonzini wrote:
> > > > On 09/01/20 20:15, Peter Xu wrote:
> > > > > Regarding dropping the indices: I feel like it can be done, though we
> > > > > probably need two extra bits for each GFN entry, for example:
> > > > > 
> > > > >   - Bit 0 of the GFN address to show whether this is a valid publish
> > > > >     of dirty gfn
> > > > > 
> > > > >   - Bit 1 of the GFN address to show whether this is collected by the
> > > > >     user
> > > > 
> > > > We can use bit 62 and 63 of the GFN.
> > > 
> > > If we are short on bits we can just use 1 bit. E.g. set if
> > > userspace has collected the GFN.
> > 
> > I'm still unsure whether we can use only one bit for this.
> > Otherwise, how does userspace know the entry is valid?  For example,
> > the entry with all zeros ({.slot = 0, gfn = 0}) could be recognized
> > as a valid dirty page on slot 0 gfn 0, even if it's actually an
> > unused entry.
> 
> So I guess the reverse: valid entry has bit set, userspace sets it to
> 0 when it collects it?

Right, this seems to work.

Thanks,
Paolo Bonzini Jan. 21, 2020, 10:24 a.m. UTC | #27
On 20/01/20 08:29, Peter Xu wrote:
>>>
>>>    00b (invalid GFN) ->
>>>      01b (valid gfn published by kernel, which is dirty) ->
>>>        1*b (gfn dirty page collected by userspace) ->
>>>          00b (gfn reset by kernel, so goes back to invalid gfn)
>>> That is 10b and 11b are equivalent.  The kernel doesn't read that bit if
>>> userspace has collected the page.
> Yes "1*b" is good too (IMHO as long as we can define three states for
> an entry).  However do you want me to change to that?  Note that I
> still think we need to read the rest of the field (in this case,
> "slot" and "gfn") besides the two bits to do re-protect.  Should we
> trust that unconditionally if writable?

I think that userspace would only hurt itself if they do so.  As long as
the kernel has a trusted copy of the indices, it's okay.

We have plenty of bits--x86 limits GFNs to 40 bits (52 bits maximum
physical address).  However, even on other architectures GFNs are
limited to address space size - page shift (64-12).

Paolo
Paolo Bonzini Jan. 21, 2020, 10:25 a.m. UTC | #28
On 21/01/20 09:29, Peter Xu wrote:
>>>> If we are short on bits we can just use 1 bit. E.g. set if
>>>> userspace has collected the GFN.
>>> I'm still unsure whether we can use only one bit for this.  Say,
>>> otherwise how does the userspace knows the entry is valid?  For
>>> example, the entry with all zeros ({.slot = 0, gfn = 0}) could be
>>> recognized as a valid dirty page on slot 0 gfn 0, even if it's
>>> actually an unused entry.
>> So I guess the reverse: valid entry has bit set, userspace sets it to
>> 0 when it collects it?
> Right, this seems to work.

Yes, that's okay too.

Paolo

Patch

diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index ebb37b34dcfc..708c3e0f7eae 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -231,6 +231,7 @@  Based on their initialization different VMs may have different capabilities.
 It is thus encouraged to use the vm ioctl to query for capabilities (available
 with KVM_CAP_CHECK_EXTENSION_VM on the vm fd)
 
+
 4.5 KVM_GET_VCPU_MMAP_SIZE
 
 Capability: basic
@@ -243,6 +244,18 @@  The KVM_RUN ioctl (cf.) communicates with userspace via a shared
 memory region.  This ioctl returns the size of that region.  See the
 KVM_RUN documentation for details.
 
+Besides the size of the KVM_RUN communication region, other areas of
+the VCPU file descriptor can be mmap-ed, including:
+
+- if KVM_CAP_COALESCED_MMIO is available, a page at
+  KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons,
+  this page is included in the result of KVM_GET_VCPU_MMAP_SIZE.
+  KVM_CAP_COALESCED_MMIO is not documented yet.
+
+- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at
+  KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE.  For more information on
+  KVM_CAP_DIRTY_LOG_RING, see section 8.3.
+
 
 4.6 KVM_SET_MEMORY_REGION
 
@@ -5376,6 +5389,7 @@  CPU when the exception is taken. If this virtual SError is taken to EL1 using
 AArch64, this value will be reported in the ISS field of ESR_ELx.
 
 See KVM_CAP_VCPU_EVENTS for more details.
+
 8.20 KVM_CAP_HYPERV_SEND_IPI
 
 Architectures: x86
@@ -5383,6 +5397,7 @@  Architectures: x86
 This capability indicates that KVM supports paravirtualized Hyper-V IPI send
 hypercalls:
 HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
+
 8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
 
 Architecture: x86
@@ -5396,3 +5411,77 @@  handling by KVM (as some KVM hypercall may be mistakenly treated as TLB
 flush hypercalls by Hyper-V) so userspace should disable KVM identification
 in CPUID and only exposes Hyper-V identification. In this case, guest
 thinks it's running on Hyper-V and only use Hyper-V hypercalls.
+
+8.22 KVM_CAP_DIRTY_LOG_RING
+
+Architectures: x86
+Parameters: args[0] - size of the dirty log ring
+
+KVM is capable of tracking dirty memory using ring buffers that are
+mmaped into userspace; there is one dirty ring per vcpu.
+
+One dirty ring is defined as below internally:
+
+struct kvm_dirty_ring {
+	u32 dirty_index;
+	u32 reset_index;
+	u32 size;
+	u32 soft_limit;
+	struct kvm_dirty_gfn *dirty_gfns;
+	struct kvm_dirty_ring_indices *indices;
+	int index;
+};
+
+Dirty GFNs (Guest Frame Numbers) are stored in the dirty_gfns array.
+Each dirty entry is defined as:
+
+struct kvm_dirty_gfn {
+        __u32 pad;
+        __u32 slot; /* as_id | slot_id */
+        __u64 offset;
+};
+
+Most of the ring structure is used by KVM internally, while only the
+indices are exposed to userspace:
+
+struct kvm_dirty_ring_indices {
+	__u32 avail_index; /* set by kernel */
+	__u32 fetch_index; /* set by userspace */
+};
+
+The two indices in the ring buffer are free running counters.
+
+Userspace calls KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM ioctl
+to enable this capability for the new guest and set the size of the
+rings.  It is only allowed before creating any vCPU, and the size of
+the ring must be a power of two.  The larger the ring buffer, the less
+likely the ring is full and the VM is forced to exit to userspace. The
+optimal size depends on the workload, but it is recommended that it be
+at least 64 KiB (4096 entries).
+
+Just like for dirty page bitmaps, the buffer tracks writes to
+all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was
+set in KVM_SET_USER_MEMORY_REGION.  Once a memory region is registered
+with the flag set, userspace can start harvesting dirty pages from the
+ring buffer.
+
+To harvest the dirty pages, userspace accesses the mmaped ring buffer
+to read the dirty GFNs up to avail_index, and sets the fetch_index
+accordingly.  This can be done when the guest is running or paused,
+and dirty pages need not be collected all at once.  After processing
+one or more entries in the ring buffer, userspace calls the VM ioctl
+KVM_RESET_DIRTY_RINGS to notify the kernel that it has updated
+fetch_index and to mark those pages clean.  Therefore, the ioctl
+must be called *before* reading the content of the dirty pages.
+
+However, there is a major difference compared to the KVM_GET_DIRTY_LOG
+interface: when reading the dirty ring from userspace, it is still
+possible that the kernel has not yet flushed the hardware dirty
+buffers into the kernel buffer (with KVM_GET_DIRTY_LOG, the flush was
+done by the ioctl itself).  To make sure all existing dirty gfns are
+flushed to the dirty rings, one needs to kick the vcpu out of guest
+mode so that the hardware dirty buffers are flushed on the vmexit.
+
+If one of the ring buffers is full, the guest will exit to userspace
+with the exit reason set to KVM_EXIT_DIRTY_RING_FULL, and the KVM_RUN
+ioctl will return to userspace with zero.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f536d139b3d2..3fe18402e6a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1181,6 +1181,7 @@  struct kvm_x86_ops {
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
 	int (*write_log_dirty)(struct kvm_vcpu *vcpu);
+	int (*cpu_dirty_log_size)(void);
 
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
@@ -1666,4 +1667,6 @@  static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define GET_SMSTATE(type, buf, offset)		\
 	(*(type *)((buf) + (offset) - 0x7e00))
 
+int kvm_cpu_dirty_log_size(void);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 503d3f42da16..b59bf356c478 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -12,6 +12,7 @@ 
 
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_DIRTY_LOG_PAGE_OFFSET 64
 
 #define DE_VECTOR 0
 #define DB_VECTOR 1
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b19ef421084d..0acee817adfb 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,7 +5,8 @@  ccflags-y += -Iarch/x86/kvm
 KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
+				$(KVM)/dirty_ring.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7269130ea5e2..621b842a9b7b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1832,7 +1832,13 @@  int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
 {
 	if (kvm_x86_ops->write_log_dirty)
 		return kvm_x86_ops->write_log_dirty(vcpu);
+	return 0;
+}
 
+int kvm_cpu_dirty_log_size(void)
+{
+	if (kvm_x86_ops->cpu_dirty_log_size)
+		return kvm_x86_ops->cpu_dirty_log_size();
 	return 0;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 62175a246bcc..2151de89456d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7689,6 +7689,7 @@  static __init int hardware_setup(void)
 		kvm_x86_ops->slot_disable_log_dirty = NULL;
 		kvm_x86_ops->flush_log_dirty = NULL;
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+		kvm_x86_ops->cpu_dirty_log_size = NULL;
 	}
 
 	if (!cpu_has_vmx_preemption_timer())
@@ -7753,6 +7754,11 @@  static __exit void hardware_unsetup(void)
 	free_kvm_area();
 }
 
+static int vmx_cpu_dirty_log_size(void)
+{
+	return enable_pml ? PML_ENTITY_NUM : 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -7875,6 +7881,7 @@  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
 	.write_log_dirty = vmx_write_pml_buffer,
+	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,
 
 	.pre_block = vmx_pre_block,
 	.post_block = vmx_post_block,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff97782b3919..9c3673592826 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7998,6 +7998,15 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	bool req_immediate_exit = false;
 
+	/* Forbid vmenter if vcpu dirty ring is soft-full */
+	if (unlikely(vcpu->kvm->dirty_ring_size &&
+		     kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
+		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+		trace_kvm_dirty_ring_exit(vcpu);
+		r = 0;
+		goto out;
+	}
+
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
 			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
new file mode 100644
index 000000000000..d6fe9e1b7617
--- /dev/null
+++ b/include/linux/kvm_dirty_ring.h
@@ -0,0 +1,55 @@ 
+#ifndef KVM_DIRTY_RING_H
+#define KVM_DIRTY_RING_H
+
+/**
+ * kvm_dirty_ring: KVM internal dirty ring structure
+ *
+ * @dirty_index: free running counter that points to the next slot in
+ *               dirty_ring->dirty_gfns, where a new dirty page should go
+ * @reset_index: free running counter that points to the next dirty page
+ *               in dirty_ring->dirty_gfns for which dirty trap needs to
+ *               be reenabled
+ * @size:        size of the compact list, dirty_ring->dirty_gfns
+ * @soft_limit:  when the number of dirty pages in the list reaches this
+ *               limit, vcpu that owns this ring should exit to userspace
+ *               to allow userspace to harvest all the dirty pages
+ * @dirty_gfns:  the array to keep the dirty gfns
+ * @indices:     the pointer to the @kvm_dirty_ring_indices structure
+ *               of this specific ring
+ * @index:       index of this dirty ring
+ */
+struct kvm_dirty_ring {
+	u32 dirty_index;
+	u32 reset_index;
+	u32 size;
+	u32 soft_limit;
+	struct kvm_dirty_gfn *dirty_gfns;
+	struct kvm_dirty_ring_indices *indices;
+	int index;
+};
+
+u32 kvm_dirty_ring_get_rsvd_entries(void);
+int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
+			 struct kvm_dirty_ring_indices *indices,
+			 int index, u32 size);
+struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm);
+
+/*
+ * called with kvm->slots_lock held, returns the number of
+ * processed pages.
+ */
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
+
+/*
+ * Push a dirty gfn into the ring.  The ring must never be completely
+ * full, which is guaranteed by the soft-full check before vmenter.
+ */
+void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset);
+
+/* for use in vm_operations_struct */
+struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
+
+void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
+bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
+
+#endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cbd633ece959..c96161c6a0c9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -34,6 +34,7 @@ 
 #include <linux/kvm_types.h>
 
 #include <asm/kvm_host.h>
+#include <linux/kvm_dirty_ring.h>
 
 #ifndef KVM_MAX_VCPU_ID
 #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
@@ -321,6 +322,7 @@  struct kvm_vcpu {
 	bool ready;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
+	struct kvm_dirty_ring dirty_ring;
 };
 
 static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
@@ -502,6 +504,7 @@  struct kvm {
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
 	pid_t userspace_pid;
+	u32 dirty_ring_size;
 };
 
 #define kvm_err(fmt, ...) \
@@ -831,6 +834,8 @@  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 					gfn_t gfn_offset,
 					unsigned long mask);
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask);
+
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				struct kvm_dirty_log *log);
 int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
@@ -1409,4 +1414,25 @@  int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
 				uintptr_t data, const char *name,
 				struct task_struct **thread_ptr);
 
+/*
+ * This defines how many reserved entries we want to keep before we
+ * kick the vcpu to the userspace to avoid dirty ring full.  This
+ * value can be tuned to higher if e.g. PML is enabled on the host.
+ */
+#define  KVM_DIRTY_RING_RSVD_ENTRIES  64
+
+/* Max number of entries allowed for each kvm dirty ring */
+#define  KVM_DIRTY_RING_MAX_ENTRIES  65536
+
+/*
+ * Arch needs to define these macro after implementing the dirty ring
+ * feature.  KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the
+ * starting page offset of the dirty ring structures, while
+ * KVM_DIRTY_RING_VERSION should be defined as >=1.  By default, this
+ * feature is off on all archs.
+ */
+#ifndef KVM_DIRTY_LOG_PAGE_OFFSET
+#define KVM_DIRTY_LOG_PAGE_OFFSET 0
+#endif
+
 #endif
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 2c735a3e6613..3d850997940c 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -399,6 +399,84 @@  TRACE_EVENT(kvm_halt_poll_ns,
 #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
 	trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
 
+TRACE_EVENT(kvm_dirty_ring_push,
+	TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset),
+	TP_ARGS(ring, slot, offset),
+
+	TP_STRUCT__entry(
+		__field(int, index)
+		__field(u32, dirty_index)
+		__field(u32, reset_index)
+		__field(u32, slot)
+		__field(u64, offset)
+	),
+
+	TP_fast_assign(
+		__entry->index          = ring->index;
+		__entry->dirty_index    = ring->dirty_index;
+		__entry->reset_index    = ring->reset_index;
+		__entry->slot           = slot;
+		__entry->offset         = offset;
+	),
+
+	TP_printk("ring %d: dirty 0x%x reset 0x%x "
+		  "slot %u offset 0x%llx (used %u)",
+		  __entry->index, __entry->dirty_index,
+		  __entry->reset_index,  __entry->slot, __entry->offset,
+		  __entry->dirty_index - __entry->reset_index)
+);
+
+TRACE_EVENT(kvm_dirty_ring_reset,
+	TP_PROTO(struct kvm_dirty_ring *ring),
+	TP_ARGS(ring),
+
+	TP_STRUCT__entry(
+		__field(int, index)
+		__field(u32, dirty_index)
+		__field(u32, reset_index)
+	),
+
+	TP_fast_assign(
+		__entry->index          = ring->index;
+		__entry->dirty_index    = ring->dirty_index;
+		__entry->reset_index    = ring->reset_index;
+	),
+
+	TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)",
+		  __entry->index, __entry->dirty_index, __entry->reset_index,
+		  __entry->dirty_index - __entry->reset_index)
+);
+
+TRACE_EVENT(kvm_dirty_ring_waitqueue,
+	TP_PROTO(bool enter),
+	TP_ARGS(enter),
+
+	TP_STRUCT__entry(
+	    __field(bool, enter)
+	),
+
+	TP_fast_assign(
+	    __entry->enter = enter;
+	),
+
+	TP_printk("%s", __entry->enter ? "wait" : "awake")
+);
+
+TRACE_EVENT(kvm_dirty_ring_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+	    __field(int, vcpu_id)
+	),
+
+	TP_fast_assign(
+	    __entry->vcpu_id = vcpu->vcpu_id;
+	),
+
+	TP_printk("vcpu %d", __entry->vcpu_id)
+);
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f0a16b4adbbd..df4a1700ff1e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -236,6 +236,7 @@  struct kvm_hyperv_exit {
 #define KVM_EXIT_IOAPIC_EOI       26
 #define KVM_EXIT_HYPERV           27
 #define KVM_EXIT_ARM_NISV         28
+#define KVM_EXIT_DIRTY_RING_FULL  29
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -247,6 +248,13 @@  struct kvm_hyperv_exit {
 /* Encounter unexpected vm-exit reason */
 #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON	4
 
+struct kvm_dirty_ring_indices {
+	__u32 avail_index; /* set by kernel */
+	__u32 padding1;
+	__u32 fetch_index; /* set by userspace */
+	__u32 padding2;
+};
+
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
@@ -421,6 +429,8 @@  struct kvm_run {
 		struct kvm_sync_regs regs;
 		char padding[SYNC_REGS_SIZE_BYTES];
 	} s;
+
+	struct kvm_dirty_ring_indices vcpu_ring_indices;
 };
 
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -1009,6 +1019,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
 #define KVM_CAP_ARM_NISV_TO_USER 177
 #define KVM_CAP_ARM_INJECT_EXT_DABT 178
+#define KVM_CAP_DIRTY_LOG_RING 179
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1473,6 +1484,9 @@  struct kvm_enc_region {
 /* Available with KVM_CAP_ARM_SVE */
 #define KVM_ARM_VCPU_FINALIZE	  _IOW(KVMIO,  0xc2, int)
 
+/* Available with KVM_CAP_DIRTY_LOG_RING */
+#define KVM_RESET_DIRTY_RINGS     _IO(KVMIO, 0xc3)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
 	/* Guest initialization commands */
@@ -1623,4 +1637,23 @@  struct kvm_hyperv_eventfd {
 #define KVM_HYPERV_CONN_ID_MASK		0x00ffffff
 #define KVM_HYPERV_EVENTFD_DEASSIGN	(1 << 0)
 
+/*
+ * The following are the requirements for supporting dirty log ring
+ * (by enabling KVM_DIRTY_LOG_PAGE_OFFSET).
+ *
+ * 1. Memory accesses by KVM should call kvm_vcpu_write_* instead
+ *    of kvm_write_* so that the global dirty ring is not filled up
+ *    too quickly.
+ * 2. kvm_arch_mmu_enable_log_dirty_pt_masked should be defined for
+ *    enabling dirty logging.
+ * 3. There should not be a separate step to synchronize hardware
+ *    dirty bitmap with KVM's.
+ */
+
+struct kvm_dirty_gfn {
+	__u32 pad;
+	__u32 slot;
+	__u64 offset;
+};
+
 #endif /* __LINUX_KVM_H */
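
Taken together, the uapi bits above define the whole userspace side of the protocol: mmap the per-vcpu ring read-only at page offset KVM_DIRTY_LOG_PAGE_OFFSET of the vcpu fd, consume kvm_dirty_gfn entries between fetch_index and the kernel-published avail_index in kvm_run->vcpu_ring_indices, then advance fetch_index and call KVM_RESET_DIRTY_RINGS on the VM fd so KVM can reprotect the harvested pages.  A rough, non-authoritative sketch of such a consumer; collect_dirty_page() is a hypothetical callback, and the mmaps of run/gfns are assumed to have been done by the caller.

/*
 * Illustrative userspace consumer of the dirty ring ABI; not part of
 * this patch.  Assumes KVM_CAP_DIRTY_LOG_RING was enabled, "run" is
 * the vcpu's mmap'ed kvm_run, and "gfns" is the ring mmap'ed
 * read-only from the vcpu fd at page offset KVM_DIRTY_LOG_PAGE_OFFSET.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static void dirty_ring_harvest(int vm_fd, struct kvm_run *run,
			       struct kvm_dirty_gfn *gfns, uint32_t nent,
			       void (*collect_dirty_page)(uint32_t slot,
							  uint64_t offset))
{
	struct kvm_dirty_ring_indices *ind = &run->vcpu_ring_indices;
	uint32_t fetch = ind->fetch_index;
	/* Pairs with the kernel's smp_wmb() before avail_index is bumped */
	uint32_t avail = __atomic_load_n(&ind->avail_index, __ATOMIC_ACQUIRE);

	while (fetch != avail) {
		struct kvm_dirty_gfn *e = &gfns[fetch & (nent - 1)];

		/* slot encodes (as_id << 16) | slot_id; offset is the gfn
		 * offset within that memslot */
		collect_dirty_page(e->slot, e->offset);
		fetch++;
	}

	/* Publish our progress, then ask KVM to reprotect those pages */
	ind->fetch_index = fetch;
	ioctl(vm_fd, KVM_RESET_DIRTY_RINGS);
}

Once KVM_RESET_DIRTY_RINGS returns, the harvested entries may be reused by the kernel and the corresponding pages will show up in the ring again on the next guest write.
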
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
new file mode 100644
index 000000000000..67ec5bbc21c0
--- /dev/null
+++ b/virt/kvm/dirty_ring.c
@@ -0,0 +1,162 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM dirty ring implementation
+ *
+ * Copyright 2019 Red Hat, Inc.
+ */
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_dirty_ring.h>
+#include <trace/events/kvm.h>
+
+int __weak kvm_cpu_dirty_log_size(void)
+{
+	return 0;
+}
+
+u32 kvm_dirty_ring_get_rsvd_entries(void)
+{
+	return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size();
+}
+
+static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
+{
+	return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index);
+}
+
+bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
+{
+	return kvm_dirty_ring_used(ring) >= ring->soft_limit;
+}
+
+bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
+{
+	return kvm_dirty_ring_used(ring) >= ring->size;
+}
+
+struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+	WARN_ON_ONCE(vcpu->kvm != kvm);
+
+	return &vcpu->dirty_ring;
+}
+
+int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring,
+			 struct kvm_dirty_ring_indices *indices,
+			 int index, u32 size)
+{
+	ring->dirty_gfns = vmalloc(size);
+	if (!ring->dirty_gfns)
+		return -ENOMEM;
+	memset(ring->dirty_gfns, 0, size);
+
+	ring->size = size / sizeof(struct kvm_dirty_gfn);
+	ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
+	ring->dirty_index = 0;
+	ring->reset_index = 0;
+	ring->index = index;
+	ring->indices = indices;
+
+	return 0;
+}
+
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
+{
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask;
+	u32 fetch;
+	int count = 0;
+	struct kvm_dirty_gfn *entry;
+	struct kvm_dirty_ring_indices *indices = ring->indices;
+	bool first_round = true;
+
+	fetch = READ_ONCE(indices->fetch_index);
+
+	/*
+	 * Note that fetch_index is written by userspace and should not
+	 * be trusted.  If the check below fails, it is most likely that
+	 * userspace has handed us a bogus fetch_index.
+	 */
+	if (fetch - ring->reset_index > ring->size)
+		return -EINVAL;
+
+	if (fetch == ring->reset_index)
+		return 0;
+
+	/* This is only needed to make compilers happy */
+	cur_slot = cur_offset = mask = 0;
+	while (ring->reset_index != fetch) {
+		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
+		next_slot = READ_ONCE(entry->slot);
+		next_offset = READ_ONCE(entry->offset);
+		ring->reset_index++;
+		count++;
+		/*
+		 * Try to coalesce the reset operations when the guest is
+		 * scanning pages in the same slot.
+		 */
+		if (!first_round && next_slot == cur_slot) {
+			s64 delta = next_offset - cur_offset;
+
+			if (delta >= 0 && delta < BITS_PER_LONG) {
+				mask |= 1ull << delta;
+				continue;
+			}
+
+			/* Backwards visit, careful about overflows!  */
+			if (delta > -BITS_PER_LONG && delta < 0 &&
+			    (mask << -delta >> -delta) == mask) {
+				cur_offset = next_offset;
+				mask = (mask << -delta) | 1;
+				continue;
+			}
+		}
+		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+		cur_slot = next_slot;
+		cur_offset = next_offset;
+		mask = 1;
+		first_round = false;
+	}
+	kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	trace_kvm_dirty_ring_reset(ring);
+
+	return count;
+}
+
+void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
+{
+	struct kvm_dirty_gfn *entry;
+	struct kvm_dirty_ring_indices *indices = ring->indices;
+
+	/* It should never get full */
+	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
+
+	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
+	entry->slot = slot;
+	entry->offset = offset;
+	/*
+	 * Make sure the data is filled in before we publish it to
+	 * userspace; the paired read barrier lives in userspace.
+	 */
+	smp_wmb();
+	ring->dirty_index++;
+	WRITE_ONCE(indices->avail_index, ring->dirty_index);
+
+	trace_kvm_dirty_ring_push(ring, slot, offset);
+}
+
+struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
+{
+	return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE);
+}
+
+void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
+{
+	vfree(ring->dirty_gfns);
+	ring->dirty_gfns = NULL;
+}
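
Note that dirty_index, reset_index and the userspace fetch_index are free-running u32 counters: they are only masked with (size - 1) when indexing dirty_gfns, so the used count and the sanity check on fetch_index stay correct across u32 wraparound as long as the ring size is a power of two.  A standalone toy (plain userspace C, made-up values, not kernel code) showing why:

/* Toy demonstration of the free-running index arithmetic above. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint32_t size = 4096;             /* entries, power of two */
	uint32_t reset_index = UINT32_MAX - 20;
	uint32_t dirty_index = UINT32_MAX - 5;  /* about to wrap */
	uint32_t bad_fetch;

	/* "used" is exact even right before the counter wraps */
	assert(dirty_index - reset_index == 15);

	/* push 10 more entries: the counter wraps past zero harmlessly */
	dirty_index += 10;
	assert(dirty_index == 4);
	assert(dirty_index - reset_index == 25);

	/* slots are addressed through the mask, never by the raw counter */
	assert((dirty_index & (size - 1)) == 4);

	/* a fetch_index more than one ring ahead of reset_index is bogus */
	bad_fetch = reset_index + size + 1;
	assert(bad_fetch - reset_index > size);

	return 0;
}
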
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5bbd8b8730fa..5e36792e15ae 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -64,6 +64,8 @@ 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_dirty_ring.h>
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
@@ -357,11 +359,22 @@  int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->preempted = false;
 	vcpu->ready = false;
 
+	if (kvm->dirty_ring_size) {
+		r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
+					 &vcpu->run->vcpu_ring_indices,
+					 id, kvm->dirty_ring_size);
+		if (r)
+			goto fail_free_run;
+	}
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
-		goto fail_free_run;
+		goto fail_free_ring;
 	return 0;
 
+fail_free_ring:
+	if (kvm->dirty_ring_size)
+		kvm_dirty_ring_free(&vcpu->dirty_ring);
 fail_free_run:
 	free_page((unsigned long)vcpu->run);
 fail:
@@ -379,6 +392,8 @@  void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
 	kvm_arch_vcpu_uninit(vcpu);
 	free_page((unsigned long)vcpu->run);
+	if (vcpu->kvm->dirty_ring_size)
+		kvm_dirty_ring_free(&vcpu->dirty_ring);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
 
@@ -2284,8 +2299,13 @@  static void mark_page_dirty_in_slot(struct kvm *kvm,
 {
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
+		u32 slot = (memslot->as_id << 16) | memslot->id;
 
-		set_bit_le(rel_gfn, memslot->dirty_bitmap);
+		if (kvm->dirty_ring_size)
+			kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
+					    slot, rel_gfn);
+		else
+			set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
 
@@ -2632,6 +2652,16 @@  void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
+static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
+{
+	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
+		return false;
+
+	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+	     kvm->dirty_ring_size / PAGE_SIZE);
+}
+
 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 {
 	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
@@ -2647,6 +2677,10 @@  static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
+		page = kvm_dirty_ring_get_page(
+		    &vcpu->dirty_ring,
+		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
@@ -2660,6 +2694,15 @@  static const struct vm_operations_struct kvm_vcpu_vm_ops = {
 
 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct kvm_vcpu *vcpu = file->private_data;
+	unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+	/* Refuse any attempt to map the dirty ring pages as writable */
+	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
+	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
+	    vma->vm_flags & VM_WRITE)
+		return -EINVAL;
+
 	vma->vm_ops = &kvm_vcpu_vm_ops;
 	return 0;
 }
@@ -3242,12 +3285,97 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #endif
 	case KVM_CAP_NR_MEMSLOTS:
 		return KVM_USER_MEM_SLOTS;
+	case KVM_CAP_DIRTY_LOG_RING:
+#ifdef CONFIG_X86
+		return KVM_DIRTY_RING_MAX_ENTRIES;
+#else
+		return 0;
+#endif
 	default:
 		break;
 	}
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+	if (offset >= memslot->npages)
+		return;
+
+	spin_lock(&kvm->mmu_lock);
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	int r;
+
+	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
+		return -EINVAL;
+
+	/* The size should be a power of 2 */
+	if (!size || (size & (size - 1)))
+		return -EINVAL;
+
+	/* The size should hold the reserved entries and be at least a page */
+	if (size < kvm_dirty_ring_get_rsvd_entries() *
+	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
+		return -EINVAL;
+
+	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
+	    sizeof(struct kvm_dirty_gfn))
+		return -E2BIG;
+
+	/* We only allow the ring size to be set once */
+	if (kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm->created_vcpus) {
+		/* We don't allow changing the ring size after vcpus are created */
+		r = -EINVAL;
+	} else {
+		kvm->dirty_ring_size = size;
+		r = 0;
+	}
+
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
+}
+
 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 						  struct kvm_enable_cap *cap)
 {
@@ -3265,6 +3393,8 @@  static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 		kvm->manual_dirty_log_protect = cap->args[0];
 		return 0;
 #endif
+	case KVM_CAP_DIRTY_LOG_RING:
+		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
@@ -3452,6 +3582,9 @@  static long kvm_vm_ioctl(struct file *filp,
 	case KVM_CHECK_EXTENSION:
 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
 		break;
+	case KVM_RESET_DIRTY_RINGS:
+		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
+		break;
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}