diff mbox series

[v7,1/4] KVM: Implement dirty quota-based throttling of vcpus

Message ID 20221113170507.208810-2-shivam.kumar1@nutanix.com (mailing list archive)
State New, archived
Headers show
Series KVM: Dirty quota-based throttling | expand

Commit Message

Shivam Kumar Nov. 13, 2022, 5:05 p.m. UTC
Define variables to track and throttle memory dirtying for every vcpu.

dirty_count:    Number of pages the vcpu has dirtied since its creation,
                while dirty logging is enabled.
dirty_quota:    Number of pages the vcpu is allowed to dirty. To dirty
                more, it needs to request more quota by exiting to
                userspace.

Implement the flow for throttling based on dirty quota.

i) Increment dirty_count for the vcpu whenever it dirties a page.
ii) Exit to userspace whenever the dirty quota is exhausted (i.e. dirty
count equals/exceeds dirty quota) to request more dirty quota.

Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
---
 Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
 arch/x86/kvm/Kconfig           |  1 +
 include/linux/kvm_host.h       |  5 ++++-
 include/linux/kvm_types.h      |  1 +
 include/uapi/linux/kvm.h       | 13 +++++++++++++
 tools/include/uapi/linux/kvm.h |  1 +
 virt/kvm/Kconfig               |  4 ++++
 virt/kvm/kvm_main.c            | 25 +++++++++++++++++++++---
 8 files changed, 81 insertions(+), 4 deletions(-)

Comments

Yunhong Jiang Nov. 14, 2022, 11:29 p.m. UTC | #1
On Sun, Nov 13, 2022 at 05:05:06PM +0000, Shivam Kumar wrote:
> Define variables to track and throttle memory dirtying for every vcpu.
> 
> dirty_count:    Number of pages the vcpu has dirtied since its creation,
>                 while dirty logging is enabled.
> dirty_quota:    Number of pages the vcpu is allowed to dirty. To dirty
>                 more, it needs to request more quota by exiting to
>                 userspace.
> 
> Implement the flow for throttling based on dirty quota.
> 
> i) Increment dirty_count for the vcpu whenever it dirties a page.
> ii) Exit to userspace whenever the dirty quota is exhausted (i.e. dirty
> count equals/exceeds dirty quota) to request more dirty quota.
> 
> Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
> Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
> Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
> ---
>  Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/Kconfig           |  1 +
>  include/linux/kvm_host.h       |  5 ++++-
>  include/linux/kvm_types.h      |  1 +
>  include/uapi/linux/kvm.h       | 13 +++++++++++++
>  tools/include/uapi/linux/kvm.h |  1 +
>  virt/kvm/Kconfig               |  4 ++++
>  virt/kvm/kvm_main.c            | 25 +++++++++++++++++++++---
>  8 files changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index eee9f857a986..4568faa33c6d 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6513,6 +6513,26 @@ array field represents return values. The userspace should update the return
>  values of SBI call before resuming the VCPU. For more details on RISC-V SBI
>  spec refer, https://github.com/riscv/riscv-sbi-doc.
>  
> +::
> +
> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
> +		struct {
> +			__u64 count;
> +			__u64 quota;
> +		} dirty_quota_exit;
> +
> +If exit reason is KVM_EXIT_DIRTY_QUOTA_EXHAUSTED, it indicates that the VCPU has
> +exhausted its dirty quota. The 'dirty_quota_exit' member of kvm_run structure
> +makes the following information available to the userspace:
> +    count: the current count of pages dirtied by the VCPU, can be
> +    skewed based on the size of the pages accessed by each vCPU.
> +    quota: the observed dirty quota just before the exit to userspace.
> +
> +The userspace can design a strategy to allocate the overall scope of dirtying
> +for the VM among the vcpus. Based on the strategy and the current state of dirty
> +quota throttling, the userspace can make a decision to either update (increase)
> +the quota or to put the VCPU to sleep for some time.
> +
>  ::
>  
>      /* KVM_EXIT_NOTIFY */
> @@ -6567,6 +6587,21 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
>  
>  ::
>  
> +	/*
> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
> +	 * is reached/exceeded.
> +	 */
> +	__u64 dirty_quota;
> +
> +Please note that enforcing the quota is best effort, as the guest may dirty
> +multiple pages before KVM can recheck the quota.  However, unless KVM is using
> +a hardware-based dirty ring buffer, e.g. Intel's Page Modification Logging,
> +KVM will detect quota exhaustion within a handful of dirtied pages.  If a
> +hardware ring buffer is used, the overrun is bounded by the size of the buffer
> +(512 entries for PML).
> +
> +::
>    };
>  
>  
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 67be7f217e37..bdbd36321d52 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -48,6 +48,7 @@ config KVM
>  	select KVM_VFIO
>  	select SRCU
>  	select INTERVAL_TREE
> +	select HAVE_KVM_DIRTY_QUOTA
>  	select HAVE_KVM_PM_NOTIFIER if PM
>  	help
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 18592bdf4c1b..0b9b5c251a04 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -151,11 +151,12 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQUEST_NO_ACTION      BIT(10)
>  /*
>   * Architecture-independent vcpu->requests bit members
> - * Bits 3-7 are reserved for more arch-independent bits.
> + * Bits 5-7 are reserved for more arch-independent bits.
>   */
>  #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_UNBLOCK           2
> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4
Sorry if I missed anything. Why it's 4 instead of 3?
Shivam Kumar Nov. 15, 2022, 4:48 a.m. UTC | #2
On 15/11/22 4:59 am, Yunhong Jiang wrote:
> On Sun, Nov 13, 2022 at 05:05:06PM +0000, Shivam Kumar wrote:
>> Define variables to track and throttle memory dirtying for every vcpu.
>>
>> dirty_count:    Number of pages the vcpu has dirtied since its creation,
>>                  while dirty logging is enabled.
>> dirty_quota:    Number of pages the vcpu is allowed to dirty. To dirty
>>                  more, it needs to request more quota by exiting to
>>                  userspace.
>>
>> Implement the flow for throttling based on dirty quota.
>>
>> i) Increment dirty_count for the vcpu whenever it dirties a page.
>> ii) Exit to userspace whenever the dirty quota is exhausted (i.e. dirty
>> count equals/exceeds dirty quota) to request more dirty quota.
>>
>> Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
>> Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
>> Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
>> Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
>> Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
>> ---
>>   Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
>>   arch/x86/kvm/Kconfig           |  1 +
>>   include/linux/kvm_host.h       |  5 ++++-
>>   include/linux/kvm_types.h      |  1 +
>>   include/uapi/linux/kvm.h       | 13 +++++++++++++
>>   tools/include/uapi/linux/kvm.h |  1 +
>>   virt/kvm/Kconfig               |  4 ++++
>>   virt/kvm/kvm_main.c            | 25 +++++++++++++++++++++---
>>   8 files changed, 81 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index eee9f857a986..4568faa33c6d 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6513,6 +6513,26 @@ array field represents return values. The userspace should update the return
>>   values of SBI call before resuming the VCPU. For more details on RISC-V SBI
>>   spec refer, https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_riscv_riscv-2Dsbi-2Ddoc&d=DwIBAg&c=s883GpUCOChKOHiocYtGcg&r=4hVFP4-J13xyn-OcN0apTCh8iKZRosf5OJTQePXBMB8&m=tRXKbxC9SnS7eawp-UoL_Dqi0C8tdGVG6pmh1Gv6Ijw1OItZ-EFLCPz4aAQ_3sob&s=HFLZ0ulDwLg_-wHdDvDhunY5olDW4tZ-6NQQE9WirIY&e= .
>>   
>> +::
>> +
>> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
>> +		struct {
>> +			__u64 count;
>> +			__u64 quota;
>> +		} dirty_quota_exit;
>> +
>> +If exit reason is KVM_EXIT_DIRTY_QUOTA_EXHAUSTED, it indicates that the VCPU has
>> +exhausted its dirty quota. The 'dirty_quota_exit' member of kvm_run structure
>> +makes the following information available to the userspace:
>> +    count: the current count of pages dirtied by the VCPU, can be
>> +    skewed based on the size of the pages accessed by each vCPU.
>> +    quota: the observed dirty quota just before the exit to userspace.
>> +
>> +The userspace can design a strategy to allocate the overall scope of dirtying
>> +for the VM among the vcpus. Based on the strategy and the current state of dirty
>> +quota throttling, the userspace can make a decision to either update (increase)
>> +the quota or to put the VCPU to sleep for some time.
>> +
>>   ::
>>   
>>       /* KVM_EXIT_NOTIFY */
>> @@ -6567,6 +6587,21 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
>>   
>>   ::
>>   
>> +	/*
>> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
>> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
>> +	 * is reached/exceeded.
>> +	 */
>> +	__u64 dirty_quota;
>> +
>> +Please note that enforcing the quota is best effort, as the guest may dirty
>> +multiple pages before KVM can recheck the quota.  However, unless KVM is using
>> +a hardware-based dirty ring buffer, e.g. Intel's Page Modification Logging,
>> +KVM will detect quota exhaustion within a handful of dirtied pages.  If a
>> +hardware ring buffer is used, the overrun is bounded by the size of the buffer
>> +(512 entries for PML).
>> +
>> +::
>>     };
>>   
>>   
>> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
>> index 67be7f217e37..bdbd36321d52 100644
>> --- a/arch/x86/kvm/Kconfig
>> +++ b/arch/x86/kvm/Kconfig
>> @@ -48,6 +48,7 @@ config KVM
>>   	select KVM_VFIO
>>   	select SRCU
>>   	select INTERVAL_TREE
>> +	select HAVE_KVM_DIRTY_QUOTA
>>   	select HAVE_KVM_PM_NOTIFIER if PM
>>   	help
>>   	  Support hosting fully virtualized guest machines using hardware
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 18592bdf4c1b..0b9b5c251a04 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -151,11 +151,12 @@ static inline bool is_error_page(struct page *page)
>>   #define KVM_REQUEST_NO_ACTION      BIT(10)
>>   /*
>>    * Architecture-independent vcpu->requests bit members
>> - * Bits 3-7 are reserved for more arch-independent bits.
>> + * Bits 5-7 are reserved for more arch-independent bits.
>>    */
>>   #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>>   #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>>   #define KVM_REQ_UNBLOCK           2
>> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4
> Sorry if I missed anything. Why it's 4 instead of 3?
3 was already in use last time. Will update it. Thanks.
Marc Zyngier Nov. 17, 2022, 7:26 p.m. UTC | #3
On Sun, 13 Nov 2022 17:05:06 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> Define variables to track and throttle memory dirtying for every vcpu.
> 
> dirty_count:    Number of pages the vcpu has dirtied since its creation,
>                 while dirty logging is enabled.
> dirty_quota:    Number of pages the vcpu is allowed to dirty. To dirty
>                 more, it needs to request more quota by exiting to
>                 userspace.
> 
> Implement the flow for throttling based on dirty quota.
> 
> i) Increment dirty_count for the vcpu whenever it dirties a page.
> ii) Exit to userspace whenever the dirty quota is exhausted (i.e. dirty
> count equals/exceeds dirty quota) to request more dirty quota.
> 
> Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
> Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
> Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
> ---
>  Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/Kconfig           |  1 +
>  include/linux/kvm_host.h       |  5 ++++-
>  include/linux/kvm_types.h      |  1 +
>  include/uapi/linux/kvm.h       | 13 +++++++++++++
>  tools/include/uapi/linux/kvm.h |  1 +
>  virt/kvm/Kconfig               |  4 ++++
>  virt/kvm/kvm_main.c            | 25 +++++++++++++++++++++---
>  8 files changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index eee9f857a986..4568faa33c6d 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6513,6 +6513,26 @@ array field represents return values. The userspace should update the return
>  values of SBI call before resuming the VCPU. For more details on RISC-V SBI
>  spec refer, https://github.com/riscv/riscv-sbi-doc.
>  
> +::
> +
> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
> +		struct {
> +			__u64 count;
> +			__u64 quota;
> +		} dirty_quota_exit;
> +
> +If exit reason is KVM_EXIT_DIRTY_QUOTA_EXHAUSTED, it indicates that the VCPU has
> +exhausted its dirty quota. The 'dirty_quota_exit' member of kvm_run structure
> +makes the following information available to the userspace:
> +    count: the current count of pages dirtied by the VCPU, can be
> +    skewed based on the size of the pages accessed by each vCPU.

How can userspace make a decision on the amount of dirtying this
represent if this doesn't represent a number of base pages? Or are you
saying that this only counts the number of permission faults that have
dirtied pages?

> +    quota: the observed dirty quota just before the exit to
> userspace.

You are defining the quota in terms of quota. -ENOCLUE.

> +
> +The userspace can design a strategy to allocate the overall scope of dirtying
> +for the VM among the vcpus. Based on the strategy and the current state of dirty
> +quota throttling, the userspace can make a decision to either update (increase)
> +the quota or to put the VCPU to sleep for some time.

This looks like something out of 1984 (Newspeak anyone)? Can't you
just say that userspace is responsible for allocating the quota and
manage the resulting throttling effect?

> +
>  ::
>  
>      /* KVM_EXIT_NOTIFY */
> @@ -6567,6 +6587,21 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
>  
>  ::
>  
> +	/*
> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
> +	 * is reached/exceeded.
> +	 */
> +	__u64 dirty_quota;

How are dirty_quota and dirty_quota_exit.quota related?

> +
> +Please note that enforcing the quota is best effort, as the guest may dirty
> +multiple pages before KVM can recheck the quota.  However, unless KVM is using
> +a hardware-based dirty ring buffer, e.g. Intel's Page Modification Logging,
> +KVM will detect quota exhaustion within a handful of dirtied pages.  If a
> +hardware ring buffer is used, the overrun is bounded by the size of the buffer
> +(512 entries for PML).
> +
> +::
>    };
>  
>  
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 67be7f217e37..bdbd36321d52 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -48,6 +48,7 @@ config KVM
>  	select KVM_VFIO
>  	select SRCU
>  	select INTERVAL_TREE
> +	select HAVE_KVM_DIRTY_QUOTA

Why isn't this part of the x86 patch?

>  	select HAVE_KVM_PM_NOTIFIER if PM
>  	help
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 18592bdf4c1b..0b9b5c251a04 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -151,11 +151,12 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQUEST_NO_ACTION      BIT(10)
>  /*
>   * Architecture-independent vcpu->requests bit members
> - * Bits 3-7 are reserved for more arch-independent bits.
> + * Bits 5-7 are reserved for more arch-independent bits.
>   */
>  #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_UNBLOCK           2
> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4

Where is 3? Why reserve two bits when only one is used?

>  #define KVM_REQUEST_ARCH_BASE     8
>  
>  /*
> @@ -379,6 +380,8 @@ struct kvm_vcpu {
>  	 */
>  	struct kvm_memory_slot *last_used_slot;
>  	u64 last_used_slot_gen;
> +
> +	u64 dirty_quota;
>  };
>  
>  /*
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 3ca3db020e0e..263a588f3cd3 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -118,6 +118,7 @@ struct kvm_vcpu_stat_generic {
>  	u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT];
>  	u64 halt_wait_hist[HALT_POLL_HIST_COUNT];
>  	u64 blocking;
> +	u64 pages_dirtied;
>  };
>  
>  #define KVM_STATS_NAME_SIZE	48
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 0d5d4419139a..5acb8991f872 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -272,6 +272,7 @@ struct kvm_xen_exit {
>  #define KVM_EXIT_RISCV_SBI        35
>  #define KVM_EXIT_RISCV_CSR        36
>  #define KVM_EXIT_NOTIFY           37
> +#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  /* Emulate instruction failed. */
> @@ -510,6 +511,11 @@ struct kvm_run {
>  #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
>  			__u32 flags;
>  		} notify;
> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
> +		struct {
> +			__u64 count;
> +			__u64 quota;
> +		} dirty_quota_exit;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -531,6 +537,12 @@ struct kvm_run {
>  		struct kvm_sync_regs regs;
>  		char padding[SYNC_REGS_SIZE_BYTES];
>  	} s;
> +	/*
> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the
> +	 * quota is reached/exceeded.
> +	 */
> +	__u64 dirty_quota;
>  };
>  
>  /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
> @@ -1178,6 +1190,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_ZPCI_OP 221
>  #define KVM_CAP_S390_CPU_TOPOLOGY 222
>  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
> +#define KVM_CAP_DIRTY_QUOTA 224
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index 0d5d4419139a..c8f811572670 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
> @@ -1178,6 +1178,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_ZPCI_OP 221
>  #define KVM_CAP_S390_CPU_TOPOLOGY 222
>  #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
> +#define KVM_CAP_DIRTY_QUOTA 224
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 800f9470e36b..b6418a578c0a 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -19,6 +19,9 @@ config HAVE_KVM_IRQ_ROUTING
>  config HAVE_KVM_DIRTY_RING
>         bool
>  
> +config HAVE_KVM_DIRTY_QUOTA
> +       bool
> +
>  # Only strongly ordered architectures can select this, as it doesn't
>  # put any explicit constraint on userspace ordering. They can also
>  # select the _ACQ_REL version.
> @@ -86,3 +89,4 @@ config KVM_XFER_TO_GUEST_WORK
>  
>  config HAVE_KVM_PM_NOTIFIER
>         bool
> +
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 25d7872b29c1..7a54438b4d49 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3298,18 +3298,32 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
>  }
>  EXPORT_SYMBOL_GPL(kvm_clear_guest);
>  
> +static bool kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
> +{
> +#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
> +	u64 dirty_quota = READ_ONCE(vcpu->run->dirty_quota);
> +
> +	return dirty_quota && (vcpu->stat.generic.pages_dirtied >= dirty_quota);
> +#else
> +	return false;
> +#endif

If you introduce additional #ifdefery here, why are the additional
fields in the vcpu structure unconditional?

> +}
> +
>  void mark_page_dirty_in_slot(struct kvm *kvm,
>  			     const struct kvm_memory_slot *memslot,
>  		 	     gfn_t gfn)
>  {
>  	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
>  
> -#ifdef CONFIG_HAVE_KVM_DIRTY_RING
>  	if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
>  		return;
> -#endif
>  
> -	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
> +	if (!memslot)
> +		return;
> +
> +	WARN_ON_ONCE(!vcpu->stat.generic.pages_dirtied++);
> +
> +	if (kvm_slot_dirty_track_enabled(memslot)) {
>  		unsigned long rel_gfn = gfn - memslot->base_gfn;
>  		u32 slot = (memslot->as_id << 16) | memslot->id;
>  
> @@ -3318,6 +3332,9 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
>  					    slot, rel_gfn);
>  		else
>  			set_bit_le(rel_gfn, memslot->dirty_bitmap);
> +
> +		if (kvm_vcpu_is_dirty_quota_exhausted(vcpu))
> +			kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);

This is broken in the light of new dirty-tracking code queued for
6.2. Specifically, you absolutely can end-up here *without* a vcpu on
arm64. You just have to snapshot the ITS state to observe the fireworks.

	M.
Shivam Kumar Nov. 18, 2022, 9:47 a.m. UTC | #4
On 18/11/22 12:56 am, Marc Zyngier wrote:
> On Sun, 13 Nov 2022 17:05:06 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>> +    count: the current count of pages dirtied by the VCPU, can be
>> +    skewed based on the size of the pages accessed by each vCPU.
> 
> How can userspace make a decision on the amount of dirtying this
> represent if this doesn't represent a number of base pages? Or are you
> saying that this only counts the number of permission faults that have
> dirtied pages?

Yes, this only counts the number of permission faults that have dirtied 
pages.

> 
>> +    quota: the observed dirty quota just before the exit to
>> userspace.
> 
> You are defining the quota in terms of quota. -ENOCLUE.

I am defining the "quota" member of the dirty_quota_exit struct in terms 
of "dirty quota" which is already defined in the commit message.

> 
>> +
>> +The userspace can design a strategy to allocate the overall scope of dirtying
>> +for the VM among the vcpus. Based on the strategy and the current state of dirty
>> +quota throttling, the userspace can make a decision to either update (increase)
>> +the quota or to put the VCPU to sleep for some time.
> 
> This looks like something out of 1984 (Newspeak anyone)? Can't you
> just say that userspace is responsible for allocating the quota and
> manage the resulting throttling effect?

We didn't intend to sound like the Party or the Big Brother. We started 
working on the linux and QEMU patches at the same time and got tempted 
into exposing the details of how we were using this feature in QEMU for 
throttling. I can get rid of the details if it helps.

>> +	/*
>> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
>> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
>> +	 * is reached/exceeded.
>> +	 */
>> +	__u64 dirty_quota;
> 
> How are dirty_quota and dirty_quota_exit.quota related?
> 

dirty_quota_exit.quota is the dirty quota at the time of the exit. We 
are capturing it for userspace's reference because dirty quota can be 
updated anytime.

>> @@ -48,6 +48,7 @@ config KVM
>>   	select KVM_VFIO
>>   	select SRCU
>>   	select INTERVAL_TREE
>> +	select HAVE_KVM_DIRTY_QUOTA
> 
> Why isn't this part of the x86 patch?

Ack. Thanks.

>>    * Architecture-independent vcpu->requests bit members
>> - * Bits 3-7 are reserved for more arch-independent bits.
>> + * Bits 5-7 are reserved for more arch-independent bits.
>>    */
>>   #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>>   #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>>   #define KVM_REQ_UNBLOCK           2
>> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4
> 
> Where is 3? Why reserve two bits when only one is used?

Ack. 3 was in use when I was working on the patchset. Missed this in my 
last code walkthrough before sending the patchset. Thanks.

>>   
>> +static bool kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
>> +{
>> +#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
>> +	u64 dirty_quota = READ_ONCE(vcpu->run->dirty_quota);
>> +
>> +	return dirty_quota && (vcpu->stat.generic.pages_dirtied >= dirty_quota);
>> +#else
>> +	return false;
>> +#endif
> 
> If you introduce additional #ifdefery here, why are the additional
> fields in the vcpu structure unconditional?

pages_dirtied can be a useful information even if dirty quota throttling 
is not used. So, I kept it unconditional based on feedback.

CC: Sean

I can add #ifdefery in the vcpu run struct for dirty_quota.

>>   		else
>>   			set_bit_le(rel_gfn, memslot->dirty_bitmap);
>> +
>> +		if (kvm_vcpu_is_dirty_quota_exhausted(vcpu))
>> +			kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
> 
> This is broken in the light of new dirty-tracking code queued for
> 6.2. Specifically, you absolutely can end-up here *without* a vcpu on
> arm64. You just have to snapshot the ITS state to observe the fireworks.

Could you please point me to the patchset which is in queue?


I am grateful for the suggestions and feedback.

Thanks,
Shivam
Marc Zyngier Nov. 22, 2022, 5:46 p.m. UTC | #5
On Fri, 18 Nov 2022 09:47:50 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> 
> 
> On 18/11/22 12:56 am, Marc Zyngier wrote:
> > On Sun, 13 Nov 2022 17:05:06 +0000,
> > Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >> 
> >> +    count: the current count of pages dirtied by the VCPU, can be
> >> +    skewed based on the size of the pages accessed by each vCPU.
> > 
> > How can userspace make a decision on the amount of dirtying this
> > represent if this doesn't represent a number of base pages? Or are you
> > saying that this only counts the number of permission faults that have
> > dirtied pages?
> 
> Yes, this only counts the number of permission faults that have
> dirtied pages.

So how can userspace consistently set a quota of dirtied memory? This
has to account for the size that has been faulted, because that's all
userspace can reason about. Remember that at least on arm64, we're
dealing with 3 different base page sizes, and many more large page
sizes.

> 
> > 
> >> +    quota: the observed dirty quota just before the exit to
> >> userspace.
> > 
> > You are defining the quota in terms of quota. -ENOCLUE.
> 
> I am defining the "quota" member of the dirty_quota_exit struct in
> terms of "dirty quota" which is already defined in the commit
> message.

Which nobody will see. This is supposed to be a self contained
documentation.

> 
> > 
> >> +
> >> +The userspace can design a strategy to allocate the overall scope of dirtying
> >> +for the VM among the vcpus. Based on the strategy and the current state of dirty
> >> +quota throttling, the userspace can make a decision to either update (increase)
> >> +the quota or to put the VCPU to sleep for some time.
> > 
> > This looks like something out of 1984 (Newspeak anyone)? Can't you
> > just say that userspace is responsible for allocating the quota and
> > manage the resulting throttling effect?
> 
> We didn't intend to sound like the Party or the Big Brother. We
> started working on the linux and QEMU patches at the same time and got
> tempted into exposing the details of how we were using this feature in
> QEMU for throttling. I can get rid of the details if it helps.

I think the details are meaningless, and this should stick to the API,
not the way the API could be used.

> 
> >> +	/*
> >> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> >> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
> >> +	 * is reached/exceeded.
> >> +	 */
> >> +	__u64 dirty_quota;
> > 
> > How are dirty_quota and dirty_quota_exit.quota related?
> > 
> 
> dirty_quota_exit.quota is the dirty quota at the time of the exit. We
> are capturing it for userspace's reference because dirty quota can be
> updated anytime.

Shouldn't that be described here?

> 
> >> @@ -48,6 +48,7 @@ config KVM
> >>   	select KVM_VFIO
> >>   	select SRCU
> >>   	select INTERVAL_TREE
> >> +	select HAVE_KVM_DIRTY_QUOTA
> > 
> > Why isn't this part of the x86 patch?
> 
> Ack. Thanks.
> 
> >>    * Architecture-independent vcpu->requests bit members
> >> - * Bits 3-7 are reserved for more arch-independent bits.
> >> + * Bits 5-7 are reserved for more arch-independent bits.
> >>    */
> >>   #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> >>   #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> >>   #define KVM_REQ_UNBLOCK           2
> >> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4
> > 
> > Where is 3? Why reserve two bits when only one is used?
> 
> Ack. 3 was in use when I was working on the patchset. Missed this in
> my last code walkthrough before sending the patchset. Thanks.
> 
> >>   +static bool kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu
> >> *vcpu)
> >> +{
> >> +#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
> >> +	u64 dirty_quota = READ_ONCE(vcpu->run->dirty_quota);
> >> +
> >> +	return dirty_quota && (vcpu->stat.generic.pages_dirtied >= dirty_quota);
> >> +#else
> >> +	return false;
> >> +#endif
> > 
> > If you introduce additional #ifdefery here, why are the additional
> > fields in the vcpu structure unconditional?
> 
> pages_dirtied can be a useful information even if dirty quota
> throttling is not used. So, I kept it unconditional based on
> feedback.

Useful for whom? This creates an ABI for all architectures, and this
needs buy-in from everyone. Personally, I think it is a pretty useless
stat.

And while we're talking about pages_dirtied, I really dislike the
WARN_ON in mark_page_dirty_in_slot(). A counter has rolled over?
Shock, horror...

> 
> CC: Sean
> 
> I can add #ifdefery in the vcpu run struct for dirty_quota.
> 
> >>   		else
> >>   			set_bit_le(rel_gfn, memslot->dirty_bitmap);
> >> +
> >> +		if (kvm_vcpu_is_dirty_quota_exhausted(vcpu))
> >> +			kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
> > 
> > This is broken in the light of new dirty-tracking code queued for
> > 6.2. Specifically, you absolutely can end-up here *without* a vcpu on
> > arm64. You just have to snapshot the ITS state to observe the fireworks.
> 
> Could you please point me to the patchset which is in queue?

The patches are in -next, and you can look at the branch here[1].
Please also refer to the discussion on the list, as a lot of what was
discussed there does apply here.

Thanks,

	M.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=kvm-arm64/dirty-ring
Shivam Kumar Dec. 6, 2022, 6:22 a.m. UTC | #6
On 22/11/22 11:16 pm, Marc Zyngier wrote:
> On Fri, 18 Nov 2022 09:47:50 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>>
>>
>> On 18/11/22 12:56 am, Marc Zyngier wrote:
>>> On Sun, 13 Nov 2022 17:05:06 +0000,
>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>
>>>> +    count: the current count of pages dirtied by the VCPU, can be
>>>> +    skewed based on the size of the pages accessed by each vCPU.
>>>
>>> How can userspace make a decision on the amount of dirtying this
>>> represent if this doesn't represent a number of base pages? Or are you
>>> saying that this only counts the number of permission faults that have
>>> dirtied pages?
>>
>> Yes, this only counts the number of permission faults that have
>> dirtied pages.
> 
> So how can userspace consistently set a quota of dirtied memory? This
> has to account for the size that has been faulted, because that's all
> userspace can reason about. Remember that at least on arm64, we're
> dealing with 3 different base page sizes, and many more large page
> sizes.

I understand that this helps only when the majority of dirtying is 
happening for the same page size. In our use case (VM live migration), 
even large page is broken into 4k pages at first dirty. If required in 
future, we can add individual counters for different page sizes.

Thanks for pointing this out.

>>>> +    quota: the observed dirty quota just before the exit to
>>>> userspace.
>>>
>>> You are defining the quota in terms of quota. -ENOCLUE.
>>
>> I am defining the "quota" member of the dirty_quota_exit struct in
>> terms of "dirty quota" which is already defined in the commit
>> message.
> 
> Which nobody will see. This is supposed to be a self contained
> documentation.
Ack. Thanks.

>>>> +The userspace can design a strategy to allocate the overall scope of dirtying
>>>> +for the VM among the vcpus. Based on the strategy and the current state of dirty
>>>> +quota throttling, the userspace can make a decision to either update (increase)
>>>> +the quota or to put the VCPU to sleep for some time.
>>>
>>> This looks like something out of 1984 (Newspeak anyone)? Can't you
>>> just say that userspace is responsible for allocating the quota and
>>> manage the resulting throttling effect?
>>
>> We didn't intend to sound like the Party or the Big Brother. We
>> started working on the linux and QEMU patches at the same time and got
>> tempted into exposing the details of how we were using this feature in
>> QEMU for throttling. I can get rid of the details if it helps.
> 
> I think the details are meaningless, and this should stick to the API,
> not the way the API could be used.

Ack. Thanks.

>>>> +	/*
>>>> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
>>>> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
>>>> +	 * is reached/exceeded.
>>>> +	 */
>>>> +	__u64 dirty_quota;
>>>
>>> How are dirty_quota and dirty_quota_exit.quota related?
>>>
>>
>> dirty_quota_exit.quota is the dirty quota at the time of the exit. We
>> are capturing it for userspace's reference because dirty quota can be
>> updated anytime.
> 
> Shouldn't that be described here?

Ack. Thanks.

>>>> +#endif
>>>
>>> If you introduce additional #ifdefery here, why are the additional
>>> fields in the vcpu structure unconditional?
>>
>> pages_dirtied can be a useful information even if dirty quota
>> throttling is not used. So, I kept it unconditional based on
>> feedback.
> 
> Useful for whom? This creates an ABI for all architectures, and this
> needs buy-in from everyone. Personally, I think it is a pretty useless
> stat.

When we started this patch series, it was a member of the kvm_run 
struct. I made this a stat based on the feedback I received from the 
reviews. If you think otherwise, I can move it back to where it was.

Thanks.

> And while we're talking about pages_dirtied, I really dislike the
> WARN_ON in mark_page_dirty_in_slot(). A counter has rolled over?
> Shock, horror...

Ack. I'll give it a thought but if you have any specific suggestion on 
how I can make it better, kindly let me know. Thanks.

>>
>> CC: Sean
>>
>> I can add #ifdefery in the vcpu run struct for dirty_quota.
>>
>>>>    		else
>>>>    			set_bit_le(rel_gfn, memslot->dirty_bitmap);
>>>> +
>>>> +		if (kvm_vcpu_is_dirty_quota_exhausted(vcpu))
>>>> +			kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
>>>
>>> This is broken in the light of new dirty-tracking code queued for
>>> 6.2. Specifically, you absolutely can end-up here *without* a vcpu on
>>> arm64. You just have to snapshot the ITS state to observe the fireworks.
>>
>> Could you please point me to the patchset which is in queue?
> 
> The patches are in -next, and you can look at the branch here[1].
> Please also refer to the discussion on the list, as a lot of what was
> discussed there does apply here.
> 
> Thanks,
> 
> 	M.
> 
> [1] https://urldefense.proofpoint.com/v2/url?u=https-3A__git.kernel.org_pub_scm_linux_kernel_git_maz_arm-2Dplatforms.git_log_-3Fh-3Dkvm-2Darm64_dirty-2Dring&d=DwIBAg&c=s883GpUCOChKOHiocYtGcg&r=4hVFP4-J13xyn-OcN0apTCh8iKZRosf5OJTQePXBMB8&m=gyAAYSO2lIMhffCjshL9ZiA15_isV4kNauvn7aKEDy-kwpVrGNLlmO9AF6ilCsI1&s=x8gk31QIy9KHImR3z1xJOs9bSpKw1WYC_d1W-Vj5eTM&e=
> 

Thank you so much for the information. I went through the patches and I 
feel an additional check in the "if" condition will help eliminate any 
possible issue.

if (vcpu && kvm_vcpu_is_dirty_quota_exhausted(vcpu))
	kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);

Happy to know your thoughts.


Thank you so much Marc for the review.


Thanks,
Shivam
Marc Zyngier Dec. 7, 2022, 4:44 p.m. UTC | #7
On Tue, 06 Dec 2022 06:22:45 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> 
> 
> On 22/11/22 11:16 pm, Marc Zyngier wrote:
> > On Fri, 18 Nov 2022 09:47:50 +0000,
> > Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >> 
> >> 
> >> 
> >> On 18/11/22 12:56 am, Marc Zyngier wrote:
> >>> On Sun, 13 Nov 2022 17:05:06 +0000,
> >>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >>>> 
> >>>> +    count: the current count of pages dirtied by the VCPU, can be
> >>>> +    skewed based on the size of the pages accessed by each vCPU.
> >>> 
> >>> How can userspace make a decision on the amount of dirtying this
> >>> represent if this doesn't represent a number of base pages? Or are you
> >>> saying that this only counts the number of permission faults that have
> >>> dirtied pages?
> >> 
> >> Yes, this only counts the number of permission faults that have
> >> dirtied pages.
> > 
> > So how can userspace consistently set a quota of dirtied memory? This
> > has to account for the size that has been faulted, because that's all
> > userspace can reason about. Remember that at least on arm64, we're
> > dealing with 3 different base page sizes, and many more large page
> > sizes.
> 
> I understand that this helps only when the majority of dirtying is
> happening for the same page size. In our use case (VM live migration),
> even large page is broken into 4k pages at first dirty. If required in
> future, we can add individual counters for different page sizes.
> 
> Thanks for pointing this out.

Adding counters for different page sizes won't help. It will only make
the API more complex and harder to reason about. arm64 has 3 base page
sizes, up to 5 levels of translation, the ability to have block
mappings at most levels, plus nice features such as contiguous hints
that we treat as another block size. If you're lucky, that's about a
dozen different sizes. Good luck with that.

You really need to move away from your particular, 4kB centric, live
migration use case. This is about throttling a vcpu based on how much
memory it dirties. Not about the number of page faults it takes.

You need to define the granularity of the counter, and account for
each fault according to its mapping size. If an architecture has 16kB
as the base page size, a 32MB fault (the size of the smallest block
mapping) must bump the counter by 2048. That's the only way userspace
can figure out what is going on.

Without that, you may as well add a random number to the counter, it
won't be any worse.

[...]

> >>> If you introduce additional #ifdefery here, why are the additional
> >>> fields in the vcpu structure unconditional?
> >> 
> >> pages_dirtied can be a useful information even if dirty quota
> >> throttling is not used. So, I kept it unconditional based on
> >> feedback.
> > 
> > Useful for whom? This creates an ABI for all architectures, and this
> > needs buy-in from everyone. Personally, I think it is a pretty useless
> > stat.
> 
> When we started this patch series, it was a member of the kvm_run
> struct. I made this a stat based on the feedback I received from the
> reviews. If you think otherwise, I can move it back to where it was.

I'm certainly totally opposed to stats that don't have a clear use
case. People keep piling random stats that satisfy their pet usage,
and this only bloats the various structures for no overall benefit
other than "hey, it might be useful". This is death by a thousand cut.

> > And while we're talking about pages_dirtied, I really dislike the
> > WARN_ON in mark_page_dirty_in_slot(). A counter has rolled over?
> > Shock, horror...
> 
> Ack. I'll give it a thought but if you have any specific suggestion on
> how I can make it better, kindly let me know. Thanks.

What is the effect of counter overflowing? Why is it important to
warn? What goes wrong? What could be changed to *avoid* this being an
issue?

	M.
Sean Christopherson Dec. 7, 2022, 7:53 p.m. UTC | #8
On Wed, Dec 07, 2022, Marc Zyngier wrote:
> On Tue, 06 Dec 2022 06:22:45 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> You need to define the granularity of the counter, and account for
> each fault according to its mapping size. If an architecture has 16kB
> as the base page size, a 32MB fault (the size of the smallest block
> mapping) must bump the counter by 2048. That's the only way userspace
> can figure out what is going on.

I don't think that's true for the dirty logging case.  IIUC, when a memslot is
being dirty logged, KVM forces the memory to be mapped with PAGE_SIZE granularity,
and that base PAGE_SIZE is fixed and known to userspace.  I.e. accuracy is naturally
provided for this primary use case where accuracy really matters, and so this is
effectively a documentation issue and not a functional issue.

> Without that, you may as well add a random number to the counter, it
> won't be any worse.

The stat will be wildly inaccurate when dirty logging isn't enabled, but that doesn't
necessarily make the stat useless, e.g. it might be useful as a very rough guage
of which vCPUs are likely to be writing memory.  I do agree though that the value
provided is questionable and/or highly speculative.

> [...]
> 
> > >>> If you introduce additional #ifdefery here, why are the additional
> > >>> fields in the vcpu structure unconditional?
> > >> 
> > >> pages_dirtied can be a useful information even if dirty quota
> > >> throttling is not used. So, I kept it unconditional based on
> > >> feedback.
> > > 
> > > Useful for whom? This creates an ABI for all architectures, and this
> > > needs buy-in from everyone. Personally, I think it is a pretty useless
> > > stat.
> > 
> > When we started this patch series, it was a member of the kvm_run
> > struct. I made this a stat based on the feedback I received from the
> > reviews. If you think otherwise, I can move it back to where it was.
> 
> I'm certainly totally opposed to stats that don't have a clear use
> case. People keep piling random stats that satisfy their pet usage,
> and this only bloats the various structures for no overall benefit
> other than "hey, it might be useful". This is death by a thousand cut.

I don't have a strong opinion on putting the counter into kvm_run as an "out"
fields vs. making it a state.  I originally suggested making it a stat because
KVM needs to capture the information somewhere, so why not make it a stat?  But
I am definitely much more cavalier when it comes to adding stats, so I've no
objection to dropping the stat side of things.
Shivam Kumar Dec. 8, 2022, 7:20 a.m. UTC | #9
> I'm certainly totally opposed to stats that don't have a clear use
> case. People keep piling random stats that satisfy their pet usage,
> and this only bloats the various structures for no overall benefit
> other than "hey, it might be useful". This is death by a thousand cut.
> 
>>> And while we're talking about pages_dirtied, I really dislike the
>>> WARN_ON in mark_page_dirty_in_slot(). A counter has rolled over?
>>> Shock, horror...
>>
>> Ack. I'll give it a thought but if you have any specific suggestion on
>> how I can make it better, kindly let me know. Thanks.
> 
> What is the effect of counter overflowing? Why is it important to
> warn? What goes wrong? What could be changed to *avoid* this being an
> issue?
> 
> 	M.
> 

When dirty quota is not enabled, counter overflow has no harm as such. 
If dirty logging is enabled with dirty quota, two cases may arise:

i) While setting the dirty quota to count + new quota, the dirty quota 
itself overflows. Now, if the userspace doesn’t manipulate the count 
accordingly: the count will still be a large value and so the vcpu will 
exit to userspace again and again until the count also overflows at some 
point (this is inevitable because the count will continue getting 
incremented after each write).

ii) Dirty quota is very close to the max value of a 64 bit unsigned 
int. Dirty count can overflow in this case and the vcpu might never exit 
to userspace (with exit reason - dirty quota exhausted) which means no 
throttling happens. One possible way to resolve this is by exiting to 
userspace as soon as the count equals the dirty quota. By not waiting 
for the dirty count to exceed the dirty quota, we can avoid this. 
Though, this is difficult to achieve due to Intel’s PML.

In both these cases, nothing catastrophic happens; it’s just that the 
userspace’s expectations are not met. However, we can have clear 
instructions in the documentation on how the userspace can avoid these 
issues altogether by resetting the count and quota values whenever they 
exceed a safe level (for now I am able to think of 512 less than the 
maximum value of an unsigned int as a safe value for dirty quota). To 
allow the userspace to do it, we need to provide the userspace some way 
to reset the count. I am not sure how we can achieve this if the dirty 
count (i.e. pages dirtied) is a KVM stat. But, if we can make it a 
member of kvm_run, it is fairly simple to do. So IMO, yes, warn is 
useless here.

Happy to know your thoughts on this. Really grateful for the help so far.


Thanks,
Shivam
Shivam Kumar Dec. 8, 2022, 7:30 a.m. UTC | #10
On 08/12/22 1:23 am, Sean Christopherson wrote:
> On Wed, Dec 07, 2022, Marc Zyngier wrote:
>> On Tue, 06 Dec 2022 06:22:45 +0000,
>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>> You need to define the granularity of the counter, and account for
>> each fault according to its mapping size. If an architecture has 16kB
>> as the base page size, a 32MB fault (the size of the smallest block
>> mapping) must bump the counter by 2048. That's the only way userspace
>> can figure out what is going on.
> 
> I don't think that's true for the dirty logging case.  IIUC, when a memslot is
> being dirty logged, KVM forces the memory to be mapped with PAGE_SIZE granularity,
> and that base PAGE_SIZE is fixed and known to userspace.  I.e. accuracy is naturally
> provided for this primary use case where accuracy really matters, and so this is
> effectively a documentation issue and not a functional issue.

So, does defining "count" as "the number of write permission faults" 
help in addressing the documentation issue? My understanding too is that 
for dirty logging, we will have uniform granularity.

Thanks.

> 
>> Without that, you may as well add a random number to the counter, it
>> won't be any worse.
> 
> The stat will be wildly inaccurate when dirty logging isn't enabled, but that doesn't
> necessarily make the stat useless, e.g. it might be useful as a very rough guage
> of which vCPUs are likely to be writing memory.  I do agree though that the value
> provided is questionable and/or highly speculative.
> 
>> [...]
>>
>>>>>> If you introduce additional #ifdefery here, why are the additional
>>>>>> fields in the vcpu structure unconditional?
>>>>>
>>>>> pages_dirtied can be a useful information even if dirty quota
>>>>> throttling is not used. So, I kept it unconditional based on
>>>>> feedback.
>>>>
>>>> Useful for whom? This creates an ABI for all architectures, and this
>>>> needs buy-in from everyone. Personally, I think it is a pretty useless
>>>> stat.
>>>
>>> When we started this patch series, it was a member of the kvm_run
>>> struct. I made this a stat based on the feedback I received from the
>>> reviews. If you think otherwise, I can move it back to where it was.
>>
>> I'm certainly totally opposed to stats that don't have a clear use
>> case. People keep piling random stats that satisfy their pet usage,
>> and this only bloats the various structures for no overall benefit
>> other than "hey, it might be useful". This is death by a thousand cut.
> 
> I don't have a strong opinion on putting the counter into kvm_run as an "out"
> fields vs. making it a state.  I originally suggested making it a stat because
> KVM needs to capture the information somewhere, so why not make it a stat?  But
> I am definitely much more cavalier when it comes to adding stats, so I've no
> objection to dropping the stat side of things.

I'll be skeptical about making it a stat if we plan to allow the 
userspace to reset it at will.


Thank you so much for the comments.

Thanks,
Shivam
Shivam Kumar Dec. 25, 2022, 4:50 p.m. UTC | #11
On 08/12/22 1:00 pm, Shivam Kumar wrote:
> 
> 
> On 08/12/22 1:23 am, Sean Christopherson wrote:
>> On Wed, Dec 07, 2022, Marc Zyngier wrote:
>>> On Tue, 06 Dec 2022 06:22:45 +0000,
>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>> You need to define the granularity of the counter, and account for
>>> each fault according to its mapping size. If an architecture has 16kB
>>> as the base page size, a 32MB fault (the size of the smallest block
>>> mapping) must bump the counter by 2048. That's the only way userspace
>>> can figure out what is going on.
>>
>> I don't think that's true for the dirty logging case.  IIUC, when a 
>> memslot is
>> being dirty logged, KVM forces the memory to be mapped with PAGE_SIZE 
>> granularity,
>> and that base PAGE_SIZE is fixed and known to userspace.  I.e. 
>> accuracy is naturally
>> provided for this primary use case where accuracy really matters, and 
>> so this is
>> effectively a documentation issue and not a functional issue.
> 
> So, does defining "count" as "the number of write permission faults" 
> help in addressing the documentation issue? My understanding too is that 
> for dirty logging, we will have uniform granularity.
> 
> Thanks.
> 
>>
>>> Without that, you may as well add a random number to the counter, it
>>> won't be any worse.
>>
>> The stat will be wildly inaccurate when dirty logging isn't enabled, 
>> but that doesn't
>> necessarily make the stat useless, e.g. it might be useful as a very 
>> rough guage
>> of which vCPUs are likely to be writing memory.  I do agree though 
>> that the value
>> provided is questionable and/or highly speculative.
>>
>>> [...]
>>>
>>>>>>> If you introduce additional #ifdefery here, why are the additional
>>>>>>> fields in the vcpu structure unconditional?
>>>>>>
>>>>>> pages_dirtied can be a useful information even if dirty quota
>>>>>> throttling is not used. So, I kept it unconditional based on
>>>>>> feedback.
>>>>>
>>>>> Useful for whom? This creates an ABI for all architectures, and this
>>>>> needs buy-in from everyone. Personally, I think it is a pretty useless
>>>>> stat.
>>>>
>>>> When we started this patch series, it was a member of the kvm_run
>>>> struct. I made this a stat based on the feedback I received from the
>>>> reviews. If you think otherwise, I can move it back to where it was.
>>>
>>> I'm certainly totally opposed to stats that don't have a clear use
>>> case. People keep piling random stats that satisfy their pet usage,
>>> and this only bloats the various structures for no overall benefit
>>> other than "hey, it might be useful". This is death by a thousand cut.
>>
>> I don't have a strong opinion on putting the counter into kvm_run as 
>> an "out"
>> fields vs. making it a state.  I originally suggested making it a stat 
>> because
>> KVM needs to capture the information somewhere, so why not make it a 
>> stat?  But
>> I am definitely much more cavalier when it comes to adding stats, so 
>> I've no
>> objection to dropping the stat side of things.
> 
> I'll be skeptical about making it a stat if we plan to allow the 
> userspace to reset it at will.
> 
> 
> Thank you so much for the comments.
> 
> Thanks,
> Shivam

Hi Marc,
Hi Sean,

Please let me know if there's any further question or feedback.

Thanks,
Shivam
Marc Zyngier Dec. 26, 2022, 10:07 a.m. UTC | #12
On Sun, 25 Dec 2022 16:50:04 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> 
> 
> On 08/12/22 1:00 pm, Shivam Kumar wrote:
> > 
> > 
> > On 08/12/22 1:23 am, Sean Christopherson wrote:
> >> On Wed, Dec 07, 2022, Marc Zyngier wrote:
> >>> On Tue, 06 Dec 2022 06:22:45 +0000,
> >>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >>> You need to define the granularity of the counter, and account for
> >>> each fault according to its mapping size. If an architecture has 16kB
> >>> as the base page size, a 32MB fault (the size of the smallest block
> >>> mapping) must bump the counter by 2048. That's the only way userspace
> >>> can figure out what is going on.
> >> 
> >> I don't think that's true for the dirty logging case.  IIUC, when a
> >> memslot is
> >> being dirty logged, KVM forces the memory to be mapped with
> >> PAGE_SIZE granularity,
> >> and that base PAGE_SIZE is fixed and known to userspace. 
> >> I.e. accuracy is naturally
> >> provided for this primary use case where accuracy really matters,
> >> and so this is
> >> effectively a documentation issue and not a functional issue.
> > 
> > So, does defining "count" as "the number of write permission faults"
> > help in addressing the documentation issue? My understanding too is
> > that for dirty logging, we will have uniform granularity.
> > 
> > Thanks.
> > 
> >> 
> >>> Without that, you may as well add a random number to the counter, it
> >>> won't be any worse.
> >> 
> >> The stat will be wildly inaccurate when dirty logging isn't
> >> enabled, but that doesn't
> >> necessarily make the stat useless, e.g. it might be useful as a
> >> very rough guage
> >> of which vCPUs are likely to be writing memory.  I do agree though
> >> that the value
> >> provided is questionable and/or highly speculative.
> >> 
> >>> [...]
> >>> 
> >>>>>>> If you introduce additional #ifdefery here, why are the additional
> >>>>>>> fields in the vcpu structure unconditional?
> >>>>>> 
> >>>>>> pages_dirtied can be a useful information even if dirty quota
> >>>>>> throttling is not used. So, I kept it unconditional based on
> >>>>>> feedback.
> >>>>> 
> >>>>> Useful for whom? This creates an ABI for all architectures, and this
> >>>>> needs buy-in from everyone. Personally, I think it is a pretty useless
> >>>>> stat.
> >>>> 
> >>>> When we started this patch series, it was a member of the kvm_run
> >>>> struct. I made this a stat based on the feedback I received from the
> >>>> reviews. If you think otherwise, I can move it back to where it was.
> >>> 
> >>> I'm certainly totally opposed to stats that don't have a clear use
> >>> case. People keep piling random stats that satisfy their pet usage,
> >>> and this only bloats the various structures for no overall benefit
> >>> other than "hey, it might be useful". This is death by a thousand cut.
> >> 
> >> I don't have a strong opinion on putting the counter into kvm_run
> >> as an "out"
> >> fields vs. making it a state.  I originally suggested making it a
> >> stat because
> >> KVM needs to capture the information somewhere, so why not make it
> >> a stat?  But
> >> I am definitely much more cavalier when it comes to adding stats,
> >> so I've no
> >> objection to dropping the stat side of things.
> > 
> > I'll be skeptical about making it a stat if we plan to allow the
> > userspace to reset it at will.
> > 
> > 
> > Thank you so much for the comments.
> > 
> > Thanks,
> > Shivam
> 
> Hi Marc,
> Hi Sean,
> 
> Please let me know if there's any further question or feedback.

My earlier comments still stand: the proposed API is not usable as a
general purpose memory-tracking API because it counts faults instead
of memory, making it inadequate except for the most trivial cases.
And I cannot believe you were serious when you mentioned that you were
happy to make that the API.

This requires some serious work, and this series is not yet near a
state where it could be merged.

Thanks,

	M.
Shivam Kumar Jan. 7, 2023, 5:24 p.m. UTC | #13
On 26/12/22 3:37 pm, Marc Zyngier wrote:
> On Sun, 25 Dec 2022 16:50:04 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>>
>>
>> On 08/12/22 1:00 pm, Shivam Kumar wrote:
>>>
>>>
>>> On 08/12/22 1:23 am, Sean Christopherson wrote:
>>>> On Wed, Dec 07, 2022, Marc Zyngier wrote:
>>>>> On Tue, 06 Dec 2022 06:22:45 +0000,
>>>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>> You need to define the granularity of the counter, and account for
>>>>> each fault according to its mapping size. If an architecture has 16kB
>>>>> as the base page size, a 32MB fault (the size of the smallest block
>>>>> mapping) must bump the counter by 2048. That's the only way userspace
>>>>> can figure out what is going on.
>>>>
>>>> I don't think that's true for the dirty logging case.  IIUC, when a
>>>> memslot is
>>>> being dirty logged, KVM forces the memory to be mapped with
>>>> PAGE_SIZE granularity,
>>>> and that base PAGE_SIZE is fixed and known to userspace.
>>>> I.e. accuracy is naturally
>>>> provided for this primary use case where accuracy really matters,
>>>> and so this is
>>>> effectively a documentation issue and not a functional issue.
>>>
>>> So, does defining "count" as "the number of write permission faults"
>>> help in addressing the documentation issue? My understanding too is
>>> that for dirty logging, we will have uniform granularity.
>>>
>>> Thanks.
>>>
>>>>
>>>>> Without that, you may as well add a random number to the counter, it
>>>>> won't be any worse.
>>>>
>>>> The stat will be wildly inaccurate when dirty logging isn't
>>>> enabled, but that doesn't
>>>> necessarily make the stat useless, e.g. it might be useful as a
>>>> very rough guage
>>>> of which vCPUs are likely to be writing memory.  I do agree though
>>>> that the value
>>>> provided is questionable and/or highly speculative.
>>>>
>>>>> [...]
>>>>>
>>>>>>>>> If you introduce additional #ifdefery here, why are the additional
>>>>>>>>> fields in the vcpu structure unconditional?
>>>>>>>>
>>>>>>>> pages_dirtied can be a useful information even if dirty quota
>>>>>>>> throttling is not used. So, I kept it unconditional based on
>>>>>>>> feedback.
>>>>>>>
>>>>>>> Useful for whom? This creates an ABI for all architectures, and this
>>>>>>> needs buy-in from everyone. Personally, I think it is a pretty useless
>>>>>>> stat.
>>>>>>
>>>>>> When we started this patch series, it was a member of the kvm_run
>>>>>> struct. I made this a stat based on the feedback I received from the
>>>>>> reviews. If you think otherwise, I can move it back to where it was.
>>>>>
>>>>> I'm certainly totally opposed to stats that don't have a clear use
>>>>> case. People keep piling random stats that satisfy their pet usage,
>>>>> and this only bloats the various structures for no overall benefit
>>>>> other than "hey, it might be useful". This is death by a thousand cut.
>>>>
>>>> I don't have a strong opinion on putting the counter into kvm_run
>>>> as an "out"
>>>> fields vs. making it a state.  I originally suggested making it a
>>>> stat because
>>>> KVM needs to capture the information somewhere, so why not make it
>>>> a stat?  But
>>>> I am definitely much more cavalier when it comes to adding stats,
>>>> so I've no
>>>> objection to dropping the stat side of things.
>>>
>>> I'll be skeptical about making it a stat if we plan to allow the
>>> userspace to reset it at will.
>>>
>>>
>>> Thank you so much for the comments.
>>>
>>> Thanks,
>>> Shivam
>>
>> Hi Marc,
>> Hi Sean,
>>
>> Please let me know if there's any further question or feedback.
> 
> My earlier comments still stand: the proposed API is not usable as a
> general purpose memory-tracking API because it counts faults instead
> of memory, making it inadequate except for the most trivial cases.
> And I cannot believe you were serious when you mentioned that you were
> happy to make that the API.
> 
> This requires some serious work, and this series is not yet near a
> state where it could be merged.
> 
> Thanks,
> 
> 	M.
> 

Hi Marc,

IIUC, in the dirty ring interface too, the dirty_index variable is 
incremented in the mark_page_dirty_in_slot function and it is also 
count-based. At least on x86, I am aware that for dirty tracking we have 
uniform granularity as huge pages (2MB pages) too are broken into 4K 
pages and bitmap is at 4K-granularity. Please let me know if it is 
possible to have multiple page sizes even during dirty logging on ARM. 
And if that is the case, I am wondering how we handle the bitmap with 
different page sizes on ARM.

I agree that the notion of pages dirtied according to our pages_dirtied 
variable depends on how we are handling the bitmap but we expect the 
userspace to use the same granularity at which the dirty bitmap is 
handled. I can capture this in documentation


CC: Peter Xu

Thanks,
Shivam
Marc Zyngier Jan. 7, 2023, 9:44 p.m. UTC | #14
On Sat, 07 Jan 2023 17:24:24 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> On 26/12/22 3:37 pm, Marc Zyngier wrote:
> > On Sun, 25 Dec 2022 16:50:04 +0000,
> > Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >> 
> >> Hi Marc,
> >> Hi Sean,
> >> 
> >> Please let me know if there's any further question or feedback.
> > 
> > My earlier comments still stand: the proposed API is not usable as a
> > general purpose memory-tracking API because it counts faults instead
> > of memory, making it inadequate except for the most trivial cases.
> > And I cannot believe you were serious when you mentioned that you were
> > happy to make that the API.
> > 
> > This requires some serious work, and this series is not yet near a
> > state where it could be merged.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> 
> Hi Marc,
> 
> IIUC, in the dirty ring interface too, the dirty_index variable is
> incremented in the mark_page_dirty_in_slot function and it is also
> count-based. At least on x86, I am aware that for dirty tracking we
> have uniform granularity as huge pages (2MB pages) too are broken into
> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
> possible to have multiple page sizes even during dirty logging on
> ARM. And if that is the case, I am wondering how we handle the bitmap
> with different page sizes on ARM.

Easy. It *is* page-size, by the very definition of the API which
explicitly says that a single bit represent one basic page. If you
were to only break 1GB mappings into 2MB blocks, you'd have to mask
512 pages dirty at once, no question asked.

Your API is different because at no point it implies any relationship
with any page size. As it stands, it is a useless API. I understand
that you are only concerned with your particular use case, but that's
nowhere good enough. And it has nothing to do with ARM. This is
equally broken on *any* architecture.

> I agree that the notion of pages dirtied according to our
> pages_dirtied variable depends on how we are handling the bitmap but
> we expect the userspace to use the same granularity at which the dirty
> bitmap is handled. I can capture this in documentation

But what does the bitmap have to do with any of this? This is not what
your API is about. You are supposed to count dirtied memory, and you
are counting page faults instead. No sane userspace can make any sense
of that. You keep coupling the two, but that's wrong. This thing has
to be useful on its own, not just for your particular, super narrow
use case. And that's a shame because the general idea of a dirty quota
is an interesting one.

If your sole intention is to capture in the documentation that the API
is broken, then all I can do is to NAK the whole thing. Until you turn
this page-fault quota into the dirty memory quota that you advertise,
I'll continue to say no to it.

Thanks,

	M.
Shivam Kumar Jan. 14, 2023, 1:07 p.m. UTC | #15
On 08/01/23 3:14 am, Marc Zyngier wrote:
> On Sat, 07 Jan 2023 17:24:24 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>> On 26/12/22 3:37 pm, Marc Zyngier wrote:
>>> On Sun, 25 Dec 2022 16:50:04 +0000,
>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>
>>>> Hi Marc,
>>>> Hi Sean,
>>>>
>>>> Please let me know if there's any further question or feedback.
>>>
>>> My earlier comments still stand: the proposed API is not usable as a
>>> general purpose memory-tracking API because it counts faults instead
>>> of memory, making it inadequate except for the most trivial cases.
>>> And I cannot believe you were serious when you mentioned that you were
>>> happy to make that the API.
>>>
>>> This requires some serious work, and this series is not yet near a
>>> state where it could be merged.
>>>
>>> Thanks,
>>>
>>> 	M.
>>>
>>
>> Hi Marc,
>>
>> IIUC, in the dirty ring interface too, the dirty_index variable is
>> incremented in the mark_page_dirty_in_slot function and it is also
>> count-based. At least on x86, I am aware that for dirty tracking we
>> have uniform granularity as huge pages (2MB pages) too are broken into
>> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
>> possible to have multiple page sizes even during dirty logging on
>> ARM. And if that is the case, I am wondering how we handle the bitmap
>> with different page sizes on ARM.
> 
> Easy. It *is* page-size, by the very definition of the API which
> explicitly says that a single bit represent one basic page. If you
> were to only break 1GB mappings into 2MB blocks, you'd have to mask
> 512 pages dirty at once, no question asked.
> 
> Your API is different because at no point it implies any relationship
> with any page size. As it stands, it is a useless API. I understand
> that you are only concerned with your particular use case, but that's
> nowhere good enough. And it has nothing to do with ARM. This is
> equally broken on *any* architecture.
> 
>> I agree that the notion of pages dirtied according to our
>> pages_dirtied variable depends on how we are handling the bitmap but
>> we expect the userspace to use the same granularity at which the dirty
>> bitmap is handled. I can capture this in documentation
> 
> But what does the bitmap have to do with any of this? This is not what
> your API is about. You are supposed to count dirtied memory, and you
> are counting page faults instead. No sane userspace can make any sense
> of that. You keep coupling the two, but that's wrong. This thing has
> to be useful on its own, not just for your particular, super narrow
> use case. And that's a shame because the general idea of a dirty quota
> is an interesting one.
> 
> If your sole intention is to capture in the documentation that the API
> is broken, then all I can do is to NAK the whole thing. Until you turn
> this page-fault quota into the dirty memory quota that you advertise,
> I'll continue to say no to it.
> 
> Thanks,
> 
> 	M.
> 

Thank you Marc for the suggestion. We can make dirty quota count dirtied 
memory rather than faults.

run->dirty_quota -= page_size;

We can raise a kvm request for exiting to userspace as soon as the dirty 
quota of the vcpu becomes zero or negative. Please let me know if this 
looks good to you.

Thanks,
Shivam
Marc Zyngier Jan. 15, 2023, 9:56 a.m. UTC | #16
On Sat, 14 Jan 2023 13:07:44 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> 
> 
> On 08/01/23 3:14 am, Marc Zyngier wrote:
> > On Sat, 07 Jan 2023 17:24:24 +0000,
> > Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >> On 26/12/22 3:37 pm, Marc Zyngier wrote:
> >>> On Sun, 25 Dec 2022 16:50:04 +0000,
> >>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >>>> 
> >>>> Hi Marc,
> >>>> Hi Sean,
> >>>> 
> >>>> Please let me know if there's any further question or feedback.
> >>> 
> >>> My earlier comments still stand: the proposed API is not usable as a
> >>> general purpose memory-tracking API because it counts faults instead
> >>> of memory, making it inadequate except for the most trivial cases.
> >>> And I cannot believe you were serious when you mentioned that you were
> >>> happy to make that the API.
> >>> 
> >>> This requires some serious work, and this series is not yet near a
> >>> state where it could be merged.
> >>> 
> >>> Thanks,
> >>> 
> >>> 	M.
> >>> 
> >> 
> >> Hi Marc,
> >> 
> >> IIUC, in the dirty ring interface too, the dirty_index variable is
> >> incremented in the mark_page_dirty_in_slot function and it is also
> >> count-based. At least on x86, I am aware that for dirty tracking we
> >> have uniform granularity as huge pages (2MB pages) too are broken into
> >> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
> >> possible to have multiple page sizes even during dirty logging on
> >> ARM. And if that is the case, I am wondering how we handle the bitmap
> >> with different page sizes on ARM.
> > 
> > Easy. It *is* page-size, by the very definition of the API which
> > explicitly says that a single bit represent one basic page. If you
> > were to only break 1GB mappings into 2MB blocks, you'd have to mask
> > 512 pages dirty at once, no question asked.
> > 
> > Your API is different because at no point it implies any relationship
> > with any page size. As it stands, it is a useless API. I understand
> > that you are only concerned with your particular use case, but that's
> > nowhere good enough. And it has nothing to do with ARM. This is
> > equally broken on *any* architecture.
> > 
> >> I agree that the notion of pages dirtied according to our
> >> pages_dirtied variable depends on how we are handling the bitmap but
> >> we expect the userspace to use the same granularity at which the dirty
> >> bitmap is handled. I can capture this in documentation
> > 
> > But what does the bitmap have to do with any of this? This is not what
> > your API is about. You are supposed to count dirtied memory, and you
> > are counting page faults instead. No sane userspace can make any sense
> > of that. You keep coupling the two, but that's wrong. This thing has
> > to be useful on its own, not just for your particular, super narrow
> > use case. And that's a shame because the general idea of a dirty quota
> > is an interesting one.
> > 
> > If your sole intention is to capture in the documentation that the API
> > is broken, then all I can do is to NAK the whole thing. Until you turn
> > this page-fault quota into the dirty memory quota that you advertise,
> > I'll continue to say no to it.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> 
> Thank you Marc for the suggestion. We can make dirty quota count
> dirtied memory rather than faults.
> 
> run->dirty_quota -= page_size;
>
> We can raise a kvm request for exiting to userspace as soon as the
> dirty quota of the vcpu becomes zero or negative. Please let me know
> if this looks good to you.

It really depends what "page_size" represents here. If you mean
"mapping size", then yes. If you really mean "page size", then no.

Assuming this is indeed "mapping size", then it all depends on how
this is integrated and how this is managed in a generic, cross
architecture way.

Thanks,

	M.
Shivam Kumar Jan. 15, 2023, 2:50 p.m. UTC | #17
On 15/01/23 3:26 pm, Marc Zyngier wrote:
> On Sat, 14 Jan 2023 13:07:44 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>>
>>
>> On 08/01/23 3:14 am, Marc Zyngier wrote:
>>> On Sat, 07 Jan 2023 17:24:24 +0000,
>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>> On 26/12/22 3:37 pm, Marc Zyngier wrote:
>>>>> On Sun, 25 Dec 2022 16:50:04 +0000,
>>>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>>>
>>>>>> Hi Marc,
>>>>>> Hi Sean,
>>>>>>
>>>>>> Please let me know if there's any further question or feedback.
>>>>>
>>>>> My earlier comments still stand: the proposed API is not usable as a
>>>>> general purpose memory-tracking API because it counts faults instead
>>>>> of memory, making it inadequate except for the most trivial cases.
>>>>> And I cannot believe you were serious when you mentioned that you were
>>>>> happy to make that the API.
>>>>>
>>>>> This requires some serious work, and this series is not yet near a
>>>>> state where it could be merged.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> 	M.
>>>>>
>>>>
>>>> Hi Marc,
>>>>
>>>> IIUC, in the dirty ring interface too, the dirty_index variable is
>>>> incremented in the mark_page_dirty_in_slot function and it is also
>>>> count-based. At least on x86, I am aware that for dirty tracking we
>>>> have uniform granularity as huge pages (2MB pages) too are broken into
>>>> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
>>>> possible to have multiple page sizes even during dirty logging on
>>>> ARM. And if that is the case, I am wondering how we handle the bitmap
>>>> with different page sizes on ARM.
>>>
>>> Easy. It *is* page-size, by the very definition of the API which
>>> explicitly says that a single bit represent one basic page. If you
>>> were to only break 1GB mappings into 2MB blocks, you'd have to mask
>>> 512 pages dirty at once, no question asked.
>>>
>>> Your API is different because at no point it implies any relationship
>>> with any page size. As it stands, it is a useless API. I understand
>>> that you are only concerned with your particular use case, but that's
>>> nowhere good enough. And it has nothing to do with ARM. This is
>>> equally broken on *any* architecture.
>>>
>>>> I agree that the notion of pages dirtied according to our
>>>> pages_dirtied variable depends on how we are handling the bitmap but
>>>> we expect the userspace to use the same granularity at which the dirty
>>>> bitmap is handled. I can capture this in documentation
>>>
>>> But what does the bitmap have to do with any of this? This is not what
>>> your API is about. You are supposed to count dirtied memory, and you
>>> are counting page faults instead. No sane userspace can make any sense
>>> of that. You keep coupling the two, but that's wrong. This thing has
>>> to be useful on its own, not just for your particular, super narrow
>>> use case. And that's a shame because the general idea of a dirty quota
>>> is an interesting one.
>>>
>>> If your sole intention is to capture in the documentation that the API
>>> is broken, then all I can do is to NAK the whole thing. Until you turn
>>> this page-fault quota into the dirty memory quota that you advertise,
>>> I'll continue to say no to it.
>>>
>>> Thanks,
>>>
>>> 	M.
>>>
>>
>> Thank you Marc for the suggestion. We can make dirty quota count
>> dirtied memory rather than faults.
>>
>> run->dirty_quota -= page_size;
>>
>> We can raise a kvm request for exiting to userspace as soon as the
>> dirty quota of the vcpu becomes zero or negative. Please let me know
>> if this looks good to you.
> 
> It really depends what "page_size" represents here. If you mean
> "mapping size", then yes. If you really mean "page size", then no.
> 
> Assuming this is indeed "mapping size", then it all depends on how
> this is integrated and how this is managed in a generic, cross
> architecture way.
> 
> Thanks,
> 
> 	M.
> 

Yes, it is "mapping size". I can see that there's a "npages" variable in 
"kvm_memory_slot" which determines the number of bits we need to track 
dirtying for a given memory slot. And this variable is computed by right 
shifting the memory size by PAGE_SHIFT. Each arch defines the macro 
PAGE_SHIFT, and another macro PAGE_SIZE as the left shift of 1 by 
PAGE_SHIFT. Does it make sense to use this macro?

Thanks, again, Marc.
Marc Zyngier Jan. 15, 2023, 7:13 p.m. UTC | #18
On Sun, 15 Jan 2023 14:50:55 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> >> Thank you Marc for the suggestion. We can make dirty quota count
> >> dirtied memory rather than faults.
> >> 
> >> run->dirty_quota -= page_size;
> >> 
> >> We can raise a kvm request for exiting to userspace as soon as the
> >> dirty quota of the vcpu becomes zero or negative. Please let me know
> >> if this looks good to you.
> > 
> > It really depends what "page_size" represents here. If you mean
> > "mapping size", then yes. If you really mean "page size", then no.
> > 
> > Assuming this is indeed "mapping size", then it all depends on how
> > this is integrated and how this is managed in a generic, cross
> > architecture way.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> 
> Yes, it is "mapping size". I can see that there's a "npages" variable
> in "kvm_memory_slot" which determines the number of bits we need to
> track dirtying for a given memory slot. And this variable is computed
> by right shifting the memory size by PAGE_SHIFT. Each arch defines the
> macro PAGE_SHIFT, and another macro PAGE_SIZE as the left shift of 1
> by PAGE_SHIFT. Does it make sense to use this macro?

I don't think it makes any sense.

There is nothing in the memslot structure that you can make use of.
The information you need is the page table structure itself (the
level, precisely), which tells you how big the mapping is for this
particular part of the memslot.

This is dynamic information, not defined at memslot creation. Which is
why it can only be captured at fault time (with the exception of
HugeTLBFS backed memslots for which the mapping size is cast into
stone).

	M.
Shivam Kumar Jan. 29, 2023, 10 p.m. UTC | #19
On 15/01/23 3:26 pm, Marc Zyngier wrote:
> On Sat, 14 Jan 2023 13:07:44 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>>
>>
>> On 08/01/23 3:14 am, Marc Zyngier wrote:
>>> On Sat, 07 Jan 2023 17:24:24 +0000,
>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>> On 26/12/22 3:37 pm, Marc Zyngier wrote:
>>>>> On Sun, 25 Dec 2022 16:50:04 +0000,
>>>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>>>
>>>>>> Hi Marc,
>>>>>> Hi Sean,
>>>>>>
>>>>>> Please let me know if there's any further question or feedback.
>>>>>
>>>>> My earlier comments still stand: the proposed API is not usable as a
>>>>> general purpose memory-tracking API because it counts faults instead
>>>>> of memory, making it inadequate except for the most trivial cases.
>>>>> And I cannot believe you were serious when you mentioned that you were
>>>>> happy to make that the API.
>>>>>
>>>>> This requires some serious work, and this series is not yet near a
>>>>> state where it could be merged.
>>>>>
>>>>> Thanks,
>>>>>
>>>>> 	M.
>>>>>
>>>>
>>>> Hi Marc,
>>>>
>>>> IIUC, in the dirty ring interface too, the dirty_index variable is
>>>> incremented in the mark_page_dirty_in_slot function and it is also
>>>> count-based. At least on x86, I am aware that for dirty tracking we
>>>> have uniform granularity as huge pages (2MB pages) too are broken into
>>>> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
>>>> possible to have multiple page sizes even during dirty logging on
>>>> ARM. And if that is the case, I am wondering how we handle the bitmap
>>>> with different page sizes on ARM.
>>>
>>> Easy. It *is* page-size, by the very definition of the API which
>>> explicitly says that a single bit represent one basic page. If you
>>> were to only break 1GB mappings into 2MB blocks, you'd have to mask
>>> 512 pages dirty at once, no question asked.
>>>
>>> Your API is different because at no point it implies any relationship
>>> with any page size. As it stands, it is a useless API. I understand
>>> that you are only concerned with your particular use case, but that's
>>> nowhere good enough. And it has nothing to do with ARM. This is
>>> equally broken on *any* architecture.
>>>
>>>> I agree that the notion of pages dirtied according to our
>>>> pages_dirtied variable depends on how we are handling the bitmap but
>>>> we expect the userspace to use the same granularity at which the dirty
>>>> bitmap is handled. I can capture this in documentation
>>>
>>> But what does the bitmap have to do with any of this? This is not what
>>> your API is about. You are supposed to count dirtied memory, and you
>>> are counting page faults instead. No sane userspace can make any sense
>>> of that. You keep coupling the two, but that's wrong. This thing has
>>> to be useful on its own, not just for your particular, super narrow
>>> use case. And that's a shame because the general idea of a dirty quota
>>> is an interesting one.
>>>
>>> If your sole intention is to capture in the documentation that the API
>>> is broken, then all I can do is to NAK the whole thing. Until you turn
>>> this page-fault quota into the dirty memory quota that you advertise,
>>> I'll continue to say no to it.
>>>
>>> Thanks,
>>>
>>> 	M.
>>>
>>
>> Thank you Marc for the suggestion. We can make dirty quota count
>> dirtied memory rather than faults.
>>
>> run->dirty_quota -= page_size;
>>
>> We can raise a kvm request for exiting to userspace as soon as the
>> dirty quota of the vcpu becomes zero or negative. Please let me know
>> if this looks good to you.
> 
> It really depends what "page_size" represents here. If you mean
> "mapping size", then yes. If you really mean "page size", then no.
> 
> Assuming this is indeed "mapping size", then it all depends on how
> this is integrated and how this is managed in a generic, cross
> architecture way.
> 
> Thanks,
> 
> 	M.
> 

Hi Marc,

I'm proposing this new implementation to address the concern you raised 
regarding dirty quota being a non-generic feature with the previous 
implementation. This implementation decouples dirty quota from dirty 
logging for the ARM64 arch. We shall post a similar implementation for 
x86 if this looks good. With this new implementation, dirty quota can be 
enforced independent of dirty logging. Dirty quota is now in bytes and 
is decreased at write-protect page fault by page fault granularity. For 
userspace, the interface is unchanged, i.e. the dirty quota can be set 
from userspace via an ioctl or by forcing the vcpu to exit to userspace; 
userspace can expect a KVM exit with exit reason 
KVM_EXIT_DIRTY_QUOTA_EXHAUSTED when the dirty quota is exhausted.

Please let me know if it looks good to you. Happy to hear any further
feedback and work on it. Also, I am curious about use case scenarios 
other than dirty tracking for dirty quota. Besides, I am not aware of 
any interface exposed to the userspace, other than the dirty 
tracking-related ioctls, to write-protect guest pages transiently 
(unlike mprotect, which will generate a SIGSEGV signal on write).

Thanks,
Shivam


---
  arch/arm64/kvm/mmu.c 	|  1 +
  include/linux/kvm_host.h |  1 +
  virt/kvm/kvm_main.c  	| 12 ++++++++++++
  3 files changed, 14 insertions(+)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 60ee3d9f01f8..edd88529d622 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1336,6 +1336,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
      	/* Mark the page dirty only if the fault is handled successfully */
      	if (writable && !ret) {
              	kvm_set_pfn_dirty(pfn);
+           	update_dirty_quota(kvm, fault_granule);
              	mark_page_dirty_in_slot(kvm, memslot, gfn);
      	}

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0b9b5c251a04..10fda457ac3d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1219,6 +1219,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm 
*kvm, gfn_t gfn);
  bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
  bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
  unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
+void update_dirty_quota(struct kvm *kvm, unsigned long 
dirty_granule_bytes);
  void mark_page_dirty_in_slot(struct kvm *kvm, const struct 
kvm_memory_slot *memslot, gfn_t gfn);
  void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7a54438b4d49..377cc9d07e80 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3309,6 +3309,18 @@ static bool 
kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
  #endif
  }

+void update_dirty_quota(struct kvm *kvm, unsigned long dirty_granule_bytes)
+{
+	if (kvm->dirty_quota_enabled) {
+		struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+   		if (!vcpu)
+           		return;
+
+   		vcpu->run->dirty_quota_bytes -= dirty_granule_bytes;
+   		if (vcpu->run->dirty_quota_bytes <= 0)
+                   		kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
+	}
+}
+
  void mark_page_dirty_in_slot(struct kvm *kvm,
                           	const struct kvm_memory_slot *memslot,
                     	gfn_t gfn)
Shivam Kumar Feb. 11, 2023, 6:52 a.m. UTC | #20
> 
> Hi Marc,
> 
> I'm proposing this new implementation to address the concern you raised 
> regarding dirty quota being a non-generic feature with the previous 
> implementation. This implementation decouples dirty quota from dirty 
> logging for the ARM64 arch. We shall post a similar implementation for 
> x86 if this looks good. With this new implementation, dirty quota can be 
> enforced independent of dirty logging. Dirty quota is now in bytes and 

Hi Marc,

Thank you for your valuable feedback so far. Looking forward to your 
feedback on this new proposition.

Thanks,
Shivam
Marc Zyngier Feb. 12, 2023, 5:09 p.m. UTC | #21
On Sat, 11 Feb 2023 06:52:02 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> > 
> > Hi Marc,
> > 
> > I'm proposing this new implementation to address the concern you
> > raised regarding dirty quota being a non-generic feature with the
> > previous implementation. This implementation decouples dirty quota
> > from dirty logging for the ARM64 arch. We shall post a similar
> > implementation for x86 if this looks good. With this new
> > implementation, dirty quota can be enforced independent of dirty
> > logging. Dirty quota is now in bytes and 
> 
> Hi Marc,
> 
> Thank you for your valuable feedback so far. Looking forward to your
> feedback on this new proposition.

I'm not sure what you are expecting from me here. I've explained in
great details what I wanted to see, repeatedly. This above says
nothing other than "we are going to do *something* that matches your
expectations".

My answer is, to quote someone else, "show me the code". Until then, I
don't have much to add.

Thanks,

	M.
Shivam Kumar Feb. 12, 2023, 5:54 p.m. UTC | #22
On 12/02/23 10:39 pm, Marc Zyngier wrote:
> On Sat, 11 Feb 2023 06:52:02 +0000,
> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>
>>>
>>> Hi Marc,
>>>
>>> I'm proposing this new implementation to address the concern you
>>> raised regarding dirty quota being a non-generic feature with the
>>> previous implementation. This implementation decouples dirty quota
>>> from dirty logging for the ARM64 arch. We shall post a similar
>>> implementation for x86 if this looks good. With this new
>>> implementation, dirty quota can be enforced independent of dirty
>>> logging. Dirty quota is now in bytes and
>>
>> Hi Marc,
>>
>> Thank you for your valuable feedback so far. Looking forward to your
>> feedback on this new proposition.
> 
> I'm not sure what you are expecting from me here. I've explained in
> great details what I wanted to see, repeatedly. This above says
> nothing other than "we are going to do *something* that matches your
> expectations".
> 
> My answer is, to quote someone else, "show me the code". Until then, I
> don't have much to add.
> 
> Thanks,
> 
> 	M.
> 

Hi Marc,

I had posted some code in the previous comment. Let me tag you there.

Thanks,
Shivam
Shivam Kumar Feb. 12, 2023, 5:56 p.m. UTC | #23
On 30/01/23 3:30 am, Shivam Kumar wrote:
> 
> 
> On 15/01/23 3:26 pm, Marc Zyngier wrote:
>> On Sat, 14 Jan 2023 13:07:44 +0000,
>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>
>>>
>>>
>>> On 08/01/23 3:14 am, Marc Zyngier wrote:
>>>> On Sat, 07 Jan 2023 17:24:24 +0000,
>>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>> On 26/12/22 3:37 pm, Marc Zyngier wrote:
>>>>>> On Sun, 25 Dec 2022 16:50:04 +0000,
>>>>>> Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>>>>>>>
>>>>>>> Hi Marc,
>>>>>>> Hi Sean,
>>>>>>>
>>>>>>> Please let me know if there's any further question or feedback.
>>>>>>
>>>>>> My earlier comments still stand: the proposed API is not usable as a
>>>>>> general purpose memory-tracking API because it counts faults instead
>>>>>> of memory, making it inadequate except for the most trivial cases.
>>>>>> And I cannot believe you were serious when you mentioned that you 
>>>>>> were
>>>>>> happy to make that the API.
>>>>>>
>>>>>> This requires some serious work, and this series is not yet near a
>>>>>> state where it could be merged.
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>>     M.
>>>>>>
>>>>>
>>>>> Hi Marc,
>>>>>
>>>>> IIUC, in the dirty ring interface too, the dirty_index variable is
>>>>> incremented in the mark_page_dirty_in_slot function and it is also
>>>>> count-based. At least on x86, I am aware that for dirty tracking we
>>>>> have uniform granularity as huge pages (2MB pages) too are broken into
>>>>> 4K pages and bitmap is at 4K-granularity. Please let me know if it is
>>>>> possible to have multiple page sizes even during dirty logging on
>>>>> ARM. And if that is the case, I am wondering how we handle the bitmap
>>>>> with different page sizes on ARM.
>>>>
>>>> Easy. It *is* page-size, by the very definition of the API which
>>>> explicitly says that a single bit represent one basic page. If you
>>>> were to only break 1GB mappings into 2MB blocks, you'd have to mask
>>>> 512 pages dirty at once, no question asked.
>>>>
>>>> Your API is different because at no point it implies any relationship
>>>> with any page size. As it stands, it is a useless API. I understand
>>>> that you are only concerned with your particular use case, but that's
>>>> nowhere good enough. And it has nothing to do with ARM. This is
>>>> equally broken on *any* architecture.
>>>>
>>>>> I agree that the notion of pages dirtied according to our
>>>>> pages_dirtied variable depends on how we are handling the bitmap but
>>>>> we expect the userspace to use the same granularity at which the dirty
>>>>> bitmap is handled. I can capture this in documentation
>>>>
>>>> But what does the bitmap have to do with any of this? This is not what
>>>> your API is about. You are supposed to count dirtied memory, and you
>>>> are counting page faults instead. No sane userspace can make any sense
>>>> of that. You keep coupling the two, but that's wrong. This thing has
>>>> to be useful on its own, not just for your particular, super narrow
>>>> use case. And that's a shame because the general idea of a dirty quota
>>>> is an interesting one.
>>>>
>>>> If your sole intention is to capture in the documentation that the API
>>>> is broken, then all I can do is to NAK the whole thing. Until you turn
>>>> this page-fault quota into the dirty memory quota that you advertise,
>>>> I'll continue to say no to it.
>>>>
>>>> Thanks,
>>>>
>>>>     M.
>>>>
>>>
>>> Thank you Marc for the suggestion. We can make dirty quota count
>>> dirtied memory rather than faults.
>>>
>>> run->dirty_quota -= page_size;
>>>
>>> We can raise a kvm request for exiting to userspace as soon as the
>>> dirty quota of the vcpu becomes zero or negative. Please let me know
>>> if this looks good to you.
>>
>> It really depends what "page_size" represents here. If you mean
>> "mapping size", then yes. If you really mean "page size", then no.
>>
>> Assuming this is indeed "mapping size", then it all depends on how
>> this is integrated and how this is managed in a generic, cross
>> architecture way.
>>
>> Thanks,
>>
>>     M.
>>
> 
> Hi Marc,
> 
> I'm proposing this new implementation to address the concern you raised 
> regarding dirty quota being a non-generic feature with the previous 
> implementation. This implementation decouples dirty quota from dirty 
> logging for the ARM64 arch. We shall post a similar implementation for 
> x86 if this looks good. With this new implementation, dirty quota can be 
> enforced independent of dirty logging. Dirty quota is now in bytes and 
> is decreased at write-protect page fault by page fault granularity. For 
> userspace, the interface is unchanged, i.e. the dirty quota can be set 
> from userspace via an ioctl or by forcing the vcpu to exit to userspace; 
> userspace can expect a KVM exit with exit reason 
> KVM_EXIT_DIRTY_QUOTA_EXHAUSTED when the dirty quota is exhausted.
> 
> Please let me know if it looks good to you. Happy to hear any further
> feedback and work on it. Also, I am curious about use case scenarios 
> other than dirty tracking for dirty quota. Besides, I am not aware of 
> any interface exposed to the userspace, other than the dirty 
> tracking-related ioctls, to write-protect guest pages transiently 
> (unlike mprotect, which will generate a SIGSEGV signal on write).
> 
> Thanks,
> Shivam
> 
> 
> ---
>   arch/arm64/kvm/mmu.c     |  1 +
>   include/linux/kvm_host.h |  1 +
>   virt/kvm/kvm_main.c      | 12 ++++++++++++
>   3 files changed, 14 insertions(+)
> 
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 60ee3d9f01f8..edd88529d622 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1336,6 +1336,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
> phys_addr_t fault_ipa,
>           /* Mark the page dirty only if the fault is handled 
> successfully */
>           if (writable && !ret) {
>                   kvm_set_pfn_dirty(pfn);
> +               update_dirty_quota(kvm, fault_granule);
>                   mark_page_dirty_in_slot(kvm, memslot, gfn);
>           }
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 0b9b5c251a04..10fda457ac3d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1219,6 +1219,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm 
> *kvm, gfn_t gfn);
>   bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
>   bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>   unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
> +void update_dirty_quota(struct kvm *kvm, unsigned long 
> dirty_granule_bytes);
>   void mark_page_dirty_in_slot(struct kvm *kvm, const struct 
> kvm_memory_slot *memslot, gfn_t gfn);
>   void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 7a54438b4d49..377cc9d07e80 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3309,6 +3309,18 @@ static bool 
> kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
>   #endif
>   }
> 
> +void update_dirty_quota(struct kvm *kvm, unsigned long 
> dirty_granule_bytes)
> +{
> +    if (kvm->dirty_quota_enabled) {
> +        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
> +
> +           if (!vcpu)
> +                   return;
> +
> +           vcpu->run->dirty_quota_bytes -= dirty_granule_bytes;
> +           if (vcpu->run->dirty_quota_bytes <= 0)
> +                           kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, 
> vcpu);
> +    }
> +}
> +
>   void mark_page_dirty_in_slot(struct kvm *kvm,
>                                const struct kvm_memory_slot *memslot,
>                          gfn_t gfn)


Hi Marc,

Please review the above code.

Thanks,
Shivam
Marc Zyngier Feb. 12, 2023, 6:02 p.m. UTC | #24
On Sun, 12 Feb 2023 17:54:30 +0000,
Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> 
> 
> 
> On 12/02/23 10:39 pm, Marc Zyngier wrote:
> > On Sat, 11 Feb 2023 06:52:02 +0000,
> > Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
> >> 
> >>> 
> >>> Hi Marc,
> >>> 
> >>> I'm proposing this new implementation to address the concern you
> >>> raised regarding dirty quota being a non-generic feature with the
> >>> previous implementation. This implementation decouples dirty quota
> >>> from dirty logging for the ARM64 arch. We shall post a similar
> >>> implementation for x86 if this looks good. With this new
> >>> implementation, dirty quota can be enforced independent of dirty
> >>> logging. Dirty quota is now in bytes and
> >> 
> >> Hi Marc,
> >> 
> >> Thank you for your valuable feedback so far. Looking forward to your
> >> feedback on this new proposition.
> > 
> > I'm not sure what you are expecting from me here. I've explained in
> > great details what I wanted to see, repeatedly. This above says
> > nothing other than "we are going to do *something* that matches your
> > expectations".
> > 
> > My answer is, to quote someone else, "show me the code". Until then, I
> > don't have much to add.
> > 
> > Thanks,
> > 
> > 	M.
> > 
> 
> Hi Marc,
> 
> I had posted some code in the previous comment. Let me tag you there.

You posted a tiny snippet that is completely out of context.

I took the time to fully review your series and provide extensive
comments. You could, similarly, take the time to post a complete
series for people to review the new proposal.

	M.
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index eee9f857a986..4568faa33c6d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6513,6 +6513,26 @@  array field represents return values. The userspace should update the return
 values of SBI call before resuming the VCPU. For more details on RISC-V SBI
 spec refer, https://github.com/riscv/riscv-sbi-doc.
 
+::
+
+		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
+		struct {
+			__u64 count;
+			__u64 quota;
+		} dirty_quota_exit;
+
+If exit reason is KVM_EXIT_DIRTY_QUOTA_EXHAUSTED, it indicates that the VCPU has
+exhausted its dirty quota. The 'dirty_quota_exit' member of kvm_run structure
+makes the following information available to the userspace:
+    count: the current count of pages dirtied by the VCPU, can be
+    skewed based on the size of the pages accessed by each vCPU.
+    quota: the observed dirty quota just before the exit to userspace.
+
+The userspace can design a strategy to allocate the overall scope of dirtying
+for the VM among the vcpus. Based on the strategy and the current state of dirty
+quota throttling, the userspace can make a decision to either update (increase)
+the quota or to put the VCPU to sleep for some time.
+
 ::
 
     /* KVM_EXIT_NOTIFY */
@@ -6567,6 +6587,21 @@  values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
 
 ::
 
+	/*
+	 * Number of pages the vCPU is allowed to have dirtied over its entire
+	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
+	 * is reached/exceeded.
+	 */
+	__u64 dirty_quota;
+
+Please note that enforcing the quota is best effort, as the guest may dirty
+multiple pages before KVM can recheck the quota.  However, unless KVM is using
+a hardware-based dirty ring buffer, e.g. Intel's Page Modification Logging,
+KVM will detect quota exhaustion within a handful of dirtied pages.  If a
+hardware ring buffer is used, the overrun is bounded by the size of the buffer
+(512 entries for PML).
+
+::
   };
 
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 67be7f217e37..bdbd36321d52 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -48,6 +48,7 @@  config KVM
 	select KVM_VFIO
 	select SRCU
 	select INTERVAL_TREE
+	select HAVE_KVM_DIRTY_QUOTA
 	select HAVE_KVM_PM_NOTIFIER if PM
 	help
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 18592bdf4c1b..0b9b5c251a04 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -151,11 +151,12 @@  static inline bool is_error_page(struct page *page)
 #define KVM_REQUEST_NO_ACTION      BIT(10)
 /*
  * Architecture-independent vcpu->requests bit members
- * Bits 3-7 are reserved for more arch-independent bits.
+ * Bits 5-7 are reserved for more arch-independent bits.
  */
 #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_UNBLOCK           2
+#define KVM_REQ_DIRTY_QUOTA_EXIT  4
 #define KVM_REQUEST_ARCH_BASE     8
 
 /*
@@ -379,6 +380,8 @@  struct kvm_vcpu {
 	 */
 	struct kvm_memory_slot *last_used_slot;
 	u64 last_used_slot_gen;
+
+	u64 dirty_quota;
 };
 
 /*
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 3ca3db020e0e..263a588f3cd3 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -118,6 +118,7 @@  struct kvm_vcpu_stat_generic {
 	u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT];
 	u64 halt_wait_hist[HALT_POLL_HIST_COUNT];
 	u64 blocking;
+	u64 pages_dirtied;
 };
 
 #define KVM_STATS_NAME_SIZE	48
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 0d5d4419139a..5acb8991f872 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -272,6 +272,7 @@  struct kvm_xen_exit {
 #define KVM_EXIT_RISCV_SBI        35
 #define KVM_EXIT_RISCV_CSR        36
 #define KVM_EXIT_NOTIFY           37
+#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -510,6 +511,11 @@  struct kvm_run {
 #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
 			__u32 flags;
 		} notify;
+		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
+		struct {
+			__u64 count;
+			__u64 quota;
+		} dirty_quota_exit;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -531,6 +537,12 @@  struct kvm_run {
 		struct kvm_sync_regs regs;
 		char padding[SYNC_REGS_SIZE_BYTES];
 	} s;
+	/*
+	 * Number of pages the vCPU is allowed to have dirtied over its entire
+	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the
+	 * quota is reached/exceeded.
+	 */
+	__u64 dirty_quota;
 };
 
 /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
@@ -1178,6 +1190,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
+#define KVM_CAP_DIRTY_QUOTA 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 0d5d4419139a..c8f811572670 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1178,6 +1178,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_S390_ZPCI_OP 221
 #define KVM_CAP_S390_CPU_TOPOLOGY 222
 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223
+#define KVM_CAP_DIRTY_QUOTA 224
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 800f9470e36b..b6418a578c0a 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -19,6 +19,9 @@  config HAVE_KVM_IRQ_ROUTING
 config HAVE_KVM_DIRTY_RING
        bool
 
+config HAVE_KVM_DIRTY_QUOTA
+       bool
+
 # Only strongly ordered architectures can select this, as it doesn't
 # put any explicit constraint on userspace ordering. They can also
 # select the _ACQ_REL version.
@@ -86,3 +89,4 @@  config KVM_XFER_TO_GUEST_WORK
 
 config HAVE_KVM_PM_NOTIFIER
        bool
+
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 25d7872b29c1..7a54438b4d49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3298,18 +3298,32 @@  int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
+static bool kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_HAVE_KVM_DIRTY_QUOTA
+	u64 dirty_quota = READ_ONCE(vcpu->run->dirty_quota);
+
+	return dirty_quota && (vcpu->stat.generic.pages_dirtied >= dirty_quota);
+#else
+	return false;
+#endif
+}
+
 void mark_page_dirty_in_slot(struct kvm *kvm,
 			     const struct kvm_memory_slot *memslot,
 		 	     gfn_t gfn)
 {
 	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 
-#ifdef CONFIG_HAVE_KVM_DIRTY_RING
 	if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
 		return;
-#endif
 
-	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
+	if (!memslot)
+		return;
+
+	WARN_ON_ONCE(!vcpu->stat.generic.pages_dirtied++);
+
+	if (kvm_slot_dirty_track_enabled(memslot)) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 		u32 slot = (memslot->as_id << 16) | memslot->id;
 
@@ -3318,6 +3332,9 @@  void mark_page_dirty_in_slot(struct kvm *kvm,
 					    slot, rel_gfn);
 		else
 			set_bit_le(rel_gfn, memslot->dirty_bitmap);
+
+		if (kvm_vcpu_is_dirty_quota_exhausted(vcpu))
+			kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
 	}
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
@@ -4487,6 +4504,8 @@  static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_BINARY_STATS_FD:
 	case KVM_CAP_SYSTEM_EVENT_DATA:
 		return 1;
+	case KVM_CAP_DIRTY_QUOTA:
+		return !!IS_ENABLED(CONFIG_HAVE_KVM_DIRTY_QUOTA);
 	default:
 		break;
 	}