
[v9,2/3] cpu-throttle: implement vCPU throttle

Message ID 155c8ef6b68c06829f971d356732783c671f661a.1638495274.git.huangy81@chinatelecom.cn (mailing list archive)
State New, archived
Series support dirty restraint on vCPU

Commit Message

Hyman Huang Dec. 3, 2021, 1:39 a.m. UTC
From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>

Impose a dirty restraint on a vCPU by kicking it and making it sleep,
as auto-converge does during migration, but kick only the specified
vCPU rather than all vCPUs of the VM.

Start a thread to track the dirtylimit status and adjust the throttle
percentage dynamically, depending on the current and quota dirty rates.

Introduce the utility functions in the header for the dirtylimit
implementation.

Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
---
 include/sysemu/cpu-throttle.h |  45 ++++++
 qapi/migration.json           |  22 +++
 softmmu/cpu-throttle.c        | 355 ++++++++++++++++++++++++++++++++++++++++++
 softmmu/trace-events          |   5 +
 4 files changed, 427 insertions(+)

Comments

Peter Xu Dec. 6, 2021, 10:10 a.m. UTC | #1
On Fri, Dec 03, 2021 at 09:39:46AM +0800, huangy81@chinatelecom.cn wrote:
> +static uint64_t dirtylimit_pct(unsigned int last_pct,
> +                               uint64_t quota,
> +                               uint64_t current)
> +{
> +    uint64_t limit_pct = 0;
> +    RestrainPolicy policy;
> +    bool mitigate = (quota > current) ? true : false;
> +
> +    if (mitigate && ((current == 0) ||
> +        (last_pct <= DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE))) {
> +        return 0;
> +    }
> +
> +    policy = dirtylimit_policy(last_pct, quota, current);
> +    switch (policy) {
> +    case RESTRAIN_SLIGHT:
> +        /* [90, 99] */
> +        if (mitigate) {
> +            limit_pct =
> +                last_pct - DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
> +        } else {
> +            limit_pct =
> +                last_pct + DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
> +
> +            limit_pct = MIN(limit_pct, CPU_THROTTLE_PCT_MAX);
> +        }
> +       break;
> +    case RESTRAIN_HEAVY:
> +        /* [75, 90) */
> +        if (mitigate) {
> +            limit_pct =
> +                last_pct - DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
> +        } else {
> +            limit_pct =
> +                last_pct + DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
> +
> +            limit_pct = MIN(limit_pct,
> +                DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK);
> +        }
> +       break;
> +    case RESTRAIN_RATIO:
> +        /* [0, 75) */
> +        if (mitigate) {
> +            if (last_pct <= (((quota - current) * 100 / quota))) {
> +                limit_pct = 0;
> +            } else {
> +                limit_pct = last_pct -
> +                    ((quota - current) * 100 / quota);
> +                limit_pct = MAX(limit_pct, CPU_THROTTLE_PCT_MIN);
> +            }
> +        } else {
> +            limit_pct = last_pct +
> +                ((current - quota) * 100 / current);
> +
> +            limit_pct = MIN(limit_pct,
> +                DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK);
> +        }
> +       break;
> +    case RESTRAIN_KEEP:
> +    default:
> +       limit_pct = last_pct;
> +       break;
> +    }
> +
> +    return limit_pct;
> +}
> +
> +static void *dirtylimit_thread(void *opaque)
> +{
> +    int cpu_index = *(int *)opaque;
> +    uint64_t quota_dirtyrate, current_dirtyrate;
> +    unsigned int last_pct = 0;
> +    unsigned int pct = 0;
> +
> +    rcu_register_thread();
> +
> +    quota_dirtyrate = dirtylimit_quota(cpu_index);
> +    current_dirtyrate = dirtylimit_current(cpu_index);
> +
> +    pct = dirtylimit_init_pct(quota_dirtyrate, current_dirtyrate);
> +
> +    do {
> +        trace_dirtylimit_impose(cpu_index,
> +            quota_dirtyrate, current_dirtyrate, pct);
> +
> +        last_pct = pct;
> +        if (pct == 0) {
> +            sleep(DIRTYLIMIT_CALC_PERIOD_TIME_S);
> +        } else {
> +            dirtylimit_check(cpu_index, pct);
> +        }
> +
> +        quota_dirtyrate = dirtylimit_quota(cpu_index);
> +        current_dirtyrate = dirtylimit_current(cpu_index);
> +
> +        pct = dirtylimit_pct(last_pct, quota_dirtyrate, current_dirtyrate);

So what I had in mind is that we can start with an extremely simple version of
a negative feedback system.  Say, firstly each vcpu will have a simple number to
sleep for some interval (this is ugly code, but it just shows what I meant..):

===============
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index eecd8031cf..c320fd190f 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2932,6 +2932,8 @@ int kvm_cpu_exec(CPUState *cpu)
             trace_kvm_dirty_ring_full(cpu->cpu_index);
             qemu_mutex_lock_iothread();
             kvm_dirty_ring_reap(kvm_state);
+            if (dirtylimit_enabled(cpu->cpu_index) && cpu->throttle_us_per_full)
+                usleep(cpu->throttle_us_per_full);
             qemu_mutex_unlock_iothread();
             ret = 0;
             break;
===============

I think this will have finer granularity when throttling (for a 4096-entry ring
with 4KB pages, 4096 * 4KB = 16MB, so roughly one sleep per 16MB dirtied) than
the current way, where we inject a per-vcpu async task to sleep, like
auto-converge.

Then we have the "black box" that tunes this value with the input/output below:

  - Input: dirty rate information, same as current algo

  - Output: increase/decrease of per-vcpu throttle_us_per_full above, and
    that's all

We can do the sampling once per second and keep doing it: we can have one thread
that collects dirty rate information for all the vcpus every second, then tunes
that throttle_us_per_full for each of them.

The simplest linear algorithm would be (for each vcpu):

  if (quota < current) {
      throttle_us_per_full += SOMETHING;
      if (throttle_us_per_full > MAX) {
          throttle_us_per_full = MAX;
      }
  } else {
      throttle_us_per_full -= SOMETHING;
      if (throttle_us_per_full < 0) {
          throttle_us_per_full = 0;
      }
  }
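
Putting the per-second sampling and that linear step together, a minimal sketch
of such a control thread could look like the below (just a sketch: the
throttle_us_per_full field is the one from the snippet above, the STEP/MAX
numbers and the dirtylimit_running flag are made-up placeholders, and
dirtylimit_quota()/dirtylimit_current() are the per-vcpu helpers from the
patch):

  /* Made-up tuning knobs, for illustration only */
  #define DIRTYLIMIT_US_PER_FULL_STEP  (10 * 1000)      /* grow/shrink by 10ms */
  #define DIRTYLIMIT_US_PER_FULL_MAX   (1000 * 1000)    /* never sleep > 1s */

  static void *dirtylimit_adjust_thread(void *opaque)
  {
      CPUState *cpu;

      while (qatomic_read(&dirtylimit_running)) {
          sleep(1);                                /* sample once per second */

          CPU_FOREACH(cpu) {
              uint64_t quota = dirtylimit_quota(cpu->cpu_index);
              uint64_t current = dirtylimit_current(cpu->cpu_index);
              int64_t us = cpu->throttle_us_per_full;

              if (quota < current) {
                  us += DIRTYLIMIT_US_PER_FULL_STEP;   /* dirtying too fast */
              } else {
                  us -= DIRTYLIMIT_US_PER_FULL_STEP;   /* room to relax */
              }
              us = MIN(us, DIRTYLIMIT_US_PER_FULL_MAX);
              us = MAX(us, 0);

              /* picked up by the usleep() in kvm_cpu_exec() above */
              qatomic_set(&cpu->throttle_us_per_full, us);
          }
      }

      return NULL;
  }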

I think your algorithm is fine, but thoroughly reviewing every single bit of it
in one shot will be challenging, and it's also hard to prove every bit of the
algorithm is helpful, as there are a lot of hand-made macros and state changes.

I actually tested your current algorithm: the dirty rate fluctuates a bit (when
I specified 200MB/s, it can go to either a few tens of MB/s or 300MB/s, normally
less), nor does it respond fast (the initial throttle from 500MB/s -> 200MB/s
needed a minute or so), so it does not seem ideal anyway. In that case I prefer
we start simple.

So IMHO we can start with this simple scheme first; it'll work with far fewer
lines of code, afaict.  With that scheme ready in the initial patches, it'll be
easier to apply a better algorithm later (e.g. your current one, if you're
confident in it) or other things, and it'll also be much easier to review if
you could consider splitting your patch like that.

Per my knowledge, for the needs of migration we could consider adding an
integral term to the linear algorithm above, and that should already help us
reach a very stable and constant throttling state.  But we'll need to try it
out, as I never have.
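
For reference, such an integral term could be as small as the following sketch
(the KP/KI gains and the clamp are made-up numbers that would need real tuning;
error is current minus quota in MB/s, computed once per second per vcpu):

  static int64_t dirtylimit_pi_update(int64_t error_mbps, int64_t *error_acc)
  {
      const int64_t KP = 50;    /* us of sleep per MB/s of error */
      const int64_t KI = 5;     /* us per accumulated MB/s of error */

      *error_acc += error_mbps;
      /* bound the integral term so it cannot wind up without limit */
      *error_acc = MIN(*error_acc, 10000);
      *error_acc = MAX(*error_acc, -10000);

      /* caller clamps to [0, MAX] before storing into throttle_us_per_full */
      return KP * error_mbps + KI * *error_acc;
  }

A persistent small error then keeps nudging the throttle toward the quota
instead of settling slightly off it, which is what should give the stable,
constant throttling state mentioned above.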

What do you think?
Hyman Huang Dec. 8, 2021, 3:36 p.m. UTC | #2
On 2021/12/6 18:10, Peter Xu wrote:
> On Fri, Dec 03, 2021 at 09:39:46AM +0800, huangy81@chinatelecom.cn wrote:
>> +static uint64_t dirtylimit_pct(unsigned int last_pct,
>> +                               uint64_t quota,
>> +                               uint64_t current)
>> +{
>> +    uint64_t limit_pct = 0;
>> +    RestrainPolicy policy;
>> +    bool mitigate = (quota > current) ? true : false;
>> +
>> +    if (mitigate && ((current == 0) ||
>> +        (last_pct <= DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE))) {
>> +        return 0;
>> +    }
>> +
>> +    policy = dirtylimit_policy(last_pct, quota, current);
>> +    switch (policy) {
>> +    case RESTRAIN_SLIGHT:
>> +        /* [90, 99] */
>> +        if (mitigate) {
>> +            limit_pct =
>> +                last_pct - DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
>> +        } else {
>> +            limit_pct =
>> +                last_pct + DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
>> +
>> +            limit_pct = MIN(limit_pct, CPU_THROTTLE_PCT_MAX);
>> +        }
>> +       break;
>> +    case RESTRAIN_HEAVY:
>> +        /* [75, 90) */
>> +        if (mitigate) {
>> +            limit_pct =
>> +                last_pct - DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
>> +        } else {
>> +            limit_pct =
>> +                last_pct + DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
>> +
>> +            limit_pct = MIN(limit_pct,
>> +                DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK);
>> +        }
>> +       break;
>> +    case RESTRAIN_RATIO:
>> +        /* [0, 75) */
>> +        if (mitigate) {
>> +            if (last_pct <= (((quota - current) * 100 / quota))) {
>> +                limit_pct = 0;
>> +            } else {
>> +                limit_pct = last_pct -
>> +                    ((quota - current) * 100 / quota);
>> +                limit_pct = MAX(limit_pct, CPU_THROTTLE_PCT_MIN);
>> +            }
>> +        } else {
>> +            limit_pct = last_pct +
>> +                ((current - quota) * 100 / current);
>> +
>> +            limit_pct = MIN(limit_pct,
>> +                DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK);
>> +        }
>> +       break;
>> +    case RESTRAIN_KEEP:
>> +    default:
>> +       limit_pct = last_pct;
>> +       break;
>> +    }
>> +
>> +    return limit_pct;
>> +}
>> +
>> +static void *dirtylimit_thread(void *opaque)
>> +{
>> +    int cpu_index = *(int *)opaque;
>> +    uint64_t quota_dirtyrate, current_dirtyrate;
>> +    unsigned int last_pct = 0;
>> +    unsigned int pct = 0;
>> +
>> +    rcu_register_thread();
>> +
>> +    quota_dirtyrate = dirtylimit_quota(cpu_index);
>> +    current_dirtyrate = dirtylimit_current(cpu_index);
>> +
>> +    pct = dirtylimit_init_pct(quota_dirtyrate, current_dirtyrate);
>> +
>> +    do {
>> +        trace_dirtylimit_impose(cpu_index,
>> +            quota_dirtyrate, current_dirtyrate, pct);
>> +
>> +        last_pct = pct;
>> +        if (pct == 0) {
>> +            sleep(DIRTYLIMIT_CALC_PERIOD_TIME_S);
>> +        } else {
>> +            dirtylimit_check(cpu_index, pct);
>> +        }
>> +
>> +        quota_dirtyrate = dirtylimit_quota(cpu_index);
>> +        current_dirtyrate = dirtylimit_current(cpu_index);
>> +
>> +        pct = dirtylimit_pct(last_pct, quota_dirtyrate, current_dirtyrate);
> 
> So what I had in mind is we can start with an extremely simple version of
> negative feedback system.  Say, firstly each vcpu will have a simple number to
> sleep for some interval (this is ugly code, but just show what I meant..):
> 
> ===============
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index eecd8031cf..c320fd190f 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2932,6 +2932,8 @@ int kvm_cpu_exec(CPUState *cpu)
>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>               qemu_mutex_lock_iothread();
>               kvm_dirty_ring_reap(kvm_state);
> +            if (dirtylimit_enabled(cpu->cpu_index) && cpu->throttle_us_per_full)
> +                usleep(cpu->throttle_us_per_full);
>               qemu_mutex_unlock_iothread();
>               ret = 0;
>               break;
> ===============
> 
> I think this will have finer granularity when throttle (for 4096 ring size,
> that's per-16MB operation) than current way where we inject per-vcpu async task
> to sleep, like auto-converge.
> 
> Then we have the "black box" to tune this value with below input/output:
> 
>    - Input: dirty rate information, same as current algo
> 
>    - Output: increase/decrease of per-vcpu throttle_us_per_full above, and
>      that's all
> 
> We can do the sampling per-second, then we keep doing it: we can have 1 thread
> doing per-second task collecting dirty rate information for all the vcpus, then
> tune that throttle_us_per_full for each of them.
> 
> The simplest linear algorithm would be as simple as (for each vcpu):
> 
>    if (quota < current)
>      throttle_us_per_full += SOMETHING;
>      if (throttle_us_per_full > MAX)
>        throttle_us_per_full = MAX;
>    else
>      throttle_us_per_full -= SOMETHING;
>      if (throttle_us_per_full < 0)
>        throttle_us_per_full = 0;
> 
> I think your algorithm is fine, but thoroughly review every single bit of it in
> one shot will be challenging, and it's also hard to prove every bit of the
> algorithm is helpful, as there're a lot of hand-made macros and state changes.
> 
> I actually tested the current algorithm of yours, the dirty rate fluctuates a
> bit (when I specified 200MB/s, it can go into either a few tens of MB/s or
> 300MB/s, normally less), neither does it respond fast (the initial throtle from
> 500MB/s -> 200MB/s should need 1 minute or something), so it seems not ideal
> anyway. In that case I prefer we start with simple.
> 
> So IMHO we can start with this simple scheme first then it'll start working
> with much less line of codes, afaict.  With that scheme ready in the 1st or
> initial patches, it'll be easier to either apply any better algorithm
> (e.g. your current one, if you're confident with that) or other things then
> it'll be much easier to review too if you could consider split your patch like
> that.
> 
> Normally per my knowledge for the need on migration, we could consider add an
> integral algorithm into this linear algorithm that I said above, and it should
> help us reach a very stable and constant state of throttling already.  But
> we'll need to try it out, as I never tried.
> 
> What do you think?
> 
I absolutely agree with your point; a negative feedback system is also
what I thought of in the first place, and from my point of view it is
theoretically the most appropriate algo for keeping a vcpu at a stable
dirty page rate. But at the very beginning I wasn't sure a new throttling
algo would be accepted, so I adopted the existing auto-converge algo in
qemu... :). One of my purposes in posting this patchset was as an RFC,
and thanks Peter very much for the advice.

I'll try it out and see the results. If things go well, the negative
feedback system to control the dirty page rate of a vcpu will be
introduced in the next version.
Hyman Huang Dec. 8, 2021, 3:50 p.m. UTC | #3
On 2021/12/8 23:36, Hyman wrote:
> 
> 
> On 2021/12/6 18:10, Peter Xu wrote:
>> On Fri, Dec 03, 2021 at 09:39:46AM +0800, huangy81@chinatelecom.cn wrote:
>>> +static uint64_t dirtylimit_pct(unsigned int last_pct,
>>> +                               uint64_t quota,
>>> +                               uint64_t current)
>>> +{
>>> +    uint64_t limit_pct = 0;
>>> +    RestrainPolicy policy;
>>> +    bool mitigate = (quota > current) ? true : false;
>>> +
>>> +    if (mitigate && ((current == 0) ||
>>> +        (last_pct <= DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE))) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    policy = dirtylimit_policy(last_pct, quota, current);
>>> +    switch (policy) {
>>> +    case RESTRAIN_SLIGHT:
>>> +        /* [90, 99] */
>>> +        if (mitigate) {
>>> +            limit_pct =
>>> +                last_pct - DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
>>> +        } else {
>>> +            limit_pct =
>>> +                last_pct + DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
>>> +
>>> +            limit_pct = MIN(limit_pct, CPU_THROTTLE_PCT_MAX);
>>> +        }
>>> +       break;
>>> +    case RESTRAIN_HEAVY:
>>> +        /* [75, 90) */
>>> +        if (mitigate) {
>>> +            limit_pct =
>>> +                last_pct - DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
>>> +        } else {
>>> +            limit_pct =
>>> +                last_pct + DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
>>> +
>>> +            limit_pct = MIN(limit_pct,
>>> +                DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK);
>>> +        }
>>> +       break;
>>> +    case RESTRAIN_RATIO:
>>> +        /* [0, 75) */
>>> +        if (mitigate) {
>>> +            if (last_pct <= (((quota - current) * 100 / quota))) {
>>> +                limit_pct = 0;
>>> +            } else {
>>> +                limit_pct = last_pct -
>>> +                    ((quota - current) * 100 / quota);
>>> +                limit_pct = MAX(limit_pct, CPU_THROTTLE_PCT_MIN);
>>> +            }
>>> +        } else {
>>> +            limit_pct = last_pct +
>>> +                ((current - quota) * 100 / current);
>>> +
>>> +            limit_pct = MIN(limit_pct,
>>> +                DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK);
>>> +        }
>>> +       break;
>>> +    case RESTRAIN_KEEP:
>>> +    default:
>>> +       limit_pct = last_pct;
>>> +       break;
>>> +    }
>>> +
>>> +    return limit_pct;
>>> +}
>>> +
>>> +static void *dirtylimit_thread(void *opaque)
>>> +{
>>> +    int cpu_index = *(int *)opaque;
>>> +    uint64_t quota_dirtyrate, current_dirtyrate;
>>> +    unsigned int last_pct = 0;
>>> +    unsigned int pct = 0;
>>> +
>>> +    rcu_register_thread();
>>> +
>>> +    quota_dirtyrate = dirtylimit_quota(cpu_index);
>>> +    current_dirtyrate = dirtylimit_current(cpu_index);
>>> +
>>> +    pct = dirtylimit_init_pct(quota_dirtyrate, current_dirtyrate);
>>> +
>>> +    do {
>>> +        trace_dirtylimit_impose(cpu_index,
>>> +            quota_dirtyrate, current_dirtyrate, pct);
>>> +
>>> +        last_pct = pct;
>>> +        if (pct == 0) {
>>> +            sleep(DIRTYLIMIT_CALC_PERIOD_TIME_S);
>>> +        } else {
>>> +            dirtylimit_check(cpu_index, pct);
>>> +        }
>>> +
>>> +        quota_dirtyrate = dirtylimit_quota(cpu_index);
>>> +        current_dirtyrate = dirtylimit_current(cpu_index);
>>> +
>>> +        pct = dirtylimit_pct(last_pct, quota_dirtyrate, 
>>> current_dirtyrate);
>>
>> So what I had in mind is we can start with an extremely simple version of
>> negative feedback system.  Say, firstly each vcpu will have a simple 
>> number to
>> sleep for some interval (this is ugly code, but just show what I 
>> meant..):
>>
>> ===============
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index eecd8031cf..c320fd190f 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -2932,6 +2932,8 @@ int kvm_cpu_exec(CPUState *cpu)
>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>               qemu_mutex_lock_iothread();
>>               kvm_dirty_ring_reap(kvm_state);
>> +            if (dirtylimit_enabled(cpu->cpu_index) && 
>> cpu->throttle_us_per_full)
>> +                usleep(cpu->throttle_us_per_full);
>>               qemu_mutex_unlock_iothread();
>>               ret = 0;
>>               break;
>> ===============
>>
>> I think this will have finer granularity when throttle (for 4096 ring 
>> size,
>> that's per-16MB operation) than current way where we inject per-vcpu 
>> async task
>> to sleep, like auto-converge.
>>
>> Then we have the "black box" to tune this value with below input/output:
>>
>>    - Input: dirty rate information, same as current algo
>>
>>    - Output: increase/decrease of per-vcpu throttle_us_per_full above, 
>> and
>>      that's all
>>
>> We can do the sampling per-second, then we keep doing it: we can have 
>> 1 thread
>> doing per-second task collecting dirty rate information for all the 
>> vcpus, then
>> tune that throttle_us_per_full for each of them.
>>
>> The simplest linear algorithm would be as simple as (for each vcpu):
>>
>>    if (quota < current)
>>      throttle_us_per_full += SOMETHING;
>>      if (throttle_us_per_full > MAX)
>>        throttle_us_per_full = MAX;
>>    else
>>      throttle_us_per_full -= SOMETHING;
>>      if (throttle_us_per_full < 0)
>>        throttle_us_per_full = 0;
>>
>> I think your algorithm is fine, but thoroughly review every single bit 
>> of it in
>> one shot will be challenging, and it's also hard to prove every bit of 
>> the
>> algorithm is helpful, as there're a lot of hand-made macros and state 
>> changes.
>>
>> I actually tested the current algorithm of yours, the dirty rate 
>> fluctuates a
>> bit (when I specified 200MB/s, it can go into either a few tens of 
>> MB/s or
>> 300MB/s, normally less), neither does it respond fast (the initial 
>> throtle from
>> 500MB/s -> 200MB/s should need 1 minute or something), so it seems not 
>> ideal
>> anyway. In that case I prefer we start with simple.
>>
>> So IMHO we can start with this simple scheme first then it'll start 
>> working
>> with much less line of codes, afaict.  With that scheme ready in the 
>> 1st or
>> initial patches, it'll be easier to either apply any better algorithm
>> (e.g. your current one, if you're confident with that) or other things 
>> then
>> it'll be much easier to review too if you could consider split your 
>> patch like
>> that.
>>
>> Normally per my knowledge for the need on migration, we could consider 
>> add an
>> integral algorithm into this linear algorithm that I said above, and 
>> it should
>> help us reach a very stable and constant state of throttling already.  
>> But
>> we'll need to try it out, as I never tried.
>>
>> What do you think?
>>
> I absolutely agree with your point, negative feedback system is also 
> what i thought in the first place, and theoretically may be the most 
> appropriate algo to control the vcpu in a stable dirty page rate from my 
> point of view, but at the very beginning i'm not sure the new algo of 
> throttling can be accepted, so i adopted the exiting auto-converge algo 
> in qemu... :). One of my purposes of posting this patchset is for the 
> sake of RFC, and thanks Peter very much for giving the advice.
> 
> I'll try it out and see the results. If things go well, the negative 
> feedback system to control the dirty page rate for a vcpu will be 
> introduced next version.

uh... "method" may be a better word than "algo" for what I meant in my
reply above; the real "algo" is what gets implemented in the "black box".

Patch

diff --git a/include/sysemu/cpu-throttle.h b/include/sysemu/cpu-throttle.h
index d65bdef..962990b 100644
--- a/include/sysemu/cpu-throttle.h
+++ b/include/sysemu/cpu-throttle.h
@@ -65,4 +65,49 @@  bool cpu_throttle_active(void);
  */
 int cpu_throttle_get_percentage(void);
 
+/**
+ * dirtylimit_enabled
+ *
+ * Returns: %true if dirty page limit for vCPU is enabled, %false otherwise.
+ */
+bool dirtylimit_enabled(int cpu_index);
+
+/**
+ * dirtylimit_is_vcpu_index_valid
+ *
+ * Returns: %true if the cpu index is valid, %false otherwise.
+ */
+bool dirtylimit_is_vcpu_index_valid(int cpu_index);
+
+/**
+ * dirtylimit_state_init:
+ *
+ * initialize global state for dirtylimit
+ */
+void dirtylimit_state_init(int max_cpus);
+
+/**
+ * dirtylimit_vcpu:
+ *
+ * impose dirtylimit on vcpu until it reaches the quota dirtyrate
+ */
+void dirtylimit_vcpu(int cpu_index,
+                     uint64_t quota);
+
+/**
+ * dirtylimit_query_vcpu:
+ *
+ * Returns: dirty page limit information of specified virtual CPU.
+ */
+struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index);
+
+/**
+ * dirtylimit_cancel_vcpu:
+ *
+ * cancel dirtylimit for the specified vcpu
+ *
+ * Returns: the number of running threads for dirtylimit
+ */
+int dirtylimit_cancel_vcpu(int cpu_index);
+
 #endif /* SYSEMU_CPU_THROTTLE_H */
diff --git a/qapi/migration.json b/qapi/migration.json
index bbfd48c..3da8fdf 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1850,6 +1850,28 @@ 
 { 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
 
 ##
+# @DirtyLimitInfo:
+#
+# Dirty page rate limit information of virtual CPU.
+#
+# @cpu-index: index of virtual CPU.
+#
+# @enable: true if the dirty page rate limit is enabled for this virtual CPU.
+#
+# @limit-rate: upper limit of dirty page rate for virtual CPU.
+#
+# @current-rate: current dirty page rate for virtual CPU.
+#
+# Since: 7.0
+#
+##
+{ 'struct': 'DirtyLimitInfo',
+  'data': { 'cpu-index': 'int',
+            'enable': 'bool',
+            'limit-rate': 'int64',
+            'current-rate': 'int64' } }
+
+##
 # @snapshot-save:
 #
 # Save a VM snapshot
diff --git a/softmmu/cpu-throttle.c b/softmmu/cpu-throttle.c
index 8c2144a..ca0f440 100644
--- a/softmmu/cpu-throttle.c
+++ b/softmmu/cpu-throttle.c
@@ -29,6 +29,9 @@ 
 #include "qemu/main-loop.h"
 #include "sysemu/cpus.h"
 #include "sysemu/cpu-throttle.h"
+#include "sysemu/dirtylimit.h"
+#include "qapi/qapi-commands-migration.h"
+#include "trace.h"
 
 /* vcpu throttling controls */
 static QEMUTimer *throttle_timer;
@@ -38,6 +41,358 @@  static unsigned int throttle_percentage;
 #define CPU_THROTTLE_PCT_MAX 99
 #define CPU_THROTTLE_TIMESLICE_NS 10000000
 
+#define DIRTYLIMIT_TOLERANCE_RANGE  15      /* 15MB/s */
+
+#define DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK     75
+#define DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK    90
+
+#define DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE     5
+#define DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE    2
+
+typedef enum {
+    RESTRAIN_KEEP,
+    RESTRAIN_RATIO,
+    RESTRAIN_HEAVY,
+    RESTRAIN_SLIGHT,
+} RestrainPolicy;
+
+typedef struct DirtyLimitState {
+    int cpu_index;
+    bool enabled;
+    uint64_t quota;     /* quota dirtyrate MB/s */
+    QemuThread thread;
+    char *name;         /* thread name */
+} DirtyLimitState;
+
+struct {
+    DirtyLimitState *states;
+    int max_cpus;
+    unsigned long *bmap; /* running thread bitmap */
+    unsigned long nr;
+} *dirtylimit_state;
+
+bool dirtylimit_enabled(int cpu_index)
+{
+    return qatomic_read(&dirtylimit_state->states[cpu_index].enabled);
+}
+
+static bool dirtylimit_is_vcpu_unplug(int cpu_index)
+{
+    CPUState *cpu;
+    CPU_FOREACH(cpu) {
+        if (cpu->cpu_index == cpu_index) {
+            break;
+        }
+    }
+
+    return cpu->unplug;
+}
+
+bool dirtylimit_is_vcpu_index_valid(int cpu_index)
+{
+    if (cpu_index < 0 ||
+        cpu_index >= qatomic_read(&dirtylimit_state->max_cpus) ||
+        dirtylimit_is_vcpu_unplug(cpu_index)) {
+        return false;
+    }
+
+    return true;
+}
+
+static inline void dirtylimit_set_quota(int cpu_index, uint64_t quota)
+{
+    qatomic_set(&dirtylimit_state->states[cpu_index].quota, quota);
+}
+
+static inline uint64_t dirtylimit_quota(int cpu_index)
+{
+    return qatomic_read(&dirtylimit_state->states[cpu_index].quota);
+}
+
+static int64_t dirtylimit_current(int cpu_index)
+{
+    return dirtylimit_calc_current(cpu_index);
+}
+
+static void dirtylimit_vcpu_thread(CPUState *cpu, run_on_cpu_data data)
+{
+    double pct;
+    double throttle_ratio;
+    int64_t sleeptime_ns, endtime_ns;
+    int *percentage = (int *)data.host_ptr;
+
+    pct = (double)(*percentage) / 100;
+    throttle_ratio = pct / (1 - pct);
+    /* Add 1ns to fix double's rounding error (like 0.9999999...) */
+    sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
+    endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
+    while (sleeptime_ns > 0 && !cpu->stop) {
+        if (sleeptime_ns > SCALE_MS) {
+            qemu_cond_timedwait_iothread(cpu->halt_cond,
+                                         sleeptime_ns / SCALE_MS);
+        } else {
+            qemu_mutex_unlock_iothread();
+            g_usleep(sleeptime_ns / SCALE_US);
+            qemu_mutex_lock_iothread();
+        }
+        sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    }
+    qatomic_set(&cpu->throttle_thread_scheduled, 0);
+
+    free(percentage);
+}
+
+static void dirtylimit_check(int cpu_index,
+                             int percentage)
+{
+    CPUState *cpu;
+    int64_t sleeptime_ns, starttime_ms, currenttime_ms;
+    int *pct_parameter;
+    double pct;
+
+    pct = (double) percentage / 100;
+
+    starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+    while (true) {
+        CPU_FOREACH(cpu) {
+            if ((cpu_index == cpu->cpu_index) &&
+                (!qatomic_xchg(&cpu->throttle_thread_scheduled, 1))) {
+                pct_parameter = malloc(sizeof(*pct_parameter));
+                *pct_parameter = percentage;
+                async_run_on_cpu(cpu, dirtylimit_vcpu_thread,
+                                 RUN_ON_CPU_HOST_PTR(pct_parameter));
+                break;
+            }
+        }
+
+        sleeptime_ns = CPU_THROTTLE_TIMESLICE_NS / (1 - pct);
+        g_usleep(sleeptime_ns / SCALE_US);
+
+        currenttime_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        if (unlikely((currenttime_ms - starttime_ms) >
+                     (DIRTYLIMIT_CALC_PERIOD_TIME_S * 1000))) {
+            break;
+        }
+    }
+}
+
+static uint64_t dirtylimit_init_pct(uint64_t quota,
+                                    uint64_t current)
+{
+    uint64_t limit_pct = 0;
+
+    if (quota >= current || (current == 0) ||
+        ((current - quota) <= DIRTYLIMIT_TOLERANCE_RANGE)) {
+        limit_pct = 0;
+    } else {
+        limit_pct = (current - quota) * 100 / current;
+
+        limit_pct = MIN(limit_pct,
+            DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK);
+    }
+
+    return limit_pct;
+}
+
+static RestrainPolicy dirtylimit_policy(unsigned int last_pct,
+                                        uint64_t quota,
+                                        uint64_t current)
+{
+    uint64_t max, min;
+
+    max = MAX(quota, current);
+    min = MIN(quota, current);
+    if ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) {
+        return RESTRAIN_KEEP;
+    }
+    if (last_pct < DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK) {
+        /* last percentage locates in [0, 75)*/
+        return RESTRAIN_RATIO;
+    } else if (last_pct < DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK) {
+        /* last percentage locates in [75, 90)*/
+        return RESTRAIN_HEAVY;
+    } else {
+        /* last percentage locates in [90, 99]*/
+        return RESTRAIN_SLIGHT;
+    }
+}
+
+static uint64_t dirtylimit_pct(unsigned int last_pct,
+                               uint64_t quota,
+                               uint64_t current)
+{
+    uint64_t limit_pct = 0;
+    RestrainPolicy policy;
+    bool mitigate = (quota > current) ? true : false;
+
+    if (mitigate && ((current == 0) ||
+        (last_pct <= DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE))) {
+        return 0;
+    }
+
+    policy = dirtylimit_policy(last_pct, quota, current);
+    switch (policy) {
+    case RESTRAIN_SLIGHT:
+        /* [90, 99] */
+        if (mitigate) {
+            limit_pct =
+                last_pct - DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
+        } else {
+            limit_pct =
+                last_pct + DIRTYLIMIT_THROTTLE_SLIGHT_STEP_SIZE;
+
+            limit_pct = MIN(limit_pct, CPU_THROTTLE_PCT_MAX);
+        }
+       break;
+    case RESTRAIN_HEAVY:
+        /* [75, 90) */
+        if (mitigate) {
+            limit_pct =
+                last_pct - DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
+        } else {
+            limit_pct =
+                last_pct + DIRTYLIMIT_THROTTLE_HEAVY_STEP_SIZE;
+
+            limit_pct = MIN(limit_pct,
+                DIRTYLIMIT_THROTTLE_SLIGHT_WATERMARK);
+        }
+       break;
+    case RESTRAIN_RATIO:
+        /* [0, 75) */
+        if (mitigate) {
+            if (last_pct <= (((quota - current) * 100 / quota))) {
+                limit_pct = 0;
+            } else {
+                limit_pct = last_pct -
+                    ((quota - current) * 100 / quota);
+                limit_pct = MAX(limit_pct, CPU_THROTTLE_PCT_MIN);
+            }
+        } else {
+            limit_pct = last_pct +
+                ((current - quota) * 100 / current);
+
+            limit_pct = MIN(limit_pct,
+                DIRTYLIMIT_THROTTLE_HEAVY_WATERMARK);
+        }
+       break;
+    case RESTRAIN_KEEP:
+    default:
+       limit_pct = last_pct;
+       break;
+    }
+
+    return limit_pct;
+}
+
+static void *dirtylimit_thread(void *opaque)
+{
+    int cpu_index = *(int *)opaque;
+    uint64_t quota_dirtyrate, current_dirtyrate;
+    unsigned int last_pct = 0;
+    unsigned int pct = 0;
+
+    rcu_register_thread();
+
+    quota_dirtyrate = dirtylimit_quota(cpu_index);
+    current_dirtyrate = dirtylimit_current(cpu_index);
+
+    pct = dirtylimit_init_pct(quota_dirtyrate, current_dirtyrate);
+
+    do {
+        trace_dirtylimit_impose(cpu_index,
+            quota_dirtyrate, current_dirtyrate, pct);
+
+        last_pct = pct;
+        if (pct == 0) {
+            sleep(DIRTYLIMIT_CALC_PERIOD_TIME_S);
+        } else {
+            dirtylimit_check(cpu_index, pct);
+        }
+
+        quota_dirtyrate = dirtylimit_quota(cpu_index);
+        current_dirtyrate = dirtylimit_current(cpu_index);
+
+        pct = dirtylimit_pct(last_pct, quota_dirtyrate, current_dirtyrate);
+    } while (dirtylimit_enabled(cpu_index));
+
+    rcu_unregister_thread();
+
+    return NULL;
+}
+
+int dirtylimit_cancel_vcpu(int cpu_index)
+{
+    int i;
+    int nr_threads = 0;
+
+    qatomic_set(&dirtylimit_state->states[cpu_index].enabled, 0);
+    dirtylimit_set_quota(cpu_index, 0);
+
+    bitmap_test_and_clear_atomic(dirtylimit_state->bmap, cpu_index, 1);
+
+    for (i = 0; i < dirtylimit_state->nr; i++) {
+        unsigned long temp = dirtylimit_state->bmap[i];
+        nr_threads += ctpopl(temp);
+    }
+
+   return nr_threads;
+}
+
+void dirtylimit_vcpu(int cpu_index,
+                     uint64_t quota)
+{
+    trace_dirtylimit_vcpu(cpu_index, quota);
+
+    dirtylimit_set_quota(cpu_index, quota);
+
+    if (unlikely(!dirtylimit_enabled(cpu_index))) {
+        qatomic_set(&dirtylimit_state->states[cpu_index].enabled, 1);
+        dirtylimit_state->states[cpu_index].name =
+            g_strdup_printf("dirtylimit-%d", cpu_index);
+        qemu_thread_create(&dirtylimit_state->states[cpu_index].thread,
+            dirtylimit_state->states[cpu_index].name,
+            dirtylimit_thread,
+            (void *)&dirtylimit_state->states[cpu_index].cpu_index,
+            QEMU_THREAD_DETACHED);
+        bitmap_set_atomic(dirtylimit_state->bmap, cpu_index, 1);
+    }
+}
+
+struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
+{
+    DirtyLimitInfo *info = NULL;
+
+    info = g_malloc0(sizeof(*info));
+    info->cpu_index = cpu_index;
+    info->enable = dirtylimit_enabled(cpu_index);
+    info->limit_rate = dirtylimit_quota(cpu_index);
+    info->current_rate = dirtylimit_current(cpu_index);
+
+    return info;
+}
+
+void dirtylimit_state_init(int max_cpus)
+{
+    int i;
+
+    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+    dirtylimit_state->states =
+            g_malloc0(sizeof(DirtyLimitState) * max_cpus);
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_state->states[i].cpu_index = i;
+    }
+
+    dirtylimit_state->max_cpus = max_cpus;
+    dirtylimit_state->bmap = bitmap_new(max_cpus);
+    bitmap_clear(dirtylimit_state->bmap, 0, max_cpus);
+    dirtylimit_state->nr = BITS_TO_LONGS(max_cpus);
+
+    trace_dirtylimit_state_init(max_cpus);
+}
+
 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 {
     double pct;
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 9c88887..a7c9c04 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -31,3 +31,8 @@  runstate_set(int current_state, const char *current_state_str, int new_state, co
 system_wakeup_request(int reason) "reason=%d"
 qemu_system_shutdown_request(int reason) "reason=%d"
 qemu_system_powerdown_request(void) ""
+
+#cpu-throttle.c
+dirtylimit_state_init(int max_cpus) "dirtylimit state init: max cpus %d"
+dirtylimit_impose(int cpu_index, uint64_t quota, uint64_t current, int pct) "CPU[%d] impose dirtylimit: quota %" PRIu64 ", current %" PRIu64 ", percentage %d"
+dirtylimit_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set quota dirtylimit %"PRIu64