From patchwork Tue Sep 6 10:48:20 2011
X-Patchwork-Submitter: Zhi Yong Wu
X-Patchwork-Id: 1125972
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
To: qemu-devel@nongnu.org
Cc: kvm@vger.kernel.org, stefanha@linux.vnet.ibm.com, mtosatti@redhat.com,
    aliguori@us.ibm.com, ryanh@us.ibm.com, zwu.kernel@gmail.com,
    kwolf@redhat.com, pair@us.ibm.com, Zhi Yong Wu
Subject: [PATCH v7 3/4] block: add block timer and throttling algorithm
Date: Tue, 6 Sep 2011 18:48:20 +0800
Message-Id: <1315306101-3129-4-git-send-email-wuzhy@linux.vnet.ibm.com>
X-Mailer: git-send-email 1.7.6
In-Reply-To: <1315306101-3129-1-git-send-email-wuzhy@linux.vnet.ibm.com>
References: <1315306101-3129-1-git-send-email-wuzhy@linux.vnet.ibm.com>
X-Mailing-List: kvm@vger.kernel.org

Note:
1.) When the bps/iops limits are set to a very small value, such as
511 bytes/s, the VM will hang. We are considering how to handle this
scenario.
2.) When a "dd" command is issued in the guest with a large block size
(e.g. "bs=1024K"), the resulting speed will be slightly higher than the
configured limits.

For these problems, if you have any good ideas, please let us know. :)
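To make problem 1 concrete, the sketch below replays the wait calculation
from bdrv_exceed_bps_limits() in isolation (illustrative code only, not part
of the patch; it assumes BLOCK_IO_SLICE_TIME is 100 ms, so that the patch's
"BLOCK_IO_SLICE_TIME * 10" factor converts seconds to nanoseconds). A single
4 KB request under a 511 bytes/s cap already needs a wait of about 8
seconds, i.e. roughly 80 slices, which is why such small limits can stall
the guest:

/* Standalone sketch of the bps wait-time computation (illustrative only). */
#include <stdio.h>
#include <stdint.h>

#define NANOSECONDS_PER_SECOND  1000000000.0
#define BLOCK_IO_SLICE_TIME     100000000ULL   /* assumed value: 100 ms */
#define BDRV_SECTOR_SIZE        512

/* Mirrors the tail of bdrv_exceed_bps_limits(): how long must this
 * request wait until enough bps budget has accumulated? */
static uint64_t bps_wait_ns(uint64_t bps_limit, double bytes_disp,
                            int nb_sectors, double elapsed_time)
{
    double bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
    double wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;

    return wait_time * BLOCK_IO_SLICE_TIME * 10;   /* seconds -> ns */
}

int main(void)
{
    /* One 4 KB request at the start of a fresh slice, bps limit = 511. */
    uint64_t wait = bps_wait_ns(511, 0, 8, 0.0);

    printf("wait: %.2f s\n", wait / NANOSECONDS_PER_SECOND);  /* ~8.02 s */
    return 0;
}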
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 block.c    |  297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 block.h    |    6 +-
 blockdev.c |   69 ++++++++++++++
 3 files changed, 361 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index 17ee3df..1f6f188 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,9 @@
 #include "qemu-objects.h"
 #include "qemu-coroutine.h"
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -72,6 +75,13 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                           QEMUIOVector *iov);
 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+    bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+    double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+    bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -104,6 +114,57 @@ int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    if (bs->block_queue) {
+        qemu_block_queue_flush(bs->block_queue);
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
+
+    bs->slice_start = 0;
+
+    bs->slice_end = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BlockQueue *queue = bs->block_queue;
+
+    qemu_block_queue_flush(queue);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+    bs->block_queue = qemu_new_block_queue();
+    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+    bs->slice_start = qemu_get_clock_ns(vm_clock);
+
+    bs->slice_end = bs->slice_start + BLOCK_IO_SLICE_TIME;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+    BlockIOLimit *io_limits = &bs->io_limits;
+    return io_limits->bps[BLOCK_IO_LIMIT_READ]
+           || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+           || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+           || io_limits->iops[BLOCK_IO_LIMIT_READ]
+           || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
 /* check if the path starts with ":" */
 static int path_has_protocol(const char *path)
 {
@@ -694,6 +755,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
         bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -732,6 +798,18 @@ void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
 }
 
 void bdrv_close_all(void)
@@ -2290,16 +2368,40 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
-    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                    wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
+    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                               cb, opaque);
+    if (ret) {
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    return ret;
 }
 
 typedef struct BlockCompleteData {
@@ -2345,15 +2447,14 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2362,13 +2463,32 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                    wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
-
     if (ret) {
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
     }
 
     return ret;
@@ -2633,6 +2753,163 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+    bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double bytes_limit, bytes_disp, bytes_res;
+    double slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_disp = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+    double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double ios_limit, ios_disp;
+    double slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit = iops_limit * slice_time;
+    ios_disp = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+    bool is_write, int64_t *wait) {
+    int64_t now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double elapsed_time;
+    int bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+    } else {
+        bs->slice_start = now;
+        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+
+        bs->io_disps.bytes[is_write] = 0;
+        bs->io_disps.bytes[!is_write] = 0;
+
+        bs->io_disps.ios[is_write] = 0;
+        bs->io_disps.ios[!is_write] = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if (qemu_block_queue_has_pending(bs->block_queue)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = -1;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time = now - bs->slice_start;
+    elapsed_time /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                     is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                     elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index 3ac0b94..10d2828 100644
--- a/block.h
+++ b/block.h
@@ -58,6 +58,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
@@ -102,7 +107,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
diff --git a/blockdev.c b/blockdev.c
index 619ae9f..7f5c4df 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -747,6 +747,75 @@ int do_change_block(Monitor *mon, const char *device,
     return monitor_read_bdrv_key_start(mon, bs, NULL, NULL);
 }
 
+/* throttling disk I/O limits */
+int do_block_set_io_throttle(Monitor *mon,
+                             const QDict *qdict, QObject **ret_data)
+{
+    const char *devname = qdict_get_str(qdict, "device");
+    uint64_t bps = qdict_get_try_int(qdict, "bps", -1);
+    uint64_t bps_rd = qdict_get_try_int(qdict, "bps_rd", -1);
+    uint64_t bps_wr = qdict_get_try_int(qdict, "bps_wr", -1);
+    uint64_t iops = qdict_get_try_int(qdict, "iops", -1);
+    uint64_t iops_rd = qdict_get_try_int(qdict, "iops_rd", -1);
+    uint64_t iops_wr = qdict_get_try_int(qdict, "iops_wr", -1);
+    BlockDriverState *bs;
+
+    bs = bdrv_find(devname);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, devname);
+        return -1;
+    }
+
+    if ((bps == -1) && (bps_rd == -1) && (bps_wr == -1)
+        && (iops == -1) && (iops_rd == -1) && (iops_wr == -1)) {
+        qerror_report(QERR_MISSING_PARAMETER,
+                      "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr");
+        return -1;
+    }
+
+    if (((bps != -1) && ((bps_rd != -1) || (bps_wr != -1)))
+        || ((iops != -1) && ((iops_rd != -1) || (iops_wr != -1)))) {
+        qerror_report(QERR_INVALID_PARAMETER_COMBINATION);
+        return -1;
+    }
+
+    if (bps != -1) {
+        bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_READ] = 0;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] = 0;
+    }
+
+    if ((bps_rd != -1) || (bps_wr != -1)) {
+        bs->io_limits.bps[BLOCK_IO_LIMIT_READ] =
+            (bps_rd == -1) ? bs->io_limits.bps[BLOCK_IO_LIMIT_READ] : bps_rd;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] =
+            (bps_wr == -1) ? bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] : bps_wr;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = 0;
+    }
+
+    if (iops != -1) {
+        bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = iops;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_READ] = 0;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] = 0;
+    }
+
+    if ((iops_rd != -1) || (iops_wr != -1)) {
+        bs->io_limits.iops[BLOCK_IO_LIMIT_READ] =
+            (iops_rd == -1) ? bs->io_limits.iops[BLOCK_IO_LIMIT_READ] : iops_rd;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+            (iops_wr == -1) ? bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] : iops_wr;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 0;
+    }
+
+    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_enable(bs);
+    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_disable(bs);
+    }
+
+    return 0;
+}
+
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");
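
As a sanity check of the slice accounting in bdrv_exceed_io_limits() above,
the toy model below (illustrative code only, not QEMU code; it again assumes
BLOCK_IO_SLICE_TIME = 100 ms and ignores the block queue, the timer and the
iops path) issues back-to-back 4 KB requests against a 1 MB/s total bps cap
and advances a fake clock by the computed wait whenever the slice budget is
exhausted. Over 10 simulated seconds it reports an effective rate just below
the 1024 KB/s cap, which is the intended steady-state behaviour:

/* Toy simulation of the slice-based bps throttling (illustrative only). */
#include <stdio.h>
#include <stdint.h>

#define NS_PER_SEC  1000000000.0
#define SLICE_NS    100000000LL        /* assumed BLOCK_IO_SLICE_TIME */

int main(void)
{
    const double bps_limit = 1024 * 1024;   /* 1 MB/s total cap */
    const double req_bytes = 8 * 512;       /* 4 KB per request */
    int64_t now = 0, slice_start = 0, slice_end = SLICE_NS;
    double bytes_disp = 0, total = 0;

    while (now < (int64_t)(10 * NS_PER_SEC)) {
        if (now < slice_start || now >= slice_end) {
            slice_start = now;               /* new slice: reset counter */
            slice_end = now + SLICE_NS;
            bytes_disp = 0;
        } else {
            slice_end = now + SLICE_NS;      /* extend the live slice */
        }

        double slice_time = (slice_end - slice_start) / NS_PER_SEC;
        double bytes_limit = bps_limit * slice_time;

        if (bytes_disp + req_bytes > bytes_limit) {
            /* Over budget: wait as bdrv_exceed_bps_limits() would. */
            double elapsed = (now - slice_start) / NS_PER_SEC;
            double wait = (bytes_disp + req_bytes) / bps_limit - elapsed;
            now += (int64_t)(wait * NS_PER_SEC);
            continue;
        }

        bytes_disp += req_bytes;             /* dispatch and account */
        total += req_bytes;
    }

    printf("effective rate: %.0f KB/s (cap: 1024 KB/s)\n",
           total / 10 / 1024);
    return 0;
}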