@@ -29,6 +29,9 @@
#include "module.h"
#include "qemu-objects.h"
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
@@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, uint64_t *wait);
+
static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
}
#endif
+static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
+{
+ if ((io_limits->bps[0] == 0)
+ && (io_limits->bps[1] == 0)
+ && (io_limits->bps[2] == 0)
+ && (io_limits->iops[0] == 0)
+ && (io_limits->iops[1] == 0)
+ && (io_limits->iops[2] == 0)) {
+ return 0;
+ }
+
+ return 1;
+}
+
/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
@@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
}
}
+static void bdrv_block_timer(void *opaque)
+{
+ BlockDriverState *bs = opaque;
+ BlockQueue *queue = bs->block_queue;
+
+ while (!QTAILQ_EMPTY(&queue->requests)) {
+ BlockIORequest *request;
+ int ret;
+
+ request = QTAILQ_FIRST(&queue->requests);
+ QTAILQ_REMOVE(&queue->requests, request, entry);
+
+ ret = qemu_block_queue_handler(request);
+ if (ret == 0) {
+ QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
+ break;
+ }
+
+ qemu_free(request);
+ }
+}
+
void bdrv_register(BlockDriver *bdrv)
{
if (!bdrv->bdrv_aio_readv) {
@@ -642,6 +688,19 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
}
+ /* throttling disk I/O limits */
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->req_from_queue = false;
+ bs->block_queue = qemu_new_block_queue();
+ bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+ bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
+ bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
+
+ bs->slice_end[0] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
+ bs->slice_end[1] = qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
+ }
+
return 0;
unlink_and_fail:
@@ -680,6 +739,16 @@ void bdrv_close(BlockDriverState *bs)
if (bs->change_cb)
bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
}
+
+ /* throttling disk I/O limits */
+ if (bs->block_queue) {
+ qemu_del_block_queue(bs->block_queue);
+ }
+
+ if (bs->block_timer) {
+ qemu_del_timer(bs->block_timer);
+ qemu_free_timer(bs->block_timer);
+ }
}
void bdrv_close_all(void)
@@ -1312,6 +1381,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
*psecs = bs->secs;
}
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+ BlockIOLimit *io_limits)
+{
+ memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
+ bs->io_limits = *io_limits;
+}
+
/* Recognize floppy formats */
typedef struct FDFormat {
FDriveType drive;
@@ -2111,6 +2188,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
return buf;
}
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, double elapsed_time, uint64_t *wait) {
+ uint64_t bps_limit = 0;
+ double bytes_limit, bytes_disp, bytes_res;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.bps[is_write]) {
+ bps_limit = bs->io_limits.bps[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
+ slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
+ bytes_limit = bps_limit * slice_time;
+ bytes_disp = bs->io_disps.bytes[is_write];
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+ bytes_disp += bs->io_disps.bytes[!is_write];
+ }
+
+ bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (bytes_disp + bytes_res <= bytes_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+ double elapsed_time, uint64_t *wait) {
+ uint64_t iops_limit = 0;
+ double ios_limit, ios_disp;
+ double slice_time, wait_time;
+
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+ } else if (bs->io_limits.iops[is_write]) {
+ iops_limit = bs->io_limits.iops[is_write];
+ } else {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
+ slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
+ ios_limit = iops_limit * slice_time;
+ ios_disp = bs->io_disps.ios[is_write];
+ if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ ios_disp += bs->io_disps.ios[!is_write];
+ }
+
+ if (ios_disp + 1 <= ios_limit) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+ }
+
+ /* Calc approx time to dispatch */
+ wait_time = (ios_disp + 1) / iops_limit;
+ if (wait_time > elapsed_time) {
+ wait_time = wait_time - elapsed_time;
+ } else {
+ wait_time = 0;
+ }
+
+ if (wait) {
+ *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+ }
+
+ return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+ bool is_write, uint64_t *wait) {
+ int64_t real_time, real_slice;
+ uint64_t bps_wait = 0, iops_wait = 0, max_wait;
+ double elapsed_time;
+ int bps_ret, iops_ret;
+
+ real_time = qemu_get_clock_ns(vm_clock);
+ real_slice = bs->slice_end[is_write] - bs->slice_start[is_write];
+ if ((bs->slice_start[is_write] < real_time)
+ && (bs->slice_end[is_write] > real_time)) {
+ bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME;
+ } else {
+ bs->slice_start[is_write] = real_time;
+ bs->slice_end[is_write] = real_time + BLOCK_IO_SLICE_TIME;
+
+ bs->io_disps.bytes[is_write] = 0;
+ bs->io_disps.bytes[!is_write] = 0;
+
+ bs->io_disps.ios[is_write] = 0;
+ bs->io_disps.ios[!is_write] = 0;
+ }
+
+ /* If a limit was exceeded, immediately queue this request */
+ if ((bs->req_from_queue == false)
+ && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
+ if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+ || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+ || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+ if (wait) {
+ *wait = 0;
+ }
+
+ return true;
+ }
+ }
+
+ elapsed_time = real_time - bs->slice_start[is_write];
+ elapsed_time /= (BLOCK_IO_SLICE_TIME * 10.0);
+
+ bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
+ is_write, elapsed_time, &bps_wait);
+ iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+ elapsed_time, &iops_wait);
+ if (bps_ret || iops_ret) {
+ max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+ if (wait) {
+ *wait = max_wait;
+ }
+
+ real_time = qemu_get_clock_ns(vm_clock);
+ if (bs->slice_end[is_write] < real_time + max_wait) {
+ bs->slice_end[is_write] = real_time + max_wait;
+ }
+
+ return true;
+ }
+
+ if (wait) {
+ *wait = 0;
+ }
+
+ return false;
+}
/**************************************************************/
/* async I/Os */
@@ -2121,13 +2357,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
{
BlockDriver *drv = bs->drv;
BlockDriverAIOCB *ret;
+ uint64_t wait_time = 0;
trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
- if (!drv)
- return NULL;
- if (bdrv_check_request(bs, sector_num, nb_sectors))
+ if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->req_from_queue = false;
+ }
return NULL;
+ }
+
+ /* throttling disk read I/O */
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+ ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+ sector_num, qiov, nb_sectors, cb, opaque);
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ bs->req_from_queue = false;
+ return ret;
+ }
+ }
ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
cb, opaque);
@@ -2136,6 +2387,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
/* Update stats even though technically transfer has not happened. */
bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
bs->rd_ops ++;
+
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+ (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+ bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+ }
+ }
+
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->req_from_queue = false;
}
return ret;
@@ -2184,15 +2445,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
BlockDriver *drv = bs->drv;
BlockDriverAIOCB *ret;
BlockCompleteData *blk_cb_data;
+ uint64_t wait_time = 0;
trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
- if (!drv)
- return NULL;
- if (bs->read_only)
- return NULL;
- if (bdrv_check_request(bs, sector_num, nb_sectors))
+ if (!drv || bs->read_only
+ || bdrv_check_request(bs, sector_num, nb_sectors)) {
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->req_from_queue = false;
+ }
+
return NULL;
+ }
if (bs->dirty_bitmap) {
blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2201,6 +2465,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
opaque = blk_cb_data;
}
+ /* throttling disk write I/O */
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+ ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+ sector_num, qiov, nb_sectors, cb, opaque);
+ qemu_mod_timer(bs->block_timer,
+ wait_time + qemu_get_clock_ns(vm_clock));
+ bs->req_from_queue = false;
+ return ret;
+ }
+ }
+
ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
cb, opaque);
@@ -2211,6 +2487,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
bs->wr_highest_sector = sector_num + nb_sectors - 1;
}
+
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+ (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+ bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+ }
+ }
+
+ if (bdrv_io_limits_enable(&bs->io_limits)) {
+ bs->req_from_queue = false;
}
return ret;
@@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
const char *backing_file, const char *backing_fmt);
void bdrv_register(BlockDriver *bdrv);
-
typedef struct BdrvCheckResult {
int corruptions;
int leaks;
@@ -27,10 +27,17 @@
#include "block.h"
#include "qemu-option.h"
#include "qemu-queue.h"
+#include "block/blk-queue.h"
#define BLOCK_FLAG_ENCRYPT 1
#define BLOCK_FLAG_COMPAT6 4
+#define BLOCK_IO_LIMIT_READ 0
+#define BLOCK_IO_LIMIT_WRITE 1
+#define BLOCK_IO_LIMIT_TOTAL 2
+
+#define BLOCK_IO_SLICE_TIME 100000000
+
#define BLOCK_OPT_SIZE "size"
#define BLOCK_OPT_ENCRYPT "encryption"
#define BLOCK_OPT_COMPAT6 "compat6"
@@ -46,6 +53,16 @@ typedef struct AIOPool {
BlockDriverAIOCB *free_aiocb;
} AIOPool;
+typedef struct BlockIOLimit {
+ uint64_t bps[3];
+ uint64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIODisp {
+ uint64_t bytes[2];
+ uint64_t ios[2];
+} BlockIODisp;
+
struct BlockDriver {
const char *format_name;
int instance_size;
@@ -175,6 +192,15 @@ struct BlockDriverState {
void *sync_aiocb;
+ /* the time for latest disk I/O */
+ int64_t slice_start[2];
+ int64_t slice_end[2];
+ BlockIOLimit io_limits;
+ BlockIODisp io_disps;
+ BlockQueue *block_queue;
+ QEMUTimer *block_timer;
+ bool req_from_queue;
+
/* I/O stats (display with "info blockstats"). */
uint64_t rd_bytes;
uint64_t wr_bytes;
@@ -222,6 +248,9 @@ void qemu_aio_release(void *p);
void *qemu_blockalign(BlockDriverState *bs, size_t size);
+void bdrv_set_io_limits(BlockDriverState *bs,
+ BlockIOLimit *io_limits);
+
#ifdef _WIN32
int is_windows_drive(const char *filename);
#endif
Note: 1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario. 2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits. For these problems, if you have nice thought, pls let us know.:) Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> --- block.c | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- block.h | 1 - block_int.h | 29 ++++++ 3 files changed, 323 insertions(+), 9 deletions(-)