
kvm tools: Use mmap for working with disk image V3

Message ID 1302351545-5214-1-git-send-email-levinsasha928@gmail.com (mailing list archive)
State New, archived

Commit Message

Sasha Levin April 9, 2011, 12:19 p.m. UTC
Attempt to use mmap first for working with a disk image; if the attempt fails (for example, a large image on a 32-bit system), fall back to using read/write.
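
In outline, the approach is: map the whole image up front, remember whether that worked, and use read/write only when it did not. A minimal sketch of that pattern follows (not the patch code: map_or_null() and image_read() are made-up names, the fallback uses pread() for brevity, and MAP_SHARED is used here so that writes reach the file; the flags used by the actual patch are discussed further down the thread):

/*
 * Illustrative only -- not the patch code.  Try mmap() first and remember
 * whether it worked; readers fall back to pread() when it did not.
 */
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void *map_or_null(int fd, uint64_t size)
{
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	return (p == MAP_FAILED) ? NULL : p;
}

static int image_read(int fd, void *map, uint64_t offset, void *dst, size_t len)
{
	if (map) {
		/* mmap fast path: the image is just memory */
		memcpy(dst, (char *)map + offset, len);
		return 0;
	}
	/* fallback path for images that could not be mapped */
	return pread(fd, dst, len, (off_t)offset) == (ssize_t)len ? 0 : -1;
}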

Performance (kB/s) test using bonnie++ showed the following improvement:

kvm cmdline: ./kvm run --mem=256 --image=./work/vms/gentoo.img --kernel=/boot/bzImage-git
bonnie++ cmdline: bonnie++ -u 0

Before:
Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
Concurrency   1     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
tux            480M   673 100 308017  61 288156  49  3286  99 892186  76 +++++ +++
Latency             12998us   50992us   35993us    3000us    1999us     201ms
Version  1.96       ------Sequential Create------ --------Random Create--------
tux                 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
Latency              3000us    1000us    1000us    1000us    1998us    1000us

After:
Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
Concurrency   1     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
tux            480M   672  99 415549  45 310350  52  3495  99 910360  83 +++++ +++
Latency             12998us    7998us   23996us    3998us    1000us     116ms
Version  1.96       ------Sequential Create------ --------Random Create--------
tux                 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
              files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
                 16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
Latency              1000us    1000us    1000us    2000us    1000us    1999us

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/disk-image.c             |   54 +++++++++++++++++++++++++++++------
 tools/kvm/include/kvm/disk-image.h |    2 +-
 2 files changed, 45 insertions(+), 11 deletions(-)

Comments

Pekka Enberg April 9, 2011, 12:38 p.m. UTC | #1
On Sat, 9 Apr 2011, Sasha Levin wrote:
> +struct disk_image *disk_image__new(int fd, uint64_t size)
> +{
> +	struct disk_image *self;
> +
> +	self		= malloc(sizeof *self);
> +	if (!self)
> +		return NULL;
> +
> +	self->fd	= fd;
> +	self->size	= size;
> +	self->priv	= mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE, fd, 0);

This needs to be MAP_SHARED to preserve the same semantics. Does it still 
show a performance improvement if you change it?
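
The semantic difference is easy to see with a throwaway test program (a sketch, not anything from the patch; the disk.img path is hypothetical and the assert only holds if byte 0 of the file is not already 'X'): a store through a MAP_PRIVATE mapping lands on a copy-on-write page and never reaches the image file, so guest writes would be lost when the process exits, while MAP_SHARED makes the store visible through the file descriptor and eventually on disk.

#include <assert.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("disk.img", O_RDWR);	/* hypothetical test image */
	char *priv, byte;

	if (fd < 0)
		return 1;

	priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (priv == MAP_FAILED)
		return 1;

	priv[0] = 'X';			/* copy-on-write: the file is untouched */
	pread(fd, &byte, 1, 0);
	assert(byte != 'X');		/* holds as long as byte 0 wasn't 'X' already */

	/* with MAP_SHARED instead, the store above would reach the file */
	munmap(priv, 4096);
	close(fd);
	return 0;
}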

 			Pekka
Anthony Liguori April 9, 2011, 1:26 p.m. UTC | #2
On 04/09/2011 07:19 AM, Sasha Levin wrote:
> Attempt to use mmap first for working with a disk image; if the attempt fails (for example, a large image on a 32-bit system), fall back to using read/write.

I think you're likely seeing a secondary effect.

Right now, you issue synchronous I/O on the VCPU exit path.  Blocking 
the VCPU to do I/O not only slows down I/O (since you're only handling 
one request at a time) but it has secondary effects on the guest since 
normally the guest can do useful work while waiting for I/O to complete.

You really want to defer I/O to a thread pool to allow the VCPU to 
continue execution while the I/O is being processed.

In QEMU, we use ioeventfd() with the virtio PCI notify register, which 
allows us to avoid heavyweight exits in this path.  We handle the 
ioeventfd in a single thread and then dispatch the I/O to the thread pool.

There might be a clever way to avoid that extra context switch.  It may 
involve doing something like X = read(eventfd); write(eventfd, X - 1), 
though that seems pretty gnarly to me.
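
As a rough sketch of the deferral being suggested, reduced to a single worker thread rather than a full thread pool (io_efd, struct io_req, the single-slot queue, and all function names are invented for illustration, not QEMU or kvm tools code): the VCPU path only signals an eventfd and returns to the guest, and the worker does the blocking pwrite().

#include <pthread.h>
#include <sys/eventfd.h>
#include <unistd.h>

struct io_req {				/* hypothetical request layout */
	int	fd;
	void	*buf;
	size_t	len;
	off_t	offset;
};

static int		io_efd;		/* eventfd the VCPU thread kicks */
static struct io_req	pending;	/* single-slot "queue" for brevity */

static void *io_worker(void *arg)
{
	eventfd_t n;

	for (;;) {
		eventfd_read(io_efd, &n);	/* sleep until the VCPU kicks us */
		pwrite(pending.fd, pending.buf, pending.len, pending.offset);
		/* ...mark the virtio request complete, inject an interrupt... */
	}
	return NULL;
}

static int io_init(void)
{
	pthread_t t;

	io_efd = eventfd(0, 0);
	return pthread_create(&t, NULL, io_worker, NULL);
}

static void vcpu_submit(struct io_req *req)
{
	pending = *req;			/* real code needs a proper queue + locking */
	eventfd_write(io_efd, 1);	/* cheap kick; the VCPU keeps running */
}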

Regards,

Anthony Liguori

> Performance (kB/s) test using bonnie++ showed the following improvement:
>
> kvm cmdline: ./kvm run --mem=256 --image=./work/vms/gentoo.img --kernel=/boot/bzImage-git
> bonnie++ cmdline: bonnie++ -u 0
>
> Before:
> Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
> Concurrency   1     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> tux            480M   673 100 308017  61 288156  49  3286  99 892186  76 +++++ +++
> Latency             12998us   50992us   35993us    3000us    1999us     201ms
> Version  1.96       ------Sequential Create------ --------Random Create--------
> tux                 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
>                files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
>                   16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
> Latency              3000us    1000us    1000us    1000us    1998us    1000us
>
> After:
> Version  1.96       ------Sequential Output------ --Sequential Input- --Random-
> Concurrency   1     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block-- --Seeks--
> Machine        Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP  /sec %CP
> tux            480M   672  99 415549  45 310350  52  3495  99 910360  83 +++++ +++
> Latency             12998us    7998us   23996us    3998us    1000us     116ms
> Version  1.96       ------Sequential Create------ --------Random Create--------
> tux                 -Create-- --Read--- -Delete-- -Create-- --Read--- -Delete--
>                files  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP  /sec %CP
>                   16 +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++ +++++ +++
> Latency              1000us    1000us    1000us    2000us    1000us    1999us
>
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
>   tools/kvm/disk-image.c             |   54 +++++++++++++++++++++++++++++------
>   tools/kvm/include/kvm/disk-image.h |    2 +-
>   2 files changed, 45 insertions(+), 11 deletions(-)
>
> diff --git a/tools/kvm/disk-image.c b/tools/kvm/disk-image.c
> index 9deaf45..dbae1e4 100644
> --- a/tools/kvm/disk-image.c
> +++ b/tools/kvm/disk-image.c
> @@ -13,21 +13,31 @@
>   #include <unistd.h>
>   #include <fcntl.h>
>
> -struct disk_image *disk_image__new(int fd, uint64_t size, struct disk_image_operations *ops)
> +static int raw_image__read_sector_mmap(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)
>   {
> -	struct disk_image *self;
> +	uint64_t offset = sector << SECTOR_SHIFT;
>
> -	self		= malloc(sizeof *self);
> -	if (!self)
> -		return NULL;
> +	if (offset + dst_len > self->size)
> +		return -1;
>
> -	self->fd	= fd;
> -	self->size	= size;
> -	self->ops	= ops;
> +	memcpy(dst, self->priv + offset, dst_len);
>
> -	return self;
> +	return 0;
>   }
>
> +static int raw_image__write_sector_mmap(struct disk_image *self, uint64_t sector, void *src, uint32_t src_len)
> +{
> +	uint64_t offset = sector << SECTOR_SHIFT;
> +
> +	if (offset + src_len > self->size)
> +		return -1;
> +
> +	memcpy(self->priv + offset, src, src_len);
> +
> +	return 0;
> +}
> +
> +
>   static int raw_image__read_sector(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)
>   {
>   	uint64_t offset = sector << SECTOR_SHIFT;
> @@ -59,6 +69,27 @@ static struct disk_image_operations raw_image_ops = {
>   	.write_sector		= raw_image__write_sector,
>   };
>
> +static struct disk_image_operations raw_image_mmap_ops = {
> +	.read_sector		= raw_image__read_sector_mmap,
> +	.write_sector		= raw_image__write_sector_mmap,
> +};
> +
> +struct disk_image *disk_image__new(int fd, uint64_t size)
> +{
> +	struct disk_image *self;
> +
> +	self		= malloc(sizeof *self);
> +	if (!self)
> +		return NULL;
> +
> +	self->fd	= fd;
> +	self->size	= size;
> +	self->priv	= mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
> +	self->ops 	= (self->priv == MAP_FAILED) ? &raw_image_ops : &raw_image_mmap_ops;
> +	
> +	return self;
> +}
> +
>   static struct disk_image *raw_image__probe(int fd)
>   {
>   	struct stat st;
> @@ -66,7 +97,7 @@ static struct disk_image *raw_image__probe(int fd)
>   	if (fstat(fd, &st) < 0)
>   		return NULL;
>
> -	return disk_image__new(fd, st.st_size, &raw_image_ops);
> +	return disk_image__new(fd, st.st_size);
>   }
>
>   struct disk_image *disk_image__open(const char *filename)
> @@ -97,6 +128,9 @@ void disk_image__close(struct disk_image *self)
>   	if (self->ops->close)
>   		self->ops->close(self);
>
> +	if (self->priv != MAP_FAILED)
> +		munmap(self->priv, self->size);
> +
>   	if (close(self->fd) < 0)
>   		warning("close() failed");
>
> diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
> index df0a15d..8b78657 100644
> --- a/tools/kvm/include/kvm/disk-image.h
> +++ b/tools/kvm/include/kvm/disk-image.h
> @@ -22,7 +22,7 @@ struct disk_image {
>   };
>
>   struct disk_image *disk_image__open(const char *filename);
> -struct disk_image *disk_image__new(int fd, uint64_t size, struct disk_image_operations *ops);
> +struct disk_image *disk_image__new(int fd, uint64_t size);
>   void disk_image__close(struct disk_image *self);
>
>   static inline int disk_image__read_sector(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)

Pekka Enberg April 9, 2011, 1:34 p.m. UTC | #3
On Sat, Apr 9, 2011 at 4:26 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Right now, you issue synchronous I/O on the VCPU exit path.  Blocking the
> VCPU to do I/O not only slows down I/O (since you're only handling one
> request at a time) but it has secondary effects on the guest since normally
> the guest can do useful work while waiting for I/O to complete.
>
> You really want to defer I/O to a thread pool to allow the VCPU to continue
> execution while the I/O is being processed.

Yup, we definitely need to do that.

                        Pekka

Patch

diff --git a/tools/kvm/disk-image.c b/tools/kvm/disk-image.c
index 9deaf45..dbae1e4 100644
--- a/tools/kvm/disk-image.c
+++ b/tools/kvm/disk-image.c
@@ -13,21 +13,31 @@ 
 #include <unistd.h>
 #include <fcntl.h>
 
-struct disk_image *disk_image__new(int fd, uint64_t size, struct disk_image_operations *ops)
+static int raw_image__read_sector_mmap(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)
 {
-	struct disk_image *self;
+	uint64_t offset = sector << SECTOR_SHIFT;
 
-	self		= malloc(sizeof *self);
-	if (!self)
-		return NULL;
+	if (offset + dst_len > self->size)
+		return -1;
 
-	self->fd	= fd;
-	self->size	= size;
-	self->ops	= ops;
+	memcpy(dst, self->priv + offset, dst_len);
 
-	return self;
+	return 0;
 }
 
+static int raw_image__write_sector_mmap(struct disk_image *self, uint64_t sector, void *src, uint32_t src_len)
+{
+	uint64_t offset = sector << SECTOR_SHIFT;
+
+	if (offset + src_len > self->size)
+		return -1;
+
+	memcpy(self->priv + offset, src, src_len);
+
+	return 0;
+}
+
+
 static int raw_image__read_sector(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)
 {
 	uint64_t offset = sector << SECTOR_SHIFT;
@@ -59,6 +69,27 @@  static struct disk_image_operations raw_image_ops = {
 	.write_sector		= raw_image__write_sector,
 };
 
+static struct disk_image_operations raw_image_mmap_ops = {
+	.read_sector		= raw_image__read_sector_mmap,
+	.write_sector		= raw_image__write_sector_mmap,
+};
+
+struct disk_image *disk_image__new(int fd, uint64_t size)
+{
+	struct disk_image *self;
+
+	self		= malloc(sizeof *self);
+	if (!self)
+		return NULL;
+
+	self->fd	= fd;
+	self->size	= size;
+	self->priv	= mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE, fd, 0);
+	self->ops 	= (self->priv == MAP_FAILED) ? &raw_image_ops : &raw_image_mmap_ops;
+	
+	return self;
+}
+
 static struct disk_image *raw_image__probe(int fd)
 {
 	struct stat st;
@@ -66,7 +97,7 @@  static struct disk_image *raw_image__probe(int fd)
 	if (fstat(fd, &st) < 0)
 		return NULL;
 
-	return disk_image__new(fd, st.st_size, &raw_image_ops);
+	return disk_image__new(fd, st.st_size);
 }
 
 struct disk_image *disk_image__open(const char *filename)
@@ -97,6 +128,9 @@  void disk_image__close(struct disk_image *self)
 	if (self->ops->close)
 		self->ops->close(self);
 
+	if (self->priv != MAP_FAILED)
+		munmap(self->priv, self->size);
+
 	if (close(self->fd) < 0)
 		warning("close() failed");
 
diff --git a/tools/kvm/include/kvm/disk-image.h b/tools/kvm/include/kvm/disk-image.h
index df0a15d..8b78657 100644
--- a/tools/kvm/include/kvm/disk-image.h
+++ b/tools/kvm/include/kvm/disk-image.h
@@ -22,7 +22,7 @@  struct disk_image {
 };
 
 struct disk_image *disk_image__open(const char *filename);
-struct disk_image *disk_image__new(int fd, uint64_t size, struct disk_image_operations *ops);
+struct disk_image *disk_image__new(int fd, uint64_t size);
 void disk_image__close(struct disk_image *self);
 
 static inline int disk_image__read_sector(struct disk_image *self, uint64_t sector, void *dst, uint32_t dst_len)