From patchwork Wed May 18 10:17:17 2011
X-Patchwork-Submitter: Prasad Joshi <prasadjoshi124@gmail.com>
X-Patchwork-Id: 793332
From: Prasad Joshi <prasadjoshi124@gmail.com>
To: prasadjoshi124@gmail.com
Cc: mingo@elte.hu, kvm@vger.kernel.org, penberg@kernel.org, asias.hejun@gmail.com, gorcunov@gmail.com, levinsasha928@gmail.com, chaitanyakulkarni15@gmail.com, ashwini.kulkarni@gmail.com
Subject: [PATCH v1 3/3] kvm tools: Add QCOW level2 caching support
Date: Wed, 18 May 2011 11:17:17 +0100
Message-Id: <1305713837-18889-3-git-send-email-prasadjoshi124@gmail.com>
In-Reply-To: <1305713837-18889-1-git-send-email-prasadjoshi124@gmail.com>
References: <1305713837-18889-1-git-send-email-prasadjoshi124@gmail.com>
X-Mailing-List: kvm@vger.kernel.org

QCOW uses two kinds of tables: a level 1 (L1) table and level 2 (L2) tables. Each L1 table entry points to the offset of an L2 table. When a QCOW image is probed, the L1 table is cached in memory to avoid reading it from disk on every reference; this caching improves performance. A similar performance improvement can be observed when L2 tables are also cached. Caching all of the L2 tables is impossible because of memory constraints, so this patch adds caching for up to 128 L2 tables. It uses a combination of an RB tree and a linked list to manage the cached tables: the linked list provides a simple LRU structure, while the RB tree improves search time during read/write operations.
Signed-off-by: Prasad Joshi <prasadjoshi124@gmail.com>
---
 tools/kvm/include/kvm/qcow.h |   18 ++++
 tools/kvm/qcow.c             |  230 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 230 insertions(+), 18 deletions(-)

diff --git a/tools/kvm/include/kvm/qcow.h b/tools/kvm/include/kvm/qcow.h
index b6e7493..095566a 100644
--- a/tools/kvm/include/kvm/qcow.h
+++ b/tools/kvm/include/kvm/qcow.h
@@ -3,6 +3,8 @@
 #include
 #include
+#include <linux/rbtree.h>
+#include <linux/list.h>

 #define QCOW_MAGIC		(('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)

@@ -17,6 +19,16 @@
 #define QCOW2_OFLAG_COMPRESSED	(1LL << 62)
 #define QCOW2_OFLAG_MASK	(QCOW2_OFLAG_COPIED|QCOW2_OFLAG_COMPRESSED)

+#define MAX_CACHE_NODES		128
+
+struct qcow_l2_cache {
+	u64			offset;
+	u64			*table;
+	struct rb_node		node;
+	struct list_head	list;
+	struct qcow		*qcow;
+};
+
 struct qcow_table {
 	u32	table_size;
 	u64	*l1_table;
@@ -26,6 +38,12 @@ struct qcow {
 	void	*header;
 	struct	qcow_table table;
 	int	fd;
+
+	/* Level2 caching data structures */
+	struct qcow_l2_cache	cache;
+	struct rb_root		root;
+	struct list_head	lru_list;
+	int			no_cached;
 };

 struct qcow_header {
diff --git a/tools/kvm/qcow.c b/tools/kvm/qcow.c
index bb2345c..ca61374 100644
--- a/tools/kvm/qcow.c
+++ b/tools/kvm/qcow.c
@@ -16,6 +16,153 @@
 #include
 #include

+static inline int insert(struct rb_root *root, struct qcow_l2_cache *new)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u64 offset = new->offset;
+
+	/* search the tree */
+	while (*link) {
+		struct qcow_l2_cache *t;
+
+		t = rb_entry(*link, struct qcow_l2_cache, node);
+		if (!t)
+			goto error;
+
+		parent = *link;
+
+		if (t->offset > offset)
+			link = &(*link)->rb_left;
+		else if (t->offset < offset)
+			link = &(*link)->rb_right;
+		else
+			goto out;
+	}
+
+	/* add new node */
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, root);
+out:
+	return 0;
+error:
+	return -1;
+}
+
+static inline struct qcow_l2_cache *search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *link = root->rb_node;
+
+	while (link) {
+		struct qcow_l2_cache *t;
+
+		t = rb_entry(link, struct qcow_l2_cache, node);
+		if (!t)
+			goto out;
+
+		if (t->offset > offset)
+			link = link->rb_left;
+		else if (t->offset < offset)
+			link = link->rb_right;
+		else
+			return t;
+	}
+out:
+	return NULL;
+}
+
+static inline void free_cache(struct qcow *q)
+{
+	struct list_head *pos, *n;
+	struct qcow_l2_cache *t;
+	struct rb_root *r = &q->root;
+
+	list_for_each_safe(pos, n, &q->lru_list) {
+		/* Remove cache table from the list and RB tree */
+		list_del(pos);
+		t = list_entry(pos, struct qcow_l2_cache, list);
+		rb_erase(&t->node, r);
+
+		/* Free the cached node */
+		free(t->table);
+		free(t);
+	}
+}
+
+static inline int cache_table(struct qcow *q, u64 *table, u64 offset)
+{
+	struct qcow_l2_cache *n;
+	struct rb_root *r = &q->root;
+	struct qcow_l2_cache *lru;
+
+	n = search(r, offset);
+	if (n) {
+		/* Update the LRU state */
+		list_del_init(&n->list);
+		list_add_tail(&n->list, &q->lru_list);
+		return 0;
+	}
+
+	lru = NULL;
+	if (q->no_cached == MAX_CACHE_NODES) {
+		/* Find the node for LRU replacement */
+		lru = list_first_entry(&q->lru_list, struct qcow_l2_cache, list);
+	}
+
+	n = calloc(1, sizeof(struct qcow_l2_cache));
+	if (!n)
+		goto error;
+
+	n->offset = offset;
+	n->table  = table;
+	n->qcow   = q;
+	RB_CLEAR_NODE(&n->node);
+	INIT_LIST_HEAD(&n->list);
+
+	if (lru) {
+		/* LRU Replacement */
+		rb_replace_node(&lru->node, &n->node, r);
+
+		/*
+		 * The new node should be the last node in the LRU list;
+		 * therefore, do not list_replace, instead delete the node
+		 * to be replaced and add the new node at the end of the
+		 * list.
+		 */
+		list_del_init(&lru->list);
+		list_add_tail(&n->list, &q->lru_list);
+
+		/* Free the LRUed node */
+		free(lru->table);
+		free(lru);
+	} else {
+		/* Add in RB Tree: Helps in searching faster */
+		if (insert(r, n) < 0)
+			goto error;
+
+		/* Add in LRU replacement list */
+		list_add_tail(&n->list, &q->lru_list);
+		q->no_cached++;
+	}
+
+	return 0;
+error:
+	free(n);
+	return -1;
+}
+
+static inline int search_table(struct qcow *q, u64 **table, u64 offset)
+{
+	struct qcow_l2_cache *c;
+
+	*table = NULL;
+
+	c = search(&q->root, offset);
+	if (!c)
+		return -1;
+
+	*table = c->table;
+	return 0;
+}
+
 static inline u64 get_l1_index(struct qcow *q, u64 offset)
 {
 	struct qcow_header *header = q->header;
@@ -37,6 +184,43 @@ static inline u64 get_cluster_offset(struct qcow *q, u64 offset)
 	return offset & ((1 << header->cluster_bits)-1);
 }

+static int qcow1_read_l2_table(struct qcow *q, u64 **table, u64 offset)
+{
+	struct qcow_header *header = q->header;
+	u64 size;
+	u64 i;
+	u64 *t;
+
+	*table = NULL;
+	size   = 1 << header->l2_bits;
+
+	/* search an entry for offset in cache */
+	if (search_table(q, table, offset) >= 0)
+		return 0;
+
+	t = calloc(size, sizeof(u64));
+	if (!t)
+		goto error;
+
+	/* table not cached: read from the disk */
+	if (pread_in_full(q->fd, t, size * sizeof(u64), offset) < 0)
+		goto error;
+
+	/* cache the table */
+	if (cache_table(q, t, offset) < 0)
+		goto error;
+
+	/* change cached table to CPU's byte-order */
+	for (i = 0; i < size; i++)
+		be64_to_cpus(&t[i]);
+
+	*table = t;
+	return 0;
+error:
+	free(t);
+	return -1;
+}
+
 static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
 {
 	struct qcow_header *header = q->header;
@@ -51,8 +235,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
 	u64 l1_idx;
 	u64 l2_idx;

-	l2_table = NULL;
-
 	cluster_size = 1 << header->cluster_bits;

 	l1_idx = get_l1_index(q, offset);
@@ -72,18 +254,16 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
 		goto zero_cluster;

 	l2_table_size = 1 << header->l2_bits;
-	l2_table = calloc(l2_table_size, sizeof(u64));
-	if (!l2_table)
-		goto out_error;

-	if (pread_in_full(q->fd, l2_table, l2_table_size * sizeof(u64), l2_table_offset) < 0)
+	/* read and cache level 2 table */
+	if (qcow1_read_l2_table(q, &l2_table, l2_table_offset) < 0)
 		goto out_error;

 	l2_idx = get_l2_index(q, offset);
 	if (l2_idx >= l2_table_size)
 		goto out_error;

-	clust_start = be64_to_cpu(l2_table[l2_idx]) & ~header->oflag_mask;
+	clust_start = l2_table[l2_idx] & ~header->oflag_mask;
 	if (!clust_start)
 		goto zero_cluster;

@@ -91,7 +271,6 @@ static ssize_t qcow1_read_cluster(struct qcow *q, u64 offset, void *dst, u32 dst_len)
 		goto out_error;

 out:
-	free(l2_table);
 	return length;

 zero_cluster:
@@ -221,15 +400,16 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
 	if (len > src_len)
 		len = src_len;

-	l2t = calloc(l2t_sz, sizeof(u64));
-	if (!l2t)
-		goto error;
-
 	l2t_off = table->l1_table[l1t_idx] & ~header->oflag_mask;
 	if (l2t_off) {
-		if (pread_in_full(q->fd, l2t, l2t_sz * sizeof(u64), l2t_off) < 0)
-			goto free_l2;
+		/* cache level2 table */
+		if (qcow1_read_l2_table(q, &l2t, l2t_off) < 0)
+			goto error;
 	} else {
+		l2t = calloc(l2t_sz, sizeof(u64));
+		if (!l2t)
+			goto error;
+
 		/* Capture the state of the consistent QCOW image */
 		f_sz = file_size(q->fd);
 		if (!f_sz)
@@ -251,6 +431,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
 			goto free_l2;
 	}

+		if (cache_table(q, l2t, l2t_off) < 0) {
+			if (ftruncate(q->fd, f_sz) < 0)
+				goto free_l2;
+
+			goto free_l2;
+		}
+
 		/* Update the in-core entry */
 		table->l1_table[l1t_idx] = l2t_off;
 	}
@@ -258,17 +445,15 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
 	/* Capture the state of the consistent QCOW image */
 	f_sz = file_size(q->fd);
 	if (!f_sz)
-		goto free_l2;
+		goto error;

-	clust_start = be64_to_cpu(l2t[l2t_idx]) & ~header->oflag_mask;
+	clust_start = l2t[l2t_idx] & ~header->oflag_mask;
 	if (!clust_start) {
 		clust_start = ALIGN(f_sz, clust_sz);
 		update_meta = true;
 	} else
 		update_meta = false;

-	free(l2t);
-
 	/* Write actual data */
 	if (pwrite_in_full(q->fd, buf, len, clust_start + clust_off) < 0)
 		goto error;
@@ -282,9 +467,13 @@ static ssize_t qcow1_write_cluster(struct qcow *q, u64 offset, void *buf, u32 src_len)
 			goto error;
 		}
+
+		/* Update the cached level2 entry */
+		l2t[l2t_idx] = clust_start;
 	}

 	return len;
+
 free_l2:
 	free(l2t);
 error:
@@ -335,6 +524,7 @@ static void qcow1_disk_close(struct disk_image *disk)

 	q = disk->priv;

+	free_cache(q);
 	free(q->table.l1_table);
 	free(q->header);
 	free(q);
@@ -425,6 +615,8 @@ static struct disk_image *qcow2_probe(int fd, bool readonly)
 		goto error;

 	q->fd = fd;
+	q->root = RB_ROOT;
+	INIT_LIST_HEAD(&q->lru_list);

 	h = q->header = qcow2_read_header(fd);
 	if (!h)
@@ -519,6 +711,8 @@ static struct disk_image *qcow1_probe(int fd, bool readonly)
 		goto error;

 	q->fd = fd;
+	q->root = RB_ROOT;
+	INIT_LIST_HEAD(&q->lru_list);

 	h = q->header = qcow1_read_header(fd);
 	if (!h)