diff mbox

[4/4] add ksm kernel shared memory driver.

Message ID 1238457560-7613-5-git-send-email-ieidus@redhat.com (mailing list archive)
State Accepted
Headers show

Commit Message

Izik Eidus March 30, 2009, 11:59 p.m. UTC
Ksm is a driver that allows merging identical pages between one or more
applications, in a way invisible to the applications that use it.
Pages that are merged are marked as read-only and are COWed when any
application tries to change them.

Ksm is used for cases where using fork() is not suitable;
one of these cases is where the pages of the application keep changing
dynamically and the application cannot know in advance what pages are
going to be identical.

Ksm works by walking over the memory pages of the applications it
scans in order to find identical pages.
It uses two sorted data structures, called the stable and unstable trees,
to find the identical pages in an effective way.

When ksm finds two identical pages, it marks them as read-only and merges
them into a single page; after the pages are marked as read-only and
merged into one page, linux will treat these pages as normal
copy_on_write pages and will copy them when a write access happens to
them.

Ksm scans just the memory areas that were registered to be scanned by it.

Ksm api:

KSM_GET_API_VERSION:
Give the userspace the api version of the module.

KSM_CREATE_SHARED_MEMORY_AREA:
Create a shared memory region fd, that later allows the user to register
the memory region to be scanned by using:
KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION

KSM_START_STOP_KTHREAD:
Return information about the kernel thread; the information is returned
using the ksm_kthread_info structure:
ksm_kthread_info:
__u32 sleep:
        number of microseconds to sleep between each iteration of
scanning.

__u32 pages_to_scan:
        number of pages to scan for each iteration of scanning.

__u32 max_pages_to_merge:
        maximum number of pages to merge in each iteration of scanning
        (so even if there are still more pages to scan, we stop this
iteration)

__u32 flags:
       flags to control ksmd (right now just ksm_control_flags_run
			      available)

KSM_REGISTER_MEMORY_REGION:
Register userspace virtual address range to be scanned by ksm.
This ioctl is using the ksm_memory_region structure:
ksm_memory_region:
__u32 npages;
         number of pages to share inside this memory region.
__u32 pad;
__u64 addr:
        the beginning of the virtual address of this region.

KSM_REMOVE_MEMORY_REGION:
Remove memory region from ksm.

Signed-off-by: Izik Eidus <ieidus@redhat.com>
---
 include/linux/ksm.h        |   69 +++
 include/linux/miscdevice.h |    1 +
 mm/Kconfig                 |    6 +
 mm/Makefile                |    1 +
 mm/ksm.c                   | 1431 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1508 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/ksm.h
 create mode 100644 mm/ksm.c

Comments

Anthony Liguori March 31, 2009, 2:12 a.m. UTC | #1
Izik Eidus wrote:
> Ksm is driver that allow merging identical pages between one or more
> applications in way unvisible to the application that use it.
> Pages that are merged are marked as readonly and are COWed when any
> application try to change them.
>
> Ksm is used for cases where using fork() is not suitable,
> one of this cases is where the pages of the application keep changing
> dynamicly and the application cannot know in advance what pages are
> going to be identical.
>
> Ksm works by walking over the memory pages of the applications it
> scan in order to find identical pages.
> It uses a two sorted data strctures called stable and unstable trees
> to find in effective way the identical pages.
>
> When ksm finds two identical pages, it marks them as readonly and merges
> them into single one page,
> after the pages are marked as readonly and merged into one page, linux
> will treat this pages as normal copy_on_write pages and will fork them
> when write access will happen to them.
>
> Ksm scan just memory areas that were registred to be scanned by it.
>
> Ksm api:
>
> KSM_GET_API_VERSION:
> Give the userspace the api version of the module.
>
> KSM_CREATE_SHARED_MEMORY_AREA:
> Create shared memory reagion fd, that latter allow the user to register
> the memory region to scan by using:
> KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION
>
> KSM_START_STOP_KTHREAD:
> Return information about the kernel thread, the inforamtion is returned
> using the ksm_kthread_info structure:
> ksm_kthread_info:
> __u32 sleep:
>         number of microsecoends to sleep between each iteration of
> scanning.
>
> __u32 pages_to_scan:
>         number of pages to scan for each iteration of scanning.
>
> __u32 max_pages_to_merge:
>         maximum number of pages to merge in each iteration of scanning
>         (so even if there are still more pages to scan, we stop this
> iteration)
>
> __u32 flags:
>        flags to control ksmd (right now just ksm_control_flags_run
> 			      available)
>   

Wouldn't this make more sense as a sysfs interface?  That is, the 
KSM_START_STOP_KTHREAD part, not necessarily the rest of the API.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
KAMEZAWA Hiroyuki March 31, 2009, 2:15 a.m. UTC | #2
On Tue, 31 Mar 2009 02:59:20 +0300
Izik Eidus <ieidus@redhat.com> wrote:

> Ksm is driver that allow merging identical pages between one or more
> applications in way unvisible to the application that use it.
> Pages that are merged are marked as readonly and are COWed when any
> application try to change them.
> 
> Ksm is used for cases where using fork() is not suitable,
> one of this cases is where the pages of the application keep changing
> dynamicly and the application cannot know in advance what pages are
> going to be identical.
> 
> Ksm works by walking over the memory pages of the applications it
> scan in order to find identical pages.
> It uses a two sorted data strctures called stable and unstable trees
> to find in effective way the identical pages.
> 
> When ksm finds two identical pages, it marks them as readonly and merges
> them into single one page,
> after the pages are marked as readonly and merged into one page, linux
> will treat this pages as normal copy_on_write pages and will fork them
> when write access will happen to them.
> 
> Ksm scan just memory areas that were registred to be scanned by it.
> 
> Ksm api:
> 
> KSM_GET_API_VERSION:
> Give the userspace the api version of the module.
> 
> KSM_CREATE_SHARED_MEMORY_AREA:
> Create shared memory reagion fd, that latter allow the user to register
> the memory region to scan by using:
> KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION
> 
> KSM_START_STOP_KTHREAD:
> Return information about the kernel thread, the inforamtion is returned
> using the ksm_kthread_info structure:
> ksm_kthread_info:
> __u32 sleep:
>         number of microsecoends to sleep between each iteration of
> scanning.
> 
> __u32 pages_to_scan:
>         number of pages to scan for each iteration of scanning.
> 
> __u32 max_pages_to_merge:
>         maximum number of pages to merge in each iteration of scanning
>         (so even if there are still more pages to scan, we stop this
> iteration)
> 
> __u32 flags:
>        flags to control ksmd (right now just ksm_control_flags_run
> 			      available)
> 
> KSM_REGISTER_MEMORY_REGION:
> Register userspace virtual address range to be scanned by ksm.
> This ioctl is using the ksm_memory_region structure:
> ksm_memory_region:
> __u32 npages;
>          number of pages to share inside this memory region.
> __u32 pad;
> __u64 addr:
>         the begining of the virtual address of this region.
> 
> KSM_REMOVE_MEMORY_REGION:
> Remove memory region from ksm.
> 
> Signed-off-by: Izik Eidus <ieidus@redhat.com>
> ---
>  include/linux/ksm.h        |   69 +++
>  include/linux/miscdevice.h |    1 +
>  mm/Kconfig                 |    6 +
>  mm/Makefile                |    1 +
>  mm/ksm.c                   | 1431 ++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 1508 insertions(+), 0 deletions(-)
>  create mode 100644 include/linux/ksm.h
>  create mode 100644 mm/ksm.c
> 
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> new file mode 100644
> index 0000000..5776dce
> --- /dev/null
> +++ b/include/linux/ksm.h
> @@ -0,0 +1,69 @@
> +#ifndef __LINUX_KSM_H
> +#define __LINUX_KSM_H
> +
> +/*
> + * Userspace interface for /dev/ksm - kvm shared memory
> + */
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#include <asm/types.h>
> +
> +#define KSM_API_VERSION 1
> +
> +#define ksm_control_flags_run 1
> +
> +/* for KSM_REGISTER_MEMORY_REGION */
> +struct ksm_memory_region {
> +	__u32 npages; /* number of pages to share */
> +	__u32 pad;
> +	__u64 addr; /* the begining of the virtual address */
> +        __u64 reserved_bits;
> +};
> +
> +struct ksm_kthread_info {
> +	__u32 sleep; /* number of microsecoends to sleep */
> +	__u32 pages_to_scan; /* number of pages to scan */
> +	__u32 flags; /* control flags */
> +        __u32 pad;
> +        __u64 reserved_bits;
> +};
> +
> +#define KSMIO 0xAB
> +
> +/* ioctls for /dev/ksm */
> +
> +#define KSM_GET_API_VERSION              _IO(KSMIO,   0x00)
> +/*
> + * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory reagion fd
> + */
> +#define KSM_CREATE_SHARED_MEMORY_AREA    _IO(KSMIO,   0x01) /* return SMA fd */
> +/*
> + * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
> + * (can stop the kernel thread from working by setting running = 0)
> + */
> +#define KSM_START_STOP_KTHREAD		 _IOW(KSMIO,  0x02,\
> +					      struct ksm_kthread_info)
> +/*
> + * KSM_GET_INFO_KTHREAD - return information about the kernel thread
> + * scanning speed.
> + */
> +#define KSM_GET_INFO_KTHREAD		 _IOW(KSMIO,  0x03,\
> +					      struct ksm_kthread_info)
> +
> +
> +/* ioctls for SMA fds */
> +
> +/*
> + * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
> + * scanned by kvm.
> + */
> +#define KSM_REGISTER_MEMORY_REGION       _IOW(KSMIO,  0x20,\
> +					      struct ksm_memory_region)
> +/*
> + * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
> + */
> +#define KSM_REMOVE_MEMORY_REGION         _IO(KSMIO,   0x21)
> +
> +#endif
> diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
> index a820f81..6d4f8df 100644
> --- a/include/linux/miscdevice.h
> +++ b/include/linux/miscdevice.h
> @@ -29,6 +29,7 @@
>  #define HPET_MINOR		228
>  #define FUSE_MINOR		229
>  #define KVM_MINOR		232
> +#define KSM_MINOR		233
>  #define MISC_DYNAMIC_MINOR	255
>  
>  struct device;
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5b7781..2818223 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -216,3 +216,9 @@ config UNEVICTABLE_LRU
>  
>  config MMU_NOTIFIER
>  	bool
> +
> +config KSM
> +	tristate "Enable KSM for page sharing"
> +	help
> +	  Enable the KSM kernel module to allow page sharing of equal pages
> +	  among different tasks.
> diff --git a/mm/Makefile b/mm/Makefile
> index 72255be..e3bf7bf 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>  obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
>  obj-$(CONFIG_SLOB) += slob.o
>  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
> +obj-$(CONFIG_KSM) += ksm.o
>  obj-$(CONFIG_SLAB) += slab.o
>  obj-$(CONFIG_SLUB) += slub.o
>  obj-$(CONFIG_FAILSLAB) += failslab.o
> diff --git a/mm/ksm.c b/mm/ksm.c
> new file mode 100644
> index 0000000..eba4c09
> --- /dev/null
> +++ b/mm/ksm.c
> @@ -0,0 +1,1431 @@
> +/*
> + * Memory merging driver for Linux
> + *
> + * This module enables dynamic sharing of identical pages found in different
> + * memory areas, even if they are not shared by fork()
> + *
> + * Copyright (C) 2008 Red Hat, Inc.
> + * Authors:
> + *	Izik Eidus
> + *	Andrea Arcangeli
> + *	Chris Wright
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/errno.h>
> +#include <linux/mm.h>
> +#include <linux/fs.h>
> +#include <linux/miscdevice.h>
> +#include <linux/vmalloc.h>
> +#include <linux/file.h>
> +#include <linux/mman.h>
> +#include <linux/sched.h>
> +#include <linux/rwsem.h>
> +#include <linux/pagemap.h>
> +#include <linux/sched.h>
> +#include <linux/rmap.h>
> +#include <linux/spinlock.h>
> +#include <linux/jhash.h>
> +#include <linux/delay.h>
> +#include <linux/kthread.h>
> +#include <linux/wait.h>
> +#include <linux/scatterlist.h>
> +#include <linux/random.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +#include <linux/rbtree.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/ksm.h>
> +
> +#include <asm/tlbflush.h>
> +
> +MODULE_AUTHOR("Red Hat, Inc.");
> +MODULE_LICENSE("GPL");
> +
> +static int rmap_hash_size;
> +module_param(rmap_hash_size, int, 0);
> +MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
> +
> +/*
> + * ksm_mem_slot - hold information for an userspace scanning range
> + * (the scanning for this region will be from addr untill addr +
> + *  npages * PAGE_SIZE inside mm)
> + */
> +struct ksm_mem_slot {
> +	struct list_head link;
> +	struct list_head sma_link;
> +	struct mm_struct *mm;
> +	unsigned long addr;	/* the begining of the virtual address */
> +	unsigned npages;	/* number of pages to share */
> +};
> +
> +/*
> + * ksm_sma - shared memory area, each process have its own sma that contain the
> + * information about the slots that it own
> + */
> +struct ksm_sma {
> +	struct list_head sma_slots;
> +};
> +
> +/**
> + * struct ksm_scan - cursor for scanning
> + * @slot_index: the current slot we are scanning
> + * @page_index: the page inside the sma that is currently being scanned
> + *
> + * ksm uses it to know what are the next pages it need to scan
> + */
> +struct ksm_scan {
> +	struct ksm_mem_slot *slot_index;
> +	unsigned long page_index;
> +};
> +
> +/*
> + * Few notes about ksm scanning progress (make it easier to understand the
> + * data structures below):
> + *
> + * In order to reduce excessive scanning, ksm sort the memory pages by their
> + * contents into a data strcture that hold pointer into the pages.
> + *
> + * Since the contents of the pages may change at any moment, ksm cant just
> + * insert the pages into normal sorted tree and expect it to find anything.
> + *
> + * For this purpuse ksm use two data strctures - stable and unstable trees,
> + * the stable tree hold pointers into all the merged pages (KsmPage) sorted by
> + * their contents, beacuse that each such page have to be write-protected,
> + * searching on this tree is fully assuranced to be working and therefore this
> + * tree is called the stable tree.
> + *
> + * In addition to the stable tree, ksm use another data strcture called the
> + * unstable tree, this specific tree hold pointers into pages that have
> + * been found to be "unchanged for period of time", the unstable tree sort this
> + * pages by their contents, but given the fact that this pages are not
> + * write-protected, ksm cant trust the unstable tree to be fully assuranced to
> + * work.
> + * For the reason that the unstable tree would become corrupted when some of
> + * the page inside itself would change, the tree is called unstable.
> + * Ksm solve this problem by two ways:
> + * 1) the unstable tree get flushed every time ksm finish to scan the whole
> + *    memory, and then the tree is rebuild from the begining.
> + * 2) Ksm will only insert into the unstable tree, pages that their hash value
> + *    was not changed during the whole progress of one circuler scanning of the
> + *    memory.
> + * 3) The unstable tree is RedBlack Tree - meaning its balancing is based on
> + *    the colors of the nodes and not their content, this assure that even when
> + *    the tree get "corrupted" we wont get out of balance and the timing of
> + *    scanning is the same, another issue is that searching and inserting nodes
> + *    into rbtree is the same algorithem, therefore we have no overhead when we
> + *    flush the tree and rebuild it.
> + * 4) Ksm never flush the stable tree, this mean that even if it would take 10
> + *    times to find page inside the unstable tree, as soon as we would find it,
> + *    it will be secured inside the stable tree,
> + *    (When we scan new page, we first compare it against the stable tree, and
> + *     then against the unstable tree)
> + */
> +
> +struct rmap_item;
> +
> +/*
> + * tree_item - object of the stable and unstable trees
> + */
> +struct tree_item {
> +	struct rb_node node;
> +	struct rmap_item *rmap_item;
> +};
> +
> +/*
> + * rmap_item - object of the rmap_hash hash table
> + * (it is holding the previous hash value (oldindex),
> + *  pointer into the page_hash_item, and pointer into the tree_item)
> + */
> +
> +/**
> + * struct rmap_item - reverse mapping item for virtual addresses
> + * @link: link into the rmap_hash hash table.
> + * @mm: the memory strcture the rmap_item is pointing to.
> + * @address: the virtual address the rmap_item is pointing to.
> + * @oldchecksum: old checksum result for the page belong the virtual address
> + * @stable_tree: when 1 rmap_item is used for stable_tree, 0 unstable tree
> + * @tree_item: pointer into the stable/unstable tree that hold the virtual
> + *             address that the rmap_item is pointing to.
> + * @next: the next rmap item inside the stable/unstable tree that have that is
> + *        found inside the same tree node.
> + */
> +
> +struct rmap_item {
> +	struct hlist_node link;
> +	struct mm_struct *mm;
> +	unsigned long address;
> +	unsigned int oldchecksum; /* old checksum value */
> +	unsigned char stable_tree; /* 1 stable_tree 0 unstable tree */
> +	struct tree_item *tree_item;
> +	struct rmap_item *next;
> +	struct rmap_item *prev;
> +};
> +
> +/*
> + * slots is linked list that hold all the memory regions that were registred
> + * to be scanned.
> + */
> +static LIST_HEAD(slots);
> +/*
> + * slots_lock protect against removing and adding memory regions while a scanner
> + * is in the middle of scanning.
> + */
> +static DECLARE_RWSEM(slots_lock);
> +
> +/* The stable and unstable trees heads. */
> +struct rb_root root_stable_tree = RB_ROOT;
> +struct rb_root root_unstable_tree = RB_ROOT;
> +
> +
> +/* The number of linked list members inside the hash table */
> +static int nrmaps_hash;
> +/* rmap_hash hash table */
> +static struct hlist_head *rmap_hash;
> +
> +static struct kmem_cache *tree_item_cache;
> +static struct kmem_cache *rmap_item_cache;
> +
> +static int kthread_sleep; /* sleep time of the kernel thread */
> +static int kthread_pages_to_scan; /* npages to scan for the kernel thread */
> +static struct ksm_scan kthread_ksm_scan;
> +static int ksmd_flags;
> +static struct task_struct *kthread;
> +static DECLARE_WAIT_QUEUE_HEAD(kthread_wait);
> +static DECLARE_RWSEM(kthread_lock);
> +
> +static int ksm_slab_init(void)
> +{
> +	int ret = -ENOMEM;
> +
> +	tree_item_cache = KMEM_CACHE(tree_item, 0);
> +	if (!tree_item_cache)
> +		goto out;
> +
> +	rmap_item_cache = KMEM_CACHE(rmap_item, 0);
> +	if (!rmap_item_cache)
> +		goto out_free;
> +
> +	return 0;
> +
> +out_free:
> +	kmem_cache_destroy(tree_item_cache);
> +out:
> +	return ret;
> +}
> +
> +static void ksm_slab_free(void)
> +{
> +	kmem_cache_destroy(rmap_item_cache);
> +	kmem_cache_destroy(tree_item_cache);
> +}
> +
> +static inline struct tree_item *alloc_tree_item(void)
> +{
> +	return kmem_cache_zalloc(tree_item_cache, GFP_KERNEL);
> +}
> +
> +static void free_tree_item(struct tree_item *tree_item)
> +{
> +	kmem_cache_free(tree_item_cache, tree_item);
> +}
> +
> +static inline struct rmap_item *alloc_rmap_item(void)
> +{
> +	return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
> +}
> +
> +static inline void free_rmap_item(struct rmap_item *rmap_item)
> +{
> +	kmem_cache_free(rmap_item_cache, rmap_item);
> +}
> +
> +/*
> + * PageKsm - this type of pages are the write protected pages that ksm map
> + * into multiple vmas (this is the "shared page")
> + * this page was allocated using alloc_page(), and every pte that point to it
> + * is always write protected (therefore its data content cant ever be changed)
> + * and this page cant be swapped.
> + */
> +static inline int PageKsm(struct page *page)
> +{
> +	/*
> +	 * When ksm create new shared page, it create kernel allocated page
> +	 * using alloc_page(), therefore this page is not anonymous, taking into
> +         * account that ksm scan just anonymous pages, we can relay on the fact
> +	 * that each time we see !PageAnon(page) we are hitting shared page.
> +	 */
> +	return !PageAnon(page);
> +}
> +
> +static int rmap_hash_init(void)
> +{
> +	if (!rmap_hash_size) {
> +		struct sysinfo sinfo;
> +
> +		si_meminfo(&sinfo);
> +		rmap_hash_size = sinfo.totalram / 10;
> +	}
> +	nrmaps_hash = rmap_hash_size;
> +	rmap_hash = vmalloc(nrmaps_hash * sizeof(struct hlist_head));
> +	if (!rmap_hash)
> +		return -ENOMEM;
> +	memset(rmap_hash, 0, nrmaps_hash * sizeof(struct hlist_head));
> +	return 0;
> +}
> +
> +static void rmap_hash_free(void)
> +{
> +	int i;
> +	struct hlist_head *bucket;
> +	struct hlist_node *node, *n;
> +	struct rmap_item *rmap_item;
> +
> +	for (i = 0; i < nrmaps_hash; ++i) {
> +		bucket = &rmap_hash[i];
> +		hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
> +			hlist_del(&rmap_item->link);
> +			free_rmap_item(rmap_item);
> +		}
> +	}
> +	vfree(rmap_hash);
> +}
> +
> +static inline u32 calc_checksum(struct page *page)
> +{
> +	u32 checksum;
> +	void *addr = kmap_atomic(page, KM_USER0);
> +	checksum = jhash(addr, PAGE_SIZE, 17);
> +	kunmap_atomic(addr, KM_USER0);
> +	return checksum;
> +}
> +
> +/*
> + * Return rmap_item for a given virtual address.
> + */
> +static struct rmap_item *get_rmap_item(struct mm_struct *mm, unsigned long addr)
> +{
> +	struct rmap_item *rmap_item;
> +	struct hlist_head *bucket;
> +	struct hlist_node *node;
> +
> +	bucket = &rmap_hash[addr % nrmaps_hash];
> +	hlist_for_each_entry(rmap_item, node, bucket, link) {
> +		if (mm == rmap_item->mm && rmap_item->address == addr) {
> +			return rmap_item;
> +		}
> +	}
> +	return NULL;
> +}
> +
> +/*
> + * Removing rmap_item from stable or unstable tree.
> + * This function will free the rmap_item object, and if that rmap_item was
> + * insde the stable or unstable trees, it would remove the link from there
> + * as well.
> + */
> +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
> +{
> +	struct tree_item *tree_item;
> +
> +	tree_item = rmap_item->tree_item;
> +	rmap_item->tree_item = NULL;
> +
> +	if (rmap_item->stable_tree) {
> +		if (rmap_item->prev) {
> +			BUG_ON(rmap_item->prev->next != rmap_item);
> +			rmap_item->prev->next = rmap_item->next;
> +		}
> +		if (rmap_item->next) {
> +			BUG_ON(rmap_item->next->prev != rmap_item);
> +			rmap_item->next->prev = rmap_item->prev;
> +		}
> +	}
> +
> +	if (tree_item) {
> +		if (rmap_item->stable_tree) {
> +	 		if (!rmap_item->next && !rmap_item->prev) {
> +				rb_erase(&tree_item->node, &root_stable_tree);
> +				free_tree_item(tree_item);
> +			} else if (!rmap_item->prev) {
> +				tree_item->rmap_item = rmap_item->next;
> +			} else {
> +				tree_item->rmap_item = rmap_item->prev;
> +			}
> +		} else if (!rmap_item->stable_tree) {
> +			free_tree_item(tree_item);
> +		}
> +	}
> +
> +	hlist_del(&rmap_item->link);
> +	free_rmap_item(rmap_item);
> +}
> +
> +static void remove_page_from_tree(struct mm_struct *mm,
> +				  unsigned long addr)
> +{
> +	struct rmap_item *rmap_item;
> +
> +	rmap_item = get_rmap_item(mm, addr);
> +	if (!rmap_item)
> +		return;
> +	remove_rmap_item_from_tree(rmap_item);
> +	return;
> +}
> +
> +static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
> +						struct ksm_memory_region *mem)
> +{
> +	struct ksm_mem_slot *slot;
> +	int ret = -EPERM;
> +
> +	slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
> +	if (!slot) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	slot->mm = get_task_mm(current);
> +	if (!slot->mm)
> +		goto out_free;
> +	slot->addr = mem->addr;
> +	slot->npages = mem->npages;
> +
> +	down_write(&slots_lock);
> +
> +	list_add_tail(&slot->link, &slots);
> +	list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
> +
> +	up_write(&slots_lock);
> +	return 0;
> +
> +out_free:
> +	kfree(slot);
> +out:
> +	return ret;
> +}
> +
> +static void remove_mm_from_hash_and_tree(struct mm_struct *mm)
> +{
> +	struct ksm_mem_slot *slot;
> +	int pages_count;
> +
> +	list_for_each_entry(slot, &slots, link)
> +		if (slot->mm == mm)
> +			break;
> +	BUG_ON(!slot);
> +
> +	root_unstable_tree = RB_ROOT;
> +	for (pages_count = 0; pages_count < slot->npages; ++pages_count)
> +		remove_page_from_tree(mm, slot->addr +
> +				      pages_count * PAGE_SIZE);
> +	list_del(&slot->link);
> +}
> +
> +static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
> +{
> +	struct ksm_mem_slot *slot, *node;
> +
> +	down_write(&slots_lock);
> +	list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
> +		remove_mm_from_hash_and_tree(slot->mm);
> +		mmput(slot->mm);
> +		list_del(&slot->sma_link);
> +		kfree(slot);
> +	}
> +	up_write(&slots_lock);
> +	return 0;
> +}
> +
> +static int ksm_sma_release(struct inode *inode, struct file *filp)
> +{
> +	struct ksm_sma *ksm_sma = filp->private_data;
> +	int r;
> +
> +	r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
> +	kfree(ksm_sma);
> +	return r;
> +}
> +
> +static long ksm_sma_ioctl(struct file *filp,
> +			  unsigned int ioctl, unsigned long arg)
> +{
> +	struct ksm_sma *sma = filp->private_data;
> +	void __user *argp = (void __user *)arg;
> +	int r = EINVAL;
> +
> +	switch (ioctl) {
> +	case KSM_REGISTER_MEMORY_REGION: {
> +		struct ksm_memory_region ksm_memory_region;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&ksm_memory_region, argp,
> +				   sizeof(ksm_memory_region)))
> +			goto out;
> +		r = ksm_sma_ioctl_register_memory_region(sma,
> +							 &ksm_memory_region);
> +		break;
> +	}
> +	case KSM_REMOVE_MEMORY_REGION:
> +		r = ksm_sma_ioctl_remove_memory_region(sma);
> +		break;
> +	}
> +
> +out:
> +	return r;
> +}
> +
> +static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
> +{
> +	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> +	unsigned long addr;
> +
> +	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> +	if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
> +		return -EFAULT;
> +	return addr;
> +}
> +
> +static pte_t *get_pte(struct mm_struct *mm, unsigned long addr)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *ptep = NULL;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (!pgd_present(*pgd))
> +		goto out;
> +
> +	pud = pud_offset(pgd, addr);
> +	if (!pud_present(*pud))
> +		goto out;
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (!pmd_present(*pmd))
> +		goto out;
> +
> +	ptep = pte_offset_map(pmd, addr);
> +out:
> +	return ptep;
> +}
> +
> +static int is_present_pte(struct mm_struct *mm, unsigned long addr)
> +{
> +	pte_t *ptep;
> +	int r;
> +
> +	ptep = get_pte(mm, addr);
> +	if (!ptep)
> +		return 0;
> +
> +	r = pte_present(*ptep);
> +	pte_unmap(ptep);
> +
> +	return r;
> +}
> +
> +static int memcmp_pages(struct page *page1, struct page *page2)
> +{
> +	char *addr1, *addr2;
> +	int r;
> +
> +	addr1 = kmap_atomic(page1, KM_USER0);
> +	addr2 = kmap_atomic(page2, KM_USER1);
> +	r = memcmp(addr1, addr2, PAGE_SIZE);
> +	kunmap_atomic(addr1, KM_USER0);
> +	kunmap_atomic(addr2, KM_USER1);
> +	return r;
> +}
> +
> +/* pages_identical
> + * return 1 if identical, 0 otherwise.
> + */
> +static inline int pages_identical(struct page *page1, struct page *page2)
> +{
> +	return !memcmp_pages(page1, page2);
> +}
> +
> +/*
> + * try_to_merge_one_page - take two pages and merge them into one
> + * @mm: mm_struct that hold vma pointing into oldpage
> + * @vma: the vma that hold the pte pointing into oldpage
> + * @oldpage: the page that we want to replace with newpage
> + * @newpage: the page that we want to map instead of oldpage
> + * @newprot: the new permission of the pte inside vma
> + * note:
> + * oldpage should be anon page while newpage should be file mapped page
> + *
> + * this function return 0 if the pages were merged, 1 otherwise.
> + */
> +static int try_to_merge_one_page(struct mm_struct *mm,
> +				 struct vm_area_struct *vma,
> +				 struct page *oldpage,
> +				 struct page *newpage,
> +				 pgprot_t newprot)
> +{
> +	int ret = 1;
> +	int odirect_sync;
> +	unsigned long page_addr_in_vma;
> +	pte_t orig_pte, *orig_ptep;
> +
> +	get_page(newpage);
> +	get_page(oldpage);
> +
> +	down_read(&mm->mmap_sem);
> +
> +	page_addr_in_vma = addr_in_vma(vma, oldpage);
> +	if (page_addr_in_vma == -EFAULT)
> +		goto out_unlock;
> +
> +	orig_ptep = get_pte(mm, page_addr_in_vma);
> +	if (!orig_ptep)
> +		goto out_unlock;
> +	orig_pte = *orig_ptep;
> +	pte_unmap(orig_ptep);
> +	if (!pte_present(orig_pte))
> +		goto out_unlock;
> +	if (page_to_pfn(oldpage) != pte_pfn(orig_pte))
> +		goto out_unlock;
> +	/*
> +	 * we need the page lock to read a stable PageSwapCache in
> +	 * page_wrprotect()
> +	 */
> +	if (!trylock_page(oldpage))
> +		goto out_unlock;
> +	/*
> +	 * page_wrprotect check if the page is swapped or in swap cache,
> +	 * in the future we might want to run here if_present_pte and then
> +	 * swap_free
> +	 */
> +	if (!page_wrprotect(oldpage, &odirect_sync, 2)) {
> +		unlock_page(oldpage);
> +		goto out_unlock;
> +	}
> +	unlock_page(oldpage);
> +	if (!odirect_sync)
> +		goto out_unlock;
> +
> +	orig_pte = pte_wrprotect(orig_pte);
> +
> +	if (pages_identical(oldpage, newpage))
> +		ret = replace_page(vma, oldpage, newpage, orig_pte, newprot);
> +
> +out_unlock:
> +	up_read(&mm->mmap_sem);
> +	put_page(oldpage);
> +	put_page(newpage);
> +	return ret;
> +}
> +
> +/*
> + * try_to_merge_two_pages - take two identical pages and prepare them to be
> + * merged into one page.
> + *
> + * this function return 0 if we successfully mapped two identical pages into one
> + * page, 1 otherwise.
> + * (note in case we created KsmPage and mapped one page into it but the second
> + *  page was not mapped we consider it as a failure and return 1)
> + */
> +static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
> +				  struct mm_struct *mm2, struct page *page2,
> +				  unsigned long addr1, unsigned long addr2)
> +{
> +	struct vm_area_struct *vma;
> +	pgprot_t prot;
> +	int ret = 1;
> +
> +	/*
> +	 * If page2 isn't shared (it isn't PageKsm) we have to allocate a new
> +	 * file mapped page and make the two ptes of mm1(page1) and mm2(page2)
> +	 * point to it.  If page2 is shared, we can just make the pte of
> +	 * mm1(page1) point to page2
> +	 */
> +	if (PageKsm(page2)) {
> +		down_read(&mm1->mmap_sem);
> +		vma = find_vma(mm1, addr1);
> +		up_read(&mm1->mmap_sem);
> +		if (!vma)
> +			return ret;
> +		prot = vma->vm_page_prot;
> +		pgprot_val(prot) &= ~_PAGE_RW;
> +		ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
> +	} else {
> +		struct page *kpage;
> +
> +		kpage = alloc_page(GFP_HIGHUSER);
> +		if (!kpage)
> +			return ret;
> +		down_read(&mm1->mmap_sem);
> +		vma = find_vma(mm1, addr1);
> +		up_read(&mm1->mmap_sem);
> +		if (!vma) {
> +			put_page(kpage);
> +			return ret;
> +		}
> +		prot = vma->vm_page_prot;
> +		pgprot_val(prot) &= ~_PAGE_RW;
> +
> +		copy_user_highpage(kpage, page1, addr1, vma);
> +		ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
> +
> +		if (!ret) {
> +			down_read(&mm2->mmap_sem);
> +			vma = find_vma(mm2, addr2);
> +			up_read(&mm2->mmap_sem);
> +			if (!vma) {
> +				put_page(kpage);
> +				ret = 1;
> +				return ret;
> +			}
> +
> +			prot = vma->vm_page_prot;
> +			pgprot_val(prot) &= ~_PAGE_RW;
> +
> +			ret = try_to_merge_one_page(mm2, vma, page2, kpage,
> +						    prot);
> +			/*
> +			 * If the second try_to_merge_one_page call failed,
> +			 * we are in situation where we have Ksm page that have
> +			 * just one pte pointing to it, in this case we break
> +			 * it.
> +			 */
> +			if (ret) {
> +				struct page *tmppage[1];
> +
> +				down_read(&mm1->mmap_sem);
> +				if (get_user_pages(current, mm1, addr1, 1, 1,
> +						    0, tmppage, NULL)) {
> +					put_page(tmppage[0]);
> +				}
> +				up_read(&mm1->mmap_sem);
> +			}
> +		}
> +		put_page(kpage);
> +	}
> +	return ret;
> +}

I'm sorry if I'm wrong. Is the above "kpage" freed from the global LRU, and can it
never be reclaimed (swapped out) by the global LRU?

If so, please
 - show the amount of kpages
 
 - allow users to set a limit on the usage of kpages, or preserve kpages at boot
   or by a user's command.

Thanks,
-Kame


> +/*
> + * is_zapped_item - check if the page belong to the rmap_item was zapped.
> + *
> + * This function would check if the page that the virtual address inside
> + * rmap_item is poiting to is still KsmPage, and therefore we can trust the
> + * content of this page.
> + * Since that this function call already to get_user_pages it return the
> + * pointer to the page as an optimization.
> + */
> +static int is_zapped_item(struct rmap_item *rmap_item,
> +			  struct page **page)
> +{
> +	int ret = 0;
> +
> +	cond_resched();
> +	if (is_present_pte(rmap_item->mm, rmap_item->address)) {
> +		down_read(&rmap_item->mm->mmap_sem);
> +		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
> +				     1, 0, 0, page, NULL);
> +		up_read(&rmap_item->mm->mmap_sem);
> +	}
> +
> +	if (!ret)
> +		return 1;
> +
> +	if (unlikely(!PageKsm(page[0]))) {
> +		put_page(page[0]);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * stable_tree_search - search page inside the stable tree
> + * @page: the page that we are searching for identical pages to.
> + * @page2: pointer into identical page that we are holding inside the stable
> + *	   tree that we have found.
> + * @rmap_item: the reverse mapping item
> + *
> + * this function check if there is a page inside the stable tree
> + * with identical content to the page that we are scanning right now.
> + *
> + * this function return rmap_item pointer to the identical item if found, NULL
> + * otherwise.
> + */
> +static struct rmap_item *stable_tree_search(struct page *page,
> +					    struct page **page2,
> +					    struct rmap_item *rmap_item)
> +{
> +	struct rb_node *node = root_stable_tree.rb_node;
> +	struct tree_item *tree_item;
> +	struct rmap_item *found_rmap_item;
> +
> +	while (node) {
> +		int ret;
> +
> +		tree_item = rb_entry(node, struct tree_item, node);
> +		found_rmap_item = tree_item->rmap_item;
> +		while (found_rmap_item) {
> +			BUG_ON(!found_rmap_item->stable_tree);
> +			BUG_ON(!found_rmap_item->tree_item);
> +			if (!rmap_item ||
> +			     !(found_rmap_item->mm == rmap_item->mm &&
> +			      found_rmap_item->address == rmap_item->address)) {
> +				if (!is_zapped_item(found_rmap_item, page2))
> +					break;
> +				remove_rmap_item_from_tree(found_rmap_item);
> +			}
> +			found_rmap_item = found_rmap_item->next;
> +		}
> +		if (!found_rmap_item)
> +			goto out_didnt_find;
> +
> +		/*
> +		 * We can trust the value of the memcmp as we know the pages
> +		 * are write protected.
> +		 */
> +		ret = memcmp_pages(page, page2[0]);
> +
> +		if (ret < 0) {
> +			put_page(page2[0]);
> +			node = node->rb_left;
> +		} else if (ret > 0) {
> +			put_page(page2[0]);
> +			node = node->rb_right;
> +		} else {
> +			goto out_found;
> +		}
> +	}
> +out_didnt_find:
> +	found_rmap_item = NULL;
> +out_found:
> +	return found_rmap_item;
> +}
> +
> +/*
> + * stable_tree_insert - insert into the stable tree, new rmap_item that is
> + * pointing into a new KsmPage.
> + *
> + * @page: the page that we are searching identical page to inside the stable
> + *	  tree.
> + * @new_tree_item: the new tree item we are going to link into the stable tree.
> + * @rmap_item: pointer into the reverse mapping item.
> + *
> + * this function returns 0 on success, 1 otherwise.
> + */
> +static int stable_tree_insert(struct page *page,
> +			      struct tree_item *new_tree_item,
> +			      struct rmap_item *rmap_item)
> +{
> +	struct rb_node **new = &(root_stable_tree.rb_node);
> +	struct rb_node *parent = NULL;
> +	struct tree_item *tree_item;
> +	struct page *page2[1];
> +
> +	while (*new) {
> +		int ret;
> +		struct rmap_item *insert_rmap_item;
> +
> +		tree_item = rb_entry(*new, struct tree_item, node);
> +		BUG_ON(!tree_item);
> +		BUG_ON(!tree_item->rmap_item);
> +
> +		insert_rmap_item = tree_item->rmap_item;
> +		while (insert_rmap_item) {
> +			BUG_ON(!insert_rmap_item->stable_tree);
> +			BUG_ON(!insert_rmap_item->tree_item);
> +			if (!rmap_item ||
> +			    !(insert_rmap_item->mm == rmap_item->mm &&
> +			     insert_rmap_item->address == rmap_item->address)) {
> +				if (!is_zapped_item(insert_rmap_item, page2))
> +					break;
> +				remove_rmap_item_from_tree(insert_rmap_item);
> +			}
> +			insert_rmap_item = insert_rmap_item->next;
> +		}
> +		if (!insert_rmap_item)
> +			return 1;
> +
> +		ret = memcmp_pages(page, page2[0]);
> +
> +		parent = *new;
> +		if (ret < 0) {
> +			put_page(page2[0]);
> +			new = &((*new)->rb_left);
> +		} else if (ret > 0) {
> +			put_page(page2[0]);
> +			new = &((*new)->rb_right);
> +		} else {
> +			/*
> +			 * It isnt a bug when we are here,
> +			 * beacuse after we release the stable_tree_lock
> +			 * someone else could have merge identical page to the
> +			 * tree.
> +			 */
> +			return 1;
> +		}
> +	}
> +
> +	rb_link_node(&new_tree_item->node, parent, new);
> +	rb_insert_color(&new_tree_item->node, &root_stable_tree);
> +	rmap_item->stable_tree = 1;
> +	rmap_item->tree_item = new_tree_item;
> +
> +	return 0;
> +}
> +
> +/*
> + * unstable_tree_search_insert - search and insert items into the unstable tree.
> + *
> + * @page: the page that we are going to search for identical page or to insert
> + *	  into the unstable tree
> + * @page2: pointer into identical page that was found inside the unstable tree
> + * @page_rmap_item: the reverse mapping item of page
> + *
> + * this function search if identical page to the page that we
> + * are scanning right now is found inside the unstable tree, and in case no page
> + * with identical content is exist inside the unstable tree, we insert
> + * page_rmap_item as a new object into the unstable tree.
> + *
> + * this function return pointer to rmap_item pointer of item that is found to
> + * be identical to the page that we are scanning right now, NULL otherwise.
> + *
> + * (this function do both searching and inserting, beacuse the fact that
> + *  searching and inserting share the same walking algorithem in rbtrees)
> + */
> +static struct tree_item *unstable_tree_search_insert(struct page *page,
> +					struct page **page2,
> +					struct rmap_item *page_rmap_item)
> +{
> +	struct rb_node **new = &(root_unstable_tree.rb_node);
> +	struct rb_node *parent = NULL;
> +	struct tree_item *tree_item;
> +	struct tree_item *new_tree_item;
> +	struct rmap_item *rmap_item;
> +
> +	while (*new) {
> +		int ret;
> +
> +		tree_item = rb_entry(*new, struct tree_item, node);
> +		BUG_ON(!tree_item);
> +		rmap_item = tree_item->rmap_item;
> +		BUG_ON(!rmap_item);
> +
> +		/*
> +		 * We dont want to swap in pages
> +		 */
> +		if (!is_present_pte(rmap_item->mm, rmap_item->address))
> +			return NULL;
> +
> +		down_read(&rmap_item->mm->mmap_sem);
> +		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
> +				     1, 0, 0, page2, NULL);
> +		up_read(&rmap_item->mm->mmap_sem);
> +		if (!ret)
> +			return NULL;
> +
> +		ret = memcmp_pages(page, page2[0]);
> +
> +		parent = *new;
> +		if (ret < 0) {
> +			put_page(page2[0]);
> +			new = &((*new)->rb_left);
> +		} else if (ret > 0) {
> +			put_page(page2[0]);
> +			new = &((*new)->rb_right);
> +		} else {
> +			return tree_item;
> +		}
> +	}
> +
> +	if (!page_rmap_item)
> +		return NULL;
> +
> +	new_tree_item = alloc_tree_item();
> +	if (!new_tree_item)
> +		return NULL;
> +
> +	page_rmap_item->tree_item = new_tree_item;
> +	page_rmap_item->stable_tree = 0;
> +	new_tree_item->rmap_item = page_rmap_item;
> +	rb_link_node(&new_tree_item->node, parent, new);
> +	rb_insert_color(&new_tree_item->node, &root_unstable_tree);
> +
> +	return NULL;
> +}
> +
> +/*
> + * update_stable_tree - check if the page inside the tree got zapped,
> + * and if it got zapped, kick it from the tree.
> + *
> + * we are setting wait to 1 in case we find that the rmap_item was object
> + * inside the stable_tree.
> + * (this is used to notify that we dont want to create new rmap_item to it
> + *  at this moment, but in the next time)
> + * wait is left unchanged incase the rmap_item was object inside the unstable
> + * tree.
> + */
> +int update_tree(struct rmap_item *rmap_item, int *wait)
> +{
> +	struct page *page[1];
> +
> +	if (!rmap_item->stable_tree) {
> +		if (rmap_item->tree_item) {
> +			remove_rmap_item_from_tree(rmap_item);
> +			return 1;
> +		}
> +		return 0;
> +	}
> +	if (is_zapped_item(rmap_item, page)) {
> +		remove_rmap_item_from_tree(rmap_item);
> +		*wait = 1;
> +		return 1;
> +	}
> +	put_page(page[0]);
> +	return 0;
> +}
> +
> +static struct rmap_item *create_new_rmap_item(struct mm_struct *mm,
> +			 		      unsigned long addr,
> +					      unsigned int checksum)
> +{
> +	struct rmap_item *rmap_item;
> +	struct hlist_head *bucket;
> +
> +	rmap_item = alloc_rmap_item();
> +	if (!rmap_item)
> +		return NULL;
> +
> +	rmap_item->mm = mm;
> +	rmap_item->address = addr;
> +	rmap_item->oldchecksum = checksum;
> +	rmap_item->stable_tree = 0;
> +	rmap_item->tree_item = NULL;
> +
> +	bucket = &rmap_hash[addr % nrmaps_hash];
> +	hlist_add_head(&rmap_item->link, bucket);
> +
> +	return rmap_item;
> +}
> +
> +/*
> + * cmp_and_merge_page - take a page computes its hash value and check if there
> + * is similar hash value to different page,
> + * in case we find that there is similar hash to different page we call to
> + * try_to_merge_two_pages().
> + *
> + * @ksm_scan: the ksm scanner strcture.
> + * @page: the page that we are searching identical page to.
> + */
> +static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
> +{
> +	struct page *page2[1];
> +	struct ksm_mem_slot *slot;
> +	struct tree_item *tree_item;
> +	struct rmap_item *rmap_item;
> +	struct rmap_item *tree_rmap_item;
> +	unsigned int checksum;
> +	unsigned long addr;
> +	int wait = 0;
> +	int ret;
> +
> +	slot = ksm_scan->slot_index;
> +	addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
> +	rmap_item = get_rmap_item(slot->mm, addr);
> +	if (rmap_item) {
> +		if (update_tree(rmap_item, &wait))
> +			rmap_item = NULL;
> +	}
> +
> +	/* We first start with searching the page inside the stable tree */
> +	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
> +	if (tree_rmap_item) {
> +		BUG_ON(!tree_rmap_item->tree_item);
> +		ret = try_to_merge_two_pages(slot->mm, page, tree_rmap_item->mm,
> +					     page2[0], addr,
> +					     tree_rmap_item->address);
> +		put_page(page2[0]);
> +		if (!ret) {
> +			/*
> +			 * The page was successuly merged, lets insert its
> +			 * rmap_item into the stable tree.
> +			 */
> +
> +			if (!rmap_item)
> +				rmap_item = create_new_rmap_item(slot->mm,
> +								 addr, 0);
> +			if (!rmap_item)
> +				return !ret;
> +
> +			rmap_item->next = tree_rmap_item->next;
> +			rmap_item->prev = tree_rmap_item;
> +
> +			if (tree_rmap_item->next)
> +				tree_rmap_item->next->prev = rmap_item;
> +
> +			tree_rmap_item->next = rmap_item;
> +
> +			rmap_item->stable_tree = 1;
> +			rmap_item->tree_item = tree_rmap_item->tree_item;
> +		}
> +		ret = !ret;
> +		goto out;
> +	}
> +
> +	/*
> +	 * In case the hash value of the page was changed from the last time we
> +	 * have calculated it, this page seems to be changing frequently, therefore we
> +	 * dont want to insert it to the unstable tree, and we dont want to
> +	 * waste our time to search if there is something identical to it there.
> +	 */
> +	if (rmap_item) {
> +		checksum = calc_checksum(page);
> +		if (rmap_item->oldchecksum != checksum) {
> +			rmap_item->oldchecksum = checksum;
> +			goto out;
> +		}
> +	}
> +
> +	tree_item = unstable_tree_search_insert(page, page2, rmap_item);
> +	if (tree_item) {
> +		rmap_item = tree_item->rmap_item;
> +		BUG_ON(!rmap_item);
> +		ret = try_to_merge_two_pages(slot->mm, page, rmap_item->mm,
> +					     page2[0], addr,
> +					     rmap_item->address);
> +		/*
> +		 * As soon as we successuly merged this page, we want to remove
> +		 * the rmap_item object of the page that we have merged with and
> +		 * instead insert it as a new stable tree node.
> +		 */
> +		if (!ret) {
> +			rb_erase(&tree_item->node, &root_unstable_tree);
> +			stable_tree_insert(page2[0], tree_item, rmap_item);
> +		}
> +		put_page(page2[0]);
> +		ret = !ret;
> +		goto out;
> +	}
> +	/*
> +	 * When wait is 1, we dont want to calculate the hash value of the page
> +	 * right now, instead we prefer to wait.
> +	 */
> +	if (!wait && !rmap_item) {
> +		checksum = calc_checksum(page);
> +		create_new_rmap_item(slot->mm, addr, checksum);
> +	}
> +out:
> +	return ret;
> +}
> +
> +/* return -EAGAIN - no slots registered, nothing to be done */
> +static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
> +{
> +	struct ksm_mem_slot *slot;
> +
> +	if (list_empty(&slots))
> +		return -EAGAIN;
> +
> +	slot = ksm_scan->slot_index;
> +
> +	/* Are there pages left in this slot to scan? */
> +	if ((slot->npages - ksm_scan->page_index - nscan) > 0) {
> +		ksm_scan->page_index += nscan;
> +		return 0;
> +	}
> +
> +	list_for_each_entry_from(slot, &slots, link) {
> +		if (slot == ksm_scan->slot_index)
> +			continue;
> +		ksm_scan->page_index = 0;
> +		ksm_scan->slot_index = slot;
> +		return 0;
> +	}
> +
> +	/* look like we finished scanning the whole memory, starting again */
> +	root_unstable_tree = RB_ROOT;
> +	ksm_scan->page_index = 0;
> +	ksm_scan->slot_index = list_first_entry(&slots,
> +						struct ksm_mem_slot, link);
> +	return 0;
> +}
> +
> +/*
> + * update slot_index - make sure ksm_scan will point to vaild data,
> + * it is possible that by the time we are here the data that ksm_scan was
> + * pointed to was released so we have to call this function every time after
> + * taking the slots_lock
> + */
> +static void scan_update_old_index(struct ksm_scan *ksm_scan)
> +{
> +	struct ksm_mem_slot *slot;
> +
> +	if (list_empty(&slots))
> +		return;
> +
> +	list_for_each_entry(slot, &slots, link) {
> +		if (ksm_scan->slot_index == slot)
> +			return;
> +	}
> +
> +	ksm_scan->slot_index = list_first_entry(&slots,
> +						struct ksm_mem_slot, link);
> +	ksm_scan->page_index = 0;
> +}
> +
> +/**
> + * ksm_scan_start - the ksm scanner main worker function.
> + * @ksm_scan -    the scanner.
> + * @scan_npages - number of pages we are want to scan before we return from this
> + * @function.
> + *
> + * (this function can be called from the kernel thread scanner, or from 
> + *  userspace ioctl context scanner)
> + *
> + *  The function return -EAGAIN in case there are not slots to scan.
> + */
> +static int ksm_scan_start(struct ksm_scan *ksm_scan, unsigned int scan_npages)
> +{
> +	struct ksm_mem_slot *slot;
> +	struct page *page[1];
> +	int val;
> +	int ret = 0;
> +
> +	down_read(&slots_lock);
> +
> +	scan_update_old_index(ksm_scan);
> +
> +	while (scan_npages > 0) {
> +		ret = scan_get_next_index(ksm_scan, 1);
> +		if (ret)
> +			goto out;
> +
> +		slot = ksm_scan->slot_index;
> +
> +		cond_resched();
> +
> +		/*
> +		 * If the page is swapped out or in swap cache, we don't want to
> +		 * scan it (it is just for performance).
> +		 */
> +		if (is_present_pte(slot->mm, slot->addr +
> +				   ksm_scan->page_index * PAGE_SIZE)) {
> +			down_read(&slot->mm->mmap_sem);
> +			val = get_user_pages(current, slot->mm, slot->addr +
> +					     ksm_scan->page_index * PAGE_SIZE ,
> +					      1, 0, 0, page, NULL);
> +			up_read(&slot->mm->mmap_sem);
> +			if (val == 1) {
> +				if (!PageKsm(page[0]))
> +					cmp_and_merge_page(ksm_scan, page[0]);
> +				put_page(page[0]);
> +			}
> +		}
> +		scan_npages--;
> +	}
> +	scan_get_next_index(ksm_scan, 1);
> +out:
> +	up_read(&slots_lock);
> +	return ret;
> +}
> +
> +static struct file_operations ksm_sma_fops = {
> +	.release        = ksm_sma_release,
> +	.unlocked_ioctl = ksm_sma_ioctl,
> +	.compat_ioctl   = ksm_sma_ioctl,
> +};
> +
> +static int ksm_dev_ioctl_create_shared_memory_area(void)
> +{
> +	int fd = -1;
> +	struct ksm_sma *ksm_sma;
> +
> +	ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
> +	if (!ksm_sma)
> +		goto out;
> +
> +	INIT_LIST_HEAD(&ksm_sma->sma_slots);
> +
> +	fd = anon_inode_getfd("ksm-sma", &ksm_sma_fops, ksm_sma, 0);
> +	if (fd < 0)
> +		goto out_free;
> +
> +	return fd;
> +out_free:
> +	kfree(ksm_sma);
> +out:
> +	return fd;
> +}
> +
> +/*
> + * ksm_dev_ioctl_start_stop_kthread - control the kernel thread scanning running
> + * speed.
> + * This function allow us to control on the time the kernel thread will sleep
> + * how many pages it will scan between sleep and sleep, and how many pages it
> + * will maximum merge between sleep and sleep.
> + */
> +static int ksm_dev_ioctl_start_stop_kthread(struct ksm_kthread_info *info)
> +{
> +	int ret = 0;
> +
> +	down_write(&kthread_lock);
> +
> +	if (info->flags & ksm_control_flags_run) {
> +		if (!info->pages_to_scan) {
> +			ret = EPERM;
> +			up_write(&kthread_lock);
> +			goto out;
> +		}
> +	}
> +
> +	kthread_sleep = info->sleep;
> +	kthread_pages_to_scan = info->pages_to_scan;
> +	ksmd_flags = info->flags;
> +
> +	up_write(&kthread_lock);
> +
> +	if (ksmd_flags & ksm_control_flags_run)
> +		wake_up_interruptible(&kthread_wait);
> +
> +out:
> +	return ret;
> +}
> +
> +/*
> + * ksm_dev_ioctl_get_info_kthread - write into info the scanning information
> + * of the ksm kernel thread
> + */
> +static void ksm_dev_ioctl_get_info_kthread(struct ksm_kthread_info *info)
> +{
> +	down_read(&kthread_lock);
> +
> +	info->sleep = kthread_sleep;
> +	info->pages_to_scan = kthread_pages_to_scan;
> +	info->flags = ksmd_flags;
> +
> +	up_read(&kthread_lock);
> +}
> +
> +static long ksm_dev_ioctl(struct file *filp,
> +			  unsigned int ioctl, unsigned long arg)
> +{
> +	void __user *argp = (void __user *)arg;
> +	long r = -EINVAL;
> +
> +	switch (ioctl) {
> +	case KSM_GET_API_VERSION:
> +		r = KSM_API_VERSION;
> +		break;
> +	case KSM_CREATE_SHARED_MEMORY_AREA:
> +		r = ksm_dev_ioctl_create_shared_memory_area();
> +		break;
> +	case KSM_START_STOP_KTHREAD: {
> +		struct ksm_kthread_info info;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&info, argp,
> +				   sizeof(struct ksm_kthread_info)))
> +			break;
> +
> +		r = ksm_dev_ioctl_start_stop_kthread(&info);
> +		break;
> +		}
> +	case KSM_GET_INFO_KTHREAD: {
> +		struct ksm_kthread_info info;
> +
> +		ksm_dev_ioctl_get_info_kthread(&info);
> +		r = -EFAULT;
> +		if (copy_to_user(argp, &info,
> +				 sizeof(struct ksm_kthread_info)))
> +			break;
> +		r = 0;
> +		break;
> +	}
> +	default:
> +		break;
> +	}
> +	return r;
> +}
> +
> +static struct file_operations ksm_chardev_ops = {
> +	.unlocked_ioctl = ksm_dev_ioctl,
> +	.compat_ioctl   = ksm_dev_ioctl,
> +	.owner          = THIS_MODULE,
> +};
> +
> +static struct miscdevice ksm_dev = {
> +	KSM_MINOR,
> +	"ksm",
> +	&ksm_chardev_ops,
> +};
> +
> +int kthread_ksm_scan_thread(void *nothing)
> +{
> +	while (!kthread_should_stop()) {
> +		if (ksmd_flags & ksm_control_flags_run) {
> +			down_read(&kthread_lock);
> +			ksm_scan_start(&kthread_ksm_scan,
> +				       kthread_pages_to_scan);
> +			up_read(&kthread_lock);
> +			schedule_timeout_interruptible(
> +					usecs_to_jiffies(kthread_sleep));
> +		} else {
> +			wait_event_interruptible(kthread_wait,
> +					ksmd_flags & ksm_control_flags_run ||
> +					kthread_should_stop());
> +		}
> +	}
> +	return 0;
> +}
> +
> +static int __init ksm_init(void)
> +{
> +	int r;
> +
> +	r = ksm_slab_init();
> +	if (r)
> +		goto out;
> +
> +	r = rmap_hash_init();
> +	if (r)
> +		goto out_free1;
> +
> +	kthread = kthread_run(kthread_ksm_scan_thread, NULL, "kksmd");
> +	if (IS_ERR(kthread)) {
> +		printk(KERN_ERR "ksm: creating kthread failed\n");
> +		r = PTR_ERR(kthread);
> +		goto out_free2;
> +	}
> +
> +	r = misc_register(&ksm_dev);
> +	if (r) {
> +		printk(KERN_ERR "ksm: misc device register failed\n");
> +		goto out_free3;
> +	}
> +
> +	printk(KERN_WARNING "ksm loaded\n");
> +	return 0;
> +
> +out_free3:
> +	kthread_stop(kthread);
> +out_free2:
> +	rmap_hash_free();
> +out_free1:
> +	ksm_slab_free();
> +out:
> +	return r;
> +}
> +
> +static void __exit ksm_exit(void)
> +{
> +	misc_deregister(&ksm_dev);
> +	ksmd_flags = ksm_control_flags_run;
> +	kthread_stop(kthread);
> +	rmap_hash_free();
> +	ksm_slab_free();
> +}
> +
> +module_init(ksm_init)
> +module_exit(ksm_exit)
> -- 
> 1.5.6.5
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Izik Eidus March 31, 2009, 12:21 p.m. UTC | #3
KAMEZAWA Hiroyuki wrote:
> On Tue, 31 Mar 2009 02:59:20 +0300
> Izik Eidus <ieidus@redhat.com> wrote:
>
>   
>> Ksm is driver that allow merging identical pages between one or more
>> applications in way unvisible to the application that use it.
>> Pages that are merged are marked as readonly and are COWed when any
>> application try to change them.
>>
>> Ksm is used for cases where using fork() is not suitable,
>> one of this cases is where the pages of the application keep changing
>> dynamicly and the application cannot know in advance what pages are
>> going to be identical.
>>
>> Ksm works by walking over the memory pages of the applications it
>> scan in order to find identical pages.
>> It uses a two sorted data strctures called stable and unstable trees
>> to find in effective way the identical pages.
>>
>> When ksm finds two identical pages, it marks them as readonly and merges
>> them into single one page,
>> after the pages are marked as readonly and merged into one page, linux
>> will treat this pages as normal copy_on_write pages and will fork them
>> when write access will happen to them.
>>
>> Ksm scan just memory areas that were registred to be scanned by it.
>>
>> Ksm api:
>>
>> KSM_GET_API_VERSION:
>> Give the userspace the api version of the module.
>>
>> KSM_CREATE_SHARED_MEMORY_AREA:
>> Create shared memory reagion fd, that latter allow the user to register
>> the memory region to scan by using:
>> KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION
>>
>> KSM_START_STOP_KTHREAD:
>> Return information about the kernel thread, the inforamtion is returned
>> using the ksm_kthread_info structure:
>> ksm_kthread_info:
>> __u32 sleep:
>>         number of microsecoends to sleep between each iteration of
>> scanning.
>>
>> __u32 pages_to_scan:
>>         number of pages to scan for each iteration of scanning.
>>
>> __u32 max_pages_to_merge:
>>         maximum number of pages to merge in each iteration of scanning
>>         (so even if there are still more pages to scan, we stop this
>> iteration)
>>
>> __u32 flags:
>>        flags to control ksmd (right now just ksm_control_flags_run
>> 			      available)
>>
>> KSM_REGISTER_MEMORY_REGION:
>> Register userspace virtual address range to be scanned by ksm.
>> This ioctl is using the ksm_memory_region structure:
>> ksm_memory_region:
>> __u32 npages;
>>          number of pages to share inside this memory region.
>> __u32 pad;
>> __u64 addr:
>>         the begining of the virtual address of this region.
>>
>> KSM_REMOVE_MEMORY_REGION:
>> Remove memory region from ksm.
>>
>> Signed-off-by: Izik Eidus <ieidus@redhat.com>
>> ---
>>  include/linux/ksm.h        |   69 +++
>>  include/linux/miscdevice.h |    1 +
>>  mm/Kconfig                 |    6 +
>>  mm/Makefile                |    1 +
>>  mm/ksm.c                   | 1431 ++++++++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 1508 insertions(+), 0 deletions(-)
>>  create mode 100644 include/linux/ksm.h
>>  create mode 100644 mm/ksm.c
>>
>> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
>> new file mode 100644
>> index 0000000..5776dce
>> --- /dev/null
>> +++ b/include/linux/ksm.h
>> @@ -0,0 +1,69 @@
>> +#ifndef __LINUX_KSM_H
>> +#define __LINUX_KSM_H
>> +
>> +/*
>> + * Userspace interface for /dev/ksm - kvm shared memory
>> + */
>> +
>> +#include <linux/types.h>
>> +#include <linux/ioctl.h>
>> +
>> +#include <asm/types.h>
>> +
>> +#define KSM_API_VERSION 1
>> +
>> +#define ksm_control_flags_run 1
>> +
>> +/* for KSM_REGISTER_MEMORY_REGION */
>> +struct ksm_memory_region {
>> +	__u32 npages; /* number of pages to share */
>> +	__u32 pad;
>> +	__u64 addr; /* the begining of the virtual address */
>> +        __u64 reserved_bits;
>> +};
>> +
>> +struct ksm_kthread_info {
>> +	__u32 sleep; /* number of microsecoends to sleep */
>> +	__u32 pages_to_scan; /* number of pages to scan */
>> +	__u32 flags; /* control flags */
>> +        __u32 pad;
>> +        __u64 reserved_bits;
>> +};
>> +
>> +#define KSMIO 0xAB
>> +
>> +/* ioctls for /dev/ksm */
>> +
>> +#define KSM_GET_API_VERSION              _IO(KSMIO,   0x00)
>> +/*
>> + * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory reagion fd
>> + */
>> +#define KSM_CREATE_SHARED_MEMORY_AREA    _IO(KSMIO,   0x01) /* return SMA fd */
>> +/*
>> + * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
>> + * (can stop the kernel thread from working by setting running = 0)
>> + */
>> +#define KSM_START_STOP_KTHREAD		 _IOW(KSMIO,  0x02,\
>> +					      struct ksm_kthread_info)
>> +/*
>> + * KSM_GET_INFO_KTHREAD - return information about the kernel thread
>> + * scanning speed.
>> + */
>> +#define KSM_GET_INFO_KTHREAD		 _IOW(KSMIO,  0x03,\
>> +					      struct ksm_kthread_info)
>> +
>> +
>> +/* ioctls for SMA fds */
>> +
>> +/*
>> + * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
>> + * scanned by kvm.
>> + */
>> +#define KSM_REGISTER_MEMORY_REGION       _IOW(KSMIO,  0x20,\
>> +					      struct ksm_memory_region)
>> +/*
>> + * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
>> + */
>> +#define KSM_REMOVE_MEMORY_REGION         _IO(KSMIO,   0x21)
>> +
>> +#endif
>> diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
>> index a820f81..6d4f8df 100644
>> --- a/include/linux/miscdevice.h
>> +++ b/include/linux/miscdevice.h
>> @@ -29,6 +29,7 @@
>>  #define HPET_MINOR		228
>>  #define FUSE_MINOR		229
>>  #define KVM_MINOR		232
>> +#define KSM_MINOR		233
>>  #define MISC_DYNAMIC_MINOR	255
>>  
>>  struct device;
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index a5b7781..2818223 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -216,3 +216,9 @@ config UNEVICTABLE_LRU
>>  
>>  config MMU_NOTIFIER
>>  	bool
>> +
>> +config KSM
>> +	tristate "Enable KSM for page sharing"
>> +	help
>> +	  Enable the KSM kernel module to allow page sharing of equal pages
>> +	  among different tasks.
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 72255be..e3bf7bf 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
>>  obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
>>  obj-$(CONFIG_SLOB) += slob.o
>>  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
>> +obj-$(CONFIG_KSM) += ksm.o
>>  obj-$(CONFIG_SLAB) += slab.o
>>  obj-$(CONFIG_SLUB) += slub.o
>>  obj-$(CONFIG_FAILSLAB) += failslab.o
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> new file mode 100644
>> index 0000000..eba4c09
>> --- /dev/null
>> +++ b/mm/ksm.c
>> @@ -0,0 +1,1431 @@
>> +/*
>> + * Memory merging driver for Linux
>> + *
>> + * This module enables dynamic sharing of identical pages found in different
>> + * memory areas, even if they are not shared by fork()
>> + *
>> + * Copyright (C) 2008 Red Hat, Inc.
>> + * Authors:
>> + *	Izik Eidus
>> + *	Andrea Arcangeli
>> + *	Chris Wright
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/errno.h>
>> +#include <linux/mm.h>
>> +#include <linux/fs.h>
>> +#include <linux/miscdevice.h>
>> +#include <linux/vmalloc.h>
>> +#include <linux/file.h>
>> +#include <linux/mman.h>
>> +#include <linux/sched.h>
>> +#include <linux/rwsem.h>
>> +#include <linux/pagemap.h>
>> +#include <linux/sched.h>
>> +#include <linux/rmap.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/jhash.h>
>> +#include <linux/delay.h>
>> +#include <linux/kthread.h>
>> +#include <linux/wait.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/random.h>
>> +#include <linux/slab.h>
>> +#include <linux/swap.h>
>> +#include <linux/rbtree.h>
>> +#include <linux/anon_inodes.h>
>> +#include <linux/ksm.h>
>> +
>> +#include <asm/tlbflush.h>
>> +
>> +MODULE_AUTHOR("Red Hat, Inc.");
>> +MODULE_LICENSE("GPL");
>> +
>> +static int rmap_hash_size;
>> +module_param(rmap_hash_size, int, 0);
>> +MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
>> +
>> +/*
>> + * ksm_mem_slot - hold information for an userspace scanning range
>> + * (the scanning for this region will be from addr untill addr +
>> + *  npages * PAGE_SIZE inside mm)
>> + */
>> +struct ksm_mem_slot {
>> +	struct list_head link;
>> +	struct list_head sma_link;
>> +	struct mm_struct *mm;
>> +	unsigned long addr;	/* the begining of the virtual address */
>> +	unsigned npages;	/* number of pages to share */
>> +};
>> +
>> +/*
>> + * ksm_sma - shared memory area, each process have its own sma that contain the
>> + * information about the slots that it own
>> + */
>> +struct ksm_sma {
>> +	struct list_head sma_slots;
>> +};
>> +
>> +/**
>> + * struct ksm_scan - cursor for scanning
>> + * @slot_index: the current slot we are scanning
>> + * @page_index: the page inside the sma that is currently being scanned
>> + *
>> + * ksm uses it to know what are the next pages it need to scan
>> + */
>> +struct ksm_scan {
>> +	struct ksm_mem_slot *slot_index;
>> +	unsigned long page_index;
>> +};
>> +
>> +/*
>> + * A few notes about ksm scanning progress (to make it easier to understand
>> + * the data structures below):
>> + *
>> + * In order to reduce excessive scanning, ksm sorts the memory pages by their
>> + * contents into a data structure that holds pointers into the pages.
>> + *
>> + * Since the contents of the pages may change at any moment, ksm cannot just
>> + * insert the pages into a normal sorted tree and expect it to find anything.
>> + *
>> + * For this purpose ksm uses two data structures - the stable and unstable
>> + * trees.  The stable tree holds pointers to all the merged pages (KsmPage),
>> + * sorted by their contents; because each such page has to be
>> + * write-protected, searching this tree is fully assured to work and
>> + * therefore this tree is called the stable tree.
>> + *
>> + * In addition to the stable tree, ksm uses another data structure called the
>> + * unstable tree: this tree holds pointers to pages that have
>> + * been found to be "unchanged for a period of time".  The unstable tree
>> + * sorts these pages by their contents, but since these pages are not
>> + * write-protected, ksm cannot trust the unstable tree to be fully assured
>> + * to work.
>> + * Because the unstable tree would become corrupted when some of the pages
>> + * inside it change, the tree is called unstable.
>> + * Ksm copes with this problem in several ways:
>> + * 1) The unstable tree gets flushed every time ksm finishes scanning the
>> + *    whole memory, and then the tree is rebuilt from the beginning.
>> + * 2) Ksm will only insert into the unstable tree pages whose hash value
>> + *    has not changed during a whole circular scan of the memory.
>> + * 3) The unstable tree is a RedBlack Tree - its balancing is based on
>> + *    the colors of the nodes and not on their contents; this assures that
>> + *    even when the tree gets "corrupted" it won't get out of balance and
>> + *    the scanning time stays the same.  In addition, searching and
>> + *    inserting nodes in an rbtree use the same algorithm, so there is no
>> + *    overhead when we flush the tree and rebuild it.
>> + * 4) Ksm never flushes the stable tree, which means that even if it took
>> + *    10 attempts to find a page in the unstable tree, once it is found it
>> + *    is secured in the stable tree.
>> + *    (When we scan a new page, we first compare it against the stable
>> + *     tree, and then against the unstable tree.)
>> + */
>> +
>> +struct rmap_item;
>> +
>> +/*
>> + * tree_item - object of the stable and unstable trees
>> + * (a single rb-tree node; @rmap_item points to the first reverse-mapping
>> + *  item of the page this node represents)
>> + */
>> +struct tree_item {
>> +	struct rb_node node;
>> +	struct rmap_item *rmap_item;
>> +};
>> +
>> +/*
>> + * rmap_item - object of the rmap_hash hash table
>> + * (it holds the previous hash value (oldchecksum)
>> + *  and a pointer into the tree_item of the stable/unstable tree)
>> + */
>> +
>> +/**
>> + * struct rmap_item - reverse mapping item for virtual addresses
>> + * @link: link into the rmap_hash hash table.
>> + * @mm: the memory structure the rmap_item is pointing to.
>> + * @address: the virtual address the rmap_item is pointing to.
>> + * @oldchecksum: old checksum result for the page belonging to the virtual
>> + *               address
>> + * @stable_tree: when 1 rmap_item is used for stable_tree, 0 unstable tree
>> + * @tree_item: pointer into the stable/unstable tree that holds the virtual
>> + *             address that the rmap_item is pointing to.
>> + * @next: the next rmap_item found inside the same node of the
>> + *        stable/unstable tree.
>> + * @prev: the previous rmap_item inside the same tree node.
>> + */
>> +
>> +struct rmap_item {
>> +	struct hlist_node link;
>> +	struct mm_struct *mm;
>> +	unsigned long address;
>> +	unsigned int oldchecksum; /* old checksum value */
>> +	unsigned char stable_tree; /* 1 stable_tree 0 unstable tree */
>> +	struct tree_item *tree_item;
>> +	struct rmap_item *next;
>> +	struct rmap_item *prev;
>> +};
>> +
>> +/*
>> + * slots is a linked list that holds all the memory regions that were
>> + * registered to be scanned.
>> + */
>> +static LIST_HEAD(slots);
>> +/*
>> + * slots_lock protects against removing and adding memory regions while a
>> + * scanner is in the middle of scanning.
>> + */
>> +static DECLARE_RWSEM(slots_lock);
>> +
>> +/* The stable and unstable trees heads. */
>> +struct rb_root root_stable_tree = RB_ROOT;
>> +struct rb_root root_unstable_tree = RB_ROOT;
>> +
>> +
>> +/* Number of buckets in the rmap_hash table (array size / modulo base) */
>> +static int nrmaps_hash;
>> +/* rmap_hash hash table: virtual address -> rmap_item chains */
>> +static struct hlist_head *rmap_hash;
>> +
>> +static struct kmem_cache *tree_item_cache;
>> +static struct kmem_cache *rmap_item_cache;
>> +
>> +/* ksmd kernel thread state and tunables */
>> +static int kthread_sleep; /* sleep time of the kernel thread */
>> +static int kthread_pages_to_scan; /* npages to scan for the kernel thread */
>> +static struct ksm_scan kthread_ksm_scan;
>> +static int ksmd_flags;
>> +static struct task_struct *kthread;
>> +static DECLARE_WAIT_QUEUE_HEAD(kthread_wait);
>> +static DECLARE_RWSEM(kthread_lock);
>> +
>> +/*
>> + * Create the slab caches for tree_item and rmap_item objects.
>> + * Returns 0 on success, -ENOMEM when either cache cannot be created
>> + * (leaving no cache allocated behind).
>> + */
>> +static int ksm_slab_init(void)
>> +{
>> +	tree_item_cache = KMEM_CACHE(tree_item, 0);
>> +	if (!tree_item_cache)
>> +		return -ENOMEM;
>> +
>> +	rmap_item_cache = KMEM_CACHE(rmap_item, 0);
>> +	if (!rmap_item_cache) {
>> +		/* undo the first allocation before failing */
>> +		kmem_cache_destroy(tree_item_cache);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +/* Tear down both slab caches created by ksm_slab_init(). */
>> +static void ksm_slab_free(void)
>> +{
>> +	kmem_cache_destroy(rmap_item_cache);
>> +	kmem_cache_destroy(tree_item_cache);
>> +}
>> +
>> +/* Allocate a zeroed tree_item; may sleep (GFP_KERNEL). */
>> +static inline struct tree_item *alloc_tree_item(void)
>> +{
>> +	return kmem_cache_zalloc(tree_item_cache, GFP_KERNEL);
>> +}
>> +
>> +static void free_tree_item(struct tree_item *tree_item)
>> +{
>> +	kmem_cache_free(tree_item_cache, tree_item);
>> +}
>> +
>> +/* Allocate a zeroed rmap_item; may sleep (GFP_KERNEL). */
>> +static inline struct rmap_item *alloc_rmap_item(void)
>> +{
>> +	return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
>> +}
>> +
>> +static inline void free_rmap_item(struct rmap_item *rmap_item)
>> +{
>> +	kmem_cache_free(rmap_item_cache, rmap_item);
>> +}
>> +
>> +/*
>> + * PageKsm - this type of page is the write-protected page that ksm maps
>> + * into multiple vmas (this is the "shared page").
>> + * This page was allocated using alloc_page(), and every pte that points to
>> + * it is always write protected (therefore its data content cannot ever be
>> + * changed) and this page cannot be swapped.
>> + */
>> +static inline int PageKsm(struct page *page)
>> +{
>> +	/*
>> +	 * When ksm creates a new shared page, it creates a kernel-allocated
>> +	 * page using alloc_page(), therefore this page is not anonymous;
>> +	 * taking into account that ksm scans just anonymous pages, we can
>> +	 * rely on the fact that each time we see !PageAnon(page) we are
>> +	 * hitting a shared page.
>> +	 */
>> +	return !PageAnon(page);
>> +}
>> +
>> +/*
>> + * Allocate and zero the rmap_hash table.
>> + * When no size was requested, default to one bucket per ten pages of ram.
>> + * Returns 0 on success, -ENOMEM when the table cannot be allocated.
>> + */
>> +static int rmap_hash_init(void)
>> +{
>> +	if (!rmap_hash_size) {
>> +		struct sysinfo sinfo;
>> +
>> +		si_meminfo(&sinfo);
>> +		rmap_hash_size = sinfo.totalram / 10;
>> +		/*
>> +		 * Guarantee at least one bucket: with totalram < 10 the
>> +		 * division yields 0, and nrmaps_hash == 0 would make the
>> +		 * "addr % nrmaps_hash" in get_rmap_item() divide by zero.
>> +		 */
>> +		if (!rmap_hash_size)
>> +			rmap_hash_size = 1;
>> +	}
>> +	nrmaps_hash = rmap_hash_size;
>> +	rmap_hash = vmalloc(nrmaps_hash * sizeof(struct hlist_head));
>> +	if (!rmap_hash)
>> +		return -ENOMEM;
>> +	memset(rmap_hash, 0, nrmaps_hash * sizeof(struct hlist_head));
>> +	return 0;
>> +}
>> +
>> +/* Free every rmap_item still linked in the hash table, then the table. */
>> +static void rmap_hash_free(void)
>> +{
>> +	int i;
>> +	struct hlist_head *bucket;
>> +	struct hlist_node *node, *n;
>> +	struct rmap_item *rmap_item;
>> +
>> +	for (i = 0; i < nrmaps_hash; ++i) {
>> +		bucket = &rmap_hash[i];
>> +		hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
>> +			hlist_del(&rmap_item->link);
>> +			free_rmap_item(rmap_item);
>> +		}
>> +	}
>> +	vfree(rmap_hash);
>> +}
>> +
>> +/*
>> + * Hash the full contents of a page with jhash (fixed seed 17).
>> + * Used to detect whether a page's contents changed between scans
>> + * (compared against rmap_item->oldchecksum).
>> + */
>> +static inline u32 calc_checksum(struct page *page)
>> +{
>> +	u32 checksum;
>> +	void *addr = kmap_atomic(page, KM_USER0);
>> +	checksum = jhash(addr, PAGE_SIZE, 17);
>> +	kunmap_atomic(addr, KM_USER0);
>> +	return checksum;
>> +}
>> +
>> +/*
>> + * Return the rmap_item tracking the (mm, addr) pair, or NULL when that
>> + * address was never inserted into the rmap_hash table.
>> + */
>> +static struct rmap_item *get_rmap_item(struct mm_struct *mm, unsigned long addr)
>> +{
>> +	struct rmap_item *rmap_item;
>> +	struct hlist_head *bucket;
>> +	struct hlist_node *node;
>> +
>> +	bucket = &rmap_hash[addr % nrmaps_hash];
>> +	hlist_for_each_entry(rmap_item, node, bucket, link) {
>> +		/* same bucket may hold equal addresses of different mms */
>> +		if (mm == rmap_item->mm && rmap_item->address == addr)
>> +			return rmap_item;
>> +	}
>> +	return NULL;
>> +}
>> +
>> +/*
>> + * Removing rmap_item from stable or unstable tree.
>> + * This function will free the rmap_item object, and if that rmap_item was
>> + * inside the stable or unstable tree, it will remove the link from there
>> + * as well.
>> + */
>> +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
>> +{
>> +	struct tree_item *tree_item;
>> +
>> +	tree_item = rmap_item->tree_item;
>> +	rmap_item->tree_item = NULL;
>> +
>> +	if (rmap_item->stable_tree) {
>> +		/* unlink from the doubly linked list of sharers */
>> +		if (rmap_item->prev) {
>> +			BUG_ON(rmap_item->prev->next != rmap_item);
>> +			rmap_item->prev->next = rmap_item->next;
>> +		}
>> +		if (rmap_item->next) {
>> +			BUG_ON(rmap_item->next->prev != rmap_item);
>> +			rmap_item->next->prev = rmap_item->prev;
>> +		}
>> +	}
>> +
>> +	if (tree_item) {
>> +		if (rmap_item->stable_tree) {
>> +			if (!rmap_item->next && !rmap_item->prev) {
>> +				/* last sharer: drop the whole tree node */
>> +				rb_erase(&tree_item->node, &root_stable_tree);
>> +				free_tree_item(tree_item);
>> +			} else if (!rmap_item->prev) {
>> +				tree_item->rmap_item = rmap_item->next;
>> +			} else {
>> +				tree_item->rmap_item = rmap_item->prev;
>> +			}
>> +		} else {
>> +			/*
>> +			 * Unstable tree nodes are not rb_erase()d here: the
>> +			 * whole unstable tree is flushed on every full scan
>> +			 * (see remove_mm_from_hash_and_tree()), so freeing
>> +			 * the tree_item itself is enough.
>> +			 */
>> +			free_tree_item(tree_item);
>> +		}
>> +	}
>> +
>> +	hlist_del(&rmap_item->link);
>> +	free_rmap_item(rmap_item);
>> +}
>> +
>> +/*
>> + * Remove the rmap_item tracking (mm, addr) - if any - from the hash table
>> + * and from the stable/unstable tree.
>> + */
>> +static void remove_page_from_tree(struct mm_struct *mm,
>> +				  unsigned long addr)
>> +{
>> +	struct rmap_item *rmap_item;
>> +
>> +	rmap_item = get_rmap_item(mm, addr);
>> +	if (!rmap_item)
>> +		return;
>> +	remove_rmap_item_from_tree(rmap_item);
>> +}
>> +
>> +/*
>> + * Register a region of the calling task's address space for scanning.
>> + * Takes a reference on current->mm; it is dropped when the region is
>> + * removed again (see ksm_sma_ioctl_remove_memory_region()).
>> + *
>> + * Returns 0 on success, -ENOMEM on allocation failure and -EPERM when
>> + * the caller has no mm to register.
>> + */
>> +static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
>> +						struct ksm_memory_region *mem)
>> +{
>> +	struct ksm_mem_slot *slot;
>> +	int ret = -EPERM;
>> +
>> +	slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
>> +	if (!slot) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	slot->mm = get_task_mm(current);
>> +	if (!slot->mm)
>> +		goto out_free;
>> +	/*
>> +	 * NOTE(review): addr/npages come straight from userspace and are not
>> +	 * validated here - confirm the scanner tolerates bogus ranges.
>> +	 */
>> +	slot->addr = mem->addr;
>> +	slot->npages = mem->npages;
>> +
>> +	down_write(&slots_lock);
>> +
>> +	/* make the slot visible to the scanner and track it on this fd */
>> +	list_add_tail(&slot->link, &slots);
>> +	list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
>> +
>> +	up_write(&slots_lock);
>> +	return 0;
>> +
>> +out_free:
>> +	kfree(slot);
>> +out:
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Drop every page of @mm's slot from the rmap hash and the trees, and
>> + * unlink the slot from the global scan list.  The unstable tree is reset
>> + * wholesale since it may hold pointers into the pages going away.
>> + *
>> + * Caller must hold slots_lock for writing, and @mm must have a slot on
>> + * the global list.
>> + */
>> +static void remove_mm_from_hash_and_tree(struct mm_struct *mm)
>> +{
>> +	struct ksm_mem_slot *slot, *found = NULL;
>> +	int pages_count;
>> +
>> +	/*
>> +	 * After a completed list_for_each_entry the cursor points at the
>> +	 * head container, never NULL, so the original BUG_ON(!slot) could
>> +	 * not fire; track the match explicitly instead.
>> +	 */
>> +	list_for_each_entry(slot, &slots, link) {
>> +		if (slot->mm == mm) {
>> +			found = slot;
>> +			break;
>> +		}
>> +	}
>> +	BUG_ON(!found);
>> +
>> +	root_unstable_tree = RB_ROOT;
>> +	for (pages_count = 0; pages_count < found->npages; ++pages_count)
>> +		remove_page_from_tree(mm, found->addr +
>> +				      pages_count * PAGE_SIZE);
>> +	list_del(&found->link);
>> +}
>> +
>> +/*
>> + * Unregister every region registered through this fd: remove each slot's
>> + * pages from the hash table and trees, drop the mm reference taken at
>> + * registration time, and free the slot.  Always returns 0.
>> + */
>> +static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
>> +{
>> +	struct ksm_mem_slot *slot, *node;
>> +
>> +	down_write(&slots_lock);
>> +	list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
>> +		remove_mm_from_hash_and_tree(slot->mm);
>> +		mmput(slot->mm);
>> +		list_del(&slot->sma_link);
>> +		kfree(slot);
>> +	}
>> +	up_write(&slots_lock);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * fd release callback: unregister everything registered through this
>> + * file and free the per-fd context.
>> + */
>> +static int ksm_sma_release(struct inode *inode, struct file *filp)
>> +{
>> +	struct ksm_sma *ksm_sma = filp->private_data;
>> +	int r;
>> +
>> +	r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
>> +	kfree(ksm_sma);
>> +	return r;
>> +}
>> +
>> +/*
>> + * ioctl dispatcher for a shared-memory-area fd.
>> + * Returns 0 on success or a negative errno: -EINVAL for unknown ioctls,
>> + * -EFAULT when the argument cannot be copied from userspace.
>> + */
>> +static long ksm_sma_ioctl(struct file *filp,
>> +			  unsigned int ioctl, unsigned long arg)
>> +{
>> +	struct ksm_sma *sma = filp->private_data;
>> +	void __user *argp = (void __user *)arg;
>> +	int r = -EINVAL;	/* was "EINVAL": ioctls must return negative errno */
>> +
>> +	switch (ioctl) {
>> +	case KSM_REGISTER_MEMORY_REGION: {
>> +		struct ksm_memory_region ksm_memory_region;
>> +
>> +		r = -EFAULT;
>> +		if (copy_from_user(&ksm_memory_region, argp,
>> +				   sizeof(ksm_memory_region)))
>> +			goto out;
>> +		r = ksm_sma_ioctl_register_memory_region(sma,
>> +							 &ksm_memory_region);
>> +		break;
>> +	}
>> +	case KSM_REMOVE_MEMORY_REGION:
>> +		r = ksm_sma_ioctl_remove_memory_region(sma);
>> +		break;
>> +	}
>> +
>> +out:
>> +	return r;
>> +}
>> +
>> +/*
>> + * Compute the virtual address at which @page is mapped inside @vma,
>> + * derived from the page's index in its mapping.
>> + * Returns -EFAULT (encoded in the unsigned return value; callers compare
>> + * with == -EFAULT) when the computed address falls outside the vma.
>> + */
>> +static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
>> +{
>> +	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
>> +	unsigned long addr;
>> +
>> +	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
>> +	if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
>> +		return -EFAULT;
>> +	return addr;
>> +}
>> +
>> +/*
>> + * Walk the page tables of @mm and return a mapped pte pointer for @addr,
>> + * or NULL when some level of the walk is not present.
>> + * A non-NULL result must be released with pte_unmap() by the caller.
>> + */
>> +static pte_t *get_pte(struct mm_struct *mm, unsigned long addr)
>> +{
>> +	pgd_t *pgd;
>> +	pud_t *pud;
>> +	pmd_t *pmd;
>> +	pte_t *ptep = NULL;
>> +
>> +	pgd = pgd_offset(mm, addr);
>> +	if (!pgd_present(*pgd))
>> +		goto out;
>> +
>> +	pud = pud_offset(pgd, addr);
>> +	if (!pud_present(*pud))
>> +		goto out;
>> +
>> +	pmd = pmd_offset(pud, addr);
>> +	if (!pmd_present(*pmd))
>> +		goto out;
>> +
>> +	ptep = pte_offset_map(pmd, addr);
>> +out:
>> +	return ptep;
>> +}
>> +
>> +/* Return nonzero when @addr has a present pte in @mm, 0 otherwise. */
>> +static int is_present_pte(struct mm_struct *mm, unsigned long addr)
>> +{
>> +	pte_t *ptep;
>> +	int r;
>> +
>> +	ptep = get_pte(mm, addr);
>> +	if (!ptep)
>> +		return 0;
>> +
>> +	r = pte_present(*ptep);
>> +	pte_unmap(ptep);
>> +
>> +	return r;
>> +}
>> +
>> +/*
>> + * memcmp() the full contents of two pages through atomic kmaps.
>> + * Returns 0 when the pages are byte-for-byte identical.
>> + */
>> +static int memcmp_pages(struct page *page1, struct page *page2)
>> +{
>> +	char *addr1, *addr2;
>> +	int r;
>> +
>> +	addr1 = kmap_atomic(page1, KM_USER0);
>> +	addr2 = kmap_atomic(page2, KM_USER1);
>> +	r = memcmp(addr1, addr2, PAGE_SIZE);
>> +	kunmap_atomic(addr1, KM_USER0);
>> +	kunmap_atomic(addr2, KM_USER1);
>> +	return r;
>> +}
>> +
>> +/* pages_identical
>> + * return 1 if identical, 0 otherwise.
>> + */
>> +static inline int pages_identical(struct page *page1, struct page *page2)
>> +{
>> +	return !memcmp_pages(page1, page2);
>> +}
>> +
>> +/*
>> + * try_to_merge_one_page - take two pages and merge them into one
>> + * @mm: mm_struct that holds the vma pointing into oldpage
>> + * @vma: the vma that holds the pte pointing into oldpage
>> + * @oldpage: the page that we want to replace with newpage
>> + * @newpage: the page that we want to map instead of oldpage
>> + * @newprot: the new permission of the pte inside vma
>> + * note:
>> + * oldpage should be an anon page while newpage should be a file mapped page
>> + *
>> + * this function returns 0 if the pages were merged, 1 otherwise.
>> + */
>> +static int try_to_merge_one_page(struct mm_struct *mm,
>> +				 struct vm_area_struct *vma,
>> +				 struct page *oldpage,
>> +				 struct page *newpage,
>> +				 pgprot_t newprot)
>> +{
>> +	int ret = 1;
>> +	int odirect_sync;
>> +	unsigned long page_addr_in_vma;
>> +	pte_t orig_pte, *orig_ptep;
>> +
>> +	/* hold both pages across the pte manipulation below */
>> +	get_page(newpage);
>> +	get_page(oldpage);
>> +
>> +	down_read(&mm->mmap_sem);
>> +
>> +	page_addr_in_vma = addr_in_vma(vma, oldpage);
>> +	if (page_addr_in_vma == -EFAULT)
>> +		goto out_unlock;
>> +
>> +	orig_ptep = get_pte(mm, page_addr_in_vma);
>> +	if (!orig_ptep)
>> +		goto out_unlock;
>> +	orig_pte = *orig_ptep;
>> +	pte_unmap(orig_ptep);
>> +	if (!pte_present(orig_pte))
>> +		goto out_unlock;
>> +	/* bail out if the pte no longer maps the page we were given */
>> +	if (page_to_pfn(oldpage) != pte_pfn(orig_pte))
>> +		goto out_unlock;
>> +	/*
>> +	 * we need the page lock to read a stable PageSwapCache in
>> +	 * page_wrprotect()
>> +	 */
>> +	if (!trylock_page(oldpage))
>> +		goto out_unlock;
>> +	/*
>> +	 * page_wrprotect checks if the page is swapped or in swap cache,
>> +	 * in the future we might want to run here if_present_pte and then
>> +	 * swap_free
>> +	 */
>> +	if (!page_wrprotect(oldpage, &odirect_sync, 2)) {
>> +		unlock_page(oldpage);
>> +		goto out_unlock;
>> +	}
>> +	unlock_page(oldpage);
>> +	/*
>> +	 * !odirect_sync presumably means the page may still be written by
>> +	 * an in-flight direct-IO transfer, so merging is unsafe - confirm
>> +	 * against page_wrprotect()'s contract.
>> +	 */
>> +	if (!odirect_sync)
>> +		goto out_unlock;
>> +
>> +	orig_pte = pte_wrprotect(orig_pte);
>> +
>> +	/* only replace the pte if the contents are (still) identical */
>> +	if (pages_identical(oldpage, newpage))
>> +		ret = replace_page(vma, oldpage, newpage, orig_pte, newprot);
>> +
>> +out_unlock:
>> +	up_read(&mm->mmap_sem);
>> +	put_page(oldpage);
>> +	put_page(newpage);
>> +	return ret;
>> +}
>> +
>> +/*
>> + * try_to_merge_two_pages - take two identical pages and prepare them to be
>> + * merged into one page.
>> + *
>> + * this function returns 0 if we successfully mapped two identical pages
>> + * into one page, 1 otherwise.
>> + * (note in case we created a KsmPage and mapped one page into it but the
>> + *  second page was not mapped we consider it as a failure and return 1)
>> + */
>> +static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
>> +				  struct mm_struct *mm2, struct page *page2,
>> +				  unsigned long addr1, unsigned long addr2)
>> +{
>> +	struct vm_area_struct *vma;
>> +	pgprot_t prot;
>> +	int ret = 1;
>> +
>> +	/*
>> +	 * If page2 isn't shared (it isn't PageKsm) we have to allocate a new
>> +	 * file mapped page and make the two ptes of mm1(page1) and mm2(page2)
>> +	 * point to it.  If page2 is shared, we can just make the pte of
>> +	 * mm1(page1) point to page2
>> +	 */
>> +	if (PageKsm(page2)) {
>> +		down_read(&mm1->mmap_sem);
>> +		vma = find_vma(mm1, addr1);
>> +		up_read(&mm1->mmap_sem);
>> +		if (!vma)
>> +			return ret;
>> +		/* map page1 read-only behind the existing shared page */
>> +		prot = vma->vm_page_prot;
>> +		pgprot_val(prot) &= ~_PAGE_RW;
>> +		ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
>> +	} else {
>> +		struct page *kpage;
>> +
>> +		kpage = alloc_page(GFP_HIGHUSER);
>> +		if (!kpage)
>> +			return ret;
>> +		down_read(&mm1->mmap_sem);
>> +		vma = find_vma(mm1, addr1);
>> +		up_read(&mm1->mmap_sem);
>> +		if (!vma) {
>> +			put_page(kpage);
>> +			return ret;
>> +		}
>> +		prot = vma->vm_page_prot;
>> +		pgprot_val(prot) &= ~_PAGE_RW;
>> +
>> +		copy_user_highpage(kpage, page1, addr1, vma);
>> +		ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
>> +
>> +		if (!ret) {
>> +			down_read(&mm2->mmap_sem);
>> +			vma = find_vma(mm2, addr2);
>> +			up_read(&mm2->mmap_sem);
>> +			if (!vma) {
>> +				/*
>> +				 * NOTE(review): unlike the failure path
>> +				 * below, mm1's mapping of kpage is not
>> +				 * broken here - confirm that leaving a
>> +				 * single-pte KsmPage is intended.
>> +				 */
>> +				put_page(kpage);
>> +				ret = 1;
>> +				return ret;
>> +			}
>> +
>> +			prot = vma->vm_page_prot;
>> +			pgprot_val(prot) &= ~_PAGE_RW;
>> +
>> +			ret = try_to_merge_one_page(mm2, vma, page2, kpage,
>> +						    prot);
>> +			/*
>> +			 * If the second try_to_merge_one_page call failed,
>> +			 * we are in a situation where we have a Ksm page
>> +			 * that has just one pte pointing to it, in this
>> +			 * case we break it.
>> +			 */
>> +			if (ret) {
>> +				struct page *tmppage[1];
>> +
>> +				/* write-fault addr1 to COW-break kpage */
>> +				down_read(&mm1->mmap_sem);
>> +				if (get_user_pages(current, mm1, addr1, 1, 1,
>> +						    0, tmppage, NULL)) {
>> +					put_page(tmppage[0]);
>> +				}
>> +				up_read(&mm1->mmap_sem);
>> +			}
>> +		}
>> +		put_page(kpage);
>> +	}
>> +	return ret;
>> +}
>>     
>
> I'm sorry if I'm wrong. Is the above "kpage" is free from global LRU and never be
> reclaimed(swapped-out) by global LRU ?
>   
kpage is actually what is going to become the KsmPage -> the shared page...

Right now these pages are not swappable...; after ksm is merged we
will make these pages swappable as well...

> If so, please
>  - show the amount of kpage
>  
>  - allow users to set limit for usage of kpages. or preserve kpages at boot or
>    by user's command.
>   

kpage actually saves memory..., and limiting the number of them would
mean limiting the number of shared pages...

> Thanks,
> -Kame
>
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Izik Eidus March 31, 2009, 12:24 p.m. UTC | #4
Anthony Liguori wrote:
> Izik Eidus wrote:
>> Ksm is driver that allow merging identical pages between one or more
>> applications in way unvisible to the application that use it.
>> Pages that are merged are marked as readonly and are COWed when any
>> application try to change them.
>>
>> Ksm is used for cases where using fork() is not suitable,
>> one of this cases is where the pages of the application keep changing
>> dynamicly and the application cannot know in advance what pages are
>> going to be identical.
>>
>> Ksm works by walking over the memory pages of the applications it
>> scan in order to find identical pages.
>> It uses a two sorted data strctures called stable and unstable trees
>> to find in effective way the identical pages.
>>
>> When ksm finds two identical pages, it marks them as readonly and merges
>> them into single one page,
>> after the pages are marked as readonly and merged into one page, linux
>> will treat this pages as normal copy_on_write pages and will fork them
>> when write access will happen to them.
>>
>> Ksm scan just memory areas that were registred to be scanned by it.
>>
>> Ksm api:
>>
>> KSM_GET_API_VERSION:
>> Give the userspace the api version of the module.
>>
>> KSM_CREATE_SHARED_MEMORY_AREA:
>> Create shared memory reagion fd, that latter allow the user to register
>> the memory region to scan by using:
>> KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION
>>
>> KSM_START_STOP_KTHREAD:
>> Return information about the kernel thread, the inforamtion is returned
>> using the ksm_kthread_info structure:
>> ksm_kthread_info:
>> __u32 sleep:
>>         number of microsecoends to sleep between each iteration of
>> scanning.
>>
>> __u32 pages_to_scan:
>>         number of pages to scan for each iteration of scanning.
>>
>> __u32 max_pages_to_merge:
>>         maximum number of pages to merge in each iteration of scanning
>>         (so even if there are still more pages to scan, we stop this
>> iteration)
>>
>> __u32 flags:
>>        flags to control ksmd (right now just ksm_control_flags_run
>>                   available)
>>   
>
> Wouldn't this make more sense as a sysfs interface?

I believe using an ioctl for registering the memory of applications makes
it easier....
Ksm doesn't have any complicated API that would benefit from sysfs
(besides adding more complexity)

> That is, the KSM_START_STOP_KTHREAD part, not necessarily the rest of 
> the API.

What you mean?
>
> Regards,
>
> Anthony Liguori
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori March 31, 2009, 1:31 p.m. UTC | #5
Izik Eidus wrote:
>
> I belive using ioctl for registering memory of applications make it 
> easier....

Yes, I completely agree.

> Ksm doesnt have any complicated API that would benefit from sysfs 
> (beside adding more complexity)
>
>> That is, the KSM_START_STOP_KTHREAD part, not necessarily the rest of 
>> the API.
>
> What you mean?

The ioctl(KSM_START_STOP_KTHREAD) API is distinct from the rest of the 
API.  Whereas the rest of the API is used by applications to register 
their memory with KSM, this API is used by ksmctl to allow parameters to 
be tweaked in userspace.

These parameters are just simple values like enable, pages_to_scan, 
sleep_time.  Then there is KSM_GET_INFO_KTHREAD which provides a read 
interface to these parameters.

You could drop KSM_START_STOP_KTHREAD and KSM_GET_INFO_KTHREAD 
altogether, and introduce a sysfs hierarchy:

/sysfs/<some/path>/ksm/{enable,pages_to_scan,sleep_time}

That eliminates the need for ksmctl altogether, cleanly separates the 
two APIs, and provides a stronger interface.

The main problem with the current API is that it uses a single device to 
do both the administrative task and the userspace interface.  That means 
that any application that has access to registering its memory with KSM 
also has the ability to disable KSM.  That seems like a security concern 
to me since registering a memory region ought to be an unprivileged 
action whereas enabling/disabling KSM ought to be a privileged action.

Regards,

Anthony Liguori

>>
>> Regards,
>>
>> Anthony Liguori
>>
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 2:25 p.m. UTC | #6
On Tue, Mar 31, 2009 at 08:31:31AM -0500, Anthony Liguori wrote:
> You could drop KSM_START_STOP_KTHREAD and KSM_GET_INFO_KTHREAD altogether, 
> and introduce a sysfs hierarchy:
>
> /sysfs/<some/path>/ksm/{enable,pages_to_scan,sleep_time}

Introducing a sysfs hierarchy sounds a bit of overkill.

> the ability to disable KSM.  That seems like a security concern to me since 
> registering a memory region ought to be an unprivileged action whereas 
> enabling/disabling KSM ought to be a privileged action.

sysfs files would then only be writeable by admin, so if we want to
allow only admin to start/stop/tune ksm it'd be enough to plug an
admin capability check in the ioctl to provide equivalent permissions.

I could imagine converting the enable/pages_to_scan/sleep_time to
module params and tweaking them through /sys/module/ksm/parameters,
but for "enable" to work that way, we'd need to intercept the write so
we can at least weakup the kksmd daemon, which doesn't seem possible
with /sys/module/ksm/parameters, so in the end if we stick to the
ioctl for registering regions, it seems simpler to use it for
start/stop/tune too.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori March 31, 2009, 2:37 p.m. UTC | #7
Andrea Arcangeli wrote:

>> the ability to disable KSM.  That seems like a security concern to me since 
>> registering a memory region ought to be an unprivileged action whereas 
>> enabling/disabling KSM ought to be a privileged action.
>>     
>
> sysfs files would then only be writeable by admin, so if we want to
> allow only admin to start/stop/tune ksm it'd be enough to plug an
> admin capability check in the ioctl to provide equivalent permissions.
>   

Caps are not very granular unless you introduce a new capability.  
Furthermore, it's a bit more difficult to associate a capability with a 
user/group.

With sysfs, you use file based permissions to control the API.  It also 
fits into things like selinux a lot better.

In the very least, if you insist on not using sysfs, you should have a 
separate character device that's used for control (like /dev/ksmctl).

Regards,

Anthony Liguori

> I could imagine converting the enable/pages_to_scan/sleep_time to
> module params and tweaking them through /sys/module/ksm/parameters,
> but for "enable" to work that way, we'd need to intercept the write so
> we can at least weakup the kksmd daemon, which doesn't seem possible
> with /sys/module/ksm/parameters, so in the end if we stick to the
> ioctl for registering regions, it seems simpler to use it for
> start/stop/tune too.
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 3:02 p.m. UTC | #8
On Tue, Mar 31, 2009 at 09:37:17AM -0500, Anthony Liguori wrote:
> In the very least, if you insist on not using sysfs, you should have a 
> separate character device that's used for control (like /dev/ksmctl).

I'm fine to use sysfs that's not the point, if you've to add a ksmctl
device, then sysfs is surely better. Besides ksm would normally be
enabled at boot, tasks jailed by selinux will better not start/stop
this thing.

If people wants /sys/kernel/mm/ksm instead of the start_stop ioctl we
surely can add it (provided there's a way to intercept write to the
sysfs file). Problem is registering memory could also be done with
'echo 0 -1 >/proc/self/ksm' and be inherited by childs, it's not just
start/stop. I mean this is more a matter of taste I'm
afraid... Personally I'm more concerned about the registering of the
ram API than the start/stop thing which I cannot care less about, so
my logic is that as long as this pseudodevice exists, we should use it
for everything. If we go away from it, then we should remove it as a
whole.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori March 31, 2009, 3:09 p.m. UTC | #9
Andrea Arcangeli wrote:
> On Tue, Mar 31, 2009 at 09:37:17AM -0500, Anthony Liguori wrote:
>   
>> In the very least, if you insist on not using sysfs, you should have a 
>> separate character device that's used for control (like /dev/ksmctl).
>>     
>
> I'm fine to use sysfs that's not the point, if you've to add a ksmctl
> device, then sysfs is surely better. Besides ksm would normally be
> enabled at boot, tasks jailed by selinux will better not start/stop
> this thing.
>
> If people wants /sys/kernel/mm/ksm instead of the start_stop ioctl we
> surely can add it (provided there's a way to intercept write to the
> sysfs file). Problem is registering memory could also be done with
> 'echo 0 -1 >/proc/self/ksm' and be inherited by childs, it's not just
> start/stop. I mean this is more a matter of taste I'm
> afraid... Personally I'm more concerned about the registering of the
> ram API than the start/stop thing which I cannot care less about,

I don't think the registering of ram should be done via sysfs.  That 
would be a pretty bad interface IMHO.  But I do think the functionality 
that ksmctl provides along with the security issues I mentioned earlier 
really suggest that there ought to be a separate API for control vs. 
registration and that control API would make a lot of sense as a sysfs API.

If you wanted to explore alternative APIs for registration, madvise() 
seems like the obvious candidate to me.

madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.

So combining a sysfs interface for control and an madvise() interface 
for registration seems like a really nice interface to me.

Regards,

Anthony Liguori

>  so
> my logic is that as long as this pseudodevice exists, we should use it
> for everything. If we go away from it, then we should remove it as a
> whole.
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 3:18 p.m. UTC | #10
On Tue, Mar 31, 2009 at 10:09:24AM -0500, Anthony Liguori wrote:
> I don't think the registering of ram should be done via sysfs.  That would 
> be a pretty bad interface IMHO.  But I do think the functionality that 
> ksmctl provides along with the security issues I mentioned earlier really 
> suggest that there ought to be a separate API for control vs. registration 
> and that control API would make a lot of sense as a sysfs API.
>
> If you wanted to explore alternative APIs for registration, madvise() seems 
> like the obvious candidate to me.
>
> madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.

madvise to me would sound appropriate, only if ksm would be always-in,
which is not the case as it won't even be built if it's configured to
N.

Besides madvise is sus covered syscall, and this is linux specific detail.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori March 31, 2009, 3:54 p.m. UTC | #11
Andrea Arcangeli wrote:
> On Tue, Mar 31, 2009 at 10:09:24AM -0500, Anthony Liguori wrote:
>   
>> I don't think the registering of ram should be done via sysfs.  That would 
>> be a pretty bad interface IMHO.  But I do think the functionality that 
>> ksmctl provides along with the security issues I mentioned earlier really 
>> suggest that there ought to be a separate API for control vs. registration 
>> and that control API would make a lot of sense as a sysfs API.
>>
>> If you wanted to explore alternative APIs for registration, madvise() seems 
>> like the obvious candidate to me.
>>
>> madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.
>>     
>
> madvise to me would sound appropriate, only if ksm would be always-in,
> which is not the case as it won't even be built if it's configured to
> N.
>   

You can still disable ksm and simply return ENOSYS for the MADV_ flag.  
You could even keep it as a module if you liked by separating the 
madvise bits from the ksm bits.  The madvise() bits could just provide 
the tracking infrastructure for determine which vmas were currently 
marked as sharable.

You could then have ksm as loadable module that consumed that interface 
to then perform scanning.

> Besides madvise is sus covered syscall, and this is linux specific detail.
>   

A number of MADV_ flags are Linux specific (like MADV_DOFORK/MADV_DONTFORK).

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 4:25 p.m. UTC | #12
On Tue, Mar 31, 2009 at 10:54:57AM -0500, Anthony Liguori wrote:
> You can still disable ksm and simply return ENOSYS for the MADV_ flag.  You 

-EINVAL if something, -ENOSYS would tell userland that it shall stop
trying to use madvise, including the other MADV_ too.

> could even keep it as a module if you liked by separating the madvise bits 
> from the ksm bits.  The madvise() bits could just provide the tracking 
> infrastructure for determine which vmas were currently marked as sharable.
> You could then have ksm as loadable module that consumed that interface to 
> then perform scanning.

What's the point of making ksm a module if one has part of ksm code
loaded in the kernel and not being possible to avoid compiling in?
People that says KSM=N in their .config (like embedded running with 1M
of ram), don't want that tracking overhead compiled into the kernel.

Returning -EINVAL would be an option but again I think madvise is core
syscall for SuS and I don't like that those core VM parts returns
-EINVAL at will depend on certain kernel modules being loaded.

> A number of MADV_ flags are Linux specific (like 
> MADV_DOFORK/MADV_DONTFORK).

But those aren't kernel module related, so they're in line with the
standard ones and could be adapted by other OS.

KSM is not a core VM functionality, madvise is a core VM
functionality, so I don't see fit. KSM as ioctl or KSM creating
/proc/<pid>/ksm when loaded, sounds fine to me instead. If open of
either one fails, application won't register in. It's up to you to
choose KSM=M/N, if you want it as core functionality just build as
KSM=Y but leave the option to others to save memory.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori March 31, 2009, 4:51 p.m. UTC | #13
Andrea Arcangeli wrote:
> On Tue, Mar 31, 2009 at 10:54:57AM -0500, Anthony Liguori wrote:
>   
>> You can still disable ksm and simply return ENOSYS for the MADV_ flag.  You 
>>     
>
> -EINVAL if something, -ENOSYS would tell userland that it shall stop
> trying to use madvise, including the other MADV_ too.
>
>   
>> could even keep it as a module if you liked by separating the madvise bits 
>> from the ksm bits.  The madvise() bits could just provide the tracking 
>> infrastructure for determine which vmas were currently marked as sharable.
>> You could then have ksm as loadable module that consumed that interface to 
>> then perform scanning.
>>     
>
> What's the point of making ksm a module if one has part of ksm code
> loaded in the kernel and not being possible to avoid compiling in?
> People that says KSM=N in their .config (like embedded running with 1M
> of ram), don't want that tracking overhead compiled into the kernel.
>   

You have two things here.  CONFIG_MEM_SHARABLE and CONFIG_KSM.  
CONFIG_MEM_SHARABLE cannot be a module. If it's set to =n, then 
madvise(MADV_SHARABLE) == -ENOSYS.

If CONFIG_MEM_SHARABLE=y, then madvise(MADV_SHARABLE) will keep track of 
all sharable memory regions.  Independently of that, CONFIG_KSM can be 
set to n,m,y.  It depends on CONFIG_MEM_SHARABLE and when it's loaded, 
it consumes the list of sharable vmas.

But honestly, CONFIG_MEM_SHARABLE shouldn't be a lot of code, so I don't see 
why you'd even need to make it configurable.

>> A number of MADV_ flags are Linux specific (like 
>> MADV_DOFORK/MADV_DONTFORK).
>>     
>
> But those aren't kernel module related, so they're in line with the
> standard ones and could be adapted by other OS.
>
> KSM is not a core VM functionality, madvise is a core VM
> functionality, so I don't see fit. KSM as ioctl or KSM creating
> /proc/<pid>/ksm when loaded, sounds fine to me instead. If open of
> either one fails, application won't register in. It's up to you to
> choose KSM=M/N, if you want it as core functionality just build as
> KSM=Y but leave the option to others to save memory.
>   

The ioctl() interface is quite bad for what you're doing.  You're 
telling the kernel extra information about a VA range in userspace.  
That's what madvise is for.  You're tweaking simple read/write values of 
kernel infrastructure.  That's what sysfs is for.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 5:11 p.m. UTC | #14
On Tue, Mar 31, 2009 at 11:51:14AM -0500, Anthony Liguori wrote:
> You have two things here.  CONFIG_MEM_SHARABLE and CONFIG_KSM.  
> CONFIG_MEM_SHARABLE cannot be a module. If it's set to =n, then 
> madvise(MADV_SHARABLE) == -ENOSYS.

Where the part that -ENOSYS tell userland madvise syscall table is
empty, which is obviously not the case, wasn't clear?

> If CONFIG_MEM_SHARABLE=y, then madvise(MADV_SHARABLE) will keep track of 
> all sharable memory regions.  Independently of that, CONFIG_KSM can be set 
> to n,m,y.  It depends on CONFIG_MEM_SHARABLE and when it's loaded, it 
> consumes the list of sharable vmas.

And what do you gain by creating two config params when only one is
needed other than more pain for the poor user doing make oldconfig and
being asked a zillion new questions that aren't necessary?

> But honestly, CONFIG_MEM_SHARABLE shouldn't a lot of code so I don't see 
> why you'd even need to make it configable.

Even if you were to move the registration code in madvise with a
-EINVAL retval if KSM was set to N for embedded, CONFIG_KSM would be
enough: the registration code would be surrounded by CONFIG_KSM_MODULE
|| CONFIG_KSM, just like page_wrprotect/replace_page. This
CONFIG_MEM_SHARABLE in addition to CONFIG_KSM is beyond what can make
sense to me.

> The ioctl() interface is quite bad for what you're doing.  You're telling 
> the kernel extra information about a VA range in userspace.  That's what 

The ioctl can be extended to also tell which pid to share without
having to specify VA range, and having the feature inherited by the
child. Not everyone wants to deal with VA.

But my main issue with madvise is that it's core kernel functionality
while KSM clearly is not.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli March 31, 2009, 8:52 p.m. UTC | #15
Hello,

I attach below some benchmark of the new ksm tree algorithm, showing
ksm performance in best and worst case scenarios.

-----------------------------------------------------------
Here a program ksmpages.c that tries to create the worst case scenario
for the ksm tree algorithm.

-----------------------------------------------------------
/* ksmpages.c: exercise KSM (C) Red Hat Inc. GPL'd */

#include <stdlib.h>
#include <malloc.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include "ksm.h"

#define SIZE (1UL*1024*1024*1024)

#define PAGE_SIZE 4096
#define PAGES (SIZE/PAGE_SIZE)

int ksm_register_memory(char * p)
{
	int fd;
	int ksm_fd;
	int r = 1;
	struct ksm_memory_region ksm_region;
 
	fd = open("/dev/ksm", O_RDWR | O_TRUNC, (mode_t)0600);
	if (fd == -1)
		goto out;
 
	ksm_fd = ioctl(fd, KSM_CREATE_SHARED_MEMORY_AREA);
	if (ksm_fd == -1)
		goto out_free;
 
	ksm_region.npages = PAGES;
	ksm_region.addr = (unsigned long) p;
	r = ioctl(ksm_fd, KSM_REGISTER_MEMORY_REGION, &ksm_region);
	if (r)
		goto out_free1;
 
	return r;
 
out_free1:
	close(ksm_fd);
out_free:
	close(fd);
out:
	return r;
}

/*
 * Allocate 1G of page-aligned memory, register it with KSM, then make
 * every page unique in its final word — the worst case for the KSM
 * tree algorithm, since memcmp only diverges at the end of each page.
 * The process then sleeps forever so the merged state can be observed.
 */
int main(void)
{
	unsigned long i;
	char *buf = memalign(PAGE_SIZE, PAGES*PAGE_SIZE);

	if (!buf)
		perror("memalign"), exit(1);

	if (ksm_register_memory(buf))
		printf("failed to register into ksm, run inside VM\n");
	else
		printf("registered into ksm, run outside VM\n");

	/* stamp the page index into the last word of each page */
	for (i = 0; i < PAGES; i++) {
		unsigned long *last;

		last = (unsigned long *)(buf + (i + 1) * PAGE_SIZE
					 - sizeof(unsigned long));
		*last = i;
	}

	pause();

	return 0;
}
-----------------------------------------------------------

ksmpages exercises ksm tree algorithm worst case where pages are all
equal except for the last bytes, so the memcmp breaks after having
accessed the worst-case amount of memory (i.e. almost 4096 bytes for
each level of the stable or unstable tree).

Top after running the first copy of ksmpages:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16473 andrea    20   0 1027m 1.0g  328 S    0 25.9   0:01.14 ksmpages

Below is "vmstat 1" while running a second copy with kksmd running at
100% CPU load:

-----------------------------------------------------------
 1  0   3104 2806044  60256  45532    0    0     0     0  912  338  0 25 74  0
 1  0   3104 2805700  60256  45532    0    0     0     0  676  171  0 27 73  0
 1  0   3104 2805452  60264  45524    0    0     0    36  708  172  0 23 77  0
 1  0   3104 2806428  60264  45532    0    0     0     0  787  210  0 25 75  0
 1  0   3104 2806212  60264  45524    0    0     0     0  643  132  0 25 75  0
 1  0   3104 2805864  60264  45524    0    0     0     0  685  157  0 27 73  0
 1  0   3104 2805616  60264  45524    0    0     0     0  640  128  0 23 77  0
 1  0   3104 2805368  60264  45524    0    0     0     0  637  131  0 25 75  0
 1  0   3104 2804996  60280  45508    0    0     0    76  704  165  0 25 75  0
 2  0   3104 2804748  60280  45524    0    0     0     0  636  131  0 27 73  0
 1  0   3104 2804500  60280  45524    0    0     0     0  641  133  0 23 77  0

Here the second copy of ksmpages is started.

 2  0   3104 2660544  60280  45524    0    0     0     0  711  178  0 28 72  0
 1  0   3104 1754096  60280  45524    0    0     0     0  839  172  1 47 53  0

1G of ram has been allocated and initialized by ksmpages.

 1  0   3104 1753848  60280  45524    0    0     0     0  632  122  0 27 73  0
 1  0   3104 1753328  60280  45524    0    0     0     0  661  167  0 23 77  0
 1  0   3104 1753104  60280  45524    0    0     0     0  635  129  0 25 75  0
 1  0   3104 1752856  60280  45524    0    0     0     0  635  127  0 25 75  0
 1  0   3104 1752608  60280  45524    0    0     0     0  677  158  0 27 73  0
 1  0   3104 1752360  60280  45524    0    0     0     0  636  132  0 23 77  0
 1  0   3104 1752112  60280  45524    0    0     0     0  638  133  0 25 75  0
 1  0   3104 1751864  60280  45524    0    0     0     0  665  149  0 25 75  0

It takes around 8 seconds for kksmd to complete a full scan of the 1G
indexed in the unstable tree plus the refresh of the checksum of the
whole 2G registered.

 1  0   3104 1758944  60280  45524    0    0     0     0  649  122  0 27 73  0
 1  0   3104 1772316  60280  45524    0    0     0     0  660  128  0 23 77  0
 1  0   3104 1784668  60280  45524    0    0     0     0  711  159  0 25 75  0
 1  0   3104 1796252  60280  45524    0    0     0     0  669  138  0 25 75  0
 1  0   3104 1807908  60280  45524    0    0     0     0  653  124  0 27 73  0
 1  0   3104 1819044  60280  45524    0    0     0     0  677  148  0 23 77  0
 1  0   3104 1829684  60280  45524    0    0     0     0  649  131  0 25 75  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 1840324  60280  45524    0    0     0     0  653  131  0 25 75  0
 1  0   3104 1850840  60280  45524    0    0     0    96  734  158  0 27 73  0
 1  0   3104 1861132  60280  45524    0    0     0     0  645  133  0 23 77  0
 1  0   3104 1871424  60280  45524    0    0     0     0  639  129  0 25 75  0
 1  0   3104 1881716  60280  45524    0    0     0     0  676  147  0 25 75  0
 1  0   3104 1891736  60280  45524    0    0     0     0  649  122  0 27 73  0
 1  0   3104 1901656  60280  45524    0    0     0     4  656  137  0 23 77  0
 1  0   3104 1911576  60280  45524    0    0     0     0  682  162  0 25 75  0
 1  0   3104 1921496  60280  45524    0    0     0     0  642  128  0 25 75  0
 1  0   3104 1931292  60280  45524    0    0     0     0  630  126  0 27 73  0
 1  0   3104 1941064  60280  45524    0    0     0     0  676  152  0 23 77  0
 1  0   3104 1950760  60284  45520    0    0     0    24  667  136  0 25 75  0
 1  0   3104 1960160  60284  45524    0    0     0     0  649  129  0 25 75  0
 1  0   3104 1969584  60284  45524    0    0     0     0  671  145  0 27 73  0
 1  0   3104 1978736  60284  45524    0    0     0     0  643  128  0 23 77  0
 1  0   3104 1988036  60284  45524    0    0     0     0  638  127  0 25 75  0
 1  0   3104 1997212  60284  45524    0    0     0     0  674  156  0 25 75  0
 1  0   3104 2006240  60284  45524    0    0     0     0  632  124  0 27 73  0
 1  0   3104 2016204  60284  45524    0    0     0     0  636  128  0 23 77  0
 1  0   3104 2028452  60284  45524    0    0     0     0  691  156  0 25 75  0
 1  0   3104 2040728  60284  45524    0    0     0     0  657  133  0 25 75  0
 1  0   3104 2053004  60284  45524    0    0     0     0  660  128  0 27 73  0
 1  0   3104 2065428  60284  45524    0    0     0     0  686  153  0 23 77  0
 1  0   3104 2077680  60284  45524    0    0     0     0  660  127  0 25 75  0
 1  0   3104 2089264  60284  45524    0    0     0     0  656  127  0 25 75  0
 2  0   3104 2100796  60284  45524    0    0     0     0  670  148  0 27 73  0
 1  0   3104 2112476  60284  45524    0    0     0     0  652  138  0 23 77  0
 1  0   3104 2123884  60284  45524    0    0     0     0  641  129  0 25 75  0
 1  0   3104 2135516  60284  45524    0    0     0     0  674  151  0 25 75  0
 1  0   3104 2147196  60284  45524    0    0     0     0  645  126  0 27 73  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2158704  60284  45524    0    0     0     0  650  128  0 23 77  0
 1  0   3104 2170236  60284  45524    0    0     0     0  697  177  0 25 75  0
 1  0   3104 2181620  60284  45524    0    0     0     0  650  130  0 25 75  0
 1  0   3104 2192532  60284  45524    0    0     0     0  639  122  0 27 73  0
 1  0   3104 2203444  60284  45524    0    0     0     0  670  145  0 23 77  0
 1  0   3104 2214356  60284  45524    0    0     0     0  631  127  0 25 75  0
 1  0   3104 2225268  60284  45524    0    0     0     0  630  134  0 25 75  0
 1  0   3104 2235488  60284  45524    0    0     0     0  669  153  0 27 73  0
 1  0   3104 2245780  60284  45524    0    0     0     0  633  132  0 23 77  0
 1  0   3104 2255924  60284  45524    0    0     0     0  632  141  0 25 75  0
 1  0   3104 2265448  60284  45524    0    0     0     0  657  144  0 25 75  0
 1  0   3104 2274452  60284  45524    0    0     0     0  626  129  0 27 73  0
 1  0   3104 2286224  60284  45524    0    0     0     0  661  130  0 23 77  0
 1  0   3104 2297980  60284  45524    0    0     0     0  675  156  0 25 75  0
 1  0   3104 2309760  60284  45524    0    0     0     0  654  128  0 25 75  0
 1  0   3104 2321540  60284  45524    0    0     0     0  629  122  0 27 73  0
 1  0   3104 2333468  60284  45524    0    0     0     0  696  166  0 23 77  0
 1  0   3104 2344952  60284  45524    0    0     0     0  638  129  0 25 75  0
 1  0   3104 2356088  60284  45524    0    0     0     0  631  127  0 25 75  0
 1  0   3104 2367272  60284  45524    0    0     0     0  639  150  0 27 73  0
 1  0   3104 2378432  60284  45524    0    0     0     0  633  132  0 23 77  0
 1  0   3104 2389468  60284  45524    0    0     0     0  622  132  0 25 75  0
 1  0   3104 2400628  60284  45524    0    0     0     0  677  154  0 25 75  0
 1  0   3104 2411664  60284  45524    0    0     0     0  628  122  0 27 73  0
 1  0   3104 2422824  60284  45524    0    0     0     0  639  128  0 23 77  0
 1  0   3104 2433984  60284  45524    0    0     0     0  653  148  0 25 75  0
 1  0   3104 2444700  60284  45524    0    0     0     0  627  133  0 25 75  0
 1  0   3104 2455264  60284  45524    0    0     0     0  634  128  0 27 73  0
 1  0   3104 2465656  60284  45524    0    0     0     0  678  155  0 23 77  0
 1  0   3104 2476220  60284  45524    0    0     0     0  631  131  0 25 75  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2486760  60284  45524    0    0     0     0  641  139  0 25 75  0
 1  0   3104 2496756  60284  45524    0    0     0     0  651  148  0 27 73  0
 1  0   3104 2506676  60284  45524    0    0     0     0  630  130  0 23 77  0
 1  0   3104 2516448  60284  45524    0    0     0     0  631  127  0 25 75  0
 1  0   3104 2525848  60284  45524    0    0     0     0  676  154  0 25 75  0
 1  0   3104 2534752  60284  45524    0    0     0     0  625  122  0 27 73  0
 1  0   3104 2546720  60284  45524    0    0     0     0  665  145  0 23 77  0
 1  0   3104 2559864  60284  45524    0    0     0     0  700  158  0 25 75  0
 1  0   3104 2573008  60284  45524    0    0     0     0  671  127  0 25 75  0
 1  0   3104 2586028  60284  45524    0    0     0     0  681  126  0 27 73  0
 1  0   3104 2599024  60284  45524    0    0     0     0  681  145  0 23 77  0
 1  0   3104 2611772  60284  45524    0    0     0     0  662  132  0 25 75  0
 1  0   3104 2624320  60284  45524    0    0     0     0  668  129  0 25 75  0
 1  0   3104 2636844  60284  45524    0    0     0     0  698  152  0 27 73  0
 1  0   3104 2649368  60284  45524    0    0     0     0  665  128  0 23 77  0
 1  0   3104 2661892  60284  45524    0    0     0     0  660  127  0 25 75  0
 1  0   3104 2674268  60284  45524    0    0     0     0  695  161  0 25 75  0
 1  0   3104 2686816  60284  45524    0    0     0     0  652  124  0 27 73  0
 1  0   3104 2699192  60284  45524    0    0     0     0  667  128  0 23 77  0
 1  0   3104 2711220  60284  45524    0    0     0     0  696  161  0 25 75  0
 1  0   3104 2723224  60284  45524    0    0     0     0  653  132  0 25 75  0
 1  0   3104 2735128  60284  45524    0    0     0     0  650  127  0 27 73  0
 1  0   3104 2747156  60284  45524    0    0     0     0  700  154  0 23 77  0
 1  0   3104 2758640  60284  45524    0    0     0     0  662  127  0 25 75  0
 1  0   3104 2770172  60284  45524    0    0     0     0  671  127  0 25 75  0
 1  0   3104 2781432  60284  45524    0    0     0     0  685  150  0 27 73  0
 1  0   3104 2792196  60284  45524    0    0     0     0  663  135  0 23 77  0
 1  0   3104 2799308  60284  45524    0    0     0     0  662  148  0 24 76  0
 1  0   3104 2799416  60284  45524    0    0     0     0  700  213  0 21 78  0

It takes kksmd 96 seconds to merge 1G of ram in the absolute worst
case which has been created artificially. In the absolute worst case
scenario memory is freed roughly at a rate of 10M/sec.

 1  0   3104 2799416  60284  45524    0    0     0     0  672  193  0 24 76  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2799416  60284  45524    0    0     0     0  678  194  0 20 80  0
 1  0   3104 2799416  60284  45524    0    0     0     0  694  219  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  673  193  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  673  188  0 23 77  0
 1  0   3104 2799416  60284  45524    0    0     0     0  701  217  0 20 80  0
 1  0   3104 2799416  60284  45524    0    0     0     0  677  194  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  694  198  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  683  212  0 23 77  0
 1  0   3104 2799416  60284  45524    0    0     0     0  675  192  0 20 80  0
 1  0   3104 2799416  60284  45524    0    0     0     0  684  197  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  702  213  0 22 79  0
 1  0   3104 2799416  60284  45524    0    0     0     0  671  192  0 23 77  0
 1  0   3104 2799416  60284  45524    0    0     0     0  681  194  0 20 80  0
 1  0   3104 2799416  60284  45524    0    0     0     0  695  219  0 21 79  0
 1  0   3104 2799416  60284  45524    0    0     0     0  682  193  0 22 78  0
 1  0   3104 2799416  60284  45524    0    0     0     0  676  189  0 23 77  0
 1  0   3104 2799416  60284  45524    0    0     0     0  710  223  0 20 80  0
 1  0   3104 2799416  60284  45524    0    0     0     0  681  197  0 22 79  0

Result in top is:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16473 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.14 ksmpages
16625 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.01 ksmpages

SHR shows 1G full shared.

Start a new ksmpages:

 1  0   3104 2799292  60424  45544    0    0     0     0  685  185  0 24 76  0
 1  0   3104 2799292  60424  45544    0    0     0     0  699  230  0 21 79  0
 1  0   3104 2799292  60424  45544    0    0     0     0  679  189  0 22 78  0
 1  0   3104 2799292  60424  45544    0    0     0     0  678  196  0 22 78  0
 1  0   3104 2799292  60424  45544    0    0     0     0  704  215  0 19 81  0
 1  0   3104 2797664  60424  45544    0    0     0     0  795  330  1 22 77  0
 1  0   3104 2797516  60424  45548    0    0     0     0  722  276  0 21 79  0
 1  0   3104 2797516  60424  45548    0    0     0     0  706  242  0 23 77  0

Third copy of ksmpages started.

 2  0   3104 2518704  60424  45548    0    0     0     0 4113  228  0 27 73  0
 1  0   3104 1787900  60424  45548    0    0     0     0 13534  195  1 43 56  0

Third copy of ksmpages initialized its 1G of ram.

 1  0   3104 1823500  60424  45548    0    0     0     0  657  151  0 27 73  0
 1  0   3104 1858616  60428  45544    0    0     0    36  801  201  0 25 75  0
 1  0   3104 1893004  60428  45544    0    0     0     0  629  119  0 26 74  0
 1  0   3104 1926212  60428  45544    0    0     0     0  662  166  0 24 76  0
 1  0   3104 1958452  60428  45544    0    0     0     0  626  130  0 24 76  0
 1  0   3104 1988388  60428  45544    0    0     0     0  625  137  0 29 71  0
 1  0   3104 2017080  60428  45544    0    0     0     0  643  156  0 23 77  0
 1  0   3104 2047584  60428  45544    0    0     0     0  622  132  0 27 73  0
 1  0   3104 2077568  60428  45544    0    0     0     0  629  127  0 24 76  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2106584  60428  45544    0    0     0     0  677  171  0 23 77  0
 2  0   3104 2135476  60428  45544    0    0     0     0  623  119  0 28 72  0
 1  0   3104 2164392  60428  45544    0    0     0     0  619  137  0 26 74  0
 1  0   3104 2191896  60428  45544    0    0     0     0  637  147  0 23 77  0
 1  0   3104 2219152  60428  45544    0    0     0     0  620  132  0 27 73  0
 1  0   3104 2244920  60428  45544    0    0     0     0  621  126  0 22 78  0
 1  0   3104 2271056  60428  45544    0    0     0     0  666  164  0 26 74  0
 1  0   3104 2303172  60428  45544    0    0     0     0  626  122  0 27 73  0
 1  0   3104 2334892  60428  45544    0    0     0     0  621  132  0 26 74  0
 1  0   3104 2365272  60428  45544    0    0     0     0  642  148  0 23 77  0
 1  0   3104 2395652  60428  45544    0    0     0     0  632  140  0 23 77  0
 1  0   3104 2426008  60428  45544    0    0     0     0  620  122  0 28 72  0
 1  0   3104 2454924  60428  45544    0    0     0     0  665  165  0 23 77  0
 1  0   3104 2483172  60428  45544    0    0     0     0  618  119  0 27 73  0
 1  0   3104 2509536  60428  45544    0    0     0     0  618  132  0 26 74  0
 1  0   3104 2537384  60428  45544    0    0     0     0  660  164  0 22 78  0
 1  0   3104 2567764  60428  45544    0    0     0     0  622  132  0 30 70  0
 1  0   3104 2597524  60428  45544    0    0     0     0  620  119  0 24 76  0
 1  0   3104 2626292  60428  45544    0    0     0     0  639  158  0 24 76  0
 1  0   3104 2654936  60428  45544    0    0     0     0  624  131  0 26 74  0
 1  0   3104 2683704  60436  45536    0    0     0    28  640  151  0 23 77  0
 1  0   3104 2710960  60436  45544    0    0     0     0  673  152  0 26 74  0
 1  0   3104 2737844  60436  45544    0    0     0     0  618  132  0 28 72  0
 1  0   3104 2763364  60436  45544    0    0     0     0  619  123  0 24 76  0
 2  0   3104 2778328  60436  45544    0    0     0     0  657  183  0 23 77  0

This time it took kksmd only 34 seconds to merge the pages and it
started freeing pages immediately. This is because the ksmpages are in
the stable tree now, and they get merged immediately without checksum
overhead, only the worst-case memcmp for each level of the tree runs.

NOTE: the checksum is not used in any way to find equal pages, but
only to avoid filling the unstable tree with frequently changing
pages. In the future the dirty bit in the spte will tell us which
pages are changing frequently and which not in a more efficient way
than the checksum (only problem EPT sptes have no dirty bit). Removing
the checksum would only make the unstable tree more unstable, but it
would have no other downside (unstable tree is unstable anyway, but
it's less unstable than it would be, thanks to the checksum).

 1  0   3104 2778328  60448  45532    0    0     0    56  703  211  0 20 80  0
 1  0   3104 2778328  60448  45544    0    0     0     0  671  199  0 25 75  0
 1  0   3104 2778328  60448  45544    0    0     0     0  702  209  0 20 80  0
 0  0   3104 2778328  60448  45544    0    0     0     0  677  196  0 24 76  0
 1  0   3104 2778328  60448  45544    0    0     0     0  672  189  0 21 79  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2778328  60452  45540    0    0     0     4  694  237  0 20 80  0
 1  0   3104 2778328  60452  45544    0    0     0     0  675  189  0 24 76  0
 1  0   3104 2778328  60452  45544    0    0     0     0  676  199  0 23 77  0
 1  0   3104 2778328  60452  45544    0    0     0     0  700  207  0 19 81  0

Top:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16473 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.14 ksmpages
16625 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.01 ksmpages
16887 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.02 ksmpages

SHR shows 1g shared for all three tasks.

Start a new copy of ksmpages:

 1  0   3104 2778576  60472  45544    0    0     0     0  702  231  0 22 78  0
 1  0   3104 2778576  60472  45544    0    0     0     0  682  189  0 23 77  0
 1  0   3104 2778576  60472  45544    0    0     0     0  682  213  0 20 80  0
 0  0   3104 2778576  60472  45544    0    0     0     0  699  217  0 21 79  0
 0  0   3104 2778576  60472  45544    0    0     0     0  683  205  0 22 78  0
 1  0   3104 2776800  60472  45544    0    0     0     0  795  327  1 22 77  0
 1  0   3104 2776800  60472  45552    0    0     0     0  724  282  0 21 79  0
 1  0   3104 2776800  60472  45552    0    0     0     0  683  197  0 22 78  0

ksmpages fourth copy is started here:

 2  0   3104 2305216  60472  45552    0    0     0     0 6015  262  0 34 66  0
 1  0   3104 1772652  60472  45548    0    0     0     0 9988  171  1 42 57  0

ksmpages initialized its ram.

 1  0   3104 1807880  60480  45540    0    0     0    52  657  142  0 23 77  0
 1  0   3104 1841832  60480  45548    0    0     0     0  624  131  0 25 75  0
 1  0   3104 1875660  60480  45548    0    0     0     0  665  162  0 25 75  0
 1  0   3104 1908344  60488  45540    0    0     0    44  636  138  0 27 73  0
 1  0   3104 1940212  60488  45548    0    0     0     0  626  129  0 23 77  0
 1  0   3104 1969732  60488  45548    0    0     0     0  648  150  0 25 75  0
 1  0   3104 1998152  60488  45548    0    0     0     0  622  129  0 25 75  0
 1  0   3104 2028380  60488  45548    0    0     0     0  616  124  0 25 75  0
 2  0   3104 2058044  60488  45548    0    0     0     0  661  162  0 25 75  0
 1  0   3104 2086764  60488  45548    0    0     0     0  617  130  0 25 75  0
 1  0   3104 2115284  60488  45548    0    0     0     0  621  128  0 25 75  0
 1  0   3104 2143928  60488  45548    0    0     0     0  642  148  0 27 73  0
 1  0   3104 2171212  60488  45548    0    0     0     0  623  131  0 23 77  0
 1  0   3104 2198344  60488  45548    0    0     0     0  616  132  0 25 75  0
 1  0   3104 2224016  60488  45548    0    0     0     0  657  157  0 25 75  0
 1  0   3104 2249408  60488  45548    0    0     0     0  614  122  0 27 73  0
 1  0   3104 2281236  60488  45548    0    0     0     0  626  129  0 23 77  0
 1  0   3104 2312808  60488  45548    0    0     0     0  669  159  0 25 75  0
 1  0   3104 2342820  60488  45548    0    0     0     0  622  134  0 25 75  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3104 2372976  60488  45548    0    0     0     0  617  122  0 27 73  0
 1  0   3104 2402960  60488  45548    0    0     0     0  652  149  0 23 77  0
 1  0   3104 2431732  60488  45548    0    0     0     0  620  131  0 25 75  0
 1  0   3104 2459808  60488  45548    0    0     0     0  622  135  0 25 75  0
 1  0   3104 2486448  60488  45548    0    0     0     0  659  160  0 27 73  0
 1  0   3104 2513348  60488  45548    0    0     0     0  619  129  0 23 77  0
 1  0   3104 2543328  60488  45548    0    0     0     0  620  129  0 25 75  0
 1  0   3104 2572916  60488  45548    0    0     0     0  652  154  0 25 75  0
 1  0   3104 2601364  60488  45548    0    0     0     0  618  128  0 25 75  0
 1  0   3104 2629884  60488  45548    0    0     0     0  626  140  0 25 75  0
 1  0   3104 2658280  60488  45548    0    0     0     0  653  159  0 25 75  0
 1  0   3104 2685688  60488  45548    0    0     0     0  619  132  0 25 75  0
 1  0   3104 2712720  60488  45548    0    0     0     0  619  126  0 27 73  0
 1  0   3104 2738392  60488  45548    0    0     0     0  643  153  0 23 77  0
 1  0   3104 2758752  60488  45548    0    0     0    24  649  145  0 24 76  0

Again 34 seconds, rate is roughly 30M/sec and there are 262144 pages
queued in the stable tree, with memcmp running for 4088 bytes per
page indexed.

 1  0   3104 2758852  60488  45548    0    0     0     0  683  193  0 22 79  0
 1  0   3104 2758852  60488  45548    0    0     0     0  691  212  0 23 76  0
 1  0   3104 2758852  60488  45548    0    0     0     0  673  195  0 20 80  0
 1  0   3104 2758852  60488  45548    0    0     0     0  672  195  0 21 79  0
 1  0   3104 2758852  60492  45544    0    0     0    20  714  223  0 21 79  0
 1  0   3104 2758852  60492  45548    0    0     0     0  680  190  0 23 77  0
 1  0   3104 2758852  60492  45548    0    0     0     0  674  194  0 20 80  0
 1  0   3104 2758852  60492  45548    0    0     0     0  689  222  0 22 78  0

Top:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16473 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.14 ksmpages
16625 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.01 ksmpages
16887 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.02 ksmpages
16928 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.03 ksmpages

So on a 4G system, with 4G allocated, we still have 2.7G free.

             total       used       free     shared    buffers     cached
Mem:       4043228    1284304    2758924          0      60500      45548
-/+ buffers/cache:    1178256    2864972
Swap:      5863684       3104    5860580

Now it's time to serially start 8 windows VM taking 1G of ram each,
after a couple of minutes 'vmstat 1' is below:

 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 1  0 102976 1010296   8416 508884    1    3     3    13   10   29  1 23 75  0
 1  0 102976 1035592   8416 508940    0    0     0     0 1939 2661  0 25 74  0
 1  0 102976 1064320   8416 508940    0    0     0     0 1893 2655  0 25 74  0
 1  0 102976 1091948   8416 508940    0    0     0     0 1904 2679  0 25 74  0
 1  0 102976 1120128   8416 508940    0    0     0     0 1878 2653  0 25 74  0
 3  0 102976 1148524   8416 508940    0    0     0     0 1879 2664  0 25 74  0
 1  0 102976 1176820   8420 508940    0    0     0     4 1889 2657  0 25 74  0
 1  0 102976 1204944   8420 508940    0    0     0     0 1872 2674  0 25 74  0
 1  0 102976 1230608   8420 508940    0    0     0     0 1918 2656  0 25 74  0
 1  0 102972 1258372   8428 508936    0    0    40    88 1991 2832  0 25 73  1
 1  0 102972 1263496   8428 508984    0    0     0     0 1673 2770  1 24 75  0
 1  0 102956 1289952   8428 508984    0    0     0     0 1735 2743  0 25 75  0
 1  0 102956 1293576   8428 508992    0    0     0     0 1678 2719  1 24 75  0
 1  0 102944 1290848   8452 509376    0    0     0     0 1610 2692  1 25 74  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 1  0 102944 1293976   8452 509376    0    0     0     0 1648 2720  0 24 75  0
 1  0 102944 1291116   8452 509376    0    0     0     0 1632 2701  1 25 74  0

All VM had most of their memory fully shared.

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
16979 andrea    20   0 1147m 975m 929m S    3 24.7   3:17.55 qemu-system-x86
16473 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.14 ksmpages
16625 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.01 ksmpages
16887 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.02 ksmpages
16928 andrea    20   0 1027m 1.0g 1.0g S    0 25.9   0:01.03 ksmpages
16990 andrea    20   0 1147m 1.0g 967m S    0 25.7   3:16.66 qemu-system-x86
17095 andrea    20   0 1148m 1.0g 976m S    0 26.2   3:21.58 qemu-system-x86
17136 andrea    20   0 1148m 1.0g 977m S    0 26.3   2:43.85 qemu-system-x86
17367 andrea    20   0 1145m 1.0g 981m S    0 26.3   2:29.99 qemu-system-x86
17372 andrea    20   0 1148m 1.0g 980m S    0 26.4   2:27.67 qemu-system-x86
17527 andrea    20   0 1145m 1.0g 979m S    0 26.3   2:25.75 qemu-system-x86
17621 andrea    20   0 1148m 1.0g 979m S    0 26.4   2:26.34 qemu-system-x86

So total 12G are allocated with only 4G of RAM. Around 1G is still
free and very little swap is used.

Now that we're satisfied about the worst case being fully usable
thanks to the O(log(N)) complexity of the ksm tree algorithm (modulo
the checksum load that is O(N) where N is the number of the not shared
pages), I modify the ksmpages like this to exercise the ksm best case
scenario.

-               *(unsigned long *)ppage = page;
+               *(unsigned long *)ppage = 1;

 0  0   5372 3684996  31912 266328    0    0     0     0  612  616  0  0 100  0
 0  0   5372 3684996  31912 266328    0    0     0     0  611  598  0  0 100  0
 0  0   5372 3684996  31912 266328    0    0     0     0  613  615  0  0 100  0
 0  0   5372 3685120  31912 266328    0    0     0     0  610  600  0  0 100  0
 0  0   5372 3685120  31912 266328    0    0     0     0  613  617  0  0 100  0
 0  0   5372 3685120  31912 266328    0    0     0     0  611  599  0  0 100  0

ksmpages best case started.

 2  0   5372 2901972  31912 266328    0    0     0     0  838  280  1 43 56  0
 1  0   5372 2617840  31912 266328    0    0     0     0  692  146  0 32 68  0

ksmpages finishes to initialize its ram.

 1  0   5372 2848932  31920 266320    0    0     0    20  636  154  0 24 76  0
 1  0   5372 3104268  31920 266328    0    0     0     0  619  122  0 26 74  0
 1  0   5372 3363668  31920 266328    0    0     0     0  623  144  0 24 76  0
 1  0   5372 3629688  31920 266328    0    0     0     0  618  124  0 26 74  0

kksmd takes only 4 seconds to merge and free 1G of ram because the
moment the single equal page goes in the unstable tree, the memcmp
succeeds immediately and after that all pages are merged into the
single page in the stable tree.

 1  0   5372 3665520  31920 266328    0    0     0     0  647  198  0 21 79  0
 0  0   5372 3665520  31920 266328    0    0     0     0  658  202  0 23 77  0
 1  0   5372 3665520  31920 266328    0    0     0     0  650  206  0 20 80  0
 1  0   5372 3665520  31920 266328    0    0     0     0  647  194  0 22 78  0

A new copy of ksmpages started:

 2  0   5372 3395336  31920 266328    0    0     0     0 30096  234  0 29 71  0
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 1  0   5372 2867512  31920 266328    0    0     0     0 69466  167  1 46 54  0

kksmd now starts to free pages before ksmpages finishes initializing its memory.

 1  0   5372 3104144  31920 266328    0    0     0     0  620  147  0 24 76  0
 1  0   5372 3337104  31920 266328    0    0     0     0  616  117  0 26 74  0
 1  0   5372 3565768  31920 266328    0    0     0     0  619  142  0 24 76  0
 1  0   5372 3645796  31920 266328    0    0     0     0  633  171  0 24 76  0

In 4 seconds all ram is merged again. RAM is freed roughly at 256M/sec
in the best case with stable tree composed of only one page and
unstable tree empty and no checksum computed because of the constant
'stable-tree' match.

 1  0   5372 3645796  31920 266328    0    0     0     0  645  207  0 20 80  0
 1  0   5372 3645796  31920 266328    0    0     0     0  646  201  0 23 77  0
 1  0   5372 3645796  31920 266328    0    0     0     0  659  202  0 20 80  0
 1  0   5372 3645796  31920 266328    0    0     0     0  643  198  0 22 78  0

The rbtree balancing being guaranteed by rb_color despite the unstable
tree pages changing without the tree being updated accordingly,
guarantees that as more pages are added in stable and unstable tree,
the memcmp overhead will increase only logarithmically. The checksum
overhead instead increases linearly with only the amount of pages
present in the unstable tree.

To verify that there is no COW and that pages are mapped readonly in
the pte, we modify ksmpages.c to loop and read all the pages after
the initialization.

/* ksmpages.c: exercise KSM (C) Red Hat Inc. GPL'd */

#include <fcntl.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include "ksm.h"

#define SIZE (1UL*1024*1024*1024)

#define PAGE_SIZE 4096
#define PAGES (SIZE/PAGE_SIZE)

unsigned long global;

int ksm_register_memory(char * p)
{
	int fd;
	int ksm_fd;
	int r = 1;
	struct ksm_memory_region ksm_region;
 
	fd = open("/dev/ksm", O_RDWR | O_TRUNC, (mode_t)0600);
	if (fd == -1)
		goto out;
 
	ksm_fd = ioctl(fd, KSM_CREATE_SHARED_MEMORY_AREA);
	if (ksm_fd == -1)
		goto out_free;
 
	ksm_region.npages = PAGES;
	ksm_region.addr = (unsigned long) p;
	r = ioctl(ksm_fd, KSM_REGISTER_MEMORY_REGION, &ksm_region);
	if (r)
		goto out_free1;
 
	return r;
 
out_free1:
	close(ksm_fd);
out_free:
	close(fd);
out:
	return r;
}

/*
 * Allocate PAGES page-aligned pages, register them with ksm, make every
 * page unique (ksm worst case), then repeatedly time a full read pass
 * over the memory to show the merged (readonly, shared) pages are read
 * at full speed with no COW faults.
 */
int main(void)
{
	unsigned long page;
	char *p = memalign(PAGE_SIZE, PAGES*PAGE_SIZE);
	if (!p)
		perror("memalign"), exit(1);

	if (ksm_register_memory(p))
		printf("failed to register into ksm, run inside VM\n");
	else
		printf("registered into ksm, run outside VM\n");

	/*
	 * Write a distinct value into the last word of each page so all
	 * pages differ (exercises the ksm unstable/stable tree worst case).
	 */
	for (page = 0; page < PAGES; page++) {
		char *ppage;
		ppage = p + page * PAGE_SIZE +
			PAGE_SIZE - sizeof(unsigned long);
		*(unsigned long *)ppage = page;
	}
	for (;;) {
		long before, after;
		struct timeval tv;
		sleep(1);
		gettimeofday(&tv, NULL);
		/* NOTE(review): tv_sec * 1000000 can overflow a 32-bit
		 * long; fine on LP64 where this benchmark was run. */
		before = tv.tv_sec * 1000000 + tv.tv_usec;
		for (page = 0; page < PAGES; page++) {
			char *ppage;
			ppage = p + page * PAGE_SIZE +
				PAGE_SIZE - sizeof(unsigned long);
			global = *(unsigned long *)ppage;
		}
		gettimeofday(&tv, NULL);
		after = tv.tv_sec * 1000000 + tv.tv_usec;
		/* %ld: after - before is long; %d was undefined behavior
		 * with a long argument on LP64. */
		printf("%ld usec\n", after - before);
	}

	/* Not reached: the timing loop above never exits. */
	pause();

	return 0;
}

7529 usec
7250 usec
7282 usec
7285 usec
7521 usec
7635 usec
7649 usec
7575 usec
7589 usec
7574 usec
7510 usec
7551 usec
7476 usec
7168 usec

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
19123 andrea    20   0 1027m 1.0g 1.0g S    1 25.9   0:02.81 ksmpages
19124 andrea    20   0 1027m 1.0g 1.0g S    1 25.9   0:02.72 ksmpages

The usec taken to read the memory don't change after the merging of
the pages. In fact thanks to sharing the same physical memory,
physically indexed CPU caches could improve application performance.

Here the oprofile including only the start of 2 ksmpages tasks until
they both share the same 1G of ram. Because ksmpages is explicitly
written to exacerbate the absolute worst case of ksm, most of the time
as expected is spent in memcmp_pages that is run to search the stable
and unstable trees.

CPU: Core 2, speed 2003 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  %        image name               app name                 symbol name
478896   75.8928  ksm.ko                   ksm                      memcmp_pages
38802     6.1491  ksmpages                 ksmpages                 main
28959     4.5893  ksm.ko                   ksm                      kthread_ksm_scan_thread
10643     1.6866  vmlinux-2.6.29           vmlinux-2.6.29           ext2_free_branches
9442      1.4963  vmlinux-2.6.29           vmlinux-2.6.29           nv_adma_qc_prep
8023      1.2714  vmlinux-2.6.29           vmlinux-2.6.29           bit_cursor
6603      1.0464  ksm.ko                   ksm                      get_rmap_item
4887      0.7745  vmlinux-2.6.29           vmlinux-2.6.29           ext2_new_inode
3527      0.5589  vmlinux-2.6.29           vmlinux-2.6.29           ahci_init_one
3012      0.4773  vmlinux-2.6.29           vmlinux-2.6.29           cfb_imageblit
2954      0.4681  vmlinux-2.6.29           vmlinux-2.6.29           register_framebuffer
2092      0.3315  ksm.ko                   ksm                      .text
1505      0.2385  libc-2.8.so              libc-2.8.so              (no symbols)
1425      0.2258  oprofiled                oprofiled                (no symbols)
1208      0.1914  vmlinux-2.6.29           vmlinux-2.6.29           try_to_extend_transaction
1081      0.1713  opreport                 opreport                 (no symbols)
1041      0.1650  libstdc++.so.6.0.8       libstdc++.so.6.0.8       (no symbols)
869       0.1377  vmlinux-2.6.29           vmlinux-2.6.29           pcie_aspm_init_link_state
855       0.1355  vmlinux-2.6.29           vmlinux-2.6.29           bit_clear_margins
817       0.1295  ksm.ko                   ksm                      is_present_pte
774       0.1227  vmlinux-2.6.29           vmlinux-2.6.29           put_disk
771       0.1222  vmlinux-2.6.29           vmlinux-2.6.29           journal_forget
741       0.1174  vmlinux-2.6.29           vmlinux-2.6.29           ext3_mark_iloc_dirty
635       0.1006  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ds_exec_end_op
624       0.0989  vmlinux-2.6.29           vmlinux-2.6.29           fb_read
595       0.0943  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ds_restart_control_method
595       0.0943  vmlinux-2.6.29           vmlinux-2.6.29           get_domain_for_dev
537       0.0851  vmlinux-2.6.29           vmlinux-2.6.29           cfb_copyarea
510       0.0808  libcrypto.so.0.9.8       libcrypto.so.0.9.8       (no symbols)
503       0.0797  vmlinux-2.6.29           vmlinux-2.6.29           configfs_mkdir
472       0.0748  ksm.ko                   ksm                      is_zapped_item
470       0.0745  vmlinux-2.6.29           vmlinux-2.6.29           ext2_truncate
419       0.0664  vmlinux-2.6.29           vmlinux-2.6.29           aer_print_error
411       0.0651  vmlinux-2.6.29           vmlinux-2.6.29           tcp_v6_rcv
406       0.0643  libbfd-2.18.so           libbfd-2.18.so           (no symbols)
362       0.0574  vmlinux-2.6.29           vmlinux-2.6.29           vesafb_setcolreg
318       0.0504  vmlinux-2.6.29           vmlinux-2.6.29           ext3_group_add
312       0.0494  ld-2.8.so                ld-2.8.so                (no symbols)
291       0.0461  vmlinux-2.6.29           vmlinux-2.6.29           domain_update_iommu_coherency
284       0.0450  bash                     bash                     (no symbols)
277       0.0439  vmlinux-2.6.29           vmlinux-2.6.29           compat_blkdev_ioctl
271       0.0429  vmlinux-2.6.29           vmlinux-2.6.29           ext2_block_to_path
262       0.0415  vmlinux-2.6.29           vmlinux-2.6.29           queue_requests_store
251       0.0398  vmlinux-2.6.29           vmlinux-2.6.29           nv_adma_tf_read
242       0.0384  ksm.ko                   ksm                      scan_get_next_index
236       0.0374  ksm.ko                   ksm                      try_to_merge_one_page
221       0.0350  vmlinux-2.6.29           vmlinux-2.6.29           pcie_aspm_exit_link_state
217       0.0344  vmlinux-2.6.29           vmlinux-2.6.29           sg_scsi_ioctl


Here the profiling of the same workload but with the change that
exercises the ksm absolute best case.

-               *(unsigned long *)ppage = page;
+               *(unsigned long *)ppage = 1;

CPU: Core 2, speed 2003 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  %        image name               app name                 symbol name
28855    25.9326  ksm.ko                   ksm                      memcmp_pages
14677    13.1906  ksm.ko                   ksm                      kthread_ksm_scan_thread
9610      8.6367  vmlinux-2.6.29           vmlinux-2.6.29           ext2_free_branches
8127      7.3039  vmlinux-2.6.29           vmlinux-2.6.29           nv_adma_qc_prep
6742      6.0592  vmlinux-2.6.29           vmlinux-2.6.29           bit_cursor
6578      5.9118  ksm.ko                   ksm                      get_rmap_item
5124      4.6051  vmlinux-2.6.29           vmlinux-2.6.29           ext2_new_inode
4216      3.7890  vmlinux-2.6.29           vmlinux-2.6.29           ahci_init_one
3500      3.1455  vmlinux-2.6.29           vmlinux-2.6.29           cfb_imageblit
3288      2.9550  ksmpages                 ksmpages                 main
3137      2.8193  vmlinux-2.6.29           vmlinux-2.6.29           register_framebuffer
1611      1.4478  ksm.ko                   ksm                      .text
1055      0.9482  vmlinux-2.6.29           vmlinux-2.6.29           bit_clear_margins
903       0.8115  vmlinux-2.6.29           vmlinux-2.6.29           journal_forget
894       0.8035  vmlinux-2.6.29           vmlinux-2.6.29           put_disk
767       0.6893  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ds_restart_control_method
640       0.5752  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ds_exec_end_op
626       0.5626  libc-2.8.so              libc-2.8.so              (no symbols)
608       0.5464  vmlinux-2.6.29           vmlinux-2.6.29           fb_read
518       0.4655  vmlinux-2.6.29           vmlinux-2.6.29           vesafb_setcolreg
482       0.4332  vmlinux-2.6.29           vmlinux-2.6.29           cfb_copyarea
478       0.4296  vmlinux-2.6.29           vmlinux-2.6.29           ext2_truncate
451       0.4053  ksm.ko                   ksm                      scan_get_next_index
404       0.3631  ksm.ko                   ksm                      is_present_pte
330       0.2966  vmlinux-2.6.29           vmlinux-2.6.29           ext2_block_to_path
320       0.2876  oprofiled                oprofiled                (no symbols)
270       0.2427  vmlinux-2.6.29           vmlinux-2.6.29           try_to_extend_transaction
253       0.2274  bash                     bash                     (no symbols)
242       0.2175  vmlinux-2.6.29           vmlinux-2.6.29           get_domain_for_dev
240       0.2157  vmlinux-2.6.29           vmlinux-2.6.29           domain_update_iommu_coherency
185       0.1663  vmlinux-2.6.29           vmlinux-2.6.29           pcie_aspm_init_link_state
156       0.1402  vmlinux-2.6.29           vmlinux-2.6.29           acpi_table_print_madt_entry
154       0.1384  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ds_get_field_names
149       0.1339  vmlinux-2.6.29           vmlinux-2.6.29           configfs_mkdir
148       0.1330  vmlinux-2.6.29           vmlinux-2.6.29           ext3_mark_iloc_dirty
145       0.1303  ld-2.8.so                ld-2.8.so                (no symbols)
142       0.1276  vmlinux-2.6.29           vmlinux-2.6.29           ext3_group_add
138       0.1240  opreport                 opreport                 (no symbols)
130       0.1168  vmlinux-2.6.29           vmlinux-2.6.29           device_to_iommu
130       0.1168  vmlinux-2.6.29           vmlinux-2.6.29           fb_compat_ioctl
127       0.1141  vmlinux-2.6.29           vmlinux-2.6.29           acpi_ev_pci_config_region_setup
121       0.1087  ksm.ko                   ksm                      try_to_merge_one_page
113       0.1016  vmlinux-2.6.29           vmlinux-2.6.29           queue_requests_store
107       0.0962  vmlinux-2.6.29           vmlinux-2.6.29           fbcon_prepare_logo
101       0.0908  libbfd-2.18.so           libbfd-2.18.so           (no symbols)

In the shell where I was running 'vmstat 1' to know when to opcontrol
--stop to interrupt the profiling after all memory was already shared,
it is also visible the ksmpages 'read loop' improves substantially
thanks to the cache effects, when all the pages become the same.

procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 2  0   3188 1783940  39204 281408    0    2     3    12   21    5  1 23 76  0
9743 usec
 1  0   3188 1536304  39208 281404    0    0     0     4 31262  291  1 31 68  0
9186 usec
8201 usec
 2  0   3188 1518376  39208 281408    0    0     0     0 25964  304  1 25 74  0
9505 usec
7713 usec
 1  0   3188 1654276  39208 281408    0    0     0     0 26275  307  1 25 74  0
7755 usec
6346 usec
 1  0   3188 1875652  39208 281408    0    0     0     0 25539  271  0 25 75  0
7600 usec
5188 usec
 1  0   3188 2101088  39208 281408    0    0     0     0 25687  280  0 25 74  0
7639 usec
4044 usec
 1  0   3188 2335168  39208 281504    0    0     0     0 25932  277  0 25 75  0
7673 usec
2574 usec
 1  0   3188 2572344  39208 281504    0    0     0     0 25853  294  0 25 74  0
7772 usec
1618 usec
 1  0   3188 2814220  39208 281504    0    0     0     0 26334  284  0 25 75  0
6047 usec
1617 usec
 1  0   3188 3059280  39208 281504    0    0     0     0 26092  284  0 25 75  0
4504 usec
1615 usec
 1  0   3188 3310648  39208 281504    0    0     0     0 26036  279  0 25 75  0
3108 usec
1626 usec
 1  0   3188 3567180  39208 281504    0    0     0   340 26025  283  0 25 75  0
1619 usec
1608 usec
 1  0   3188 3621548  39208 281504    0    0     0     0 24191  334  0 22 78  0
1606 usec
1611 usec
 1  0   3188 3621624  39208 281504    0    0     0     0 23763  356  0 21 79  0
1604 usec
1608 usec
 1  0   3188 3621624  39208 281504    0    0     0     0 23757  335  0 22 79  0
1604 usec
1612 usec
 0  0   3188 3621624  39208 281504    0    0     0     0 23750  350  0 22 79  0
1614 usec
1607 usec
 1  0   3188 3621624  39216 281496    0    0     0   456 23874  360  0 21 78  0
1609 usec
1608 usec
 1  0   3188 3621624  39216 281548    0    0     0     0 23693  352  0 21 79  0
1604 usec
1608 usec
 1  0   3188 3621624  39216 281548    0    0     0     0 23746  359  0 21 79  0
1609 usec
1631 usec
 1  0   3188 3621624  39216 281548    0    0     0     0 23814  432  0 22 78  0
1605 usec
1608 usec
 1  0   3188 3621624  39216 281548    0    0     0     0 23799  410  0 21 79  0
1613 usec

The read loop runs 4 times faster for both copies of ksmpages in the
background after all memory is merged and the virtual address points
to the same physical page that is already cached in the CPU (because
of physically indexed caches).

The whole benchmark has been run with pages_to_scan set to 99999 and
sleep_time 10 that make kksmd run at 100% CPU load, in real life
scenarios kksmd may do less scanning and memory freeing may happen at
a slower pace.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
KAMEZAWA Hiroyuki March 31, 2009, 11:57 p.m. UTC | #16
On Tue, 31 Mar 2009 15:21:53 +0300
Izik Eidus <ieidus@redhat.com> wrote:
> >   
> kpage is actually what going to be KsmPage -> the shared page...
> 
> Right now this pages are not swappable..., after ksm will be merged we 
> will make this pages swappable as well...
> 
sure.

> > If so, please
> >  - show the amount of kpage
> >  
> >  - allow users to set limit for usage of kpages. or preserve kpages at boot or
> >    by user's command.
> >   
> 
> kpage actually save memory..., and limiting the number of them, would 
> make you limit the number of shared pages...
> 

Ah, I'm working for memory control cgroup. And *KSM* will be out of control.
It's ok to make the default limit value as INFINITY. but please add knobs.

Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Izik Eidus April 1, 2009, 5:28 p.m. UTC | #17
KAMEZAWA Hiroyuki wrote:
> On Tue, 31 Mar 2009 15:21:53 +0300
> Izik Eidus <ieidus@redhat.com> wrote:
>   
>>>   
>>>       
>> kpage is actually what going to be KsmPage -> the shared page...
>>
>> Right now this pages are not swappable..., after ksm will be merged we 
>> will make this pages swappable as well...
>>
>>     
> sure.
>
>   
>>> If so, please
>>>  - show the amount of kpage
>>>  
>>>  - allow users to set limit for usage of kpages. or preserve kpages at boot or
>>>    by user's command.
>>>   
>>>       
>> kpage actually save memory..., and limiting the number of them, would 
>> make you limit the number of shared pages...
>>
>>     
>
> Ah, I'm working for memory control cgroup. And *KSM* will be out of control.
> It's ok to make the default limit value as INFINITY. but please add knobs.
>   
Sure, when i will post V2 i will take care for this issue (i will do it 
after i get little bit more review for ksm.c.... :-))

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Izik Eidus April 1, 2009, 10:54 p.m. UTC | #18
Anthony Liguori wrote:
> Andrea Arcangeli wrote:
>> On Tue, Mar 31, 2009 at 10:54:57AM -0500, Anthony Liguori wrote:
>>  
>>> You can still disable ksm and simply return ENOSYS for the MADV_ 
>>> flag.  You     
>>
>>
Anthony, the biggest problem about madvice() is that it is a real system 
call api, i wouldnt want in that stage of ksm commit into api changes of 
linux...

The ioctl itself is restricting, madvice is much more...,

Can we draft this issue to after ksm is merged, and after all the big 
new fetures that we want to add to ksm will be merge....
(then the api would be much more stable, and we will be able to ask ppl 
in the list about changing of api, but for new driver that it yet to be 
merged, it is kind of overkill to add api to linux)

What do you think?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori April 2, 2009, 12:31 a.m. UTC | #19
Izik Eidus wrote:
> Anthony, the biggest problem about madvice() is that it is a real 
> system call api, i wouldnt want in that stage of ksm commit into api 
> changes of linux...
>
> The ioctl itself is restricting, madvice is much more...,
>
> Can we draft this issue to after ksm is merged, and after all the big 
> new fetures that we want to add to ksm will be merge....
> (then the api would be much more stable, and we will be able to ask 
> ppl in the list about changing of api, but for new driver that it yet 
> to be merged, it is kind of overkill to add api to linux)
>
> What do you think?

You can't change ABIs after something is merged or you break userspace.  
So you need to figure out the right ABI first.

Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Wright April 2, 2009, 12:48 a.m. UTC | #20
* Anthony Liguori (anthony@codemonkey.ws) wrote:
> You can't change ABIs after something is merged or you break userspace.   
> So you need to figure out the right ABI first.

Absolutely.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Wright April 2, 2009, 1:22 a.m. UTC | #21
* Anthony Liguori (anthony@codemonkey.ws) wrote:
> The ioctl() interface is quite bad for what you're doing.  You're  
> telling the kernel extra information about a VA range in userspace.   
> That's what madvise is for.  You're tweaking simple read/write values of  
> kernel infrastructure.  That's what sysfs is for.

I agree re: sysfs (brought it up myself before).  As far as madvise vs.
ioctl, the one thing that comes from the ioctl is fops->release to
automagically unregister memory on exit.  This needs to be handled
anyway if some -p pid is added to add a process after it's running,
so less weight there.

thanks,
-chris
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anthony Liguori April 2, 2009, 2:36 a.m. UTC | #22
Chris Wright wrote:
> * Anthony Liguori (anthony@codemonkey.ws) wrote:
>   
>> The ioctl() interface is quite bad for what you're doing.  You're  
>> telling the kernel extra information about a VA range in userspace.   
>> That's what madvise is for.  You're tweaking simple read/write values of  
>> kernel infrastructure.  That's what sysfs is for.
>>     
>
> I agree re: sysfs (brought it up myself before).  As far as madvise vs.
> ioctl, the one thing that comes from the ioctl is fops->release to
> automagically unregister memory on exit.

This is precisely why ioctl() is a bad interface.  fops->release isn't 
tied to the process but rather tied to the open file.  The file can stay 
open long after the process exits either by a fork()'d child inheriting 
the file descriptor or through something more sinister like SCM_RIGHTS.

In fact, a common mistake is to leak file descriptors by not closing 
them when exec()'ing a process.  Instead of just delaying a close, if 
you rely on this behavior to unregister memory regions, you could 
potentially have badness happen in the kernel if ksm attempted to access 
an invalid memory region.

So you absolutely have to automatically unregister regions in something 
other than the fops->release handler based on something that's tied to 
the pid's life cycle.

Using an interface like madvise() would force the issue to be dealt with 
properly from the start :-)

I'm often afraid of what sort of bugs we'd uncover in kvm if we passed 
the fds around via SCM_RIGHTS and started poking around :-/

Regards,

Anthony Liguori


>   This needs to be handled
> anyway if some -p pid is added to add a process after it's running,
> so less weight there.
>
> thanks,
> -chris
>   

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity April 2, 2009, 7:24 a.m. UTC | #23
Anthony Liguori wrote:
>
> I'm often afraid of what sort of bugs we'd uncover in kvm if we passed 
> the fds around via SCM_RIGHTS and started poking around :-/

kvm checks the mm doesn't change underneath.
Andrea Arcangeli April 2, 2009, 9:38 a.m. UTC | #24
On Wed, Apr 01, 2009 at 09:36:31PM -0500, Anthony Liguori wrote:
> on this behavior to unregister memory regions, you could potentially have 
> badness happen in the kernel if ksm attempted to access an invalid memory 
> region.

How could you possibly come to this conclusion? If badness could ever
happen then the original task with access to /dev/ksm could make the
same badness happen in the first place without needing to exec or pass
the fd to anybody else with IPC.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Izik Eidus April 2, 2009, 11:23 a.m. UTC | #25
Anthony Liguori wrote:
> Chris Wright wrote:
>> * Anthony Liguori (anthony@codemonkey.ws) wrote:
>>  
>>> The ioctl() interface is quite bad for what you're doing.  You're  
>>> telling the kernel extra information about a VA range in 
>>> userspace.   That's what madvise is for.  You're tweaking simple 
>>> read/write values of  kernel infrastructure.  That's what sysfs is for.
>>>     
>>
>> I agree re: sysfs (brought it up myself before).  As far as madvise vs.
>> ioctl, the one thing that comes from the ioctl is fops->release to
>> automagically unregister memory on exit.
>
> This is precisely why ioctl() is a bad interface.  fops->release isn't 
> tied to the process but rather tied to the open file.  The file can 
> stay open long after the process exits either by a fork()'d child 
> inheriting the file descriptor or through something more sinister like 
> SCM_RIGHTS.
>
> In fact, a common mistake is to leak file descriptors by not closing 
> them when exec()'ing a process.  Instead of just delaying a close, if 
> you rely on this behavior to unregister memory regions, you could 
> potentially have badness happen in the kernel if ksm attempted to 
> access an invalid memory region. 
How could such badness ever happen in the kernel?
Ksm work by virtual addresses!, it fetch the pages by using 
get_user_pages(), and the mm struct is protected by get_task_mm(), in 
addion we take the down_read(mmap_sem)

So how could ksm ever acces to invalid memory region unless the host 
page table or get_task_mm() would stop working!

When someone register memory for scan, we do get_task_mm() when the file 
is closed or when he say that he dont want this to be registered anymore 
he call the unregister ioctl


You can aurgoment about API, but this is mathamathical thing to say Ksm 
is insecure, please show me senario!
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
new file mode 100644
index 0000000..5776dce
--- /dev/null
+++ b/include/linux/ksm.h
@@ -0,0 +1,69 @@ 
+#ifndef __LINUX_KSM_H
+#define __LINUX_KSM_H
+
+/*
+ * Userspace interface for /dev/ksm - kvm shared memory
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#include <asm/types.h>
+
+#define KSM_API_VERSION 1
+
+#define ksm_control_flags_run 1
+
+/* for KSM_REGISTER_MEMORY_REGION */
+struct ksm_memory_region {
+	__u32 npages; /* number of pages to share */
+	__u32 pad;
+	__u64 addr; /* the begining of the virtual address */
+        __u64 reserved_bits;
+};
+
+struct ksm_kthread_info {
+	__u32 sleep; /* number of microsecoends to sleep */
+	__u32 pages_to_scan; /* number of pages to scan */
+	__u32 flags; /* control flags */
+        __u32 pad;
+        __u64 reserved_bits;
+};
+
+#define KSMIO 0xAB
+
+/* ioctls for /dev/ksm */
+
+#define KSM_GET_API_VERSION              _IO(KSMIO,   0x00)
+/*
+ * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory reagion fd
+ */
+#define KSM_CREATE_SHARED_MEMORY_AREA    _IO(KSMIO,   0x01) /* return SMA fd */
+/*
+ * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
+ * (can stop the kernel thread from working by setting running = 0)
+ */
+#define KSM_START_STOP_KTHREAD		 _IOW(KSMIO,  0x02,\
+					      struct ksm_kthread_info)
+/*
+ * KSM_GET_INFO_KTHREAD - return information about the kernel thread
+ * scanning speed.
+ */
+#define KSM_GET_INFO_KTHREAD		 _IOW(KSMIO,  0x03,\
+					      struct ksm_kthread_info)
+
+
+/* ioctls for SMA fds */
+
+/*
+ * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
+ * scanned by kvm.
+ */
+#define KSM_REGISTER_MEMORY_REGION       _IOW(KSMIO,  0x20,\
+					      struct ksm_memory_region)
+/*
+ * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
+ */
+#define KSM_REMOVE_MEMORY_REGION         _IO(KSMIO,   0x21)
+
+#endif
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index a820f81..6d4f8df 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -29,6 +29,7 @@ 
 #define HPET_MINOR		228
 #define FUSE_MINOR		229
 #define KVM_MINOR		232
+#define KSM_MINOR		233
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b7781..2818223 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,3 +216,9 @@  config UNEVICTABLE_LRU
 
 config MMU_NOTIFIER
 	bool
+
+config KSM
+	tristate "Enable KSM for page sharing"
+	help
+	  Enable the KSM kernel module to allow page sharing of equal pages
+	  among different tasks.
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..e3bf7bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@  obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 0000000..eba4c09
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1431 @@ 
+/*
+ * Memory merging driver for Linux
+ *
+ * This module enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008 Red Hat, Inc.
+ * Authors:
+ *	Izik Eidus
+ *	Andrea Arcangeli
+ *	Chris Wright
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/scatterlist.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/rbtree.h>
+#include <linux/anon_inodes.h>
+#include <linux/ksm.h>
+
+#include <asm/tlbflush.h>
+
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+static int rmap_hash_size;
+module_param(rmap_hash_size, int, 0);
+MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
+
+/*
+ * ksm_mem_slot - hold information for a userspace scanning range
+ * (the scanning for this region will be from addr until addr +
+ *  npages * PAGE_SIZE inside mm)
+ */
+struct ksm_mem_slot {
+	struct list_head link;		/* into the global slots list */
+	struct list_head sma_link;	/* into the owning ksm_sma's slot list */
+	struct mm_struct *mm;		/* mm whose pages are scanned */
+	unsigned long addr;	/* the beginning of the virtual address */
+	unsigned npages;	/* number of pages to share */
+};
+
+/*
+ * ksm_sma - shared memory area, each process has its own sma that contains
+ * the information about the slots that it owns
+ */
+struct ksm_sma {
+	struct list_head sma_slots;	/* list of ksm_mem_slot.sma_link */
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @slot_index: the current slot we are scanning
+ * @page_index: the page inside the sma that is currently being scanned
+ *
+ * ksm uses it to know what are the next pages it needs to scan
+ */
+struct ksm_scan {
+	struct ksm_mem_slot *slot_index;
+	unsigned long page_index;
+};
+
+/*
+ * A few notes about ksm scanning progress (to make it easier to understand
+ * the data structures below):
+ *
+ * In order to reduce excessive scanning, ksm sorts the memory pages by their
+ * contents into a data structure that holds pointers into the pages.
+ *
+ * Since the contents of the pages may change at any moment, ksm can't just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ *
+ * For this purpose ksm uses two data structures - the stable and unstable
+ * trees.  The stable tree holds pointers into all the merged pages (KsmPage)
+ * sorted by their contents.  Because each such page has to be
+ * write-protected, searching this tree is guaranteed to work and therefore
+ * it is called the stable tree.
+ *
+ * In addition to the stable tree, ksm uses a second data structure called
+ * the unstable tree: this tree holds pointers into pages that have been
+ * found to be "unchanged for a period of time".  The unstable tree sorts
+ * these pages by their contents, but since these pages are not
+ * write-protected, ksm can't trust the unstable tree to be fully correct.
+ * Because the unstable tree would become corrupted when some of the pages
+ * inside it change, the tree is called unstable.
+ * Ksm addresses this problem in several ways:
+ * 1) The unstable tree is flushed every time ksm finishes scanning the
+ *    whole memory, and then the tree is rebuilt from the beginning.
+ * 2) Ksm will only insert into the unstable tree pages whose hash value
+ *    has not changed during the whole progress of one circular scan of
+ *    the memory.
+ * 3) The unstable tree is a RedBlack Tree - its balancing is based on the
+ *    colors of the nodes and not their contents; this assures that even
+ *    when the tree gets "corrupted" it won't get out of balance and the
+ *    scanning time stays the same.  Also, searching and inserting nodes
+ *    into an rbtree use the same walking algorithm, therefore we have no
+ *    overhead when we flush the tree and rebuild it.
+ * 4) Ksm never flushes the stable tree, which means that even if it took
+ *    ten attempts to find a page inside the unstable tree, as soon as it
+ *    is found it will be secured inside the stable tree.
+ *    (When we scan a new page, we first compare it against the stable
+ *     tree, and then against the unstable tree.)
+ */
+
+struct rmap_item;
+
+/*
+ * tree_item - object of the stable and unstable trees
+ */
+struct tree_item {
+	struct rb_node node;		/* node inside one of the two rbtrees */
+	struct rmap_item *rmap_item;	/* head of the list of sharers */
+};
+
+/*
+ * rmap_item - object of the rmap_hash hash table
+ * (it holds the previous hash value (oldchecksum),
+ *  and a pointer into the tree_item)
+ */
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @link: link into the rmap_hash hash table.
+ * @mm: the memory structure the rmap_item is pointing to.
+ * @address: the virtual address the rmap_item is pointing to.
+ * @oldchecksum: old checksum result for the page belonging to the virtual
+ *               address
+ * @stable_tree: when 1 the rmap_item is used for the stable tree, 0 for the
+ *               unstable tree
+ * @tree_item: pointer into the stable/unstable tree that holds the virtual
+ *             address that the rmap_item is pointing to.
+ * @next: the next rmap item found inside the same stable tree node.
+ */
+
+struct rmap_item {
+	struct hlist_node link;
+	struct mm_struct *mm;
+	unsigned long address;
+	unsigned int oldchecksum; /* old checksum value */
+	unsigned char stable_tree; /* 1 stable_tree 0 unstable tree */
+	struct tree_item *tree_item;
+	struct rmap_item *next;
+	struct rmap_item *prev;
+};
+
+/*
+ * slots is a linked list that holds all the memory regions that were
+ * registered to be scanned.
+ */
+static LIST_HEAD(slots);
+/*
+ * slots_lock protects against removing and adding memory regions while a
+ * scanner is in the middle of scanning.
+ */
+static DECLARE_RWSEM(slots_lock);
+
+/* The stable and unstable trees heads. */
+struct rb_root root_stable_tree = RB_ROOT;
+struct rb_root root_unstable_tree = RB_ROOT;
+
+
+/* The number of linked list members inside the hash table */
+static int nrmaps_hash;
+/* rmap_hash hash table */
+static struct hlist_head *rmap_hash;
+
+static struct kmem_cache *tree_item_cache;
+static struct kmem_cache *rmap_item_cache;
+
+static int kthread_sleep; /* sleep time of the kernel thread */
+static int kthread_pages_to_scan; /* npages to scan for the kernel thread */
+static struct ksm_scan kthread_ksm_scan;
+static int ksmd_flags;
+static struct task_struct *kthread;
+static DECLARE_WAIT_QUEUE_HEAD(kthread_wait);
+static DECLARE_RWSEM(kthread_lock);
+
+/*
+ * Create the slab caches used for tree_item and rmap_item allocations.
+ * Returns 0 on success, -ENOMEM if either cache cannot be created.
+ */
+static int ksm_slab_init(void)
+{
+	int ret = -ENOMEM;
+
+	tree_item_cache = KMEM_CACHE(tree_item, 0);
+	if (!tree_item_cache)
+		goto out;
+
+	rmap_item_cache = KMEM_CACHE(rmap_item, 0);
+	if (!rmap_item_cache)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	kmem_cache_destroy(tree_item_cache);
+out:
+	return ret;
+}
+
+/* Tear down the slab caches created by ksm_slab_init(). */
+static void ksm_slab_free(void)
+{
+	kmem_cache_destroy(rmap_item_cache);
+	kmem_cache_destroy(tree_item_cache);
+}
+
+/* Allocate a zeroed tree_item from its slab cache (may sleep). */
+static inline struct tree_item *alloc_tree_item(void)
+{
+	return kmem_cache_zalloc(tree_item_cache, GFP_KERNEL);
+}
+
+/* Return a tree_item to its slab cache. */
+static void free_tree_item(struct tree_item *tree_item)
+{
+	kmem_cache_free(tree_item_cache, tree_item);
+}
+
+/* Allocate a zeroed rmap_item from its slab cache (may sleep). */
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+	return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+}
+
+/* Return an rmap_item to its slab cache. */
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+/*
+ * PageKsm - this type of pages are the write protected pages that ksm maps
+ * into multiple vmas (this is the "shared page")
+ * this page was allocated using alloc_page(), and every pte that points to
+ * it is always write protected (therefore its data content can't ever be
+ * changed) and this page can't be swapped.
+ */
+static inline int PageKsm(struct page *page)
+{
+	/*
+	 * When ksm creates a new shared page, it creates a kernel allocated
+	 * page using alloc_page(), therefore this page is not anonymous.
+	 * Taking into account that ksm scans just anonymous pages, we can
+	 * rely on the fact that each time we see !PageAnon(page) we are
+	 * hitting a shared page.
+	 */
+	return !PageAnon(page);
+}
+
+/*
+ * Allocate the reverse-mapping hash table.
+ * If the rmap_hash_size module parameter was not given, the table is sized
+ * at one tenth of the total number of ram pages.
+ * Returns 0 on success, -ENOMEM if the table cannot be allocated.
+ */
+static int rmap_hash_init(void)
+{
+	if (!rmap_hash_size) {
+		struct sysinfo sinfo;
+
+		si_meminfo(&sinfo);
+		rmap_hash_size = sinfo.totalram / 10;
+	}
+	nrmaps_hash = rmap_hash_size;
+	rmap_hash = vmalloc(nrmaps_hash * sizeof(struct hlist_head));
+	if (!rmap_hash)
+		return -ENOMEM;
+	memset(rmap_hash, 0, nrmaps_hash * sizeof(struct hlist_head));
+	return 0;
+}
+
+/*
+ * Free every rmap_item left in the hash table, then free the table itself.
+ * Called on module unload.
+ */
+static void rmap_hash_free(void)
+{
+	int i;
+	struct hlist_head *bucket;
+	struct hlist_node *node, *n;
+	struct rmap_item *rmap_item;
+
+	for (i = 0; i < nrmaps_hash; ++i) {
+		bucket = &rmap_hash[i];
+		hlist_for_each_entry_safe(rmap_item, node, n, bucket, link) {
+			hlist_del(&rmap_item->link);
+			free_rmap_item(rmap_item);
+		}
+	}
+	vfree(rmap_hash);
+}
+
+/* Compute a jhash checksum over the full contents of @page. */
+static inline u32 calc_checksum(struct page *page)
+{
+	u32 checksum;
+	void *addr = kmap_atomic(page, KM_USER0);
+	checksum = jhash(addr, PAGE_SIZE, 17);
+	kunmap_atomic(addr, KM_USER0);
+	return checksum;
+}
+
+/*
+ * Return the rmap_item for a given virtual address inside @mm,
+ * or NULL when the address is not tracked in the hash table.
+ */
+static struct rmap_item *get_rmap_item(struct mm_struct *mm, unsigned long addr)
+{
+	struct rmap_item *rmap_item;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	bucket = &rmap_hash[addr % nrmaps_hash];
+	hlist_for_each_entry(rmap_item, node, bucket, link) {
+		/* kernel style: no braces around the single statement */
+		if (mm == rmap_item->mm && rmap_item->address == addr)
+			return rmap_item;
+	}
+	return NULL;
+}
+
+/*
+ * Remove an rmap_item from the stable or unstable tree.
+ * This function frees the rmap_item object, and if that rmap_item was
+ * inside the stable or unstable tree, it removes the link from there
+ * as well.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+	struct tree_item *tree_item;
+
+	tree_item = rmap_item->tree_item;
+	rmap_item->tree_item = NULL;
+
+	if (rmap_item->stable_tree) {
+		/* unlink from the per-node list of sharers */
+		if (rmap_item->prev) {
+			BUG_ON(rmap_item->prev->next != rmap_item);
+			rmap_item->prev->next = rmap_item->next;
+		}
+		if (rmap_item->next) {
+			BUG_ON(rmap_item->next->prev != rmap_item);
+			rmap_item->next->prev = rmap_item->prev;
+		}
+	}
+
+	if (tree_item) {
+		if (rmap_item->stable_tree) {
+			if (!rmap_item->next && !rmap_item->prev) {
+				/* last sharer: drop the whole tree node */
+				rb_erase(&tree_item->node, &root_stable_tree);
+				free_tree_item(tree_item);
+			} else if (!rmap_item->prev) {
+				tree_item->rmap_item = rmap_item->next;
+			} else {
+				tree_item->rmap_item = rmap_item->prev;
+			}
+		} else {
+			/*
+			 * NOTE(review): the unstable tree_item is freed
+			 * without rb_erase(); this relies on callers
+			 * resetting root_unstable_tree before the tree is
+			 * searched again - verify all call paths.
+			 */
+			free_tree_item(tree_item);
+		}
+	}
+
+	hlist_del(&rmap_item->link);
+	free_rmap_item(rmap_item);
+}
+
+/*
+ * Remove the rmap_item (if any) that tracks @addr inside @mm from the
+ * hash table and from the stable/unstable tree.
+ */
+static void remove_page_from_tree(struct mm_struct *mm,
+				  unsigned long addr)
+{
+	struct rmap_item *rmap_item;
+
+	rmap_item = get_rmap_item(mm, addr);
+	if (rmap_item)
+		remove_rmap_item_from_tree(rmap_item);
+}
+
+/*
+ * Register a new memory region of the current task for scanning.
+ * The slot is linked both into the global slots list (walked by the
+ * scanner) and into the sma's private list (used for cleanup on release).
+ * Returns 0 on success, -ENOMEM on allocation failure, -EPERM when the
+ * current task has no mm.
+ * NOTE(review): mem->addr and mem->npages come from userspace and are not
+ * validated here; the scanner must tolerate unmapped ranges - verify.
+ */
+static int ksm_sma_ioctl_register_memory_region(struct ksm_sma *ksm_sma,
+						struct ksm_memory_region *mem)
+{
+	struct ksm_mem_slot *slot;
+	int ret = -EPERM;
+
+	slot = kzalloc(sizeof(struct ksm_mem_slot), GFP_KERNEL);
+	if (!slot) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* take a reference on the mm; dropped by mmput() on unregister */
+	slot->mm = get_task_mm(current);
+	if (!slot->mm)
+		goto out_free;
+	slot->addr = mem->addr;
+	slot->npages = mem->npages;
+
+	down_write(&slots_lock);
+
+	list_add_tail(&slot->link, &slots);
+	list_add_tail(&slot->sma_link, &ksm_sma->sma_slots);
+
+	up_write(&slots_lock);
+	return 0;
+
+out_free:
+	kfree(slot);
+out:
+	return ret;
+}
+
+/*
+ * Remove every scanned page of @mm from the hash table and the trees.
+ * Callers must hold slots_lock for writing and must only pass an mm that
+ * is currently registered in a slot.
+ */
+static void remove_mm_from_hash_and_tree(struct mm_struct *mm)
+{
+	struct ksm_mem_slot *slot;
+	unsigned pages_count;
+	int found = 0;
+
+	list_for_each_entry(slot, &slots, link) {
+		if (slot->mm == mm) {
+			found = 1;
+			break;
+		}
+	}
+	/*
+	 * The previous "BUG_ON(!slot)" could never trigger: after a full
+	 * list_for_each_entry() walk the cursor is not NULL (it points at
+	 * the list head container), so use an explicit found flag.
+	 */
+	BUG_ON(!found);
+
+	/* the unstable tree may hold pointers into this mm, flush it */
+	root_unstable_tree = RB_ROOT;
+	for (pages_count = 0; pages_count < slot->npages; ++pages_count)
+		remove_page_from_tree(mm, slot->addr +
+				      pages_count * PAGE_SIZE);
+	list_del(&slot->link);
+}
+
+/*
+ * Unregister every memory region that was registered through this sma:
+ * drop the tracked pages from the hash/trees, release the mm reference
+ * and free the slot.  Always returns 0.
+ */
+static int ksm_sma_ioctl_remove_memory_region(struct ksm_sma *ksm_sma)
+{
+	struct ksm_mem_slot *slot, *node;
+
+	down_write(&slots_lock);
+	list_for_each_entry_safe(slot, node, &ksm_sma->sma_slots, sma_link) {
+		remove_mm_from_hash_and_tree(slot->mm);
+		mmput(slot->mm);
+		list_del(&slot->sma_link);
+		kfree(slot);
+	}
+	up_write(&slots_lock);
+	return 0;
+}
+
+/*
+ * Release callback for an SMA fd: unregister all of its memory regions
+ * and free the ksm_sma itself.
+ */
+static int ksm_sma_release(struct inode *inode, struct file *filp)
+{
+	struct ksm_sma *ksm_sma = filp->private_data;
+	int r;
+
+	r = ksm_sma_ioctl_remove_memory_region(ksm_sma);
+	kfree(ksm_sma);
+	return r;
+}
+
+/*
+ * ioctl dispatcher for SMA fds.
+ * Returns 0 on success or a negative errno; unknown ioctls get -EINVAL
+ * (the previous code initialized r to positive EINVAL, leaking a positive
+ * value to userspace).
+ */
+static long ksm_sma_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	struct ksm_sma *sma = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r = -EINVAL;
+
+	switch (ioctl) {
+	case KSM_REGISTER_MEMORY_REGION: {
+		struct ksm_memory_region ksm_memory_region;
+
+		r = -EFAULT;
+		if (copy_from_user(&ksm_memory_region, argp,
+				   sizeof(ksm_memory_region)))
+			goto out;
+		r = ksm_sma_ioctl_register_memory_region(sma,
+							 &ksm_memory_region);
+		break;
+	}
+	case KSM_REMOVE_MEMORY_REGION:
+		r = ksm_sma_ioctl_remove_memory_region(sma);
+		break;
+	}
+
+out:
+	return r;
+}
+
+/*
+ * Compute the virtual address at which @page is mapped inside @vma.
+ * Returns -EFAULT (as an unsigned long) when the computed address falls
+ * outside the vma.
+ */
+static unsigned long addr_in_vma(struct vm_area_struct *vma, struct page *page)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long addr;
+
+	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(addr < vma->vm_start || addr >= vma->vm_end))
+		return -EFAULT;
+	return addr;
+}
+
+/*
+ * Walk the page tables of @mm and return a mapped pte pointer for @addr,
+ * or NULL if some page table level is not present.
+ * The caller must pte_unmap() a non-NULL result.
+ */
+static pte_t *get_pte(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map(pmd, addr);
+out:
+	return ptep;
+}
+
+/* Return 1 when a present pte maps @addr inside @mm, 0 otherwise. */
+static int is_present_pte(struct mm_struct *mm, unsigned long addr)
+{
+	pte_t *ptep;
+	int r;
+
+	ptep = get_pte(mm, addr);
+	if (!ptep)
+		return 0;
+
+	r = pte_present(*ptep);
+	pte_unmap(ptep);
+
+	return r;
+}
+
+/*
+ * memcmp the full contents of two pages.
+ * Returns the memcmp() result (<0, 0, >0).
+ */
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+	char *addr1, *addr2;
+	int r;
+
+	addr1 = kmap_atomic(page1, KM_USER0);
+	addr2 = kmap_atomic(page2, KM_USER1);
+	r = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr1, KM_USER0);
+	kunmap_atomic(addr2, KM_USER1);
+	return r;
+}
+
+/* pages_identical
+ * return 1 if identical, 0 otherwise.
+ */
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return !memcmp_pages(page1, page2);
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @mm: mm_struct that holds the vma pointing into oldpage
+ * @vma: the vma that holds the pte pointing into oldpage
+ * @oldpage: the page that we want to replace with newpage
+ * @newpage: the page that we want to map instead of oldpage
+ * @newprot: the new permission of the pte inside vma
+ * note:
+ * oldpage should be an anon page while newpage should be a file mapped page
+ *
+ * this function returns 0 if the pages were merged, 1 otherwise.
+ */
+static int try_to_merge_one_page(struct mm_struct *mm,
+				 struct vm_area_struct *vma,
+				 struct page *oldpage,
+				 struct page *newpage,
+				 pgprot_t newprot)
+{
+	int ret = 1;
+	int odirect_sync;
+	unsigned long page_addr_in_vma;
+	pte_t orig_pte, *orig_ptep;
+
+	/* hold references on both pages across the pte manipulation */
+	get_page(newpage);
+	get_page(oldpage);
+
+	down_read(&mm->mmap_sem);
+
+	page_addr_in_vma = addr_in_vma(vma, oldpage);
+	if (page_addr_in_vma == -EFAULT)
+		goto out_unlock;
+
+	orig_ptep = get_pte(mm, page_addr_in_vma);
+	if (!orig_ptep)
+		goto out_unlock;
+	orig_pte = *orig_ptep;
+	pte_unmap(orig_ptep);
+	if (!pte_present(orig_pte))
+		goto out_unlock;
+	/* bail out if the pte no longer points at oldpage */
+	if (page_to_pfn(oldpage) != pte_pfn(orig_pte))
+		goto out_unlock;
+	/*
+	 * we need the page lock to read a stable PageSwapCache in
+	 * page_wrprotect()
+	 */
+	if (!trylock_page(oldpage))
+		goto out_unlock;
+	/*
+	 * page_wrprotect checks if the page is swapped or in the swap cache;
+	 * in the future we might want to run is_present_pte here and then
+	 * swap_free
+	 */
+	if (!page_wrprotect(oldpage, &odirect_sync, 2)) {
+		unlock_page(oldpage);
+		goto out_unlock;
+	}
+	unlock_page(oldpage);
+	/* NOTE(review): odirect_sync semantics come from page_wrprotect()
+	 * (not visible here) - presumably cleared when the page may be under
+	 * O_DIRECT i/o; verify against that helper. */
+	if (!odirect_sync)
+		goto out_unlock;
+
+	orig_pte = pte_wrprotect(orig_pte);
+
+	if (pages_identical(oldpage, newpage))
+		ret = replace_page(vma, oldpage, newpage, orig_pte, newprot);
+
+out_unlock:
+	up_read(&mm->mmap_sem);
+	put_page(oldpage);
+	put_page(newpage);
+	return ret;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them to be
+ * merged into one page.
+ *
+ * this function returns 0 if we successfully mapped two identical pages
+ * into one page, 1 otherwise.
+ * (note: in case we created a KsmPage and mapped one page into it but the
+ *  second page was not mapped, we consider it a failure and return 1)
+ */
+static int try_to_merge_two_pages(struct mm_struct *mm1, struct page *page1,
+				  struct mm_struct *mm2, struct page *page2,
+				  unsigned long addr1, unsigned long addr2)
+{
+	struct vm_area_struct *vma;
+	pgprot_t prot;
+	int ret = 1;
+
+	/*
+	 * If page2 isn't shared (it isn't PageKsm) we have to allocate a new
+	 * file mapped page and make the two ptes of mm1(page1) and mm2(page2)
+	 * point to it.  If page2 is shared, we can just make the pte of
+	 * mm1(page1) point to page2
+	 */
+	if (PageKsm(page2)) {
+		down_read(&mm1->mmap_sem);
+		vma = find_vma(mm1, addr1);
+		up_read(&mm1->mmap_sem);
+		if (!vma)
+			return ret;
+		/* map page1 read-only onto the existing shared page */
+		prot = vma->vm_page_prot;
+		pgprot_val(prot) &= ~_PAGE_RW;
+		ret = try_to_merge_one_page(mm1, vma, page1, page2, prot);
+	} else {
+		struct page *kpage;
+
+		kpage = alloc_page(GFP_HIGHUSER);
+		if (!kpage)
+			return ret;
+		down_read(&mm1->mmap_sem);
+		vma = find_vma(mm1, addr1);
+		up_read(&mm1->mmap_sem);
+		if (!vma) {
+			put_page(kpage);
+			return ret;
+		}
+		prot = vma->vm_page_prot;
+		pgprot_val(prot) &= ~_PAGE_RW;
+
+		/* seed the new shared page with page1's contents */
+		copy_user_highpage(kpage, page1, addr1, vma);
+		ret = try_to_merge_one_page(mm1, vma, page1, kpage, prot);
+
+		if (!ret) {
+			down_read(&mm2->mmap_sem);
+			vma = find_vma(mm2, addr2);
+			up_read(&mm2->mmap_sem);
+			if (!vma) {
+				put_page(kpage);
+				ret = 1;
+				return ret;
+			}
+
+			prot = vma->vm_page_prot;
+			pgprot_val(prot) &= ~_PAGE_RW;
+
+			ret = try_to_merge_one_page(mm2, vma, page2, kpage,
+						    prot);
+			/*
+			 * If the second try_to_merge_one_page call failed,
+			 * we are in a situation where we have a Ksm page
+			 * that has just one pte pointing to it; in this
+			 * case we break it (the write fault COWs it back).
+			 */
+			if (ret) {
+				struct page *tmppage[1];
+
+				down_read(&mm1->mmap_sem);
+				if (get_user_pages(current, mm1, addr1, 1, 1,
+						    0, tmppage, NULL)) {
+					put_page(tmppage[0]);
+				}
+				up_read(&mm1->mmap_sem);
+			}
+		}
+		put_page(kpage);
+	}
+	return ret;
+}
+
+/*
+ * is_zapped_item - check if the page belonging to the rmap_item was zapped.
+ *
+ * This function checks if the page that the virtual address inside
+ * rmap_item is pointing to is still a KsmPage, and therefore whether we
+ * can trust the content of this page.
+ * Since this function already calls get_user_pages, it returns the
+ * pointer to the page as an optimization.
+ *
+ * Returns 1 when the item is zapped (no page reference is left for the
+ * caller); returns 0 when the page is still a KsmPage, in which case the
+ * caller must put_page() page[0].
+ */
+static int is_zapped_item(struct rmap_item *rmap_item,
+			  struct page **page)
+{
+	int ret = 0;
+
+	cond_resched();
+	if (is_present_pte(rmap_item->mm, rmap_item->address)) {
+		down_read(&rmap_item->mm->mmap_sem);
+		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
+				     1, 0, 0, page, NULL);
+		up_read(&rmap_item->mm->mmap_sem);
+	}
+
+	if (!ret)
+		return 1;
+
+	if (unlikely(!PageKsm(page[0]))) {
+		put_page(page[0]);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * stable_tree_search - search a page inside the stable tree
+ * @page: the page that we are searching identical pages to.
+ * @page2: pointer to an identical page that we are holding inside the
+ *	   stable tree that we have found.
+ * @rmap_item: the reverse mapping item
+ *
+ * this function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * this function returns an rmap_item pointer to the identical item if
+ * found, NULL otherwise.  On success page2[0] holds a page reference that
+ * the caller must drop with put_page().
+ */
+static struct rmap_item *stable_tree_search(struct page *page,
+					    struct page **page2,
+					    struct rmap_item *rmap_item)
+{
+	struct rb_node *node = root_stable_tree.rb_node;
+	struct tree_item *tree_item;
+	struct rmap_item *found_rmap_item;
+
+	while (node) {
+		int ret;
+
+		tree_item = rb_entry(node, struct tree_item, node);
+		found_rmap_item = tree_item->rmap_item;
+		/* find a live sharer of this node, pruning zapped ones */
+		while (found_rmap_item) {
+			BUG_ON(!found_rmap_item->stable_tree);
+			BUG_ON(!found_rmap_item->tree_item);
+			if (!rmap_item ||
+			     !(found_rmap_item->mm == rmap_item->mm &&
+			      found_rmap_item->address == rmap_item->address)) {
+				if (!is_zapped_item(found_rmap_item, page2))
+					break;
+				remove_rmap_item_from_tree(found_rmap_item);
+			}
+			found_rmap_item = found_rmap_item->next;
+		}
+		if (!found_rmap_item)
+			goto out_didnt_find;
+
+		/*
+		 * We can trust the value of the memcmp as we know the pages
+		 * are write protected.
+		 */
+		ret = memcmp_pages(page, page2[0]);
+
+		if (ret < 0) {
+			put_page(page2[0]);
+			node = node->rb_left;
+		} else if (ret > 0) {
+			put_page(page2[0]);
+			node = node->rb_right;
+		} else {
+			goto out_found;
+		}
+	}
+out_didnt_find:
+	found_rmap_item = NULL;
+out_found:
+	return found_rmap_item;
+}
+
+/*
+ * stable_tree_insert - insert into the stable tree a new rmap_item that is
+ * pointing into a new KsmPage.
+ *
+ * @page: the page that we are searching an identical page to inside the
+ *	  stable tree.
+ * @new_tree_item: the new tree item we are going to link into the stable
+ *		   tree.
+ * @rmap_item: pointer to the reverse mapping item.
+ *
+ * this function returns 0 on success, 1 otherwise.
+ */
+static int stable_tree_insert(struct page *page,
+			      struct tree_item *new_tree_item,
+			      struct rmap_item *rmap_item)
+{
+	struct rb_node **new = &(root_stable_tree.rb_node);
+	struct rb_node *parent = NULL;
+	struct tree_item *tree_item;
+	struct page *page2[1];
+
+	while (*new) {
+		int ret;
+		struct rmap_item *insert_rmap_item;
+
+		tree_item = rb_entry(*new, struct tree_item, node);
+		BUG_ON(!tree_item);
+		BUG_ON(!tree_item->rmap_item);
+
+		/* find a live sharer of this node, pruning zapped ones */
+		insert_rmap_item = tree_item->rmap_item;
+		while (insert_rmap_item) {
+			BUG_ON(!insert_rmap_item->stable_tree);
+			BUG_ON(!insert_rmap_item->tree_item);
+			if (!rmap_item ||
+			    !(insert_rmap_item->mm == rmap_item->mm &&
+			     insert_rmap_item->address == rmap_item->address)) {
+				if (!is_zapped_item(insert_rmap_item, page2))
+					break;
+				remove_rmap_item_from_tree(insert_rmap_item);
+			}
+			insert_rmap_item = insert_rmap_item->next;
+		}
+		if (!insert_rmap_item)
+			return 1;
+
+		ret = memcmp_pages(page, page2[0]);
+
+		parent = *new;
+		if (ret < 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_left);
+		} else if (ret > 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_right);
+		} else {
+			/*
+			 * It isn't a bug when we are here, because someone
+			 * else could have merged an identical page into the
+			 * tree in the meantime.
+			 */
+			return 1;
+		}
+	}
+
+	rb_link_node(&new_tree_item->node, parent, new);
+	rb_insert_color(&new_tree_item->node, &root_stable_tree);
+	rmap_item->stable_tree = 1;
+	rmap_item->tree_item = new_tree_item;
+
+	return 0;
+}
+
+/*
+ * unstable_tree_search_insert - search and insert items into the unstable
+ * tree.
+ *
+ * @page: the page that we are going to search for an identical page of, or
+ *	  to insert into the unstable tree
+ * @page2: pointer to an identical page that was found inside the unstable
+ *	   tree
+ * @page_rmap_item: the reverse mapping item of page
+ *
+ * this function searches whether a page identical to the page that we are
+ * scanning right now is found inside the unstable tree, and in case no
+ * page with identical content exists inside the unstable tree, we insert
+ * page_rmap_item as a new object into the unstable tree.
+ *
+ * this function returns a pointer to the tree_item of the item found to
+ * be identical to the page that we are scanning right now, NULL otherwise.
+ * On a match page2[0] holds a page reference the caller must drop.
+ *
+ * (this function does both searching and inserting, because searching and
+ *  inserting share the same walking algorithm in rbtrees)
+ */
+static struct tree_item *unstable_tree_search_insert(struct page *page,
+					struct page **page2,
+					struct rmap_item *page_rmap_item)
+{
+	struct rb_node **new = &(root_unstable_tree.rb_node);
+	struct rb_node *parent = NULL;
+	struct tree_item *tree_item;
+	struct tree_item *new_tree_item;
+	struct rmap_item *rmap_item;
+
+	while (*new) {
+		int ret;
+
+		tree_item = rb_entry(*new, struct tree_item, node);
+		BUG_ON(!tree_item);
+		rmap_item = tree_item->rmap_item;
+		BUG_ON(!rmap_item);
+
+		/*
+		 * We don't want to swap in pages
+		 */
+		if (!is_present_pte(rmap_item->mm, rmap_item->address))
+			return NULL;
+
+		down_read(&rmap_item->mm->mmap_sem);
+		ret = get_user_pages(current, rmap_item->mm, rmap_item->address,
+				     1, 0, 0, page2, NULL);
+		up_read(&rmap_item->mm->mmap_sem);
+		if (!ret)
+			return NULL;
+
+		ret = memcmp_pages(page, page2[0]);
+
+		parent = *new;
+		if (ret < 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_left);
+		} else if (ret > 0) {
+			put_page(page2[0]);
+			new = &((*new)->rb_right);
+		} else {
+			return tree_item;
+		}
+	}
+
+	if (!page_rmap_item)
+		return NULL;
+
+	/* no match: link this page into the unstable tree */
+	new_tree_item = alloc_tree_item();
+	if (!new_tree_item)
+		return NULL;
+
+	page_rmap_item->tree_item = new_tree_item;
+	page_rmap_item->stable_tree = 0;
+	new_tree_item->rmap_item = page_rmap_item;
+	rb_link_node(&new_tree_item->node, parent, new);
+	rb_insert_color(&new_tree_item->node, &root_unstable_tree);
+
+	return NULL;
+}
+
+/*
+ * update_tree - check if the page belonging to the rmap_item got zapped,
+ * and if it got zapped, kick it from the tree.
+ * (the old comment named this update_stable_tree; the function handles
+ *  both trees)
+ *
+ * we set wait to 1 in case we find that the rmap_item was an object
+ * inside the stable tree.
+ * (this is used to notify that we don't want to create a new rmap_item
+ *  for it at this moment, but the next time)
+ * wait is left unchanged in case the rmap_item was an object inside the
+ * unstable tree.
+ *
+ * Returns 1 when the rmap_item was removed, 0 when it is still valid.
+ */
+static int update_tree(struct rmap_item *rmap_item, int *wait)
+{
+	struct page *page[1];
+
+	if (!rmap_item->stable_tree) {
+		if (rmap_item->tree_item) {
+			remove_rmap_item_from_tree(rmap_item);
+			return 1;
+		}
+		return 0;
+	}
+	if (is_zapped_item(rmap_item, page)) {
+		remove_rmap_item_from_tree(rmap_item);
+		*wait = 1;
+		return 1;
+	}
+	put_page(page[0]);
+	return 0;
+}
+
+/*
+ * Allocate, initialize and hash a new rmap_item for @addr inside @mm.
+ * Returns NULL on allocation failure.
+ */
+static struct rmap_item *create_new_rmap_item(struct mm_struct *mm,
+					      unsigned long addr,
+					      unsigned int checksum)
+{
+	struct rmap_item *rmap_item;
+	struct hlist_head *bucket;
+
+	rmap_item = alloc_rmap_item();
+	if (!rmap_item)
+		return NULL;
+
+	rmap_item->mm = mm;
+	rmap_item->address = addr;
+	rmap_item->oldchecksum = checksum;
+	rmap_item->stable_tree = 0;
+	rmap_item->tree_item = NULL;
+
+	bucket = &rmap_hash[addr % nrmaps_hash];
+	hlist_add_head(&rmap_item->link, bucket);
+
+	return rmap_item;
+}
+
+/*
+ * cmp_and_merge_page - take a page, compute its hash value and check if
+ * there is an identical page tracked under a similar hash value;
+ * in case we find such a page we call try_to_merge_two_pages().
+ *
+ * @ksm_scan: the ksm scanner structure.
+ * @page: the page that we are searching an identical page to.
+ *
+ * Returns 1 when a merge happened, 0 otherwise.
+ */
+static int cmp_and_merge_page(struct ksm_scan *ksm_scan, struct page *page)
+{
+	struct page *page2[1];
+	struct ksm_mem_slot *slot;
+	struct tree_item *tree_item;
+	struct rmap_item *rmap_item;
+	struct rmap_item *tree_rmap_item;
+	unsigned int checksum;
+	unsigned long addr;
+	int wait = 0;
+	int ret = 0;	/* was uninitialized: "out" is reachable with no merge */
+
+	slot = ksm_scan->slot_index;
+	addr = slot->addr + ksm_scan->page_index * PAGE_SIZE;
+	rmap_item = get_rmap_item(slot->mm, addr);
+	if (rmap_item) {
+		if (update_tree(rmap_item, &wait))
+			rmap_item = NULL;
+	}
+
+	/* We first start with searching the page inside the stable tree */
+	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
+	if (tree_rmap_item) {
+		BUG_ON(!tree_rmap_item->tree_item);
+		ret = try_to_merge_two_pages(slot->mm, page, tree_rmap_item->mm,
+					     page2[0], addr,
+					     tree_rmap_item->address);
+		put_page(page2[0]);
+		if (!ret) {
+			/*
+			 * The page was successfully merged; insert its
+			 * rmap_item into the stable tree node's sharer list.
+			 */
+
+			if (!rmap_item)
+				rmap_item = create_new_rmap_item(slot->mm,
+								 addr, 0);
+			if (!rmap_item)
+				return !ret;
+
+			rmap_item->next = tree_rmap_item->next;
+			rmap_item->prev = tree_rmap_item;
+
+			if (tree_rmap_item->next)
+				tree_rmap_item->next->prev = rmap_item;
+
+			tree_rmap_item->next = rmap_item;
+
+			rmap_item->stable_tree = 1;
+			rmap_item->tree_item = tree_rmap_item->tree_item;
+		}
+		ret = !ret;
+		goto out;
+	}
+
+	/*
+	 * If the checksum of the page changed since the last time we
+	 * calculated it, this page is changing frequently; we don't want
+	 * to insert it into the unstable tree, and we don't want to waste
+	 * our time searching for something identical to it there.
+	 */
+	if (rmap_item) {
+		checksum = calc_checksum(page);
+		if (rmap_item->oldchecksum != checksum) {
+			rmap_item->oldchecksum = checksum;
+			goto out;
+		}
+	}
+
+	tree_item = unstable_tree_search_insert(page, page2, rmap_item);
+	if (tree_item) {
+		rmap_item = tree_item->rmap_item;
+		BUG_ON(!rmap_item);
+		ret = try_to_merge_two_pages(slot->mm, page, rmap_item->mm,
+					     page2[0], addr,
+					     rmap_item->address);
+		/*
+		 * As soon as we successfully merged this page, we want to
+		 * move the rmap_item of the page we merged with out of the
+		 * unstable tree and insert it as a new stable tree node.
+		 */
+		if (!ret) {
+			rb_erase(&tree_item->node, &root_unstable_tree);
+			stable_tree_insert(page2[0], tree_item, rmap_item);
+		}
+		put_page(page2[0]);
+		ret = !ret;
+		goto out;
+	}
+	/*
+	 * When wait is 1, we don't want to calculate the hash value of the
+	 * page right now; instead we prefer to wait.
+	 */
+	if (!wait && !rmap_item) {
+		checksum = calc_checksum(page);
+		create_new_rmap_item(slot->mm, addr, checksum);
+	}
+out:
+	return ret;
+}
+
+/*
+ * scan_get_next_index - advance the scan cursor by @nscan pages.
+ *
+ * Moves the cursor forward inside the current memory slot, or on to the
+ * next registered slot once the current one is exhausted.  When every
+ * slot has been fully scanned, the unstable tree is reset and scanning
+ * restarts from the first slot (the unstable tree is rebuilt from
+ * scratch on each full pass).
+ *
+ * Caller holds slots_lock (see ksm_scan_start), which protects the
+ * slots list walked here.
+ *
+ * return -EAGAIN - no slots registered, nothing to be done
+ */
+static int scan_get_next_index(struct ksm_scan *ksm_scan, int nscan)
+{
+	struct ksm_mem_slot *slot;
+
+	if (list_empty(&slots))
+		return -EAGAIN;
+
+	slot = ksm_scan->slot_index;
+
+	/* Are there pages left in this slot to scan? */
+	if ((slot->npages - ksm_scan->page_index - nscan) > 0) {
+		ksm_scan->page_index += nscan;
+		return 0;
+	}
+
+	/*
+	 * Slot exhausted: continue from the next list entry.  The _from
+	 * iterator starts at @slot itself, hence the explicit skip.
+	 */
+	list_for_each_entry_from(slot, &slots, link) {
+		if (slot == ksm_scan->slot_index)
+			continue;
+		ksm_scan->page_index = 0;
+		ksm_scan->slot_index = slot;
+		return 0;
+	}
+
+	/* look like we finished scanning the whole memory, starting again */
+	root_unstable_tree = RB_ROOT;
+	ksm_scan->page_index = 0;
+	ksm_scan->slot_index = list_first_entry(&slots,
+						struct ksm_mem_slot, link);
+	return 0;
+}
+
+/*
+ * scan_update_old_index - revalidate the scan cursor after reacquiring
+ * slots_lock: the slot the cursor pointed at may have been unregistered
+ * in the meantime, so unless it is still a live entry on the slots list,
+ * fall back to the first slot and restart at page 0.  Must be called
+ * every time after taking slots_lock.
+ */
+static void scan_update_old_index(struct ksm_scan *ksm_scan)
+{
+	struct ksm_mem_slot *cur;
+
+	if (list_empty(&slots))
+		return;
+
+	list_for_each_entry(cur, &slots, link) {
+		if (ksm_scan->slot_index == cur)
+			return;
+	}
+
+	/* Stale cursor: restart from the head of the slots list. */
+	ksm_scan->slot_index = list_first_entry(&slots,
+						struct ksm_mem_slot, link);
+	ksm_scan->page_index = 0;
+}
+
+/**
+ * ksm_scan_start - the ksm scanner main worker function.
+ * @ksm_scan:     the scanner cursor (current slot and page index).
+ * @scan_npages:  number of pages to scan before returning.
+ *
+ * (this function can be called from the kernel thread scanner, or from
+ *  userspace ioctl context scanner)
+ *
+ * The function returns -EAGAIN in case there are no slots to scan.
+ */
+static int ksm_scan_start(struct ksm_scan *ksm_scan, unsigned int scan_npages)
+{
+	struct ksm_mem_slot *slot;
+	struct page *page[1];
+	int val;
+	int ret = 0;
+
+	down_read(&slots_lock);
+
+	/* The slot the cursor points at may have been unregistered. */
+	scan_update_old_index(ksm_scan);
+
+	while (scan_npages > 0) {
+		ret = scan_get_next_index(ksm_scan, 1);
+		if (ret)
+			goto out;
+
+		slot = ksm_scan->slot_index;
+
+		cond_resched();
+
+		/*
+		 * If the page is swapped out or in swap cache, we don't want to
+		 * scan it (it is just for performance).
+		 */
+		if (is_present_pte(slot->mm, slot->addr +
+				   ksm_scan->page_index * PAGE_SIZE)) {
+			down_read(&slot->mm->mmap_sem);
+			val = get_user_pages(current, slot->mm, slot->addr +
+					     ksm_scan->page_index * PAGE_SIZE ,
+					      1, 0, 0, page, NULL);
+			up_read(&slot->mm->mmap_sem);
+			if (val == 1) {
+				/* Pages already merged by ksm need no scan. */
+				if (!PageKsm(page[0]))
+					cmp_and_merge_page(ksm_scan, page[0]);
+				put_page(page[0]);
+			}
+		}
+		scan_npages--;
+	}
+	/* Step past the last scanned page so the next call resumes after it. */
+	scan_get_next_index(ksm_scan, 1);
+out:
+	up_read(&slots_lock);
+	return ret;
+}
+
+/*
+ * File operations for the per-fd "ksm-sma" anonymous inode.
+ * const: the table is never modified after initialization, and
+ * anon_inode_getfd() takes a const file_operations pointer.
+ */
+static const struct file_operations ksm_sma_fops = {
+	.release        = ksm_sma_release,
+	.unlocked_ioctl = ksm_sma_ioctl,
+	.compat_ioctl   = ksm_sma_ioctl,
+};
+
+/*
+ * ksm_dev_ioctl_create_shared_memory_area - allocate a ksm_sma tracking
+ * object and wrap it in a new "ksm-sma" anonymous-inode fd; memory
+ * regions are registered against that fd later via its own ioctls.
+ *
+ * Returns the new fd on success, or a negative errno on failure.
+ */
+static int ksm_dev_ioctl_create_shared_memory_area(void)
+{
+	/* was -1, which userspace would see as -EPERM; report -ENOMEM */
+	int fd = -ENOMEM;
+	struct ksm_sma *ksm_sma;
+
+	ksm_sma = kmalloc(sizeof(struct ksm_sma), GFP_KERNEL);
+	if (!ksm_sma)
+		goto out;
+
+	INIT_LIST_HEAD(&ksm_sma->sma_slots);
+
+	fd = anon_inode_getfd("ksm-sma", &ksm_sma_fops, ksm_sma, 0);
+	if (fd < 0)
+		goto out_free;
+
+	return fd;
+out_free:
+	kfree(ksm_sma);
+out:
+	return fd;
+}
+
+/*
+ * ksm_dev_ioctl_start_stop_kthread - control the kernel thread scanning
+ * running speed.
+ * This function allows control over how long the kernel thread sleeps,
+ * how many pages it scans between sleeps, and how many pages at most it
+ * merges between sleeps.
+ *
+ * Returns 0 on success, -EPERM when asked to run with pages_to_scan == 0.
+ */
+static int ksm_dev_ioctl_start_stop_kthread(struct ksm_kthread_info *info)
+{
+	int ret = 0;
+
+	down_write(&kthread_lock);
+
+	/* Refuse to start a scanner that would never scan anything. */
+	if (info->flags & ksm_control_flags_run) {
+		if (!info->pages_to_scan) {
+			/* was "ret = EPERM": error codes must be negative */
+			ret = -EPERM;
+			up_write(&kthread_lock);
+			goto out;
+		}
+	}
+
+	kthread_sleep = info->sleep;
+	kthread_pages_to_scan = info->pages_to_scan;
+	ksmd_flags = info->flags;
+
+	up_write(&kthread_lock);
+
+	/*
+	 * Test the caller's requested flags instead of re-reading
+	 * ksmd_flags outside kthread_lock (the unlocked read raced with
+	 * concurrent updaters).
+	 */
+	if (info->flags & ksm_control_flags_run)
+		wake_up_interruptible(&kthread_wait);
+
+out:
+	return ret;
+}
+
+/*
+ * ksm_dev_ioctl_get_info_kthread - snapshot the scanner daemon settings
+ * (run flags, pages scanned per batch, sleep time) into @info, taken
+ * atomically under kthread_lock.
+ */
+static void ksm_dev_ioctl_get_info_kthread(struct ksm_kthread_info *info)
+{
+	down_read(&kthread_lock);
+
+	info->flags = ksmd_flags;
+	info->pages_to_scan = kthread_pages_to_scan;
+	info->sleep = kthread_sleep;
+
+	up_read(&kthread_lock);
+}
+
+/*
+ * ksm_dev_ioctl - ioctl dispatcher for the /dev/ksm control device.
+ * Returns a non-negative result per command, or a negative errno
+ * (-EINVAL for unknown commands, -EFAULT on bad user pointers).
+ */
+static long ksm_dev_ioctl(struct file *filp,
+			  unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	long ret = -EINVAL;
+
+	switch (ioctl) {
+	case KSM_GET_API_VERSION:
+		/* Plain value; no argument to copy. */
+		ret = KSM_API_VERSION;
+		break;
+	case KSM_CREATE_SHARED_MEMORY_AREA:
+		ret = ksm_dev_ioctl_create_shared_memory_area();
+		break;
+	case KSM_START_STOP_KTHREAD: {
+		struct ksm_kthread_info info;
+
+		ret = -EFAULT;
+		if (copy_from_user(&info, argp, sizeof(info)))
+			break;
+
+		ret = ksm_dev_ioctl_start_stop_kthread(&info);
+		break;
+	}
+	case KSM_GET_INFO_KTHREAD: {
+		struct ksm_kthread_info info;
+
+		ksm_dev_ioctl_get_info_kthread(&info);
+		ret = -EFAULT;
+		if (copy_to_user(argp, &info, sizeof(info)))
+			break;
+		ret = 0;
+		break;
+	}
+	default:
+		break;
+	}
+	return ret;
+}
+
+/*
+ * Operations for the /dev/ksm control device.
+ * const: never modified after initialization, per kernel convention
+ * for file_operations tables.
+ */
+static const struct file_operations ksm_chardev_ops = {
+	.unlocked_ioctl = ksm_dev_ioctl,
+	.compat_ioctl   = ksm_dev_ioctl,
+	.owner          = THIS_MODULE,
+};
+
+/*
+ * /dev/ksm misc device.  Designated initializers instead of positional
+ * ones: robust against struct miscdevice field reordering and clearer
+ * to read.
+ */
+static struct miscdevice ksm_dev = {
+	.minor = KSM_MINOR,
+	.name  = "ksm",
+	.fops  = &ksm_chardev_ops,
+};
+
+/*
+ * kthread_ksm_scan_thread - main loop of the kksmd daemon: while the
+ * run flag is set, scan a batch of pages then sleep kthread_sleep
+ * microseconds; otherwise block until the run flag is raised or the
+ * thread is asked to stop.
+ *
+ * Made static: only referenced by ksm_init() in this file (NOTE(review):
+ * confirm no header declares it for external use).
+ */
+static int kthread_ksm_scan_thread(void *nothing)
+{
+	while (!kthread_should_stop()) {
+		if (ksmd_flags & ksm_control_flags_run) {
+			down_read(&kthread_lock);
+			ksm_scan_start(&kthread_ksm_scan,
+				       kthread_pages_to_scan);
+			up_read(&kthread_lock);
+			schedule_timeout_interruptible(
+					usecs_to_jiffies(kthread_sleep));
+		} else {
+			wait_event_interruptible(kthread_wait,
+					ksmd_flags & ksm_control_flags_run ||
+					kthread_should_stop());
+		}
+	}
+	return 0;
+}
+
+/*
+ * ksm_init - module init: set up the slab caches and the reverse-mapping
+ * hash, start the scanner thread, then register the /dev/ksm control
+ * device.  Failure paths unwind in reverse order via the goto ladder.
+ */
+static int __init ksm_init(void)
+{
+	int r;
+
+	r = ksm_slab_init();
+	if (r)
+		goto out;
+
+	r = rmap_hash_init();
+	if (r)
+		goto out_free1;
+
+	/*
+	 * NOTE(review): the thread is started before the misc device is
+	 * registered; it presumably idles until userspace raises
+	 * ksm_control_flags_run -- confirm ksmd_flags starts cleared.
+	 */
+	kthread = kthread_run(kthread_ksm_scan_thread, NULL, "kksmd");
+	if (IS_ERR(kthread)) {
+		printk(KERN_ERR "ksm: creating kthread failed\n");
+		r = PTR_ERR(kthread);
+		goto out_free2;
+	}
+
+	r = misc_register(&ksm_dev);
+	if (r) {
+		printk(KERN_ERR "ksm: misc device register failed\n");
+		goto out_free3;
+	}
+
+	printk(KERN_WARNING "ksm loaded\n");
+	return 0;
+
+out_free3:
+	kthread_stop(kthread);
+out_free2:
+	rmap_hash_free();
+out_free1:
+	ksm_slab_free();
+out:
+	return r;
+}
+
+/*
+ * ksm_exit - module teardown: unregister the control device first so no
+ * new ioctls arrive, stop the scanner thread, then free the rmap hash
+ * and slab caches.
+ */
+static void __exit ksm_exit(void)
+{
+	misc_deregister(&ksm_dev);
+	/*
+	 * NOTE(review): raising the run flag before kthread_stop() looks
+	 * intended to kick the thread out of wait_event_interruptible();
+	 * kthread_stop() itself wakes the thread and the wait condition
+	 * already tests kthread_should_stop() -- confirm this assignment
+	 * is still needed.
+	 */
+	ksmd_flags = ksm_control_flags_run;
+	kthread_stop(kthread);
+	rmap_hash_free();
+	ksm_slab_free();
+}
+
+/*
+ * NOTE(review): kernel convention is to terminate these macro calls with
+ * semicolons; also confirm MODULE_LICENSE/MODULE_AUTHOR are declared
+ * earlier in the file (not visible in this chunk).
+ */
+module_init(ksm_init)
+module_exit(ksm_exit)