diff mbox series

[v2] ksm: allow dedup all tasks memory

Message ID 20181111212610.25213-1-timofey.titovets@synesis.ru (mailing list archive)
State New, archived
Headers show
Series [v2] ksm: allow dedup all tasks memory | expand

Commit Message

Timofey Titovets Nov. 11, 2018, 9:26 p.m. UTC
From: Timofey Titovets <nefelim4ag@gmail.com>

By default, ksm works only on memory that has been marked via
madvise().

The only ways to get it working for other applications are:
 - Use LD_PRELOAD and libraries
 - Patch kernel

Let's use the kernel task list in ksm_scan_thread and add logic that allows ksm
to import VMAs from tasks.
That behaviour is controlled by a new attribute: mode.
I tried to mimic the hugepages attribute, so mode has two states:
 - normal       - old default behaviour
 - always [new] - allow ksm to get tasks vma and try working on that.

To reduce CPU load and tasklist locking time,
ksm tries to import VMAs from only one task per loop.

So add a new attribute, "mode".
Two possible values:
 - normal [default] - ksm uses only madvise()d memory
 - always [new]     - ksm will search for VMAs across all processes' memory and
                      add them to the dedup list

v1 -> v2:
  - Rebase on v4.19.1

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
---
 Documentation/admin-guide/mm/ksm.rst |   7 ++
 mm/ksm.c                             | 149 ++++++++++++++++++++++-----
 2 files changed, 128 insertions(+), 28 deletions(-)

Comments

Matthew Wilcox Nov. 12, 2018, 3:58 a.m. UTC | #1
On Mon, Nov 12, 2018 at 12:26:10AM +0300, Timofey Titovets wrote:
> ksm by default working only on memory that added by
> madvice().
> 
> And only way get that work on other applications:
>  - Use LD_PRELOAD and libraries
>  - Patch kernel
> 
> Lets use kernel task list in ksm_scan_thread and add logic to allow ksm
> import VMA from tasks.
> That behaviour controlled by new attribute: mode
> I try mimic hugepages attribute, so mode have two states:
>  - normal       - old default behaviour
>  - always [new] - allow ksm to get tasks vma and try working on that.
> 
> To reduce CPU load & tasklist locking time,
> ksm try import VMAs from one task per loop.
> 
> So add new attribute "mode"
> Two passible values:
>  - normal [default] - ksm use only madvice
>  - always [new]     - ksm will search vma over all processes memory and
>                       add it to the dedup list

Do you have any numbers for how much difference this change makes with
various different workloads?
Timofey Titovets Nov. 12, 2018, 4:16 p.m. UTC | #2
пн, 12 нояб. 2018 г. в 6:58, Matthew Wilcox <willy@infradead.org>:
>
> On Mon, Nov 12, 2018 at 12:26:10AM +0300, Timofey Titovets wrote:
> > ksm by default working only on memory that added by
> > madvice().
> >
> > And only way get that work on other applications:
> >  - Use LD_PRELOAD and libraries
> >  - Patch kernel
> >
> > Lets use kernel task list in ksm_scan_thread and add logic to allow ksm
> > import VMA from tasks.
> > That behaviour controlled by new attribute: mode
> > I try mimic hugepages attribute, so mode have two states:
> >  - normal       - old default behaviour
> >  - always [new] - allow ksm to get tasks vma and try working on that.
> >
> > To reduce CPU load & tasklist locking time,
> > ksm try import VMAs from one task per loop.
> >
> > So add new attribute "mode"
> > Two passible values:
> >  - normal [default] - ksm use only madvice
> >  - always [new]     - ksm will search vma over all processes memory and
> >                       add it to the dedup list
>
> Do you have any numbers for how much difference this change makes with
> various different workloads?

Yep, i got some non KVM numbers,
Formulas:
 Percentage - (pages_sharing - pages_shared)/pages_unshared
 Memory saved - (pages_sharing - pages_shared)*4/1024 MiB

- My working laptop: 5% - ~100 MiB saved ~2GiB used
  Many different chrome based apps + KDE

- K8s test VM:  40% - ~160 MiB saved ~920MiB used
  With some small running docker images

- Ceph test VM: 20% - ~60MiB saved ~600MiB used
  With ceph mon, osd.

Development cluster servers:
- K8s server backend: 72%, ~5800 MiB saved ~35.7 GiB used
  (With backend apps: C, java, go & etc server apps)

- K8s server processing: 55%, ~2600 MiB saved ~28 GiB used
  (90% of load many instance of one CPU intensive application)

- Ceph node: 2%, ~190 MiB saved ~11.7 GiB used
  (OSD only)


So the numbers, as always, depend on the load.

Thanks!
- - -
P.S.
On recent kernels (4.19) I see a BUG_ON message saying that ksmd scheduled
while in a critical section/atomic context;
I'm not sure how to fix that properly.
(If I understand correctly, I could use preempt_disable(), but that
looks more like a hack than a fix.)

Any feedback is welcome.
diff mbox series

Patch

diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 9303786632d1..253f94a09be8 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -116,6 +116,13 @@  run
         Default: 0 (must be changed to 1 to activate KSM, except if
         CONFIG_SYSFS is disabled)
 
+mode
+        * set always to allow ksm to deduplicate the memory of every process
+        * set normal to deduplicate only madvised memory
+
+        Default: normal (deduplicate only madvised memory, as in
+        earlier releases)
+
 use_zero_pages
         specifies whether empty pages (i.e. allocated pages that only
         contain zeroes) should be treated specially.  When set to 1,
diff --git a/mm/ksm.c b/mm/ksm.c
index 1a088306ef81..5097d710c466 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -295,6 +295,10 @@  static int ksm_nr_node_ids = 1;
 static unsigned long ksm_run = KSM_RUN_STOP;
 static void wait_while_offlining(void);
 
+#define KSM_MODE_NORMAL 0
+#define KSM_MODE_ALWAYS	1
+static unsigned long ksm_mode = KSM_MODE_NORMAL;
+
 static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
 static DEFINE_MUTEX(ksm_thread_mutex);
 static DEFINE_SPINLOCK(ksm_mmlist_lock);
@@ -303,6 +307,11 @@  static DEFINE_SPINLOCK(ksm_mmlist_lock);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL)
 
+/* Nonzero when the ksm "mode" attribute is currently KSM_MODE_ALWAYS. */
+static inline int ksm_mode_always(void)
+{
+	return ksm_mode == KSM_MODE_ALWAYS;
+}
+
 static int __init ksm_slab_init(void)
 {
 	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -2386,17 +2395,94 @@  static void ksm_do_scan(unsigned int scan_npages)
 
 static int ksmd_should_run(void)
 {
-	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+	/* Nothing to do unless merging has been switched on. */
+	if (!(ksm_run & KSM_RUN_MERGE))
+		return 0;
+
+	/* In "always" mode ksmd runs even while the mm list is still empty. */
+	return ksm_mode_always() || !list_empty(&ksm_mm_head.mm_list);
+}
+
+
+/*
+ * Opt one mapping into KSM, given only its mm and its vm_flags word.
+ *
+ * Returns 0 both when VM_MERGEABLE was set and when the mapping was
+ * skipped because of its flags; returns the error from __ksm_enter()
+ * when that call fails.  The VMA itself is not available here, so no
+ * VMA-level checks (such as vma_is_dax()) are made by this helper.
+ */
+static int ksm_enter(struct mm_struct *mm, unsigned long *vm_flags)
+{
+	unsigned long skip_mask = VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
+				  VM_PFNMAP | VM_IO | VM_DONTEXPAND |
+				  VM_HUGETLB | VM_MIXEDMAP;
+	int ret;
+
+	/* Arch-specific flags that also rule a mapping out, where defined. */
+#ifdef VM_SAO
+	skip_mask |= VM_SAO;
+#endif
+#ifdef VM_SPARC_ADI
+	skip_mask |= VM_SPARC_ADI;
+#endif
+
+	if (*vm_flags & skip_mask)
+		return 0;
+
+	/* Call __ksm_enter() only while MMF_VM_MERGEABLE is not yet set. */
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+		ret = __ksm_enter(mm);
+		if (ret)
+			return ret;
+	}
+
+	*vm_flags |= VM_MERGEABLE;
+	return 0;
+}
+
+/*
+ * Offer every VMA of @task's address space to KSM.
+ *
+ * Takes a reference on the task's mm with get_task_mm() (tasks for which
+ * it returns NULL are skipped), walks the VMA list under mmap_sem held for
+ * writing and passes each VMA's flags to ksm_enter().  DAX mappings are
+ * skipped here because ksm_enter() only sees vm_flags.  The walk stops at
+ * the first ksm_enter() failure instead of retrying for every remaining
+ * VMA.
+ *
+ * down_write() and mmput() may sleep, so this must not be called from
+ * atomic context (e.g. with tasklist_lock held).
+ */
+static void ksm_import_task_vma(struct task_struct *task)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		return;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma_is_dax(vma))
+			continue;
+		if (ksm_enter(mm, &vma->vm_flags))
+			break;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
 }
 
 static int ksm_scan_thread(void *nothing)
 {
+	pid_t last_pid = 1;
+	pid_t curr_pid;
+	struct task_struct *task;
+
 	set_freezable();
 	set_user_nice(current, 5);
 
 	while (!kthread_should_stop()) {
 		mutex_lock(&ksm_thread_mutex);
 		wait_while_offlining();
+		if (ksm_mode_always()) {
+			/*
+			 * Import one task's VMAs per run.  tasklist_lock is
+			 * a spinning lock, so it only covers the lookup: the
+			 * task is pinned with a reference and the lock is
+			 * dropped before ksm_import_task_vma(), which takes
+			 * mmap_sem and may sleep.
+			 */
+			read_lock(&tasklist_lock);
+
+			/* Find the task that was handled on the previous run */
+			for_each_process(task) {
+				curr_pid = task_pid_nr(task);
+				if (curr_pid == last_pid)
+					break;
+			}
+
+			/* ... and move on to the one after it. */
+			task = next_task(task);
+			last_pid = task_pid_nr(task);
+			get_task_struct(task);
+			read_unlock(&tasklist_lock);
+
+			ksm_import_task_vma(task);
+			put_task_struct(task);
+		}
 		if (ksmd_should_run())
 			ksm_do_scan(ksm_thread_pages_to_scan);
 		mutex_unlock(&ksm_thread_mutex);
@@ -2422,33 +2508,9 @@  int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 
 	switch (advice) {
 	case MADV_MERGEABLE:
-		/*
-		 * Be somewhat over-protective for now!
-		 */
-		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
-				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_MIXEDMAP))
-			return 0;		/* just ignore the advice */
-
+		/* ksm_enter() only sees vm_flags, so DAX is rejected here. */
 		if (vma_is_dax(vma))
 			return 0;
 
-#ifdef VM_SAO
-		if (*vm_flags & VM_SAO)
-			return 0;
-#endif
-#ifdef VM_SPARC_ADI
-		if (*vm_flags & VM_SPARC_ADI)
-			return 0;
-#endif
-
-		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
-			err = __ksm_enter(mm);
-			if (err)
-				return err;
-		}
-
-		*vm_flags |= VM_MERGEABLE;
+		err = ksm_enter(mm, vm_flags);
+		if (err)
+			return err;
 		break;
 
 	case MADV_UNMERGEABLE:
@@ -2852,6 +2914,36 @@  static ssize_t pages_to_scan_store(struct kobject *kobj,
 }
 KSM_ATTR(pages_to_scan);
 
+/*
+ * sysfs show handler for the "mode" attribute: prints both mode names with
+ * the active one in square brackets.  Any value other than KSM_MODE_ALWAYS
+ * is reported as "normal".
+ */
+static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	if (ksm_mode == KSM_MODE_ALWAYS)
+		return sprintf(buf, "[always] normal\n");
+
+	return sprintf(buf, "always [normal]\n");
+}
+
+/*
+ * sysfs store handler for the "mode" attribute.
+ *
+ * Accepts exactly "always" or "normal" (sysfs_streq() tolerates one
+ * trailing newline).  Anything else, including an empty write, a mere
+ * prefix such as "a", or trailing garbage, is rejected with -EINVAL.
+ * Switching to "always" also wakes up ksm_thread_wait.
+ *
+ * Returns @count on success.
+ */
+static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr,
+			 const char *buf, size_t count)
+{
+	if (sysfs_streq(buf, "always")) {
+		ksm_mode = KSM_MODE_ALWAYS;
+		wake_up_interruptible(&ksm_thread_wait);
+	} else if (sysfs_streq(buf, "normal")) {
+		ksm_mode = KSM_MODE_NORMAL;
+	} else {
+		return -EINVAL;
+	}
+
+	return count;
+}
+KSM_ATTR(mode);
+
 static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
 			char *buf)
 {
@@ -3109,6 +3201,7 @@  KSM_ATTR_RO(full_scans);
 static struct attribute *ksm_attrs[] = {
 	&sleep_millisecs_attr.attr,
 	&pages_to_scan_attr.attr,
+	&mode_attr.attr,
 	&run_attr.attr,
 	&pages_shared_attr.attr,
 	&pages_sharing_attr.attr,