diff mbox

[4/4] NUMA: realize NUMA memory pinning

Message ID 1281534738-8310-5-git-send-email-andre.przywara@amd.com (mailing list archive)
State New, archived
Headers show

Commit Message

Andre Przywara Aug. 11, 2010, 1:52 p.m. UTC
None
diff mbox

Patch

diff --git a/hw/pc.c b/hw/pc.c
index 1b24409..dbfc082 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -42,6 +42,15 @@ 
 #include "device-assignment.h"
 #include "kvm.h"
 
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#include <numaif.h>
+#ifndef MPOL_F_RELATIVE_NODES
+  #define MPOL_F_RELATIVE_NODES (1 << 14)
+  #define MPOL_F_STATIC_NODES (1 << 15)
+#endif
+#endif
+
 /* output Bochs bios info messages */
 //#define DEBUG_BIOS
 
@@ -882,6 +891,53 @@  void pc_cpus_init(const char *cpu_model)
     }
 }
 
+/* Apply each guest node's host memory policy to its slice of guest RAM,
+ * located via qemu_get_ram_ptr(ram_addr); no-op without CONFIG_NUMA. */
+static void bind_numa(ram_addr_t ram_addr)
+{
+#ifdef CONFIG_NUMA
+    int i, bind_mode;
+    /* Bit "nodes" is the spare one (assumes dense host node ids). */
+    int nodes = numa_num_configured_nodes();
+    char *ram_ptr = qemu_get_ram_ptr(ram_addr);
+    ram_addr_t len, ram_offset = 0;
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        len = numa_info[i].guest_mem;
+        if (numa_info[i].flags != 0) {
+            switch (numa_info[i].flags & NODE_HOST_POLICY_MASK) {
+            case NODE_HOST_BIND:
+                bind_mode = MPOL_BIND;
+                break;
+            case NODE_HOST_INTERLEAVE:
+                bind_mode = MPOL_INTERLEAVE;
+                break;
+            case NODE_HOST_PREFERRED:
+                bind_mode = MPOL_PREFERRED;
+                break;
+            default:
+                bind_mode = MPOL_DEFAULT;
+                break;
+            }
+            bind_mode |= (numa_info[i].flags & NODE_HOST_RELATIVE) ?
+                MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES;
+
+            /* Workaround for a long standing bug in Linux' mbind, which
+             * cuts off the last specified node: pass one node more than
+             * exist and keep that extra bit (index "nodes") zero, so the
+             * call stays correct should the bug ever be fixed.
+             */
+            clear_bit(nodes, numa_info[i].host_mem);
+            if (mbind(ram_ptr + ram_offset, len, bind_mode,
+                      numa_info[i].host_mem, nodes + 1, 0)) {
+                perror("mbind");   /* best effort: report and continue */
+            }
+        }
+        ram_offset += len;
+    }
+#endif
+}
+
 void pc_memory_init(ram_addr_t ram_size,
                     const char *kernel_filename,
                     const char *kernel_cmdline,
@@ -919,6 +975,8 @@  void pc_memory_init(ram_addr_t ram_size,
     cpu_register_physical_memory(0x100000,
                  below_4g_mem_size - 0x100000,
                  ram_addr + 0x100000);
+    bind_numa(ram_addr);
+
 #if TARGET_PHYS_ADDR_BITS > 32
     cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
                                  ram_addr + below_4g_mem_size);