diff mbox

reserved-ram for pci-passthrough without VT-d capable hardware

Message ID 20090427174310.GH11953@random.random (mailing list archive)
State New, archived
Headers show

Commit Message

Andrea Arcangeli April 27, 2009, 5:43 p.m. UTC
Hello Pablo,

On Mon, Apr 27, 2009 at 11:00:51AM -0600, Passera, Pablo R wrote:
> Andrea,
>         We are working with embedded hardware that does not have
>VT-d and we need 1-1 mapping. I wonder which is the status of this
>patch. Have you continued updating it with the latest KVM version?

Sorry to say but it isn't updated to latest KVM and latest
mainline. Porting normally should be easy. I attached last versions.

> Since you mentioned this ;), I take opportunity to add that those
> embedded usages are the ones that are totally fine with the compile
> time passthrough-guest-ram decision, instead of a boot time
> decision. Those host kernels will likely have RT patches (KVM works
> great with preempt-RT indeed) and in turn the compile time ram
> selection is the least of their problems as you can imagine ;). So you
> can see my patch as an embedded-build option, similar to "Configure
> standard kernel features (for small systems)" and no distro is
> shipping new kernels with that feature on either.
> 
> Then if we decide 1:1 should have a larger userbase instead of only the
> people that know what they're doing (i.e. 1:1 guest can destroy
> linux-hypervisor) we can always add a bit of strtol parsing to 16bit
> kernelloader.

Agreed!
From: Andrea Arcangeli <aarcange@redhat.com>

The "reserved RAM" can be mapped by virtualization software with
/dev/mem to create a 1:1 mapping between guest physical (bus) address
and host physical (bus) address. This will allow pci passthrough with
DMA for the guest using the ram with the 1:1 mapping. The only detail
to take care of is the ram marked "reserved RAM failed". The
virtualization software must create for the guest an e820 map that
only includes the "reserved RAM" regions but if the guest touches
memory with guest physical address in the "reserved RAM failed" ranges
(linux guest will do that even if the ram isn't present in the e820
map), it should provide that as ram and map it with a non linear
mapping. This should allow any linux kernel to run fine and hopefully
any other OS too.

svm ~ # cat /proc/iomem |head -n 20
00000000-00000fff : reserved RAM failed
00001000-00005fff : reserved RAM
00006000-00007fff : reserved RAM failed
00008000-0009efff : reserved RAM
0009f000-0009ffff : reserved
000cd600-000cffff : pnp 00:0d
000f0000-000fffff : reserved
00100000-0fffffff : reserved RAM
10000000-3dedffff : System RAM
  10000000-10329ab2 : Kernel code
  10329ab3-104933e7 : Kernel data
  104f5000-10558e67 : Kernel bss
3dee0000-3dee2fff : ACPI Non-volatile Storage
3dee3000-3deeffff : ACPI Tables
3def0000-3defffff : reserved
3dff0000-3ffeffff : pnp 00:0d
e0000000-efffffff : reserved
fa000000-fbffffff : PCI Bus #01
  fa000000-fbffffff : 0000:01:05.0
fda00000-fdbfffff : PCI Bus #01
svm ~ # hexdump /dev/mem | grep -C2 'cccc cccc cccc cccc'
00007e0 0000 0000 0000 0000 0000 0000 0000 0000
*
0001000 cccc cccc cccc cccc cccc cccc cccc cccc
*
0006000 a5a5 a5a5 8ec8 8ed8 8ec0 66d0 06c7 0000
--
*
0007ff0 0000 0000 0000 0000 3063 1000 0000 0000
0008000 cccc cccc cccc cccc cccc cccc cccc cccc
*
009f000 0002 0000 0000 0000 0000 0000 0000 0000
--
00fffe0 6000 3c03 45e7 0184 0500 0082 01c0 0223
00ffff0 5bea 00e0 31f0 2f32 3931 302f 0037 12fc
0100000 cccc cccc cccc cccc cccc cccc cccc cccc
*
10000000 8d48 f92d ffff 48ff ed81 0000 1000 8948
^C
svm ~ #

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---

This is a port to current linux-2.6.git of the previous reserved-ram
patch. Let me know if there's a chance to get this acked and
included. Anything that isn't at compile time would require much
bigger changes just to parse the command line at 16bit realmode time
to know where to relocate the kernel dynamically. Because 1:1 is a
corner case feature required only by some users, this is the least
intrusive approach. This also has some limits as it can't reserve more
than 1g, and with a few more changes 2g but this is ok for a long time
as the virtualized 1:1 guest doesn't need to be huge, just a desktop.


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
From: Ben-Ami Yassour <benami@il.ibm.com>

In some cases it is not enough to identify mmio memory slots by
pfn_valid alone. This patch adds a PageReserved check as well.

Signed-off-by: Ben-Ami Yassour <benami@il.ibm.com>
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com>
---
 virt/kvm/kvm_main.c |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f9427e2..27b2eff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -76,6 +76,14 @@ static inline int valid_vcpu(int n)
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
+static inline int is_mmio_pfn(pfn_t pfn)
+{
+	if (pfn_valid(pfn))
+		return PageReserved(pfn_to_page(pfn));
+
+	return true;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -582,7 +590,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-		BUG_ON(pfn_valid(pfn));
+		BUG_ON(!is_mmio_pfn(pfn));
 	} else
 		pfn = page_to_pfn(page[0]);
 
@@ -596,10 +604,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		return pfn_to_page(pfn);
 
-	WARN_ON(!pfn_valid(pfn));
+	WARN_ON(is_mmio_pfn(pfn));
 
 	get_page(bad_page);
 	return bad_page;
@@ -615,7 +623,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -641,7 +649,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (pfn_valid(pfn)) {
+	if (!is_mmio_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@@ -651,14 +659,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	if (!is_mmio_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);
kvm_main.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: virt/kvm/kvm_main.c
--- virt/kvm/kvm_main.c.orig	2008-06-25 02:39:51.000000000 +0200
+++ a/virt/kvm/kvm_main.c	2008-06-25 02:40:35.000000000 +0200
@@ -604,10 +604,9 @@ struct page *gfn_to_page(struct kvm *kvm
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (!is_mmio_pfn(pfn))
+	if (pfn_valid(pfn))
 		return pfn_to_page(pfn);
-
-	WARN_ON(is_mmio_pfn(pfn));
+	WARN_ON(1);
 
 	get_page(bad_page);
 	return bad_page;
diff --git a/bios/rombios.c b/bios/rombios.c
index 318de57..f93a6c6 100644
--- a/bios/rombios.c
+++ b/bios/rombios.c
@@ -4251,6 +4251,7 @@ int15_function32(regs, ES, DS, FLAGS)
   Bit32u  extra_lowbits_memory_size=0;
   Bit16u  CX,DX;
   Bit8u   extra_highbits_memory_size=0;
+  Bit32u  below_640_end;
 
 BX_DEBUG_INT15("int15 AX=%04x\n",regs.u.r16.ax);
 
@@ -4305,6 +4306,11 @@ ASM_END
          case 0x20: // coded by osmaker aka K.J.
             if(regs.u.r32.edx == 0x534D4150)
             {
+                below_640_end = inb_cmos(0x16);
+                below_640_end <<= 8;
+                below_640_end |= inb_cmos(0x15);
+                below_640_end *= 1024;
+
                 extended_memory_size = inb_cmos(0x35);
                 extended_memory_size <<= 8;
                 extended_memory_size |= inb_cmos(0x34);
@@ -4334,7 +4340,7 @@ ASM_END
                 {
                     case 0:
                         set_e820_range(ES, regs.u.r16.di,
-                                       0x0000000L, 0x0009fc00L, 0, 0, 1);
+                                       0x0000000L, below_640_end, 0, 0, 1);
                         regs.u.r32.ebx = 1;
                         regs.u.r32.eax = 0x534D4150;
                         regs.u.r32.ecx = 0x14;
@@ -4343,7 +4349,7 @@ ASM_END
                         break;
                     case 1:
                         set_e820_range(ES, regs.u.r16.di,
-                                       0x0009fc00L, 0x000a0000L, 0, 0, 2);
+                                       below_640_end, 0x000a0000L, 0, 0, 2);
                         regs.u.r32.ebx = 2;
                         regs.u.r32.eax = 0x534D4150;
                         regs.u.r32.ecx = 0x14;
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 3a8269b..6da67ff 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -235,6 +235,8 @@ static void cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
 
     /* memory size */
     val = 640; /* base memory in K */
+    if (reserved_ram)
+	    val = reserved[1] / 1024;
     rtc_set_memory(s, 0x15, val);
     rtc_set_memory(s, 0x16, val >> 8);
 
diff --git a/qemu/sysemu.h b/qemu/sysemu.h
index 1469e90..344aa54 100644
--- a/qemu/sysemu.h
+++ b/qemu/sysemu.h
@@ -99,6 +99,8 @@ extern int autostart;
 extern int old_param;
 extern int hpagesize;
 extern const char *bootp_filename;
+extern int reserved_ram;
+extern int64_t reserved[4];
 
 
 #ifdef USE_KQEMU
diff --git a/qemu/vl.c b/qemu/vl.c
index e1762ee..81c9bf8 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -237,6 +237,8 @@ int time_drift_fix = 0;
 unsigned int kvm_shadow_memory = 0;
 const char *mem_path = NULL;
 int hpagesize = 0;
+int reserved_ram = 0;
+int64_t reserved[4];
 const char *cpu_vendor_string;
 #ifdef TARGET_ARM
 int old_param = 0;
@@ -8098,6 +8100,7 @@ static void help(int exitcode)
            "-startdate      select initial date of the clock\n"
            "-icount [N|auto]\n"
            "                Enable virtual instruction counter with 2^N clock ticks per instruction\n"
+           "-reserved-ram   use reserved RAM in /proc/iomem with spte identity mapping\n"
            "\n"
            "During emulation, the following keys are useful:\n"
            "ctrl-alt-f      toggle full screen\n"
@@ -8211,6 +8214,7 @@ enum {
     QEMU_OPTION_tdf,
     QEMU_OPTION_kvm_shadow_memory,
     QEMU_OPTION_mempath,
+    QEMU_OPTION_reserved_ram,
 };
 
 typedef struct QEMUOption {
@@ -8336,6 +8340,7 @@ const QEMUOption qemu_options[] = {
     { "tb-size", HAS_ARG, QEMU_OPTION_tb_size },
     { "icount", HAS_ARG, QEMU_OPTION_icount },
     { "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+    { "reserved-ram", 0, QEMU_OPTION_reserved_ram },
     { NULL },
 };
 
@@ -8563,6 +8568,77 @@ static int gethugepagesize(void)
     return hugepagesize;
 }
 
+static int find_reserved_ram(int64_t *_start, int64_t *_end,
+			     unsigned long below, unsigned long above,
+			     unsigned long min_size)
+{
+    int ret, fd;
+    char buf[4096];
+    char *needle = "reserved RAM\n";
+    char *size, *curr;
+    int64_t start, end;
+
+    fd = open("/proc/iomem", O_RDONLY);
+    if (fd < 0) {
+	perror("open");
+	exit(0);
+    }
+
+    ret = read(fd, buf, sizeof(buf)-1);
+    if (ret < 0) {
+	perror("read");
+	exit(0);
+    }
+    buf[ret] = 0;
+
+    size = buf;
+    while (1) {
+	    size = strstr(size, needle);
+	    if (!size)
+		    return 0;
+	    size += strlen(needle);
+	    curr = size - strlen(needle) - 20;
+	    start = strtoll(curr, &curr, 16);
+	    end = strtoll(curr+1, NULL, 16);
+	    if ((!above || start >= above) && (!below || end <= below) &&
+		(!min_size || end-start >= min_size)) {
+		    *_start = start;
+		    *_end = end+1;
+		    return 1;
+	    }
+    }
+}
+
+static void init_reserved_ram(void)
+{
+	if (find_reserved_ram(&reserved[0], &reserved[1],
+			      640*1024, 0, 500*1024) &&
+	    find_reserved_ram(&reserved[2], &reserved[3],
+			      0, 1024*1024, 1024*1024)) {
+		reserved_ram = 1;
+		if (reserved[0] != 4096) {
+			fprintf(stderr,
+				"strange host ram layout\n");
+			exit(1);
+		}
+		if (reserved[2] != 1024*1024) {
+			fprintf(stderr,
+				"strange host ram layout\n");
+			exit(1);
+		}
+		if (reserved[3] < ram_size) {
+			fprintf(stderr,
+				"not enough host reserved ram, decrease -m\n");
+			exit(1);
+		}
+		reserved[1] &= TARGET_PAGE_MASK;
+	} else {
+		fprintf(stderr,
+			"host reserved ram not found\n");
+		exit(1);
+	}
+}
+
 void *alloc_mem_area(unsigned long memory, const char *path)
 {
     char *filename;
@@ -8609,10 +8685,43 @@ void *qemu_alloc_physram(unsigned long memory)
 {
     void *area = NULL;
 
-    if (mem_path)
+    if (!area && mem_path)
 	area = alloc_mem_area(memory, mem_path);
-    if (!area)
+    if (!area) {
 	area = qemu_vmalloc(memory);
+	if (reserved_ram) {
+	    int fd;
+	    if (memory < reserved[2]) {
+		printf("memory < reserved[2]\n");
+		return NULL;
+	    }
+	    fd = open("/dev/mem", O_RDWR);
+	    if (fd < 0) {
+		perror("reserved_ram requires access to /dev/mem");
+		return NULL;
+	    }
+	    if (mmap((char *)area+reserved[0],
+		reserved[1]-reserved[0],
+		     PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED,
+		     fd, 0) == MAP_FAILED) {
+		    perror("reserved_ram mmap failed on /dev/mem");
+		    return NULL;
+	    }
+	    bzero((char *)area+reserved[0], reserved[1]-reserved[0]);
+	    if (mmap((char *)area+reserved[2],
+		     ram_size-reserved[2],
+		     PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FIXED,
+		     fd, reserved[2]) == MAP_FAILED) {
+		    perror("reserved_ram mmap failed on /dev/mem");
+		    return NULL;
+	    }
+	    bzero((char *)area+reserved[2], ram_size-reserved[2]);
+	    if (close(fd) < 0) {
+		    perror("/dev/mem");
+		    return NULL;
+	    }
+	}
+    }
 
     return area;
 }
@@ -9221,6 +9330,9 @@ int main(int argc, char **argv)
             case QEMU_OPTION_mempath:
 		mem_path = optarg;
 		break;
+            case QEMU_OPTION_reserved_ram:
+		init_reserved_ram();
+		break;
             case QEMU_OPTION_name:
                 qemu_name = optarg;
                 break;

Comments

Passera, Pablo R April 28, 2009, 1:35 p.m. UTC | #1
Hello Andrea,

I have applied the patch to kvm and userland, but when I tried to port the host kernel patch I noticed that the changes were made to the file e820.c. However, in kernel 2.6.26 there are two e820 files, e820_32.c and e820_64.c, and most of the changes map onto the e820_64.c file. So, I have a couple of questions, if you don't mind:

- Against which kernel version was this patch generated?
- Did you try this on a 32 or 64 bits system?

Thanks,
Pablo

>-----Original Message-----
>From: Andrea Arcangeli [mailto:aarcange@redhat.com]
>Sent: Monday, April 27, 2009 2:43 PM
>To: Passera, Pablo R
>Cc: kvm@vger.kernel.org
>Subject: Re: [PATCH] reserved-ram for pci-passthrough without VT-d
>capable hardware
>
>Hello Pablo,
>
>On Mon, Apr 27, 2009 at 11:00:51AM -0600, Passera, Pablo R wrote:
>> Andrea,
>>         We are working with embedded hardware that does not have
>>VT-d and we need 1-1 mapping. I wonder which is the status of this
>>patch. Have you continued updating it with the latest KVM version?
>
>Sorry to say but it isn't updated to latest KVM and latest
>mainline. Porting normally should be easy. I attached last versions.
>
>> Since you mentioned this ;), I take opportunity to add that those
>> embedded usages are the ones that are totally fine with the compile
>> time passthrough-guest-ram decision, instead of a boot time
>> decision. Those host kernels will likely have RT patches (KVM works
>> great with preempt-RT indeed) and in turn the compile time ram
>> selection is the least of their problems as you can imagine ;). So you
>> can see my patch as an embedded-build option, similar to "Configure
>> standard kernel features (for small systems)" and no distro is
>> shipping new kernels with that feature on either.
>>
>> Then if we decide 1:1 should have a larger userbase instead of only the
>> people that know what they're doing (i.e. 1:1 guest can destroy
>> linux-hypervisor) we can always add a bit of strtol parsing to 16bit
>> kernelloader.
>
>Agreed!
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrea Arcangeli April 28, 2009, 6:06 p.m. UTC | #2
On Tue, Apr 28, 2009 at 07:35:26AM -0600, Passera, Pablo R wrote:
> - Against which kernel version was this patch generated?

I don't remember exactly (I was just using an upstream hg checkout and
I didn't record its hash value) but I think you can go back to when
e820.c was still shared and it'll likely apply and work.

> - Did you try this on a 32 or 64 bits system?

I only tested it on 64bit but there's no reason why it shouldn't work
on 32bit too.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Passera, Pablo R May 5, 2009, 7:53 p.m. UTC | #3
Andrea,
        Thanks for your answers. I already patched the kernel and kvm (including rombios). The host boots up and the memory mapping is as explained in the patch. Now I am trying to launch a VM using the memory mapping, but it hangs after opening the SDL window and before showing the BIOS messages. I am running the qemu command from a console on the host, which is running X, and the command line is the following:

qemu-system-x86_64 -hda ./dm.img -cdrom /dev/sr0 -m 32 -reserved-ram -boot d

- Is this command line correct?
- Should I run the VM without having started X on the host machine?
- What should I see after starting the vm? Should the vm take ownership of the video card?

Thanks,
Pablo

>-----Original Message-----
>From: Andrea Arcangeli [mailto:aarcange@redhat.com]
>Sent: Tuesday, April 28, 2009 3:06 PM
>To: Passera, Pablo R
>Cc: kvm@vger.kernel.org
>Subject: Re: [PATCH] reserved-ram for pci-passthrough without VT-d
>capable hardware
>
>On Tue, Apr 28, 2009 at 07:35:26AM -0600, Passera, Pablo R wrote:
>> - Against which kernel version was this patch generated?
>
>I don't remember exactly (I was just using an upstream hg checkout and
>I didn't record its hash value) but I think you can go back to when
>e820.c was still shared and it'll likely apply and work.
>
>> - Did you try this on a 32 or 64 bits system?
>
>I only tested it on 64bit but there's no reason why it shouldn't work
>on 32bit too.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1276,8 +1276,36 @@  config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config RESERVE_PHYSICAL_START
+	bool "Reserve all RAM below PHYSICAL_START (EXPERIMENTAL)"
+	depends on !RELOCATABLE && X86_64
+	help
+	  This makes the kernel use only RAM above __PHYSICAL_START.
+	  All memory below __PHYSICAL_START will be left unused and
+	  marked as "reserved RAM" in /proc/iomem. The few special
+	  pages that can't be relocated at addresses above
+	  __PHYSICAL_START and that can't be guaranteed to be unused
+	  by the running kernel will be marked "reserved RAM failed"
+	  in /proc/iomem. Those may or may not be used by the kernel
+	  (for example SMP trampoline pages would only be used if
+	  CPU hotplug is enabled).
+
+	  The "reserved RAM" can be mapped by virtualization software
+	  with /dev/mem to create a 1:1 mapping between guest physical
+	  (bus) address and host physical (bus) address. This will
+	  allow PCI passthrough with DMA for the guest using the RAM
+	  with the 1:1 mapping. The only detail to take care of is the
+	  RAM marked "reserved RAM failed". The virtualization
+	  software must create for the guest an e820 map that only
+	  includes the "reserved RAM" regions but if the guest touches
+	  memory with guest physical address in the "reserved RAM
+	  failed" ranges (Linux guest will do that even if the RAM
+	  isn't present in the e820 map), it should provide that as
+	  RAM and map it with a non-linear mapping. This should allow
+	  any Linux kernel to run fine and hopefully any other OS too.
+
 config PHYSICAL_START
-	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP || RESERVE_PHYSICAL_START)
 	default "0x1000000" if X86_NUMAQ
 	default "0x200000" if X86_64
 	default "0x100000"
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,14 @@  void __init e820_print_map(char *who)
 		case E820_NVS:
 			printk(KERN_CONT "(ACPI NVS)\n");
 			break;
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+		case E820_RESERVED_RAM:
+			printk(KERN_CONT "(reserved RAM)\n");
+			break;
+		case E820_RESERVED_RAM_FAILED:
+			printk(KERN_CONT "(reserved RAM failed)\n");
+			break;
+#endif
 		default:
 			printk(KERN_CONT "type %u\n", e820.map[i].type);
 			break;
@@ -384,10 +392,28 @@  static int __init __append_e820_map(stru
 		u64 end = start + size;
 		u32 type = biosmap->type;
 
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+		/* make space for two more low-prio types */
+		type += 2;
+#endif
+
 		/* Overflow in 64 bits? Ignore the memory map. */
 		if (start > end)
 			return -1;
 
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+		if (type == E820_RAM) {
+			if (end <= __PHYSICAL_START)
+				type = E820_RESERVED_RAM;
+			else if (start < __PHYSICAL_START) {
+				e820_add_region(start,
+						__PHYSICAL_START-start,
+						E820_RESERVED_RAM);
+				size -= __PHYSICAL_START-start;
+				start = __PHYSICAL_START;
+			}
+		}
+#endif
 		e820_add_region(start, size, type);
 
 		biosmap++;
@@ -893,7 +919,35 @@  void __init early_res_to_bootmem(u64 sta
 			final_start, final_end);
 		reserve_bootmem_generic(final_start, final_end - final_start,
 				BOOTMEM_DEFAULT);
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+		if (r->start < __PHYSICAL_START)
+			e820_add_region(r->start, r->end - r->start,
+					E820_RESERVED_RAM_FAILED);
+#endif			
 	}
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+	/* solve E820_RESERVED_RAM vs E820_RESERVED_RAM_FAILED conflicts */
+	update_e820();
+
+	/* now reserve E820_RESERVED_RAM */
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->type != E820_RESERVED_RAM)
+			continue;
+		final_start = max(start, (u64) ei->addr);
+		final_end = min(end, (u64) (ei->addr + ei->size));
+		if (final_start >= final_end)
+			continue;
+		if (reserve_bootmem_generic(final_start,
+					    final_end - final_start,
+					    BOOTMEM_DEFAULT))
+			printk(KERN_ERR "reserved physical start failure");
+		else
+			printk(KERN_INFO " bootmem reserved RAM: [%Lx-%Lx]\n",
+			       final_start, final_end - 1);
+	}
+#endif
 }
 
 /* Check for already reserved areas */
@@ -1095,6 +1149,17 @@  unsigned long __init e820_end_of_low_ram
 {
 	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
 }
+
+static int __init e820_is_not_ram(int type)
+{
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+	return type != E820_RAM && type != E820_RESERVED_RAM &&
+		type != E820_RESERVED_RAM_FAILED;
+#else
+	return type != E820_RAM;
+#endif	
+}
+
 /*
  * Finds an active region in the address range from start_pfn to last_pfn and
  * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
@@ -1115,8 +1180,8 @@  int __init e820_find_active_region(const
 		return 0;
 
 	/* Skip if map is outside the node */
-	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
-				    *ei_startpfn >= last_pfn)
+	if (e820_is_not_ram(ei->type) || *ei_endpfn <= start_pfn ||
+	    *ei_startpfn >= last_pfn)
 		return 0;
 
 	/* Check for overlaps */
@@ -1260,6 +1325,10 @@  static inline const char *e820_type_to_s
 	case E820_RAM:	return "System RAM";
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+	case E820_RESERVED_RAM_FAILED: return "reserved RAM failed";
+	case E820_RESERVED_RAM: return "reserved RAM";
+#endif
 	default:	return "reserved";
 	}
 }
@@ -1289,6 +1358,12 @@  void __init e820_reserve_resources(void)
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
 		res++;
+
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+		if (i == E820_RESERVED_RAM)
+			memset(__va(e820.map[i].addr),
+			       POISON_FREE_INITMEM, e820.map[i].size);
+#endif
 	}
 
 	for (i = 0; i < e820_saved.nr_map; i++) {
diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h
--- a/include/asm-x86/e820.h
+++ b/include/asm-x86/e820.h
@@ -39,10 +39,19 @@ 
 
 #define E820NR	0x1e8		/* # entries in E820MAP */
 
+#ifdef CONFIG_RESERVE_PHYSICAL_START
+#define E820_RESERVED_RAM 1
+#define E820_RESERVED_RAM_FAILED 2
+#define E820_RAM	3
+#define E820_RESERVED	4
+#define E820_ACPI	5
+#define E820_NVS	6
+#else
 #define E820_RAM	1
 #define E820_RESERVED	2
 #define E820_ACPI	3
 #define E820_NVS	4
+#endif
 
 /* reserved RAM used by kernel itself */
 #define E820_RESERVED_KERN        128
diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h
--- a/include/asm-x86/page_64.h
+++ b/include/asm-x86/page_64.h
@@ -35,6 +35,7 @@ 
 #define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
 
 #define __PHYSICAL_START	CONFIG_PHYSICAL_START
+#define __PHYSICAL_OFFSET	(__PHYSICAL_START-0x200000)
 #define __KERNEL_ALIGN		0x200000
 
 /*
@@ -57,7 +58,7 @@ 
  * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
  * arch/x86/kernel/head_64.S), and it is mapped here:
  */
-#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
+#define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024 + __PHYSICAL_OFFSET)
 #define KERNEL_IMAGE_START	_AC(0xffffffff80000000, UL)
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -150,7 +150,7 @@  static inline void native_pgd_clear(pgd_
 #define VMALLOC_START    _AC(0xffffc20000000000, UL)
 #define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
 #define VMEMMAP_START	 _AC(0xffffe20000000000, UL)
-#define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
+#define MODULES_VADDR    (0xffffffffa0000000UL+__PHYSICAL_OFFSET)
 #define MODULES_END      _AC(0xfffffffffff00000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
diff --git a/include/asm-x86/trampoline.h b/include/asm-x86/trampoline.h
--- a/include/asm-x86/trampoline.h
+++ b/include/asm-x86/trampoline.h
@@ -13,7 +13,11 @@  extern unsigned long init_rsp;
 extern unsigned long init_rsp;
 extern unsigned long initial_code;
 
+#ifndef CONFIG_RESERVE_PHYSICAL_START
 #define TRAMPOLINE_BASE 0x6000
+#else
+#define TRAMPOLINE_BASE 0x90000 /* move it next to 640k */
+#endif
 extern unsigned long setup_trampoline(void);
 
 #endif /* __ASSEMBLY__ */