
[6/7] arm64/kexec: Add core kexec support

Message ID 471af24b7de659a30a5941e793634bffec380cb9.1411604443.git.geoff@infradead.org (mailing list archive)
State New, archived

Commit Message

Geoff Levand Sept. 25, 2014, 12:23 a.m. UTC
Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to the
arm64 architecture that add support for the kexec re-boot mechanism
(CONFIG_KEXEC) on arm64 platforms.

Signed-off-by: Geoff Levand <geoff@infradead.org>
---
 arch/arm64/Kconfig                  |   9 ++
 arch/arm64/include/asm/kexec.h      |  47 +++++++++
 arch/arm64/kernel/Makefile          |   1 +
 arch/arm64/kernel/machine_kexec.c   | 183 ++++++++++++++++++++++++++++++++++++
 arch/arm64/kernel/relocate_kernel.S | 183 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kexec.h          |   1 +
 6 files changed, 424 insertions(+)
 create mode 100644 arch/arm64/include/asm/kexec.h
 create mode 100644 arch/arm64/kernel/machine_kexec.c
 create mode 100644 arch/arm64/kernel/relocate_kernel.S

Comments

Vivek Goyal Sept. 25, 2014, 6:28 p.m. UTC | #1
On Thu, Sep 25, 2014 at 12:23:27AM +0000, Geoff Levand wrote:
[..]
> +void machine_kexec(struct kimage *image)
> +{
> +	phys_addr_t reboot_code_buffer_phys;
> +	void *reboot_code_buffer;
> +
> +	BUG_ON(num_online_cpus() > 1);
> +
> +	kexec_kimage_head = image->head;
> +
> +	reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> +	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> +
> +	/*
> +	 * Copy relocate_new_kernel to the reboot_code_buffer for use
> +	 * after the kernel is shut down.
> +	 */
> +
> +	memcpy(reboot_code_buffer, relocate_new_kernel,
> +		relocate_new_kernel_size);
> +
> +	/* Flush the reboot_code_buffer in preparation for its execution. */
> +
> +	__flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
> +
> +	/* Flush the kimage list. */
> +
> +	kexec_list_walk(NULL, image->head, kexec_list_flush_cb);
> +
> +	pr_info("Bye!\n");
> +
> +	/* Disable all DAIF exceptions. */
> +	
> +	asm volatile ("msr daifset, #0xf" : : : "memory");
> +
> +	soft_restart(reboot_code_buffer_phys);

So what is soft_restart() functionality in arm64?

Looks like it switches to identity-mapped page tables, and that seems
to be the reason you are not preparing identity-mapped page tables in
the kexec code. I am wondering how you make sure that, once kexec is
swapping pages (putting the new kernel's pages at their destination),
these identity-mapped pages will not be overwritten?

I am assuming that you are jumping to purgatory with paging enabled
and the whole of memory identity mapped.

I am also curious to know what different entry points the arm64
kernel image supports and which one you are using by default.

Thanks
Vivek
Geoff Levand Sept. 25, 2014, 7:02 p.m. UTC | #2
Hi Vivek,

On Thu, 2014-09-25 at 14:28 -0400, Vivek Goyal wrote:
> On Thu, Sep 25, 2014 at 12:23:27AM +0000, Geoff Levand wrote:
> [..]
> > +void machine_kexec(struct kimage *image)
> > +{
> > +	phys_addr_t reboot_code_buffer_phys;
> > +	void *reboot_code_buffer;
> > +
> > +	BUG_ON(num_online_cpus() > 1);
> > +
> > +	kexec_kimage_head = image->head;
> > +
> > +	reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> > +	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> > +
> > +	/*
> > +	 * Copy relocate_new_kernel to the reboot_code_buffer for use
> > +	 * after the kernel is shut down.
> > +	 */
> > +
> > +	memcpy(reboot_code_buffer, relocate_new_kernel,
> > +		relocate_new_kernel_size);
> > +
> > +	/* Flush the reboot_code_buffer in preparation for its execution. */
> > +
> > +	__flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
> > +
> > +	/* Flush the kimage list. */
> > +
> > +	kexec_list_walk(NULL, image->head, kexec_list_flush_cb);
> > +
> > +	pr_info("Bye!\n");
> > +
> > +	/* Disable all DAIF exceptions. */
> > +	
> > +	asm volatile ("msr daifset, #0xf" : : : "memory");
> > +
> > +	soft_restart(reboot_code_buffer_phys);
> 
> So what is soft_restart() functionality in arm64?

soft_restart() basically turns off the MMU and data caches, then jumps
to the address passed to it, reboot_code_buffer_phys here.
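For reference, the arm64 soft_restart() of that era is roughly the
following (a simplified sketch from memory of arch/arm64/kernel/process.c;
the exact code differs between kernel versions):

void soft_restart(unsigned long addr)
{
	/* Install the identity (1:1) mapping needed to turn the MMU off. */
	setup_mm_for_reboot();

	/*
	 * cpu_reset() disables the MMU and dcache and branches to addr; it
	 * is entered via its physical address so that execution can
	 * continue once translation is off.
	 */
	cpu_soft_restart(virt_to_phys(cpu_reset), addr);

	/* Should never get here. */
	BUG();
}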
 
> Looks like it switches to identity mapped page tables and that seems
> to be the reason that you are not preparing identity mapped page
> tables in kexec code. I am wondering I how do you make sure that once
> kexec is swapping pages (putting new kernel's pages to its destination)
> at that time these identity page will not be overwritten?
> 
> I am assuming that you are jumping to purgatory with paging enabled
> and whole of the memory identity mapped.

The identity map is just used to turn off the MMU.  soft_restart() is in
that identity mapping, and once it shuts off the MMU it jumps to the
physical address of relocate_kernel, which uses physical addressing to
do the copy.

> I am also curious to know what are different entry points arm64
> kernel image supports and which one are you using by default.

The arm64 kernel has a single entry point, the start of the image.  See
Documentation/arm64/booting.txt.

-Geoff
Vivek Goyal Sept. 25, 2014, 7:08 p.m. UTC | #3
On Thu, Sep 25, 2014 at 12:02:51PM -0700, Geoff Levand wrote:
> Hi Vivek,
> 
> On Thu, 2014-09-25 at 14:28 -0400, Vivek Goyal wrote:
> > On Thu, Sep 25, 2014 at 12:23:27AM +0000, Geoff Levand wrote:
> > [..]
> > > +void machine_kexec(struct kimage *image)
> > > +{
> > > +	phys_addr_t reboot_code_buffer_phys;
> > > +	void *reboot_code_buffer;
> > > +
> > > +	BUG_ON(num_online_cpus() > 1);
> > > +
> > > +	kexec_kimage_head = image->head;
> > > +
> > > +	reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> > > +	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> > > +
> > > +	/*
> > > +	 * Copy relocate_new_kernel to the reboot_code_buffer for use
> > > +	 * after the kernel is shut down.
> > > +	 */
> > > +
> > > +	memcpy(reboot_code_buffer, relocate_new_kernel,
> > > +		relocate_new_kernel_size);
> > > +
> > > +	/* Flush the reboot_code_buffer in preparation for its execution. */
> > > +
> > > +	__flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
> > > +
> > > +	/* Flush the kimage list. */
> > > +
> > > +	kexec_list_walk(NULL, image->head, kexec_list_flush_cb);
> > > +
> > > +	pr_info("Bye!\n");
> > > +
> > > +	/* Disable all DAIF exceptions. */
> > > +	
> > > +	asm volatile ("msr daifset, #0xf" : : : "memory");
> > > +
> > > +	soft_restart(reboot_code_buffer_phys);
> > 
> > So what is soft_restart() functionality in arm64?
> 
> soft_restart() basically turns off the MMU and data caches, then jumps
> to the address passed to it, reboot_code_buffer_phys here.
>  
> > Looks like it switches to identity mapped page tables and that seems
> > to be the reason that you are not preparing identity mapped page
> > tables in kexec code. I am wondering I how do you make sure that once
> > kexec is swapping pages (putting new kernel's pages to its destination)
> > at that time these identity page will not be overwritten?
> > 
> > I am assuming that you are jumping to purgatory with paging enabled
> > and whole of the memory identity mapped.
> 
> The identity map is just used to turn off the MMU.  soft_restart() is in
> that identity mapping, and once it shuts off the MMU it jumps to the
> physical address of relocate_kernel, which uses physical addressing to
> do the copy.

Hi Geoff,

Ok, thanks. I think it would be nice if this explanation appeared in the
code somewhere as a comment.

Being able to turn off the MMU seems to have simplified things.

> 
> > I am also curious to know what are different entry points arm64
> > kernel image supports and which one are you using by default.
> 
> The arm64 kernel as a single entry, the start of the image.  See
> Documentation/arm64/booting.txt.

I will go through it.

Thanks
Vivek

> 
> -Geoff
>
Vivek Goyal Sept. 30, 2014, 6:18 p.m. UTC | #4
On Thu, Sep 25, 2014 at 12:23:27AM +0000, Geoff Levand wrote:

[..]
> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> new file mode 100644
> index 0000000..22d185c
> --- /dev/null
> +++ b/arch/arm64/kernel/machine_kexec.c
> @@ -0,0 +1,183 @@
> +/*
> + * kexec for arm64
> + *
> + * Copyright (C) Linaro.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kexec.h>
> +#include <linux/of_fdt.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/cacheflush.h>
> +#include <asm/system_misc.h>
> +
> +/* Global variables for the relocate_kernel routine. */
> +
> +extern const unsigned char relocate_new_kernel[];
> +extern const unsigned long relocate_new_kernel_size;
> +extern unsigned long kexec_dtb_addr;
> +extern unsigned long kexec_kimage_head;
> +extern unsigned long kexec_kimage_start;
> +
> +/**
> + * kexec_list_walk - Helper to walk the kimage page list.
> + */
> +
> +static void kexec_list_walk(void *ctx, unsigned long kimage_head,
> +	void (*cb)(void *ctx, unsigned int flag, void *addr, void *dest))
> +{
> +	void *dest;
> +	unsigned long *entry;

Hi Geoff,

I see only one user of this function, kexec_list_flush_cb(). So why
not directly embed the needed logic in kexec_list_flush_cb() instead of
implementing a generic function? It would be simpler, as you seem to
be flushing the dcache only for SOURCE and IND pages and can simply
ignore the rest.

> +
> +	for (entry = &kimage_head, dest = NULL; ; entry++) {
> +		unsigned int flag = *entry & 
> +			(IND_DESTINATION | IND_INDIRECTION | IND_DONE |
> +			IND_SOURCE);
> +		void *addr = phys_to_virt(*entry & PAGE_MASK);
> +
> +		switch (flag) {
> +		case IND_INDIRECTION:
> +			entry = (unsigned long *)addr - 1;
> +			cb(ctx, flag, addr, NULL);
> +			break;
> +		case IND_DESTINATION:
> +			dest = addr;
> +			cb(ctx, flag, addr, NULL);
> +			break;
> +		case IND_SOURCE:
> +			cb(ctx, flag, addr, dest);
> +			dest += PAGE_SIZE;
> +			break;
> +		case IND_DONE:
> +			cb(ctx, flag , NULL, NULL);
> +			return;
> +		default:
> +			break;
> +		}
> +	}
> +}
> +
> +/**
> + * kexec_is_dtb - Helper routine to check the device tree header signature.
> + */
> +
> +static bool kexec_is_dtb(const void *dtb)
> +{
> +	__be32 magic;
> +
> +	return get_user(magic, (__be32 *)dtb) ? false :
> +		(be32_to_cpu(magic) == OF_DT_HEADER);
> +}
> +
> +/**
> + * kexec_find_dtb_seg - Helper routine to find the dtb segment.
> + */
> +
> +static const struct kexec_segment *kexec_find_dtb_seg(
> +	const struct kimage *image)
> +{
> +	int i;
> +
> +	for (i = 0; i < image->nr_segments; i++) {
> +		if (kexec_is_dtb(image->segment[i].buf))
> +			return &image->segment[i];
> +	}
> +
> +	return NULL;
> +}

So this implementation makes passing dtb mandatory. So it will not work
with ACPI?

Where is the dtb present? How is it passed to the first kernel? Can it
still be around in memory so that the second kernel can access it?

I mean, in the ACPI world on x86, all the ACPI info is still present and
the second kernel can access it without it being explicitly passed to the
second kernel in memory. Can something similar happen for the dtb?

[..]
> +/**
> + * kexec_list_flush_cb - Callback to flush the kimage list to PoC.
> + */
> +
> +static void kexec_list_flush_cb(void *ctx , unsigned int flag,
> +	void *addr, void *dest)
			  ^^^

Nobody seems to be making use of dest. So why introduce it?

> +{
> +	switch (flag) {
> +	case IND_INDIRECTION:
> +	case IND_SOURCE:
> +		__flush_dcache_area(addr, PAGE_SIZE);
> +		break;

So what does __flush_dcache_area() do? Flush data caches. IIUC, addr
is a virtual address at this point. While copying pages and walking
through the list, I am assuming you have switched off the page tables
and are in some kind of 1:1 physical mode. So how does flushing data
caches by virtual address help? I guess we are not even accessing that
virtual address now.
 
[..]
> --- /dev/null
> +++ b/arch/arm64/kernel/relocate_kernel.S
> @@ -0,0 +1,183 @@
> +/*
> + * kexec for arm64
> + *
> + * Copyright (C) Linaro.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <asm/assembler.h>
> +#include <asm/kexec.h>
> +#include <asm/memory.h>
> +#include <asm/page.h>
> +#include <asm/proc-macros.S>
> +
> +/* The list entry flags. */
> +
> +#define IND_DESTINATION_BIT 0
> +#define IND_INDIRECTION_BIT 1
> +#define IND_DONE_BIT        2
> +#define IND_SOURCE_BIT      3

I thought you had some patches to move these into a generic header file.
You got rid of those patches?

> +
> +/*
> + * relocate_new_kernel - Put the 2nd stage kernel image in place and boot it.
> + *
> + * The memory that the old kernel occupies may be overwritten when copying the
> + * new kernel to its final location.  To ensure that the relocate_new_kernel
> + * routine which does that copy is not overwritten, all code and data needed
> + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> + * relocate_new_kernel to the kexec control_code_page, a special page which
> + * has been set up to be preserved during the kernel copy operation.
> + */
> +
> +.globl relocate_new_kernel
> +relocate_new_kernel:
> +
> +	/* Setup the list loop variables. */
> +
> +	ldr	x18, kexec_kimage_head		/* x18 = list entry */
> +	dcache_line_size x17, x0		/* x17 = dcache line size */
> +	mov	x16, xzr			/* x16 = segment start */
> +	mov	x15, xzr			/* x15 = entry ptr */
> +	mov	x14, xzr			/* x14 = copy dest */
> +
> +	/* Check if the new kernel needs relocation. */

What's "relocation" in this  context. I guess you are checking if new
kernel needs to be moved to destination location or not.

[..]
> +/*
> + * kexec_dtb_addr - Physical address of the new kernel's device tree.
> + */
> +
> +.globl kexec_dtb_addr
> +kexec_dtb_addr:
> +	.quad	0x0

As these globals are very arm64 specific, would it make sense to prefix
them with arm64_? arm64_kexec_dtb_addr. Or arch_kexec_dtb_addr.


> +
> +/*
> + * kexec_kimage_head - Copy of image->head, the list of kimage entries.
> + */
> +
> +.globl kexec_kimage_head
> +kexec_kimage_head:
> +	.quad	0x0

Same here. How about arch_kexec_kimage_head.

> +
> +/*
> + * kexec_kimage_start - Copy of image->start, the entry point of the new kernel.
> + */
> +
> +.globl kexec_kimage_start
> +kexec_kimage_start:
> +	.quad	0x0

arch_kexec_kimage_start.

Thanks
Vivek
Geoff Levand Sept. 30, 2014, 7:54 p.m. UTC | #5
Hi Vivek,

On Tue, 2014-09-30 at 14:18 -0400, Vivek Goyal wrote:
> On Thu, Sep 25, 2014 at 12:23:27AM +0000, Geoff Levand wrote:
> 
> [..]
> > diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> > new file mode 100644
> > index 0000000..22d185c
> > --- /dev/null
> > +++ b/arch/arm64/kernel/machine_kexec.c
> > @@ -0,0 +1,183 @@
> > +/*
> > + * kexec for arm64
> > + *
> > + * Copyright (C) Linaro.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include <linux/kexec.h>
> > +#include <linux/of_fdt.h>
> > +#include <linux/slab.h>
> > +#include <linux/uaccess.h>
> > +
> > +#include <asm/cacheflush.h>
> > +#include <asm/system_misc.h>
> > +
> > +/* Global variables for the relocate_kernel routine. */
> > +
> > +extern const unsigned char relocate_new_kernel[];
> > +extern const unsigned long relocate_new_kernel_size;
> > +extern unsigned long kexec_dtb_addr;
> > +extern unsigned long kexec_kimage_head;
> > +extern unsigned long kexec_kimage_start;
> > +
> > +/**
> > + * kexec_list_walk - Helper to walk the kimage page list.
> > + */
> > +
> > +static void kexec_list_walk(void *ctx, unsigned long kimage_head,
> > +	void (*cb)(void *ctx, unsigned int flag, void *addr, void *dest))
> > +{
> > +	void *dest;
> > +	unsigned long *entry;
> 
> Hi Geoff,
> 
> I see only one user of this function, kexec_list_flush_cb(). So why
> not directly embed needed logic in kexec_list_flush_cb() instead of
> implementing a generic function. It would be simpler as you seem to
> be flushing dcache only for SOURCE and IND pages and rest you 
> can simply ignore.

I have an additional debugging patch that uses this to dump the list.
I can move this routine into that patch and put in a simpler version
here.
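
For illustration, a flush-only version embedded directly (an untested
sketch, not the posted code) would be something like:

static void kexec_list_flush(unsigned long kimage_head)
{
	unsigned long *entry;

	for (entry = &kimage_head; ; entry++) {
		unsigned long flag = *entry &
			(IND_DESTINATION | IND_INDIRECTION | IND_DONE |
			IND_SOURCE);
		void *addr = phys_to_virt(*entry & PAGE_MASK);

		switch (flag) {
		case IND_INDIRECTION:
			/* Continue the walk from the indirection page... */
			entry = (unsigned long *)addr - 1;
			/* ...and fall through to flush that page too. */
		case IND_SOURCE:
			__flush_dcache_area(addr, PAGE_SIZE);
			break;
		case IND_DONE:
			return;
		default:
			break;
		}
	}
}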

> > +
> > +	for (entry = &kimage_head, dest = NULL; ; entry++) {
> > +		unsigned int flag = *entry & 
> > +			(IND_DESTINATION | IND_INDIRECTION | IND_DONE |
> > +			IND_SOURCE);
> > +		void *addr = phys_to_virt(*entry & PAGE_MASK);
> > +
> > +		switch (flag) {
> > +		case IND_INDIRECTION:
> > +			entry = (unsigned long *)addr - 1;
> > +			cb(ctx, flag, addr, NULL);
> > +			break;
> > +		case IND_DESTINATION:
> > +			dest = addr;
> > +			cb(ctx, flag, addr, NULL);
> > +			break;
> > +		case IND_SOURCE:
> > +			cb(ctx, flag, addr, dest);
> > +			dest += PAGE_SIZE;
> > +			break;
> > +		case IND_DONE:
> > +			cb(ctx, flag , NULL, NULL);
> > +			return;
> > +		default:
> > +			break;
> > +		}
> > +	}
> > +}
> > +
> > +/**
> > + * kexec_is_dtb - Helper routine to check the device tree header signature.
> > + */
> > +
> > +static bool kexec_is_dtb(const void *dtb)
> > +{
> > +	__be32 magic;
> > +
> > +	return get_user(magic, (__be32 *)dtb) ? false :
> > +		(be32_to_cpu(magic) == OF_DT_HEADER);
> > +}
> > +
> > +/**
> > + * kexec_find_dtb_seg - Helper routine to find the dtb segment.
> > + */
> > +
> > +static const struct kexec_segment *kexec_find_dtb_seg(
> > +	const struct kimage *image)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < image->nr_segments; i++) {
> > +		if (kexec_is_dtb(image->segment[i].buf))
> > +			return &image->segment[i];
> > +	}
> > +
> > +	return NULL;
> > +}
> 
> So this implementation makes passing dtb mandatory. So it will not work
> with ACPI?

I have not yet considered ACPI.  It will most likely need to have
something done differently.  Secure boot will also need something
different, and I expect it will use your new kexec_file_load().

> Where is dtb present? How is it passed to first kernel? Can it still
> be around in memory and second kernel can access it?

The user space program (kexec-tools, etc.) passes a dtb.  That dtb
could be a copy of the current one, or a new one specified by
the user.

> I mean in ACPI world on x86, all the ACPI info is still present and second
> kernel can access it without it being explicitly to second kernel in
> memory. Can something similar happen for dtb?

This implementation leaves the preparation of the 2nd stage dtb to
the user space program, as it can prepare that dtb with the proper
kernel command line property, initrd properties etc.
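
For example, those properties live under the /chosen node; the 2nd stage
dtb that user space prepares might carry something like the following
(values are illustrative only):

/ {
	chosen {
		bootargs = "console=ttyAMA0 root=/dev/vda2 rw";
		linux,initrd-start = <0x0 0x84000000>;
		linux,initrd-end = <0x0 0x85000000>;
	};
};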

> [..]
> > +/**
> > + * kexec_list_flush_cb - Callback to flush the kimage list to PoC.
> > + */
> > +
> > +static void kexec_list_flush_cb(void *ctx , unsigned int flag,
> > +	void *addr, void *dest)
> 			  ^^^
> 
> Nobody seems to be making use of dest. So why introduce it?

As mentioned, I used this for dumping the list, and that callback
used dest.

> > +{
> > +	switch (flag) {
> > +	case IND_INDIRECTION:
> > +	case IND_SOURCE:
> > +		__flush_dcache_area(addr, PAGE_SIZE);
> > +		break;
> 
> So what does __flush_dcache_area() do? Flush data caches. IIUC, addr
> is virtual address at this point of time. While copying pages and
> walking through the list, I am assuming you have switched off page
> tables and you are in some kind of 1:1 physical mode. So how did
> flushing data caches related to a virtual address help. I guess we
> are not even accessing that virtual address now.

__flush_dcache_area(), and the underlying aarch64 civac instruction,
operate on virtual addresses.  Here we are still running with the
MMU on and the identity mapping has not yet been enabled.  This is
the sequence:

  flush dcache -> turn off MMU, dcache -> access memory (PoC) directly 
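
For reference, the clean+invalidate to PoC by VA that __flush_dcache_area()
performs boils down to a loop along these lines (a sketch; the real routine
is in arch/arm64/mm/cache.S):

	/* x0 = start VA, x1 = size */
	dcache_line_size x2, x3		/* x2 = dcache line size */
	add	x1, x0, x1		/* x1 = end VA */
	sub	x3, x2, #1
	bic	x0, x0, x3		/* align start down to a line */
1:	dc	civac, x0		/* clean+invalidate line to PoC */
	add	x0, x0, x2
	cmp	x0, x1
	b.lo	1b
	dsb	sy			/* wait for completion */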

> [..]
> > --- /dev/null
> > +++ b/arch/arm64/kernel/relocate_kernel.S
> > @@ -0,0 +1,183 @@
> > +/*
> > + * kexec for arm64
> > + *
> > + * Copyright (C) Linaro.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include <asm/assembler.h>
> > +#include <asm/kexec.h>
> > +#include <asm/memory.h>
> > +#include <asm/page.h>
> > +#include <asm/proc-macros.S>
> > +
> > +/* The list entry flags. */
> > +
> > +#define IND_DESTINATION_BIT 0
> > +#define IND_INDIRECTION_BIT 1
> > +#define IND_DONE_BIT        2
> > +#define IND_SOURCE_BIT      3
> 
> I thought you had some patches to move these into generic header file. You
> got rid of those patches?

I will have another patch to remove these when the kexec patches get
merged, or will just remove these if my kexec patches show up in
Catalin's arm64 tree.

> > +
> > +/*
> > + * relocate_new_kernel - Put the 2nd stage kernel image in place and boot it.
> > + *
> > + * The memory that the old kernel occupies may be overwritten when copying the
> > + * new kernel to its final location.  To ensure that the relocate_new_kernel
> > + * routine which does that copy is not overwritten, all code and data needed
> > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> > + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> > + * relocate_new_kernel to the kexec control_code_page, a special page which
> > + * has been set up to be preserved during the kernel copy operation.
> > + */
> > +
> > +.globl relocate_new_kernel
> > +relocate_new_kernel:
> > +
> > +	/* Setup the list loop variables. */
> > +
> > +	ldr	x18, kexec_kimage_head		/* x18 = list entry */
> > +	dcache_line_size x17, x0		/* x17 = dcache line size */
> > +	mov	x16, xzr			/* x16 = segment start */
> > +	mov	x15, xzr			/* x15 = entry ptr */
> > +	mov	x14, xzr			/* x14 = copy dest */
> > +
> > +	/* Check if the new kernel needs relocation. */
> 
> What's "relocation" in this  context. I guess you are checking if new
> kernel needs to be moved to destination location or not.

Yes, relocate means scatter-gather 'copy' here.

> [..]
> > +/*
> > + * kexec_dtb_addr - Physical address of the new kernel's device tree.
> > + */
> > +
> > +.globl kexec_dtb_addr
> > +kexec_dtb_addr:
> > +	.quad	0x0
> 
> As these gloabls are very arm64 specific, will it make sense to prefix
> arm64_ before these. arm64_kexec_dtb_addr. Or arch_kexec_dtb_addr.

I could put an arm64_ prefix on, but this file and these variables are
arm64 specific so I thought it unnecessary.

I don't think arch_ would be right, as I don't expect any other arch to
have these variables.

I'll post a new patch version soon.

-Geoff
Vivek Goyal Oct. 1, 2014, 2:56 p.m. UTC | #6
On Tue, Sep 30, 2014 at 12:54:37PM -0700, Geoff Levand wrote:

[..]
> > > +{
> > > +	switch (flag) {
> > > +	case IND_INDIRECTION:
> > > +	case IND_SOURCE:
> > > +		__flush_dcache_area(addr, PAGE_SIZE);
> > > +		break;
> > 
> > So what does __flush_dcache_area() do? Flush data caches. IIUC, addr
> > is virtual address at this point of time. While copying pages and
> > walking through the list, I am assuming you have switched off page
> > tables and you are in some kind of 1:1 physical mode. So how did
> > flushing data caches related to a virtual address help. I guess we
> > are not even accessing that virtual address now.
> 
> __flush_dcache_area(), and the underling aarch64 civac instruction
> operate on virtual addresses.  Here we are still running with the
> MMU on and the identity mapping has not yet been enabled.  This is
> the sequence:
> 
>   flush dcache -> turn off MMU, dcache -> access memory (PoC) directly 

Sorry, I don't understand why we need to flush the dcache for the source
and indirection page addresses. Some information here would help.

Thanks
Vivek
Mark Rutland Oct. 1, 2014, 4:16 p.m. UTC | #7
[...]

> > > +/**
> > > + * kexec_is_dtb - Helper routine to check the device tree header signature.
> > > + */
> > > +
> > > +static bool kexec_is_dtb(const void *dtb)
> > > +{
> > > +	__be32 magic;
> > > +
> > > +	return get_user(magic, (__be32 *)dtb) ? false :
> > > +		(be32_to_cpu(magic) == OF_DT_HEADER);
> > > +}
> > > +
> > > +/**
> > > + * kexec_find_dtb_seg - Helper routine to find the dtb segment.
> > > + */
> > > +
> > > +static const struct kexec_segment *kexec_find_dtb_seg(
> > > +	const struct kimage *image)
> > > +{
> > > +	int i;
> > > +
> > > +	for (i = 0; i < image->nr_segments; i++) {
> > > +		if (kexec_is_dtb(image->segment[i].buf))
> > > +			return &image->segment[i];
> > > +	}
> > > +
> > > +	return NULL;
> > > +}
> > 
> > So this implementation makes passing dtb mandatory. So it will not work
> > with ACPI?
> 
> I have not yet considered ACPI.  It will most likely need to have
> something done differently.  Secure boot will also need something
> different, and I expect it will use your new kexec_file_load().

A DTB is mandatory for arm64, and is used to pass the command line,
(optionally) initrd, and other parameters, even if it doesn't contain HW
description. In the EFI case the EFI stub will create a trivial DTB if
necessary, and the kernel will detect any ACPI tables via UEFI, so the
DTB should be sufficient for ACPI.

I'm still rather unhappy about the mechanism by which the DTB is passed
by userspace and detected by the kernel, as I'd prefer that the user
explicitly stated which segment they wanted to pass to the (Linux)
kernel, but that would require reworking the kexec syscall to allow
per-segment info/flags.

To me it seems that for all the talk of kexec allowing arbitrary kernels
to be booted, it's really just a linux->linux reboot bridge. Does anyone
use kexec to boot something that isn't Linux?

> > Where is dtb present? How is it passed to first kernel? Can it still
> > be around in memory and second kernel can access it?
> 
> The user space program (kexec-tools, etc.) passes a dtb.  That dtb
> could be a copy of the currently one, or a new one specified by
> the user.
> 
> > I mean in ACPI world on x86, all the ACPI info is still present and second
> > kernel can access it without it being explicitly to second kernel in
> > memory. Can something similar happen for dtb?

Any ACPI tables should remain, given they'll be reserved in the UEFI
memory map. The second kernel can find them as the first kernel did, via
UEFI tables, which it will find via the DTB.

For the DTB, reusing the original DTB is a possibility. From what I
recall, Grant seemed to prefer re-packing the existing tree as this
would allow for state destroyed at boot to be corrected for.

Regardless, being able to pass a DTB from userspace is a useful option
(especially for the Linux-as-a-bootloader approach that's been mentioned
a lot). That doesn't work for the secureboot case without a new syscall
as we can't pass a signed DTB (or any other additional objects other
than an initrd) to kexec_file_load, but disallowing the user to pass a
new DTB in that case seems reasonable.

Mark.
Vivek Goyal Oct. 1, 2014, 5:36 p.m. UTC | #8
On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:

[..]
> > > So this implementation makes passing dtb mandatory. So it will not work
> > > with ACPI?
> > 
> > I have not yet considered ACPI.  It will most likely need to have
> > something done differently.  Secure boot will also need something
> > different, and I expect it will use your new kexec_file_load().
> 
> A DTB is mandatory for arm64, and is used to pass the command line,
> (optionally) initrd, and other parameters, even if it doesn't contain HW
> description. In the EFI case the EFI stub will create a trivial DTB if
> necessary, and the kernel will detect any ACPI tables via UEFI, so the
> DTB should be sufficient for ACPI.
> 
> I'm still rather unhappy about the mechanism by which the DTB is passed
> by userspace and detected by the kernel, as I'd prefer that the user
> explictly stated which segment they wanted to pass to the (Linux)
> kernel, but that would require reworking the kexec syscall to allow
> per-segment info/flags.

Yep, in this case, it would have been nice if there were per-segment
flags to identify the type of a segment. But unfortunately we don't have
them. So in the absence of that, I think putting 4 bytes of dtb magic at
the beginning of the segment should work (though not ideal).

> 
> To me it seems that for all the talk of kexec allowing arbitrary kernels
> to be booted it's really just a linux->linux reboot bridge. Does anyone
> use kexec to boot something that isn't Linux?

> 
> > > Where is dtb present? How is it passed to first kernel? Can it still
> > > be around in memory and second kernel can access it?
> > 
> > The user space program (kexec-tools, etc.) passes a dtb.  That dtb
> > could be a copy of the currently one, or a new one specified by
> > the user.
> > 
> > > I mean in ACPI world on x86, all the ACPI info is still present and second
> > > kernel can access it without it being explicitly to second kernel in
> > > memory. Can something similar happen for dtb?
> 
> Any ACPI tables should remain, given they'll be reserved in the UEFI
> memory map. The second kernel can find them as the first kernel did, via
> UEFI tables, which it will fine via the DTB.
> 
> For the DTB, reusing the original DTB is a possibility. From what I
> recall, Grant seemed to prefer re-packing the existing tree as this
> would allow for state destroyed at boot to be corrected for.
> 
> Regardless, being able to pass a DTB from userspace is a useful option
> (especially for the Linux-as-a-bootloader approach that's been mentioned
> a lot). That doesn't work for the secureboot case without a new syscall
> as we can't pass a signed DTB (or any other additional objects other
> than an initrd) to kexec_file_load, but disallowing the user to pass a
> new DTB in that case seems reasonable.

Yes, kexec_file_load() will not allow passing anything except kernel,
initrd and command line. So the syscall implementation will have to reuse
the existing DTB and pass it to the second kernel.

If there are concerns w.r.t. the state of the DTB being destroyed during
boot, I guess we will have to store a copy of the DTB somewhere early
during boot so that kexec can access that original copy at kernel load
time.

Thanks
Vivek



> 
> Mark.
Vivek Goyal Oct. 1, 2014, 5:47 p.m. UTC | #9
On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:

[..]
> I'm still rather unhappy about the mechanism by which the DTB is passed
> by userspace and detected by the kernel, as I'd prefer that the user
> explictly stated which segment they wanted to pass to the (Linux)
> kernel, but that would require reworking the kexec syscall to allow
> per-segment info/flags.

Why does the running kernel need to know about the dtb segment?  I see the following.

ldr     x0, kexec_dtb_addr

IIUC, we are loading this address into x0. Can't we do something similar
in user space with purgatory? I mean, first jump to purgatory (code
compiled in user space but run privileged), and that code takes care
of loading x0 with the right dtb addr and then jumps to the final kernel.

IOW, I am not able to understand why the kernel implementation needs
to know which segment is the dtb.
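
For the record, given the boot protocol in Documentation/arm64/booting.txt
(x0 = physical address of the dtb, x1-x3 = 0), a minimal purgatory hand-off
could be as small as the following sketch (purely illustrative; the label
names are made up and would be filled in by the loader):

purgatory_start:
	/* Optionally verify the segment checksums here. */
	ldr	x0, arm64_dtb_addr	/* physical address of the dtb */
	mov	x1, xzr
	mov	x2, xzr
	mov	x3, xzr
	ldr	x4, arm64_kernel_entry	/* entry point of the new image */
	br	x4

arm64_dtb_addr:
	.quad	0
arm64_kernel_entry:
	.quad	0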

Thanks
Vivek
Mark Rutland Oct. 1, 2014, 5:56 p.m. UTC | #10
> > I'm still rather unhappy about the mechanism by which the DTB is passed
> > by userspace and detected by the kernel, as I'd prefer that the user
> > explictly stated which segment they wanted to pass to the (Linux)
> > kernel, but that would require reworking the kexec syscall to allow
> > per-segment info/flags.
> 
> Yep, in this case, it would have been nice if there were per segment
> flags to identify type of segment. But unfortunately we don't have. So
> in the absence of that, I think putting 4 bytes as dtb magic in the
> beginning of segment should work (though no ideal).

I don't disagree it will work for the simple kernel + DTB case. The
existing DTB magic (which is part of the DTB rather than an addition)
is sufficient to identify _a_ DTB, but that doesn't mean that you want
to pass that DTB to the next kernel, nor that you want it in x0.

That might be important for booting other OSs, or for loading multiple
kernels (if you want linux as a bootloader to load a kernel +
crashkernel pair with separate DTBs).

I believe it would be feasible to (at some point) add a new kexec flag
allowing us to pass a list of (new) struct kexec_segment_extended
elements to address that.
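
Such an extended descriptor is purely hypothetical at this point, but it
would presumably just be the existing struct kexec_segment plus a
per-segment flags word, along these lines:

/* Hypothetical sketch only -- no such ABI exists today. */
struct kexec_segment_extended {
	const void *buf;	/* user-space buffer */
	size_t bufsz;
	unsigned long mem;	/* physical destination */
	size_t memsz;
	unsigned long flags;	/* e.g. "this is the DTB", "pass in x0" */
};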

[...]

> > For the DTB, reusing the original DTB is a possibility. From what I
> > recall, Grant seemed to prefer re-packing the existing tree as this
> > would allow for state destroyed at boot to be corrected for.
> > 
> > Regardless, being able to pass a DTB from userspace is a useful option
> > (especially for the Linux-as-a-bootloader approach that's been mentioned
> > a lot). That doesn't work for the secureboot case without a new syscall
> > as we can't pass a signed DTB (or any other additional objects other
> > than an initrd) to kexec_file_load, but disallowing the user to pass a
> > new DTB in that case seems reasonable.
> 
> Yes, kexec_file_load() will not allow passing anything except, kernel,
> initrd and command line. So syscall implementation will have to resuse
> the existing DTB and pass it to second kernel. 
> 
> If there are concerns w.r.t state of DTB which can be destroyed during
> boot, I guess we will have to store a copy of DTB somewhere early during
> boot and kexec can access that original copy during kernel load time.

The issue with state wasn't that the DTB binary gets modified but
rather that the state of the system is changed such that the state
described in the original DTB is no longer correct. Consider a simple
framebuffer setup maintained from the bootloader that the kernel later
decides to modify at the behest of the user.

As I mention above, I believe that Grant was of the opinion that the
live/unpacked device tree should be modified to reflect the system
state, and should be repacked prior to a kexec.

Mark.
Mark Rutland Oct. 1, 2014, 6:03 p.m. UTC | #11
On Wed, Oct 01, 2014 at 06:47:14PM +0100, Vivek Goyal wrote:
> On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:
> 
> [..]
> > I'm still rather unhappy about the mechanism by which the DTB is passed
> > by userspace and detected by the kernel, as I'd prefer that the user
> > explictly stated which segment they wanted to pass to the (Linux)
> > kernel, but that would require reworking the kexec syscall to allow
> > per-segment info/flags.
> 
> Why does the running kernel need to know about dtb segment.  I see following.
> 
> ldr     x0, kexec_dtb_addr
> 
> IIUC, we are loading this address in x0. Can't we do something similar
> in user space with purgatory. I mean first jump to purgatory (code
> compiled in user space but runs prviliged) and that code takes care
> of loading x0 with right dtb addr and then jump to final kernel.

I believe the fundamental issue here is a lack of a userspace-provided
purgatory.

I agree that userspace purgatory code could set this up. That would
address my concerns w.r.t. detecting the DTB kernel-side, as there would
be no need. It would also address my concerns with booting OSs other
than Linux, as the purgatory code could do whatever was appropriate for
whatever OS image was loaded.

So in my view, a userspace-provided purgatory that set up the state the
next kernel expected would be preferable. That could be as simple as
setting up the registers and branching -- I assume we'd have the first
kernel perform the required cache maintenance.

Mark.
Vivek Goyal Oct. 1, 2014, 6:09 p.m. UTC | #12
On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:
> On Wed, Oct 01, 2014 at 06:47:14PM +0100, Vivek Goyal wrote:
> > On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:
> > 
> > [..]
> > > I'm still rather unhappy about the mechanism by which the DTB is passed
> > > by userspace and detected by the kernel, as I'd prefer that the user
> > > explictly stated which segment they wanted to pass to the (Linux)
> > > kernel, but that would require reworking the kexec syscall to allow
> > > per-segment info/flags.
> > 
> > Why does the running kernel need to know about dtb segment.  I see following.
> > 
> > ldr     x0, kexec_dtb_addr
> > 
> > IIUC, we are loading this address in x0. Can't we do something similar
> > in user space with purgatory. I mean first jump to purgatory (code
> > compiled in user space but runs prviliged) and that code takes care
> > of loading x0 with right dtb addr and then jump to final kernel.
> 
> I believe the fundamental issue here is a lack of a userspace-provided
> purgatory.
> 
> I agree that userspace purgatory code could set this up. That would
> address my concerns w.r.t. detecting the DTB kernel-side, as there would
> be no need. It would also address my concerns with booting OSs other
> than Linux, as the purgatory code could do whatever was appropriate for
> whatever OS image was loaded.
> 
> So in my view, a userspace-provided purgatory that set up the state the
> next kernel expected would be preferable. That could be as simple as
> setting up the registers and branching -- I assume we'd have the first
> kernel perform the required cache maintenance.

Apart from setting various registers, we also verify the sha256 checksums
of the loaded segments in purgatory to make sure the segments are not
corrupted. On x86, we also take care of backing up the first 640KB of
memory into a reserved area in the kdump case.
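
The idea behind that check is straightforward; in rough pseudo-C (a generic
sketch with made-up names, not the actual kexec-tools purgatory code) it is:

/* The loader records the segments and their expected digest. */
struct checksum_region {
	unsigned long start;	/* physical address of a loaded segment */
	unsigned long len;
};

extern struct checksum_region regions[];
extern int nr_regions;
extern unsigned char expected_digest[32];

static int segments_are_intact(void)
{
	struct sha256_state st;
	unsigned char digest[32];
	int i;

	sha256_init(&st);
	for (i = 0; i < nr_regions; i++)
		sha256_update(&st, (void *)regions[i].start, regions[i].len);
	sha256_final(&st, digest);

	return memcmp(digest, expected_digest, sizeof(digest)) == 0;
}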

So other arches are already doing all this in purgatory. It would be nice
if arm64 sticks to that convention too.

First kernel --> purgatory ----> second kernel.

Thanks
Vivek
Mark Rutland Oct. 1, 2014, 6:19 p.m. UTC | #13
On Wed, Oct 01, 2014 at 07:09:09PM +0100, Vivek Goyal wrote:
> On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:
> > On Wed, Oct 01, 2014 at 06:47:14PM +0100, Vivek Goyal wrote:
> > > On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:
> > > 
> > > [..]
> > > > I'm still rather unhappy about the mechanism by which the DTB is passed
> > > > by userspace and detected by the kernel, as I'd prefer that the user
> > > > explictly stated which segment they wanted to pass to the (Linux)
> > > > kernel, but that would require reworking the kexec syscall to allow
> > > > per-segment info/flags.
> > > 
> > > Why does the running kernel need to know about dtb segment.  I see following.
> > > 
> > > ldr     x0, kexec_dtb_addr
> > > 
> > > IIUC, we are loading this address in x0. Can't we do something similar
> > > in user space with purgatory. I mean first jump to purgatory (code
> > > compiled in user space but runs prviliged) and that code takes care
> > > of loading x0 with right dtb addr and then jump to final kernel.
> > 
> > I believe the fundamental issue here is a lack of a userspace-provided
> > purgatory.
> > 
> > I agree that userspace purgatory code could set this up. That would
> > address my concerns w.r.t. detecting the DTB kernel-side, as there would
> > be no need. It would also address my concerns with booting OSs other
> > than Linux, as the purgatory code could do whatever was appropriate for
> > whatever OS image was loaded.
> > 
> > So in my view, a userspace-provided purgatory that set up the state the
> > next kernel expected would be preferable. That could be as simple as
> > setting up the registers and branching -- I assume we'd have the first
> > kernel perform the required cache maintenance.
> 
> Apart from setting various registers, we also verify the sha256 checksums
> of loaded segments in purgatory to make sure segments are not corrupted.
> On x86, we also take care of backing up first 640KB of memory in reserved
> area in kdump case. 

I was under the (possibly mistaken) impression that for kdump the second
kernel lived and ran at a high address so as to preserve memory in use
by the first kernel. Is the first 640KiB special on x86, or does
it have some kdump-specific use?

> So other arches are already doing all this in purgatory. It would be nice
> if arm64 sticks to that convention too.
> 
> First kernel --> purgatory ----> second kernel.

That would also be my preference, especially given the flexibility this
would leave.

Mark.
Vivek Goyal Oct. 1, 2014, 6:31 p.m. UTC | #14
On Wed, Oct 01, 2014 at 07:19:59PM +0100, Mark Rutland wrote:
> On Wed, Oct 01, 2014 at 07:09:09PM +0100, Vivek Goyal wrote:
> > On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:
> > > On Wed, Oct 01, 2014 at 06:47:14PM +0100, Vivek Goyal wrote:
> > > > On Wed, Oct 01, 2014 at 05:16:21PM +0100, Mark Rutland wrote:
> > > > 
> > > > [..]
> > > > > I'm still rather unhappy about the mechanism by which the DTB is passed
> > > > > by userspace and detected by the kernel, as I'd prefer that the user
> > > > > explictly stated which segment they wanted to pass to the (Linux)
> > > > > kernel, but that would require reworking the kexec syscall to allow
> > > > > per-segment info/flags.
> > > > 
> > > > Why does the running kernel need to know about dtb segment.  I see following.
> > > > 
> > > > ldr     x0, kexec_dtb_addr
> > > > 
> > > > IIUC, we are loading this address in x0. Can't we do something similar
> > > > in user space with purgatory. I mean first jump to purgatory (code
> > > > compiled in user space but runs prviliged) and that code takes care
> > > > of loading x0 with right dtb addr and then jump to final kernel.
> > > 
> > > I believe the fundamental issue here is a lack of a userspace-provided
> > > purgatory.
> > > 
> > > I agree that userspace purgatory code could set this up. That would
> > > address my concerns w.r.t. detecting the DTB kernel-side, as there would
> > > be no need. It would also address my concerns with booting OSs other
> > > than Linux, as the purgatory code could do whatever was appropriate for
> > > whatever OS image was loaded.
> > > 
> > > So in my view, a userspace-provided purgatory that set up the state the
> > > next kernel expected would be preferable. That could be as simple as
> > > setting up the registers and branching -- I assume we'd have the first
> > > kernel perform the required cache maintenance.
> > 
> > Apart from setting various registers, we also verify the sha256 checksums
> > of loaded segments in purgatory to make sure segments are not corrupted.
> > On x86, we also take care of backing up first 640KB of memory in reserved
> > area in kdump case. 
> 
> I was under the (possibly mistaken) impression that for kdump the second
> kernel lived and ran at a high address so as to preserve memory in use
> by the first kernel. Is the first 640KiB is special on x86, or is does
> it have some kdump-specific use?

Use of the first 640KB by the second kernel is x86 specific. And that was
long ago, and I am not sure whether this requirement still exists today.
It's just that things have been working and nobody has bothered to look
into optimizing it further.

The kdump kernel does run from reserved memory. This memory is reserved
very early during boot so that the first kernel does not end up using it.
So it does not matter whether that memory is reserved high or low; the
first kernel is not going to use it as it is reserved. Hence the memory
contents of the first kernel will be preserved.

Thanks
Vivek
Vivek Goyal Oct. 1, 2014, 7:22 p.m. UTC | #15
On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:

[..]
> I assume we'd have the first kernel perform the required cache maintenance.
> 

Hi Mark,

I am wondering, what kind of cache management is required here? What kind
of dcaches are present on arm64? I see that Geoff's patches flush dcaches
for certain kexec-stored pages using __flush_dcache_area()
(in kexec_list_flush_cb()).

arch/arm64/include/asm/cacheflush.h says following.

 *      __flush_dcache_area(kaddr, size)
 *
 *              Ensure that the data held in page is written back.
 *              - kaddr  - page address
 *              - size   - region size

So it looks like we are trying to write back anything which we will access
after switching off the MMU. If that's the case, I have two questions.

- Why do we need to write back that cacheline? After switching off the MMU,
  will we not access the same cacheline? I thought the caches are VIPT and
  the tag will still remain the same (but I might easily be wrong here).

- Even if we have to flush that cacheline, for kexec pages, I guess it
  should be done at kernel load time and not at the time of transition
  into the new kernel. That seems too late. Once the kernel has been loaded,
  we don't overwrite these pages anymore. So a dcache flush at that
  time should be good.

Thanks
Vivek
Mark Rutland Oct. 2, 2014, 10:26 a.m. UTC | #16
On Wed, Oct 01, 2014 at 08:22:45PM +0100, Vivek Goyal wrote:
> On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:
> 
> [..]
> > I assume we'd have the first kernel perform the required cache maintenance.
> > 
> 
> Hi Mark,
> 
> I am wondering, what kind of cache management is required here? What kind of
> dcaches are present on arm64.

In ARMv8 there's a hierarchy of quasi-PIPT D-caches; they generally
behave like (and can be maintained as if) they are PIPT but might not
actually be PIPT. There may be a system level cache between the
architected cache hierarchy and memory (that should respect cache
maintenance by VA).

The MT_NORMAL attributes are such that most memory the kernel maps will
have write-back read/write allocate attributes. So cache maintenance is
required to ensure that data is cleaned from the D-caches out to the PoC
(the point in the memory system at which non-cacheable accesses can see
the same data), such that the CPU can see the images rather than stale
data once translation is disabled.

> I see that Geoff's patches flush dcaches for 
> certain kexec stored pages using __flush_dcache_area()
> (in kexec_list_flush_cb()).
> 
> arch/arm64/include/asm/cacheflush.h says following.
> 
>  *      __flush_dcache_area(kaddr, size)
>  *
>  *              Ensure that the data held in page is written back.
>  *              - kaddr  - page address
>  *              - size   - region size
> 
> So looks like we are trying to write back anything which we will access
> after switching off MMU. If that's the case, I have two questions.
> 
> - Why do we need to writeback that cacheline. After switching off MMU,
>   will we not access same cacheline. I thought caches are VIPT and tag
>   will still remain the same (but I might easily be wrong here).

As I mention above, the initial cache flush by VA is to ensure that the
data is visible to the CPU once translation is disabled. I'm not sure I
follow your reasoning.

> - Even if we have to flush that cacheline, for kexec pages, I guess it
>   should be done at kernel load time and not at the time of transition
>   into new kernel. That seems too late. Once the kernel has been loaded,
>   we don't overwrite these pages anymore. So a dcache flush at that
>   time should be good.

Given the current assumption at boot is that the kernel image should be
clean in the D-cache hierarchy (but not necessarily anything else), that
should be fine. However, we may need to nuke the I-cache when branching
to the purgatory code as the I-cache could be PIPT, VIPT, or ASID-tagged
VIVT.

If the purgatory code moves anything around it will need to perform
maintenance by VA to ensure stale dirty lines don't overwrite anything,
and stale clean lines don't shadow anything.
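
For the I-cache side, the maintenance before branching into freshly written
code would be something like this (illustrative only):

	dsb	sy		/* make the copied code visible at the PoC/PoU */
	ic	iallu		/* invalidate all I-cache to PoU */
	dsb	nsh
	isb			/* discard any already-fetched instructions */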

Mark.
Vivek Goyal Oct. 2, 2014, 1:54 p.m. UTC | #17
On Thu, Oct 02, 2014 at 11:26:25AM +0100, Mark Rutland wrote:
> On Wed, Oct 01, 2014 at 08:22:45PM +0100, Vivek Goyal wrote:
> > On Wed, Oct 01, 2014 at 07:03:04PM +0100, Mark Rutland wrote:
> > 
> > [..]
> > > I assume we'd have the first kernel perform the required cache maintenance.
> > > 
> > 
> > Hi Mark,
> > 
> > I am wondering, what kind of cache management is required here? What kind of
> > dcaches are present on arm64.
> 
> In ARMv8 there's a hierarchy of quasi-PIPT D-caches; they generally
> behave like (and can be maintained as if) they are PIPT but might not
> actually be PIPT. There may be a system level cache between the
> architected cache hierarchy and memory (that should respect cache
> maintenance by VA).
> 
> The MT_NORMAL attributes are such that most memory the kernel maps will
> have write-back read/write allocate attributes. So cache maintenance is
> required to ensure that data is cleaned from the D-caches out to the PoC
> (the point in the memory system at which non-cacheable accesses can see
> the same data), such that the CPU can see the images rather than stale
> data once translation is disabled.
> 
> > I see that Geoff's patches flush dcaches for 
> > certain kexec stored pages using __flush_dcache_area()
> > (in kexec_list_flush_cb()).
> > 
> > arch/arm64/include/asm/cacheflush.h says following.
> > 
> >  *      __flush_dcache_area(kaddr, size)
> >  *
> >  *              Ensure that the data held in page is written back.
> >  *              - kaddr  - page address
> >  *              - size   - region size
> > 
> > So looks like we are trying to write back anything which we will access
> > after switching off MMU. If that's the case, I have two questions.
> > 
> > - Why do we need to writeback that cacheline. After switching off MMU,
> >   will we not access same cacheline. I thought caches are VIPT and tag
> >   will still remain the same (but I might easily be wrong here).
> 
> As I mention above, the initial cache flush by VA is to ensure that the
> data is visible to the CPU once translation is disabled. I'm not sure I
> follow your reasoning.

I was assuming that even after we disable translation, the CPU will still
read data from the dcache if it is available there. Looks like you are
saying that once translation is disabled, data will be read from memory,
hence it is important to flush out the dcache before disabling translation.
Did I understand it right?

Thanks
Vivek
Mark Rutland Oct. 2, 2014, 4:53 p.m. UTC | #18
> > > I see that Geoff's patches flush dcaches for 
> > > certain kexec stored pages using __flush_dcache_area()
> > > (in kexec_list_flush_cb()).
> > > 
> > > arch/arm64/include/asm/cacheflush.h says following.
> > > 
> > >  *      __flush_dcache_area(kaddr, size)
> > >  *
> > >  *              Ensure that the data held in page is written back.
> > >  *              - kaddr  - page address
> > >  *              - size   - region size
> > > 
> > > So looks like we are trying to write back anything which we will access
> > > after switching off MMU. If that's the case, I have two questions.
> > > 
> > > - Why do we need to writeback that cacheline. After switching off MMU,
> > >   will we not access same cacheline. I thought caches are VIPT and tag
> > >   will still remain the same (but I might easily be wrong here).
> > 
> > As I mention above, the initial cache flush by VA is to ensure that the
> > data is visible to the CPU once translation is disabled. I'm not sure I
> > follow your reasoning.
> 
> I was assuming that even after we disable translations, cpu will still
> read data from dcache if it is available there. Looks like you are
> saying that once translation is disabled, data will be read from memory
> hence it is important to flush out dcache before disabling translation.
> Did I understand it right?

I believe you did.

When translation is disabled (i.e. SCTLR_ELx.M == 0), data accesses are
assigned Device-nGnRnE attributes regardless of whether the caches are
enabled (i.e. SCTLR_ELx.C == 1), and bypass the cache hierarchy. So
accesses to memory will go straight to PoC (essentially memory), and
won't hit in any cache.

However, instruction accesses are more complicated. They are always
assigned Normal memory attributes, and if the I-caches are enabled (i.e.
SCTLR_ELx.I == 1) they are cacheable regardless of whether translation
is enabled. So I-cache maintenance may be required when translation is
disabled.
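
Concretely, the bits in question are SCTLR_EL1.M (bit 0), SCTLR_EL1.C
(bit 2) and SCTLR_EL1.I (bit 12); turning translation and the D-cache off
comes down to something like the sketch below (the actual sequence lives
in the cpu_reset path in arch/arm64/mm/proc.S and differs in detail):

	mrs	x0, sctlr_el1
	bic	x0, x0, #(1 << 0)	/* M = 0: translation off */
	bic	x0, x0, #(1 << 2)	/* C = 0: data accesses non-cacheable */
	msr	sctlr_el1, x0
	isb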

Thanks,
Mark.
Geoff Levand Oct. 3, 2014, 6:35 p.m. UTC | #19
Hi Vivek,

On Wed, 2014-10-01 at 10:56 -0400, Vivek Goyal wrote:
> On Tue, Sep 30, 2014 at 12:54:37PM -0700, Geoff Levand wrote:
> 
> [..]
> > > > +{
> > > > +	switch (flag) {
> > > > +	case IND_INDIRECTION:
> > > > +	case IND_SOURCE:
> > > > +		__flush_dcache_area(addr, PAGE_SIZE);
> > > > +		break;
> > > 
> > > So what does __flush_dcache_area() do? Flush data caches. IIUC, addr
> > > is virtual address at this point of time. While copying pages and
> > > walking through the list, I am assuming you have switched off page
> > > tables and you are in some kind of 1:1 physical mode. So how did
> > > flushing data caches related to a virtual address help. I guess we
> > > are not even accessing that virtual address now.
> > 
> > __flush_dcache_area(), and the underling aarch64 civac instruction
> > operate on virtual addresses.  Here we are still running with the
> > MMU on and the identity mapping has not yet been enabled.  This is
> > the sequence:
> > 
> >   flush dcache -> turn off MMU, dcache -> access memory (PoC) directly 
> 
> Sorry, I don't understand that why do we need to flush dcache for source
> and indirection page addresses. Some information here will help.

I think Mark answered this.  The architecture requires us to flush to
the point of coherency (PoC) anything that will be used after the
dcache is disabled.

For more info you can look at section 'D4.4.7 Cache maintenance
operations' in the ARMv8 Reference Manual, which you can get from here
(after registering):

  http://infocenter.arm.com/help/topic/com.arm.doc.subset.architecture.reference/index.html

-Geoff
Vivek Goyal Oct. 7, 2014, 1:44 p.m. UTC | #20
On Fri, Oct 03, 2014 at 11:35:03AM -0700, Geoff Levand wrote:
> Hi Vivek,
> 
> On Wed, 2014-10-01 at 10:56 -0400, Vivek Goyal wrote:
> > On Tue, Sep 30, 2014 at 12:54:37PM -0700, Geoff Levand wrote:
> > 
> > [..]
> > > > > +{
> > > > > +	switch (flag) {
> > > > > +	case IND_INDIRECTION:
> > > > > +	case IND_SOURCE:
> > > > > +		__flush_dcache_area(addr, PAGE_SIZE);
> > > > > +		break;
> > > > 
> > > > So what does __flush_dcache_area() do? Flush data caches. IIUC, addr
> > > > is virtual address at this point of time. While copying pages and
> > > > walking through the list, I am assuming you have switched off page
> > > > tables and you are in some kind of 1:1 physical mode. So how did
> > > > flushing data caches related to a virtual address help. I guess we
> > > > are not even accessing that virtual address now.
> > > 
> > > __flush_dcache_area(), and the underling aarch64 civac instruction
> > > operate on virtual addresses.  Here we are still running with the
> > > MMU on and the identity mapping has not yet been enabled.  This is
> > > the sequence:
> > > 
> > >   flush dcache -> turn off MMU, dcache -> access memory (PoC) directly 
> > 
> > Sorry, I don't understand that why do we need to flush dcache for source
> > and indirection page addresses. Some information here will help.
> 
> I think Mark answered this.  The architecture requires us to flush to
> the point of coherency (PoC) anything that will be used after the
> dcache is disabled.
> 
> For more info you can look at the section'D4.4.7 Cache maintenance
> operations' in the ARMv8 Reference Manual you can get from here (after
> registering):
> 
>   http://infocenter.arm.com/help/topic/com.arm.doc.subset.architecture.reference/index.html

Geoff,

So, as Mark and I discussed the need for purgatory code in other mails, are
you planning to enable purgatory on arm64?

Thanks
Vivek
Geoff Levand Oct. 7, 2014, 6:42 p.m. UTC | #21
Hi Vivek,

On Tue, 2014-10-07 at 09:44 -0400, Vivek Goyal wrote:
> So as Mark and I discussed need of purgatory code in other mails, are you
> plannign to enable purgatory on arm64.

Adding purgatory code to arm64 is low priority, and I currently
have no plan to do that.  Users are asking for kdump, and proper
UEFI support, so that is what I will work towards.

-Geoff
Vivek Goyal Oct. 7, 2014, 6:45 p.m. UTC | #22
On Tue, Oct 07, 2014 at 11:42:00AM -0700, Geoff Levand wrote:
> Hi Vivek,
> 
> On Tue, 2014-10-07 at 09:44 -0400, Vivek Goyal wrote:
> > So as Mark and I discussed need of purgatory code in other mails, are you
> > plannign to enable purgatory on arm64.
> 
> Adding purgatory code to arm64 is low priority, and I currently
> have no plan to do that.  Users are asking for kdump, and proper
> UEFI support, so that is what I will work towards.

I think having purgatory enabled is very important here, as in the kernel
you are hardcoding that one of the segments is the DTB and doing all the
magic tricks w.r.t. putting a magic number. As an interface that seems
bad. So I think at least we will have to fix the purgatory part of it.

Thanks
Vivek
Vivek Goyal Oct. 7, 2014, 6:48 p.m. UTC | #23
On Tue, Oct 07, 2014 at 11:42:00AM -0700, Geoff Levand wrote:
> Hi Vivek,
> 
> On Tue, 2014-10-07 at 09:44 -0400, Vivek Goyal wrote:
> > So as Mark and I discussed need of purgatory code in other mails, are you
> > plannign to enable purgatory on arm64.
> 
> Adding purgatory code to arm64 is low priority, and I currently
> have no plan to do that.  Users are asking for kdump, and proper
> UEFI support, so that is what I will work towards.

kdump will make use of purgatory too. And there does not seem to be
any logic which verifies the checksums of the loaded segments, which reduces
the reliability of the kdump operation.

So I think fixing the purgatory part here is a must. It leads to better
design as well as improved reliability for kexec/kdump operations.
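
To illustrate what I mean, a rough sketch only (the struct, function names and
the toy checksum below are made up for illustration; the real kexec-tools
purgatory records sha256 digests at load time):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical bookkeeping the loader would fill in for each segment. */
struct seg_digest {
	const void *start;	/* where the segment was placed */
	size_t len;		/* segment length in bytes */
	uint32_t sum;		/* checksum recorded at load time */
};

/* Toy rotate-and-xor checksum standing in for a real digest. */
static uint32_t toy_sum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len--)
		sum = (sum << 1) ^ (sum >> 31) ^ *p++;
	return sum;
}

/* Return 0 only if every loaded segment still matches its recorded sum;
 * purgatory would refuse to jump to the new kernel otherwise. */
static int verify_segments(const struct seg_digest *d, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		if (toy_sum(d[i].start, d[i].len) != d[i].sum)
			return -1;
	return 0;
}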

Thanks
Vivek
Geoff Levand Oct. 7, 2014, 8:12 p.m. UTC | #24
Hi Vivek,

On Tue, 2014-10-07 at 14:45 -0400, Vivek Goyal wrote:
> On Tue, Oct 07, 2014 at 11:42:00AM -0700, Geoff Levand wrote:
> > Adding purgatory code to arm64 is low priority, and I currently
> > have no plan to do that.  Users are asking for kdump, and proper
> > UEFI support, so that is what I will work towards.
> 
> I think having purgatory enabled is very important here as in kernel
> you are hardcoding that one of the segments is DTB and doing all the
> magic tricks w.r.t putting a magic number. 

I don't argue that having purgatory code could be useful, but as of
now, enabling the other features is what I'll work towards.
 
Regarding the device tree magic number, I'm wondering if you missed
that the device tree has a header, and that header has a magic
number.  See here:

  http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/scripts/dtc/libfdt/fdt.h#n6
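
That is, the first 32-bit big-endian word of a flattened device tree is the
magic value 0xd00dfeed (OF_DT_HEADER), which is all the kexec_is_dtb() helper
in this patch looks at.  A minimal userspace sketch of the same check
(illustrative only, not code from the patch):

#include <endian.h>	/* be32toh(), glibc */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define FDT_MAGIC	0xd00dfeedU	/* same value as OF_DT_HEADER */

static bool looks_like_dtb(const void *buf, size_t len)
{
	uint32_t be_magic;

	if (len < sizeof(be_magic))
		return false;
	memcpy(&be_magic, buf, sizeof(be_magic));	/* avoid alignment assumptions */
	return be32toh(be_magic) == FDT_MAGIC;
}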

-Geoff
Vivek Goyal Oct. 7, 2014, 8:22 p.m. UTC | #25
On Tue, Oct 07, 2014 at 01:12:57PM -0700, Geoff Levand wrote:
> Hi Vivek,
> 
> On Tue, 2014-10-07 at 14:45 -0400, Vivek Goyal wrote:
> > On Tue, Oct 07, 2014 at 11:42:00AM -0700, Geoff Levand wrote:
> > > Adding purgatory code to arm64 is low priority, and I currently
> > > have no plan to do that.  Users are asking for kdump, and proper
> > > UEFI support, so that is what I will work towards.
> > 
> > I think having purgatory enabled is very important here as in kernel
> > you are hardcoding that one of the segments is DTB and doing all the
> > magic tricks w.r.t putting a magic number. 
> 
> I don't argue that having purgatory code could be useful, but as of
> now, enabling the other features is what I'll work towards.
>  
> Regarding the device tree magic number, I'm wondering if you missed
> that the device tree has a header, and that header has a magic
> number.  See here:
> 
>   http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/scripts/dtc/libfdt/fdt.h#n6
> 

The problem is that if you once put code in the kernel which does something
which purgatory ought to do, you will never be able to remove it, for
backward compatibility reasons. Older versions of kexec-tools will
continue to rely on it. Also, how would the kernel know that purgatory
will now take care of this and that the kernel does not have to worry
about it? So it is a good idea to integrate the purgatory support
from the very beginning.

Also, verifying the checksums of the loaded segments before jumping to that
kernel is a must-have feature.

Thanks
Vivek
Mark Rutland Oct. 8, 2014, 9:28 a.m. UTC | #26
On Tue, Oct 07, 2014 at 09:12:57PM +0100, Geoff Levand wrote:
> Hi Vivek,
> 
> On Tue, 2014-10-07 at 14:45 -0400, Vivek Goyal wrote:
> > On Tue, Oct 07, 2014 at 11:42:00AM -0700, Geoff Levand wrote:
> > > Adding purgatory code to arm64 is low priority, and I currently
> > > have no plan to do that.  Users are asking for kdump, and proper
> > > UEFI support, so that is what I will work towards.
> > 
> > I think having purgatory enabled is very important here as in kernel
> > you are hardcoding that one of the segments is DTB and doing all the
> > magic tricks w.r.t putting a magic number. 
> 
> I don't argue that having purgatory code could be useful, but as of
> now, enabling the other features is what I'll work towards.
>  
> Regarding the device tree magic number, I'm wondering if you missed
> that the device tree has a header, and that header has a magic
> number.  See here:
> 
>   http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/scripts/dtc/libfdt/fdt.h#n6

As I mentioned elsewhere, regardless of whether we can detect that a
segment is a DTB, we're still encoding the policy of what to do with
it within the kernel, and there are cases for which that is problematic.

Leaving at least the register setup to the caller's discretion (by way
of a userspace provided purgatory) avoids those problems entirely.

kexec_file_load is more complicated in that regard due to the less
general interface provided.

Mark.
Geoff Levand Oct. 9, 2014, 10:26 p.m. UTC | #27
Hi Vivek,

On Tue, 2014-10-07 at 16:22 -0400, Vivek Goyal wrote:
> Problem is this that if you put code in kernel once which does something
> which purgatory ought to do, you will never be able to remove it for
> backward compatibility reasons. Older versions of kexec-tools will 
> continue to rely on it. Also how in kernel you would know that now
> purgatory will take care of this and kernel does not have to worry
> about something. So it is a good idea to integrate the purgatory support
> from the very beginning.

I agree with you.  I will add a purgatory.

-Geoff
Geoff Levand Oct. 23, 2014, 11:08 p.m. UTC | #28
Hi Vivek,

On Thu, 2014-10-09 at 15:26 -0700, Geoff Levand wrote:
> On Tue, 2014-10-07 at 16:22 -0400, Vivek Goyal wrote:
> > Problem is this that if you put code in kernel once which does something
> > which purgatory ought to do, you will never be able to remove it for
> > backward compatibility reasons. Older versions of kexec-tools will 
> > continue to rely on it. Also how in kernel you would know that now
> > purgatory will take care of this and kernel does not have to worry
> > about something. So it is a good idea to integrate the purgatory support
> > from the very beginning.
> 
> I agree with you.  I will add a purgatory.

I added a purgatory stage to kexec-tools, and verified it works
as expected.  My current kernel patches were generic enough to
support a purgatory stage without any additional changes.  I did
make some minor changes to the kernel comments so they use the
language of loading an 'image' instead of a 'kernel'.

I'll post a V5 kernel series.  Please review and reply with your
ack if you find the patches acceptable.

-Geoff
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f0d3a2d..af03449 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -313,6 +313,15 @@  config ARCH_HAS_CACHE_LINE_SIZE
 
 source "mm/Kconfig"
 
+config KEXEC
+	depends on (!SMP || PM_SLEEP_SMP)
+	bool "kexec system call"
+	---help---
+	  kexec is a system call that implements the ability to shutdown your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but it is independent of the system firmware.   And like a reboot
+	  you can start any kernel with it, not just Linux.
+
 config XEN_DOM0
 	def_bool y
 	depends on XEN
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
new file mode 100644
index 0000000..e7bd7ab
--- /dev/null
+++ b/arch/arm64/include/asm/kexec.h
@@ -0,0 +1,47 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if !defined(_ARM64_KEXEC_H)
+#define _ARM64_KEXEC_H
+
+/* Maximum physical address we can use pages from */
+
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+#define KEXEC_CONTROL_PAGE_SIZE	4096
+
+#define KEXEC_ARCH KEXEC_ARCH_ARM64
+
+#if !defined(__ASSEMBLY__)
+
+/**
+ * crash_setup_regs() - save registers for the panic kernel
+ *
+ * @newregs: registers are saved here
+ * @oldregs: registers to be saved (may be %NULL)
+ */
+
+static inline void crash_setup_regs(struct pt_regs *newregs,
+				    struct pt_regs *oldregs)
+{
+	/* Empty routine needed to avoid build errors. */
+}
+
+#endif /* !defined(__ASSEMBLY__) */
+
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 6e9538c..77a7351 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -30,6 +30,7 @@  arm64-obj-$(CONFIG_CPU_IDLE)		+= cpuidle.o
 arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
 arm64-obj-$(CONFIG_KGDB)		+= kgdb.o
 arm64-obj-$(CONFIG_EFI)			+= efi.o efi-stub.o efi-entry.o
+arm64-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 
 obj-y					+= $(arm64-obj-y) vdso/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
new file mode 100644
index 0000000..22d185c
--- /dev/null
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -0,0 +1,183 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_fdt.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/system_misc.h>
+
+/* Global variables for the relocate_kernel routine. */
+
+extern const unsigned char relocate_new_kernel[];
+extern const unsigned long relocate_new_kernel_size;
+extern unsigned long kexec_dtb_addr;
+extern unsigned long kexec_kimage_head;
+extern unsigned long kexec_kimage_start;
+
+/**
+ * kexec_list_walk - Helper to walk the kimage page list.
+ */
+
+static void kexec_list_walk(void *ctx, unsigned long kimage_head,
+	void (*cb)(void *ctx, unsigned int flag, void *addr, void *dest))
+{
+	void *dest;
+	unsigned long *entry;
+
+	for (entry = &kimage_head, dest = NULL; ; entry++) {
+		unsigned int flag = *entry &
+			(IND_DESTINATION | IND_INDIRECTION | IND_DONE |
+			IND_SOURCE);
+		void *addr = phys_to_virt(*entry & PAGE_MASK);
+
+		switch (flag) {
+		case IND_INDIRECTION:
+			entry = (unsigned long *)addr - 1;
+			cb(ctx, flag, addr, NULL);
+			break;
+		case IND_DESTINATION:
+			dest = addr;
+			cb(ctx, flag, addr, NULL);
+			break;
+		case IND_SOURCE:
+			cb(ctx, flag, addr, dest);
+			dest += PAGE_SIZE;
+			break;
+		case IND_DONE:
+			cb(ctx, flag, NULL, NULL);
+			return;
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * kexec_is_dtb - Helper routine to check the device tree header signature.
+ */
+
+static bool kexec_is_dtb(const void *dtb)
+{
+	__be32 magic;
+
+	return get_user(magic, (__be32 *)dtb) ? false :
+		(be32_to_cpu(magic) == OF_DT_HEADER);
+}
+
+/**
+ * kexec_find_dtb_seg - Helper routine to find the dtb segment.
+ */
+
+static const struct kexec_segment *kexec_find_dtb_seg(
+	const struct kimage *image)
+{
+	int i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		if (kexec_is_dtb(image->segment[i].buf))
+			return &image->segment[i];
+	}
+
+	return NULL;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	/* Empty routine needed to avoid build errors. */
+}
+
+/**
+ * machine_kexec_prepare - Prepare for a kexec reboot.
+ *
+ * Called from the core kexec code when a kernel image is loaded.
+ */
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	const struct kexec_segment *dtb_seg;
+
+	dtb_seg = kexec_find_dtb_seg(image);
+
+	if (!dtb_seg)
+		return -EINVAL;
+
+	kexec_dtb_addr = dtb_seg->mem;
+	kexec_kimage_start = image->start;
+
+	return 0;
+}
+
+/**
+ * kexec_list_flush_cb - Callback to flush the kimage list to PoC.
+ */
+
+static void kexec_list_flush_cb(void *ctx, unsigned int flag,
+	void *addr, void *dest)
+{
+	switch (flag) {
+	case IND_INDIRECTION:
+	case IND_SOURCE:
+		__flush_dcache_area(addr, PAGE_SIZE);
+		break;
+	default:
+		break;
+	}
+}
+
+/**
+ * machine_kexec - Do the kexec reboot.
+ *
+ * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC.
+ */
+
+void machine_kexec(struct kimage *image)
+{
+	phys_addr_t reboot_code_buffer_phys;
+	void *reboot_code_buffer;
+
+	BUG_ON(num_online_cpus() > 1);
+
+	kexec_kimage_head = image->head;
+
+	reboot_code_buffer_phys = page_to_phys(image->control_code_page);
+	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
+
+	/*
+	 * Copy relocate_new_kernel to the reboot_code_buffer for use
+	 * after the kernel is shut down.
+	 */
+
+	memcpy(reboot_code_buffer, relocate_new_kernel,
+		relocate_new_kernel_size);
+
+	/* Flush the reboot_code_buffer in preparation for its execution. */
+
+	__flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
+
+	/* Flush the kimage list. */
+
+	kexec_list_walk(NULL, image->head, kexec_list_flush_cb);
+
+	pr_info("Bye!\n");
+
+	/* Disable all DAIF exceptions. */
+
+	asm volatile ("msr daifset, #0xf" : : : "memory");
+
+	soft_restart(reboot_code_buffer_phys);
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* Empty routine needed to avoid build errors. */
+}
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
new file mode 100644
index 0000000..07ce41a
--- /dev/null
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -0,0 +1,183 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+#include <asm/proc-macros.S>
+
+/* The list entry flags. */
+
+#define IND_DESTINATION_BIT 0
+#define IND_INDIRECTION_BIT 1
+#define IND_DONE_BIT        2
+#define IND_SOURCE_BIT      3
+
+/*
+ * relocate_new_kernel - Put the 2nd stage kernel image in place and boot it.
+ *
+ * The memory that the old kernel occupies may be overwritten when copying the
+ * new kernel to its final location.  To ensure that the relocate_new_kernel
+ * routine which does that copy is not overwritten, all code and data needed
+ * by relocate_new_kernel must be between the symbols relocate_new_kernel and
+ * relocate_new_kernel_end.  The machine_kexec() routine will copy
+ * relocate_new_kernel to the kexec control_code_page, a special page which
+ * has been set up to be preserved during the kernel copy operation.
+ */
+
+.globl relocate_new_kernel
+relocate_new_kernel:
+
+	/* Setup the list loop variables. */
+
+	ldr	x18, kexec_kimage_head		/* x18 = list entry */
+	dcache_line_size x17, x0		/* x17 = dcache line size */
+	mov	x16, xzr			/* x16 = segment start */
+	mov	x15, xzr			/* x15 = entry ptr */
+	mov	x14, xzr			/* x14 = copy dest */
+
+	/* Check if the new kernel needs relocation. */
+
+	cbz	x18, .Ldone
+	tbnz	x18, IND_DONE_BIT, .Ldone
+
+.Lloop:
+	and	x13, x18, PAGE_MASK		/* x13 = addr */
+
+	/* Test the entry flags. */
+
+.Ltest_source:
+	tbz	x18, IND_SOURCE_BIT, .Ltest_indirection
+
+	/* copy_page(x20 = dest, x21 = src) */
+
+	mov x20, x14
+	mov x21, x13
+
+1:	ldp	x22, x23, [x21]
+	ldp	x24, x25, [x21, #16]
+	ldp	x26, x27, [x21, #32]
+	ldp	x28, x29, [x21, #48]
+	add	x21, x21, #64
+	stnp	x22, x23, [x20]
+	stnp	x24, x25, [x20, #16]
+	stnp	x26, x27, [x20, #32]
+	stnp	x28, x29, [x20, #48]
+	add	x20, x20, #64
+	tst	x21, #(PAGE_SIZE - 1)
+	b.ne	1b
+
+	/* dest += PAGE_SIZE */
+
+	add	x14, x14, PAGE_SIZE
+	b	.Lnext
+
+.Ltest_indirection:
+	tbz	x18, IND_INDIRECTION_BIT, .Ltest_destination
+
+	/* ptr = addr */
+
+	mov	x15, x13
+	b	.Lnext
+
+.Ltest_destination:
+	tbz	x18, IND_DESTINATION_BIT, .Lnext
+
+	/* flush segment */
+
+	bl	.Lflush
+	mov	x16, x13
+
+	/* dest = addr */
+
+	mov	x14, x13
+
+.Lnext:
+	/* entry = *ptr++ */
+
+	ldr	x18, [x15], #8
+
+	/* while (!(entry & DONE)) */
+
+	tbz	x18, IND_DONE_BIT, .Lloop
+
+.Ldone:
+	/* flush last segment */
+
+	bl	.Lflush
+
+	dsb	sy
+	isb
+	ic	ialluis
+	dsb	sy
+	isb
+
+	/* start_new_kernel */
+
+	ldr	x4, kexec_kimage_start
+	ldr	x0, kexec_dtb_addr
+	mov	x1, xzr
+	mov	x2, xzr
+	mov	x3, xzr
+	br	x4
+
+/* flush - x17 = line size, x16 = start addr, x14 = end addr. */
+
+.Lflush:
+	cbz	x16, 2f
+	mov	x0, x16
+	sub	x1, x17, #1
+	bic	x0, x0, x1
+1:	dc	civac, x0
+	add	x0, x0, x17
+	cmp	x0, x14
+	b.lo	1b
+2:	ret
+
+.align 3	/* To keep the 64-bit values below naturally aligned. */
+
+/* The machine_kexec routines set these variables. */
+
+/*
+ * kexec_dtb_addr - Physical address of the new kernel's device tree.
+ */
+
+.globl kexec_dtb_addr
+kexec_dtb_addr:
+	.quad	0x0
+
+/*
+ * kexec_kimage_head - Copy of image->head, the list of kimage entries.
+ */
+
+.globl kexec_kimage_head
+kexec_kimage_head:
+	.quad	0x0
+
+/*
+ * kexec_kimage_start - Copy of image->start, the entry point of the new kernel.
+ */
+
+.globl kexec_kimage_start
+kexec_kimage_start:
+	.quad	0x0
+
+.Lrelocate_new_kernel_end:
+
+/*
+ * relocate_new_kernel_size - Number of bytes to copy to the control_code_page.
+ */
+
+.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.quad .Lrelocate_new_kernel_end - relocate_new_kernel
+
+.org	KEXEC_CONTROL_PAGE_SIZE
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 6925f5b..04626b9 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -39,6 +39,7 @@ 
 #define KEXEC_ARCH_SH      (42 << 16)
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
+#define KEXEC_ARCH_ARM64   (183 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16