diff mbox

[08/10] arm64/kexec: Add core kexec support

Message ID c8e8a3378f028e54e21931bc7f3d62c48725d5c7.1414099246.git.geoff@infradead.org (mailing list archive)
State New, archived
Headers show

Commit Message

Geoff Levand Oct. 23, 2014, 11:10 p.m. UTC
Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to the
arm64 architecture that add support for the kexec re-boot mechanism
(CONFIG_KEXEC) on arm64 platforms.

Signed-off-by: Geoff Levand <geoff@infradead.org>
---
 arch/arm64/Kconfig                  |   9 ++
 arch/arm64/include/asm/kexec.h      |  47 +++++++++
 arch/arm64/kernel/Makefile          |   1 +
 arch/arm64/kernel/machine_kexec.c   | 169 +++++++++++++++++++++++++++++++++
 arch/arm64/kernel/relocate_kernel.S | 184 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/kexec.h          |   1 +
 6 files changed, 411 insertions(+)
 create mode 100644 arch/arm64/include/asm/kexec.h
 create mode 100644 arch/arm64/kernel/machine_kexec.c
 create mode 100644 arch/arm64/kernel/relocate_kernel.S

Comments

Mark Rutland Oct. 24, 2014, 10:28 a.m. UTC | #1
[Adding Vivek to Cc]

Hi Geoff,

This is looking rather good now. My only major concerns with this patch
are the DTB handling (which I think should be left to the userspace
purgatory), and the incompatibility with KVM (which is in defconfig
currently).

Otherwise, there are just a couple of minor fixups I'd like to see
below.

On Fri, Oct 24, 2014 at 12:10:58AM +0100, Geoff Levand wrote:
> Add three new files, kexec.h, machine_kexec.c and relocate_kernel.S to the
> arm64 architecture that add support for the kexec re-boot mechanism
> (CONFIG_KEXEC) on arm64 platforms.
>
> Signed-off-by: Geoff Levand <geoff@infradead.org>
> ---
>  arch/arm64/Kconfig                  |   9 ++
>  arch/arm64/include/asm/kexec.h      |  47 +++++++++
>  arch/arm64/kernel/Makefile          |   1 +
>  arch/arm64/kernel/machine_kexec.c   | 169 +++++++++++++++++++++++++++++++++
>  arch/arm64/kernel/relocate_kernel.S | 184 ++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/kexec.h          |   1 +
>  6 files changed, 411 insertions(+)
>  create mode 100644 arch/arm64/include/asm/kexec.h
>  create mode 100644 arch/arm64/kernel/machine_kexec.c
>  create mode 100644 arch/arm64/kernel/relocate_kernel.S
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index f0d3a2d..af03449 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE
>
>  source "mm/Kconfig"
>
> +config KEXEC
> +       depends on (!SMP || PM_SLEEP_SMP)

In its current state this also depends on !KVM && !EFI (technically you
could detect those cases at runtime, but I don't see that in this
series).

> +       bool "kexec system call"
> +       ---help---
> +         kexec is a system call that implements the ability to shutdown your
> +         current kernel, and to start another kernel.  It is like a reboot
> +         but it is independent of the system firmware.   And like a reboot
> +         you can start any kernel with it, not just Linux.
> +
>  config XEN_DOM0
>         def_bool y
>         depends on XEN
> diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
> new file mode 100644
> index 0000000..e7bd7ab
> --- /dev/null
> +++ b/arch/arm64/include/asm/kexec.h
> @@ -0,0 +1,47 @@
> +/*
> + * kexec for arm64
> + *
> + * Copyright (C) Linaro.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#if !defined(_ARM64_KEXEC_H)
> +#define _ARM64_KEXEC_H
> +
> +/* Maximum physical address we can use pages from */
> +
> +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
> +
> +/* Maximum address we can reach in physical address mode */
> +
> +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
> +
> +/* Maximum address we can use for the control code buffer */
> +
> +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
> +
> +#define KEXEC_CONTROL_PAGE_SIZE        4096
> +
> +#define KEXEC_ARCH KEXEC_ARCH_ARM64
> +
> +#if !defined(__ASSEMBLY__)
> +
> +/**
> + * crash_setup_regs() - save registers for the panic kernel
> + *
> + * @newregs: registers are saved here
> + * @oldregs: registers to be saved (may be %NULL)
> + */
> +
> +static inline void crash_setup_regs(struct pt_regs *newregs,
> +                                   struct pt_regs *oldregs)
> +{
> +       /* Empty routine needed to avoid build errors. */
> +}
> +
> +#endif /* !defined(__ASSEMBLY__) */
> +
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 6e9538c..77a7351 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -30,6 +30,7 @@ arm64-obj-$(CONFIG_CPU_IDLE)          += cpuidle.o
>  arm64-obj-$(CONFIG_JUMP_LABEL)         += jump_label.o
>  arm64-obj-$(CONFIG_KGDB)               += kgdb.o
>  arm64-obj-$(CONFIG_EFI)                        += efi.o efi-stub.o efi-entry.o
> +arm64-obj-$(CONFIG_KEXEC)              += machine_kexec.o relocate_kernel.o
>
>  obj-y                                  += $(arm64-obj-y) vdso/
>  obj-m                                  += $(arm64-obj-m)
> diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
> new file mode 100644
> index 0000000..95bc8d9
> --- /dev/null
> +++ b/arch/arm64/kernel/machine_kexec.c
> @@ -0,0 +1,169 @@
> +/*
> + * kexec for arm64
> + *
> + * Copyright (C) Linaro.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kexec.h>
> +#include <linux/of_fdt.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/cacheflush.h>
> +#include <asm/system_misc.h>
> +
> +/* Global variables for the relocate_kernel routine. */
> +
> +extern const unsigned char relocate_new_kernel[];
> +extern const unsigned long relocate_new_kernel_size;
> +extern unsigned long arm64_kexec_dtb_addr;
> +extern unsigned long arm64_kexec_kimage_head;
> +extern unsigned long arm64_kexec_kimage_start;
> +
> +/**
> + * kexec_is_dtb - Helper routine to check the device tree header signature.
> + */
> +
> +static bool kexec_is_dtb(const void *dtb)
> +{
> +       __be32 magic;
> +
> +       return get_user(magic, (__be32 *)dtb) ? false :
> +               (be32_to_cpu(magic) == OF_DT_HEADER);
> +}
> +
> +/**
> + * kexec_find_dtb_seg - Helper routine to find the dtb segment.
> + */
> +
> +static const struct kexec_segment *kexec_find_dtb_seg(
> +       const struct kimage *image)
> +{
> +       int i;
> +
> +       for (i = 0; i < image->nr_segments; i++) {
> +               if (kexec_is_dtb(image->segment[i].buf))
> +                       return &image->segment[i];
> +       }
> +
> +       return NULL;
> +}
> +
> +void machine_kexec_cleanup(struct kimage *image)
> +{
> +       /* Empty routine needed to avoid build errors. */
> +}
> +
> +/**
> + * machine_kexec_prepare - Prepare for a kexec reboot.
> + *
> + * Called from the core kexec code when a kernel image is loaded.
> + */
> +
> +int machine_kexec_prepare(struct kimage *image)
> +{
> +       const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image);
> +
> +       if (!dtb_seg)
> +               pr_warn("%s: No device tree segment found.\n", __func__);
> +
> +       arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0;
> +       arm64_kexec_kimage_start = image->start;
> +
> +       return 0;
> +}

I thought all of the DTB handling was moving to purgatory?

> +
> +/**
> + * kexec_list_flush - Helper to flush the kimage list to PoC.
> + */
> +
> +static void kexec_list_flush(unsigned long kimage_head)
> +{
> +       void *dest;
> +       unsigned long *entry;
> +
> +       for (entry = &kimage_head, dest = NULL; ; entry++) {
> +               unsigned int flag = *entry &
> +                       (IND_DESTINATION | IND_INDIRECTION | IND_DONE |
> +                       IND_SOURCE);
> +               void *addr = phys_to_virt(*entry & PAGE_MASK);
> +
> +               switch (flag) {
> +               case IND_INDIRECTION:
> +                       entry = (unsigned long *)addr - 1;
> +                       __flush_dcache_area(addr, PAGE_SIZE);
> +                       break;
> +               case IND_DESTINATION:
> +                       dest = addr;
> +                       break;
> +               case IND_SOURCE:
> +                       __flush_dcache_area(addr, PAGE_SIZE);
> +                       dest += PAGE_SIZE;
> +                       break;
> +               case IND_DONE:
> +                       return;
> +               default:
> +                       break;

Can an image ever have no flags? Given the presence of IND_NONE I'd
assume not, so this looks like a candidate for a BUG().

> +               }
> +       }
> +}
> +
> +/**
> + * machine_kexec - Do the kexec reboot.
> + *
> + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC.
> + */
> +
> +void machine_kexec(struct kimage *image)
> +{
> +       phys_addr_t reboot_code_buffer_phys;
> +       void *reboot_code_buffer;
> +
> +       BUG_ON(num_online_cpus() > 1);
> +
> +       arm64_kexec_kimage_head = image->head;
> +
> +       reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> +       reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> +
> +       /*
> +        * Copy relocate_new_kernel to the reboot_code_buffer for use
> +        * after the kernel is shut down.
> +        */
> +
> +       memcpy(reboot_code_buffer, relocate_new_kernel,
> +               relocate_new_kernel_size);

Can we get rid of the line gaps between comments and the single function
calls they apply to, please? I realise it's a minor thing, but this
looks rather inconsistent with the rest of arch/arm64/.

> +
> +       /* Flush the reboot_code_buffer in preparation for its execution. */
> +
> +       __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);

That code should already be at the PoC per the boot protocol (the entire
kernel image should have been clean to the PoC at boot, so the
instructions forming relocate_new_kernel are globally visible).

From the looks of it you only need to flush the variables at the very
end.

> +
> +       /* Flush the kimage list. */
> +
> +       kexec_list_flush(image->head);
> +
> +       pr_info("Bye!\n");
> +
> +       /* Disable all DAIF exceptions. */
> +
> +       asm volatile ("msr daifset, #0xf" : : : "memory");
> +
> +       /*
> +        * soft_restart() will shutdown the MMU, disable data caches, then
> +        * transfer control to the reboot_code_buffer which contains a copy of
> +        * the relocate_new_kernel routine.  relocate_new_kernel will use
> +        * physical addressing to relocate the new kernel to its final position
> +        * and then will transfer control to the entry point of the new kernel.
> +        */
> +
> +       soft_restart(reboot_code_buffer_phys);

As mentioned above, either this needs to depend on !KVM, or this will
blow up.

> +}
> +
> +void machine_crash_shutdown(struct pt_regs *regs)
> +{
> +       /* Empty routine needed to avoid build errors. */
> +}
> diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
> new file mode 100644
> index 0000000..49cf9a0
> --- /dev/null
> +++ b/arch/arm64/kernel/relocate_kernel.S
> @@ -0,0 +1,184 @@
> +/*
> + * kexec for arm64
> + *
> + * Copyright (C) Linaro.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <asm/assembler.h>
> +#include <asm/kexec.h>
> +#include <asm/memory.h>
> +#include <asm/page.h>
> +#include <asm/proc-macros.S>
> +
> +/* The list entry flags. */
> +
> +#define IND_DESTINATION_BIT 0
> +#define IND_INDIRECTION_BIT 1
> +#define IND_DONE_BIT        2
> +#define IND_SOURCE_BIT      3

As previously, I think these need to be moved into a common header, and
defined in terms of the existing IND_* macros (or vice-versa). I believe
you had a patch doing so; what's the status of that?

> +
> +/*
> + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it.
> + *
> + * The memory that the old kernel occupies may be overwritten when copying the
> + * new image to its final location.  To ensure that the relocate_new_kernel
> + * routine which does that copy is not overwritten, all code and data needed
> + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> + * relocate_new_kernel to the kexec control_code_page, a special page which
> + * has been set up to be preserved during the copy operation.
> + */
> +
> +.globl relocate_new_kernel
> +relocate_new_kernel:
> +
> +       /* Setup the list loop variables. */
> +
> +       ldr     x18, arm64_kexec_kimage_head    /* x18 = list entry */
> +       dcache_line_size x17, x0                /* x17 = dcache line size */
> +       mov     x16, xzr                        /* x16 = segment start */
> +       mov     x15, xzr                        /* x15 = entry ptr */
> +       mov     x14, xzr                        /* x14 = copy dest */
> +
> +       /* Check if the new image needs relocation. */
> +
> +       cbz     x18, .Ldone
> +       tbnz    x18, IND_DONE_BIT, .Ldone
> +
> +.Lloop:
> +       and     x13, x18, PAGE_MASK             /* x13 = addr */
> +
> +       /* Test the entry flags. */
> +
> +.Ltest_source:
> +       tbz     x18, IND_SOURCE_BIT, .Ltest_indirection
> +
> +       /* copy_page(x20 = dest, x21 = src) */
> +
> +       mov x20, x14
> +       mov x21, x13
> +
> +1:     ldp     x22, x23, [x21]
> +       ldp     x24, x25, [x21, #16]
> +       ldp     x26, x27, [x21, #32]
> +       ldp     x28, x29, [x21, #48]
> +       add     x21, x21, #64
> +       stnp    x22, x23, [x20]
> +       stnp    x24, x25, [x20, #16]
> +       stnp    x26, x27, [x20, #32]
> +       stnp    x28, x29, [x20, #48]
> +       add     x20, x20, #64
> +       tst     x21, #(PAGE_SIZE - 1)
> +       b.ne    1b
> +
> +       /* dest += PAGE_SIZE */
> +
> +       add     x14, x14, PAGE_SIZE
> +       b       .Lnext
> +
> +.Ltest_indirection:
> +       tbz     x18, IND_INDIRECTION_BIT, .Ltest_destination
> +
> +       /* ptr = addr */
> +
> +       mov     x15, x13
> +       b       .Lnext
> +
> +.Ltest_destination:
> +       tbz     x18, IND_DESTINATION_BIT, .Lnext
> +
> +       /* flush segment */
> +
> +       bl      .Lflush
> +       mov     x16, x13
> +
> +       /* dest = addr */
> +
> +       mov     x14, x13
> +
> +.Lnext:
> +       /* entry = *ptr++ */
> +
> +       ldr     x18, [x15], #8
> +
> +       /* while (!(entry & DONE)) */
> +
> +       tbz     x18, IND_DONE_BIT, .Lloop
> +
> +.Ldone:
> +       /* flush last segment */
> +
> +       bl      .Lflush
> +
> +       dsb     sy
> +       isb
> +       ic      ialluis
> +       dsb     sy
> +       isb
> +
> +       /* start_new_image */
> +
> +       ldr     x4, arm64_kexec_kimage_start
> +       ldr     x0, arm64_kexec_dtb_addr
> +       mov     x1, xzr
> +       mov     x2, xzr
> +       mov     x3, xzr
> +       br      x4

This last part should be in userspace-provided purgatory. If you have
purgatory code which does this then we should be able to rely on that,
and we don't have to try to maintain this DTB handling in kernelspace
(which I suspect may become painful as the boot protocol evolves).

Thanks,
Mark.

> +
> +/* flush - x17 = line size, x16 = start addr, x14 = end addr. */
> +
> +.Lflush:
> +       cbz     x16, 2f
> +       mov     x0, x16
> +       sub     x1, x17, #1
> +       bic     x0, x0, x1
> +1:     dc      civac, x0
> +       add     x0, x0, x17
> +       cmp     x0, x14
> +       b.lo    1b
> +2:     ret
> +
> +.align 3       /* To keep the 64-bit values below naturally aligned. */
> +
> +/* The machine_kexec routines set these variables. */
> +
> +/*
> + * arm64_kexec_kimage_start - Copy of image->start, the entry point of the new
> + * image.
> + */
> +
> +.globl arm64_kexec_kimage_start
> +arm64_kexec_kimage_start:
> +       .quad   0x0
> +
> +/*
> + * arm64_kexec_dtb_addr - Physical address of a device tree.
> + */
> +
> +.globl arm64_kexec_dtb_addr
> +arm64_kexec_dtb_addr:
> +       .quad   0x0
> +
> +/*
> + * arm64_kexec_kimage_head - Copy of image->head, the list of kimage entries.
> + */
> +
> +.globl arm64_kexec_kimage_head
> +arm64_kexec_kimage_head:
> +       .quad   0x0
> +
> +.Lrelocate_new_kernel_end:
> +
> +/*
> + * relocate_new_kernel_size - Number of bytes to copy to the control_code_page.
> + */
> +
> +.globl relocate_new_kernel_size
> +relocate_new_kernel_size:
> +       .quad .Lrelocate_new_kernel_end - relocate_new_kernel
> +
> +.org   KEXEC_CONTROL_PAGE_SIZE
> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
> index 6925f5b..04626b9 100644
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
> @@ -39,6 +39,7 @@
>  #define KEXEC_ARCH_SH      (42 << 16)
>  #define KEXEC_ARCH_MIPS_LE (10 << 16)
>  #define KEXEC_ARCH_MIPS    ( 8 << 16)
> +#define KEXEC_ARCH_ARM64   (183 << 16)
>
>  /* The artificial cap on the number of segments passed to kexec_load. */
>  #define KEXEC_SEGMENT_MAX 16
> --
> 1.9.1
>
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Geoff Levand Nov. 13, 2014, 2:19 a.m. UTC | #2
Hi Mark,

On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote:
> > +++ b/arch/arm64/Kconfig
> > @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE
> >
> >  source "mm/Kconfig"
> >
> > +config KEXEC
> > +       depends on (!SMP || PM_SLEEP_SMP)
> 
> In its current state this also depends on !KVM && !EFI (technically you
> could detect those cases at runtime, but I don't see that in this
> series).

A kernel built with CONFIG_EFI is OK if run on a non-EFI system or
without using a system's EFI support.  I added a patch that adds
runtime checks in the kexec_load syscall path to print a message
and return failure for situations where KVM or EFI won't work.

> > +/**
> > + * machine_kexec_prepare - Prepare for a kexec reboot.
> > + *
> > + * Called from the core kexec code when a kernel image is loaded.
> > + */
> > +
> > +int machine_kexec_prepare(struct kimage *image)
> > +{
> > +       const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image);
> > +
> > +       if (!dtb_seg)
> > +               pr_warn("%s: No device tree segment found.\n", __func__);
> > +
> > +       arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0;
> > +       arm64_kexec_kimage_start = image->start;
> > +
> > +       return 0;
> > +}
> 
> I thought all of the DTB handling was moving to purgatory?

Non-purgatory booting is needed for kexec-lite.  We can do
this simple check here which optionally sets x0 to the dtb
address to support that.  The other solution is to have a
trampoline in kexec-lite that sets x0 (basically an absolute
minimal purgatory), but I think doing it here is nicer, and
is also the same way that the arm arch code does it.

Maybe removing this pr_warn message and just relying on the
kexec_image_info() output would be better.

> > +/**
> > + * kexec_list_flush - Helper to flush the kimage list to PoC.
> > + */
> > +
> > +static void kexec_list_flush(unsigned long kimage_head)
> > +{
> > +       void *dest;
> > +       unsigned long *entry;
> > +
> > +       for (entry = &kimage_head, dest = NULL; ; entry++) {
> > +               unsigned int flag = *entry &
> > +                       (IND_DESTINATION | IND_INDIRECTION | IND_DONE |
> > +                       IND_SOURCE);
> > +               void *addr = phys_to_virt(*entry & PAGE_MASK);
> > +
> > +               switch (flag) {
> > +               case IND_INDIRECTION:
> > +                       entry = (unsigned long *)addr - 1;
> > +                       __flush_dcache_area(addr, PAGE_SIZE);
> > +                       break;
> > +               case IND_DESTINATION:
> > +                       dest = addr;
> > +                       break;
> > +               case IND_SOURCE:
> > +                       __flush_dcache_area(addr, PAGE_SIZE);
> > +                       dest += PAGE_SIZE;
> > +                       break;
> > +               case IND_DONE:
> > +                       return;
> > +               default:
> > +                       break;
> 
> Can an image ever have no flags? Given the presence of IND_NONE I'd
> assume not, so this looks like a candidate for a BUG().

Sure, I guess things will blow up before it ever gets here though.

> 
> > +               }
> > +       }
> > +}
> > +
> > +/**
> > + * machine_kexec - Do the kexec reboot.
> > + *
> > + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC.
> > + */
> > +
> > +void machine_kexec(struct kimage *image)
> > +{
> > +       phys_addr_t reboot_code_buffer_phys;
> > +       void *reboot_code_buffer;
> > +
> > +       BUG_ON(num_online_cpus() > 1);
> > +
> > +       arm64_kexec_kimage_head = image->head;
> > +
> > +       reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> > +       reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> > +
> > +       /*
> > +        * Copy relocate_new_kernel to the reboot_code_buffer for use
> > +        * after the kernel is shut down.
> > +        */
> > +
> > +       memcpy(reboot_code_buffer, relocate_new_kernel,
> > +               relocate_new_kernel_size);
> 
> Can we get rid of the line gaps between comments and the single function
> calls they apply to, please? I realise it's a minor thing, but this
> looks rather inconsistent with the rest of arch/arm64/.

checkpatch doesn't seem to mind, but sure, I can do that.

> > +
> > +       /* Flush the reboot_code_buffer in preparation for its execution. */
> > +
> > +       __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
> 
> That code should already be at the PoC per the boot protocol (the entire
> kernel image should have been clean to the PoC at boot, so the
> instructions forming relocate_new_kernel are globally visible).
> 
> From the looks of it you only need to flush the variables at the very
> end.

We copy the relocate_new_kernel routine to reboot_code_buffer, which is
a buffer allocated by the kexec core with alloc_pages().  That copy of
relocate_new_kernel is what we are flushing here.

I believe we need to flush that buffer out to PoC so we can execute
the code it contains after cpu_soft_restart().

> > --- /dev/null
> > +++ b/arch/arm64/kernel/relocate_kernel.S
> > @@ -0,0 +1,184 @@
> > +/*
> > + * kexec for arm64
> > + *
> > + * Copyright (C) Linaro.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include <asm/assembler.h>
> > +#include <asm/kexec.h>
> > +#include <asm/memory.h>
> > +#include <asm/page.h>
> > +#include <asm/proc-macros.S>
> > +
> > +/* The list entry flags. */
> > +
> > +#define IND_DESTINATION_BIT 0
> > +#define IND_INDIRECTION_BIT 1
> > +#define IND_DONE_BIT        2
> > +#define IND_SOURCE_BIT      3
> 
> As previously, I think these need to be moved into a common header, and
> defined in terms of the existing IND_* macros (or vice-versa). I believe
> you had a patch doing so; what's the status of that?

Still working to get it merged:

  https://lkml.org/lkml/2014/11/12/675

> > +/*
> > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it.
> > + *
> > + * The memory that the old kernel occupies may be overwritten when copying the
> > + * new image to its final location.  To ensure that the relocate_new_kernel
> > + * routine which does that copy is not overwritten, all code and data needed
> > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> > + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> > + * relocate_new_kernel to the kexec control_code_page, a special page which
> > + * has been set up to be preserved during the copy operation.
> > + */
> > +
> > +.globl relocate_new_kernel
> > +relocate_new_kernel:

...

> > +
> > +       /* start_new_image */
> > +
> > +       ldr     x4, arm64_kexec_kimage_start
> > +       ldr     x0, arm64_kexec_dtb_addr
> > +       mov     x1, xzr
> > +       mov     x2, xzr
> > +       mov     x3, xzr
> > +       br      x4
> 
> This last part should be in userspace-provided purgatory. If you have
> purgatory code which does this then we should be able to rely on that,
> and we don't have to try to maintain this DTB handling in kernelspace
> (which I suspect may become painful as the boot protocol evolves).

I think putting the dtb address in x0 is already fixed.  There are
users with firmware that does this and any change to the boot protocol
will have to work with it.

As I mentioned above, we need a solution for non-purgatory re-boot and I
think this is the best way.

Thanks for taking the time to review.  I'll post an updated patch set
soon.

-Geoff
Mark Rutland Nov. 17, 2014, 4:38 p.m. UTC | #3
On Thu, Nov 13, 2014 at 02:19:48AM +0000, Geoff Levand wrote:
> Hi Mark,

Hi Geoff,

> On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote:
> > > +++ b/arch/arm64/Kconfig
> > > @@ -313,6 +313,15 @@ config ARCH_HAS_CACHE_LINE_SIZE
> > >
> > >  source "mm/Kconfig"
> > >
> > > +config KEXEC
> > > +       depends on (!SMP || PM_SLEEP_SMP)
> > 
> > In its current state this also depends on !KVM && !EFI (technically you
> > could detect those cases at runtime, but I don't see that in this
> > series).
> 
> A kernel built with CONFIG_EFI is OK if run on a non-EFI system or
> without using a system's EFI support.  I added a patch that adds
> runtime checks in the kexec_load syscall path to print a message
> and return failure for situations where KVM or EFI won't work.
> 
> > > +/**
> > > + * machine_kexec_prepare - Prepare for a kexec reboot.
> > > + *
> > > + * Called from the core kexec code when a kernel image is loaded.
> > > + */
> > > +
> > > +int machine_kexec_prepare(struct kimage *image)
> > > +{
> > > +       const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image);
> > > +
> > > +       if (!dtb_seg)
> > > +               pr_warn("%s: No device tree segment found.\n", __func__);
> > > +
> > > +       arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0;
> > > +       arm64_kexec_kimage_start = image->start;
> > > +
> > > +       return 0;
> > > +}
> > 
> > I thought all of the DTB handling was moving to purgatory?
> 
> Non-purgatory booting is needed for kexec-lite.  We can do
> this simple check here which optionally sets x0 to the dtb
> address to support that.  The other solution is to have a
> trampoline in kexec-lite that sets x0 (basically an absolute
> > minimal purgatory), but I think doing it here is nicer, and
> is also the same way that the arm arch code does it.
> 
> Maybe removing this pr_warn message and just relying on the
> kexec_image_info() output would be better.

I mentioned previously that I don't think the "kexec-lite" approach is a
good one, especially if we're going to have userspace purgatory code
anyway. It embeds a policy w.r.t. the segment handling within the
kernel, on the assumption of a specific use-case for what is a more
general mechanism.

Unfortunately secureboot with kexec_file_load will require a kernelspace
purgatory and likely special DT handling, but it's already a far more
limited interface.

> > > +/**
> > > + * kexec_list_flush - Helper to flush the kimage list to PoC.
> > > + */
> > > +
> > > +static void kexec_list_flush(unsigned long kimage_head)
> > > +{
> > > +       void *dest;
> > > +       unsigned long *entry;
> > > +
> > > +       for (entry = &kimage_head, dest = NULL; ; entry++) {
> > > +               unsigned int flag = *entry &
> > > +                       (IND_DESTINATION | IND_INDIRECTION | IND_DONE |
> > > +                       IND_SOURCE);
> > > +               void *addr = phys_to_virt(*entry & PAGE_MASK);
> > > +
> > > +               switch (flag) {
> > > +               case IND_INDIRECTION:
> > > +                       entry = (unsigned long *)addr - 1;
> > > +                       __flush_dcache_area(addr, PAGE_SIZE);
> > > +                       break;
> > > +               case IND_DESTINATION:
> > > +                       dest = addr;
> > > +                       break;
> > > +               case IND_SOURCE:
> > > +                       __flush_dcache_area(addr, PAGE_SIZE);
> > > +                       dest += PAGE_SIZE;
> > > +                       break;
> > > +               case IND_DONE:
> > > +                       return;
> > > +               default:
> > > +                       break;
> > 
> > Can an image ever have no flags? Given the presence of IND_NONE I'd
> > assume not, so this looks like a candidate for a BUG().
> 
> Sure, I guess things will blow up before it ever gets here though.

I would hope so. If we trigger the BUG() we know otherwise.

> 
> > 
> > > +               }
> > > +       }
> > > +}
> > > +
> > > +/**
> > > + * machine_kexec - Do the kexec reboot.
> > > + *
> > > + * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC.
> > > + */
> > > +
> > > +void machine_kexec(struct kimage *image)
> > > +{
> > > +       phys_addr_t reboot_code_buffer_phys;
> > > +       void *reboot_code_buffer;
> > > +
> > > +       BUG_ON(num_online_cpus() > 1);
> > > +
> > > +       arm64_kexec_kimage_head = image->head;
> > > +
> > > +       reboot_code_buffer_phys = page_to_phys(image->control_code_page);
> > > +       reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
> > > +
> > > +       /*
> > > +        * Copy relocate_new_kernel to the reboot_code_buffer for use
> > > +        * after the kernel is shut down.
> > > +        */
> > > +
> > > +       memcpy(reboot_code_buffer, relocate_new_kernel,
> > > +               relocate_new_kernel_size);
> > 
> > Can we get rid of the line gaps between comments and the single function
> > calls they apply to, please? I realise it's a minor thing, but this
> > looks rather inconsistent with the rest of arch/arm64/.
> 
> checkpatch doesn't seem to mind, but sure, I can do that.

Cheers.

> > > +
> > > +       /* Flush the reboot_code_buffer in preparation for its execution. */
> > > +
> > > +       __flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
> > 
> > That code should already be at the PoC per the boot protocol (the entire
> > kernel image should have been clean to the PoC at boot, so the
> > instructions forming relocate_new_kernel are globally visible).
> > 
> > From the looks of it you only need to flush the variables at the very
> > end.
> 
> We copy the relocate_new_kernel routine to reboot_code_buffer, which is
> a buffer allocated by the kexec core with alloc_pages().  That copy of
> relocate_new_kernel is what we are flushing here.
> 
> I believe we need to flush that buffer out to PoC so we can execute
> the code it contains after cpu_soft_restart().

Apologies. I'd evidently confused myself here regarding what was being
flushed.

> 
> > > --- /dev/null
> > > +++ b/arch/arm64/kernel/relocate_kernel.S
> > > @@ -0,0 +1,184 @@
> > > +/*
> > > + * kexec for arm64
> > > + *
> > > + * Copyright (C) Linaro.
> > > + *
> > > + * This program is free software; you can redistribute it and/or modify
> > > + * it under the terms of the GNU General Public License version 2 as
> > > + * published by the Free Software Foundation.
> > > + */
> > > +
> > > +#include <asm/assembler.h>
> > > +#include <asm/kexec.h>
> > > +#include <asm/memory.h>
> > > +#include <asm/page.h>
> > > +#include <asm/proc-macros.S>
> > > +
> > > +/* The list entry flags. */
> > > +
> > > +#define IND_DESTINATION_BIT 0
> > > +#define IND_INDIRECTION_BIT 1
> > > +#define IND_DONE_BIT        2
> > > +#define IND_SOURCE_BIT      3
> > 
> > As previously, I think these need to be moved into a common header, and
> > defined in terms of the existing IND_* macros (or vice-versa). I believe
> > you had a patch doing so; what's the status of that?
> 
> Still working to get it merged:
> 
>   https://lkml.org/lkml/2014/11/12/675

Ok. Let's hope that goes through soon.

> 
> > > +/*
> > > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it.
> > > + *
> > > + * The memory that the old kernel occupies may be overwritten when copying the
> > > + * new image to its final location.  To assure that the relocate_new_kernel
> > > + * routine which does that copy is not overwritten all code and data needed
> > > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> > > + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> > > + * relocate_new_kernel to the kexec control_code_page, a special page which
> > > + * has been set up to be preserved during the copy operation.
> > > + */
> > > +
> > > +.globl relocate_new_kernel
> > > +relocate_new_kernel:
> 
> ...
> 
> > > +
> > > +       /* start_new_image */
> > > +
> > > +       ldr     x4, arm64_kexec_kimage_start
> > > +       ldr     x0, arm64_kexec_dtb_addr
> > > +       mov     x1, xzr
> > > +       mov     x2, xzr
> > > +       mov     x3, xzr
> > > +       br      x4
> > 
> > This last part should be in userspace-provided purgatory. If you have
> > purgatory code which does this then we should be able to rely on that,
> > and we don't have to try to maintain this DTB handling in kernelspace
> > (which I suspect may become painful as the boot protocol evolves).
> 
> I think putting the dtb address in x0 is already fixed.  There are
> users with firmware that does this and any change to the boot protocol
> will have to work with it.

Sure, but that is the _Linux_ boot protocol, and the Kconfig description
of kexec states "you can start any kernel with it, not just Linux". Why
should we embed Linux-specific details into a supposedly generic
mechanism?

We may also extend the boot protocol, and I would rather not have to
manage the complexity of each possible extension within the kernel,
especially given that the only context we can pass in kexec is segments.

> As I mentioned above, we need a solution for non-purgatory re-boot and I
> think this is the best way.

Why do we need a solution for "non-purgatory re-boot"? As far as I can
see this is a non-problem.

Thanks,
Mark.
Geoff Levand Nov. 17, 2014, 8:20 p.m. UTC | #4
Hi Mark,

On Thu, Nov 13, 2014 at 02:19:48AM +0000, Geoff Levand wrote:
> > On Fri, 2014-10-24 at 11:28 +0100, Mark Rutland wrote:
> > > > +/**
> > > > + * machine_kexec_prepare - Prepare for a kexec reboot.
> > > > + *
> > > > + * Called from the core kexec code when a kernel image is loaded.
> > > > + */
> > > > +
> > > > +int machine_kexec_prepare(struct kimage *image)
> > > > +{
> > > > +       const struct kexec_segment *dtb_seg = kexec_find_dtb_seg(image);
> > > > +
> > > > +       if (!dtb_seg)
> > > > +               pr_warn("%s: No device tree segment found.\n", __func__);
> > > > +
> > > > +       arm64_kexec_dtb_addr = dtb_seg ? dtb_seg->mem : 0;
> > > > +       arm64_kexec_kimage_start = image->start;
> > > > +
> > > > +       return 0;
> > > > +}
> > > 
> > > I thought all of the DTB handling was moving to purgatory?
> > 
> > Non-purgatory booting is needed for kexec-lite.  We can do
> > this simple check here which optionally sets x0 to the dtb
> > address to support that.  The other solution is to have a
> > trampoline in kexec-lite that sets x0 (basically an absolute
> > minimal purgatory), but I think to do it here is nicer, and
> > is also the same way that the arm arch code does it.
> > 
> > Maybe removing this pr_warn message and just relying on the
> > kexec_image_info() output would be better.
> 
> I mentioned previously that I don't think the "kexec-lite" approach is a
> good one, especially if we're going to have userspace purgatory code
> anyway. It embeds a policy w.r.t. the segment handling within the
> kernel, on the assumption of a specific use-case for what is a more
> general mechanism.

I don't think this support embeds a policy.  It is completely optional.
If one of the kexec segments is found to have a dtb header at its start
the address of that segment is put into x0 so that it is available to
the code that control is passed to.  That code is free to use the value
or not.  In the case of the current kexec-tools implementation for
example, its purgatory does not use that value in x0 since the address
of the dtb is known to the purgatory code through its arm64_dtb_addr
variable. 

One motivation for kexec-lite was to avoid the complicated user
space of a purgatory when it wasn't really needed.  From what I
understand, kexec-lite is shipping to customers, so there is at least a
desire for it on other architectures which I believe are in the same
market as 64 bit ARM servers.  Also, just to mention it, the arm (32 bit)
arch provides a similar facility in its kexec kernel code, by setting
r2 to the address of the dtb, and there doesn't seem to be any concern
over that.

I can't see any negative effect of setting x0 in this way.  If a user
space loader needs or wants to do something different it is completely
free to ignore the value the 1st stage kernel has put into x0.

If the boot protocol is changed new kernels will still need to be able to
boot from old loaders, and old kernels from new loaders.  Depending on
what the protocol change introduces we can decide if it makes sense to
update this part of kexec.

If you can describe a clear situation where this would cause a problem
we should remove it.  But if the choice is between keeping support that
users want and removing it only to give kernel developers some flexibility
that may not be needed, then I think we should keep it in.

> Unfortunately secureboot with kexec_file_load will require a kernelspace
> purgatory and likely special DT handling, but it's already a far more
> limited interface.

...

> > > > +/*
> > > > + * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it.
> > > > + *
> > > > + * The memory that the old kernel occupies may be overwritten when copying the
> > > > + * new image to its final location.  To assure that the relocate_new_kernel
> > > > + * routine which does that copy is not overwritten all code and data needed
> > > > + * by relocate_new_kernel must be between the symbols relocate_new_kernel and
> > > > + * relocate_new_kernel_end.  The machine_kexec() routine will copy
> > > > + * relocate_new_kernel to the kexec control_code_page, a special page which
> > > > + * has been set up to be preserved during the copy operation.
> > > > + */
> > > > +
> > > > +.globl relocate_new_kernel
> > > > +relocate_new_kernel:
> > 
> > ...
> > 
> > > > +
> > > > +       /* start_new_image */
> > > > +
> > > > +       ldr     x4, arm64_kexec_kimage_start
> > > > +       ldr     x0, arm64_kexec_dtb_addr
> > > > +       mov     x1, xzr
> > > > +       mov     x2, xzr
> > > > +       mov     x3, xzr
> > > > +       br      x4
> > > 
> > > This last part should be in userspace-provided purgatory. If you have
> > > purgatory code which does this then we should be able to rely on that,
> > > and we don't have to try to maintain this DTB handling in kernelspace
> > > (which I suspect may become painful as the boot protocol evolves).
> > 
> > I think putting the dtb address in x0 is already fixed.  There are
> > users with firmware that does this and any change to the boot protocol
> > will have to work with it.
> 
> Sure, but that is the _Linux_ boot protocol, and the Kconfig description
> of kexec states "you can start any kernel with it, not just Linux". Why
> should we embed Linux-specific details into a supposedly generic
> mechanism?
> 
> We may also extend the boot protocol, and I would rather not have to
> manage the complexity of each possible extension within the kernel,
> especially given that the only context we can pass in kexec is segments.
> 
> > As I mentioned above, we need a solution for non-purgatory re-boot and I
> > think this is the best way.
> 
> Why do we need a solution for "non-purgatory re-boot"? As far as I can
> see this is a non-problem.

I tried to address these last concerns in my comments above.
  
-Geoff
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f0d3a2d..af03449 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -313,6 +313,15 @@  config ARCH_HAS_CACHE_LINE_SIZE
 
 source "mm/Kconfig"
 
+config KEXEC
+	depends on (!SMP || PM_SLEEP_SMP)
+	bool "kexec system call"
+	---help---
+	  kexec is a system call that implements the ability to shut down your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but it is independent of the system firmware.   And like a reboot
+	  you can start any kernel with it, not just Linux.
+
 config XEN_DOM0
 	def_bool y
 	depends on XEN
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
new file mode 100644
index 0000000..e7bd7ab
--- /dev/null
+++ b/arch/arm64/include/asm/kexec.h
@@ -0,0 +1,47 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if !defined(_ARM64_KEXEC_H)
+#define _ARM64_KEXEC_H
+
+/* Maximum physical address we can use pages from */
+
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control code buffer */
+
+#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
+
+#define KEXEC_CONTROL_PAGE_SIZE	4096
+
+#define KEXEC_ARCH KEXEC_ARCH_ARM64
+
+#if !defined(__ASSEMBLY__)
+
+/**
+ * crash_setup_regs() - placeholder for saving registers for the panic kernel
+ *
+ * @newregs: where registers would be saved (currently left untouched)
+ * @oldregs: registers to be saved (may be %NULL; currently ignored)
+ */
+
+static inline void crash_setup_regs(struct pt_regs *newregs,
+				    struct pt_regs *oldregs)
+{
+	/* Intentionally empty; defined only to avoid build errors. */
+}
+
+#endif /* !defined(__ASSEMBLY__) */
+
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 6e9538c..77a7351 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -30,6 +30,7 @@  arm64-obj-$(CONFIG_CPU_IDLE)		+= cpuidle.o
 arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
 arm64-obj-$(CONFIG_KGDB)		+= kgdb.o
 arm64-obj-$(CONFIG_EFI)			+= efi.o efi-stub.o efi-entry.o
+arm64-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 
 obj-y					+= $(arm64-obj-y) vdso/
 obj-m					+= $(arm64-obj-m)
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
new file mode 100644
index 0000000..95bc8d9
--- /dev/null
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -0,0 +1,169 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_fdt.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/system_misc.h>
+
+/* Global variables for the relocate_kernel routine. */
+
+extern const unsigned char relocate_new_kernel[];
+extern const unsigned long relocate_new_kernel_size;
+extern unsigned long arm64_kexec_dtb_addr;
+extern unsigned long arm64_kexec_kimage_head;
+extern unsigned long arm64_kexec_kimage_start;
+
+/**
+ * kexec_is_dtb - Check a user buffer for the device tree header magic.
+ */
+
+static bool kexec_is_dtb(const void __user *dtb)
+{
+	__be32 magic;
+
+	return get_user(magic, (const __be32 __user *)dtb) ? false :
+		(be32_to_cpu(magic) == OF_DT_HEADER);
+}
+
+/**
+ * kexec_find_dtb_seg - Find the first segment that holds a dtb, or NULL.
+ */
+
+static const struct kexec_segment *kexec_find_dtb_seg(
+	const struct kimage *image)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		if (kexec_is_dtb(image->segment[i].buf))
+			return &image->segment[i];
+	}
+
+	return NULL;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+	/* Intentionally empty; defined only to avoid build errors. */
+}
+
+/**
+ * machine_kexec_prepare - Record the new image's entry point and dtb address.
+ *
+ * Called by the core kexec code when a kernel image is loaded.  Returns 0.
+ */
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	const struct kexec_segment *seg = kexec_find_dtb_seg(image);
+
+	arm64_kexec_kimage_start = image->start;
+	arm64_kexec_dtb_addr = seg ? seg->mem : 0;
+
+	if (!seg)
+		pr_warn("%s: No device tree segment found.\n", __func__);
+
+	return 0;
+}
+
+/**
+ * kexec_list_flush - Helper to flush the kimage list to PoC.
+ *
+ * Walks the entry list starting at @kimage_head and flushes each indirection
+ * page and each source page; destination entries need no flushing here.
+ */
+
+static void kexec_list_flush(unsigned long kimage_head)
+{
+	unsigned long *entry;
+
+	for (entry = &kimage_head; ; entry++) {
+		unsigned int flag = *entry &
+			(IND_DESTINATION | IND_INDIRECTION | IND_DONE |
+			IND_SOURCE);
+		void *addr = phys_to_virt(*entry & PAGE_MASK);
+
+		switch (flag) {
+		case IND_INDIRECTION:
+			/* The loop's entry++ then lands on the page's first entry. */
+			entry = (unsigned long *)addr - 1;
+			__flush_dcache_area(addr, PAGE_SIZE);
+			break;
+		case IND_SOURCE:
+			__flush_dcache_area(addr, PAGE_SIZE);
+			break;
+		case IND_DONE:
+			return;
+		case IND_DESTINATION:
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * machine_kexec - Do the kexec reboot.
+ *
+ * Called from the core kexec code for a sys_reboot with LINUX_REBOOT_CMD_KEXEC.
+ */
+
+void machine_kexec(struct kimage *image)
+{
+	phys_addr_t reboot_code_buffer_phys;
+	void *reboot_code_buffer;
+
+	BUG_ON(num_online_cpus() > 1);	/* only one CPU may still be online */
+
+	arm64_kexec_kimage_head = image->head;	/* must precede the memcpy() */
+
+	reboot_code_buffer_phys = page_to_phys(image->control_code_page);
+	reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys);
+
+	/*
+	 * Copy relocate_new_kernel, with its variables as set so far, to the
+	 * reboot_code_buffer for use after the kernel is shut down.
+	 */
+
+	memcpy(reboot_code_buffer, relocate_new_kernel,
+		relocate_new_kernel_size);
+
+	/* Flush the reboot_code_buffer in preparation for its execution. */
+
+	__flush_dcache_area(reboot_code_buffer, relocate_new_kernel_size);
+
+	/* Flush the kimage list: its indirection pages and source pages. */
+
+	kexec_list_flush(image->head);
+
+	pr_info("Bye!\n");
+
+	/* Mask all DAIF exceptions (debug, SError, IRQ and FIQ). */
+
+	asm volatile ("msr daifset, #0xf" : : : "memory");
+
+	/*
+	 * soft_restart() will shut down the MMU, disable data caches, then
+	 * transfer control to the reboot_code_buffer which contains a copy of
+	 * the relocate_new_kernel routine.  relocate_new_kernel will use
+	 * physical addressing to relocate the new kernel to its final position
+	 * and then will transfer control to the entry point of the new kernel.
+	 */
+
+	soft_restart(reboot_code_buffer_phys);
+}
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	/* Intentionally empty; defined only to avoid build errors. */
+}
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
new file mode 100644
index 0000000..49cf9a0
--- /dev/null
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -0,0 +1,184 @@ 
+/*
+ * kexec for arm64
+ *
+ * Copyright (C) Linaro.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+#include <asm/kexec.h>
+#include <asm/memory.h>
+#include <asm/page.h>
+#include <asm/proc-macros.S>
+
+/* The list entry flags. */
+
+#define IND_DESTINATION_BIT 0
+#define IND_INDIRECTION_BIT 1
+#define IND_DONE_BIT        2
+#define IND_SOURCE_BIT      3
+
+/*
+ * relocate_new_kernel - Put a 2nd stage kernel image in place and boot it.
+ *
+ * The memory that the old kernel occupies may be overwritten when copying the
+ * new image to its final location.  To assure that the relocate_new_kernel
+ * routine which does that copy is not overwritten all code and data needed
+ * by relocate_new_kernel must be between the symbols relocate_new_kernel and
+ * relocate_new_kernel_end.  The machine_kexec() routine will copy
+ * relocate_new_kernel to the kexec control_code_page, a special page which
+ * has been set up to be preserved during the copy operation.
+ */
+
+.globl relocate_new_kernel
+relocate_new_kernel:
+
+	/* Set up the list loop variables. */
+
+	ldr	x18, arm64_kexec_kimage_head	/* x18 = list entry */
+	dcache_line_size x17, x0		/* x17 = dcache line size */
+	mov	x16, xzr			/* x16 = segment start, 0 = none */
+	mov	x15, xzr			/* x15 = entry ptr */
+	mov	x14, xzr			/* x14 = copy dest */
+
+	/* Skip the loop if the head entry is zero or is the done entry. */
+
+	cbz	x18, .Ldone
+	tbnz	x18, IND_DONE_BIT, .Ldone
+
+.Lloop:
+	and	x13, x18, PAGE_MASK		/* x13 = addr */
+
+	/* Test the entry flags: source, then indirection, then destination. */
+
+.Ltest_source:
+	tbz	x18, IND_SOURCE_BIT, .Ltest_indirection
+
+	/* copy_page(x20 = dest, x21 = src), 64 bytes per pass via x22-x29 */
+
+	mov x20, x14
+	mov x21, x13
+
+1:	ldp	x22, x23, [x21]
+	ldp	x24, x25, [x21, #16]
+	ldp	x26, x27, [x21, #32]
+	ldp	x28, x29, [x21, #48]
+	add	x21, x21, #64
+	stnp	x22, x23, [x20]
+	stnp	x24, x25, [x20, #16]
+	stnp	x26, x27, [x20, #32]
+	stnp	x28, x29, [x20, #48]
+	add	x20, x20, #64
+	tst	x21, #(PAGE_SIZE - 1)		/* done when src is page aligned */
+	b.ne	1b
+
+	/* dest += PAGE_SIZE */
+
+	add	x14, x14, PAGE_SIZE
+	b	.Lnext
+
+.Ltest_indirection:
+	tbz	x18, IND_INDIRECTION_BIT, .Ltest_destination
+
+	/* ptr = addr */
+
+	mov	x15, x13
+	b	.Lnext
+
+.Ltest_destination:
+	tbz	x18, IND_DESTINATION_BIT, .Lnext
+
+	/* flush the segment just copied (x16..x14), then start a new one */
+
+	bl	.Lflush
+	mov	x16, x13			/* segment start = addr */
+
+	/* dest = addr */
+
+	mov	x14, x13
+
+.Lnext:
+	/* entry = *ptr++ */
+
+	ldr	x18, [x15], #8
+
+	/* while (!(entry & DONE)) */
+
+	tbz	x18, IND_DONE_BIT, .Lloop
+
+.Ldone:
+	/* flush last segment (no-op if no segment was ever started) */
+
+	bl	.Lflush
+
+	dsb	sy
+	isb
+	ic	ialluis				/* invalidate all icache (IS) */
+	dsb	sy
+	isb
+
+	/* start_new_image: x0 = dtb address, x1-x3 = 0, jump to image start */
+
+	ldr	x4, arm64_kexec_kimage_start
+	ldr	x0, arm64_kexec_dtb_addr
+	mov	x1, xzr
+	mov	x2, xzr
+	mov	x3, xzr
+	br	x4
+
+/* flush - x17 = line size, x16 = start addr, x14 = end addr; uses x0, x1. */
+
+.Lflush:
+	cbz	x16, 2f				/* no segment started yet */
+	mov	x0, x16
+	sub	x1, x17, #1
+	bic	x0, x0, x1			/* align start down to a line */
+1:	dc	civac, x0			/* clean+invalidate line by VA */
+	add	x0, x0, x17
+	cmp	x0, x14
+	b.lo	1b				/* unsigned: loop while x0 < end */
+2:	ret
+
+.align 3	/* To keep the 64-bit values below naturally aligned. */
+
+/* Set by machine_kexec_prepare() and machine_kexec() before the copy. */
+
+/*
+ * arm64_kexec_kimage_start - Copy of image->start, the entry point of the new
+ * image.
+ */
+
+.globl arm64_kexec_kimage_start
+arm64_kexec_kimage_start:
+	.quad	0x0
+
+/*
+ * arm64_kexec_dtb_addr - Physical address of a device tree, or 0 if none.
+ */
+
+.globl arm64_kexec_dtb_addr
+arm64_kexec_dtb_addr:
+	.quad	0x0
+
+/*
+ * arm64_kexec_kimage_head - Copy of image->head, the list of kimage entries.
+ */
+
+.globl arm64_kexec_kimage_head
+arm64_kexec_kimage_head:
+	.quad	0x0
+
+.Lrelocate_new_kernel_end:
+
+/*
+ * relocate_new_kernel_size - Number of bytes to copy to the control_code_page.
+ */
+
+.globl relocate_new_kernel_size
+relocate_new_kernel_size:
+	.quad .Lrelocate_new_kernel_end - relocate_new_kernel
+
+.org	KEXEC_CONTROL_PAGE_SIZE	/* NOTE(review): presumably a size guard; confirm */
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 6925f5b..04626b9 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -39,6 +39,7 @@ 
 #define KEXEC_ARCH_SH      (42 << 16)
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
+#define KEXEC_ARCH_ARM64   (183 << 16)
 
 /* The artificial cap on the number of segments passed to kexec_load. */
 #define KEXEC_SEGMENT_MAX 16