diff mbox

[v2,1/2] arm64: implement FTRACE_WITH_REGS

Message ID 20160627151717.BD25468D26@newverein.lst.de (mailing list archive)
State New, archived
Headers show

Commit Message

Torsten Duwe June 27, 2016, 3:17 p.m. UTC
Once gcc is enhanced to optionally generate NOPs at the beginning
of each function, like the concept proven in
https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
(sans the "fprintf (... pad_size);", which spoils the data structure
for kernel use), the generated pads can nicely be used to reroute
function calls for tracing/profiling, or live patching.

The pads look like
fffffc00081335f0 <hrtimer_init>:
fffffc00081335f0:       d503201f        nop
fffffc00081335f4:       d503201f        nop
fffffc00081335f8:       a9bd7bfd        stp     x29, x30, [sp,#-48]!
fffffc00081335fc:       910003fd        mov     x29, sp
[...]

This patch gets the pad locations from the compiler-generated
__prolog_pads_loc into the _mcount_loc array, and provides the
code patching functions to turn the pads at runtime into

fffffc00081335f0     mov     x9, x30
fffffc00081335f4     bl      0xfffffc00080a08c0 <ftrace_caller>
fffffc00081335f8     stp     x29, x30, [sp,#-48]!
fffffc00081335fc     mov     x29, sp

as well as an ftrace_caller that can handle these call sites.
Now ARCH_SUPPORTS_FTRACE_OPS comes as a benefit, and the graph caller
still works, too.

Signed-off-by: Li Bin <huawei.libin@huawei.com>
Signed-off-by: Torsten Duwe <duwe@suse.de>
---
 arch/arm64/Kconfig                |  1 +
 arch/arm64/Makefile               |  4 ++
 arch/arm64/include/asm/ftrace.h   |  8 ++++
 arch/arm64/kernel/Makefile        |  6 +--
 arch/arm64/kernel/entry-ftrace.S  | 89 +++++++++++++++++++++++++++++++++++++++
 arch/arm64/kernel/ftrace.c        | 43 +++++++++++++++++--
 include/asm-generic/vmlinux.lds.h |  2 +-
 include/linux/compiler.h          |  4 ++
 8 files changed, 150 insertions(+), 7 deletions(-)

Comments

Josh Poimboeuf July 1, 2016, 12:53 p.m. UTC | #1
On Mon, Jun 27, 2016 at 05:17:17PM +0200, Torsten Duwe wrote:
> Once gcc is enhanced to optionally generate NOPs at the beginning
> of each function, like the concept proven in
> https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> (sans the "fprintf (... pad_size);", which spoils the data structure
> for kernel use), the generated pads can nicely be used to reroute
> function calls for tracing/profiling, or live patching.
> 
> The pads look like
> fffffc00081335f0 <hrtimer_init>:
> fffffc00081335f0:       d503201f        nop
> fffffc00081335f4:       d503201f        nop
> fffffc00081335f8:       a9bd7bfd        stp     x29, x30, [sp,#-48]!
> fffffc00081335fc:       910003fd        mov     x29, sp
> [...]
> 
> This patch gets the pad locations from the compiler-generated
> __prolog_pads_loc into the _mcount_loc array, and provides the
> code patching functions to turn the pads at runtime into
> 
> fffffc00081335f0     mov     x9, x30
> fffffc00081335f4     bl      0xfffffc00080a08c0 <ftrace_caller>
> fffffc00081335f8     stp     x29, x30, [sp,#-48]!
> fffffc00081335fc     mov     x29, sp
> 
> as well as an ftrace_caller that can handle these call sites.
> Now ARCH_SUPPORTS_FTRACE_OPS comes as a benefit, and the graph caller
> still works, too.
> 
> Signed-off-by: Li Bin <huawei.libin@huawei.com>
> Signed-off-by: Torsten Duwe <duwe@suse.de>
> ---
>  arch/arm64/Kconfig                |  1 +
>  arch/arm64/Makefile               |  4 ++
>  arch/arm64/include/asm/ftrace.h   |  8 ++++
>  arch/arm64/kernel/Makefile        |  6 +--
>  arch/arm64/kernel/entry-ftrace.S  | 89 +++++++++++++++++++++++++++++++++++++++
>  arch/arm64/kernel/ftrace.c        | 43 +++++++++++++++++--
>  include/asm-generic/vmlinux.lds.h |  2 +-
>  include/linux/compiler.h          |  4 ++
>  8 files changed, 150 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 5a0a691..36a0e26 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -72,6 +72,7 @@ config ARM64
>  	select HAVE_DMA_API_DEBUG
>  	select HAVE_DMA_CONTIGUOUS
>  	select HAVE_DYNAMIC_FTRACE
> +	select HAVE_DYNAMIC_FTRACE_WITH_REGS
>  	select HAVE_EFFICIENT_UNALIGNED_ACCESS
>  	select HAVE_FTRACE_MCOUNT_RECORD
>  	select HAVE_FUNCTION_TRACER
> diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
> index 648a32c..e5e335c 100644
> --- a/arch/arm64/Makefile
> +++ b/arch/arm64/Makefile
> @@ -35,6 +35,10 @@ KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables
>  KBUILD_CFLAGS	+= $(call cc-option, -mpc-relative-literal-loads)
>  KBUILD_AFLAGS	+= $(lseinstr)
>  
> +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS), y)
> +CC_FLAGS_FTRACE := -fprolog-pad=2 -DCC_USING_PROLOG_PAD
> +endif
> +

It would probably be good to print a warning for older gccs which don't
support this option, so that when the build fails, there's at least a
warning to indicate why.  Something like:

  ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
    CC_FLAGS_FTRACE := -fprolog-pad=2 -DCC_USING_PROLOG_PAD
    ifeq ($(call cc-option,-fprolog-pad=2),)
      $(warning Cannot use CONFIG_DYNAMIC_FTRACE_WITH_REGS: \
               -fprolog-pad not supported by compiler)
    endif
  endif
kernel test robot July 3, 2016, 5:17 a.m. UTC | #2
Hi,

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on v4.7-rc5 next-20160701]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Torsten-Duwe/arm64-live-patching/20160627-232728
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm64-allyesconfig (attached as .config)
compiler: aarch64-linux-gnu-gcc (Debian 5.3.1-8) 5.3.1 20160205
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm64 

All errors (new ones prefixed by >>):

   Makefile:687: Cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler
>> aarch64-linux-gnu-gcc: error: unrecognized command line option '-fprolog-pad=2'
   make[2]: *** [kernel/bounds.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Torsten Duwe July 4, 2016, 9:18 a.m. UTC | #3
On Fri, Jul 01, 2016 at 07:53:44AM -0500, Josh Poimboeuf wrote:
> On Mon, Jun 27, 2016 at 05:17:17PM +0200, Torsten Duwe wrote:
> > Once gcc is enhanced to optionally generate NOPs at the beginning
> > of each function, like the concept proven in
> > https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> > (sans the "fprintf (... pad_size);", which spoils the data structure
> > for kernel use), the generated pads can nicely be used to reroute
> > function calls for tracing/profiling, or live patching.

[...]

> > @@ -35,6 +35,10 @@ KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables
> >  KBUILD_CFLAGS	+= $(call cc-option, -mpc-relative-literal-loads)
> >  KBUILD_AFLAGS	+= $(lseinstr)
> >  
> > +ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS), y)
> > +CC_FLAGS_FTRACE := -fprolog-pad=2 -DCC_USING_PROLOG_PAD
> > +endif
> > +
> 
> It would probably be good to print a warning for older gccs which don't
> support this option, so that when the build fails, there's at least a
> warning to indicate why.  Something like:
> 
>   ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
>     CC_FLAGS_FTRACE := -fprolog-pad=2 -DCC_USING_PROLOG_PAD
>     ifeq ($(call cc-option,-fprolog-pad=2),)
>       $(warning Cannot use CONFIG_DYNAMIC_FTRACE_WITH_REGS: \
>                -fprolog-pad not supported by compiler)
>     endif
>   endif

Yes. Ideally, compiler support could be checked even before the option is
offered, but your explicit warning is better than just failing obscurely.

What do you think about prolog-pad in general? If we can convince the
gcc people to include it, it could become the default mechanism for all
architectures that do not require special treatment (e.g. like ABIv2
dual entry on ppc64le).

	Torsten
Petr Mladek July 8, 2016, 2:58 p.m. UTC | #4
On Mon 2016-06-27 17:17:17, Torsten Duwe wrote:
> Once gcc is enhanced to optionally generate NOPs at the beginning
> of each function, like the concept proven in
> https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> (sans the "fprintf (... pad_size);", which spoils the data structure
> for kernel use), the generated pads can nicely be used to reroute
> function calls for tracing/profiling, or live patching.
> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> index ebecf9a..917065c 100644
> --- a/arch/arm64/kernel/ftrace.c
> +++ b/arch/arm64/kernel/ftrace.c
> @@ -39,6 +39,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
>  		if (aarch64_insn_read((void *)pc, &replaced))
>  			return -EFAULT;
>  
> +		/* If we already have what we'll finally want,
> +		 * report success. This is needed on startup.
> +		 */
> +		if (replaced == new)
> +			return 0;

This looks strange. I wonder if it actually hides a real bug that we
modify the code twice or so.

I wanted to try it myself but I haven't succeeded with creating an ARM test
system yet.

Best Regards,
Petr
Torsten Duwe July 8, 2016, 3:07 p.m. UTC | #5
On Fri, Jul 08, 2016 at 04:58:00PM +0200, Petr Mladek wrote:
> On Mon 2016-06-27 17:17:17, Torsten Duwe wrote:
> > Once gcc is enhanced to optionally generate NOPs at the beginning
> > of each function, like the concept proven in
> > https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> > (sans the "fprintf (... pad_size);", which spoils the data structure
> > for kernel use), the generated pads can nicely be used to reroute
> > function calls for tracing/profiling, or live patching.
> > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > index ebecf9a..917065c 100644
> > --- a/arch/arm64/kernel/ftrace.c
> > +++ b/arch/arm64/kernel/ftrace.c
> > @@ -39,6 +39,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
> >  		if (aarch64_insn_read((void *)pc, &replaced))
> >  			return -EFAULT;
> >  
> > +		/* If we already have what we'll finally want,
> > +		 * report success. This is needed on startup.
> > +		 */
> > +		if (replaced == new)
> > +			return 0;
> 
> This looks strange. I wonder if it actually hides a real bug that we
> modify the code twice or so.

Not at all. All "profilers" we abused so far generate code that needs to
be disabled on boot first. prolog-pad generates nops, initially.

	Torsten
Petr Mladek July 8, 2016, 3:24 p.m. UTC | #6
On Fri 2016-07-08 17:07:09, Torsten Duwe wrote:
> On Fri, Jul 08, 2016 at 04:58:00PM +0200, Petr Mladek wrote:
> > On Mon 2016-06-27 17:17:17, Torsten Duwe wrote:
> > > Once gcc is enhanced to optionally generate NOPs at the beginning
> > > of each function, like the concept proven in
> > > https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> > > (sans the "fprintf (... pad_size);", which spoils the data structure
> > > for kernel use), the generated pads can nicely be used to reroute
> > > function calls for tracing/profiling, or live patching.
> > > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > > index ebecf9a..917065c 100644
> > > --- a/arch/arm64/kernel/ftrace.c
> > > +++ b/arch/arm64/kernel/ftrace.c
> > > @@ -39,6 +39,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
> > >  		if (aarch64_insn_read((void *)pc, &replaced))
> > >  			return -EFAULT;
> > >  
> > > +		/* If we already have what we'll finally want,
> > > +		 * report success. This is needed on startup.
> > > +		 */
> > > +		if (replaced == new)
> > > +			return 0;
> > 
> > This looks strange. I wonder if it actually hides a real bug that we
> > modify the code twice or so.
> 
> Not at all. All "profilers" we abused so far generate code that needs to
> be disabled on boot first. prolog-pad generates nops, initially.

Yeah, but I cannot find this kind of check in other architectures.
I checked arch/x86/kernel/ftrace.c, arch/s390/kernel/ftrace.c, and
arch/powerpc/kernel/ftrace.c. These all support ftrace with
regs and livepatching.

Best Regards,
Petr
Josh Poimboeuf July 8, 2016, 3:48 p.m. UTC | #7
On Fri, Jul 08, 2016 at 05:24:21PM +0200, Petr Mladek wrote:
> On Fri 2016-07-08 17:07:09, Torsten Duwe wrote:
> > On Fri, Jul 08, 2016 at 04:58:00PM +0200, Petr Mladek wrote:
> > > On Mon 2016-06-27 17:17:17, Torsten Duwe wrote:
> > > > Once gcc is enhanced to optionally generate NOPs at the beginning
> > > > of each function, like the concept proven in
> > > > https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> > > > (sans the "fprintf (... pad_size);", which spoils the data structure
> > > > for kernel use), the generated pads can nicely be used to reroute
> > > > function calls for tracing/profiling, or live patching.
> > > > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > > > index ebecf9a..917065c 100644
> > > > --- a/arch/arm64/kernel/ftrace.c
> > > > +++ b/arch/arm64/kernel/ftrace.c
> > > > @@ -39,6 +39,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
> > > >  		if (aarch64_insn_read((void *)pc, &replaced))
> > > >  			return -EFAULT;
> > > >  
> > > > +		/* If we already have what we'll finally want,
> > > > +		 * report success. This is needed on startup.
> > > > +		 */
> > > > +		if (replaced == new)
> > > > +			return 0;
> > > 
> > > This looks strange. I wonder if it actually hides a real bug that we
> > > modify the code twice or so.
> > 
> > Not at all. All "profilers" we abused so far generate code that needs to
> > be disabled on boot first. prolog-pad generates nops, initially.
> 
> Yeah, but I cannot find this kind of check in other architectures.
> I checked arch/x86/kernel/ftrace.c, arch/s390/kernel/ftrace.c, and
> arch/powerpc/kernel/ftrace.c. These all support ftrace with
> regs and livepatching.

My understanding is that other arches don't need this check because they
use -mfentry, so they have to modify the "call fentry" instruction to a
nop on startup.

Here, with -fprolog-pad, it's already a nop, so no change is needed.
Steven Rostedt July 8, 2016, 3:49 p.m. UTC | #8
On Fri, 8 Jul 2016 17:24:21 +0200
Petr Mladek <pmladek@suse.com> wrote:

> On Fri 2016-07-08 17:07:09, Torsten Duwe wrote:
> > On Fri, Jul 08, 2016 at 04:58:00PM +0200, Petr Mladek wrote:  
> > > On Mon 2016-06-27 17:17:17, Torsten Duwe wrote:  
> > > > Once gcc is enhanced to optionally generate NOPs at the beginning
> > > > of each function, like the concept proven in
> > > > https://gcc.gnu.org/ml/gcc-patches/2016-04/msg01671.html
> > > > (sans the "fprintf (... pad_size);", which spoils the data structure
> > > > for kernel use), the generated pads can nicely be used to reroute
> > > > function calls for tracing/profiling, or live patching.
> > > > diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
> > > > index ebecf9a..917065c 100644
> > > > --- a/arch/arm64/kernel/ftrace.c
> > > > +++ b/arch/arm64/kernel/ftrace.c
> > > > @@ -39,6 +39,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
> > > >  		if (aarch64_insn_read((void *)pc, &replaced))
> > > >  			return -EFAULT;
> > > >  
> > > > +		/* If we already have what we'll finally want,
> > > > +		 * report success. This is needed on startup.
> > > > +		 */
> > > > +		if (replaced == new)
> > > > +			return 0;  
> > > 
> > > This looks strange. I wonder if it actually hides a real bug that we
> > > modify the code twice or so.  
> > 
> > Not at all. All "profilers" we abused so far generate code that needs to
> > be disabled on boot first. prolog-pad generates nops, initially.  
> 
> Yeah, but I cannot find this kind of check in other architectures.
> I checked arch/x86/kernel/ftrace.c, arch/s390/kernel/ftrace.c, and
> arch/powerpc/kernel/ftrace.c. These all support ftrace with
> regs and livepatching.

I guess the question is, with this approach, there's no call to mcount
or fentry at compile time? Just nops are added? In this case perhaps the
if statement should be more defined:

	/*
	 * On boot, with the prologue code, the code will already
	 * be a nop.
	 */
	if (replaced == new && new == NOP)
		return 0;

And perhaps you can even pass in addr and check if it equals the nop
address. Maybe even not call this code then? That is, if addr ==
MCOUNT_ADDR passed in by ftrace_code_disable() have ftrace_make_nop()
simple return 0 without doing anything.

-- Steve
Steven Rostedt July 8, 2016, 3:57 p.m. UTC | #9
On Fri, 8 Jul 2016 10:48:24 -0500
Josh Poimboeuf <jpoimboe@redhat.com> wrote:


> My understanding is that other arches don't need this check because they
> use -mfentry, so they have to modify the "call fentry" instruction to a
> nop on startup.
> 
> Here, with -fprolog-pad, it's already a nop, so no change is needed.
> 

That's what I was thinking. But as I stated in another email (probably
in the air when you wrote this), the call to ftrace_modify_code() may be
completely circumvented by ftrace_make_nop() if the addr is MCOUNT_ADDR.

-- Steve
Torsten Duwe July 8, 2016, 8:24 p.m. UTC | #10
On Fri, Jul 08, 2016 at 11:57:10AM -0400, Steven Rostedt wrote:
> On Fri, 8 Jul 2016 10:48:24 -0500
> Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > 
> > Here, with -fprolog-pad, it's already a nop, so no change is needed.
> > 

Yes, exactly.

> That's what I was thinking. But as I stated in another email (probably
> in the air when you wrote this), the call to ftrace_modify_code() may be
> completely circumvented by ftrace_make_nop() if the addr is MCOUNT_ADDR.

Only on the _first_ invocation. Later on, tracing can be switched on and off,
and then the instructions need to be changed just like with fentry (or
profile-kernel ;-)

	Torsten
Steven Rostedt July 8, 2016, 9:08 p.m. UTC | #11
On Fri, 8 Jul 2016 22:24:55 +0200
Torsten Duwe <duwe@lst.de> wrote:

> On Fri, Jul 08, 2016 at 11:57:10AM -0400, Steven Rostedt wrote:
> > On Fri, 8 Jul 2016 10:48:24 -0500
> > Josh Poimboeuf <jpoimboe@redhat.com> wrote:  
> > > 
> > > Here, with -fprolog-pad, it's already a nop, so no change is needed.
> > >   
> 
> Yes, exactly.
> 
> > That's what I was thinking. But as I stated in another email (probably
> > in the air when you wrote this), the call to ftrace_modify_code() may be
> > completely circumvented by ftrace_make_nop() if the addr is MCOUNT_ADDR.  
> 
> Only on the _first_ invocation. Later on, tracing can be switched on and off,
> and then the instructions need to be changed just like with fentry (or
> profile-kernel ;-)
> 

Understood, but ftrace_modify_code() will only receive addr ==
MCOUNT_ADDR on boot up or when a module is loaded. In both cases, with
-fprolog-pad it will already be a nop, hence no need to call
ftrace_modify_code(), in those cases.

In all other cases, addr will point to a ftrace trampoline.

-- Steve
diff mbox

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5a0a691..36a0e26 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -72,6 +72,7 @@  config ARM64
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 648a32c..e5e335c 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -35,6 +35,10 @@  KBUILD_CFLAGS	+= -fno-asynchronous-unwind-tables
 KBUILD_CFLAGS	+= $(call cc-option, -mpc-relative-literal-loads)
 KBUILD_AFLAGS	+= $(lseinstr)
 
+ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS), y)
+CC_FLAGS_FTRACE := -fprolog-pad=2 -DCC_USING_PROLOG_PAD
+endif
+
 ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
 KBUILD_CPPFLAGS	+= -mbig-endian
 AS		+= -EB
diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index caa955f..a569666 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -16,6 +16,14 @@ 
 #define MCOUNT_ADDR		((unsigned long)_mcount)
 #define MCOUNT_INSN_SIZE	AARCH64_INSN_SIZE
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#define REC_IP_BRANCH_OFFSET 4
+#define FTRACE_REGS_ADDR FTRACE_ADDR
+#else
+#define REC_IP_BRANCH_OFFSET 0
+#endif
+
 #ifndef __ASSEMBLY__
 #include <linux/compat.h>
 
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2173149..c26f3f8 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -6,9 +6,9 @@  CPPFLAGS_vmlinux.lds	:= -DTEXT_OFFSET=$(TEXT_OFFSET)
 AFLAGS_head.o		:= -DTEXT_OFFSET=$(TEXT_OFFSET)
 CFLAGS_armv8_deprecated.o := -I$(src)
 
-CFLAGS_REMOVE_ftrace.o = -pg
-CFLAGS_REMOVE_insn.o = -pg
-CFLAGS_REMOVE_return_address.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_insn.o = -pg $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_return_address.o = -pg $(CC_FLAGS_FTRACE)
 
 # Object file lists.
 arm64-obj-y		:= debug-monitors.o entry.o irq.o fpsimd.o		\
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 0f03a8f..3ebe791 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -12,6 +12,8 @@ 
 #include <linux/linkage.h>
 #include <asm/ftrace.h>
 #include <asm/insn.h>
+#include <asm/asm-offsets.h>
+#include <asm/assembler.h>
 
 /*
  * Gcc with -pg will put the following code in the beginning of each function:
@@ -132,6 +134,7 @@  skip_ftrace_call:
 ENDPROC(_mcount)
 
 #else /* CONFIG_DYNAMIC_FTRACE */
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 /*
  * _mcount() is used to build the kernel with -pg option, but all the branch
  * instructions to _mcount() are replaced to NOP initially at kernel start up,
@@ -171,6 +174,84 @@  ftrace_graph_call:			// ftrace_graph_caller();
 
 	mcount_exit
 ENDPROC(ftrace_caller)
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+ENTRY(_mcount)
+	mov     x10, lr
+	mov     lr, x9
+	ret     x10
+ENDPROC(_mcount)
+
+ENTRY(ftrace_caller)
+	stp	x29, x9, [sp, #-16]!
+	sub	sp, sp, #S_FRAME_SIZE
+
+	stp	x0, x1, [sp]
+	stp	x2, x3, [sp, #16]
+	stp	x4, x5, [sp, #32]
+	stp	x6, x7, [sp, #48]
+	stp	x8, x9, [sp, #64]
+	stp	x10, x11, [sp, #80]
+	stp	x12, x13, [sp, #96]
+	stp	x14, x15, [sp, #112]
+	stp	x16, x17, [sp, #128]
+	stp	x18, x19, [sp, #144]
+	stp	x20, x21, [sp, #160]
+	stp	x22, x23, [sp, #176]
+	stp	x24, x25, [sp, #192]
+	stp	x26, x27, [sp, #208]
+	stp	x28, x29, [sp, #224]
+	/* The link Register at callee entry */
+	str	x9, [sp, #S_LR]
+	/* The program counter just after the ftrace call site */
+	str	lr, [sp, #S_PC]
+	/* The stack pointer as it was on ftrace_caller entry... */
+	add	x29, sp, #S_FRAME_SIZE+16	/* ...is also our new FP */
+	str	x29, [sp, #S_SP]
+
+	adrp    x0, function_trace_op
+	ldr     x2, [x0, #:lo12:function_trace_op]
+	mov	x1, x9		/* saved LR == parent IP */
+	sub	x0, lr, #8	/* prolog pad start == IP */
+	mov	x3, sp		/* complete pt_regs are @sp */
+
+	.global ftrace_call
+ftrace_call:
+
+	bl	ftrace_stub
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	.global ftrace_graph_call
+ftrace_graph_call:			// ftrace_graph_caller();
+	nop				// If enabled, this will be replaced
+					// "b ftrace_graph_caller"
+#endif
+
+ftrace_regs_return:
+	ldp	x0, x1, [sp]
+	ldp	x2, x3, [sp, #16]
+	ldp	x4, x5, [sp, #32]
+	ldp	x6, x7, [sp, #48]
+	ldp	x8, x9, [sp, #64]
+	ldp	x10, x11, [sp, #80]
+	ldp	x12, x13, [sp, #96]
+	ldp	x14, x15, [sp, #112]
+	ldp	x16, x17, [sp, #128]
+	ldp	x18, x19, [sp, #144]
+	ldp	x20, x21, [sp, #160]
+	ldp	x22, x23, [sp, #176]
+	ldp	x24, x25, [sp, #192]
+	ldp	x26, x27, [sp, #208]
+	ldp	x28, x29, [sp, #224]
+
+	ldr	x9, [sp, #S_PC]
+	ldr	lr, [sp, #S_LR]
+	add	sp, sp, #S_FRAME_SIZE+16
+
+	ret	x9
+
+ENDPROC(ftrace_caller)
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(ftrace_stub)
@@ -206,12 +287,20 @@  ENDPROC(ftrace_stub)
  * and run return_to_handler() later on its exit.
  */
 ENTRY(ftrace_graph_caller)
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 	mcount_get_lr_addr	  x0	//     pointer to function's saved lr
 	mcount_get_pc		  x1	//     function's pc
 	mcount_get_parent_fp	  x2	//     parent's fp
 	bl	prepare_ftrace_return	// prepare_ftrace_return(&lr, pc, fp)
 
 	mcount_exit
+#else
+	add	x0, sp, #S_LR	/* address of (LR pointing into caller) */
+	ldr	x1, [sp, #S_PC]
+	ldr	x2, [sp, #232]	/* caller's frame pointer */
+	bl	prepare_ftrace_return
+	b	ftrace_regs_return
+#endif
 ENDPROC(ftrace_graph_caller)
 
 /*
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index ebecf9a..917065c 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -39,6 +39,12 @@  static int ftrace_modify_code(unsigned long pc, u32 old, u32 new,
 		if (aarch64_insn_read((void *)pc, &replaced))
 			return -EFAULT;
 
+		/* If we already have what we'll finally want,
+		 * report success. This is needed on startup.
+		 */
+		if (replaced == new)
+			return 0;
+
 		if (replaced != old)
 			return -EINVAL;
 	}
@@ -68,28 +74,59 @@  int ftrace_update_ftrace_func(ftrace_func_t func)
  */
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned long pc = rec->ip;
+	unsigned long pc = rec->ip+REC_IP_BRANCH_OFFSET;
+	int ret;
 	u32 old, new;
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 	old = aarch64_insn_gen_nop();
+	new = 0xaa1e03e9;	/* mov x9,x30 */
+	ret = ftrace_modify_code(pc-REC_IP_BRANCH_OFFSET, old, new, true);
+	if (ret)
+		return ret;
+	smp_wmb();
+#endif
 	new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
 
 	return ftrace_modify_code(pc, old, new, true);
 }
 
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+		unsigned long addr)
+{
+	unsigned long pc = rec->ip+REC_IP_BRANCH_OFFSET;
+	u32 old, new;
+
+	old = aarch64_insn_gen_branch_imm(pc, old_addr, true);
+	new = aarch64_insn_gen_branch_imm(pc, addr, true);
+
+	return ftrace_modify_code(pc, old, new, true);
+}
+
 /*
  * Turn off the call to ftrace_caller() in instrumented function
  */
 int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
 		    unsigned long addr)
 {
-	unsigned long pc = rec->ip;
+	unsigned long pc = rec->ip+REC_IP_BRANCH_OFFSET;
 	u32 old, new;
+	int ret;
+
 
 	old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
 	new = aarch64_insn_gen_nop();
 
-	return ftrace_modify_code(pc, old, new, true);
+	ret = ftrace_modify_code(pc, old, new, true);
+	if (ret)
+		return ret;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	smp_wmb();
+	old = 0xaa1e03e9;	/* mov x9,x30 */
+	new = aarch64_insn_gen_nop();
+	ret = ftrace_modify_code(pc-REC_IP_BRANCH_OFFSET, old, new, true);
+#endif
+	return ret;
 }
 
 void arch_ftrace_update_code(int command)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 6a67ab9..66a72b9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -89,7 +89,7 @@ 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 #define MCOUNT_REC()	. = ALIGN(8);				\
 			VMLINUX_SYMBOL(__start_mcount_loc) = .; \
-			*(__mcount_loc)				\
+			*(__mcount_loc) *(__prolog_pads_loc)	\
 			VMLINUX_SYMBOL(__stop_mcount_loc) = .;
 #else
 #define MCOUNT_REC()
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 793c082..46289c2 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -63,8 +63,12 @@  extern void __chk_io_ptr(const volatile void __iomem *);
 #if defined(CC_USING_HOTPATCH) && !defined(__CHECKER__)
 #define notrace __attribute__((hotpatch(0,0)))
 #else
+#ifdef CC_USING_PROLOG_PAD
+#define notrace __attribute__((prolog_pad(0)))
+#else
 #define notrace __attribute__((no_instrument_function))
 #endif
+#endif
 
 /* Intel compiler defines __GNUC__. So we will overwrite implementations
  * coming from above header files here