diff mbox series

[RFC,V3,4/9] x86/pks: Preserve the PKRS MSR on context switch

Message ID 20201009194258.3207172-5-ira.weiny@intel.com (mailing list archive)
State New, archived
Headers show
Series PKS: Add Protection Keys Supervisor (PKS) support RFC v3 | expand

Commit Message

Ira Weiny Oct. 9, 2020, 7:42 p.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

The PKRS MSR is defined as a per-logical-processor register.  This
isolates memory access by logical CPU.  Unfortunately, the MSR is not
managed by XSAVE.  Therefore, tasks must save/restore the MSR value on
context switch.

Define a saved PKRS value in the task struct, as well as a cached
per-logical-processor MSR value which mirrors the MSR value of the
current CPU.  Initialize all tasks with the default MSR value.  Then, on
schedule in, check the saved task MSR vs the per-cpu value.  If
different proceed to write the MSR.  If not avoid the overhead of the
MSR write and continue.

Follow-on patches will update the saved PKRS as well as the MSR if
needed.

Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 arch/x86/include/asm/msr-index.h    |  1 +
 arch/x86/include/asm/pkeys_common.h | 20 ++++++++++++++++++++
 arch/x86/include/asm/processor.h    | 13 +++++++++++++
 arch/x86/kernel/cpu/common.c        |  2 ++
 arch/x86/kernel/process.c           | 21 +++++++++++++++++++++
 arch/x86/mm/pkeys.c                 | 28 ++++++++++++++++++++++++++++
 6 files changed, 85 insertions(+)

Comments

Dave Hansen Oct. 13, 2020, 6:31 p.m. UTC | #1
On 10/9/20 12:42 PM, ira.weiny@intel.com wrote:
> From: Ira Weiny <ira.weiny@intel.com>
> 
> The PKRS MSR is defined as a per-logical-processor register.  This
> isolates memory access by logical CPU.  Unfortunately, the MSR is not
> managed by XSAVE.  Therefore, tasks must save/restore the MSR value on
> context switch.
> 
> Define a saved PKRS value in the task struct, as well as a cached
> per-logical-processor MSR value which mirrors the MSR value of the
> current CPU.  Initialize all tasks with the default MSR value.  Then, on
> schedule in, check the saved task MSR vs the per-cpu value.  If
> different proceed to write the MSR.  If not avoid the overhead of the
> MSR write and continue.

It's probably nice to note how the WRMSR is special here, in addition to
the comments below.

>  #endif /*_ASM_X86_PKEYS_INTERNAL_H */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 97143d87994c..da2381136b2d 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -18,6 +18,7 @@ struct vm86;
>  #include <asm/cpufeatures.h>
>  #include <asm/page.h>
>  #include <asm/pgtable_types.h>
> +#include <asm/pkeys_common.h>
>  #include <asm/percpu.h>
>  #include <asm/msr.h>
>  #include <asm/desc_defs.h>
> @@ -542,6 +543,11 @@ struct thread_struct {
>  
>  	unsigned int		sig_on_uaccess_err:1;
>  
> +#ifdef	CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
> +	/* Saved Protection key register for supervisor mappings */
> +	u32			saved_pkrs;
> +#endif

Could you take a look around thread_struct and see if there are some
other MSRs near which you can stash this?  This seems like a bit of a
lonely place.

...
>  void flush_thread(void)
>  {
>  	struct task_struct *tsk = current;
> @@ -195,6 +212,8 @@ void flush_thread(void)
>  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
>  
>  	fpu__clear_all(&tsk->thread.fpu);
> +
> +	pks_init_task(tsk);
>  }
>  
>  void disable_TSC(void)
> @@ -644,6 +663,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
>  
>  	if ((tifp ^ tifn) & _TIF_SLD)
>  		switch_to_sld(tifn);
> +
> +	pks_sched_in();
>  }
>  
>  /*
> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> index 3cf8f775f36d..30f65dd3d0c5 100644
> --- a/arch/x86/mm/pkeys.c
> +++ b/arch/x86/mm/pkeys.c
> @@ -229,3 +229,31 @@ u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
>  
>  	return pk_reg;
>  }
> +
> +DEFINE_PER_CPU(u32, pkrs_cache);
> +
> +/**
> + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> + * serializing but still maintains ordering properties similar to WRPKRU.
> + * The current SDM section on PKRS needs updating but should be the same as
> + * that of WRPKRU.  So to quote from the WRPKRU text:
> + *
> + * 	WRPKRU will never execute transiently. Memory accesses
> + * 	affected by PKRU register will not execute (even transiently)
> + * 	until all prior executions of WRPKRU have completed execution
> + * 	and updated the PKRU register.
> + */
> +void write_pkrs(u32 new_pkrs)
> +{
> +	u32 *pkrs;
> +
> +	if (!static_cpu_has(X86_FEATURE_PKS))
> +		return;
> +
> +	pkrs = get_cpu_ptr(&pkrs_cache);
> +	if (*pkrs != new_pkrs) {
> +		*pkrs = new_pkrs;
> +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> +	}
> +	put_cpu_ptr(pkrs);
> +}
> 

It bugs me a *bit* that this is being called in a preempt-disabled
region, but we still bother with the get/put_cpu jazz.  Are there other
future call-sites for this that aren't in preempt-disabled regions?
Ira Weiny Oct. 14, 2020, 10:36 p.m. UTC | #2
On Tue, Oct 13, 2020 at 11:31:45AM -0700, Dave Hansen wrote:
> On 10/9/20 12:42 PM, ira.weiny@intel.com wrote:
> > From: Ira Weiny <ira.weiny@intel.com>
> > 
> > The PKRS MSR is defined as a per-logical-processor register.  This
> > isolates memory access by logical CPU.  Unfortunately, the MSR is not
> > managed by XSAVE.  Therefore, tasks must save/restore the MSR value on
> > context switch.
> > 
> > Define a saved PKRS value in the task struct, as well as a cached
> > per-logical-processor MSR value which mirrors the MSR value of the
> > current CPU.  Initialize all tasks with the default MSR value.  Then, on
> > schedule in, check the saved task MSR vs the per-cpu value.  If
> > different proceed to write the MSR.  If not avoid the overhead of the
> > MSR write and continue.
> 
> It's probably nice to note how the WRMSR is special here, in addition to
> the comments below.

Sure,

> 
> >  #endif /*_ASM_X86_PKEYS_INTERNAL_H */
> > diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> > index 97143d87994c..da2381136b2d 100644
> > --- a/arch/x86/include/asm/processor.h
> > +++ b/arch/x86/include/asm/processor.h
> > @@ -18,6 +18,7 @@ struct vm86;
> >  #include <asm/cpufeatures.h>
> >  #include <asm/page.h>
> >  #include <asm/pgtable_types.h>
> > +#include <asm/pkeys_common.h>
> >  #include <asm/percpu.h>
> >  #include <asm/msr.h>
> >  #include <asm/desc_defs.h>
> > @@ -542,6 +543,11 @@ struct thread_struct {
> >  
> >  	unsigned int		sig_on_uaccess_err:1;
> >  
> > +#ifdef	CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
> > +	/* Saved Protection key register for supervisor mappings */
> > +	u32			saved_pkrs;
> > +#endif
> 
> Could you take a look around thread_struct and see if there are some
> other MSRs near which you can stash this?  This seems like a bit of a
> lonely place.

Are you more concerned with aesthetics or the in-memory struct layout?

How about I put it after error_code?

	unsigned long           error_code;
+
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+       /* Saved Protection key register for supervisor mappings */
+       u32                     saved_pkrs;
+#endif
+

?

> 
> ...
> >  void flush_thread(void)
> >  {
> >  	struct task_struct *tsk = current;
> > @@ -195,6 +212,8 @@ void flush_thread(void)
> >  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
> >  
> >  	fpu__clear_all(&tsk->thread.fpu);
> > +
> > +	pks_init_task(tsk);
> >  }
> >  
> >  void disable_TSC(void)
> > @@ -644,6 +663,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
> >  
> >  	if ((tifp ^ tifn) & _TIF_SLD)
> >  		switch_to_sld(tifn);
> > +
> > +	pks_sched_in();
> >  }
> >  
> >  /*
> > diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> > index 3cf8f775f36d..30f65dd3d0c5 100644
> > --- a/arch/x86/mm/pkeys.c
> > +++ b/arch/x86/mm/pkeys.c
> > @@ -229,3 +229,31 @@ u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
> >  
> >  	return pk_reg;
> >  }
> > +
> > +DEFINE_PER_CPU(u32, pkrs_cache);
> > +
> > +/**
> > + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> > + * serializing but still maintains ordering properties similar to WRPKRU.
> > + * The current SDM section on PKRS needs updating but should be the same as
> > + * that of WRPKRU.  So to quote from the WRPKRU text:
> > + *
> > + * 	WRPKRU will never execute transiently. Memory accesses
> > + * 	affected by PKRU register will not execute (even transiently)
> > + * 	until all prior executions of WRPKRU have completed execution
> > + * 	and updated the PKRU register.
> > + */
> > +void write_pkrs(u32 new_pkrs)
> > +{
> > +	u32 *pkrs;
> > +
> > +	if (!static_cpu_has(X86_FEATURE_PKS))
> > +		return;
> > +
> > +	pkrs = get_cpu_ptr(&pkrs_cache);
> > +	if (*pkrs != new_pkrs) {
> > +		*pkrs = new_pkrs;
> > +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> > +	}
> > +	put_cpu_ptr(pkrs);
> > +}
> > 
> 
> It bugs me a *bit* that this is being called in a preempt-disabled
> region, but we still bother with the get/put_cpu jazz.  Are there other
> future call-sites for this that aren't in preempt-disabled regions?

I'm not specifically disabling preemption before calling write_pkrs() except in
the next patch (which is buggy because I meant to have it around the
modification of thread.saved_pkrs as well).  But that was to protect the thread
variable, not the per-cpu cache vs the MSR.

My thought above was that it is safer for this call to ensure the per-cpu
variable stays consistent with the register.  The other calls to write_pkrs()
may require disabling preemption, but for reasons unrelated to write_pkrs()'s
own state.
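
To illustrate the concern: if write_pkrs() were ever called with preemption
enabled (purely hypothetical here, not code from this series), the cache
update and the MSR write could land on different CPUs:

	pkrs = this_cpu_ptr(&pkrs_cache);	/* reads CPU0's cache */
	if (*pkrs != new_pkrs) {
		*pkrs = new_pkrs;		/* updates CPU0's cache */
		/* preempted and migrated to CPU1 here... */
		wrmsrl(MSR_IA32_PKRS, new_pkrs);/* ...writes CPU1's MSR */
	}
	/* CPU0's cached value no longer matches CPU0's MSR */

The get/put_cpu_ptr() pair rules that interleaving out regardless of the
caller.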

After some research I've now fully confused myself about whether this is needed
in patch 7/9, where write_pkrs() is called from the exception handling code.
But I think it is needed there.  Isn't it?

Since preempt_disable() is nestable, I think this is ok, correct?
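
(My understanding of the nesting: preempt_disable()/preempt_enable() just
increment/decrement a count, e.g.

	preempt_disable();	/* count 0 -> 1 */
	preempt_disable();	/* count 1 -> 2, nested, still disabled */
	...
	preempt_enable();	/* count 2 -> 1 */
	preempt_enable();	/* count 1 -> 0, preemption allowed again */

so the inner pair from get/put_cpu_ptr() is harmless, just redundant.)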

Ira
Peter Zijlstra Oct. 16, 2020, 11:06 a.m. UTC | #3
On Fri, Oct 09, 2020 at 12:42:53PM -0700, ira.weiny@intel.com wrote:

> @@ -644,6 +663,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
>  
>  	if ((tifp ^ tifn) & _TIF_SLD)
>  		switch_to_sld(tifn);
> +
> +	pks_sched_in();
>  }
>  

You seem to have lost the comment proposed here:

  https://lkml.kernel.org/r/20200717083140.GW10769@hirez.programming.kicks-ass.net

It is useful and important information that the wrmsr normally doesn't
happen.

> diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> index 3cf8f775f36d..30f65dd3d0c5 100644
> --- a/arch/x86/mm/pkeys.c
> +++ b/arch/x86/mm/pkeys.c
> @@ -229,3 +229,31 @@ u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
>  
>  	return pk_reg;
>  }
> +
> +DEFINE_PER_CPU(u32, pkrs_cache);
> +
> +/**
> + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> + * serializing but still maintains ordering properties similar to WRPKRU.
> + * The current SDM section on PKRS needs updating but should be the same as
> + * that of WRPKRU.  So to quote from the WRPKRU text:
> + *
> + * 	WRPKRU will never execute transiently. Memory accesses
> + * 	affected by PKRU register will not execute (even transiently)
> + * 	until all prior executions of WRPKRU have completed execution
> + * 	and updated the PKRU register.

(whitespace damage; space followed by tabstop)

> + */
> +void write_pkrs(u32 new_pkrs)
> +{
> +	u32 *pkrs;
> +
> +	if (!static_cpu_has(X86_FEATURE_PKS))
> +		return;
> +
> +	pkrs = get_cpu_ptr(&pkrs_cache);
> +	if (*pkrs != new_pkrs) {
> +		*pkrs = new_pkrs;
> +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> +	}
> +	put_cpu_ptr(pkrs);
> +}

looks familiar that... :-)
Peter Zijlstra Oct. 16, 2020, 11:12 a.m. UTC | #4
On Tue, Oct 13, 2020 at 11:31:45AM -0700, Dave Hansen wrote:
> > +/**
> > + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> > + * serializing but still maintains ordering properties similar to WRPKRU.
> > + * The current SDM section on PKRS needs updating but should be the same as
> > + * that of WRPKRU.  So to quote from the WRPKRU text:
> > + *
> > + * 	WRPKRU will never execute transiently. Memory accesses
> > + * 	affected by PKRU register will not execute (even transiently)
> > + * 	until all prior executions of WRPKRU have completed execution
> > + * 	and updated the PKRU register.
> > + */
> > +void write_pkrs(u32 new_pkrs)
> > +{
> > +	u32 *pkrs;
> > +
> > +	if (!static_cpu_has(X86_FEATURE_PKS))
> > +		return;
> > +
> > +	pkrs = get_cpu_ptr(&pkrs_cache);
> > +	if (*pkrs != new_pkrs) {
> > +		*pkrs = new_pkrs;
> > +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> > +	}
> > +	put_cpu_ptr(pkrs);
> > +}
> > 
> 
> It bugs me a *bit* that this is being called in a preempt-disabled
> region, but we still bother with the get/put_cpu jazz.  Are there other
> future call-sites for this that aren't in preempt-disabled regions?

So the previous version had a useful comment that got lost. This stuff
needs to fundamentally be preempt disabled, so it either needs to
explicitly do so, or have an assertion that preemption is indeed
disabled.
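
Something like this entirely untested sketch, perhaps (assuming
lockdep_assert_preemption_disabled() is available in this tree):

	void write_pkrs(u32 new_pkrs)
	{
		u32 *pkrs;

		if (!static_cpu_has(X86_FEATURE_PKS))
			return;

		lockdep_assert_preemption_disabled();

		pkrs = this_cpu_ptr(&pkrs_cache);
		if (*pkrs != new_pkrs) {
			*pkrs = new_pkrs;
			wrmsrl(MSR_IA32_PKRS, new_pkrs);
		}
	}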
Ira Weiny Oct. 17, 2020, 5:14 a.m. UTC | #5
On Fri, Oct 16, 2020 at 01:12:26PM +0200, Peter Zijlstra wrote:
> On Tue, Oct 13, 2020 at 11:31:45AM -0700, Dave Hansen wrote:
> > > +/**
> > > + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> > > + * serializing but still maintains ordering properties similar to WRPKRU.
> > > + * The current SDM section on PKRS needs updating but should be the same as
> > > + * that of WRPKRU.  So to quote from the WRPKRU text:
> > > + *
> > > + * 	WRPKRU will never execute transiently. Memory accesses
> > > + * 	affected by PKRU register will not execute (even transiently)
> > > + * 	until all prior executions of WRPKRU have completed execution
> > > + * 	and updated the PKRU register.
> > > + */
> > > +void write_pkrs(u32 new_pkrs)
> > > +{
> > > +	u32 *pkrs;
> > > +
> > > +	if (!static_cpu_has(X86_FEATURE_PKS))
> > > +		return;
> > > +
> > > +	pkrs = get_cpu_ptr(&pkrs_cache);
> > > +	if (*pkrs != new_pkrs) {
> > > +		*pkrs = new_pkrs;
> > > +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> > > +	}
> > > +	put_cpu_ptr(pkrs);
> > > +}
> > > 
> > 
> > It bugs me a *bit* that this is being called in a preempt-disabled
> > region, but we still bother with the get/put_cpu jazz.  Are there other
> > future call-sites for this that aren't in preempt-disabled regions?
> 
> So the previous version had a useful comment that got lost.

Ok, looking back I see what happened...  This comment...

 /*
  * PKRS is only temporarily changed during specific code paths.
  * Only a preemption during these windows away from the default
  * value would require updating the MSR.
  */

... was added to pks_sched_in() but that got simplified down because cleaning
up write_pkrs() made the code there obsolete.

> This stuff
> needs to fundamentally be preempt disabled,

I agree, the update to the per-cpu cache value and the MSR cannot be torn.

> so it either needs to
> explicitly do so, or have an assertion that preemption is indeed
> disabled.

However, I don't think I understand clearly.  Doesn't [get|put]_cpu_ptr()
handle the preempt_disable() for us?  Is it not sufficient to rely on that?

Dave's comment seems to be the opposite where we need to eliminate preempt
disable before calling write_pkrs().

FWIW I think I'm mistaken in my response to Dave regarding the
preempt_disable() in pks_update_protection().

Ira
Ira Weiny Oct. 17, 2020, 5:37 a.m. UTC | #6
On Fri, Oct 16, 2020 at 01:06:36PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 09, 2020 at 12:42:53PM -0700, ira.weiny@intel.com wrote:
> 
> > @@ -644,6 +663,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
> >  
> >  	if ((tifp ^ tifn) & _TIF_SLD)
> >  		switch_to_sld(tifn);
> > +
> > +	pks_sched_in();
> >  }
> >  
> 
> You seem to have lost the comment proposed here:
> 
>   https://lkml.kernel.org/r/20200717083140.GW10769@hirez.programming.kicks-ass.net
> 
> It is useful and important information that the wrmsr normally doesn't
> happen.

Added back in here.
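
i.e. pks_sched_in() becomes something like this (a sketch, with the comment
from the earlier version restored):

	static inline void pks_sched_in(void)
	{
		/*
		 * PKRS is only temporarily changed during specific code paths.
		 * Only a preemption during these windows away from the default
		 * value would require updating the MSR.
		 */
		write_pkrs(current->thread.saved_pkrs);
	}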

> 
> > diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> > index 3cf8f775f36d..30f65dd3d0c5 100644
> > --- a/arch/x86/mm/pkeys.c
> > +++ b/arch/x86/mm/pkeys.c
> > @@ -229,3 +229,31 @@ u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
> >  
> >  	return pk_reg;
> >  }
> > +
> > +DEFINE_PER_CPU(u32, pkrs_cache);
> > +
> > +/**
> > + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> > + * serializing but still maintains ordering properties similar to WRPKRU.
> > + * The current SDM section on PKRS needs updating but should be the same as
> > + * that of WRPKRU.  So to quote from the WRPKRU text:
> > + *
> > + * 	WRPKRU will never execute transiently. Memory accesses
> > + * 	affected by PKRU register will not execute (even transiently)
> > + * 	until all prior executions of WRPKRU have completed execution
> > + * 	and updated the PKRU register.
> 
> (whitespace damage; space followed by tabstop)

Fixed, thanks.

> 
> > + */
> > +void write_pkrs(u32 new_pkrs)
> > +{
> > +	u32 *pkrs;
> > +
> > +	if (!static_cpu_has(X86_FEATURE_PKS))
> > +		return;
> > +
> > +	pkrs = get_cpu_ptr(&pkrs_cache);
> > +	if (*pkrs != new_pkrs) {
> > +		*pkrs = new_pkrs;
> > +		wrmsrl(MSR_IA32_PKRS, new_pkrs);
> > +	}
> > +	put_cpu_ptr(pkrs);
> > +}
> 
> looks familiar that... :-)

Added you as a co-developer, if that is ok?

Ira
Peter Zijlstra Oct. 19, 2020, 9:37 a.m. UTC | #7
On Fri, Oct 16, 2020 at 10:14:10PM -0700, Ira Weiny wrote:
> > so it either needs to
> > explicitly do so, or have an assertion that preemption is indeed
> > disabled.
> 
> However, I don't think I understand clearly.  Doesn't [get|put]_cpu_ptr()
> handle the preempt_disable() for us? 

It does.

> Is it not sufficient to rely on that?

It is.

> Dave's comment seems to be the opposite where we need to eliminate preempt
> disable before calling write_pkrs().
> 
> FWIW I think I'm mistaken in my response to Dave regarding the
> preempt_disable() in pks_update_protection().

Dave's concern is that we're calling it with preemption already
disabled, so disabling it again is superfluous.
Ira Weiny Oct. 19, 2020, 6:48 p.m. UTC | #8
On Mon, Oct 19, 2020 at 11:37:14AM +0200, Peter Zijlstra wrote:
> On Fri, Oct 16, 2020 at 10:14:10PM -0700, Ira Weiny wrote:
> > > so it either needs to
> > > explicitly do so, or have an assertion that preemption is indeed
> > > disabled.
> > 
> > However, I don't think I understand clearly.  Doesn't [get|put]_cpu_ptr()
> > handle the preempt_disable() for us? 
> 
> It does.
> 
> > Is it not sufficient to rely on that?
> 
> It is.
> 
> > Dave's comment seems to be the opposite where we need to eliminate preempt
> > disable before calling write_pkrs().
> > 
> > FWIW I think I'm mistaken in my response to Dave regarding the
> > preempt_disable() in pks_update_protection().
> 
> Dave's concern is that we're calling it with preemption already
> disabled, so disabling it again is superfluous.

Ok, thanks, and after getting my head straight I think I agree with him, and
you.

Thanks, I've reworked the code to remove the superfluous calls.  Sorry about
being so dense...  :-D

Ira
diff mbox series

Patch

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2859ee4f39a8..e467e087f1b3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -747,6 +747,7 @@ 
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 
+#define MSR_IA32_PKRS			0x000006E1
 
 #define MSR_TSX_FORCE_ABORT		0x0000010F
 
diff --git a/arch/x86/include/asm/pkeys_common.h b/arch/x86/include/asm/pkeys_common.h
index a9f086f1e4b4..05781be33c14 100644
--- a/arch/x86/include/asm/pkeys_common.h
+++ b/arch/x86/include/asm/pkeys_common.h
@@ -8,4 +8,24 @@ 
 
 #define PKR_AD_KEY(pkey)	(PKR_AD_BIT << ((pkey) * PKR_BITS_PER_PKEY))
 
+/*
+ * Define a default PKRS value for each task.
+ *
+ * Key 0 has no restriction.  All other keys are set to the most restrictive
+ * value which is access disabled (AD=1).
+ *
+ * NOTE: This needs to be a macro to be used as part of the INIT_THREAD macro.
+ */
+#define INIT_PKRS_VALUE (PKR_AD_KEY(1) | PKR_AD_KEY(2) | PKR_AD_KEY(3) | \
+			 PKR_AD_KEY(4) | PKR_AD_KEY(5) | PKR_AD_KEY(6) | \
+			 PKR_AD_KEY(7) | PKR_AD_KEY(8) | PKR_AD_KEY(9) | \
+			 PKR_AD_KEY(10) | PKR_AD_KEY(11) | PKR_AD_KEY(12) | \
+			 PKR_AD_KEY(13) | PKR_AD_KEY(14) | PKR_AD_KEY(15))
+
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+void write_pkrs(u32 new_pkrs);
+#else
+static inline void write_pkrs(u32 new_pkrs) { }
+#endif
+
 #endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 97143d87994c..da2381136b2d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -18,6 +18,7 @@  struct vm86;
 #include <asm/cpufeatures.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
+#include <asm/pkeys_common.h>
 #include <asm/percpu.h>
 #include <asm/msr.h>
 #include <asm/desc_defs.h>
@@ -542,6 +543,11 @@  struct thread_struct {
 
 	unsigned int		sig_on_uaccess_err:1;
 
+#ifdef	CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+	/* Saved Protection key register for supervisor mappings */
+	u32			saved_pkrs;
+#endif
+
 	/* Floating point and extended processor state */
 	struct fpu		fpu;
 	/*
@@ -840,8 +846,15 @@  static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP		TASK_SIZE_LOW
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+#define INIT_THREAD_PKRS	.saved_pkrs = INIT_PKRS_VALUE
+#else
+#define INIT_THREAD_PKRS	0
+#endif
+
 #define INIT_THREAD  {						\
 	.addr_limit		= KERNEL_DS,			\
+	INIT_THREAD_PKRS,					\
 }
 
 extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a129d5e4afab..968863d59b6c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -57,6 +57,7 @@ 
 #include <asm/intel-family.h>
 #include <asm/cpu_device_id.h>
 #include <asm/uv/uv.h>
+#include <linux/pkeys.h>
 
 #include "cpu.h"
 
@@ -1458,6 +1459,7 @@  static void setup_pks(void)
 	if (!cpu_feature_enabled(X86_FEATURE_PKS))
 		return;
 
+	write_pkrs(INIT_PKRS_VALUE);
 	cr4_set_bits(X86_CR4_PKS);
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba4593a913fa..eb3a95a69392 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -43,6 +43,7 @@ 
 #include <asm/io_bitmap.h>
 #include <asm/proto.h>
 #include <asm/frame.h>
+#include <asm/pkeys_common.h>
 
 #include "process.h"
 
@@ -187,6 +188,22 @@  int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
 	return ret;
 }
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+DECLARE_PER_CPU(u32, pkrs_cache);
+static inline void pks_init_task(struct task_struct *tsk)
+{
+	/* New tasks get the most restrictive PKRS value */
+	tsk->thread.saved_pkrs = INIT_PKRS_VALUE;
+}
+static inline void pks_sched_in(void)
+{
+	write_pkrs(current->thread.saved_pkrs);
+}
+#else
+static inline void pks_init_task(struct task_struct *tsk) { }
+static inline void pks_sched_in(void) { }
+#endif
+
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
@@ -195,6 +212,8 @@  void flush_thread(void)
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
 	fpu__clear_all(&tsk->thread.fpu);
+
+	pks_init_task(tsk);
 }
 
 void disable_TSC(void)
@@ -644,6 +663,8 @@  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
 
 	if ((tifp ^ tifn) & _TIF_SLD)
 		switch_to_sld(tifn);
+
+	pks_sched_in();
 }
 
 /*
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 3cf8f775f36d..30f65dd3d0c5 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -229,3 +229,31 @@  u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
 
 	return pk_reg;
 }
+
+DEFINE_PER_CPU(u32, pkrs_cache);
+
+/**
+ * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
+ * serializing but still maintains ordering properties similar to WRPKRU.
+ * The current SDM section on PKRS needs updating but should be the same as
+ * that of WRPKRU.  So to quote from the WRPKRU text:
+ *
+ * 	WRPKRU will never execute transiently. Memory accesses
+ * 	affected by PKRU register will not execute (even transiently)
+ * 	until all prior executions of WRPKRU have completed execution
+ * 	and updated the PKRU register.
+ */
+void write_pkrs(u32 new_pkrs)
+{
+	u32 *pkrs;
+
+	if (!static_cpu_has(X86_FEATURE_PKS))
+		return;
+
+	pkrs = get_cpu_ptr(&pkrs_cache);
+	if (*pkrs != new_pkrs) {
+		*pkrs = new_pkrs;
+		wrmsrl(MSR_IA32_PKRS, new_pkrs);
+	}
+	put_cpu_ptr(pkrs);
+}