x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()

Message ID 883f8fb121f4616c1c1427ad87350bb2f5ffeca1.1497288170.git.luto@kernel.org
State New, archived

Commit Message

Andy Lutomirski June 12, 2017, 5:26 p.m. UTC
The kernel has several code paths that read CR3.  Most of them assume that
CR3 contains the PGD's physical address, whereas some of them awkwardly
use PHYSICAL_PAGE_MASK to mask off low bits.

Add explicit mask macros for CR3 and convert all of the CR3 readers.
This will keep them from breaking when PCID is enabled.
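
For example, the typical conversion (this one is from the fault.c hunk below)
looks like:

	-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	+	pgd_t *base = __va(read_cr3_pa());

Callers that really do want the whole register now spell that out as
__read_cr3().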

Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: xen-devel <xen-devel@lists.xen.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

Hi Ingo-

I broke this out because Tom's SME series and my PCID series both need it.
Please consider applying it to tip:x86/mm.

I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Thanks,
Andy

 arch/x86/boot/compressed/pagetable.c   |  2 +-
 arch/x86/include/asm/efi.h             |  2 +-
 arch/x86/include/asm/mmu_context.h     |  4 ++--
 arch/x86/include/asm/paravirt.h        |  2 +-
 arch/x86/include/asm/processor-flags.h | 36 ++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/processor.h       |  8 ++++++++
 arch/x86/include/asm/special_insns.h   | 10 +++++++---
 arch/x86/include/asm/tlbflush.h        |  4 ++--
 arch/x86/kernel/head64.c               |  3 ++-
 arch/x86/kernel/paravirt.c             |  2 +-
 arch/x86/kernel/process_32.c           |  2 +-
 arch/x86/kernel/process_64.c           |  2 +-
 arch/x86/kvm/vmx.c                     |  2 +-
 arch/x86/mm/fault.c                    | 10 +++++-----
 arch/x86/mm/ioremap.c                  |  2 +-
 arch/x86/platform/efi/efi_64.c         |  4 ++--
 arch/x86/platform/olpc/olpc-xo1-pm.c   |  2 +-
 arch/x86/power/cpu.c                   |  2 +-
 arch/x86/power/hibernate_64.c          |  3 ++-
 arch/x86/xen/mmu_pv.c                  |  6 +++---
 20 files changed, 79 insertions(+), 29 deletions(-)

Comments

Ingo Molnar June 13, 2017, 6:49 a.m. UTC | #1
* Andy Lutomirski <luto@kernel.org> wrote:

> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.
> 
> Cc: Tom Lendacky <thomas.lendacky@amd.com>
> Cc: Juergen Gross <jgross@suse.com>
> Cc: xen-devel <xen-devel@lists.xen.org>
> Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
> 
> Hi Ingo-
> 
> I broke this out because Tom's SME series and my PCID series both need it.
> Please consider applying it to tip:x86/mm.
> 
> I'll send PCID v2 soon.  It'll apply to x86/mm + sched/urgent + this patch.

Ok, thanks - will push it out after a bit of testing.

Note that I also merged sched/urgent into x86/mm to create a better base tree
for the PCID changes, picking up 252d2a4117bc, which PCID depends on.

Thanks,

	Ingo
Borislav Petkov June 13, 2017, 9:26 a.m. UTC | #2
On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
> The kernel has several code paths that read CR3.  Most of them assume that
> CR3 contains the PGD's physical address, whereas some of them awkwardly
> use PHYSICAL_PAGE_MASK to mask off low bits.
> 
> Add explicit mask macros for CR3 and convert all of the CR3 readers.
> This will keep them from breaking when PCID is enabled.

...

> +/*
> + * CR3's layout varies depending on several things.
> + *
> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
> + * CR3[2:0] and CR3[11:5] are ignored.
> + *
> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
> + *
> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
> + * written as 1 to prevent the write to CR3 from flushing the TLB.
> + *
> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
> + * that the top-level paging structure is encrypted.
> + *
> + * All of the remaining bits indicate the physical address of the top-level
> + * paging structure.
> + *
> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
> + */
> +#ifdef CONFIG_X86_64
> +/* Mask off the address space ID bits. */
> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
> +#define CR3_PCID_MASK 0xFFFull
> +#else
> +/*
> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
> + * a tiny bit of code size by setting all the bits.
> + */
> +#define CR3_ADDR_MASK 0xFFFFFFFFull
> +#define CR3_PCID_MASK 0ull

All those can do GENMASK_ULL for better readability.
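
Something like this, I mean (just a sketch, same bit ranges as your numbers
above):

	#define CR3_ADDR_MASK	GENMASK_ULL(62, 12)
	#define CR3_PCID_MASK	GENMASK_ULL(11, 0)

and GENMASK_ULL(31, 0) for the 32-bit address mask (the 0ull PCID mask would
stay as it is).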

> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 3586996fc50d..bc0a849589bb 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>  
>  	.read_cr2 = native_read_cr2,
>  	.write_cr2 = native_write_cr2,
> -	.read_cr3 = native_read_cr3,
> +	.read_cr3 = __native_read_cr3,
>  	.write_cr3 = native_write_cr3,
>  
>  	.flush_tlb_user = native_flush_tlb,
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index ffeae818aa7a..c6d6dc5f8bb2 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>  
>  	cr0 = read_cr0();
>  	cr2 = read_cr2();
> -	cr3 = read_cr3();
> +	cr3 = __read_cr3();
>  	cr4 = __read_cr4();

This is a good example of my confusion. So we have __read_cr4() - with
the underscores - but not read_cr4().

Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
well lose the "__", right?

Or are the PCID series bringing a read_cr3() without "__" too?

Oh, and to confuse me even more, there's __native_read_cr3() which is
*finally* accessing %cr3 :-) But there's native_write_cr3() without
underscores.

So can we make those names a bit more balanced please?

Thanks.
Andy Lutomirski June 13, 2017, 4 p.m. UTC | #3
On Tue, Jun 13, 2017 at 2:26 AM, Borislav Petkov <bp@alien8.de> wrote:
> On Mon, Jun 12, 2017 at 10:26:14AM -0700, Andy Lutomirski wrote:
>> The kernel has several code paths that read CR3.  Most of them assume that
>> CR3 contains the PGD's physical address, whereas some of them awkwardly
>> use PHYSICAL_PAGE_MASK to mask off low bits.
>>
>> Add explicit mask macros for CR3 and convert all of the CR3 readers.
>> This will keep them from breaking when PCID is enabled.
>
> ...
>
>> +/*
>> + * CR3's layout varies depending on several things.
>> + *
>> + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
>> + * If PAE is enabled, then CR3[11:5] is part of the PDPT address
>> + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
>> + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
>> + * CR3[2:0] and CR3[11:5] are ignored.
>> + *
>> + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
>> + *
>> + * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
>> + * written as 1 to prevent the write to CR3 from flushing the TLB.
>> + *
>> + * On systems with SME, one bit (in a variable position!) is stolen to indicate
>> + * that the top-level paging structure is encrypted.
>> + *
>> + * All of the remaining bits indicate the physical address of the top-level
>> + * paging structure.
>> + *
>> + * CR3_ADDR_MASK is the mask used by read_cr3_pa().
>> + */
>> +#ifdef CONFIG_X86_64
>> +/* Mask off the address space ID bits. */
>> +#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
>> +#define CR3_PCID_MASK 0xFFFull
>> +#else
>> +/*
>> + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
>> + * a tiny bit of code size by setting all the bits.
>> + */
>> +#define CR3_ADDR_MASK 0xFFFFFFFFull
>> +#define CR3_PCID_MASK 0ull
>
> All those can do GENMASK_ULL for better readability.
>

Hmm.  I'll look at that.

>> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
>> index 3586996fc50d..bc0a849589bb 100644
>> --- a/arch/x86/kernel/paravirt.c
>> +++ b/arch/x86/kernel/paravirt.c
>> @@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
>>
>>       .read_cr2 = native_read_cr2,
>>       .write_cr2 = native_write_cr2,
>> -     .read_cr3 = native_read_cr3,
>> +     .read_cr3 = __native_read_cr3,
>>       .write_cr3 = native_write_cr3,
>>
>>       .flush_tlb_user = native_flush_tlb,
>> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
>> index ffeae818aa7a..c6d6dc5f8bb2 100644
>> --- a/arch/x86/kernel/process_32.c
>> +++ b/arch/x86/kernel/process_32.c
>> @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
>>
>>       cr0 = read_cr0();
>>       cr2 = read_cr2();
>> -     cr3 = read_cr3();
>> +     cr3 = __read_cr3();
>>       cr4 = __read_cr4();
>
> This is a good example for my confusion. So we have __read_cr4() - with
> the underscores - but not read_cr4().
>
> Now we get __read_cr3 *and* read_cr3_pa(). So __read_cr3() could just as
> well lose the "__", right?
>
> Or are the PCID series bringing a read_cr3() without "__" too?

The intent was twofold:

1. Make sure that every read_cr3() instance got converted.  I didn't
want a mid-air collision with someone else's patch in which it would
appear to apply and compile but the result would randomly fail on PCID
systems.

2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
means "return this complicated register value -- I know what I'm
doing" and read_cr3_pa() means "give me the PA".

Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
I suppose.

What do you think?  My general preference is to clean this up after
the rest of the big patchsets (SME and PCID) land.

>
> Oh, and to confuse me even more, there's __native_read_cr3() which is
> *finally* accessing %cr3 :-) But there's native_write_cr3() without
> underscores.
>
> So can we make those names a bit more balanced please?

write_cr3() was less widespread, so I worried about it less.

--Andy
Borislav Petkov June 13, 2017, 4:18 p.m. UTC | #4
On Tue, Jun 13, 2017 at 09:00:01AM -0700, Andy Lutomirski wrote:
> 1. Make sure that every read_cr3() instance got converted.  I didn't
> want a mid-air collision with someone else's patch in which it would
> appear to apply and compile but the result would randomly fail on PCID
> systems.

Right.

> 2. Make users realize that CR3 ain't what it used to be.  __read_cr3()
> means "return this complicated register value -- I know what I'm
> doing" and read_cr3_pa() means "give me the PA".

Agreed with the _pa thing.

> Maybe we could rename __read_cr3() to read_cr3_raw()?  If we really
> wanted lots of clarity, __read_cr4() could become read_cr4_noshadow(),
> I suppose.

Yeah, both make sense to me. I like the _raw thing and the _noshadow
too, as they actually say what each function *really* does.

In any case, the __ variant is less descriptive than having:

read_crX_pa
read_crX_raw
read_crX_noshadow

and so on, which actually say what they each do, so when you wonder which
one to use, you know.
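
(Purely as a naming sketch, not insisting on the exact names, the raw variant
would then just be:

	static inline unsigned long read_cr3_raw(void)
	{
		return __native_read_cr3();
	}

with read_cr3_pa() masking on top of it, like it does now.)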

> What do you think?  My general preference is to clean this up after
> the rest of the big patchsets (SME and PCID) land.

Of course.

Thanks.

Patch

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..8e69df96492e 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -92,7 +92,7 @@  void initialize_identity_maps(void)
 	 * and we must append to the existing area instead of entirely
 	 * overwriting it.
 	 */
-	level4p = read_cr3();
+	level4p = read_cr3_pa();
 	if (level4p == (unsigned long)_pgtable) {
 		debug_putstr("booted via startup_32()\n");
 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@  struct efi_scratch {
 	__kernel_fpu_begin();						\
 									\
 	if (efi_scratch.use_pgd) {					\
-		efi_scratch.prev_cr3 = read_cr3();			\
+		efi_scratch.prev_cr3 = __read_cr3();			\
 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
 		__flush_tlb_all();					\
 	}								\
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5a93f6261302..cfe6034ebfc6 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -269,7 +269,7 @@  static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 /*
  * This can be used from process context to figure out what the value of
- * CR3 is without needing to do a (slow) read_cr3().
+ * CR3 is without needing to do a (slow) __read_cr3().
  *
  * It's intended to be used for code like KVM that sneakily changes CR3
  * and needs to restore it.  It needs to be used very carefully.
@@ -281,7 +281,7 @@  static inline unsigned long __get_current_cr3_fast(void)
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
-	VM_BUG_ON(cr3 != read_cr3());
+	VM_BUG_ON(cr3 != __read_cr3());
 	return cr3;
 }
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9a15739d9f4b..a63e77f8eb41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@  static inline void write_cr2(unsigned long x)
 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
 }
 
-static inline unsigned long read_cr3(void)
+static inline unsigned long __read_cr3(void)
 {
 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
 }
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@ 
 #else
 #define X86_VM_MASK	0 /* No VM86 support */
 #endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID bits. */
+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+#define CR3_PCID_MASK 0xFFFull
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#endif
+
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..9de02c985aa4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@  native_cpuid_reg(ebx)
 native_cpuid_reg(ecx)
 native_cpuid_reg(edx)
 
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+	return __read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@  static inline void native_write_cr2(unsigned long val)
 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
 }
 
-static inline unsigned long native_read_cr3(void)
+static inline unsigned long __native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@  static inline void write_cr2(unsigned long x)
 	native_write_cr2(x);
 }
 
-static inline unsigned long read_cr3(void)
+/*
+ * Careful!  CR3 contains more than just an address.  You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
 {
-	return native_read_cr3();
+	return __native_read_cr3();
 }
 
 static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 388c2463fde6..5f78c6a77578 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -156,7 +156,7 @@  static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	native_write_cr3(native_read_cr3());
+	native_write_cr3(__native_read_cr3());
 	preempt_enable();
 }
 
@@ -264,7 +264,7 @@  static inline void reset_lazy_tlbstate(void)
 	this_cpu_write(cpu_tlbstate.state, 0);
 	this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
 
-	WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
+	WARN_ON(read_cr3_pa() != __pa_symbol(swapper_pg_dir));
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..794e8f517a81 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -55,7 +55,8 @@  int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
+	if (physaddr >= MAXMEM ||
+	    read_cr3_pa() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@  struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
+	.read_cr3 = __native_read_cr3,
 	.write_cr3 = native_write_cr3,
 
 	.flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@  void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
 			cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c39ab8bcc41..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@  void __show_regs(struct pt_regs *regs, int all)
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	cr4 = __read_cr4();
 
 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 19cde555d73f..d143dd397dc9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5024,7 +5024,7 @@  static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 * Save the most likely value for this task's CR3 in the VMCS.
 	 * We can't use __get_current_cr3_fast() because we're not atomic.
 	 */
-	cr3 = read_cr3();
+	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmx->host_state.vmcs_host_cr3 = cr3;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@  static noinline int vmalloc_fault(unsigned long address)
 	 * Do _not_ use "current" here. We might be inside
 	 * an interrupt in the middle of a task switch..
 	 */
-	pgd_paddr = read_cr3();
+	pgd_paddr = read_cr3_pa();
 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 	if (!pmd_k)
 		return -1;
@@ -388,7 +388,7 @@  static bool low_pfn(unsigned long pfn)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(address)];
 	p4d_t *p4d;
 	pud_t *pud;
@@ -451,7 +451,7 @@  static noinline int vmalloc_fault(unsigned long address)
 	 * happen within a race in page table update. In the later
 	 * case just flush:
 	 */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 	pgd_ref = pgd_offset_k(address);
 	if (pgd_none(*pgd_ref))
 		return -1;
@@ -555,7 +555,7 @@  static int bad_address(void *p)
 
 static void dump_pagetable(unsigned long address)
 {
-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = base + pgd_index(address);
 	p4d_t *p4d;
 	pud_t *pud;
@@ -700,7 +700,7 @@  show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		pgd_t *pgd;
 		pte_t *pte;
 
-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd = __va(read_cr3_pa());
 		pgd += pgd_index(address);
 
 		pte = lookup_address_in_pgd(pgd, address, &level);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@  static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
 {
 	/* Don't assume we're using swapper_pg_dir at this point */
-	pgd_t *base = __va(read_cr3());
+	pgd_t *base = __va(read_cr3_pa());
 	pgd_t *pgd = &base[pgd_index(addr)];
 	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index eb8dff15a7f6..f40bf6230480 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@  pgd_t * __init efi_call_phys_prolog(void)
 	int n_pgds, i, j;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
-		save_pgd = (pgd_t *)read_cr3();
+		save_pgd = (pgd_t *)__read_cr3();
 		write_cr3((unsigned long)efi_scratch.efi_pgt);
 		goto out;
 	}
@@ -646,7 +646,7 @@  efi_status_t efi_thunk_set_virtual_address_map(
 	efi_sync_low_kernel_mappings();
 	local_irq_save(flags);
 
-	efi_scratch.prev_cr3 = read_cr3();
+	efi_scratch.prev_cr3 = __read_cr3();
 	write_cr3((unsigned long)efi_scratch.efi_pgt);
 	__flush_tlb_all();
 
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@  static int xo1_power_state_enter(suspend_state_t pm_state)
 
 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
 {
-	void *pgd_addr = __va(read_cr3());
+	void *pgd_addr = __va(read_cr3_pa());
 
 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@  static void __save_processor_state(struct saved_context *ctxt)
 	 */
 	ctxt->cr0 = read_cr0();
 	ctxt->cr2 = read_cr2();
-	ctxt->cr3 = read_cr3();
+	ctxt->cr3 = __read_cr3();
 	ctxt->cr4 = __read_cr4();
 #ifdef CONFIG_X86_64
 	ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@  static int relocate_restore_code(void)
 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
 
 	/* Make the page containing the relocated code executable */
-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
+	pgd = (pgd_t *)__va(read_cr3_pa()) +
+		pgd_index(relocated_restore_code);
 	p4d = p4d_offset(pgd, relocated_restore_code);
 	if (p4d_large(*p4d)) {
 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 21beb37114b7..4f638309deea 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2017,7 +2017,7 @@  static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 	pmd_t pmd;
 	pte_t pte;
 
-	pa = read_cr3();
+	pa = read_cr3_pa();
 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
 						       sizeof(pgd)));
 	if (!pgd_present(pgd))
@@ -2097,7 +2097,7 @@  void __init xen_relocate_p2m(void)
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
-	pgd = __va(read_cr3());
+	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
 	idx_p4d = 0;
 	save_pud = n_pud;
@@ -2204,7 +2204,7 @@  static void __init xen_write_cr3_init(unsigned long cr3)
 {
 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
 
-	BUG_ON(read_cr3() != __pa(initial_page_table));
+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
 	BUG_ON(cr3 != __pa(swapper_pg_dir));
 
 	/*