diff mbox

[V8,2/5] x86/hvm: pkeys, add pkeys support for guest_walk_tables

Message ID 1454398542-4815-3-git-send-email-huaitong.han@intel.com
State New, archived
Headers show

Commit Message

Huaitong Han Feb. 2, 2016, 7:35 a.m. UTC
Protection keys define a new 4-bit protection key field(PKEY) in bits 62:59 of
leaf entries of the page tables.

PKRU register defines 32 bits, there are 16 domains and 2 attribute bits per
domain in pkru, for each i (0 ? i ? 15), PKRU[2i] is the access-disable bit for
protection key i (ADi); PKRU[2i+1] is the write-disable bit for protection key
i (WDi). PKEY is index to a defined domain.

A fault is considered as a PKU violation if all of the following conditions are
true:
1.CR4_PKE=1.
2.EFER_LMA=1.
3.Page is present with no reserved bit violations.
4.The access is not an instruction fetch.
5.The access is to a user page.
6.PKRU.AD=1
    or The access is a data write and PKRU.WD=1
                and either CR0.WP=1 or it is a user access.

Signed-off-by: Huaitong Han <huaitong.han@intel.com>
---
Changes in v8:
*Abstract out _write_cr4.

Changes in v7:
*Add static for pkey_fault.
*Add a comment for page present check and adjust indentation.
*Init pkru_ad and pkru_wd.
*Delete l3e_get_pkey the outer parentheses.
*The first parameter of read_pkru_* use uint32_t type.


 xen/arch/x86/mm/guest_walk.c      | 54 +++++++++++++++++++++++++++++++++++++++
 xen/arch/x86/mm/hap/guest_walk.c  |  3 +++
 xen/include/asm-x86/guest_pt.h    | 12 +++++++++
 xen/include/asm-x86/hvm/hvm.h     |  2 ++
 xen/include/asm-x86/page.h        |  5 ++++
 xen/include/asm-x86/processor.h   | 47 +++++++++++++++++++++++++++++++++-
 xen/include/asm-x86/x86_64/page.h | 12 +++++++++
 7 files changed, 134 insertions(+), 1 deletion(-)

Comments

Jan Beulich Feb. 3, 2016, 9:43 a.m. UTC | #1
>>> On 02.02.16 at 08:35, <huaitong.han@intel.com> wrote:
> Protection keys define a new 4-bit protection key field(PKEY) in bits 62:59 
> of
> leaf entries of the page tables.
> 
> PKRU register defines 32 bits, there are 16 domains and 2 attribute bits per
> domain in pkru, for each i (0 ? i ? 15), PKRU[2i] is the access-disable bit 
> for
> protection key i (ADi); PKRU[2i+1] is the write-disable bit for protection 
> key
> i (WDi). PKEY is index to a defined domain.
> 
> A fault is considered as a PKU violation if all of the following conditions 
> are
> true:
> 1.CR4_PKE=1.
> 2.EFER_LMA=1.
> 3.Page is present with no reserved bit violations.
> 4.The access is not an instruction fetch.
> 5.The access is to a user page.
> 6.PKRU.AD=1
>     or The access is a data write and PKRU.WD=1
>                 and either CR0.WP=1 or it is a user access.
> 
> Signed-off-by: Huaitong Han <huaitong.han@intel.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
albeit ...

> Changes in v8:
> *Abstract out _write_cr4.

... I'm not happy about the chose name and will try to remember
to change it to e.g. raw_write_cr4(). Names starting with an
underscore and a lower case letter are reserved to symbols
local to a given translation unit, which in my reading doesn't fit
with them getting placed in header files.

Jan
Huaitong Han Feb. 3, 2016, 10:05 a.m. UTC | #2
On Wed, 2016-02-03 at 02:43 -0700, Jan Beulich wrote:
> > > > On 02.02.16 at 08:35, <huaitong.han@intel.com> wrote:

> > Protection keys define a new 4-bit protection key field(PKEY) in

> > bits 62:59 

> > of

> > leaf entries of the page tables.

> > 

> > PKRU register defines 32 bits, there are 16 domains and 2 attribute

> > bits per

> > domain in pkru, for each i (0 ? i ? 15), PKRU[2i] is the access

> > -disable bit 

> > for

> > protection key i (ADi); PKRU[2i+1] is the write-disable bit for

> > protection 

> > key

> > i (WDi). PKEY is index to a defined domain.

> > 

> > A fault is considered as a PKU violation if all of the following

> > conditions 

> > are

> > true:

> > 1.CR4_PKE=1.

> > 2.EFER_LMA=1.

> > 3.Page is present with no reserved bit violations.

> > 4.The access is not an instruction fetch.

> > 5.The access is to a user page.

> > 6.PKRU.AD=1

> >     or The access is a data write and PKRU.WD=1

> >                 and either CR0.WP=1 or it is a user access.

> > 

> > Signed-off-by: Huaitong Han <huaitong.han@intel.com>

> 

> Reviewed-by: Jan Beulich <jbeulich@suse.com>

> albeit ...

> 

> > Changes in v8:

> > *Abstract out _write_cr4.

> 

> ... I'm not happy about the chose name and will try to remember

> to change it to e.g. raw_write_cr4(). Names starting with an

> underscore and a lower case letter are reserved to symbols

> local to a given translation unit, which in my reading doesn't fit

> with them getting placed in header files.

It will be updated in V9.
> 

> Jan
diff mbox

Patch

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index 18d1acf..5e1111b 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -90,6 +90,54 @@  static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
     return 0;
 }
 
+#if GUEST_PAGING_LEVELS >= 4
+static bool_t pkey_fault(struct vcpu *vcpu, uint32_t pfec,
+        uint32_t pte_flags, uint32_t pte_pkey)
+{
+    uint32_t pkru = 0;
+    bool_t pkru_ad = 0, pkru_wd = 0;
+
+    /* When page isn't present,  PKEY isn't checked. */
+    if ( !(pfec & PFEC_page_present) || is_pv_vcpu(vcpu) )
+        return 0;
+
+    /*
+     * PKU:  additional mechanism by which the paging controls
+     * access to user-mode addresses based on the value in the
+     * PKRU register. A fault is considered as a PKU violation if all
+     * of the following conditions are true:
+     * 1.CR4_PKE=1.
+     * 2.EFER_LMA=1.
+     * 3.Page is present with no reserved bit violations.
+     * 4.The access is not an instruction fetch.
+     * 5.The access is to a user page.
+     * 6.PKRU.AD=1 or
+     *      the access is a data write and PKRU.WD=1 and
+     *          either CR0.WP=1 or it is a user access.
+     */
+    if ( !hvm_pku_enabled(vcpu) ||
+         !hvm_long_mode_enabled(vcpu) ||
+         /* The persent bit is guaranteed by the caller. */
+         (pfec & PFEC_reserved_bit) ||
+         (pfec & PFEC_insn_fetch) ||
+         !(pte_flags & _PAGE_USER) )
+        return 0;
+
+    pkru = read_pkru();
+    if ( unlikely(pkru) )
+    {
+        pkru_ad = read_pkru_ad(pkru, pte_pkey);
+        pkru_wd = read_pkru_wd(pkru, pte_pkey);
+        /* Condition 6 */
+        if ( pkru_ad || (pkru_wd && (pfec & PFEC_write_access) &&
+                    (hvm_wp_enabled(vcpu) || (pfec & PFEC_user_mode))))
+            return 1;
+    }
+
+    return 0;
+}
+#endif
+
 /* Walk the guest pagetables, after the manner of a hardware walker. */
 /* Because the walk is essentially random, it can cause a deadlock 
  * warning in the p2m locking code. Highly unlikely this is an actual
@@ -107,6 +155,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
 #endif
+    unsigned int pkey;
     uint32_t gflags, mflags, iflags, rc = 0;
     bool_t smep = 0, smap = 0;
     bool_t pse1G = 0, pse2M = 0;
@@ -190,6 +239,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
         goto out;
     /* Get the l3e and check its flags*/
     gw->l3e = l3p[guest_l3_table_offset(va)];
+    pkey = guest_l3e_get_pkey(gw->l3e);
     gflags = guest_l3e_get_flags(gw->l3e) ^ iflags;
     if ( !(gflags & _PAGE_PRESENT) ) {
         rc |= _PAGE_PRESENT;
@@ -261,6 +311,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 
 #endif /* All levels... */
 
+    pkey = guest_l2e_get_pkey(gw->l2e);
     gflags = guest_l2e_get_flags(gw->l2e) ^ iflags;
     if ( !(gflags & _PAGE_PRESENT) ) {
         rc |= _PAGE_PRESENT;
@@ -324,6 +375,7 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
         if(l1p == NULL)
             goto out;
         gw->l1e = l1p[guest_l1_table_offset(va)];
+        pkey = guest_l1e_get_pkey(gw->l1e);
         gflags = guest_l1e_get_flags(gw->l1e) ^ iflags;
         if ( !(gflags & _PAGE_PRESENT) ) {
             rc |= _PAGE_PRESENT;
@@ -334,6 +386,8 @@  guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
 set_ad:
+    if ( pkey_fault(v, pfec, gflags, pkey) )
+        rc |= _PAGE_PKEY_BITS;
 #endif
     /* Now re-invert the user-mode requirement for SMEP and SMAP */
     if ( smep || smap )
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index 11c1b35..49d0328 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -130,6 +130,9 @@  unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
     if ( missing & _PAGE_INVALID_BITS ) 
         pfec[0] |= PFEC_reserved_bit;
 
+    if ( missing & _PAGE_PKEY_BITS )
+        pfec[0] |= PFEC_prot_key;
+
     if ( missing & _PAGE_PAGED )
         pfec[0] = PFEC_page_paged;
 
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 3447973..eb29e62 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -81,6 +81,11 @@  static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
 static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
 { return gl2e.l2 & 0xfff; }
 
+static inline u32 guest_l1e_get_pkey(guest_l1e_t gl1e)
+{ return 0; }
+static inline u32 guest_l2e_get_pkey(guest_l2e_t gl2e)
+{ return 0; }
+
 static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
 { return (guest_l1e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; }
 static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
@@ -154,6 +159,13 @@  static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
 { return l4e_get_flags(gl4e); }
 #endif
 
+static inline u32 guest_l1e_get_pkey(guest_l1e_t gl1e)
+{ return l1e_get_pkey(gl1e); }
+static inline u32 guest_l2e_get_pkey(guest_l2e_t gl2e)
+{ return l2e_get_pkey(gl2e); }
+static inline u32 guest_l3e_get_pkey(guest_l3e_t gl3e)
+{ return l3e_get_pkey(gl3e); }
+
 static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
 { return l1e_from_pfn(gfn_x(gfn), flags); }
 static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index a87224b..731dd44 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -277,6 +277,8 @@  int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode);
     (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMAP))
 #define hvm_nx_enabled(v) \
     (!!((v)->arch.hvm_vcpu.guest_efer & EFER_NX))
+#define hvm_pku_enabled(v) \
+    (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PKE))
 
 /* Can we use superpages in the HAP p2m table? */
 #define hap_has_1gb (!!(hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_1GB))
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index a095a93..9202f3d 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -93,6 +93,11 @@ 
 #define l3e_get_flags(x)           (get_pte_flags((x).l3))
 #define l4e_get_flags(x)           (get_pte_flags((x).l4))
 
+/* Get pte pkeys (unsigned int). */
+#define l1e_get_pkey(x)           get_pte_pkey((x).l1)
+#define l2e_get_pkey(x)           get_pte_pkey((x).l2)
+#define l3e_get_pkey(x)           get_pte_pkey((x).l3)
+
 /* Construct an empty pte. */
 #define l1e_empty()                ((l1_pgentry_t) { 0 })
 #define l2e_empty()                ((l2_pgentry_t) { 0 })
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 26ba141..da5d0e8 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -332,6 +332,11 @@  static inline unsigned long read_cr2(void)
 
 DECLARE_PER_CPU(unsigned long, cr4);
 
+static inline void _write_cr4(unsigned long val)
+{
+    asm volatile ( "mov %0,%%cr4" : : "r" (val) );
+}
+
 static inline unsigned long read_cr4(void)
 {
     return this_cpu(cr4);
@@ -340,7 +345,7 @@  static inline unsigned long read_cr4(void)
 static inline void write_cr4(unsigned long val)
 {
     this_cpu(cr4) = val;
-    asm volatile ( "mov %0,%%cr4" : : "r" (val) );
+    _write_cr4(val);
 }
 
 /* Clear and set 'TS' bit respectively */
@@ -374,6 +379,46 @@  static always_inline void clear_in_cr4 (unsigned long mask)
     write_cr4(read_cr4() & ~mask);
 }
 
+static inline unsigned int read_pkru(void)
+{
+    unsigned int pkru;
+    unsigned long cr4 = read_cr4();
+
+    /*
+     * _PAGE_PKEY_BITS have a conflict with _PAGE_GNTTAB used by PV guests,
+     * so that X86_CR4_PKE  is disabled on hypervisor. To use RDPKRU, CR4.PKE
+     * gets temporarily enabled.
+     */
+    _write_cr4(cr4 | X86_CR4_PKE);
+    asm volatile (".byte 0x0f,0x01,0xee"
+        : "=a" (pkru) : "c" (0) : "dx");
+    _write_cr4(cr4);
+
+    return pkru;
+}
+
+/* Macros for PKRU domain */
+#define PKRU_READ  (0)
+#define PKRU_WRITE (1)
+#define PKRU_ATTRS (2)
+
+/*
+ * PKRU defines 32 bits, there are 16 domains and 2 attribute bits per
+ * domain in pkru, pkeys is index to a defined domain, so the value of
+ * pte_pkeys * PKRU_ATTRS + R/W is offset of a defined domain attribute.
+ */
+static inline bool_t read_pkru_ad(uint32_t pkru, unsigned int pkey)
+{
+    ASSERT(pkey < 16);
+    return (pkru >> (pkey * PKRU_ATTRS + PKRU_READ)) & 1;
+}
+
+static inline bool_t read_pkru_wd(uint32_t pkru, unsigned int pkey)
+{
+    ASSERT(pkey < 16);
+    return (pkru >> (pkey * PKRU_ATTRS + PKRU_WRITE)) & 1;
+}
+
 /*
  *      NSC/Cyrix CPU configuration register indexes
  */
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 19ab4d0..86abb94 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -134,6 +134,18 @@  typedef l4_pgentry_t root_pgentry_t;
 #define get_pte_flags(x) (((int)((x) >> 40) & ~0xFFF) | ((int)(x) & 0xFFF))
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
 
+/*
+ * Protection keys define a new 4-bit protection key field
+ * (PKEY) in bits 62:59 of leaf entries of the page tables.
+ * This corresponds to bit 22:19 of a 24-bit flags.
+ *
+ * Notice: Bit 22 is used by _PAGE_GNTTAB which is visible to PV guests,
+ * so Protection keys must be disabled on PV guests.
+ */
+#define _PAGE_PKEY_BITS  (0x780000)	 /* Protection Keys, 22:19 */
+
+#define get_pte_pkey(x) (MASK_EXTR(get_pte_flags(x), _PAGE_PKEY_BITS))
+
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
 #define _PAGE_NX_BIT (1U<<23)