diff mbox series

[v17,08/26] x86/mm: Introduce _PAGE_COW

Message ID 20201229213053.16395-9-yu-cheng.yu@intel.com (mailing list archive)
State New, archived
Headers show
Series Control-flow Enforcement: Shadow Stack | expand

Commit Message

Yu, Yu-cheng Dec. 29, 2020, 9:30 p.m. UTC
There is essentially no room left in the x86 hardware PTEs on some OSes
(not Linux).  That left the hardware architects looking for a way to
represent a new memory type (shadow stack) within the existing bits.
They chose to repurpose a lightly-used state: Write=0, Dirty=1.

The reason it's lightly used is that Dirty=1 is normally set by hardware
and cannot normally be set by hardware on a Write=0 PTE.  Software must
normally be involved to create one of these PTEs, so software can simply
opt to not create them.

In places where Linux normally creates Write=0, Dirty=1, it can use the
software-defined _PAGE_COW in place of the hardware _PAGE_DIRTY.  In other
words, whenever Linux needs to create Write=0, Dirty=1, it instead creates
Write=0, Cow=1, except for shadow stack, which is Write=0, Dirty=1.  This
clearly separates shadow stack from other data, and results in the
following:

(a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
(b) A R/O page that has been COW'ed: (Write=0, Cow=1)
    The user page is in a R/O VMA, and get_user_pages() needs a writable
    copy.  The page fault handler creates a copy of the page and sets
    the new copy's PTE as Write=0 and Cow=1.
(c) A shadow stack PTE: (Write=0, Dirty=1)
(d) A shared shadow stack PTE: (Write=0, Cow=1)
    When a shadow stack page is being shared among processes (this happens
    at fork()), its PTE is made Dirty=0, so the next shadow stack access
    causes a fault, and the page is duplicated and Dirty=1 is set again.
    This is the COW equivalent for shadow stack pages, even though it's
    copy-on-access rather than copy-on-write.
(e) A page where the processor observed a Write=1 PTE, started a write, set
    Dirty=1, but then observed a Write=0 PTE.  That's possible today, but
    will not happen on processors that support shadow stack.

Define _PAGE_COW and update pte_*() helpers and apply the same changes to
pmd and pud.

After this, there are six free bits left in the 64-bit PTE, and no more
free bits in the 32-bit PTE (except for PAE) and Shadow Stack is not
implemented for the 32-bit kernel.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
---
 arch/x86/include/asm/pgtable.h       | 117 ++++++++++++++++++++++++---
 arch/x86/include/asm/pgtable_types.h |  42 +++++++++-
 2 files changed, 148 insertions(+), 11 deletions(-)

Comments

Borislav Petkov Jan. 21, 2021, 6:44 p.m. UTC | #1
On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
> @@ -182,6 +182,12 @@ static inline int pud_young(pud_t pud)
>  
>  static inline int pte_write(pte_t pte)
>  {
> +	/*
> +	 * If _PAGE_DIRTY is set, the PTE must either have _PAGE_RW or be
> +	 * a shadow stack PTE, which is logically writable.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK))
> +		return pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY);
>  	return pte_flags(pte) & _PAGE_RW;

        if (cpu_feature_enabled(X86_FEATURE_SHSTK))
                return pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY);
        else
                return pte_flags(pte) & _PAGE_RW;

The else makes it ballanced and easier to read.


> @@ -333,7 +339,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
>  
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
> -	return pte_clear_flags(pte, _PAGE_DIRTY);
> +	return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
>  }
>  
>  static inline pte_t pte_mkold(pte_t pte)
> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
>  
>  static inline pte_t pte_wrprotect(pte_t pte)
>  {
> +	/*
> +	 * Blindly clearing _PAGE_RW might accidentally create
> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
> +	 * dirty value to the software bit.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;

Why the unreadable shifting when you can simply do:

                if (pte.pte & _PAGE_DIRTY)
                        pte.pte |= _PAGE_COW;

?

> @@ -434,16 +469,40 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
>  
>  static inline pmd_t pmd_mkclean(pmd_t pmd)
>  {
> -	return pmd_clear_flags(pmd, _PAGE_DIRTY);
> +	return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
>  }
>  
>  static inline pmd_t pmd_wrprotect(pmd_t pmd)
>  {
> +	/*
> +	 * Blindly clearing _PAGE_RW might accidentally create
> +	 * a shadow stack PMD (RW=0, Dirty=1).  Move the hardware
> +	 * dirty value to the software bit.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> +		pmdval_t v = native_pmd_val(pmd);
> +
> +		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;

As above.

> @@ -488,17 +554,35 @@ static inline pud_t pud_mkold(pud_t pud)
>  
>  static inline pud_t pud_mkclean(pud_t pud)
>  {
> -	return pud_clear_flags(pud, _PAGE_DIRTY);
> +	return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
>  }
>  
>  static inline pud_t pud_wrprotect(pud_t pud)
>  {
> +	/*
> +	 * Blindly clearing _PAGE_RW might accidentally create
> +	 * a shadow stack PUD (RW=0, Dirty=1).  Move the hardware
> +	 * dirty value to the software bit.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> +		pudval_t v = native_pud_val(pud);
> +
> +		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;

Ditto.

> @@ -1131,6 +1222,12 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
>  #define pmd_write pmd_write
>  static inline int pmd_write(pmd_t pmd)
>  {
> +	/*
> +	 * If _PAGE_DIRTY is set, then the PMD must either have _PAGE_RW or
> +	 * be a shadow stack PMD, which is logically writable.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK))
> +		return pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY);

	else


>  	return pmd_flags(pmd) & _PAGE_RW;
>  }
>
Yu, Yu-cheng Jan. 21, 2021, 8:16 p.m. UTC | #2
On 1/21/2021 10:44 AM, Borislav Petkov wrote:
> On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
[...]
>> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
>>   
>>   static inline pte_t pte_wrprotect(pte_t pte)
>>   {
>> +	/*
>> +	 * Blindly clearing _PAGE_RW might accidentally create
>> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
>> +	 * dirty value to the software bit.
>> +	 */
>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
> 
> Why the unreadable shifting when you can simply do:
> 
>                  if (pte.pte & _PAGE_DIRTY)
>                          pte.pte |= _PAGE_COW;
> 
> ?

It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,

if (pte.pte & _PAGE_DIRTY) {
	pte.pte &= ~_PAGE_DIRTY;
	pte.pte |= _PAGE_COW;
}

So, shifting makes resulting code more efficient.

>> @@ -434,16 +469,40 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
>>   
>>   static inline pmd_t pmd_mkclean(pmd_t pmd)
>>   {
>> -	return pmd_clear_flags(pmd, _PAGE_DIRTY);
>> +	return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
>>   }
>>   
>>   static inline pmd_t pmd_wrprotect(pmd_t pmd)
>>   {
>> +	/*
>> +	 * Blindly clearing _PAGE_RW might accidentally create
>> +	 * a shadow stack PMD (RW=0, Dirty=1).  Move the hardware
>> +	 * dirty value to the software bit.
>> +	 */
>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>> +		pmdval_t v = native_pmd_val(pmd);
>> +
>> +		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
> 
> As above.
> 
>> @@ -488,17 +554,35 @@ static inline pud_t pud_mkold(pud_t pud)
>>   
>>   static inline pud_t pud_mkclean(pud_t pud)
>>   {
>> -	return pud_clear_flags(pud, _PAGE_DIRTY);
>> +	return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
>>   }
>>   
>>   static inline pud_t pud_wrprotect(pud_t pud)
>>   {
>> +	/*
>> +	 * Blindly clearing _PAGE_RW might accidentally create
>> +	 * a shadow stack PUD (RW=0, Dirty=1).  Move the hardware
>> +	 * dirty value to the software bit.
>> +	 */
>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>> +		pudval_t v = native_pud_val(pud);
>> +
>> +		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
> 
> Ditto.
> 
>> @@ -1131,6 +1222,12 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
>>   #define pmd_write pmd_write
>>   static inline int pmd_write(pmd_t pmd)
>>   {
>> +	/*
>> +	 * If _PAGE_DIRTY is set, then the PMD must either have _PAGE_RW or
>> +	 * be a shadow stack PMD, which is logically writable.
>> +	 */
>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK))
>> +		return pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY);
> 
> 	else
> 
> 
>>   	return pmd_flags(pmd) & _PAGE_RW;
>>   }
>>
Dave Hansen Jan. 21, 2021, 8:20 p.m. UTC | #3
On 1/21/21 12:16 PM, Yu, Yu-cheng wrote:
> 
>>> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
>>>     static inline pte_t pte_wrprotect(pte_t pte)
>>>   {
>>> +    /*
>>> +     * Blindly clearing _PAGE_RW might accidentally create
>>> +     * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
>>> +     * dirty value to the software bit.
>>> +     */
>>> +    if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>>> +        pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY <<
>>> _PAGE_BIT_COW;
>>
>> Why the unreadable shifting when you can simply do:
>>
>>                  if (pte.pte & _PAGE_DIRTY)
>>                          pte.pte |= _PAGE_COW;
>>
>> ?
> 
> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
> 
> if (pte.pte & _PAGE_DIRTY) {
>     pte.pte &= ~_PAGE_DIRTY;
>     pte.pte |= _PAGE_COW;
> }
> 
> So, shifting makes resulting code more efficient.

Are you sure?

Usually, the compiler is better at making code efficient than humans.  I
find that coding it in the most human-readable way is best unless I
*know* the compiler is unable to generate god code.
Dave Hansen Jan. 21, 2021, 8:26 p.m. UTC | #4
> Usually, the compiler is better at making code efficient than humans.  I
> find that coding it in the most human-readable way is best unless I
> *know* the compiler is unable to generate god code.

"good code", even.

I really want a "god code" compiler, though. :)
Borislav Petkov Jan. 21, 2021, 8:41 p.m. UTC | #5
On Thu, Jan 21, 2021 at 12:16:23PM -0800, Yu, Yu-cheng wrote:
> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
> 
> if (pte.pte & _PAGE_DIRTY) {
> 	pte.pte &= ~_PAGE_DIRTY;
> 	pte.pte |= _PAGE_COW;
> }
> 
> So, shifting makes resulting code more efficient.

Efficient for what? Is this a hot path?

If not, I'd take readable code any day of the week.
Yu, Yu-cheng Jan. 21, 2021, 8:44 p.m. UTC | #6
On 1/21/2021 12:26 PM, Dave Hansen wrote:
>> Usually, the compiler is better at making code efficient than humans.  I
>> find that coding it in the most human-readable way is best unless I
>> *know* the compiler is unable to generate god code.
> 
> "good code", even.
> 
> I really want a "god code" compiler, though. :)
> 
With my version of GCC, the shifting implementation creates five 
instructions, all operate on registers only.  The other implementation 
also creates five instructions, but introduces one jump and one memory 
access.  But, you are right, being readable is also important.  Maybe we 
can tweak it a little or create something similar to those in bitops.

--
Yu-cheng
Yu, Yu-cheng Jan. 21, 2021, 9:40 p.m. UTC | #7
On 1/21/2021 12:41 PM, Borislav Petkov wrote:
> On Thu, Jan 21, 2021 at 12:16:23PM -0800, Yu, Yu-cheng wrote:
>> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
>>
>> if (pte.pte & _PAGE_DIRTY) {
>> 	pte.pte &= ~_PAGE_DIRTY;
>> 	pte.pte |= _PAGE_COW;
>> }
>>
>> So, shifting makes resulting code more efficient.
> 
> Efficient for what? Is this a hot path?
> 
> If not, I'd take readable code any day of the week.
> 

Ok, I will change it to the more readable code as stated earlier.

--
Yu-cheng
David Laight Jan. 21, 2021, 10:16 p.m. UTC | #8
From: Yu, Yu-cheng 
> 
> On 1/21/2021 10:44 AM, Borislav Petkov wrote:
> > On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
> [...]
> >> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
> >>
> >>   static inline pte_t pte_wrprotect(pte_t pte)
> >>   {
> >> +	/*
> >> +	 * Blindly clearing _PAGE_RW might accidentally create
> >> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
> >> +	 * dirty value to the software bit.
> >> +	 */
> >> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> >> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
> >
> > Why the unreadable shifting when you can simply do:
> >
> >                  if (pte.pte & _PAGE_DIRTY)
> >                          pte.pte |= _PAGE_COW;
> >

> > ?
> 
> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
> 
> if (pte.pte & _PAGE_DIRTY) {
> 	pte.pte &= ~_PAGE_DIRTY;
> 	pte.pte |= _PAGE_COW;
> }
> 
> So, shifting makes resulting code more efficient.

Does the compiler manage to do one shift?

How can it clear anything?
There is only an |= against the target.

Something horrid with ^= might set and clear.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Randy Dunlap Jan. 21, 2021, 10:19 p.m. UTC | #9
On 1/21/21 2:16 PM, David Laight wrote:
> From: Yu, Yu-cheng 
>>
>> On 1/21/2021 10:44 AM, Borislav Petkov wrote:
>>> On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
>> [...]
>>>> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
>>>>
>>>>   static inline pte_t pte_wrprotect(pte_t pte)
>>>>   {
>>>> +	/*
>>>> +	 * Blindly clearing _PAGE_RW might accidentally create
>>>> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
>>>> +	 * dirty value to the software bit.
>>>> +	 */
>>>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>>>> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
>>>
>>> Why the unreadable shifting when you can simply do:
>>>
>>>                  if (pte.pte & _PAGE_DIRTY)
>>>                          pte.pte |= _PAGE_COW;
>>>
> 
>>> ?
>>
>> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
>>
>> if (pte.pte & _PAGE_DIRTY) {
>> 	pte.pte &= ~_PAGE_DIRTY;
>> 	pte.pte |= _PAGE_COW;
>> }
>>
>> So, shifting makes resulting code more efficient.
> 
> Does the compiler manage to do one shift?
> 
> How can it clear anything?

It could shift it off either end since there are both
<< and >>.

> There is only an |= against the target.
> 
> Something horrid with ^= might set and clear.
David Laight Jan. 21, 2021, 10:32 p.m. UTC | #10
From: Randy Dunlap
> Sent: 21 January 2021 22:19
> 
> On 1/21/21 2:16 PM, David Laight wrote:
> > From: Yu, Yu-cheng
> >>
> >> On 1/21/2021 10:44 AM, Borislav Petkov wrote:
> >>> On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
> >> [...]
> >>>> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
> >>>>
> >>>>   static inline pte_t pte_wrprotect(pte_t pte)
> >>>>   {
> >>>> +	/*
> >>>> +	 * Blindly clearing _PAGE_RW might accidentally create
> >>>> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
> >>>> +	 * dirty value to the software bit.
> >>>> +	 */
> >>>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
> >>>> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
> >>>
> >>> Why the unreadable shifting when you can simply do:
> >>>
> >>>                  if (pte.pte & _PAGE_DIRTY)
> >>>                          pte.pte |= _PAGE_COW;
> >>>
> >
> >>> ?
> >>
> >> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
> >>
> >> if (pte.pte & _PAGE_DIRTY) {
> >> 	pte.pte &= ~_PAGE_DIRTY;
> >> 	pte.pte |= _PAGE_COW;
> >> }
> >>
> >> So, shifting makes resulting code more efficient.
> >
> > Does the compiler manage to do one shift?
> >
> > How can it clear anything?
> 
> It could shift it off either end since there are both << and >>.

It is still:
	pte.pte |= xxxxxxx;

> > There is only an |= against the target.
> >
> > Something horrid with ^= might set and clear.

It could be 4 instructions:
	is_dirty = pte.pte & PAGE_DIRTY;
	pte.pte &= ~PAGE_DIRTY; // or pte.pte ^= is_dirty
	is_cow = is_dirty << (BIT_COW - BIT_DIRTY); // or equivalent >>
	pte.pte |= is_cow;
provided you've a three operand form for one of the first two instructions.
Something like ARM might manage to merge the last two as well.
But the register dependency chain length may matter more than
the number of instructions.
The above is likely to be three long.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Yu, Yu-cheng Jan. 22, 2021, 9:54 p.m. UTC | #11
On 1/21/2021 2:32 PM, David Laight wrote:
> From: Randy Dunlap
>> Sent: 21 January 2021 22:19
>>
>> On 1/21/21 2:16 PM, David Laight wrote:
>>> From: Yu, Yu-cheng
>>>>
>>>> On 1/21/2021 10:44 AM, Borislav Petkov wrote:
>>>>> On Tue, Dec 29, 2020 at 01:30:35PM -0800, Yu-cheng Yu wrote:
>>>> [...]
>>>>>> @@ -343,6 +349,16 @@ static inline pte_t pte_mkold(pte_t pte)
>>>>>>
>>>>>>    static inline pte_t pte_wrprotect(pte_t pte)
>>>>>>    {
>>>>>> +	/*
>>>>>> +	 * Blindly clearing _PAGE_RW might accidentally create
>>>>>> +	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
>>>>>> +	 * dirty value to the software bit.
>>>>>> +	 */
>>>>>> +	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
>>>>>> +		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
>>>>>
>>>>> Why the unreadable shifting when you can simply do:
>>>>>
>>>>>                   if (pte.pte & _PAGE_DIRTY)
>>>>>                           pte.pte |= _PAGE_COW;
>>>>>
>>>
>>>>> ?
>>>>
>>>> It clears _PAGE_DIRTY and sets _PAGE_COW.  That is,
>>>>
>>>> if (pte.pte & _PAGE_DIRTY) {
>>>> 	pte.pte &= ~_PAGE_DIRTY;
>>>> 	pte.pte |= _PAGE_COW;
>>>> }
>>>>
>>>> So, shifting makes resulting code more efficient.
>>>
>>> Does the compiler manage to do one shift?
>>>
>>> How can it clear anything?
>>
>> It could shift it off either end since there are both << and >>.
> 
> It is still:
> 	pte.pte |= xxxxxxx;
> 
>>> There is only an |= against the target.
>>>
>>> Something horrid with ^= might set and clear.
> 
> It could be 4 instructions:
> 	is_dirty = pte.pte & PAGE_DIRTY;
> 	pte.pte &= ~PAGE_DIRTY; // or pte.pte ^= is_dirty
> 	is_cow = is_dirty << (BIT_COW - BIT_DIRTY); // or equivalent >>
> 	pte.pte |= is_cow;
> provided you've a three operand form for one of the first two instructions.
> Something like ARM might manage to merge the last two as well.
> But the register dependency chain length may matter more than
> the number of instructions.
> The above is likely to be three long.

I see what you are saying.  The patch is like...

	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
		pte = pte_clear_flags(pte, _PAGE_DIRTY);
	}

It is not necessary to do the shifting.  I will make it, simply,

if (pte.pte & _PAGE_DIRTY) {
	pte.pte &= ~PAGE_DIRTY;
	pte.pte |= _PAGE_COW;
}

Thanks for your comments.

--
Yu-cheng
diff mbox series

Patch

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..61e4d3b17d87 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -121,9 +121,9 @@  extern pmdval_t early_pmd_flags;
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_DIRTY;
+	return pte_flags(pte) & _PAGE_DIRTY_BITS;
 }
 
 
@@ -160,9 +160,9 @@  static inline int pte_young(pte_t pte)
 	return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
 {
-	return pmd_flags(pmd) & _PAGE_DIRTY;
+	return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
 }
 
 static inline int pmd_young(pmd_t pmd)
@@ -170,9 +170,9 @@  static inline int pmd_young(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_ACCESSED;
 }
 
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
 {
-	return pud_flags(pud) & _PAGE_DIRTY;
+	return pud_flags(pud) & _PAGE_DIRTY_BITS;
 }
 
 static inline int pud_young(pud_t pud)
@@ -182,6 +182,12 @@  static inline int pud_young(pud_t pud)
 
 static inline int pte_write(pte_t pte)
 {
+	/*
+	 * If _PAGE_DIRTY is set, the PTE must either have _PAGE_RW or be
+	 * a shadow stack PTE, which is logically writable.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK))
+		return pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY);
 	return pte_flags(pte) & _PAGE_RW;
 }
 
@@ -333,7 +339,7 @@  static inline pte_t pte_clear_uffd_wp(pte_t pte)
 
 static inline pte_t pte_mkclean(pte_t pte)
 {
-	return pte_clear_flags(pte, _PAGE_DIRTY);
+	return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
 }
 
 static inline pte_t pte_mkold(pte_t pte)
@@ -343,6 +349,16 @@  static inline pte_t pte_mkold(pte_t pte)
 
 static inline pte_t pte_wrprotect(pte_t pte)
 {
+	/*
+	 * Blindly clearing _PAGE_RW might accidentally create
+	 * a shadow stack PTE (RW=0, Dirty=1).  Move the hardware
+	 * dirty value to the software bit.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		pte.pte |= (pte.pte & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
+		pte = pte_clear_flags(pte, _PAGE_DIRTY);
+	}
+
 	return pte_clear_flags(pte, _PAGE_RW);
 }
 
@@ -353,6 +369,18 @@  static inline pte_t pte_mkexec(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
+	pteval_t dirty = _PAGE_DIRTY;
+
+	/* Avoid creating (HW)Dirty=1, Write=0 PTEs */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !pte_write(pte))
+		dirty = _PAGE_COW;
+
+	return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+	pte = pte_clear_flags(pte, _PAGE_COW);
 	return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
@@ -363,6 +391,13 @@  static inline pte_t pte_mkyoung(pte_t pte)
 
 static inline pte_t pte_mkwrite(pte_t pte)
 {
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		if (pte_flags(pte) & _PAGE_COW) {
+			pte = pte_clear_flags(pte, _PAGE_COW);
+			pte = pte_set_flags(pte, _PAGE_DIRTY);
+		}
+	}
+
 	return pte_set_flags(pte, _PAGE_RW);
 }
 
@@ -434,16 +469,40 @@  static inline pmd_t pmd_mkold(pmd_t pmd)
 
 static inline pmd_t pmd_mkclean(pmd_t pmd)
 {
-	return pmd_clear_flags(pmd, _PAGE_DIRTY);
+	return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
 }
 
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
+	/*
+	 * Blindly clearing _PAGE_RW might accidentally create
+	 * a shadow stack PMD (RW=0, Dirty=1).  Move the hardware
+	 * dirty value to the software bit.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		pmdval_t v = native_pmd_val(pmd);
+
+		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
+		pmd = pmd_clear_flags(__pmd(v), _PAGE_DIRTY);
+	}
+
 	return pmd_clear_flags(pmd, _PAGE_RW);
 }
 
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
+	pmdval_t dirty = _PAGE_DIRTY;
+
+	/* Avoid creating (HW)Dirty=1, Write=0 PMDs */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !(pmd_flags(pmd) & _PAGE_RW))
+		dirty = _PAGE_COW;
+
+	return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+	pmd = pmd_clear_flags(pmd, _PAGE_COW);
 	return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
 }
 
@@ -464,6 +523,13 @@  static inline pmd_t pmd_mkyoung(pmd_t pmd)
 
 static inline pmd_t pmd_mkwrite(pmd_t pmd)
 {
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		if (pmd_flags(pmd) & _PAGE_COW) {
+			pmd = pmd_clear_flags(pmd, _PAGE_COW);
+			pmd = pmd_set_flags(pmd, _PAGE_DIRTY);
+		}
+	}
+
 	return pmd_set_flags(pmd, _PAGE_RW);
 }
 
@@ -488,17 +554,35 @@  static inline pud_t pud_mkold(pud_t pud)
 
 static inline pud_t pud_mkclean(pud_t pud)
 {
-	return pud_clear_flags(pud, _PAGE_DIRTY);
+	return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
 }
 
 static inline pud_t pud_wrprotect(pud_t pud)
 {
+	/*
+	 * Blindly clearing _PAGE_RW might accidentally create
+	 * a shadow stack PUD (RW=0, Dirty=1).  Move the hardware
+	 * dirty value to the software bit.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		pudval_t v = native_pud_val(pud);
+
+		v |= (v & _PAGE_DIRTY) >> _PAGE_BIT_DIRTY << _PAGE_BIT_COW;
+		pud = pud_clear_flags(__pud(v), _PAGE_DIRTY);
+	}
+
 	return pud_clear_flags(pud, _PAGE_RW);
 }
 
 static inline pud_t pud_mkdirty(pud_t pud)
 {
-	return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+	pudval_t dirty = _PAGE_DIRTY;
+
+	/* Avoid creating (HW)Dirty=1, Write=0 PUDs */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !(pud_flags(pud) & _PAGE_RW))
+		dirty = _PAGE_COW;
+
+	return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
 }
 
 static inline pud_t pud_mkdevmap(pud_t pud)
@@ -518,6 +602,13 @@  static inline pud_t pud_mkyoung(pud_t pud)
 
 static inline pud_t pud_mkwrite(pud_t pud)
 {
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+		if (pud_flags(pud) & _PAGE_COW) {
+			pud = pud_clear_flags(pud, _PAGE_COW);
+			pud = pud_set_flags(pud, _PAGE_DIRTY);
+		}
+	}
+
 	return pud_set_flags(pud, _PAGE_RW);
 }
 
@@ -1131,6 +1222,12 @@  extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
+	/*
+	 * If _PAGE_DIRTY is set, then the PMD must either have _PAGE_RW or
+	 * be a shadow stack PMD, which is logically writable.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_SHSTK))
+		return pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY);
 	return pmd_flags(pmd) & _PAGE_RW;
 }
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8b79d618bbc..b02084a181c2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,7 +23,8 @@ 
 #define _PAGE_BIT_SOFTW2	10	/* " */
 #define _PAGE_BIT_SOFTW3	11	/* " */
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
-#define _PAGE_BIT_SOFTW4	58	/* available for programmer */
+#define _PAGE_BIT_SOFTW4	57	/* available for programmer */
+#define _PAGE_BIT_SOFTW5	58	/* available for programmer */
 #define _PAGE_BIT_PKEY_BIT0	59	/* Protection Keys, bit 1/4 */
 #define _PAGE_BIT_PKEY_BIT1	60	/* Protection Keys, bit 2/4 */
 #define _PAGE_BIT_PKEY_BIT2	61	/* Protection Keys, bit 3/4 */
@@ -36,6 +37,15 @@ 
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
+/*
+ * Indicates a copy-on-write page.
+ */
+#ifdef CONFIG_X86_CET_USER
+#define _PAGE_BIT_COW		_PAGE_BIT_SOFTW5 /* copy-on-write */
+#else
+#define _PAGE_BIT_COW		0
+#endif
+
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -117,6 +127,36 @@ 
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #endif
 
+/*
+ * The hardware requires shadow stack to be read-only and Dirty.
+ * _PAGE_COW is a software-only bit used to separate copy-on-write PTEs
+ * from shadow stack PTEs:
+ * (a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
+ * (b) A R/O page that has been COW'ed: (Write=0, Cow=1)
+ *     The user page is in a R/O VMA, and get_user_pages() needs a
+ *     writable copy.  The page fault handler creates a copy of the page
+ *     and sets the new copy's PTE as Write=0, Cow=1.
+ * (c) A shadow stack PTE: (Write=0, Dirty=1)
+ * (d) A shared (copy-on-access) shadow stack PTE: (Write=0, Cow=1)
+ *     When a shadow stack page is being shared among processes (this
+ *     happens at fork()), its PTE is cleared of _PAGE_DIRTY, so the next
+ *     shadow stack access causes a fault, and the page is duplicated and
+ *     _PAGE_DIRTY is set again.  This is the COW equivalent for shadow
+ *     stack pages, even though it's copy-on-access rather than
+ *     copy-on-write.
+ * (e) A page where the processor observed a Write=1 PTE, started a write,
+ *     set Dirty=1, but then observed a Write=0 PTE (changed by another
+ *     thread).  That's possible today, but will not happen on processors
+ *     that support shadow stack.
+ */
+#ifdef CONFIG_X86_CET_USER
+#define _PAGE_COW	(_AT(pteval_t, 1) << _PAGE_BIT_COW)
+#else
+#define _PAGE_COW	(_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_COW)
+
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 /*