diff mbox

[RFC,v2,16/27] mm: Modify can_follow_write_pte/pmd for shadow stack

Message ID 1531868610.3541.21.camel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yu-cheng Yu July 17, 2018, 11:03 p.m. UTC
On Fri, 2018-07-13 at 11:26 -0700, Dave Hansen wrote:
> On 07/11/2018 10:05 AM, Yu-cheng Yu wrote:
> > 
> > My understanding is that we don't want to follow write pte if the page
> > is shared as read-only.  For a SHSTK page, that is (R/O + DIRTY_SW),
> > which means the SHSTK page has not been COW'ed.  Is that right?
> Let's look at the code again:
> 
> > 
> > -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
> > +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags,
> > +					bool shstk)
> >  {
> > +	bool pte_cowed = shstk ? is_shstk_pte(pte):pte_dirty(pte);
> > +
> >  	return pte_write(pte) ||
> > -		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
> > +		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_cowed);
> >  }
> This is another case where the naming of pte_*() is biting us vs. the
> perversion of the PTE bits.  The lack of comments and explanation in the
> patch is compounding the confusion.
> 
> We need to find a way to differentiate "someone can write to this PTE"
> from "the write bit is set in this PTE".
> 
> In this particular hunk, we need to make it clear that pte_write() is
> *never* true for shadowstack PTEs.  In other words, shadow stack VMAs
> will (should?) never even *see* a pte_write() PTE.
> 
> I think this is a case where you just need to bite the bullet and
> bifurcate can_follow_write_pte().  Just separate the shadowstack and
> non-shadowstack parts.

In case I have not understood the exact issue, what about the following?

Comments

Dave Hansen July 17, 2018, 11:11 p.m. UTC | #1
On 07/17/2018 04:03 PM, Yu-cheng Yu wrote:
> On Fri, 2018-07-13 at 11:26 -0700, Dave Hansen wrote:
>> On 07/11/2018 10:05 AM, Yu-cheng Yu wrote:
>>>
>>> My understanding is that we don't want to follow write pte if the page
>>> is shared as read-only.  For a SHSTK page, that is (R/O + DIRTY_SW),
>>> which means the SHSTK page has not been COW'ed.  Is that right?
>> Let's look at the code again:
>>
>>>
>>> -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
>>> +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags,
>>> +					bool shstk)
>>>  {
>>> +	bool pte_cowed = shstk ? is_shstk_pte(pte):pte_dirty(pte);
>>> +
>>>  	return pte_write(pte) ||
>>> -		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
>>> +		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_cowed);
>>>  }
>> This is another case where the naming of pte_*() is biting us vs. the
>> perversion of the PTE bits.  The lack of comments and explanation in the
>> patch is compounding the confusion.
>>
>> We need to find a way to differentiate "someone can write to this PTE"
>> from "the write bit is set in this PTE".
>>
>> In this particular hunk, we need to make it clear that pte_write() is
>> *never* true for shadowstack PTEs.  In other words, shadow stack VMAs
>> will (should?) never even *see* a pte_write() PTE.
>>
>> I think this is a case where you just need to bite the bullet and
>> bifurcate can_follow_write_pte().  Just separate the shadowstack and
>> non-shadowstack parts.
> 
> In case I don't understand the exact issue.
> What about the following.
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index fc5f98069f4e..45a0837b27f9 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -70,6 +70,12 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
>  		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
>  }
>  
> +static inline bool can_follow_write_shstk_pte(pte_t pte, unsigned int flags)
> +{
> +	return ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
> +		is_shstk_pte(pte));
> +}
> +
>  static struct page *follow_page_pte(struct vm_area_struct *vma,
>  		unsigned long address, pmd_t *pmd, unsigned int flags)
>  {
> @@ -105,9 +111,16 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>  	}
>  	if ((flags & FOLL_NUMA) && pte_protnone(pte))
>  		goto no_page;
> -	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
> -		pte_unmap_unlock(ptep, ptl);
> -		return NULL;
> +	if (flags & FOLL_WRITE) {
> +		if (is_shstk_mapping(vma->vm_flags)) {
> +			if (!can_follow_write_shstk_pte(pte, flags)) {
> +				pte_unmap_unlock(ptep, ptl);
> +				return NULL;
> +			}
> +		} else if (!can_follow_write_pte(pte, flags)) {
> +			pte_unmap_unlock(ptep, ptl);
> +			return NULL;
> +		}

That looks pretty horrible. :(

We need:

bool can_follow_write(vma, pte_t pte, unsigned int flags)
{
	if (!is_shstk_mapping(vma->vm_flags)) {
		// vanilla case here		
	} else {
		// shadowstack case here
	}
}
Dave Hansen July 17, 2018, 11:15 p.m. UTC | #2
On 07/17/2018 04:03 PM, Yu-cheng Yu wrote:
> We need to find a way to differentiate "someone can write to this PTE"
> from "the write bit is set in this PTE".

Please think about this:

	Should pte_write() tell us whether PTE.W=1, or should it tell us
	that *something* can write to the PTE, which would include
	PTE.W=0/D=1?
diff mbox

Patch

diff --git a/mm/gup.c b/mm/gup.c
index fc5f98069f4e..45a0837b27f9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -70,6 +70,12 @@  static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
 		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
 }
 
+static inline bool can_follow_write_shstk_pte(pte_t pte, unsigned int flags)
+{
+	return ((flags & FOLL_FORCE) && (flags & FOLL_COW) &&
+		is_shstk_pte(pte));
+}
+
 static struct page *follow_page_pte(struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd, unsigned int flags)
 {
@@ -105,9 +111,16 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	}
 	if ((flags & FOLL_NUMA) && pte_protnone(pte))
 		goto no_page;
-	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
-		pte_unmap_unlock(ptep, ptl);
-		return NULL;
+	if (flags & FOLL_WRITE) {
+		if (is_shstk_mapping(vma->vm_flags)) {
+			if (!can_follow_write_shstk_pte(pte, flags)) {
+				pte_unmap_unlock(ptep, ptl);
+				return NULL;
+			}
+		} else if (!can_follow_write_pte(pte, flags)) {
+			pte_unmap_unlock(ptep, ptl);
+			return NULL;
+		}
 	}
 
 	page = vm_normal_page(vma, address, pte);