Re: [PATCH v6 03/11] mm, x86: Add support for eXclusive Page Frame Ownership (XPFO)

Message ID 20170912181303.aqjj5ri3mhscw63t@docker (mailing list archive)

Commit Message

Tycho Andersen Sept. 12, 2017, 6:13 p.m. UTC
Hi Yisheng,

> On Tue, Sep 12, 2017 at 04:05:22PM +0800, Yisheng Xie wrote:
> > IMO, before a page is allocated, it is in the buddy system, which means it is free
> > and has no 'map' other than the direct map. Then if the page is allocated to
> > userspace, XPFO should unmap the direct map; otherwise ret2dir may work in this
> > window before the page is freed. Or maybe I'm still missing something.
> 
> I agree that it seems broken. I'm just not sure why the test doesn't
> fail. It's certainly worth understanding.

Ok, so I think what's going on is that the page *is* mapped and unmapped by the
kernel as Juerg described, but only in certain cases. See prep_new_page(),
which has the following:

	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
		for (i = 0; i < (1 << order); i++)
			clear_highpage(page + i);

clear_highpage() maps and unmaps the pages, so that's why XPFO works with this
patch set.
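
For reference, clear_highpage() boils down to a kmap_atomic()/kunmap_atomic()
pair around clear_page() (sketching from include/linux/highmem.h, modulo the
various config variants):

	static inline void clear_highpage(struct page *page)
	{
		void *kaddr = kmap_atomic(page);
		clear_page(kaddr);
		kunmap_atomic(kaddr);
	}

and kmap_atomic()/kunmap_atomic() are where xpfo_kmap()/xpfo_kunmap() hook in,
so the page is temporarily mapped back into the direct map for the zeroing and
then unmapped again right after.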

I tried with CONFIG_PAGE_POISONING_ZERO=y and page_poison=y, and the
XPFO_READ_USER test does not fail, i.e. the read succeeds. So, I think we need
to include this zeroing condition in xpfo_alloc_pages(), something like the
patch below. Unfortunately, this fails to boot for me, probably for an
unrelated reason that I'll look into.
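
For reference, XPFO_READ_USER is the lkdtm test from this series; it gets
triggered the usual lkdtm way, roughly:

	echo XPFO_READ_USER > /sys/kernel/debug/provoke-crash/DIRECT

and with XPFO doing its job, that read should fault instead of succeeding.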

Thanks a lot!

Tycho


From bfc21a6438cf8c56741af94cac939f1b0f63752c Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@docker.com>
Date: Tue, 12 Sep 2017 12:06:41 -0600
Subject: [PATCH] draft of unmapping patch

Signed-off-by: Tycho Andersen <tycho@docker.com>
---
 include/linux/xpfo.h |  5 +++--
 mm/compaction.c      |  2 +-
 mm/internal.h        |  2 +-
 mm/page_alloc.c      | 10 ++++++----
 mm/xpfo.c            | 10 ++++++++--
 5 files changed, 19 insertions(+), 10 deletions(-)

Comments

Xie Yisheng Sept. 14, 2017, 6:15 a.m. UTC | #1
Hi Tycho,

On 2017/9/13 2:13, Tycho Andersen wrote:
> Hi Yisheng,
> 
>> On Tue, Sep 12, 2017 at 04:05:22PM +0800, Yisheng Xie wrote:
>>> IMO, before a page is allocated, it is in the buddy system, which means it is free
>>> and has no 'map' other than the direct map. Then if the page is allocated to
>>> userspace, XPFO should unmap the direct map; otherwise ret2dir may work in this
>>> window before the page is freed. Or maybe I'm still missing something.
>>
>> I agree that it seems broken. I'm just not sure why the test doesn't
>> fail. It's certainly worth understanding.
> 
> Ok, so I think what's going on is that the page *is* mapped and unmapped by the
> kernel as Juerg described, but only in certain cases. See prep_new_page(),
> which has the following:
> 
> 	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
> 		for (i = 0; i < (1 << order); i++)
> 			clear_highpage(page + i);
> 
> clear_highpage() maps and unmaps the pages, so that's why XPFO works with this
> patch set.
Oh, I really missed this point: we need to zero the memory before the user gets it.

Thanks a lot for figuring this out.

> 
> I tried with CONFIG_PAGE_POISONING_ZERO=y and page_poison=y, and the
> XPFO_READ_USER test does not fail, i.e. the read succeeds. So, I think we need
> to include this zeroing condition in xpfo_alloc_pages(), something like the
> patch below. Unfortunately, this fails to boot for me, probably for an
> unrelated reason that I'll look into.
Yes, it seems this case needs a fix, and I'm also a little puzzled about why the boot fails.

Thanks
Yisheng Xie

> 
> Thanks a lot!
> 
> Tycho
> 
> 
> From bfc21a6438cf8c56741af94cac939f1b0f63752c Mon Sep 17 00:00:00 2001
> From: Tycho Andersen <tycho@docker.com>
> Date: Tue, 12 Sep 2017 12:06:41 -0600
> Subject: [PATCH] draft of unmapping patch
> 
> Signed-off-by: Tycho Andersen <tycho@docker.com>
> ---
>  include/linux/xpfo.h |  5 +++--
>  mm/compaction.c      |  2 +-
>  mm/internal.h        |  2 +-
>  mm/page_alloc.c      | 10 ++++++----
>  mm/xpfo.c            | 10 ++++++++--
>  5 files changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
> index b24be9ac4a2d..c991bf7f051d 100644
> --- a/include/linux/xpfo.h
> +++ b/include/linux/xpfo.h
> @@ -29,7 +29,7 @@ void xpfo_flush_kernel_tlb(struct page *page, int order);
>  
>  void xpfo_kmap(void *kaddr, struct page *page);
>  void xpfo_kunmap(void *kaddr, struct page *page);
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp);
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map);
>  void xpfo_free_pages(struct page *page, int order);
>  
>  bool xpfo_page_is_unmapped(struct page *page);
> @@ -49,7 +49,8 @@ void xpfo_temp_unmap(const void *addr, size_t size, void **mapping,
>  
>  static inline void xpfo_kmap(void *kaddr, struct page *page) { }
>  static inline void xpfo_kunmap(void *kaddr, struct page *page) { }
> -static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp) { }
> +static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp,
> +				    bool will_map) { }
>  static inline void xpfo_free_pages(struct page *page, int order) { }
>  
>  static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
> diff --git a/mm/compaction.c b/mm/compaction.c
> index fb548e4c7bd4..9a222258e65c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -76,7 +76,7 @@ static void map_pages(struct list_head *list)
>  		order = page_private(page);
>  		nr_pages = 1 << order;
>  
> -		post_alloc_hook(page, order, __GFP_MOVABLE);
> +		post_alloc_hook(page, order, __GFP_MOVABLE, false);
>  		if (order)
>  			split_page(page, order);
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index 4ef49fc55e58..1a0331ec2b2d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -165,7 +165,7 @@ extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
>  					unsigned int order);
>  extern void prep_compound_page(struct page *page, unsigned int order);
>  extern void post_alloc_hook(struct page *page, unsigned int order,
> -					gfp_t gfp_flags);
> +					gfp_t gfp_flags, bool will_map);
>  extern int user_min_free_kbytes;
>  
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 09fdf1bad21f..f73809847c58 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1750,7 +1750,7 @@ static bool check_new_pages(struct page *page, unsigned int order)
>  }
>  
>  inline void post_alloc_hook(struct page *page, unsigned int order,
> -				gfp_t gfp_flags)
> +				gfp_t gfp_flags, bool will_map)
>  {
>  	set_page_private(page, 0);
>  	set_page_refcounted(page);
> @@ -1759,18 +1759,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	kernel_map_pages(page, 1 << order, 1);
>  	kernel_poison_pages(page, 1 << order, 1);
>  	kasan_alloc_pages(page, order);
> -	xpfo_alloc_pages(page, order, gfp_flags);
> +	xpfo_alloc_pages(page, order, gfp_flags, will_map);
>  	set_page_owner(page, order, gfp_flags);
>  }
>  
> +extern bool xpfo_test;
>  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>  							unsigned int alloc_flags)
>  {
>  	int i;
> +	bool needs_zero = !free_pages_prezeroed() && (gfp_flags & __GFP_ZERO);
>  
> -	post_alloc_hook(page, order, gfp_flags);
> +	post_alloc_hook(page, order, gfp_flags, needs_zero);
>  
> -	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
> +	if (needs_zero)
>  		for (i = 0; i < (1 << order); i++)
>  			clear_highpage(page + i);
>  
> diff --git a/mm/xpfo.c b/mm/xpfo.c
> index ca5d4d1838f9..dd25e24213fe 100644
> --- a/mm/xpfo.c
> +++ b/mm/xpfo.c
> @@ -86,7 +86,7 @@ static inline struct xpfo *lookup_xpfo(struct page *page)
>  	return (void *)page_ext + page_xpfo_ops.offset;
>  }
>  
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
>  {
>  	int i, flush_tlb = 0;
>  	struct xpfo *xpfo;
> @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
>  			 * Tag the page as a user page and flush the TLB if it
>  			 * was previously allocated to the kernel.
>  			 */
> -			if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> +			bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> +							  &xpfo->flags);
> +
> +			if (was_user || !will_map) {
> +				set_kpte(page_address(page + i), page + i,
> +					 __pgprot(0));
>  				flush_tlb = 1;
> +			}
>  		} else {
>  			/* Tag the page as a non-user (kernel) page */
>  			clear_bit(XPFO_PAGE_USER, &xpfo->flags);
>
Dave Hansen Sept. 20, 2017, 11:46 p.m. UTC | #2
On 09/12/2017 11:13 AM, Tycho Andersen wrote:
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
>  {
>  	int i, flush_tlb = 0;
>  	struct xpfo *xpfo;
> @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
>  			 * Tag the page as a user page and flush the TLB if it
>  			 * was previously allocated to the kernel.
>  			 */
> -			if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> +			bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> +							  &xpfo->flags);
> +
> +			if (was_user || !will_map) {
> +				set_kpte(page_address(page + i), page + i,
> +					 __pgprot(0));
>  				flush_tlb = 1;
> +			}

Shouldn't the "was_user" be "was_kernel"?

Also, the way this now works, let's say we have a nice, 2MB pmd_t (page
table entry) mapping a nice, 2MB page in the allocator.  Then it gets
allocated to userspace.  We do

	for (i = 0; i < (1 << order); i++)  {
		...
		set_kpte(page_address(page + i), page+i, __pgprot(0));
	}

The set_kpte() will take the nice, 2MB mapping and break it down into
512 4k mappings, all pointing to a non-present PTE, in a newly-allocated
PTE page.  So, you get the same result and waste 4k of memory in the
process, *AND* make it slower because we added a level to the page tables.

I think you actually want to make a single set_kpte() call at the end of
the function.  That's faster and preserves the large page in the direct
mapping.
Tycho Andersen Sept. 21, 2017, 12:02 a.m. UTC | #3
On Wed, Sep 20, 2017 at 04:46:41PM -0700, Dave Hansen wrote:
> On 09/12/2017 11:13 AM, Tycho Andersen wrote:
> > -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> > +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
> >  {
> >  	int i, flush_tlb = 0;
> >  	struct xpfo *xpfo;
> > @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> >  			 * Tag the page as a user page and flush the TLB if it
> >  			 * was previously allocated to the kernel.
> >  			 */
> > -			if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> > +			bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> > +							  &xpfo->flags);
> > +
> > +			if (was_user || !will_map) {
> > +				set_kpte(page_address(page + i), page + i,
> > +					 __pgprot(0));
> >  				flush_tlb = 1;
> > +			}
> 
> Shouldn't the "was_user" be "was_kernel"?

Oof, yes, thanks.

> Also, the way this now works, let's say we have a nice, 2MB pmd_t (page
> table entry) mapping a nice, 2MB page in the allocator.  Then it gets
> allocated to userspace.  We do
> 
> 	for (i = 0; i < (1 << order); i++)  {
> 		...
> 		set_kpte(page_address(page + i), page+i, __pgprot(0));
> 	}
> 
> The set_kpte() will take the nice, 2MB mapping and break it down into
> 512 4k mappings, all pointing to a non-present PTE, in a newly-allocated
> PTE page.  So, you get the same result and waste 4k of memory in the
> process, *AND* make it slower because we added a level to the page tables.
> 
> I think you actually want to make a single set_kpte() call at the end of
> the function.  That's faster and preserves the large page in the direct
> mapping.

...and makes it easier to pair tlb flushes with changing the
protections. I guess we still need the for loop, because we need to
set/unset the xpfo bits as necessary, but I'll switch it to a single
set_kpte(). This also implies that the xpfo bits should all be the
same on every page in the mapping, which I think is true.
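
Something like this, maybe (completely untested sketch; the user-page check is
just a stand-in for whatever condition the current code uses, and it assumes
set_kpte() can cover the whole 1 << order allocation starting at the head page,
or else grows an order argument):

	bool do_unmap = false;

	for (i = 0; i < (1 << order); i++) {
		xpfo = lookup_xpfo(page + i);
		if (!xpfo)
			continue;

		if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) {
			/* User page: note whether it was a kernel page before. */
			bool was_kernel = !test_and_set_bit(XPFO_PAGE_USER,
							    &xpfo->flags);

			if (was_kernel || !will_map)
				do_unmap = true;
		} else {
			/* Tag the page as a non-user (kernel) page. */
			clear_bit(XPFO_PAGE_USER, &xpfo->flags);
		}
	}

	if (do_unmap) {
		/* One unmap plus one TLB flush for the whole allocation. */
		set_kpte(page_address(page), page, __pgprot(0));
		xpfo_flush_kernel_tlb(page, order);
	}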

This will be a nice change, thanks!

Tycho
Dave Hansen Sept. 21, 2017, 12:04 a.m. UTC | #4
On 09/20/2017 05:02 PM, Tycho Andersen wrote:
> ...and makes it easier to pair tlb flushes with changing the
> protections. I guess we still need the for loop, because we need to
> set/unset the xpfo bits as necessary, but I'll switch it to a single
> set_kpte(). This also implies that the xpfo bits should all be the
> same on every page in the mapping, which I think is true.

FWIW, it's a bit bonkers to keep all this duplicate xpfo metadata for
compound pages.  You could probably get away with only keeping it for
the head page.
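
Roughly, just to illustrate the idea (untested, and with the existing error
handling elided):

	static inline struct xpfo *lookup_xpfo(struct page *page)
	{
		struct page_ext *page_ext;

		/* Keep XPFO state only on the head page of a compound page. */
		page = compound_head(page);

		page_ext = lookup_page_ext(page);
		if (unlikely(!page_ext))
			return NULL;

		return (void *)page_ext + page_xpfo_ops.offset;
	}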

Patch

diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
index b24be9ac4a2d..c991bf7f051d 100644
--- a/include/linux/xpfo.h
+++ b/include/linux/xpfo.h
@@ -29,7 +29,7 @@  void xpfo_flush_kernel_tlb(struct page *page, int order);
 
 void xpfo_kmap(void *kaddr, struct page *page);
 void xpfo_kunmap(void *kaddr, struct page *page);
-void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp);
+void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map);
 void xpfo_free_pages(struct page *page, int order);
 
 bool xpfo_page_is_unmapped(struct page *page);
@@ -49,7 +49,8 @@  void xpfo_temp_unmap(const void *addr, size_t size, void **mapping,
 
 static inline void xpfo_kmap(void *kaddr, struct page *page) { }
 static inline void xpfo_kunmap(void *kaddr, struct page *page) { }
-static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp) { }
+static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp,
+				    bool will_map) { }
 static inline void xpfo_free_pages(struct page *page, int order) { }
 
 static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
diff --git a/mm/compaction.c b/mm/compaction.c
index fb548e4c7bd4..9a222258e65c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -76,7 +76,7 @@  static void map_pages(struct list_head *list)
 		order = page_private(page);
 		nr_pages = 1 << order;
 
-		post_alloc_hook(page, order, __GFP_MOVABLE);
+		post_alloc_hook(page, order, __GFP_MOVABLE, false);
 		if (order)
 			split_page(page, order);
 
diff --git a/mm/internal.h b/mm/internal.h
index 4ef49fc55e58..1a0331ec2b2d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -165,7 +165,7 @@  extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
 					unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
-					gfp_t gfp_flags);
+					gfp_t gfp_flags, bool will_map);
 extern int user_min_free_kbytes;
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 09fdf1bad21f..f73809847c58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1750,7 +1750,7 @@  static bool check_new_pages(struct page *page, unsigned int order)
 }
 
 inline void post_alloc_hook(struct page *page, unsigned int order,
-				gfp_t gfp_flags)
+				gfp_t gfp_flags, bool will_map)
 {
 	set_page_private(page, 0);
 	set_page_refcounted(page);
@@ -1759,18 +1759,20 @@  inline void post_alloc_hook(struct page *page, unsigned int order,
 	kernel_map_pages(page, 1 << order, 1);
 	kernel_poison_pages(page, 1 << order, 1);
 	kasan_alloc_pages(page, order);
-	xpfo_alloc_pages(page, order, gfp_flags);
+	xpfo_alloc_pages(page, order, gfp_flags, will_map);
 	set_page_owner(page, order, gfp_flags);
 }
 
+extern bool xpfo_test;
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 							unsigned int alloc_flags)
 {
 	int i;
+	bool needs_zero = !free_pages_prezeroed() && (gfp_flags & __GFP_ZERO);
 
-	post_alloc_hook(page, order, gfp_flags);
+	post_alloc_hook(page, order, gfp_flags, needs_zero);
 
-	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
+	if (needs_zero)
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
diff --git a/mm/xpfo.c b/mm/xpfo.c
index ca5d4d1838f9..dd25e24213fe 100644
--- a/mm/xpfo.c
+++ b/mm/xpfo.c
@@ -86,7 +86,7 @@  static inline struct xpfo *lookup_xpfo(struct page *page)
 	return (void *)page_ext + page_xpfo_ops.offset;
 }
 
-void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
+void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
 {
 	int i, flush_tlb = 0;
 	struct xpfo *xpfo;
@@ -116,8 +116,14 @@  void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
 			 * Tag the page as a user page and flush the TLB if it
 			 * was previously allocated to the kernel.
 			 */
-			if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
+			bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
+							  &xpfo->flags);
+
+			if (was_user || !will_map) {
+				set_kpte(page_address(page + i), page + i,
+					 __pgprot(0));
 				flush_tlb = 1;
+			}
 		} else {
 			/* Tag the page as a non-user (kernel) page */
 			clear_bit(XPFO_PAGE_USER, &xpfo->flags);