Message ID | 20170912181303.aqjj5ri3mhscw63t@docker (mailing list archive)
---|---
State | New, archived
Hi Tycho,

On 2017/9/13 2:13, Tycho Andersen wrote:
> Hi Yisheng,
>
>> On Tue, Sep 12, 2017 at 04:05:22PM +0800, Yisheng Xie wrote:
>>> IMO, before a page is allocated, it is in the buddy system, which means
>>> it is free and there is no 'map' on the page other than the direct map.
>>> Then, if the page is allocated to userspace, XPFO should unmap the direct
>>> map; otherwise ret2dir may work in this window before the page is freed.
>>> Or maybe I'm still missing something.
>>
>> I agree that it seems broken. I'm just not sure why the test doesn't
>> fail. It's certainly worth understanding.
>
> Ok, so I think what's going on is that the page *is* mapped and unmapped by the
> kernel as Juerg described, but only in certain cases. See prep_new_page(),
> which has the following:
>
> 	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
> 		for (i = 0; i < (1 << order); i++)
> 			clear_highpage(page + i);
>
> clear_highpage() maps and unmaps the pages, so that's why xpfo works with this
> set.

Oh, I really missed this point: we need to zero the memory before the user
gets it. Thanks a lot for figuring this out.

> I tried with CONFIG_PAGE_POISONING_ZERO=y and page_poison=y, and the
> XPFO_READ_USER test does not fail, i.e. the read succeeds. So, I think we need
> to include this zeroing condition in xpfo_alloc_pages(), something like the
> patch below. Unfortunately, this fails to boot for me, probably for an
> unrelated reason that I'll look into.

Yes, it seems this case needs to be fixed, and I'm also a little puzzled
about why the boot fails.

Thanks
Yisheng Xie

> Thanks a lot!
>
> Tycho
>
> From bfc21a6438cf8c56741af94cac939f1b0f63752c Mon Sep 17 00:00:00 2001
> From: Tycho Andersen <tycho@docker.com>
> Date: Tue, 12 Sep 2017 12:06:41 -0600
> Subject: [PATCH] draft of unmapping patch
>
> Signed-off-by: Tycho Andersen <tycho@docker.com>
> ---
>  include/linux/xpfo.h |  5 +++--
>  mm/compaction.c      |  2 +-
>  mm/internal.h        |  2 +-
>  mm/page_alloc.c      | 10 ++++++----
>  mm/xpfo.c            | 10 ++++++++--
>  5 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/xpfo.h b/include/linux/xpfo.h
> index b24be9ac4a2d..c991bf7f051d 100644
> --- a/include/linux/xpfo.h
> +++ b/include/linux/xpfo.h
> @@ -29,7 +29,7 @@ void xpfo_flush_kernel_tlb(struct page *page, int order);
>
>  void xpfo_kmap(void *kaddr, struct page *page);
>  void xpfo_kunmap(void *kaddr, struct page *page);
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp);
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map);
>  void xpfo_free_pages(struct page *page, int order);
>
>  bool xpfo_page_is_unmapped(struct page *page);
> @@ -49,7 +49,8 @@ void xpfo_temp_unmap(const void *addr, size_t size, void **mapping,
>
>  static inline void xpfo_kmap(void *kaddr, struct page *page) { }
>  static inline void xpfo_kunmap(void *kaddr, struct page *page) { }
> -static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp) { }
> +static inline void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp,
> +				    bool will_map) { }
>  static inline void xpfo_free_pages(struct page *page, int order) { }
>
>  static inline bool xpfo_page_is_unmapped(struct page *page) { return false; }
> diff --git a/mm/compaction.c b/mm/compaction.c
> index fb548e4c7bd4..9a222258e65c 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -76,7 +76,7 @@ static void map_pages(struct list_head *list)
>  		order = page_private(page);
>  		nr_pages = 1 << order;
>
> -		post_alloc_hook(page, order, __GFP_MOVABLE);
> +		post_alloc_hook(page, order, __GFP_MOVABLE, false);
>  		if (order)
>  			split_page(page, order);
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 4ef49fc55e58..1a0331ec2b2d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -165,7 +165,7 @@ extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
>  					unsigned int order);
>  extern void prep_compound_page(struct page *page, unsigned int order);
>  extern void post_alloc_hook(struct page *page, unsigned int order,
> -				gfp_t gfp_flags);
> +				gfp_t gfp_flags, bool will_map);
>  extern int user_min_free_kbytes;
>
>  #if defined CONFIG_COMPACTION || defined CONFIG_CMA
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 09fdf1bad21f..f73809847c58 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1750,7 +1750,7 @@ static bool check_new_pages(struct page *page, unsigned int order)
>  }
>
>  inline void post_alloc_hook(struct page *page, unsigned int order,
> -				gfp_t gfp_flags)
> +				gfp_t gfp_flags, bool will_map)
>  {
>  	set_page_private(page, 0);
>  	set_page_refcounted(page);
> @@ -1759,18 +1759,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
>  	kernel_map_pages(page, 1 << order, 1);
>  	kernel_poison_pages(page, 1 << order, 1);
>  	kasan_alloc_pages(page, order);
> -	xpfo_alloc_pages(page, order, gfp_flags);
> +	xpfo_alloc_pages(page, order, gfp_flags, will_map);
>  	set_page_owner(page, order, gfp_flags);
>  }
>
> +extern bool xpfo_test;
>  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
>  							unsigned int alloc_flags)
>  {
>  	int i;
> +	bool needs_zero = !free_pages_prezeroed() && (gfp_flags & __GFP_ZERO);
>
> -	post_alloc_hook(page, order, gfp_flags);
> +	post_alloc_hook(page, order, gfp_flags, needs_zero);
>
> -	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
> +	if (needs_zero)
>  		for (i = 0; i < (1 << order); i++)
>  			clear_highpage(page + i);
>
> diff --git a/mm/xpfo.c b/mm/xpfo.c
> index ca5d4d1838f9..dd25e24213fe 100644
> --- a/mm/xpfo.c
> +++ b/mm/xpfo.c
> @@ -86,7 +86,7 @@ static inline struct xpfo *lookup_xpfo(struct page *page)
>  	return (void *)page_ext + page_xpfo_ops.offset;
>  }
>
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
>  {
>  	int i, flush_tlb = 0;
>  	struct xpfo *xpfo;
> @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
>  		 * Tag the page as a user page and flush the TLB if it
>  		 * was previously allocated to the kernel.
>  		 */
> -		if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> +		bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> +						  &xpfo->flags);
> +
> +		if (was_user || !will_map) {
> +			set_kpte(page_address(page + i), page + i,
> +				 __pgprot(0));
>  			flush_tlb = 1;
> +		}
>  	} else {
>  		/* Tag the page as a non-user (kernel) page */
>  		clear_bit(XPFO_PAGE_USER, &xpfo->flags);
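For context, the two helpers behind that explanation look roughly like this in mainline kernels of that era (include/linux/highmem.h and mm/page_alloc.c); the comments are added here, and the xpfo_kmap()/xpfo_kunmap() hook placement is as described in this series:

	/*
	 * Zeroing goes through kmap_atomic()/kunmap_atomic(), which the XPFO
	 * series hooks with xpfo_kmap()/xpfo_kunmap(), so the page is
	 * transiently put back into the direct map while it is cleared.
	 */
	static inline void clear_highpage(struct page *page)
	{
		void *kaddr = kmap_atomic(page);
		clear_page(kaddr);
		kunmap_atomic(kaddr);
	}

	/*
	 * When poisoning-with-zero is enabled, free pages are already zero,
	 * so prep_new_page() skips clear_highpage() entirely and (before the
	 * draft patch above) XPFO never unmapped the page.
	 */
	static inline bool free_pages_prezeroed(void)
	{
		return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
			page_poisoning_enabled();
	}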
On 09/12/2017 11:13 AM, Tycho Andersen wrote:
> -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
>  {
>  	int i, flush_tlb = 0;
>  	struct xpfo *xpfo;
> @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
>  	 * Tag the page as a user page and flush the TLB if it
>  	 * was previously allocated to the kernel.
>  	 */
> -	if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> +	bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> +					  &xpfo->flags);
> +
> +	if (was_user || !will_map) {
> +		set_kpte(page_address(page + i), page + i,
> +			 __pgprot(0));
>  		flush_tlb = 1;
> +	}

Shouldn't the "was_user" be "was_kernel"?

Also, the way this now works, let's say we have a nice, 2MB pmd_t (page
table entry) mapping a nice, 2MB page in the allocator. Then it gets
allocated to userspace. We do

	for (i = 0; i < (1 << order); i++) {
		...
		set_kpte(page_address(page + i), page+i, __pgprot(0));
	}

The set_kpte() will take the nice, 2MB mapping and break it down into
512 4k mappings, all pointing to a non-present PTE, in a newly-allocated
PTE page. So, you get the same result and waste 4k of memory in the
process, *AND* make it slower because we added a level to the page
tables.

I think you actually want to make a single set_kpte() call at the end of
the function. That's faster and preserves the large page in the direct
mapping.
On Wed, Sep 20, 2017 at 04:46:41PM -0700, Dave Hansen wrote:
> On 09/12/2017 11:13 AM, Tycho Andersen wrote:
> > -void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> > +void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
> >  {
> >  	int i, flush_tlb = 0;
> >  	struct xpfo *xpfo;
> > @@ -116,8 +116,14 @@ void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp)
> >  	 * Tag the page as a user page and flush the TLB if it
> >  	 * was previously allocated to the kernel.
> >  	 */
> > -	if (!test_and_set_bit(XPFO_PAGE_USER, &xpfo->flags))
> > +	bool was_user = !test_and_set_bit(XPFO_PAGE_USER,
> > +					  &xpfo->flags);
> > +
> > +	if (was_user || !will_map) {
> > +		set_kpte(page_address(page + i), page + i,
> > +			 __pgprot(0));
> >  		flush_tlb = 1;
> > +	}
>
> Shouldn't the "was_user" be "was_kernel"?

Oof, yes, thanks.

> Also, the way this now works, let's say we have a nice, 2MB pmd_t (page
> table entry) mapping a nice, 2MB page in the allocator. Then it gets
> allocated to userspace. We do
>
> 	for (i = 0; i < (1 << order); i++) {
> 		...
> 		set_kpte(page_address(page + i), page+i, __pgprot(0));
> 	}
>
> The set_kpte() will take the nice, 2MB mapping and break it down into
> 512 4k mappings, all pointing to a non-present PTE, in a newly-allocated
> PTE page. So, you get the same result and waste 4k of memory in the
> process, *AND* make it slower because we added a level to the page
> tables.
>
> I think you actually want to make a single set_kpte() call at the end of
> the function. That's faster and preserves the large page in the direct
> mapping.

...and it makes it easier to pair TLB flushes with changing the
protections. I guess we still need the for loop, because we need to
set/unset the xpfo bits as necessary, but I'll switch it to a single
set_kpte(). This also implies that the xpfo bits should all be the same
on every page in the mapping, which I think is true.

This will be a nice change, thanks!

Tycho
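To make the restructuring Tycho describes concrete, here is a rough sketch of what xpfo_alloc_pages() could become. This is not the patch that was eventually posted: the `unmap` accumulator is invented for illustration, the GFP_HIGHUSER test is taken from the series as posted, and it assumes set_kpte() can be applied once to the direct mapping covering the whole allocation, as Dave suggests:

	void xpfo_alloc_pages(struct page *page, int order, gfp_t gfp, bool will_map)
	{
		int i;
		bool unmap = false;
		struct xpfo *xpfo;

		for (i = 0; i < (1 << order); i++) {
			xpfo = lookup_xpfo(page + i);
			if (!xpfo)
				continue;

			if ((gfp & GFP_HIGHUSER) == GFP_HIGHUSER) {
				/*
				 * Tag each page as a user page; remember
				 * whether any of them was previously owned
				 * by the kernel.
				 */
				bool was_kernel = !test_and_set_bit(XPFO_PAGE_USER,
								    &xpfo->flags);

				if (was_kernel || !will_map)
					unmap = true;
			} else {
				/* Tag the page as a non-user (kernel) page */
				clear_bit(XPFO_PAGE_USER, &xpfo->flags);
			}
		}

		/*
		 * A single set_kpte() for the whole range, so a large direct
		 * mapping is cleared in place rather than split into 4k
		 * entries, paired with a single TLB flush.
		 */
		if (unmap) {
			set_kpte(page_address(page), page, __pgprot(0));
			xpfo_flush_kernel_tlb(page, order);
		}
	}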
On 09/20/2017 05:02 PM, Tycho Andersen wrote:
> ...and it makes it easier to pair TLB flushes with changing the
> protections. I guess we still need the for loop, because we need to
> set/unset the xpfo bits as necessary, but I'll switch it to a single
> set_kpte(). This also implies that the xpfo bits should all be the
> same on every page in the mapping, which I think is true.

FWIW, it's a bit bonkers to keep all this duplicate xpfo metadata for
compound pages. You could probably get away with only keeping it for
the head page.
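A minimal sketch of Dave's head-page idea, using the kernel's existing compound_head(); page_xpfo() is a hypothetical helper name, not part of the posted series:

	/*
	 * Tail pages of a compound page defer to the head page's xpfo
	 * metadata, so the XPFO state is stored and toggled only once per
	 * allocation instead of once per 4k page.
	 */
	static inline struct xpfo *page_xpfo(struct page *page)
	{
		return lookup_xpfo(compound_head(page));
	}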