@@ -93,6 +93,16 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr)
return next;
}
+struct page_ext_iter {
+ unsigned long pfn;
+ struct page_ext *page_ext;
+};
+
+struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, struct page *page);
+struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter);
+struct page_ext *page_ext_iter_next(struct page_ext_iter *iter);
+void page_ext_iter_end(struct page_ext_iter *iter);
+
#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;
@@ -549,3 +549,58 @@ void page_ext_put(struct page_ext *page_ext)
rcu_read_unlock();
}
+
+/**
+ * page_ext_iter_begin() - Prepare for iterating through page extensions.
+ * @iter: page extension iterator.
+ * @page: The page we're interested in.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ */
+struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, struct page *page)
+{
+ iter->pfn = page_to_pfn(page);
+ iter->page_ext = page_ext_get(page);
+
+ return iter->page_ext;
+}
+
+/**
+ * page_ext_iter_get() - Get current page extension
+ * @iter: page extension iterator.
+ *
+ * Return: NULL if no page_ext exists for this iterator.
+ */
+struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter)
+{
+ return iter->page_ext;
+}
+
+/**
+ * page_ext_iter_next() - Get next page extension
+ * @iter: page extension iterator.
+ *
+ * Return: NULL if no next page_ext exists.
+ */
+struct page_ext *page_ext_iter_next(struct page_ext_iter *iter)
+{
+ if (!iter->page_ext)
+ return NULL;
+
+ page_ext_put(iter->page_ext);
+
+ iter->pfn++;
+ iter->page_ext = page_ext_get(pfn_to_page(iter->pfn));
+
+ return iter->page_ext;
+}
+
+/**
+ * page_ext_iter_end() - End iteration through page extensions.
+ * @iter: page extension iterator.
+ */
+void page_ext_iter_end(struct page_ext_iter *iter)
+{
+ page_ext_put(iter->page_ext);
+ iter->page_ext = NULL;
+}
The page extension implementation assumes that all page extensions of a given page order are stored in the same memory section. The function page_ext_next() relies on this assumption by adding an offset to the current object to return the next adjacent page extension. This behavior works as expected for flatmem but fails for sparsemem when using 1G pages. Commit e98337d11bbd ("mm/contig_alloc: support __GFP_COMP") exposes this issue, making it possible for a crash when using page_owner or page_table_check page extensions. The problem is that for 1G pages, the page extensions may span memory section boundaries and be stored in different memory sections. This issue was not visible before commit e98337d11bbd ("mm/contig_alloc: support __GFP_COMP") because alloc_contig_pages() never passed more than MAX_PAGE_ORDER to post_alloc_hook(). However, the mentioned commit changed this behavior allowing the full 1G page order to be passed. Reproducer: 1. Build the kernel with CONFIG_SPARSEMEM=y and the table extensions 2. Pass 'default_hugepagesz=1 page_owner=on' in the kernel command-line 3. Reserve one 1G page at run-time, this should crash (backtrace below) To address this issue, this commit introduces a new API for iterating through page extensions. In page_ext_iter_next(), we always go through page_ext_get() to guarantee that we do a new memory section lookup for the next page extension. In the future, this API could be used as a basis to implement for_each_page_ext() type of macro. Thanks to David Hildenbrand for helping identify the root cause and providing suggestions on how to fix it (final implementation and bugs are all mine though). Here's the backtrace, without kasan you can get random crashes: [ 76.052526] BUG: KASAN: slab-out-of-bounds in __update_page_owner_handle+0x238/0x298 [ 76.060283] Write of size 4 at addr ffff07ff96240038 by task tee/3598 [ 76.066714] [ 76.068203] CPU: 88 UID: 0 PID: 3598 Comm: tee Kdump: loaded Not tainted 6.13.0-rep1 #3 [ 76.076202] Hardware name: WIWYNN Mt.Jade Server System B81.030Z1.0007/Mt.Jade Motherboard, BIOS 2.10.20220810 (SCP: 2.10.20220810) 2022/08/10 [ 76.088972] Call trace: [ 76.091411] show_stack+0x20/0x38 (C) [ 76.095073] dump_stack_lvl+0x80/0xf8 [ 76.098733] print_address_description.constprop.0+0x88/0x398 [ 76.104476] print_report+0xa8/0x278 [ 76.108041] kasan_report+0xa8/0xf8 [ 76.111520] __asan_report_store4_noabort+0x20/0x30 [ 76.116391] __update_page_owner_handle+0x238/0x298 [ 76.121259] __set_page_owner+0xdc/0x140 [ 76.125173] post_alloc_hook+0x190/0x1d8 [ 76.129090] alloc_contig_range_noprof+0x54c/0x890 [ 76.133874] alloc_contig_pages_noprof+0x35c/0x4a8 [ 76.138656] alloc_gigantic_folio.isra.0+0x2c0/0x368 [ 76.143616] only_alloc_fresh_hugetlb_folio.isra.0+0x24/0x150 [ 76.149353] alloc_pool_huge_folio+0x11c/0x1f8 [ 76.153787] set_max_huge_pages+0x364/0xca8 [ 76.157961] __nr_hugepages_store_common+0xb0/0x1a0 [ 76.162829] nr_hugepages_store+0x108/0x118 [ 76.167003] kobj_attr_store+0x3c/0x70 [ 76.170745] sysfs_kf_write+0xfc/0x188 [ 76.174492] kernfs_fop_write_iter+0x274/0x3e0 [ 76.178927] vfs_write+0x64c/0x8e0 [ 76.182323] ksys_write+0xf8/0x1f0 [ 76.185716] __arm64_sys_write+0x74/0xb0 [ 76.189630] invoke_syscall.constprop.0+0xd8/0x1e0 [ 76.194412] do_el0_svc+0x164/0x1e0 [ 76.197891] el0_svc+0x40/0xe0 [ 76.200939] el0t_64_sync_handler+0x144/0x168 [ 76.205287] el0t_64_sync+0x1ac/0x1b0 Fixes: e98337d11bbd ("mm/contig_alloc: support __GFP_COMP") Signed-off-by: Luiz Capitulino <luizcap@redhat.com> --- include/linux/page_ext.h | 10 ++++++++ mm/page_ext.c | 55 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+)