[095/227] mm: enforce pageblock_order < MAX_ORDER

Message ID	20220322214321.737AFC340EC@smtp.kernel.org (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> Date: Tue, 22 Mar 2022 14:43:20 -0700 To: ziy@nvidia.com,vbabka@suse.cz,robin.murphy@arm.com,robh+dt@kernel.org,paulus@samba.org,m.szyprowski@samsung.com,mst@redhat.com,mpe@ellerman.id.au,minchan@kernel.org,iommu@lists.linux-foundation.org,hch@lst.de,frowand.list@gmail.com,benh@kernel.crashing.org,aneesh.kumar@linux.ibm.com,david@redhat.com,akpm@linux-foundation.org,patches@lists.linux.dev,linux-mm@kvack.org,mm-commits@vger.kernel.org,torvalds@linux-foundation.org,akpm@linux-foundation.org From: Andrew Morton <akpm@linux-foundation.org> In-Reply-To: <20220322143803.04a5e59a07e48284f196a2f9@linux-foundation.org> Subject: [patch 095/227] mm: enforce pageblock_order < MAX_ORDER Message-Id: <20220322214321.737AFC340EC@smtp.kernel.org> Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	[001/227] linux/kthread.h: remove unused macros \| expand [001/227] linux/kthread.h: remove unused macros [002/227] scripts/spelling.txt: add more spellings to spelling.txt [003/227] ntfs: add sanity check on allocation size [004/227] ocfs2: cleanup some return variables [005/227] fs/ocfs2: fix comments mentioning i_mutex [006/227] doc: convert 'subsection' to 'section' in gfp.h [007/227] mm: document and polish read-ahead code [008/227] mm: improve cleanup when ->readpages doesn't process all pages [009/227] fuse: remove reliance on bdi congestion [010/227] nfs: remove reliance on bdi congestion [011/227] ceph: remove reliance on bdi congestion [012/227] remove inode_congested() [013/227] remove bdi_congested() and wb_congested() and related functions [014/227] f2fs: replace congestion_wait() calls with io_schedule_timeout() [015/227] block/bfq-iosched.c: use "false" rather than "BLK_RW_ASYNC" [016/227] remove congestion tracking framework [017/227] mount: warn only once about timestamp range expiration [018/227] mm/memremap: avoid calling kasan_remove_zero_shadow() for device private memory [019/227] filemap: remove find_get_pages() [020/227] mm/writeback: minor clean up for highmem_dirtyable_memory [021/227] mm: fs: fix lru_cache_disabled race in bh_lru [022/227] mm: fix invalid page pointer returned with FOLL_PIN gups [023/227] mm/gup: follow_pfn_pte(): -EEXIST cleanup [024/227] mm/gup: remove unused pin_user_pages_locked() [025/227] mm: change lookup_node() to use get_user_pages_fast() [026/227] mm/gup: remove unused get_user_pages_locked() [027/227] mm/swap: fix confusing comment in folio_mark_accessed [028/227] tmpfs: support for file creation time [029/227] shmem: mapping_set_exiting() to help mapped resilience [030/227] tmpfs: do not allocate pages on read [031/227] mm: shmem: use helper macro __ATTR_RW [032/227] memcg: replace in_interrupt() with !in_task() [033/227] memcg: add per-memcg total kernel memory stat [034/227] mm/memcg: 
mem_cgroup_per_node is already set to 0 on allocation [035/227] mm/memcg: retrieve parent memcg from css.parent [036/227] memcg: refactor mem_cgroup_oom [037/227] memcg: unify force charging conditions [038/227] selftests: memcg: test high limit for single entry allocation [039/227] memcg: synchronously enforce memory.high for large overcharges [040/227] mm/memcontrol: return 1 from cgroup.memory __setup() handler [041/227] mm/memcg: revert ("mm/memcg: optimize user context object stock access") [043/227] mm/memcg: protect per-CPU counter by disabling preemption on PREEMPT_RT where needed. [044/227] mm/memcg: opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock() [045/227] mm/memcg: protect memcg_stock with a local_lock_t [047/227] mm: list_lru: transpose the array of per-node per-memcg lru lists [048/227] mm: introduce kmem_cache_alloc_lru [049/227] fs: introduce alloc_inode_sb() to allocate filesystems specific inode [050/227] fs: allocate inode by using alloc_inode_sb() [051/227] f2fs: allocate inode by using alloc_inode_sb() [052/227] mm: dcache: use kmem_cache_alloc_lru() to allocate dentry [053/227] xarray: use kmem_cache_alloc_lru to allocate xa_node [054/227] mm: memcontrol: move memcg_online_kmem() to mem_cgroup_css_online() [055/227] mm: list_lru: allocate list_lru_one only when needed [056/227] mm: list_lru: rename memcg_drain_all_list_lrus to memcg_reparent_list_lrus [057/227] mm: list_lru: replace linear array with xarray [058/227] mm: memcontrol: reuse memory cgroup ID for kmem ID [059/227] mm: memcontrol: fix cannot alloc the maximum memcg ID [060/227] mm: list_lru: rename list_lru_per_memcg to list_lru_memcg [061/227] mm: memcontrol: rename memcg_cache_id to memcg_kmem_id [062/227] memcg: enable accounting for tty-related objects [063/227] selftests, x86: fix how check_cc.sh is being invoked [064/227] mm: merge pte_mkhuge() call into arch_make_huge_pte() [065/227] mm: remove mmu_gathers storage from remaining architectures 
[066/227] mm: thp: fix wrong cache flush in remove_migration_pmd() [067/227] mm: fix missing cache flush for all tail pages of compound page [068/227] mm: hugetlb: fix missing cache flush in copy_huge_page_from_user() [069/227] mm: hugetlb: fix missing cache flush in hugetlb_mcopy_atomic_pte() [070/227] mm: shmem: fix missing cache flush in shmem_mfill_atomic_pte() [071/227] mm: userfaultfd: fix missing cache flush in mcopy_atomic_pte() and __mcopy_atomic() [072/227] mm: replace multiple dcache flush with flush_dcache_folio() [073/227] mm: don't skip swap entry even if zap_details specified [074/227] mm: rename zap_skip_check_mapping() to should_zap_page() [075/227] mm: change zap_details.zap_mapping into even_cows [076/227] mm: rework swap handling of zap_pte_range [077/227] mm/mmap: return 1 from stack_guard_gap __setup() handler [078/227] mm/memory.c: use helper function range_in_vma() [079/227] mm/memory.c: use helper macro min and max in unmap_mapping_range_tree() [080/227] mm: _install_special_mapping() apply VM_LOCKED_CLEAR_MASK [081/227] mm/mmap: remove obsolete comment in ksys_mmap_pgoff [082/227] mm/mremap:: use vma_lookup() instead of find_vma() [083/227] mm/sparse: make mminit_validate_memmodel_limits() static [084/227] mm/vmalloc: remove unneeded function forward declaration [085/227] mm/vmalloc: Move draining areas out of caller context [086/227] mm/vmalloc: add adjust_search_size parameter [087/227] mm/vmalloc: eliminate an extra orig_gfp_mask [088/227] mm/vmalloc.c: fix "unused function" warning [089/227] mm/vmalloc: fix comments about vmap_area struct [090/227] mm: page_alloc: avoid merging non-fallbackable pageblocks with others [091/227] mm/mmzone.c: use try_cmpxchg() in page_cpupid_xchg_last() [092/227] mm/mmzone.h: remove unused macros [093/227] mm/page_alloc: don't pass pfn to free_unref_page_commit() [094/227] cma: factor out minimum alignment requirement [095/227] mm: enforce pageblock_order < MAX_ORDER [096/227] mm/page_alloc: mark pagesets 
as __maybe_unused [097/227] mm/pages_alloc.c: don't create ZONE_MOVABLE beyond the end of a node [098/227] mm/page_alloc: fetch the correct pcp buddy during bulk free [099/227] mm/page_alloc: track range of active PCP lists during bulk free [100/227] mm/page_alloc: simplify how many pages are selected per pcp list during bulk free [101/227] mm/page_alloc: drain the requested list first during bulk free [102/227] mm/page_alloc: free pages in a single pass during bulk free [103/227] mm/page_alloc: limit number of high-order pages on PCP during bulk free [104/227] mm/page_alloc: do not prefetch buddies during bulk free [105/227] arch/x86/mm/numa: Do not initialize nodes twice [106/227] mm: count time in drain_all_pages during direct reclaim as memory pressure [107/227] mm/page_alloc: call check_new_pages() while zone spinlock is not held [108/227] mm/page_alloc: check high-order pages for corruption during PCP operations [109/227] mm/memory-failure.c: remove obsolete comment [110/227] mm/hwpoison: fix error page recovered but reported "not recovered" [111/227] mm: invalidate hwpoison page cache page in fault path [112/227] mm/memory-failure.c: minor clean up for memory_failure_dev_pagemap [113/227] mm/memory-failure.c: catch unexpected -EFAULT from vma_address() [114/227] mm/memory-failure.c: rework the signaling logic in kill_proc [115/227] mm/memory-failure.c: fix race with changing page more robustly [116/227] mm/memory-failure.c: remove PageSlab check in hwpoison_filter_dev [117/227] mm/memory-failure.c: rework the try_to_unmap logic in hwpoison_user_mappings() [118/227] mm/memory-failure.c: remove obsolete comment in __soft_offline_page [119/227] mm/memory-failure.c: remove unnecessary PageTransTail check [120/227] mm/hwpoison-inject: support injecting hwpoison to free page [121/227] mm/hwpoison: avoid the impact of hwpoison_filter() return value on mce handler [122/227] mm/hwpoison: add in-use hugepage hwpoison filter judgement [123/227] mm/memory-failure.c: fix 
race with changing page compound again [124/227] mm/memory-failure.c: avoid calling invalidate_inode_page() with unexpected pages [125/227] mm/memory-failure.c: make non-LRU movable pages unhandlable [126/227] mm, fault-injection: declare should_fail_alloc_page() [127/227] mm/mlock: fix potential imbalanced rlimit ucounts adjustment [128/227] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page [129/227] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key [130/227] mm: sparsemem: use page table lock to protect kernel pmd operations [131/227] selftests: vm: add a hugetlb test case [132/227] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP [133/227] mm/hugetlb: generalize ARCH_WANT_GENERAL_HUGETLB [134/227] hugetlb: clean up potential spectre issue warnings [135/227] mm/hugetlb: use helper macro __ATTR_RW [136/227] mm/hugetlb.c: export PageHeadHuge() [137/227] mm: remove unneeded local variable follflags [138/227] userfaultfd: provide unmasked address on page-fault [139/227] userfaultfd/selftests: fix uninitialized_var.cocci warning [140/227] mm/fs: delete PF_SWAPWRITE [141/227] mm: __isolate_lru_page_prepare() in isolate_migratepages_block() [142/227] mm/list_lru: optimize memcg_reparent_list_lru_node() [143/227] mm: lru_cache_disable: replace work queue synchronization with synchronize_rcu [144/227] mm: workingset: replace IRQ-off check with a lockdep assert. 
[145/227] mm: vmscan: fix documentation for page_check_references() [146/227] mm: compaction: cleanup the compaction trace events [147/227] mempolicy: mbind_range() set_policy() after vma_merge() [148/227] mm/oom_kill: remove unneeded is_memcg_oom check [149/227] mm,migrate: fix establishing demotion target [150/227] mm/migrate: fix race between lock page and clear PG_Isolated [151/227] mm/thp: refix __split_huge_pmd_locked() for migration PMD [152/227] mm/cma: provide option to opt out from exposing pages on activation failure [153/227] powerpc/fadump: opt out from freeing pages on cma activation failure [154/227] NUMA Balancing: add page promotion counter [155/227] NUMA balancing: optimize page placement for memory tiering system [156/227] memory tiering: skip to scan fast memory [157/227] mm: page_io: fix psi memory pressure error on cold swapins [158/227] mm/vmstat: add event for ksm swapping in copy [159/227] mm/ksm: use helper macro __ATTR_RW [160/227] mm/hwpoison: check the subpage, not the head page [161/227] mm/madvise: use vma_lookup() instead of find_vma() [162/227] mm: madvise: return correct bytes advised with process_madvise [163/227] mm: madvise: skip unmapped vma holes passed to process_madvise [164/227] mm, memory_hotplug: make arch_alloc_nodedata independent on CONFIG_MEMORY_HOTPLUG [165/227] mm: handle uninitialized numa nodes gracefully [166/227] mm, memory_hotplug: drop arch_free_nodedata [167/227] mm, memory_hotplug: reorganize new pgdat initialization [168/227] mm: make free_area_init_node aware of memory less nodes [169/227] memcg: do not tweak node in alloc_mem_cgroup_per_node_info [170/227] drivers/base/memory: add memory block to memory group after registration succeeded [171/227] drivers/base/node: consolidate node device subsystem initialization in node_dev_init() [172/227] mm/memory_hotplug: remove obsolete comment of __add_pages [173/227] mm/memory_hotplug: avoid calling zone_intersects() for ZONE_NORMAL [174/227] mm/memory_hotplug: 
clean up try_offline_node [175/227] mm/memory_hotplug: fix misplaced comment in offline_pages [176/227] drivers/base/node: rename link_mem_sections() to register_memory_block_under_node() [177/227] drivers/base/memory: determine and store zone for single-zone memory blocks [178/227] drivers/base/memory: clarify adding and removing of memory blocks [179/227] mm: only re-generate demotion targets when a numa node changes its N_CPU state [180/227] mm/thp: ClearPageDoubleMap in first page_add_file_rmap() [181/227] mm/zswap.c: allow handling just same-value filled pages [182/227] mm: remove usercopy_warn() [183/227] mm: uninline copy_overflow() [184/227] mm/usercopy: return 1 from hardened_usercopy __setup() handler [185/227] mm/early_ioremap: declare early_memremap_pgprot_adjust() [186/227] highmem: document kunmap_local() [187/227] mm/highmem: remove unnecessary done label [188/227] mm/page_table_check.c: use strtobool for param parsing [189/227] mm/kfence: remove unnecessary CONFIG_KFENCE option [190/227] kfence: allow re-enabling KFENCE after system startup [191/227] kfence: alloc kfence_pool after system startup [192/227] kunit: fix UAF when run kfence test case test_gfpzero [193/227] kunit: make kunit_test_timeout compatible with comment [194/227] kfence: test: try to avoid test_gfpzero trigger rcu_stall [195/227] kfence: allow use of a deferrable timer [196/227] mm/hmm.c: remove unneeded local variable ret [197/227] mm/damon/dbgfs/init_regions: use target index instead of target id [198/227] Docs/admin-guide/mm/damon/usage: update for changed initail_regions file input [199/227] mm/damon/core: move damon_set_targets() into dbgfs [200/227] mm/damon: remove the target id concept [201/227] mm/damon: remove redundant page validation [202/227] mm/damon: rename damon_primitives to damon_operations [203/227] mm/damon: let monitoring operations can be registered and selected [204/227] mm/damon/paddr,vaddr: register themselves to DAMON in subsys_initcall [205/227] 
mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations() [206/227] mm/damon/dbgfs: use damon_select_ops() instead of damon_{v,p}a_set_operations() [207/227] mm/damon/dbgfs: use operations id for knowing if the target has pid [208/227] mm/damon/dbgfs-test: fix is_target_id() change [209/227] mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}() [210/227] mm/damon: remove unnecessary CONFIG_DAMON option [211/227] Docs/vm/damon: call low level monitoring primitives the operations [212/227] Docs/vm/damon/design: update DAMON-Idle Page Tracking interference handling [213/227] Docs/damon: update outdated term 'regions update interval' [214/227] mm/damon/core: allow non-exclusive DAMON start/stop [215/227] mm/damon/core: add number of each enum type values [216/227] mm/damon: implement a minimal stub for sysfs-based DAMON interface [217/227] mm/damon/sysfs: link DAMON for virtual address spaces monitoring [218/227] mm/damon/sysfs: support the physical address space monitoring [219/227] mm/damon/sysfs: support DAMON-based Operation Schemes [220/227] mm/damon/sysfs: support DAMOS quotas [221/227] mm/damon/sysfs: support schemes prioritization [222/227] mm/damon/sysfs: support DAMOS watermarks [223/227] mm/damon/sysfs: support DAMOS stats [224/227] selftests/damon: add a test for DAMON sysfs interface [225/227] Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface [226/227] Docs/ABI/testing: add DAMON sysfs interface ABI document [227/227] mm/damon/sysfs: remove repeat container_of() in damon_sysfs_kdamond_release()

Message ID

20220322214321.737AFC340EC@smtp.kernel.org (mailing list archive)

State

New

Headers

Date: Tue, 22 Mar 2022 14:43:20 -0700
To: 
 ziy@nvidia.com,vbabka@suse.cz,robin.murphy@arm.com,robh+dt@kernel.org,paulus@samba.org,m.szyprowski@samsung.com,mst@redhat.com,mpe@ellerman.id.au,minchan@kernel.org,iommu@lists.linux-foundation.org,hch@lst.de,frowand.list@gmail.com,benh@kernel.crashing.org,aneesh.kumar@linux.ibm.com,david@redhat.com,akpm@linux-foundation.org,patches@lists.linux.dev,linux-mm@kvack.org,mm-commits@vger.kernel.org,torvalds@linux-foundation.org,akpm@linux-foundation.org
From: Andrew Morton <akpm@linux-foundation.org>
In-Reply-To: <20220322143803.04a5e59a07e48284f196a2f9@linux-foundation.org>
Subject: [patch 095/227] mm: enforce pageblock_order < MAX_ORDER
Message-Id: <20220322214321.737AFC340EC@smtp.kernel.org>
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

[001/227] linux/kthread.h: remove unused macros | expand

Commit Message

Andrew Morton March 22, 2022, 9:43 p.m. UTC

From: David Hildenbrand <david@redhat.com>
Subject: mm: enforce pageblock_order < MAX_ORDER

Some places in the kernel don't really expect pageblock_order >=
MAX_ORDER, and it looks like this is only possible in corner cases:

1) With CONFIG_DEFERRED_STRUCT_PAGE_INIT, we'll end up freeing
   pageblock_order pages via __free_pages_core(), which cannot possibly work.

2) find_zone_movable_pfns_for_nodes() will roundup the ZONE_MOVABLE
   start PFN to MAX_ORDER_NR_PAGES. Consequently with a bigger
   pageblock_order, we could have a single pageblock partially managed by
   two zones.

3) compaction code runs into __fragmentation_index() with order
   >= MAX_ORDER, triggering the WARN_ON_ONCE(order >= MAX_ORDER) check. [1]

4) mm/page_reporting.c won't be reporting any pages with default
   page_reporting_order == pageblock_order, as we'll be skipping the
   reporting loop inside page_reporting_process_zone().

5) __rmqueue_fallback() will never be able to steal with
   ALLOC_NOFRAGMENT.

pageblock_order >= MAX_ORDER is weird either way: it's a pure optimization
for making alloc_contig_range(), as used for allocation of gigantic pages,
a little more likely to succeed.  However, if there is demand for
somewhat reliable allocation of gigantic pages, affected setups should be
using CMA or boottime allocations instead.

So let's make sure that pageblock_order < MAX_ORDER and simplify.

[1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com

Link: https://lkml.kernel.org/r/20220214174132.219303-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: John Garry via iommu <iommu@lists.linux-foundation.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 drivers/virtio/virtio_mem.c     |    9 ++------
 include/linux/cma.h             |    3 --
 include/linux/pageblock-flags.h |    7 ++++--
 mm/Kconfig                      |    3 ++
 mm/page_alloc.c                 |   32 +++++++-----------------------
 5 files changed, 20 insertions(+), 34 deletions(-)

--- a/drivers/virtio/virtio_mem.c~mm-enforce-pageblock_order-max_order
+++ a/drivers/virtio/virtio_mem.c
@@ -2476,13 +2476,10 @@  static int virtio_mem_init_hotplug(struc
 				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
 
 	/*
-	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
-	 * pageblock_nr_pages pages. This:
-	 * - Is required for now for alloc_contig_range() to work reliably -
-	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
+	 * TODO: once alloc_contig_range() works reliably with pageblock
+	 * granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
 	 */
-	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
-			pageblock_nr_pages) * PAGE_SIZE;
+	sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES;
 	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
 
 	if (sb_size < memory_block_size_bytes() && !force_bbm) {
--- a/include/linux/cma.h~mm-enforce-pageblock_order-max_order
+++ a/include/linux/cma.h
@@ -25,8 +25,7 @@ 
  * -- can deal with only some pageblocks of a higher-order page being
  *  MIGRATE_CMA, we can use pageblock_nr_pages.
  */
-#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \
-				      pageblock_nr_pages)
+#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES
 #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES)
 
 struct cma;
--- a/include/linux/pageblock-flags.h~mm-enforce-pageblock_order-max_order
+++ a/include/linux/pageblock-flags.h
@@ -37,8 +37,11 @@  extern unsigned int pageblock_order;
 
 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-/* Huge pages are a constant size */
-#define pageblock_order		HUGETLB_PAGE_ORDER
+/*
+ * Huge pages are a constant size, but don't exceed the maximum allocation
+ * granularity.
+ */
+#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
--- a/mm/Kconfig~mm-enforce-pageblock_order-max_order
+++ a/mm/Kconfig
@@ -262,6 +262,9 @@  config HUGETLB_PAGE_SIZE_VARIABLE
 	  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
 	  on a platform.
 
+	  Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
+	  clamped down to MAX_ORDER - 1.
+
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
 
--- a/mm/page_alloc.c~mm-enforce-pageblock_order-max_order
+++ a/mm/page_alloc.c
@@ -1072,14 +1072,12 @@  static inline void __free_one_page(struc
 		int migratetype, fpi_t fpi_flags)
 {
 	struct capture_control *capc = task_capc(zone);
+	unsigned int max_order = pageblock_order;
 	unsigned long buddy_pfn;
 	unsigned long combined_pfn;
-	unsigned int max_order;
 	struct page *buddy;
 	bool to_tail;
 
-	max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
-
 	VM_BUG_ON(!zone_is_initialized(zone));
 	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 
@@ -2259,19 +2257,8 @@  void __init init_cma_reserved_pageblock(
 	} while (++p, --i);
 
 	set_pageblock_migratetype(page, MIGRATE_CMA);
-
-	if (pageblock_order >= MAX_ORDER) {
-		i = pageblock_nr_pages;
-		p = page;
-		do {
-			set_page_refcounted(p);
-			__free_pages(p, MAX_ORDER - 1);
-			p += MAX_ORDER_NR_PAGES;
-		} while (i -= MAX_ORDER_NR_PAGES);
-	} else {
-		set_page_refcounted(page);
-		__free_pages(page, pageblock_order);
-	}
+	set_page_refcounted(page);
+	__free_pages(page, pageblock_order);
 
 	adjust_managed_page_count(page, pageblock_nr_pages);
 	page_zone(page)->cma_pages += pageblock_nr_pages;
@@ -7382,16 +7369,15 @@  static inline void setup_usemap(struct z
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
 void __init set_pageblock_order(void)
 {
-	unsigned int order;
+	unsigned int order = MAX_ORDER - 1;
 
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
 		return;
 
-	if (HPAGE_SHIFT > PAGE_SHIFT)
+	/* Don't let pageblocks exceed the maximum allocation granularity. */
+	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
 		order = HUGETLB_PAGE_ORDER;
-	else
-		order = MAX_ORDER - 1;
 
 	/*
 	 * Assume the largest contiguous order of interest is a huge page.
@@ -8979,14 +8965,12 @@  struct page *has_unmovable_pages(struct
 #ifdef CONFIG_CONTIG_ALLOC
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
-	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
-			     pageblock_nr_pages) - 1);
+	return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
 }
 
 static unsigned long pfn_max_align_up(unsigned long pfn)
 {
-	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
-				pageblock_nr_pages));
+	return ALIGN(pfn, MAX_ORDER_NR_PAGES);
 }
 
 #if defined(CONFIG_DYNAMIC_DEBUG) || \

[095/227] mm: enforce pageblock_order < MAX_ORDER

Commit Message

Patch