
[6/8] mm: shmem: add mTHP support for anonymous shmem

Message ID adc64bf0f150bdc614c6c06fc313adeef7dbbbff.1714978902.git.baolin.wang@linux.alibaba.com
State New
Series add mTHP support for anonymous shmem

Commit Message

Baolin Wang May 6, 2024, 8:46 a.m. UTC
Commit 19eaf44954df added multi-size THP (mTHP) support for anonymous
pages, allowing THP to be configured through the sysfs interface located at
'/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'.

However, anonymous shared memory (shmem) pages ignore the anonymous mTHP
rule configured through the sysfs interface and can only use PMD-mapped
THP, which is not reasonable. Users expect the mTHP rule to apply to all
anonymous pages, including anonymous shmem pages, in order to enjoy the
benefits of mTHP: for example, lower latency than PMD-mapped THP, smaller
memory bloat than PMD-mapped THP, and contiguous PTEs on the ARM
architecture to reduce TLB misses.

The primary strategy is similar to the one used for anonymous mTHP.
Introduce a new interface
'/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which
can accept all the same values as the top-level
'/sys/kernel/mm/transparent_hugepage/shmem_enabled', plus an additional
"inherit" option. By default all sizes are set to "never" except the PMD
size, which is set to "inherit". This ensures backward compatibility with
the top-level shmem_enabled setting, while also allowing independent
control of shmem_enabled for each mTHP size.
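
As a minimal user-space sketch of the "inherit" semantics above (the enum
values and helper are illustrative only, not the kernel's API): a per-size
value is used as-is, except "inherit", which falls back to the top-level
shmem_enabled value.

#include <stdio.h>

enum shmem_policy { NEVER, ALWAYS, WITHIN_SIZE, ADVISE, INHERIT };

static const char * const names[] = {
	"never", "always", "within_size", "advise", "inherit"
};

/* "inherit" defers to the top-level shmem_enabled; anything else wins */
static enum shmem_policy resolve(enum shmem_policy per_size,
				 enum shmem_policy top_level)
{
	return per_size == INHERIT ? top_level : per_size;
}

int main(void)
{
	enum shmem_policy top = WITHIN_SIZE;	/* top-level shmem_enabled */

	/* defaults above: PMD size inherits, every other size is never */
	printf("PMD-sized mTHP: %s\n", names[resolve(INHERIT, top)]);
	printf("64K-sized mTHP: %s\n", names[resolve(NEVER, top)]);
	return 0;
}

With these defaults only the PMD-sized knob tracks the top-level setting,
which is what preserves the old behaviour.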

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/shmem.c | 177 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 150 insertions(+), 27 deletions(-)

Comments

kernel test robot May 7, 2024, 10:46 a.m. UTC | #1
Hi Baolin,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on next-20240506]
[cannot apply to linus/master v6.9-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Baolin-Wang/mm-move-highest_order-and-next_order-out-of-the-THP-config/20240506-164838
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/adc64bf0f150bdc614c6c06fc313adeef7dbbbff.1714978902.git.baolin.wang%40linux.alibaba.com
patch subject: [PATCH 6/8] mm: shmem: add mTHP support for anonymous shmem
config: s390-allnoconfig (https://download.01.org/0day-ci/archive/20240507/202405071820.2KY0UnDu-lkp@intel.com/config)
compiler: clang version 19.0.0git (https://github.com/llvm/llvm-project 0ab4458df0688955620b72cc2c72a32dffad3615)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240507/202405071820.2KY0UnDu-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202405071820.2KY0UnDu-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from mm/shmem.c:28:
   In file included from include/linux/ramfs.h:5:
   In file included from include/linux/fs_parser.h:11:
   In file included from include/linux/fs_context.h:14:
   In file included from include/linux/security.h:33:
   In file included from include/linux/mm.h:2253:
   include/linux/vmstat.h:514:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     514 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
>> mm/shmem.c:1780:10: warning: variable 'folio' is used uninitialized whenever 'while' loop exits because its condition is false [-Wsometimes-uninitialized]
    1780 |                 while (suitable_orders) {
         |                        ^~~~~~~~~~~~~~~
   mm/shmem.c:1795:7: note: uninitialized use occurs here
    1795 |         if (!folio)
         |              ^~~~~
   mm/shmem.c:1780:10: note: remove the condition if it is always true
    1780 |                 while (suitable_orders) {
         |                        ^~~~~~~~~~~~~~~
         |                        1
   mm/shmem.c:1750:21: note: initialize the variable 'folio' to silence this warning
    1750 |         struct folio *folio;
         |                            ^
         |                             = NULL
   mm/shmem.c:1564:20: warning: unused function 'shmem_show_mpol' [-Wunused-function]
    1564 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
         |                    ^~~~~~~~~~~~~~~
   3 warnings generated.
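
(For context, a minimal stand-alone reproducer of this warning class,
assuming only standard C: clang warns because the loop condition can be
false on entry, in which case the pointer is read without ever having
been assigned.)

#include <stdlib.h>

void *grab(unsigned long mask)
{
	void *p;			/* no initializer, like 'folio' below */

	while (mask) {			/* may be false on entry ... */
		p = malloc(4096);
		if (p)
			return p;
		mask >>= 1;
	}
	if (!p)				/* ... then 'p' is read uninitialized */
		return NULL;
	return p;
}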


vim +1780 mm/shmem.c

  1741	
  1742	static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
  1743			gfp_t gfp, struct inode *inode, pgoff_t index,
  1744			struct mm_struct *fault_mm, bool huge, unsigned long orders)
  1745	{
  1746		struct address_space *mapping = inode->i_mapping;
  1747		struct shmem_inode_info *info = SHMEM_I(inode);
  1748		struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
  1749		unsigned long suitable_orders;
  1750		struct folio *folio;
  1751		long pages;
  1752		int error, order;
  1753	
  1754		if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
  1755			huge = false;
  1756	
  1757		if (huge || orders > 0) {
  1758			if (vma && vma_is_anon_shmem(vma) && orders) {
  1759				suitable_orders = anon_shmem_suitable_orders(inode, vmf,
  1760								mapping, index, orders);
  1761			} else {
  1762				pages = HPAGE_PMD_NR;
  1763				suitable_orders = BIT(HPAGE_PMD_ORDER);
  1764				index = round_down(index, HPAGE_PMD_NR);
  1765	
  1766				/*
  1767				 * Check for conflict before waiting on a huge allocation.
  1768				 * Conflict might be that a huge page has just been allocated
  1769				 * and added to page cache by a racing thread, or that there
  1770				 * is already at least one small page in the huge extent.
  1771				 * Be careful to retry when appropriate, but not forever!
  1772				 * Elsewhere -EEXIST would be the right code, but not here.
  1773				 */
  1774				if (xa_find(&mapping->i_pages, &index,
  1775					index + HPAGE_PMD_NR - 1, XA_PRESENT))
  1776					return ERR_PTR(-E2BIG);
  1777			}
  1778	
  1779			order = highest_order(suitable_orders);
> 1780			while (suitable_orders) {
  1781				pages = 1 << order;
  1782				index = round_down(index, pages);
  1783				folio = shmem_alloc_hugefolio(gfp, info, index, order);
  1784				if (folio)
  1785					goto allocated;
  1786	
  1787				if (pages == HPAGE_PMD_NR)
  1788					count_vm_event(THP_FILE_FALLBACK);
  1789				order = next_order(&suitable_orders, order);
  1790			}
  1791		} else {
  1792			pages = 1;
  1793			folio = shmem_alloc_folio(gfp, info, index);
  1794		}
  1795		if (!folio)
  1796			return ERR_PTR(-ENOMEM);
  1797	
  1798	allocated:
  1799		__folio_set_locked(folio);
  1800		__folio_set_swapbacked(folio);
  1801	
  1802		gfp &= GFP_RECLAIM_MASK;
  1803		error = mem_cgroup_charge(folio, fault_mm, gfp);
  1804		if (error) {
  1805			if (xa_find(&mapping->i_pages, &index,
  1806					index + pages - 1, XA_PRESENT)) {
  1807				error = -EEXIST;
  1808			} else if (pages == HPAGE_PMD_NR) {
  1809				count_vm_event(THP_FILE_FALLBACK);
  1810				count_vm_event(THP_FILE_FALLBACK_CHARGE);
  1811			}
  1812			goto unlock;
  1813		}
  1814	
  1815		error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
  1816		if (error)
  1817			goto unlock;
  1818	
  1819		error = shmem_inode_acct_blocks(inode, pages);
  1820		if (error) {
  1821			struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1822			long freed;
  1823			/*
  1824			 * Try to reclaim some space by splitting a few
  1825			 * large folios beyond i_size on the filesystem.
  1826			 */
  1827			shmem_unused_huge_shrink(sbinfo, NULL, 2);
  1828			/*
  1829			 * And do a shmem_recalc_inode() to account for freed pages:
  1830			 * except our folio is there in cache, so not quite balanced.
  1831			 */
  1832			spin_lock(&info->lock);
  1833			freed = pages + info->alloced - info->swapped -
  1834				READ_ONCE(mapping->nrpages);
  1835			if (freed > 0)
  1836				info->alloced -= freed;
  1837			spin_unlock(&info->lock);
  1838			if (freed > 0)
  1839				shmem_inode_unacct_blocks(inode, freed);
  1840			error = shmem_inode_acct_blocks(inode, pages);
  1841			if (error) {
  1842				filemap_remove_folio(folio);
  1843				goto unlock;
  1844			}
  1845		}
  1846	
  1847		shmem_recalc_inode(inode, pages, 0);
  1848		folio_add_lru(folio);
  1849		return folio;
  1850	
  1851	unlock:
  1852		folio_unlock(folio);
  1853		folio_put(folio);
  1854		return ERR_PTR(error);
  1855	}
  1856
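
To make the fallback walk at line 1780 easier to follow in isolation, here
is a hedged user-space sketch (highest_order() and next_order() are
simplified stand-ins for the kernel helpers, not their real
implementations): the highest enabled order is tried first, and on
allocation failure the walk drops to the next lower enabled order. It also
shows the path the warning is about: when the order mask is 0 on entry,
the loop body never runs.

#include <stdio.h>

static int highest_order(unsigned long orders)
{
	/* index of the highest set bit; callers ensure orders != 0 */
	return 8 * sizeof(orders) - 1 - __builtin_clzl(orders);
}

static int next_order(unsigned long *orders, int prev)
{
	*orders &= ~(1UL << prev);	/* drop the order we just tried */
	return *orders ? highest_order(*orders) : 0;
}

int main(void)
{
	/* e.g. PMD order 9 plus mTHP orders 4 and 2 are enabled */
	unsigned long suitable_orders = (1UL << 9) | (1UL << 4) | (1UL << 2);
	int order = highest_order(suitable_orders);

	while (suitable_orders) {
		printf("try order %d (%lu pages)\n", order, 1UL << order);
		/* pretend every allocation fails, so the whole mask is walked */
		order = next_order(&suitable_orders, order);
	}
	/*
	 * Had suitable_orders been 0 on entry, nothing would run at all,
	 * which is why 'folio' must start out as NULL in the kernel code.
	 */
	return 0;
}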
Baolin Wang May 8, 2024, 6:03 a.m. UTC | #2
Hi,

On 2024/5/7 18:46, kernel test robot wrote:
>>> mm/shmem.c:1780:10: warning: variable 'folio' is used uninitialized whenever 'while' loop exits because its condition is false [-Wsometimes-uninitialized]
>      1780 |                 while (suitable_orders) {
>           |                        ^~~~~~~~~~~~~~~
>     mm/shmem.c:1795:7: note: uninitialized use occurs here
>      1795 |         if (!folio)
>           |              ^~~~~
>     mm/shmem.c:1780:10: note: remove the condition if it is always true
>      1780 |                 while (suitable_orders) {
>           |                        ^~~~~~~~~~~~~~~
>           |                        1
>     mm/shmem.c:1750:21: note: initialize the variable 'folio' to silence this warning
>      1750 |         struct folio *folio;
>           |                            ^
>           |                             = NULL
>     mm/shmem.c:1564:20: warning: unused function 'shmem_show_mpol' [-Wunused-function]
>      1564 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)

Thanks for reporting. I will add the change below to avoid the warning:
diff --git a/mm/shmem.c b/mm/shmem.c
index d603e36e0f4f..fd2cb2e73a21 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1747,7 +1747,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
         unsigned long suitable_orders;
-       struct folio *folio;
+       struct folio *folio = NULL;
         long pages;
         int error, order;

Patch

diff --git a/mm/shmem.c b/mm/shmem.c
index 59cc26d44344..08ccea5170a1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1611,6 +1611,106 @@  static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 	return result;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
+				struct vm_area_struct *vma, pgoff_t index,
+				bool global_huge)
+{
+	unsigned long mask = READ_ONCE(huge_anon_shmem_orders_always);
+	unsigned long within_size_orders = READ_ONCE(huge_anon_shmem_orders_within_size);
+	unsigned long vm_flags = vma->vm_flags;
+	/*
+	 * Check all the (large) orders below HPAGE_PMD_ORDER + 1 that
+	 * are enabled for this vma.
+	 */
+	unsigned long orders = BIT(PMD_ORDER + 1) - 1;
+	loff_t i_size;
+	int order;
+
+	if ((vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+		return 0;
+
+	/* If the hardware/firmware marked hugepage support disabled. */
+	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+		return 0;
+
+	/*
+	 * Following the 'deny' semantics of the top level, force the huge
+	 * option off from all mounts.
+	 */
+	if (shmem_huge == SHMEM_HUGE_DENY)
+		return 0;
+	/*
+	 * Only allow inherit orders if the top-level value is 'force', which
+	 * means non-PMD sized THP can not override 'huge' mount option now.
+	 */
+	if (shmem_huge == SHMEM_HUGE_FORCE)
+		return READ_ONCE(huge_anon_shmem_orders_inherit);
+
+	/* Allow mTHP that will be fully within i_size. */
+	order = highest_order(within_size_orders);
+	while (within_size_orders) {
+		index = round_up(index + 1, order);
+		i_size = round_up(i_size_read(inode), PAGE_SIZE);
+		if (i_size >> PAGE_SHIFT >= index) {
+			mask |= within_size_orders;
+			break;
+		}
+
+		order = next_order(&within_size_orders, order);
+	}
+
+	if (vm_flags & VM_HUGEPAGE)
+		mask |= READ_ONCE(huge_anon_shmem_orders_madvise);
+
+	if (global_huge)
+		mask |= READ_ONCE(huge_anon_shmem_orders_inherit);
+
+	return orders & mask;
+}
+
+static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
+					struct address_space *mapping, pgoff_t index,
+					unsigned long orders)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long pages;
+	int order;
+
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+	if (!orders)
+		return 0;
+
+	/* Find the highest order that can add into the page cache */
+	order = highest_order(orders);
+	while (orders) {
+		pages = 1UL << order;
+		index = round_down(index, pages);
+		if (!xa_find(&mapping->i_pages, &index,
+			     index + pages - 1, XA_PRESENT))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	return orders;
+}
+#else
+static unsigned long anon_shmem_allowable_huge_orders(struct inode *inode,
+				struct vm_area_struct *vma, pgoff_t index,
+				bool global_huge)
+{
+	return 0;
+}
+
+static unsigned long anon_shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
+					struct address_space *mapping, pgoff_t index,
+					unsigned long orders)
+{
+	return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index, int order)
 {
@@ -1639,38 +1739,55 @@  static struct folio *shmem_alloc_folio(gfp_t gfp,
 	return (struct folio *)page;
 }
 
-static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
-		struct inode *inode, pgoff_t index,
-		struct mm_struct *fault_mm, bool huge)
+static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
+		gfp_t gfp, struct inode *inode, pgoff_t index,
+		struct mm_struct *fault_mm, bool huge, unsigned long orders)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	unsigned long suitable_orders;
 	struct folio *folio;
 	long pages;
-	int error;
+	int error, order;
 
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		huge = false;
 
-	if (huge) {
-		pages = HPAGE_PMD_NR;
-		index = round_down(index, HPAGE_PMD_NR);
+	if (huge || orders > 0) {
+		if (vma && vma_is_anon_shmem(vma) && orders) {
+			suitable_orders = anon_shmem_suitable_orders(inode, vmf,
+							mapping, index, orders);
+		} else {
+			pages = HPAGE_PMD_NR;
+			suitable_orders = BIT(HPAGE_PMD_ORDER);
+			index = round_down(index, HPAGE_PMD_NR);
 
-		/*
-		 * Check for conflict before waiting on a huge allocation.
-		 * Conflict might be that a huge page has just been allocated
-		 * and added to page cache by a racing thread, or that there
-		 * is already at least one small page in the huge extent.
-		 * Be careful to retry when appropriate, but not forever!
-		 * Elsewhere -EEXIST would be the right code, but not here.
-		 */
-		if (xa_find(&mapping->i_pages, &index,
+			/*
+			 * Check for conflict before waiting on a huge allocation.
+			 * Conflict might be that a huge page has just been allocated
+			 * and added to page cache by a racing thread, or that there
+			 * is already at least one small page in the huge extent.
+			 * Be careful to retry when appropriate, but not forever!
+			 * Elsewhere -EEXIST would be the right code, but not here.
+			 */
+			if (xa_find(&mapping->i_pages, &index,
 				index + HPAGE_PMD_NR - 1, XA_PRESENT))
-			return ERR_PTR(-E2BIG);
+				return ERR_PTR(-E2BIG);
+		}
 
-		folio = shmem_alloc_hugefolio(gfp, info, index, HPAGE_PMD_ORDER);
-		if (!folio && pages == HPAGE_PMD_NR)
-			count_vm_event(THP_FILE_FALLBACK);
+		order = highest_order(suitable_orders);
+		while (suitable_orders) {
+			pages = 1 << order;
+			index = round_down(index, pages);
+			folio = shmem_alloc_hugefolio(gfp, info, index, order);
+			if (folio)
+				goto allocated;
+
+			if (pages == HPAGE_PMD_NR)
+				count_vm_event(THP_FILE_FALLBACK);
+			order = next_order(&suitable_orders, order);
+		}
 	} else {
 		pages = 1;
 		folio = shmem_alloc_folio(gfp, info, index);
@@ -1678,6 +1795,7 @@  static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
 	if (!folio)
 		return ERR_PTR(-ENOMEM);
 
+allocated:
 	__folio_set_locked(folio);
 	__folio_set_swapbacked(folio);
 
@@ -1972,7 +2090,8 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	struct mm_struct *fault_mm;
 	struct folio *folio;
 	int error;
-	bool alloced;
+	bool alloced, huge;
+	unsigned long orders = 0;
 
 	if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
 		return -EINVAL;
@@ -2044,14 +2163,18 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 		return 0;
 	}
 
-	if (shmem_is_huge(inode, index, false, fault_mm,
-			  vma ? vma->vm_flags : 0)) {
+	huge = shmem_is_huge(inode, index, false, fault_mm,
+			     vma ? vma->vm_flags : 0);
+	/* Find hugepage orders that are allowed for anonymous shmem. */
+	if (vma && vma_is_anon_shmem(vma))
+		orders = anon_shmem_allowable_huge_orders(inode, vma, index, huge);
+	if (huge || orders > 0) {
 		gfp_t huge_gfp;
 
 		huge_gfp = vma_thp_gfp_mask(vma);
 		huge_gfp = limit_gfp_mask(huge_gfp, gfp);
-		folio = shmem_alloc_and_add_folio(huge_gfp,
-				inode, index, fault_mm, true);
+		folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
+				inode, index, fault_mm, true, orders);
 		if (!IS_ERR(folio)) {
 			if (folio_test_pmd_mappable(folio))
 				count_vm_event(THP_FILE_ALLOC);
@@ -2061,7 +2184,7 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 			goto repeat;
 	}
 
-	folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
+	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, false, 0);
 	if (IS_ERR(folio)) {
 		error = PTR_ERR(folio);
 		if (error == -EEXIST)
@@ -2072,7 +2195,7 @@  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 
 alloced:
 	alloced = true;
-	if (folio_test_pmd_mappable(folio) &&
+	if (folio_test_large(folio) &&
 	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
 					folio_next_index(folio) - 1) {
 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);