@@ -31,6 +31,28 @@
#define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define SWP_PFN_MASK ((1UL << SWP_PFN_BITS) - 1)
+/*
+ * SWP_PFN_OFFSET_FREE_BITS: number of bits left over in the arch specific
+ * swp offset field (__ARCH_SWP_OFFSET_BITS) after the SWP_PFN_BITS needed
+ * to store a PFN.  The subtraction is not clamped, so users should only
+ * treat a strictly positive value as "free bits available".
+ */
+#ifdef __ARCH_SWP_OFFSET_BITS
+#define SWP_PFN_OFFSET_FREE_BITS (__ARCH_SWP_OFFSET_BITS - SWP_PFN_BITS)
+#else
+/*
+ * If __ARCH_SWP_OFFSET_BITS is not defined, assume we don't have any free
+ * bits, to be on the safe side.
+ */
+#define SWP_PFN_OFFSET_FREE_BITS 0
+#endif
+
+/**
+ * Migration swap entry specific bitfield definitions.
+ *
+ * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set.  Note
+ *                     that this is a mask (1UL << SWP_PFN_BITS), not a bit
+ *                     index: it selects the offset bit directly above the
+ *                     PFN bits.
+ * @SWP_MIG_OFFSET_BITS: Number of offset bits a migration entry uses in
+ *                       total: the SWP_PFN_BITS PFN bits plus the one
+ *                       young bit.
+ *
+ * Note: these bits will be used only if there are free bits in the arch
+ * specific swp offset field. The arch needs __ARCH_SWP_OFFSET_BITS defined
+ * to use the bits/features.
+ */
+#define SWP_MIG_YOUNG_BIT (1UL << SWP_PFN_BITS)
+#define SWP_MIG_OFFSET_BITS (SWP_PFN_BITS + 1)
+
/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
@@ -258,6 +280,30 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
return swp_entry(SWP_MIGRATION_WRITE, offset);
}
+/**
+ * make_migration_entry_young - record "page was young" in a migration entry
+ * @entry: migration swap entry to tag
+ *
+ * Return: a swap entry with the same type as @entry and SWP_MIG_YOUNG_BIT
+ * OR-ed into its offset when the swp offset field has spare bits above the
+ * PFN; @entry unchanged otherwise.
+ */
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+	/*
+	 * Due to a limitation on x86_64 we can't use #ifdef, as
+	 * SWP_PFN_OFFSET_FREE_BITS value can be changed dynamically for
+	 * 4/5 level pgtables.  For all the non-x86_64 archs (where the
+	 * macro MAX_PHYSMEM_BITS is constant) this branching should be
+	 * optimized out by the compiler.
+	 *
+	 * Compare against zero explicitly: SWP_PFN_OFFSET_FREE_BITS is a
+	 * plain subtraction, so it goes negative when the arch offset
+	 * field is narrower than SWP_PFN_BITS.  A bare truthiness test
+	 * would treat that as "free bits available" and set a bit the
+	 * arch cannot store.
+	 */
+	if (SWP_PFN_OFFSET_FREE_BITS > 0)
+		return swp_entry(swp_type(entry),
+				 swp_offset(entry) | SWP_MIG_YOUNG_BIT);
+	return entry;
+}
+
+/**
+ * is_migration_entry_young - test whether a migration entry recorded "young"
+ * @entry: migration swap entry to inspect
+ *
+ * Return: true when the swp offset field has spare bits above the PFN and
+ * SWP_MIG_YOUNG_BIT is set in the offset of @entry; false otherwise.
+ */
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+	/*
+	 * Please refer to comment in make_migration_entry_young() for why
+	 * this is a runtime check rather than an #ifdef.
+	 *
+	 * The comparison is "> 0" on purpose: SWP_PFN_OFFSET_FREE_BITS is
+	 * a plain subtraction and may be negative, which must not count as
+	 * having a usable young bit.
+	 */
+	if (SWP_PFN_OFFSET_FREE_BITS > 0)
+		return swp_offset(entry) & SWP_MIG_YOUNG_BIT;
+	/* Keep the old behavior of aging page after migration */
+	return false;
+}
+
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
@@ -304,6 +350,16 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
return 0;
}
+/*
+ * Stub variant: hands @entry back untouched, so no young information is
+ * recorded in it.
+ * NOTE(review): presumably this is the branch built when migration support
+ * is compiled out -- the guarding #if is outside this hunk, please confirm.
+ */
+static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
+{
+ return entry;
+}
+
+/*
+ * Stub variant: always reports "not young", regardless of @entry.
+ * NOTE(review): presumably this is the branch built when migration support
+ * is compiled out -- the guarding #if is outside this hunk, please confirm.
+ */
+static inline bool is_migration_entry_young(swp_entry_t entry)
+{
+ return false;
+}
+
#endif
typedef unsigned long pte_marker;
@@ -407,6 +463,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
/* Make sure the swp offset can always store the needed fields */
BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
+ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_MIG_OFFSET_BITS);
return is_migration_entry(entry) || is_device_private_entry(entry) ||
is_device_exclusive_entry(entry);
@@ -2088,7 +2088,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_writable_migration_entry(entry);
if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = false;
+ young = is_migration_entry_young(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
@@ -2146,6 +2146,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -3148,6 +3150,8 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
+ if (pmd_young(pmdval))
+ entry = make_migration_entry_young(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3173,13 +3177,15 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
+ pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ if (!is_migration_entry_young(entry))
+ pmde = pmd_mkold(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
@@ -199,7 +199,7 @@ static bool remove_migration_pte(struct folio *folio,
#endif
folio_get(folio);
- pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
+ pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);
@@ -207,6 +207,8 @@ static bool remove_migration_pte(struct folio *folio,
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
+ if (!is_migration_entry_young(entry))
+ pte = pte_mkold(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
@@ -221,6 +221,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
else
entry = make_readable_migration_entry(
page_to_pfn(page));
+ if (pte_young(pte))
+ entry = make_migration_entry_young(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -2065,7 +2065,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
-
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
When page migration happens, we always ignore the young bit setting in the old page table and mark the page as old in the new page table, using either pte_mkold() or pmd_mkold(). That's fine functionally, but it's not friendly to page reclaim, because the page being moved can be actively accessed during the procedure. Actually we can easily remember the young bit and recover that information once the page has been migrated. To achieve this, define a new bit in the migration swap offset field to record whether the old pte had the young bit set. Then, when removing/recovering the migration entry, we can restore the young bit even though the page has changed. One thing to mention is that the whole feature is based on an arch-specific macro, __ARCH_SWP_OFFSET_BITS, that needs to be defined per arch. The macro tells how many bits are available in the arch-specific swp offset field. When that macro is not defined, we assume there are no free bits in the migration swap entry offset, so we can't persist the young bit. So for now there should be no functional change at all with this patch, since no arch defines __ARCH_SWP_OFFSET_BITS yet. Signed-off-by: Peter Xu <peterx@redhat.com> --- include/linux/swapops.h | 57 +++++++++++++++++++++++++++++++++++++++++ mm/huge_memory.c | 10 ++++++-- mm/migrate.c | 4 ++- mm/migrate_device.c | 2 ++ mm/rmap.c | 3 ++- 5 files changed, 72 insertions(+), 4 deletions(-)