diff --git a/include/linux/memremap.h b/include/linux/memremap.h
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -106,6 +106,7 @@ struct dev_pagemap_ops {
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
+ * @folio_ops: method table for folio operations.
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @ref: reference count that pins the devm_memremap_pages() mapping
* @done: completion for @ref
@@ -125,6 +126,7 @@ struct dev_pagemap_ops {
* @ranges: array of ranges to be mapped when nr_range > 1
*/
struct dev_pagemap {
+ struct folio_owner_ops folio_ops;
struct vmem_altmap altmap;
struct percpu_ref ref;
struct completion done;
@@ -140,6 +142,12 @@ struct dev_pagemap {
};
};
+/*
+ * The folio_owner_ops structure needs to be first since pgmap in struct page is
+ * overlaid with owner_ops in struct folio.
+ */
+static_assert(offsetof(struct dev_pagemap, folio_ops) == 0);
+
static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
{
return pgmap->ops && pgmap->ops->memory_failure;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -427,6 +427,7 @@ FOLIO_MATCH(lru, lru);
FOLIO_MATCH(mapping, mapping);
FOLIO_MATCH(compound_head, lru);
FOLIO_MATCH(compound_head, owner_ops);
+FOLIO_MATCH(pgmap, owner_ops);
FOLIO_MATCH(index, index);
FOLIO_MATCH(private, private);
FOLIO_MATCH(_mapcount, _mapcount);
@@ -618,15 +619,26 @@ static inline const struct folio_owner_ops *folio_get_owner_ops(struct folio *fo
/*
* Get the page dev_pagemap pgmap pointer.
+ *
+ * The page pgmap is overlaid with the folio owner_ops, where bit 1 is used to
+ * indicate that the page/folio has owner ops. The dev_pagemap contains
+ * owner_ops and is handled the same way. The getter masks off that bit and
+ * returns a sanitized pointer.
*/
-#define page_get_pgmap(page) ((page)->pgmap)
+#define page_get_pgmap(page) \
+ ((struct dev_pagemap *)((unsigned long)(page)->pgmap & ~FOLIO_OWNER_OPS))
/*
* Set the page dev_pagemap pgmap pointer.
+ *
+ * The page pgmap is overlaid with the folio owner_ops, where bit 1 is used to
+ * indicate that the page/folio has owner ops. The dev_pagemap contains
+ * owner_ops and is handled the same way. The setter sets bit 1 to indicate
+ * that the page has owner_ops.
*/
static inline void page_set_pgmap(struct page *page, struct dev_pagemap *pgmap)
{
- page->pgmap = pgmap;
+ page->pgmap = (struct dev_pagemap *)((unsigned long)pgmap | FOLIO_OWNER_OPS);
}
struct page_frag_cache {
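
Not part of the patch, for illustration only: a minimal userspace sketch of the
pointer tagging that page_get_pgmap()/page_set_pgmap() implement above. It
assumes FOLIO_OWNER_OPS is the bit-1 flag (here 2UL) and that dev_pagemap
pointers are at least 4-byte aligned so that bit is otherwise always clear; all
names below are hypothetical stand-ins, not kernel APIs.

#include <assert.h>
#include <stdio.h>

#define DEMO_FOLIO_OWNER_OPS 2UL        /* assumed bit-1 flag, as in the patch */

struct demo_pagemap { int type; };      /* stand-in for struct dev_pagemap */

int main(void)
{
        struct demo_pagemap pgmap = { .type = 1 };
        void *stored;
        struct demo_pagemap *recovered;

        /* setter side: store the pointer with bit 1 set ("has owner ops") */
        stored = (void *)((unsigned long)&pgmap | DEMO_FOLIO_OWNER_OPS);

        /* getter side: mask the flag off again to recover a usable pointer */
        recovered = (struct demo_pagemap *)
                        ((unsigned long)stored & ~DEMO_FOLIO_OWNER_OPS);

        assert(((unsigned long)stored & DEMO_FOLIO_OWNER_OPS) != 0);
        assert(recovered == &pgmap);
        printf("stored=%p recovered=%p\n", stored, (void *)recovered);
        return 0;
}
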
diff --git a/mm/internal.h b/mm/internal.h
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1262,7 +1262,6 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int *flags, bool writable,
int *last_cpupid);
-void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);
struct vm_struct *__get_vm_area_node(unsigned long size,
diff --git a/mm/memremap.c b/mm/memremap.c
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -456,50 +456,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);
-void free_zone_device_folio(struct folio *folio)
-{
- struct dev_pagemap *pgmap = page_get_pgmap(&folio->page);
-
- if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
- return;
-
- mem_cgroup_uncharge(folio);
-
- /*
- * Note: we don't expect anonymous compound pages yet. Once supported
- * and we could PTE-map them similar to THP, we'd have to clear
- * PG_anon_exclusive on all tail pages.
- */
- if (folio_test_anon(folio)) {
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- __ClearPageAnonExclusive(folio_page(folio, 0));
- }
-
- /*
- * When a device managed page is freed, the folio->mapping field
- * may still contain a (stale) mapping value. For example, the
- * lower bits of folio->mapping may still identify the folio as an
- * anonymous folio. Ultimately, this entire field is just stale
- * and wrong, and it will cause errors if not cleared.
- *
- * For other types of ZONE_DEVICE pages, migration is either
- * handled differently or not done at all, so there is no need
- * to clear folio->mapping.
- */
- folio->mapping = NULL;
- pgmap->ops->page_free(folio_page(folio, 0));
-
- if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
- pgmap->type != MEMORY_DEVICE_COHERENT)
- /*
- * Reset the refcount to 1 to prepare for handing out the page
- * again.
- */
- folio_set_count(folio, 1);
- else
- put_dev_pagemap(pgmap);
-}
-
void zone_device_page_init(struct page *page)
{
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -974,6 +974,51 @@ static void __init memmap_init(void)
}
#ifdef CONFIG_ZONE_DEVICE
+
+static void free_zone_device_folio(struct folio *folio)
+{
+ struct dev_pagemap *pgmap = page_get_pgmap(&folio->page);
+
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ return;
+
+ mem_cgroup_uncharge(folio);
+
+ /*
+ * Note: we don't expect anonymous compound pages yet. Once supported
+ * and we could PTE-map them similar to THP, we'd have to clear
+ * PG_anon_exclusive on all tail pages.
+ */
+ if (folio_test_anon(folio)) {
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ __ClearPageAnonExclusive(folio_page(folio, 0));
+ }
+
+ /*
+ * When a device managed page is freed, the folio->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of folio->mapping may still identify the folio as an
+ * anonymous folio. Ultimately, this entire field is just stale
+ * and wrong, and it will cause errors if not cleared.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear folio->mapping.
+ */
+ folio->mapping = NULL;
+ pgmap->ops->page_free(folio_page(folio, 0));
+
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the refcount to 1 to prepare for handing out the page
+ * again.
+ */
+ folio_set_count(folio, 1);
+ else
+ put_dev_pagemap(pgmap);
+}
+
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
unsigned long zone_idx, int nid,
struct dev_pagemap *pgmap)
@@ -995,6 +1040,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
+ pgmap->folio_ops.free = free_zone_device_folio;
page_set_pgmap(page, pgmap);
page->zone_device_data = NULL;
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -117,11 +117,6 @@ void __folio_put(struct folio *folio)
return;
}
- if (unlikely(folio_is_zone_device(folio))) {
- free_zone_device_folio(folio);
- return;
- }
-
if (folio_test_hugetlb(folio)) {
free_huge_folio(folio);
return;
@@ -947,20 +942,11 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (folio_ref_sub_and_test(folio, nr_refs))
- owner_ops->free(folio);
- continue;
- }
-
- if (folio_is_zone_device(folio)) {
- if (lruvec) {
- unlock_page_lruvec_irqrestore(lruvec, flags);
- lruvec = NULL;
- }
+ /* put_devmap_managed_folio_refs() is fenced by folio_is_zone_device() */
if (put_devmap_managed_folio_refs(folio, nr_refs))
continue;
if (folio_ref_sub_and_test(folio, nr_refs))
- free_zone_device_folio(folio);
+ owner_ops->free(folio);
continue;
}

Now that we have the folio_owner_ops callback, use it for zone device pages
instead of calling free_zone_device_folio() directly.

Note that the struct dev_pagemap pointer (pgmap) in struct page is overlaid
with owner_ops in struct folio. Therefore, embed an instance of struct
folio_owner_ops at the start of struct dev_pagemap, so that the overlaid
pointer can be handled the same way as a folio_owner_ops pointer.

Also note that, although struct dev_pagemap_ops has a page_free() callback, it
has neither the same intent nor the same behavior as the folio_owner_ops
free() callback: page_free() is an optional callback that informs drivers
using ZONE_DEVICE that a page is being freed.

Signed-off-by: Fuad Tabba <tabba@google.com>
---
 include/linux/memremap.h |  8 +++++++
 include/linux/mm_types.h | 16 ++++++++++++--
 mm/internal.h            |  1 -
 mm/memremap.c            | 44 --------------------------------------
 mm/mm_init.c             | 46 ++++++++++++++++++++++++++++++++++++++++
 mm/swap.c                | 18 ++--------------
 6 files changed, 70 insertions(+), 63 deletions(-)
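
Not part of the patch, for illustration only: a self-contained sketch of the
free path described above, under the assumption that a bit-1 flag marks
folios that have owner ops. A pagemap-like struct embeds the ops table at
offset 0 (mirroring the static_assert added to memremap.h), the stored
pointer is tagged, and the final put masks the tag and dispatches through
ops->free(). All names are hypothetical stand-ins for the kernel structures.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define DEMO_OWNER_OPS_FLAG 2UL                 /* assumed bit-1 "has owner ops" flag */

struct demo_folio;

struct demo_owner_ops {                         /* stand-in for struct folio_owner_ops */
        void (*free)(struct demo_folio *folio);
};

struct demo_pagemap {                           /* stand-in for struct dev_pagemap */
        struct demo_owner_ops folio_ops;        /* must stay the first member */
        const char *name;
};

/* mirrors the static_assert in memremap.h: the overlay only works at offset 0 */
static_assert(offsetof(struct demo_pagemap, folio_ops) == 0, "folio_ops must be first");

struct demo_folio {                             /* stand-in for struct folio/page */
        unsigned long owner_ops;                /* tagged pointer, overlaid with pgmap */
        int refcount;
};

/* what pgmap->folio_ops.free points to after __init_zone_device_page() */
static void demo_free_zone_device_folio(struct demo_folio *folio)
{
        struct demo_pagemap *pgmap =
                (struct demo_pagemap *)(folio->owner_ops & ~DEMO_OWNER_OPS_FLAG);

        printf("freeing folio via owner_ops of pagemap '%s'\n", pgmap->name);
}

/* roughly what __folio_put()/folios_put_refs() do after this patch */
static void demo_folio_put(struct demo_folio *folio)
{
        const struct demo_owner_ops *ops;

        if (--folio->refcount)
                return;

        if (folio->owner_ops & DEMO_OWNER_OPS_FLAG) {
                /* the same masked value is both the pgmap and its ops table */
                ops = (const struct demo_owner_ops *)
                                (folio->owner_ops & ~DEMO_OWNER_OPS_FLAG);
                ops->free(folio);
                return;
        }
        /* ordinary (non-owner-ops) freeing would go here */
}

int main(void)
{
        struct demo_pagemap pgmap = {
                .folio_ops = { .free = demo_free_zone_device_folio },
                .name = "zone-device",
        };
        struct demo_folio folio = {
                /* what page_set_pgmap() does: store pgmap with the flag bit set */
                .owner_ops = (unsigned long)&pgmap | DEMO_OWNER_OPS_FLAG,
                .refcount = 1,
        };

        demo_folio_put(&folio);
        return 0;
}

The cast from the masked value to the ops table is only valid because
folio_ops is the first member of the pagemap struct, which is exactly the
layout constraint the new static_assert in memremap.h enforces.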