[RFC,v1,09/10] mm: Use owner_ops on folio_put for zone device pages

Message ID	20241108162040.159038-10-tabba@google.com (mailing list archive)
State	New
Headers	show Received: from mail-yb1-f202.google.com (mail-yb1-f202.google.com [209.85.219.202]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1B550197A8E for <kvm@vger.kernel.org>; Fri, 8 Nov 2024 16:21:03 +0000 (UTC) Date: Fri, 8 Nov 2024 16:20:39 +0000 In-Reply-To: <20241108162040.159038-1-tabba@google.com> Precedence: bulk Mime-Version: 1.0 References: <20241108162040.159038-1-tabba@google.com> Message-ID: <20241108162040.159038-10-tabba@google.com> Subject: [RFC PATCH v1 09/10] mm: Use owner_ops on folio_put for zone device pages From: Fuad Tabba <tabba@google.com> To: linux-mm@kvack.org Cc: kvm@vger.kernel.org, nouveau@lists.freedesktop.org, dri-devel@lists.freedesktop.org, david@redhat.com, rppt@kernel.org, jglisse@redhat.com, akpm@linux-foundation.org, muchun.song@linux.dev, simona@ffwll.ch, airlied@gmail.com, pbonzini@redhat.com, seanjc@google.com, willy@infradead.org, jgg@nvidia.com, jhubbard@nvidia.com, ackerleytng@google.com, vannapurve@google.com, mail@maciej.szmigiero.name, kirill.shutemov@linux.intel.com, quic_eberman@quicinc.com, maz@kernel.org, will@kernel.org, qperret@google.com, keirf@google.com, roypat@amazon.co.uk, tabba@google.com Content-Type: text/plain; charset="UTF-8"
Series	mm: Introduce and use folio_owner_ops \| expand [RFC,v1,00/10] mm: Introduce and use folio_owner_ops [RFC,v1,01/10] mm/hugetlb: rename isolate_hugetlb() to folio_isolate_hugetlb() [RFC,v1,02/10] mm/migrate: don't call folio_putback_active_hugetlb() on dst hugetlb folio [RFC,v1,03/10] mm/hugetlb: rename "folio_putback_active_hugetlb()" to "folio_putback_hugetlb()" [RFC,v1,04/10] mm/hugetlb-cgroup: convert hugetlb_cgroup_css_offline() to work on folios [RFC,v1,05/10] mm/hugetlb: use folio->lru int demote_free_hugetlb_folios() [RFC,v1,06/10] mm/hugetlb: use separate folio->_hugetlb_list for hugetlb-internals [RFC,v1,07/10] mm: Introduce struct folio_owner_ops [RFC,v1,08/10] mm: Use getters and setters to access page pgmap [RFC,v1,09/10] mm: Use owner_ops on folio_put for zone device pages [RFC,v1,10/10] mm: hugetlb: Use owner_ops on folio_put for hugetlb

diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 060e27b6aee0..5b68bbc588a3 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -106,6 +106,7 @@ struct dev_pagemap_ops { /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @folio_ops: method table for folio operations. * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @done: completion for @ref @@ -125,6 +126,7 @@ struct dev_pagemap_ops { * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { + struct folio_owner_ops folio_ops; struct vmem_altmap altmap; struct percpu_ref ref; struct completion done; @@ -140,6 +142,12 @@ struct dev_pagemap { }; }; +/* + * The folio_owner_ops structure needs to be first since pgmap in struct page is + * overlaid with owner_ops in struct folio. + */ +static_assert(offsetof(struct dev_pagemap, folio_ops) == 0); + static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap) { return pgmap->ops && pgmap->ops->memory_failure; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 27075ea24e67..a72fda20d5e9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -427,6 +427,7 @@ FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(compound_head, owner_ops); +FOLIO_MATCH(pgmap, owner_ops); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); @@ -618,15 +619,26 @@ static inline const struct folio_owner_ops *folio_get_owner_ops(struct folio *fo /* * Get the page dev_pagemap pgmap pointer. + * + * The page pgmap is overlaid with the folio owner_ops, where bit 1 is used to + * indicate that the page/folio has owner ops. The dev_pagemap contains + * owner_ops and is handled the same way. The getter returns a sanitized + * pointer. 
*/ -#define page_get_pgmap(page) ((page)->pgmap) +#define page_get_pgmap(page) \ + ((struct dev_pagemap *)((unsigned long)(page)->pgmap & ~FOLIO_OWNER_OPS)) /* * Set the page dev_pagemap pgmap pointer. + * + * The page pgmap is overlaid with the folio owner_ops, where bit 1 is used to + * indicate that the page/folio has owner ops. The dev_pagemap contains + * owner_ops and is handled the same way. The setter sets bit 1 to indicate + * that the page has owner_ops. */ static inline void page_set_pgmap(struct page *page, struct dev_pagemap *pgmap) { - page->pgmap = pgmap; + page->pgmap = (struct dev_pagemap *)((unsigned long)pgmap | FOLIO_OWNER_OPS); } struct page_frag_cache { diff --git a/mm/internal.h b/mm/internal.h index 5a7302baeed7..a041247bed10 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1262,7 +1262,6 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); -void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, diff --git a/mm/memremap.c b/mm/memremap.c index 931bc85da1df..9fd5f57219eb 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -456,50 +456,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, } EXPORT_SYMBOL_GPL(get_dev_pagemap); -void free_zone_device_folio(struct folio *folio) -{ - struct dev_pagemap *pgmap = page_get_pgmap(&folio->page); - - if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) - return; - - mem_cgroup_uncharge(folio); - - /* - * Note: we don't expect anonymous compound pages yet. Once supported - * and we could PTE-map them similar to THP, we'd have to clear - * PG_anon_exclusive on all tail pages.
- */ - if (folio_test_anon(folio)) { - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - __ClearPageAnonExclusive(folio_page(folio, 0)); - } - - /* - * When a device managed page is freed, the folio->mapping field - * may still contain a (stale) mapping value. For example, the - * lower bits of folio->mapping may still identify the folio as an - * anonymous folio. Ultimately, this entire field is just stale - * and wrong, and it will cause errors if not cleared. - * - * For other types of ZONE_DEVICE pages, migration is either - * handled differently or not done at all, so there is no need - * to clear folio->mapping. - */ - folio->mapping = NULL; - pgmap->ops->page_free(folio_page(folio, 0)); - - if (pgmap->type != MEMORY_DEVICE_PRIVATE && - pgmap->type != MEMORY_DEVICE_COHERENT) - /* - * Reset the refcount to 1 to prepare for handing out the page - * again. - */ - folio_set_count(folio, 1); - else - put_dev_pagemap(pgmap); -} - void zone_device_page_init(struct page *page) { /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 279cdaebfd2b..47c1f8fd4914 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -974,6 +974,51 @@ static void __init memmap_init(void) } #ifdef CONFIG_ZONE_DEVICE + +static void free_zone_device_folio(struct folio *folio) +{ + struct dev_pagemap *pgmap = page_get_pgmap(&folio->page); + + if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free)) + return; + + mem_cgroup_uncharge(folio); + + /* + * Note: we don't expect anonymous compound pages yet. Once supported + * and we could PTE-map them similar to THP, we'd have to clear + * PG_anon_exclusive on all tail pages. + */ + if (folio_test_anon(folio)) { + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + __ClearPageAnonExclusive(folio_page(folio, 0)); + } + + /* + * When a device managed page is freed, the folio->mapping field + * may still contain a (stale) mapping value. For example, the + * lower bits of folio->mapping may still identify the folio as an + * anonymous folio. 
Ultimately, this entire field is just stale + * and wrong, and it will cause errors if not cleared. + * + * For other types of ZONE_DEVICE pages, migration is either + * handled differently or not done at all, so there is no need + * to clear folio->mapping. + */ + folio->mapping = NULL; + pgmap->ops->page_free(folio_page(folio, 0)); + + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + /* + * Reset the refcount to 1 to prepare for handing out the page + * again. + */ + folio_set_count(folio, 1); + else + put_dev_pagemap(pgmap); +} + static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, unsigned long zone_idx, int nid, struct dev_pagemap *pgmap) @@ -995,6 +1040,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * and zone_device_data. It is a bug if a ZONE_DEVICE page is * ever freed or placed on a driver-private list. */ + pgmap->folio_ops.free = free_zone_device_folio; page_set_pgmap(page, pgmap); page->zone_device_data = NULL; diff --git a/mm/swap.c b/mm/swap.c index 767ff6d8f47b..d2578465e270 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -117,11 +117,6 @@ void __folio_put(struct folio *folio) return; } - if (unlikely(folio_is_zone_device(folio))) { - free_zone_device_folio(folio); - return; - } - if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; @@ -947,20 +942,11 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (folio_ref_sub_and_test(folio, nr_refs)) - owner_ops->free(folio); - continue; - } - - if (folio_is_zone_device(folio)) { - if (lruvec) { - unlock_page_lruvec_irqrestore(lruvec, flags); - lruvec = NULL; - } + /* fenced by folio_is_zone_device() */ if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) - free_zone_device_folio(folio); + owner_ops->free(folio); continue; }

[RFC,v1,09/10] mm: Use owner_ops on folio_put for zone device pages

Commit Message

Patch