@@ -576,6 +576,10 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
+ LIST_HEAD(hwp_folios);
+
+ /* Needs to be done before removing folios from filemap. */
+ populate_memfd_hwp_folios(mapping, lstart >> PAGE_SHIFT, end, &hwp_folios);
folio_batch_init(&fbatch);
next = lstart >> PAGE_SHIFT;
@@ -605,6 +609,18 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
(void)hugetlb_unreserve_pages(inode,
lstart >> huge_page_shift(h),
LONG_MAX, freed);
+ /*
+ * hugetlbfs_error_remove_folio keeps the HWPoison-ed pages in
+ * page cache until mm wants to drop the folio at the end of the
+ * filemap. At this point, if memory failure was delayed by
+ * AS_MF_KEEP_UE_MAPPED in the past, we can now deal with it.
+ *
+ * TODO: in V2 we can probably get rid of populate_memfd_hwp_folios
+ * and hwp_folios, by inserting filemap_offline_hwpoison_folio
+ * into somewhere in folio_batch_release, or into per file system's
+ * free_folio handler.
+ */
+ offline_memfd_hwp_folios(mapping, &hwp_folios);
}
static void hugetlbfs_evict_inode(struct inode *inode)
@@ -836,10 +836,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
#ifdef CONFIG_MEMORY_FAILURE
extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
+extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+ struct address_space *mapping);
#else
static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
+/* Stub for !CONFIG_MEMORY_FAILURE: no folio is ever kept mapped. */
+static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+						       struct address_space *mapping)
+{
+	return false;
+}
#endif
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
@@ -210,6 +210,12 @@ enum mapping_flags {
AS_STABLE_WRITES = 7, /* must wait for writeback before modifying
folio contents */
AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */
+ /*
+ * Keep folios belonging to the mapping mapped even if uncorrectable memory
+ * errors (UE) caused memory failure (MF) within the folio. Only at the end
+ * of the mapping will its HWPoison-ed folios be dealt with.
+ */
+ AS_MF_KEEP_UE_MAPPED = 9,
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -335,6 +341,16 @@ static inline bool mapping_inaccessible(struct address_space *mapping)
return test_bit(AS_INACCESSIBLE, &mapping->flags);
}
+/* Report whether AS_MF_KEEP_UE_MAPPED is set in @mapping's flags. */
+static inline bool mapping_mf_keep_ue_mapped(struct address_space *mapping)
+{
+	bool keep_mapped = test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+
+	return keep_mapped;
+}
+
+/* Set AS_MF_KEEP_UE_MAPPED in @mapping's flags. */
+static inline void
+mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
+{
+	set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
@@ -1298,6 +1314,33 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
+#ifdef CONFIG_MEMORY_FAILURE
+void populate_memfd_hwp_folios(struct address_space *mapping,
+ pgoff_t lstart, pgoff_t lend,
+ struct list_head *list);
+void offline_memfd_hwp_folios(struct address_space *mapping,
+ struct list_head *list);
+/*
+ * Provided by memory failure to offline HWPoison-ed folio for various memory
+ * management systems (hugetlb, THP etc).
+ */
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+ struct folio *folio);
+#else
+/*
+ * No-op stub: must be static inline because this header is included by many
+ * translation units. Parameter types match the real prototype (pgoff_t).
+ */
+static inline void populate_memfd_hwp_folios(struct address_space *mapping,
+					     pgoff_t lstart, pgoff_t lend,
+					     struct list_head *list)
+{
+}
+/* No-op stub; static inline so the header can be included more than once. */
+static inline void offline_memfd_hwp_folios(struct address_space *mapping,
+					    struct list_head *list)
+{
+}
+/* No-op stub; static inline so the header can be included more than once. */
+static inline void filemap_offline_hwpoison_folio(struct address_space *mapping,
+						  struct folio *folio)
+{
+}
+#endif
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
int whence);
@@ -12,6 +12,7 @@
#define MFD_NOEXEC_SEAL 0x0008U
/* executable */
#define MFD_EXEC 0x0010U
+#define MFD_MF_KEEP_UE_MAPPED 0x0020U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
@@ -4427,3 +4427,81 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */
+
+#ifdef CONFIG_MEMORY_FAILURE
+/**
+ * struct memfd_hwp_folio - list entry remembering one HWPoison-ed folio.
+ * @node: Links this entry into a list of HWPoison-ed folios.
+ * @folio: The HWPoison-ed folio being remembered.
+ *
+ * Utility struct used to remember the HWPoison-ed folios within a mapping
+ * before every folio is removed from it.
+ */
+struct memfd_hwp_folio {
+ struct list_head node;
+ struct folio *folio;
+};
+/**
+ * populate_memfd_hwp_folios - collect the HWPoison-ed folios of a mapping.
+ * @mapping: The address_space of a memfd the kernel is trying to remove or truncate.
+ * @start: The starting page index.
+ * @end: The page index one past the last index to scan (exclusive); the
+ *       lookup is done over [@start, @end - 1].
+ * @list: Where the HWPoison-ed folios will be stored into.
+ *
+ * There may be pending HWPoison-ed folios when a memfd is being removed or
+ * part of it is being truncated. Stores them into a linked list to offline
+ * after the file system removes them.
+ *
+ * Does nothing unless @mapping has AS_MF_KEEP_UE_MAPPED set. The references
+ * taken by the lookup are dropped again by folio_batch_release(); no extra
+ * reference is taken here for the folios stored in @list.
+ */
+void populate_memfd_hwp_folios(struct address_space *mapping,
+			       pgoff_t start, pgoff_t end,
+			       struct list_head *list)
+{
+	unsigned int i;
+	struct folio *folio;
+	struct memfd_hwp_folio *to_add;
+	struct folio_batch fbatch;
+	pgoff_t next = start;
+
+	if (!mapping_mf_keep_ue_mapped(mapping))
+		return;
+
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			folio = fbatch.folios[i];
+			if (!folio_test_hwpoison(folio))
+				continue;
+
+			to_add = kmalloc(sizeof(*to_add), GFP_KERNEL);
+			if (!to_add) {
+				/*
+				 * Best effort: keep scanning, but do not hide
+				 * that this folio will not be put on @list and
+				 * so is skipped by offline_memfd_hwp_folios().
+				 */
+				pr_warn("%#lx: unable to track HWPoison-ed folio for deferred offline\n",
+					folio_pfn(folio));
+				continue;
+			}
+
+			to_add->folio = folio;
+			list_add_tail(&to_add->node, list);
+		}
+		folio_batch_release(&fbatch);
+	}
+}
+EXPORT_SYMBOL_GPL(populate_memfd_hwp_folios);
+
+/**
+ * offline_memfd_hwp_folios - hard offline the collected HWPoison-ed folios.
+ * @mapping: The address_space of a memfd the kernel is trying to remove or truncate.
+ * @list: The HWPoison-ed folios to offline. Every entry is unlinked and
+ *        freed, so @list is empty when this function returns.
+ *
+ * Meant to run once the file system has removed the folios belonging to the
+ * memfd, so the previously pending HWPoison-ed folios can be hard offlined.
+ * No locking is done here: the caller must own @list exclusively.
+ */
+void offline_memfd_hwp_folios(struct address_space *mapping,
+			      struct list_head *list)
+{
+	struct memfd_hwp_folio *entry;
+
+	while (!list_empty(list)) {
+		entry = list_first_entry(list, struct memfd_hwp_folio, node);
+		filemap_offline_hwpoison_folio(mapping, entry->folio);
+		list_del(&entry->node);
+		kfree(entry);
+	}
+}
+EXPORT_SYMBOL_GPL(offline_memfd_hwp_folios);
+
+#endif /* CONFIG_MEMORY_FAILURE */
@@ -6091,6 +6091,18 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
return same;
}
+bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+ struct address_space *mapping)
+{
+ if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
+ return false;
+
+ if (!mapping)
+ return false;
+
+ return mapping_mf_keep_ue_mapped(mapping);
+}
+
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
@@ -6214,9 +6226,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(folio_test_hwpoison(folio))) {
- ret = VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- goto backout_unlocked;
+ if (!mapping_mf_keep_ue_mapped(mapping)) {
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto backout_unlocked;
+ }
}
/* Check for page in userfault range. */
@@ -302,7 +302,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+ MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
@@ -376,6 +377,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
if (!(flags & MFD_HUGETLB)) {
if (flags & ~(unsigned int)MFD_ALL_FLAGS)
return -EINVAL;
+ if (flags & MFD_MF_KEEP_UE_MAPPED)
+ return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
@@ -436,6 +439,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
+	/*
+	 * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
+	 * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
+	 * seal-able.
+	 *
+	 * Both bits must be set: testing "flags & (A | B)" would also be true
+	 * for a plain MFD_HUGETLB memfd and enable the policy without
+	 * userspace asking for it.
+	 *
+	 * TODO: MFD_MF_KEEP_UE_MAPPED is not supported by all file system yet.
+	 */
+	if ((flags & MFD_HUGETLB) && (flags & MFD_MF_KEEP_UE_MAPPED))
+		mapping_set_mf_keep_ue_mapped(file->f_mapping);
+
if (flags & MFD_NOEXEC_SEAL) {
struct inode *inode = file_inode(file);
@@ -445,11 +445,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
-static void __add_to_kill(struct task_struct *tsk, const struct page *p,
+static void __add_to_kill(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
struct to_kill *tk;
+ struct folio *folio;
+ struct address_space *mapping;
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
@@ -460,8 +462,20 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
tk->addr = addr;
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
- else
- tk->size_shift = folio_shift(page_folio(p));
+ else {
+ folio = page_folio(p);
+ mapping = folio_mapping(folio);
+ if (mapping && mapping_mf_keep_ue_mapped(mapping))
+ /*
+ * Let userspace know the radius of the hardware poison
+ * is the size of a raw page; as long as it aborts loads
+ * to that range, other pages inside the folio are still
+ * safe to access.
+ */
+ tk->size_shift = PAGE_SHIFT;
+ else
+ tk->size_shift = folio_shift(folio);
+ }
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -486,7 +500,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
list_add_tail(&tk->nd, to_kill);
}
-static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
+static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
unsigned long addr)
{
@@ -607,7 +621,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(const struct folio *folio,
- const struct page *page, struct list_head *to_kill,
+ struct page *page, struct list_head *to_kill,
int force_early)
{
struct task_struct *tsk;
@@ -645,7 +659,7 @@ static void collect_procs_anon(const struct folio *folio,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(const struct folio *folio,
- const struct page *page, struct list_head *to_kill,
+ struct page *page, struct list_head *to_kill,
int force_early)
{
struct vm_area_struct *vma;
@@ -727,7 +741,7 @@ static void collect_procs_fsdax(const struct page *page,
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
-static void collect_procs(const struct folio *folio, const struct page *page,
+static void collect_procs(const struct folio *folio, struct page *page,
struct list_head *tokill, int force_early)
{
if (!folio->mapping)
@@ -1226,6 +1240,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
+ /*
+ * MF still needs to hold a refcount for the deferred actions in
+ * filemap_offline_hwpoison_folio.
+ */
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ return res;
+
if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
@@ -1593,6 +1614,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
+ bool keep_mapped;
int forcekill;
bool mlocked = folio_test_mlocked(folio);
@@ -1643,10 +1665,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
+ keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, mapping);
+ if (!keep_mapped)
+ unmap_poisoned_folio(folio, ttu);
unmap_success = !folio_mapped(folio);
- if (!unmap_success)
+ if (!unmap_success && !keep_mapped)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
@@ -1671,7 +1695,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
!unmap_success;
kill_procs(&tokill, forcekill, pfn, flags);
- return unmap_success;
+ return unmap_success || keep_mapped;
}
static int identify_page_state(unsigned long pfn, struct page *p,
@@ -1911,6 +1935,9 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
unsigned long count = 0;
head = llist_del_all(raw_hwp_list_head(folio));
+ if (head == NULL)
+ return 0;
+
llist_for_each_entry_safe(p, next, head, node) {
if (move_flag)
SetPageHWPoison(p->page);
@@ -1927,7 +1954,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
struct raw_hwp_page *p;
- int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
+ struct address_space *mapping = folio->mapping;
+ bool has_hwpoison = folio_test_set_hwpoison(folio);
/*
* Once the hwpoison hugepage has lost reliable raw error info,
@@ -1946,8 +1974,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
if (raw_hwp) {
raw_hwp->page = page;
llist_add(&raw_hwp->node, head);
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ /*
+ * A new raw HWPoison page. Don't return HWPOISON.
+ * Error event will be counted in action_result().
+ */
+ return 0;
+
/* the first error event will be counted in action_result(). */
- if (ret)
+ if (has_hwpoison)
num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
@@ -1962,7 +1997,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
*/
__folio_free_raw_hwp(folio, false);
}
- return ret;
+
+ return has_hwpoison ? -EHWPOISON : 0;
}
static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
@@ -2051,6 +2087,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
+/*
+ * Complete the pending hard offline of a HWPoison-ed hugetlb folio: detach
+ * its raw hwp list, drop one folio reference per list entry, dissolve the
+ * folio, then mark each recorded raw page HWPoison and take it off the buddy
+ * allocator. Every detached list entry is freed on both the success and the
+ * failure path.
+ */
+static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
+{
+	int ret;
+	struct llist_node *head;
+	struct raw_hwp_page *curr, *next;
+	struct page *page;
+	unsigned long pfn;
+
+	head = llist_del_all(raw_hwp_list_head(folio));
+
+	/*
+	 * Release references held by try_memory_failure_hugetlb, one per
+	 * HWPoison-ed page in raw hwp list. This folio's refcount is expected
+	 * to drop to zero after the below for-each loop.
+	 */
+	llist_for_each_entry(curr, head, node)
+		folio_put(folio);
+
+	ret = dissolve_free_hugetlb_folio(folio);
+	if (ret) {
+		pr_err("failed to dissolve hugetlb folio: %d\n", ret);
+		llist_for_each_entry_safe(curr, next, head, node) {
+			page = curr->page;
+			pfn = page_to_pfn(page);
+			/*
+			 * TODO: roll back the count incremented during online
+			 * handling, i.e. whatever me_huge_page returns.
+			 */
+			update_per_node_mf_stats(pfn, MF_FAILED);
+			/*
+			 * The list was detached from the folio above, so this
+			 * is the only place left that can free the entry.
+			 */
+			kfree(curr);
+		}
+		return;
+	}
+
+	llist_for_each_entry_safe(curr, next, head, node) {
+		page = curr->page;
+		pfn = page_to_pfn(page);
+		drain_all_pages(page_zone(page));
+		if (PageBuddy(page) && !take_page_off_buddy(page))
+			pr_warn("%#lx: unable to take off buddy allocator\n", pfn);
+
+		SetPageHWPoison(page);
+		page_ref_inc(page);
+		kfree(curr);
+		pr_info("%#lx: pending hard offline completed\n", pfn);
+	}
+}
+
+/*
+ * Offline a HWPoison-ed @folio whose handling was deferred. Warns once when
+ * @mapping is NULL. Only hugetlb folios currently have pending memory
+ * failure recovery; every other case is left untouched.
+ */
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+				    struct folio *folio)
+{
+	WARN_ON_ONCE(!mapping);
+
+	if (!hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+		return;
+
+	filemap_offline_hwpoison_folio_hugetlb(folio);
+}
+
/*
* Taking refcount of hugetlb pages needs extra care about race conditions
* with basic operations like hugepage allocation/free/demotion.
Sometimes immediately hard offlining a memory page that has uncorrected memory errors (UE) may not be the best option for capacity and/or performance reasons. "Sometimes" even becomes "often times" in Cloud scenarios. See the cover letter for the descriptions of two scenarios. Therefore keeping or discarding a large chunk of contiguous memory mapped to userspace (particularly to serve guest memory) due to UE (recoverable is implied) should be controllable by a userspace process, e.g. a VMM in a Cloud environment. Given the relevance of HugeTLB's non-ideal memory failure recovery behavior, this commit uses HugeTLB as the "testbed" to demonstrate the idea of a memfd-based userspace memory failure policy. MFD_MF_KEEP_UE_MAPPED is added to the possible values for flags in the memfd_create syscall. It is intended to be generic for any memfd, not just HugeTLB, but the current implementation only covers HugeTLB. When MFD_MF_KEEP_UE_MAPPED is set in flags, memory failure recovery in the kernel doesn’t hard offline memory due to UE until the created memfd is released or the affected memory region is truncated by userspace. IOW, the HWPoison-ed memory remains accessible via the returned memfd or the memory mapping created with that memfd. However, the affected memory will be immediately protected and isolated from future use by both kernel and userspace once the owning memfd is gone or the memory is truncated. By default MFD_MF_KEEP_UE_MAPPED is not set, and the kernel hard offlines memory having UEs. Tested with the selftest in a followup patch. This commit should probably be split into smaller pieces, but for now I will defer it until this RFC becomes PATCH.
Signed-off-by: Jiaqi Yan <jiaqiyan@google.com> --- fs/hugetlbfs/inode.c | 16 +++++ include/linux/hugetlb.h | 7 +++ include/linux/pagemap.h | 43 ++++++++++++++ include/uapi/linux/memfd.h | 1 + mm/filemap.c | 78 ++++++++++++++++++++++++ mm/hugetlb.c | 20 ++++++- mm/memfd.c | 15 ++++- mm/memory-failure.c | 119 +++++++++++++++++++++++++++++++++---- 8 files changed, 282 insertions(+), 17 deletions(-)