Message ID | 20230405180134.16932-4-ankita@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Expose GPU memory as coherently CPU accessible | expand |
Hi, kernel test robot noticed the following build errors: [auto build test ERROR on awilliam-vfio/for-linus] [also build test ERROR on kvmarm/next akpm-mm/mm-everything linus/master v6.3-rc5] [cannot apply to awilliam-vfio/next next-20230405] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/ankita-nvidia-com/kvm-determine-memory-type-from-VMA/20230406-020404 base: https://github.com/awilliam/linux-vfio.git for-linus patch link: https://lore.kernel.org/r/20230405180134.16932-4-ankita%40nvidia.com patch subject: [PATCH v3 3/6] mm: handle poisoning of pfn without struct pages config: x86_64-randconfig-a015-20230403 (https://download.01.org/0day-ci/archive/20230406/202304060452.tpNrPK39-lkp@intel.com/config) compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/25466c8c2fa22d39a08721a24f0cf3bc3059417b git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review ankita-nvidia-com/kvm-determine-memory-type-from-VMA/20230406-020404 git checkout 25466c8c2fa22d39a08721a24f0cf3bc3059417b # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=x86_64 olddefconfig make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot <lkp@intel.com> | Link: https://lore.kernel.org/oe-kbuild-all/202304060452.tpNrPK39-lkp@intel.com/ All errors (new ones prefixed by >>): ld: vmlinux.o: in function `memory_failure_pfn': >> mm/memory-failure.c:2124: undefined reference to `interval_tree_iter_first' >> ld: mm/memory-failure.c:2125: undefined reference to `interval_tree_iter_next' ld: vmlinux.o: in function `register_pfn_address_space': >> mm/memory-failure.c:2087: undefined reference to `interval_tree_insert' ld: vmlinux.o: in function `unregister_pfn_address_space': >> mm/memory-failure.c:2105: undefined reference to `interval_tree_remove' vim +2124 mm/memory-failure.c 2065 2066 /** 2067 * register_pfn_address_space - Register PA region for poison notification. 2068 * @pfn_space: structure containing region range and callback function on 2069 * poison detection. 2070 * 2071 * This function is called by a kernel module to register a PA region and 2072 * a callback function with the kernel. On detection of poison, the 2073 * kernel code will go through all registered regions and call the 2074 * appropriate callback function associated with the range. The kernel 2075 * module is responsible for tracking the poisoned pages. 2076 * 2077 * Return: 0 if successfully registered, 2078 * -EBUSY if the region is already registered 2079 */ 2080 int register_pfn_address_space(struct pfn_address_space *pfn_space) 2081 { 2082 if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, 2083 (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, "")) 2084 return -EBUSY; 2085 2086 mutex_lock(&pfn_space_lock); > 2087 interval_tree_insert(&pfn_space->node, &pfn_space_itree); 2088 mutex_unlock(&pfn_space_lock); 2089 2090 return 0; 2091 } 2092 EXPORT_SYMBOL_GPL(register_pfn_address_space); 2093 2094 /** 2095 * unregister_pfn_address_space - Unregister a PA region from poison 2096 * notification. 2097 * @pfn_space: structure containing region range to be unregistered. 2098 * 2099 * This function is called by a kernel module to unregister the PA region 2100 * from the kernel from poison tracking. 2101 */ 2102 void unregister_pfn_address_space(struct pfn_address_space *pfn_space) 2103 { 2104 mutex_lock(&pfn_space_lock); > 2105 interval_tree_remove(&pfn_space->node, &pfn_space_itree); 2106 mutex_unlock(&pfn_space_lock); 2107 release_mem_region(pfn_space->node.start << PAGE_SHIFT, 2108 (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT); 2109 } 2110 EXPORT_SYMBOL_GPL(unregister_pfn_address_space); 2111 2112 static int memory_failure_pfn(unsigned long pfn, int flags) 2113 { 2114 struct interval_tree_node *node; 2115 int rc = -EBUSY; 2116 LIST_HEAD(tokill); 2117 2118 mutex_lock(&pfn_space_lock); 2119 /* 2120 * Modules registers with MM the address space mapping to the device memory they 2121 * manage. Iterate to identify exactly which address space has mapped to this 2122 * failing PFN. 2123 */ > 2124 for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; > 2125 node = interval_tree_iter_next(node, pfn, pfn)) { 2126 struct pfn_address_space *pfn_space = 2127 container_of(node, struct pfn_address_space, node); 2128 rc = 0; 2129 2130 /* 2131 * Modules managing the device memory needs to be conveyed about the 2132 * memory failure so that the poisoned PFN can be tracked. 2133 */ 2134 pfn_space->ops->failure(pfn_space, pfn); 2135 2136 collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill); 2137 2138 unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, 2139 PAGE_SIZE, 0); 2140 } 2141 mutex_unlock(&pfn_space_lock); 2142 2143 /* 2144 * Unlike System-RAM there is no possibility to swap in a different 2145 * physical page at a given virtual address, so all userspace 2146 * consumption of direct PFN memory necessitates SIGBUS (i.e. 2147 * MF_MUST_KILL) 2148 */ 2149 flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; 2150 kill_procs(&tokill, true, false, pfn, flags); 2151 2152 pr_err("%#lx: recovery action for %s: %s\n", 2153 pfn, action_page_types[MF_MSG_PFN], 2154 action_name[rc ? MF_FAILED : MF_RECOVERED]); 2155 2156 return rc; 2157 } 2158
Thanks, Naoya for reviewing the patch. Comments inline. > -----Original Message----- > From: HORIGUCHI NAOYA(堀口 直也) <naoya.horiguchi@nec.com> > Sent: Tuesday, May 9, 2023 2:51 AM > To: Ankit Agrawal <ankita@nvidia.com> > Cc: Jason Gunthorpe <jgg@nvidia.com>; alex.williamson@redhat.com; > maz@kernel.org; oliver.upton@linux.dev; Aniket Agashe > <aniketa@nvidia.com>; Neo Jia <cjia@nvidia.com>; Kirti Wankhede > <kwankhede@nvidia.com>; Tarun Gupta (SW-GPU) <targupta@nvidia.com>; > Vikram Sethi <vsethi@nvidia.com>; Andy Currid <acurrid@nvidia.com>; > Alistair Popple <apopple@nvidia.com>; John Hubbard > <jhubbard@nvidia.com>; Dan Williams <danw@nvidia.com>; > kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm- > kernel@lists.infradead.org; linux-mm@kvack.org > Subject: Re: [PATCH v3 3/6] mm: handle poisoning of pfn without struct pages > > External email: Use caution opening links or attachments > > > On Wed, Apr 05, 2023 at 11:01:31AM -0700, ankita@nvidia.com wrote: > > From: Ankit Agrawal <ankita@nvidia.com> > > > > The kernel MM does not currently handle ECC errors / poison on a > > memory region that is not backed by struct pages. In this series, > > mapping request from QEMU to the device memory is executed using > remap_pfn_range(). > > Hence added a new mechanism to handle memory failure on such memory. > > > > Make kernel MM expose a function to allow modules managing the device > > memory to register a failure function and the address space that is > > associated with the device memory. MM maintains this information as > > interval tree. The registered memory failure function is used by MM to > > notify the module of the PFN, so that the module may take any required > > action. The module for example may use the information to track the > > poisoned pages. > > > > In this implementation, kernel MM follows the following sequence > > (mostly) similar to the memory_failure() handler for struct page backed > memory: > > 1. memory_failure() is triggered on reception of a poison error. An > > absence of struct page is detected and consequently memory_failure_pfn > > is executed. > > 2. memory_failure_pfn() call the newly introduced failure handler > > exposed by the module managing the poisoned memory to notify it of the > > problematic PFN. > > 3. memory_failure_pfn() unmaps the stage-2 mapping to the PFN. > > 4. memory_failure_pfn() collects the processes mapped to the PFN. > > 5. memory_failure_pfn() sends SIGBUS (BUS_MCEERR_AO) to all the > > processes mapping the faulty PFN using kill_procs(). > > 6. An access to the faulty PFN by an operation in VM at a later point > > of time is trapped and user_mem_abort() is called. > > 7. user_mem_abort() calls __gfn_to_pfn_memslot() on the PFN, and the > > following execution path is followed: __gfn_to_pfn_memslot() -> > > hva_to_pfn() -> hva_to_pfn_remapped() -> fixup_user_fault() -> > > handle_mm_fault() -> handle_pte_fault() -> do_fault(). do_fault() is > > expected to return VM_FAULT_HWPOISON on the PFN (it currently does not > > and is fixed as part of another patch in the series). > > 8. __gfn_to_pfn_memslot() then returns KVM_PFN_ERR_HWPOISON, > which > > cause the poison with SIGBUS (BUS_MCEERR_AR) to be sent to the QEMU > > process through kvm_send_hwpoison_signal(). > > > > Signed-off-by: Ankit Agrawal <ankita@nvidia.com> > > --- > > include/linux/memory-failure.h | 22 +++++ > > include/linux/mm.h | 1 + > > include/ras/ras_event.h | 1 + > > mm/memory-failure.c | 148 +++++++++++++++++++++++++++++---- > > 4 files changed, 154 insertions(+), 18 deletions(-) create mode > > 100644 include/linux/memory-failure.h > > > > diff --git a/include/linux/memory-failure.h > > b/include/linux/memory-failure.h new file mode 100644 index > > 000000000000..9a579960972a > > --- /dev/null > > +++ b/include/linux/memory-failure.h > > @@ -0,0 +1,22 @@ > > +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef > > +_LINUX_MEMORY_FAILURE_H #define _LINUX_MEMORY_FAILURE_H > > + > > +#include <linux/interval_tree.h> > > + > > +struct pfn_address_space; > > + > > +struct pfn_address_space_ops { > > + void (*failure)(struct pfn_address_space *pfn_space, unsigned > > +long pfn); }; > > + > > +struct pfn_address_space { > > + struct interval_tree_node node; > > + const struct pfn_address_space_ops *ops; > > + struct address_space *mapping; > > +}; > > + > > +int register_pfn_address_space(struct pfn_address_space *pfn_space); > > +void unregister_pfn_address_space(struct pfn_address_space > > +*pfn_space); > > + > > +#endif /* _LINUX_MEMORY_FAILURE_H */ > > diff --git a/include/linux/mm.h b/include/linux/mm.h index > > 1f79667824eb..e3ef52d3d45a 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -3530,6 +3530,7 @@ enum mf_action_page_type { > > MF_MSG_BUDDY, > > MF_MSG_DAX, > > MF_MSG_UNSPLIT_THP, > > + MF_MSG_PFN, > > Personally, the keyword "PFN" looks to me a little too generic, so I prefer > "PFNMAP" or "PFN_MAP" because memory_failure() is anyway called with > pfn regardless of being backed by struct page. > Makes sense. Will change to PFNMAP. > > MF_MSG_UNKNOWN, > > }; > > > > diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index > > cbd3ddd7c33d..5c62a4d17172 100644 > > --- a/include/ras/ras_event.h > > +++ b/include/ras/ras_event.h > > @@ -373,6 +373,7 @@ TRACE_EVENT(aer_event, > > EM ( MF_MSG_BUDDY, "free buddy page" ) \ > > EM ( MF_MSG_DAX, "dax page" ) \ > > EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ > > + EM ( MF_MSG_PFN, "non struct page pfn" ) \ > > EMe ( MF_MSG_UNKNOWN, "unknown page" ) > > > > /* > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c index > > fae9baf3be16..2c1a2ec42f7b 100644 > > --- a/mm/memory-failure.c > > +++ b/mm/memory-failure.c > > @@ -38,6 +38,7 @@ > > > > #include <linux/kernel.h> > > #include <linux/mm.h> > > +#include <linux/memory-failure.h> > > #include <linux/page-flags.h> > > #include <linux/kernel-page-flags.h> > > #include <linux/sched/signal.h> > > @@ -62,6 +63,7 @@ > > #include <linux/page-isolation.h> > > #include <linux/pagewalk.h> > > #include <linux/shmem_fs.h> > > +#include <linux/pfn_t.h> > > #include "swap.h" > > #include "internal.h" > > #include "ras/ras_event.h" > > @@ -122,6 +124,10 @@ const struct attribute_group > memory_failure_attr_group = { > > .attrs = memory_failure_attr, > > }; > > > > +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; > > + > > +static DEFINE_MUTEX(pfn_space_lock); > > + > > /* > > * Return values: > > * 1: the page is dissolved (if needed) and taken off from buddy, > > @@ -399,15 +405,14 @@ static unsigned long > dev_pagemap_mapping_shift(struct vm_area_struct *vma, > > * Schedule a process for later kill. > > * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. > > * > > - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a > > - * filesystem with a memory failure handler has claimed the > > - * memory_failure event. In all other cases, page->index and > > - * page->mapping are sufficient for mapping the page back to its > > + * Notice: @pgoff is used either when @p is a fsdax page or a PFN is > > + not > > + * backed by struct page and a filesystem with a memory failure > > + handler > > + * has claimed the memory_failure event. In all other cases, > > + page->index > > You add a new case using @pgoff, and now we have 3 such cases, so could > you update the comment to itemize them (which makes it easier to read and > update)? > Sure, will do in the next version. > > + * and page->mapping are sufficient for mapping the page back to its > > * corresponding user virtual address. > > */ > > -static void add_to_kill(struct task_struct *tsk, struct page *p, > > - pgoff_t fsdax_pgoff, struct vm_area_struct *vma, > > - struct list_head *to_kill) > > +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff, > > + struct vm_area_struct *vma, struct list_head > > +*to_kill) > > { > > struct to_kill *tk; > > > > @@ -417,13 +422,20 @@ static void add_to_kill(struct task_struct *tsk, > struct page *p, > > return; > > } > > > > - tk->addr = page_address_in_vma(p, vma); > > - if (is_zone_device_page(p)) { > > - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) > > - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); > > + if (vma->vm_flags | PFN_MAP) { > > + tk->addr = > > + vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); > > + tk->size_shift = PAGE_SHIFT; > > + } else if (is_zone_device_page(p)) { > > + if (pgoff != FSDAX_INVALID_PGOFF) > > + tk->addr = vma_pgoff_address(pgoff, 1, vma); > > + else > > + tk->addr = page_address_in_vma(p, vma); > > tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); > > - } else > > + } else { > > + tk->addr = page_address_in_vma(p, vma); > > tk->size_shift = page_shift(compound_head(p)); > > + } > > > > /* > > * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -617,13 > > +629,12 @@ static void collect_procs_file(struct page *page, struct list_head > *to_kill, > > i_mmap_unlock_read(mapping); > > } > > > > -#ifdef CONFIG_FS_DAX > > It seems that your new driver is built only in limited > configuration/architecture, so loosening the condition instead of simply > removing might be better. > > > /* > > * Collect processes when the error hit a fsdax page. > > */ > > -static void collect_procs_fsdax(struct page *page, > > - struct address_space *mapping, pgoff_t pgoff, > > - struct list_head *to_kill) > > +static void collect_procs_pgoff(struct page *page, > > + struct address_space *mapping, pgoff_t pgoff, > > + struct list_head *to_kill) > > { > > struct vm_area_struct *vma; > > struct task_struct *tsk; > > @@ -643,7 +654,6 @@ static void collect_procs_fsdax(struct page *page, > > read_unlock(&tasklist_lock); > > i_mmap_unlock_read(mapping); > > } > > -#endif /* CONFIG_FS_DAX */ > > > > /* > > * Collect the processes who have the corrupted page mapped to kill. > > @@ -835,6 +845,7 @@ static const char * const action_page_types[] = { > > [MF_MSG_BUDDY] = "free buddy page", > > [MF_MSG_DAX] = "dax page", > > [MF_MSG_UNSPLIT_THP] = "unsplit thp", > > + [MF_MSG_PFN] = "non struct page pfn", > > [MF_MSG_UNKNOWN] = "unknown page", > > }; > > > > @@ -1745,7 +1756,7 @@ int mf_dax_kill_procs(struct address_space > > *mapping, pgoff_t index, > > > > SetPageHWPoison(page); > > > > - collect_procs_fsdax(page, mapping, index, &to_kill); > > + collect_procs_pgoff(page, mapping, index, &to_kill); > > unmap_and_kill(&to_kill, page_to_pfn(page), mapping, > > index, mf_flags); > > unlock: > > @@ -2052,6 +2063,99 @@ static int > memory_failure_dev_pagemap(unsigned long pfn, int flags, > > return rc; > > } > > > > +/** > > + * register_pfn_address_space - Register PA region for poison notification. > > + * @pfn_space: structure containing region range and callback function on > > + * poison detection. > > + * > > + * This function is called by a kernel module to register a PA region > > +and > > + * a callback function with the kernel. On detection of poison, the > > + * kernel code will go through all registered regions and call the > > + * appropriate callback function associated with the range. The > > +kernel > > + * module is responsible for tracking the poisoned pages. > > + * > > + * Return: 0 if successfully registered, > > + * -EBUSY if the region is already registered > > + */ > > +int register_pfn_address_space(struct pfn_address_space *pfn_space) { > > + if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, > > + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, > "")) > > + return -EBUSY; > > + > > + mutex_lock(&pfn_space_lock); > > + interval_tree_insert(&pfn_space->node, &pfn_space_itree); > > + mutex_unlock(&pfn_space_lock); > > + > > + return 0; > > +} > > +EXPORT_SYMBOL_GPL(register_pfn_address_space); > > register_pfn_address_space and unregister_pfn_address_space are not > compiled if CONFIG_MEMORY_FAILURE is not set, so maybe your new driver > might need to depend on this config. > Yeah, that's right. Shall I put parts of code in the vfio-pci variant driver related to poison handling as part of #ifdef CONFIG_MEMORY_FAILURE. Or would you rather prefer I make NVGPU_VFIO_PCI config dependent on CONFIG_MEMORY_FAILURE? > > + > > +/** > > + * unregister_pfn_address_space - Unregister a PA region from poison > > + * notification. > > + * @pfn_space: structure containing region range to be unregistered. > > + * > > + * This function is called by a kernel module to unregister the PA > > +region > > + * from the kernel from poison tracking. > > + */ > > +void unregister_pfn_address_space(struct pfn_address_space > > +*pfn_space) { > > + mutex_lock(&pfn_space_lock); > > + interval_tree_remove(&pfn_space->node, &pfn_space_itree); > > + mutex_unlock(&pfn_space_lock); > > + release_mem_region(pfn_space->node.start << PAGE_SHIFT, > > + (pfn_space->node.last - pfn_space->node.start + 1) << > > +PAGE_SHIFT); } EXPORT_SYMBOL_GPL(unregister_pfn_address_space); > > + > > +static int memory_failure_pfn(unsigned long pfn, int flags) { > > + struct interval_tree_node *node; > > + int rc = -EBUSY; > > + LIST_HEAD(tokill); > > + > > + mutex_lock(&pfn_space_lock); > > + /* > > + * Modules registers with MM the address space mapping to the device > memory they > > + * manage. Iterate to identify exactly which address space has mapped to > this > > + * failing PFN. > > + */ > > + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; > > + node = interval_tree_iter_next(node, pfn, pfn)) { > > + struct pfn_address_space *pfn_space = > > + container_of(node, struct pfn_address_space, node); > > + rc = 0; > > + > > + /* > > + * Modules managing the device memory needs to be conveyed > about the > > + * memory failure so that the poisoned PFN can be tracked. > > + */ > > + pfn_space->ops->failure(pfn_space, pfn); > > + > > + collect_procs_pgoff(NULL, pfn_space->mapping, pfn, > > + &tokill); > > + > > + unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, > > + PAGE_SIZE, 0); > > + } > > + mutex_unlock(&pfn_space_lock); > > If rc == 0 at this point, the given pfn seems to be outside the GPU memory, so > that should be considered as "Invalid address" case whose check is removed > by patch 5/6. So it might be better to sperate the case from "do handling for > non struct page pfn" case. > Sorry did you mean rc !=0 here? But yeah, you are right that I should add the case for check in case a region with the desired PFN isn't found above. > > + > > + /* > > + * Unlike System-RAM there is no possibility to swap in a different > > + * physical page at a given virtual address, so all userspace > > + * consumption of direct PFN memory necessitates SIGBUS (i.e. > > + * MF_MUST_KILL) > > + */ > > + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; > > + kill_procs(&tokill, true, false, pfn, flags); > > + > > + pr_err("%#lx: recovery action for %s: %s\n", > > + pfn, action_page_types[MF_MSG_PFN], > > + action_name[rc ? MF_FAILED : MF_RECOVERED]); > > Could you call action_result() to print out the summary line. > It has some other common things like accounting and tracepoint. > Ack. > > + > > + return rc; > > +} > > + > > static DEFINE_MUTEX(mf_mutex); > > > > /** > > @@ -2093,6 +2197,11 @@ int memory_failure(unsigned long pfn, int flags) > > if (!(flags & MF_SW_SIMULATED)) > > hw_memory_failure = true; > > > > + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { > > + res = memory_failure_pfn(pfn, flags); > > + goto unlock_mutex; > > + } > > This might break exisiting corner case about DAX device, so maybe this should > be checked after confirming that pfn_to_online_page returns NULL. > Sorry, it isn't clear to me which corner case could break here. Can the pfn_valid() be false on a DAX device page? > > + > > p = pfn_to_online_page(pfn); > > if (!p) { > > res = arch_memory_failure(pfn, flags); @@ -2106,6 > > +2215,9 @@ int memory_failure(unsigned long pfn, int flags) > > pgmap); > > goto unlock_mutex; > > } > > + > > + res = memory_failure_pfn(pfn, flags); > > + goto unlock_mutex; > > This path is chosen when pfn_valid returns true, which cannot happen for > GPU memory's case? > Good catch. That needs to be removed. > Thanks, > Naoya Horiguchi > > > } > > pr_err("%#lx: memory outside kernel control\n", pfn); > > res = -ENXIO; > > -- > > 2.17.1 On a separate note, would you rather prefer that I separate out the poison handling parts (i.e. 3/6 and 5/6) into a separate series? Or should I just keep the whole series together? Thanks Ankit Agrawal
On Mon, May 15, 2023 at 11:18:16AM +0000, Ankit Agrawal wrote: > Thanks, Naoya for reviewing the patch. Comments inline. > > > -----Original Message----- > > From: HORIGUCHI NAOYA(堀口 直也) <naoya.horiguchi@nec.com> > > Sent: Tuesday, May 9, 2023 2:51 AM > > To: Ankit Agrawal <ankita@nvidia.com> > > Cc: Jason Gunthorpe <jgg@nvidia.com>; alex.williamson@redhat.com; > > maz@kernel.org; oliver.upton@linux.dev; Aniket Agashe > > <aniketa@nvidia.com>; Neo Jia <cjia@nvidia.com>; Kirti Wankhede > > <kwankhede@nvidia.com>; Tarun Gupta (SW-GPU) <targupta@nvidia.com>; > > Vikram Sethi <vsethi@nvidia.com>; Andy Currid <acurrid@nvidia.com>; > > Alistair Popple <apopple@nvidia.com>; John Hubbard > > <jhubbard@nvidia.com>; Dan Williams <danw@nvidia.com>; > > kvm@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm- > > kernel@lists.infradead.org; linux-mm@kvack.org > > Subject: Re: [PATCH v3 3/6] mm: handle poisoning of pfn without struct pages > > > > External email: Use caution opening links or attachments > > > > > > On Wed, Apr 05, 2023 at 11:01:31AM -0700, ankita@nvidia.com wrote: > > > From: Ankit Agrawal <ankita@nvidia.com> > > > > > > The kernel MM does not currently handle ECC errors / poison on a > > > memory region that is not backed by struct pages. In this series, > > > mapping request from QEMU to the device memory is executed using > > remap_pfn_range(). > > > Hence added a new mechanism to handle memory failure on such memory. > > > > > > Make kernel MM expose a function to allow modules managing the device > > > memory to register a failure function and the address space that is > > > associated with the device memory. MM maintains this information as > > > interval tree. The registered memory failure function is used by MM to > > > notify the module of the PFN, so that the module may take any required > > > action. The module for example may use the information to track the > > > poisoned pages. > > > > > > In this implementation, kernel MM follows the following sequence > > > (mostly) similar to the memory_failure() handler for struct page backed > > memory: > > > 1. memory_failure() is triggered on reception of a poison error. An > > > absence of struct page is detected and consequently memory_failure_pfn > > > is executed. > > > 2. memory_failure_pfn() call the newly introduced failure handler > > > exposed by the module managing the poisoned memory to notify it of the > > > problematic PFN. > > > 3. memory_failure_pfn() unmaps the stage-2 mapping to the PFN. > > > 4. memory_failure_pfn() collects the processes mapped to the PFN. > > > 5. memory_failure_pfn() sends SIGBUS (BUS_MCEERR_AO) to all the > > > processes mapping the faulty PFN using kill_procs(). > > > 6. An access to the faulty PFN by an operation in VM at a later point > > > of time is trapped and user_mem_abort() is called. > > > 7. user_mem_abort() calls __gfn_to_pfn_memslot() on the PFN, and the > > > following execution path is followed: __gfn_to_pfn_memslot() -> > > > hva_to_pfn() -> hva_to_pfn_remapped() -> fixup_user_fault() -> > > > handle_mm_fault() -> handle_pte_fault() -> do_fault(). do_fault() is > > > expected to return VM_FAULT_HWPOISON on the PFN (it currently does not > > > and is fixed as part of another patch in the series). > > > 8. __gfn_to_pfn_memslot() then returns KVM_PFN_ERR_HWPOISON, > > which > > > cause the poison with SIGBUS (BUS_MCEERR_AR) to be sent to the QEMU > > > process through kvm_send_hwpoison_signal(). > > > > > > Signed-off-by: Ankit Agrawal <ankita@nvidia.com> > > > --- > > > include/linux/memory-failure.h | 22 +++++ > > > include/linux/mm.h | 1 + > > > include/ras/ras_event.h | 1 + > > > mm/memory-failure.c | 148 +++++++++++++++++++++++++++++---- > > > 4 files changed, 154 insertions(+), 18 deletions(-) create mode > > > 100644 include/linux/memory-failure.h > > > ... > > > @@ -2052,6 +2063,99 @@ static int > > memory_failure_dev_pagemap(unsigned long pfn, int flags, > > > return rc; > > > } > > > > > > +/** > > > + * register_pfn_address_space - Register PA region for poison notification. > > > + * @pfn_space: structure containing region range and callback function on > > > + * poison detection. > > > + * > > > + * This function is called by a kernel module to register a PA region > > > +and > > > + * a callback function with the kernel. On detection of poison, the > > > + * kernel code will go through all registered regions and call the > > > + * appropriate callback function associated with the range. The > > > +kernel > > > + * module is responsible for tracking the poisoned pages. > > > + * > > > + * Return: 0 if successfully registered, > > > + * -EBUSY if the region is already registered > > > + */ > > > +int register_pfn_address_space(struct pfn_address_space *pfn_space) { > > > + if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, > > > + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, > > "")) > > > + return -EBUSY; > > > + > > > + mutex_lock(&pfn_space_lock); > > > + interval_tree_insert(&pfn_space->node, &pfn_space_itree); > > > + mutex_unlock(&pfn_space_lock); > > > + > > > + return 0; > > > +} > > > +EXPORT_SYMBOL_GPL(register_pfn_address_space); > > > > register_pfn_address_space and unregister_pfn_address_space are not > > compiled if CONFIG_MEMORY_FAILURE is not set, so maybe your new driver > > might need to depend on this config. > > > > Yeah, that's right. Shall I put parts of code in the vfio-pci variant driver > related to poison handling as part of #ifdef CONFIG_MEMORY_FAILURE. > Or would you rather prefer I make NVGPU_VFIO_PCI config dependent > on CONFIG_MEMORY_FAILURE? If the vfio-pci variant driver plans to provide features unrelated to poison handling, "ifdef" option looks fine to me. Otherwise, the second option could be possible. Both options look to me comparably reasonable. > > > > + > > > +/** > > > + * unregister_pfn_address_space - Unregister a PA region from poison > > > + * notification. > > > + * @pfn_space: structure containing region range to be unregistered. > > > + * > > > + * This function is called by a kernel module to unregister the PA > > > +region > > > + * from the kernel from poison tracking. > > > + */ > > > +void unregister_pfn_address_space(struct pfn_address_space > > > +*pfn_space) { > > > + mutex_lock(&pfn_space_lock); > > > + interval_tree_remove(&pfn_space->node, &pfn_space_itree); > > > + mutex_unlock(&pfn_space_lock); > > > + release_mem_region(pfn_space->node.start << PAGE_SHIFT, > > > + (pfn_space->node.last - pfn_space->node.start + 1) << > > > +PAGE_SHIFT); } EXPORT_SYMBOL_GPL(unregister_pfn_address_space); > > > + > > > +static int memory_failure_pfn(unsigned long pfn, int flags) { > > > + struct interval_tree_node *node; > > > + int rc = -EBUSY; > > > + LIST_HEAD(tokill); > > > + > > > + mutex_lock(&pfn_space_lock); > > > + /* > > > + * Modules registers with MM the address space mapping to the device > > memory they > > > + * manage. Iterate to identify exactly which address space has mapped to > > this > > > + * failing PFN. > > > + */ > > > + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; > > > + node = interval_tree_iter_next(node, pfn, pfn)) { > > > + struct pfn_address_space *pfn_space = > > > + container_of(node, struct pfn_address_space, node); > > > + rc = 0; > > > + > > > + /* > > > + * Modules managing the device memory needs to be conveyed > > about the > > > + * memory failure so that the poisoned PFN can be tracked. > > > + */ > > > + pfn_space->ops->failure(pfn_space, pfn); > > > + > > > + collect_procs_pgoff(NULL, pfn_space->mapping, pfn, > > > + &tokill); > > > + > > > + unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, > > > + PAGE_SIZE, 0); > > > + } > > > + mutex_unlock(&pfn_space_lock); > > > > If rc == 0 at this point, the given pfn seems to be outside the GPU memory, so > > that should be considered as "Invalid address" case whose check is removed > > by patch 5/6. So it might be better to sperate the case from "do handling for > > non struct page pfn" case. > > > > Sorry did you mean rc !=0 here? But yeah, you are right that I should add the > case for check in case a region with the desired PFN isn't found above. > > > > + > > > + /* > > > + * Unlike System-RAM there is no possibility to swap in a different > > > + * physical page at a given virtual address, so all userspace > > > + * consumption of direct PFN memory necessitates SIGBUS (i.e. > > > + * MF_MUST_KILL) > > > + */ > > > + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; > > > + kill_procs(&tokill, true, false, pfn, flags); > > > + > > > + pr_err("%#lx: recovery action for %s: %s\n", > > > + pfn, action_page_types[MF_MSG_PFN], > > > + action_name[rc ? MF_FAILED : MF_RECOVERED]); > > > > Could you call action_result() to print out the summary line. > > It has some other common things like accounting and tracepoint. > > > > Ack. > > > > + > > > + return rc; > > > +} > > > + > > > static DEFINE_MUTEX(mf_mutex); > > > > > > /** > > > @@ -2093,6 +2197,11 @@ int memory_failure(unsigned long pfn, int flags) > > > if (!(flags & MF_SW_SIMULATED)) > > > hw_memory_failure = true; > > > > > > + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { > > > + res = memory_failure_pfn(pfn, flags); > > > + goto unlock_mutex; > > > + } > > > > This might break exisiting corner case about DAX device, so maybe this should > > be checked after confirming that pfn_to_online_page returns NULL. > > > > Sorry, it isn't clear to me which corner case could break here. Can the pfn_valid() > be false on a DAX device page? I thought that possibility, but I commented with relying on my vague/wrong memory, sorry. pfn_valid() should always true for DAX device pages, so the above check should not break. > > > > + > > > p = pfn_to_online_page(pfn); > > > if (!p) { > > > res = arch_memory_failure(pfn, flags); @@ -2106,6 > > > +2215,9 @@ int memory_failure(unsigned long pfn, int flags) > > > pgmap); > > > goto unlock_mutex; > > > } > > > + > > > + res = memory_failure_pfn(pfn, flags); > > > + goto unlock_mutex; > > > > This path is chosen when pfn_valid returns true, which cannot happen for > > GPU memory's case? > > > > Good catch. That needs to be removed. > > > Thanks, > > Naoya Horiguchi > > > > > } > > > pr_err("%#lx: memory outside kernel control\n", pfn); > > > res = -ENXIO; > > > -- > > > 2.17.1 > > On a separate note, would you rather prefer that I separate out the poison > handling parts (i.e. 3/6 and 5/6) into a separate series? Or should I just keep the > whole series together? The new poison handling code is used after the vfio-pci driver is available, so I think that they may as well be merged together. Thanks, Naoya Horiguchi
diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..9a579960972a --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include <linux/interval_tree.h> + +struct pfn_address_space; + +struct pfn_address_space_ops { + void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn); +}; + +struct pfn_address_space { + struct interval_tree_node node; + const struct pfn_address_space_ops *ops; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f79667824eb..e3ef52d3d45a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3530,6 +3530,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_PFN, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index cbd3ddd7c33d..5c62a4d17172 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -373,6 +373,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_BUDDY, "free buddy page" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ + EM ( MF_MSG_PFN, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fae9baf3be16..2c1a2ec42f7b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/memory-failure.h> #include <linux/page-flags.h> #include <linux/kernel-page-flags.h> #include <linux/sched/signal.h> @@ -62,6 +63,7 @@ #include <linux/page-isolation.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> +#include <linux/pfn_t.h> #include "swap.h" #include "internal.h" #include "ras/ras_event.h" @@ -122,6 +124,10 @@ const struct attribute_group memory_failure_attr_group = { .attrs = memory_failure_attr, }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -399,15 +405,14 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. * - * Note: @fsdax_pgoff is used only when @p is a fsdax page and a - * filesystem with a memory failure handler has claimed the - * memory_failure event. In all other cases, page->index and - * page->mapping are sufficient for mapping the page back to its + * Notice: @pgoff is used either when @p is a fsdax page or a PFN is not + * backed by struct page and a filesystem with a memory failure handler + * has claimed the memory_failure event. In all other cases, page->index + * and page->mapping are sufficient for mapping the page back to its * corresponding user virtual address. */ -static void add_to_kill(struct task_struct *tsk, struct page *p, - pgoff_t fsdax_pgoff, struct vm_area_struct *vma, - struct list_head *to_kill) +static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff, + struct vm_area_struct *vma, struct list_head *to_kill) { struct to_kill *tk; @@ -417,13 +422,20 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, return; } - tk->addr = page_address_in_vma(p, vma); - if (is_zone_device_page(p)) { - if (fsdax_pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + if (vma->vm_flags | PFN_MAP) { + tk->addr = + vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + tk->size_shift = PAGE_SHIFT; + } else if (is_zone_device_page(p)) { + if (pgoff != FSDAX_INVALID_PGOFF) + tk->addr = vma_pgoff_address(pgoff, 1, vma); + else + tk->addr = page_address_in_vma(p, vma); tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else + } else { + tk->addr = page_address_in_vma(p, vma); tk->size_shift = page_shift(compound_head(p)); + } /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -617,13 +629,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, i_mmap_unlock_read(mapping); } -#ifdef CONFIG_FS_DAX /* * Collect processes when the error hit a fsdax page. */ -static void collect_procs_fsdax(struct page *page, - struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill) +static void collect_procs_pgoff(struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -643,7 +654,6 @@ static void collect_procs_fsdax(struct page *page, read_unlock(&tasklist_lock); i_mmap_unlock_read(mapping); } -#endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. @@ -835,6 +845,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", + [MF_MSG_PFN] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1745,7 +1756,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, SetPageHWPoison(page); - collect_procs_fsdax(page, mapping, index, &to_kill); + collect_procs_pgoff(page, mapping, index, &to_kill); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: @@ -2052,6 +2063,99 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, return rc; } +/** + * register_pfn_address_space - Register PA region for poison notification. + * @pfn_space: structure containing region range and callback function on + * poison detection. + * + * This function is called by a kernel module to register a PA region and + * a callback function with the kernel. On detection of poison, the + * kernel code will go through all registered regions and call the + * appropriate callback function associated with the range. The kernel + * module is responsible for tracking the poisoned pages. + * + * Return: 0 if successfully registered, + * -EBUSY if the region is already registered + */ +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, "")) + return -EBUSY; + + mutex_lock(&pfn_space_lock); + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +/** + * unregister_pfn_address_space - Unregister a PA region from poison + * notification. + * @pfn_space: structure containing region range to be unregistered. + * + * This function is called by a kernel module to unregister the PA region + * from the kernel from poison tracking. + */ +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + mutex_lock(&pfn_space_lock); + interval_tree_remove(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + release_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + int rc = -EBUSY; + LIST_HEAD(tokill); + + mutex_lock(&pfn_space_lock); + /* + * Modules registers with MM the address space mapping to the device memory they + * manage. Iterate to identify exactly which address space has mapped to this + * failing PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + rc = 0; + + /* + * Modules managing the device memory needs to be conveyed about the + * memory failure so that the poisoned PFN can be tracked. + */ + pfn_space->ops->failure(pfn_space, pfn); + + collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill); + + unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, + PAGE_SIZE, 0); + } + mutex_unlock(&pfn_space_lock); + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + kill_procs(&tokill, true, false, pfn, flags); + + pr_err("%#lx: recovery action for %s: %s\n", + pfn, action_page_types[MF_MSG_PFN], + action_name[rc ? MF_FAILED : MF_RECOVERED]); + + return rc; +} + static DEFINE_MUTEX(mf_mutex); /** @@ -2093,6 +2197,11 @@ int memory_failure(unsigned long pfn, int flags) if (!(flags & MF_SW_SIMULATED)) hw_memory_failure = true; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); @@ -2106,6 +2215,9 @@ int memory_failure(unsigned long pfn, int flags) pgmap); goto unlock_mutex; } + + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; } pr_err("%#lx: memory outside kernel control\n", pfn); res = -ENXIO;