diff mbox series

[v1,4/4] vfio/nvgpu: register device memory for poison handling

Message ID 20230920140210.12663-5-ankita@nvidia.com (mailing list archive)
State New
Headers show
Series mm: Implement ECC handling for pfn with no struct page | expand

Commit Message

Ankit Agrawal Sept. 20, 2023, 2:02 p.m. UTC
From: Ankit Agrawal <ankita@nvidia.com>

The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel.
The device memory pages are not backed by struct page. Patches 1-3
implement the mechanism to handle ECC/poison on memory pages without
struct page and expose a registration function. This new mechanism is
leveraged here.
 
The module registers its memory region with the kernel MM for ECC handling
using the register_pfn_address_space() registration API exposed by the
kernel. It also defines a failure callback function pfn_memory_failure()
to get the poisoned PFN from the MM.
 
The module tracks poisoned PFNs in a bitmap with one bit per PFN. Each
poisoned PFN is communicated by the kernel MM to the module through the
failure function, which sets the corresponding bit in the bitmap.
 
The module also defines VMA fault ops. The fault handler returns
VM_FAULT_HWPOISON if the bit for the faulting PFN is set in the bitmap.

[1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@nvidia.com/

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++-
 drivers/vfio/vfio.h                 |  11 ---
 drivers/vfio/vfio_main.c            |   3 +-
 include/linux/vfio.h                |  15 ++++
 4 files changed, 123 insertions(+), 13 deletions(-)

Comments

Yujie Liu Sept. 26, 2023, 5:36 a.m. UTC | #1
Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on awilliam-vfio/for-linus]
[also build test WARNING on kvm/queue rafael-pm/linux-next linus/master]
[cannot apply to akpm-mm/mm-everything awilliam-vfio/next kvm/linux-next next-20230925]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/ankita-nvidia-com/mm-handle-poisoning-of-pfn-without-struct-pages/20230920-220626
base:   https://github.com/awilliam/linux-vfio.git for-linus
patch link:    https://lore.kernel.org/r/20230920140210.12663-5-ankita%40nvidia.com
patch subject: [PATCH v1 4/4] vfio/nvgpu: register device memory for poison handling
config: powerpc64-allmodconfig (https://download.01.org/0day-ci/archive/20230925/202309252319.hQ7rHJTJ-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230925/202309252319.hQ7rHJTJ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <yujie.liu@intel.com>
| Closes: https://lore.kernel.org/r/202309252319.hQ7rHJTJ-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/vfio/pci/nvgrace-gpu/main.c:27:6: warning: no previous prototype for function 'nvgrace_gpu_vfio_pci_pfn_memory_failure' [-Wmissing-prototypes]
      27 | void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
         |      ^
   drivers/vfio/pci/nvgrace-gpu/main.c:27:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
      27 | void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
         | ^
         | static 
   drivers/vfio/pci/nvgrace-gpu/main.c:300:9: warning: no previous prototype for function 'nvgrace_gpu_read_mem' [-Wmissing-prototypes]
     300 | ssize_t nvgrace_gpu_read_mem(void __user *buf, size_t count, loff_t *ppos,
         |         ^
   drivers/vfio/pci/nvgrace-gpu/main.c:300:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
     300 | ssize_t nvgrace_gpu_read_mem(void __user *buf, size_t count, loff_t *ppos,
         | ^
         | static 
   drivers/vfio/pci/nvgrace-gpu/main.c:376:9: warning: no previous prototype for function 'nvgrace_gpu_write_mem' [-Wmissing-prototypes]
     376 | ssize_t nvgrace_gpu_write_mem(size_t count, loff_t *ppos, const void __user *buf,
         |         ^
   drivers/vfio/pci/nvgrace-gpu/main.c:376:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
     376 | ssize_t nvgrace_gpu_write_mem(size_t count, loff_t *ppos, const void __user *buf,
         | ^
         | static 
   3 warnings generated.


vim +/nvgrace_gpu_vfio_pci_pfn_memory_failure +27 drivers/vfio/pci/nvgrace-gpu/main.c

b59e9d949a79e1 Ankit Agrawal 2023-09-14  25  
5f3746d8629350 Ankit Agrawal 2023-09-20  26  #ifdef CONFIG_MEMORY_FAILURE
5f3746d8629350 Ankit Agrawal 2023-09-20 @27  void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
5f3746d8629350 Ankit Agrawal 2023-09-20  28  		unsigned long pfn)
5f3746d8629350 Ankit Agrawal 2023-09-20  29  {
5f3746d8629350 Ankit Agrawal 2023-09-20  30  	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
5f3746d8629350 Ankit Agrawal 2023-09-20  31  		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
5f3746d8629350 Ankit Agrawal 2023-09-20  32  	unsigned long mem_offset = pfn - pfn_space->node.start;
5f3746d8629350 Ankit Agrawal 2023-09-20  33  
5f3746d8629350 Ankit Agrawal 2023-09-20  34  	if (mem_offset >= nvdev->memlength)
5f3746d8629350 Ankit Agrawal 2023-09-20  35  		return;
5f3746d8629350 Ankit Agrawal 2023-09-20  36  
5f3746d8629350 Ankit Agrawal 2023-09-20  37  	/*
5f3746d8629350 Ankit Agrawal 2023-09-20  38  	 * MM has called to notify a poisoned page. Track that in the bitmap.
5f3746d8629350 Ankit Agrawal 2023-09-20  39  	 */
5f3746d8629350 Ankit Agrawal 2023-09-20  40  	__set_bit(mem_offset, nvdev->pfn_bitmap);
5f3746d8629350 Ankit Agrawal 2023-09-20  41  }
5f3746d8629350 Ankit Agrawal 2023-09-20  42
Naoya Horiguchi Sept. 26, 2023, 7:38 a.m. UTC | #2
On Wed, Sep 20, 2023 at 07:32:10PM +0530, ankita@nvidia.com wrote:
> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
> (Qemu) using remap_pfn_range() without adding the memory to the kernel.
> The device memory pages are not backed by struct page. Patches 1-3
> implements the mechanism to handle ECC/poison on memory page without
> struct page and expose a registration function. This new mechanism is
> leveraged here.
>  
> The module registers its memory region with the kernel MM for ECC handling
> using the register_pfn_address_space() registration API exposed by the
> kernel. It also defines a failure callback function pfn_memory_failure()
> to get the poisoned PFN from the MM.
>  
> The module track poisoned PFN as a bitmap with a bit per PFN. The PFN is
> communicated by the kernel MM to the module through the failure function,
> which sets the appropriate bit in the bitmap.
>  
> The module also defines a VMA fault ops for the module. It returns
> VM_FAULT_HWPOISON in case the bit for the PFN is set in the bitmap.
> 
> [1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@nvidia.com/
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---

...

> @@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
>  
>  	nvdev->memlength = memlength;
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	/*
> +	 * A bitmap is maintained to track the pages that are poisoned. Each
> +	 * page is represented by a bit. Allocation size in bytes is
> +	 * determined by shifting the device memory size by PAGE_SHIFT to
> +	 * determine the number of pages; and further shifted by 3 as each
> +	 * byte could track 8 pages.
> +	 */
> +	nvdev->pfn_bitmap
> +		= vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
> +	if (!nvdev->pfn_bitmap)
> +		ret = -ENOMEM;
> +#endif
>  	return ret;
>  }
>  

I assume that memory failure is a relatively rare event (otherwise the device
is simply broken and it's better to stop using it), so the bitmap is mostly
full of zeros.
I think the size of device memory is on the order of 100GB, so the
bitmap size is about 3.2MB, which might not be too large on modern systems,
but wouldn't another data structure with a smaller memory footprint, such as
a hash table, be more beneficial?

Thanks,
Naoya Horiguchi
Alex Williamson Sept. 28, 2023, 7:45 p.m. UTC | #3
On Wed, 20 Sep 2023 19:32:10 +0530
<ankita@nvidia.com> wrote:

> From: Ankit Agrawal <ankita@nvidia.com>
> 
> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
> (Qemu) using remap_pfn_range() without adding the memory to the kernel.
> The device memory pages are not backed by struct page. Patches 1-3
> implements the mechanism to handle ECC/poison on memory page without
> struct page and expose a registration function. This new mechanism is
> leveraged here.
>  
> The module registers its memory region with the kernel MM for ECC handling
> using the register_pfn_address_space() registration API exposed by the
> kernel. It also defines a failure callback function pfn_memory_failure()
> to get the poisoned PFN from the MM.
>  
> The module track poisoned PFN as a bitmap with a bit per PFN. The PFN is
> communicated by the kernel MM to the module through the failure function,
> which sets the appropriate bit in the bitmap.
>  
> The module also defines a VMA fault ops for the module. It returns
> VM_FAULT_HWPOISON in case the bit for the PFN is set in the bitmap.
> 
> [1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@nvidia.com/
> 
> Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++-
>  drivers/vfio/vfio.h                 |  11 ---
>  drivers/vfio/vfio_main.c            |   3 +-
>  include/linux/vfio.h                |  15 ++++
>  4 files changed, 123 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index ba323f2d8ea1..1c89ce0cc1cc 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -6,6 +6,10 @@
>  #include <linux/pci.h>
>  #include <linux/vfio_pci_core.h>
>  #include <linux/vfio.h>
> +#ifdef CONFIG_MEMORY_FAILURE
> +#include <linux/bitmap.h>
> +#include <linux/memory-failure.h>
> +#endif
>  
>  struct nvgrace_gpu_vfio_pci_core_device {
>  	struct vfio_pci_core_device core_device;
> @@ -13,8 +17,85 @@ struct nvgrace_gpu_vfio_pci_core_device {
>  	size_t memlength;
>  	void *memmap;
>  	struct mutex memmap_lock;
> +#ifdef CONFIG_MEMORY_FAILURE
> +	struct pfn_address_space pfn_address_space;
> +	unsigned long *pfn_bitmap;
> +#endif
>  };
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
> +		unsigned long pfn)
> +{
> +	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
> +		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
> +	unsigned long mem_offset = pfn - pfn_space->node.start;
> +
> +	if (mem_offset >= nvdev->memlength)
> +		return;
> +
> +	/*
> +	 * MM has called to notify a poisoned page. Track that in the bitmap.
> +	 */
> +	__set_bit(mem_offset, nvdev->pfn_bitmap);
> +}
> +
> +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
> +	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
> +};
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
> +					struct vm_area_struct *vma)
> +{
> +	unsigned long nr_pages;
> +	int ret = 0;
> +
> +	nr_pages = nvdev->memlength >> PAGE_SHIFT;
> +
> +	nvdev->pfn_address_space.node.start = vma->vm_pgoff;
> +	nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
> +	nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
> +	nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
> +
> +	ret = register_pfn_address_space(&(nvdev->pfn_address_space));
> +
> +	return ret;
> +}
> +
> +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
> +{
> +	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
> +	struct vfio_device *core_vdev;
> +	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
> +
> +	if (!(vmf->vma->vm_file))
> +		goto error_exit;
> +
> +	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
> +
> +	if (!core_vdev)
> +		goto error_exit;
> +
> +	nvdev = container_of(core_vdev,
> +			struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
> +
> +	/*
> +	 * Check if the page is poisoned.
> +	 */
> +	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) &&
> +		test_bit(mem_offset, nvdev->pfn_bitmap))
> +		return VM_FAULT_HWPOISON;
> +
> +error_exit:
> +	return VM_FAULT_ERROR;
> +}
> +
> +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
> +	.fault = nvgrace_gpu_vfio_pci_fault,
> +};
> +#endif
> +
>  static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
>  {
>  	struct vfio_pci_core_device *vdev =
> @@ -46,6 +127,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
>  
>  	mutex_destroy(&nvdev->memmap_lock);
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	unregister_pfn_address_space(&(nvdev->pfn_address_space));
> +#endif
>  	vfio_pci_core_close_device(core_vdev);
>  }
>  
> @@ -104,8 +188,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
>  		return ret;
>  
>  	vma->vm_pgoff = start_pfn;
> +#ifdef CONFIG_MEMORY_FAILURE
> +	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
>  
> -	return 0;
> +	ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
> +#endif
> +	return ret;
>  }
>  
>  static long
> @@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
>  
>  	nvdev->memlength = memlength;
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	/*
> +	 * A bitmap is maintained to track the pages that are poisoned. Each
> +	 * page is represented by a bit. Allocation size in bytes is
> +	 * determined by shifting the device memory size by PAGE_SHIFT to
> +	 * determine the number of pages; and further shifted by 3 as each
> +	 * byte could track 8 pages.
> +	 */
> +	nvdev->pfn_bitmap
> +		= vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
> +	if (!nvdev->pfn_bitmap)
> +		ret = -ENOMEM;
> +#endif
>  	return ret;
>  }
>  
> @@ -442,6 +543,10 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
>  	struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
>  	struct vfio_pci_core_device *vdev = &nvdev->core_device;
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	vfree(nvdev->pfn_bitmap);
> +#endif
> +
>  	vfio_pci_core_unregister_device(vdev);
>  	vfio_put_device(&vdev->vdev);
>  }
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 307e3f29b527..747094503909 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -16,17 +16,6 @@ struct iommufd_ctx;
>  struct iommu_group;
>  struct vfio_container;
>  
> -struct vfio_device_file {
> -	struct vfio_device *device;
> -	struct vfio_group *group;
> -
> -	u8 access_granted;
> -	u32 devid; /* only valid when iommufd is valid */
> -	spinlock_t kvm_ref_lock; /* protect kvm field */
> -	struct kvm *kvm;
> -	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> -};
> -
>  void vfio_device_put_registration(struct vfio_device *device);
>  bool vfio_device_try_get_registration(struct vfio_device *device);
>  int vfio_df_open(struct vfio_device_file *df);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 40732e8ed4c6..a7dafd7c64a6 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1309,7 +1309,7 @@ const struct file_operations vfio_device_fops = {
>  	.mmap		= vfio_device_fops_mmap,
>  };
>  
> -static struct vfio_device *vfio_device_from_file(struct file *file)
> +struct vfio_device *vfio_device_from_file(struct file *file)
>  {
>  	struct vfio_device_file *df = file->private_data;
>  
> @@ -1317,6 +1317,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
>  		return NULL;
>  	return df->device;
>  }
> +EXPORT_SYMBOL_GPL(vfio_device_from_file);
>  
>  /**
>   * vfio_file_is_valid - True if the file is valid vfio file
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 454e9295970c..d88af251e931 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -361,4 +361,19 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
>  		       struct virqfd **pvirqfd, int fd);
>  void vfio_virqfd_disable(struct virqfd **pvirqfd);
>  
> +/*
> + * VFIO device file.
> + */
> +struct vfio_device_file {
> +	struct vfio_device *device;
> +	struct vfio_group *group;
> +	u8 access_granted;
> +	u32 devid; /* only valid when iommufd is valid */
> +	spinlock_t kvm_ref_lock; /* protect kvm field */
> +	struct kvm *kvm;
> +	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> +};

What here necessitates moving this to the more public header?  Thanks,

Alex

> +
> +struct vfio_device *vfio_device_from_file(struct file *file);
> +
>  #endif /* VFIO_H */
diff mbox series

Patch

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index ba323f2d8ea1..1c89ce0cc1cc 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -6,6 +6,10 @@ 
 #include <linux/pci.h>
 #include <linux/vfio_pci_core.h>
 #include <linux/vfio.h>
+#ifdef CONFIG_MEMORY_FAILURE
+#include <linux/bitmap.h>
+#include <linux/memory-failure.h>
+#endif
 
 struct nvgrace_gpu_vfio_pci_core_device {
 	struct vfio_pci_core_device core_device;
@@ -13,8 +17,85 @@  struct nvgrace_gpu_vfio_pci_core_device {
 	size_t memlength;
 	void *memmap;
 	struct mutex memmap_lock;
+#ifdef CONFIG_MEMORY_FAILURE
+	struct pfn_address_space pfn_address_space;
+	unsigned long *pfn_bitmap;
+#endif
 };
 
+#ifdef CONFIG_MEMORY_FAILURE
+void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+		unsigned long pfn)
+{
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
+		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
+	unsigned long mem_offset = pfn - pfn_space->node.start;
+
+	if (mem_offset >= nvdev->memlength)
+		return;
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the bitmap.
+	 */
+	__set_bit(mem_offset, nvdev->pfn_bitmap);
+}
+
+struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
+	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
+};
+
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
+					struct vm_area_struct *vma)
+{
+	unsigned long nr_pages;
+	int ret = 0;
+
+	nr_pages = nvdev->memlength >> PAGE_SHIFT;
+
+	nvdev->pfn_address_space.node.start = vma->vm_pgoff;
+	nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
+	nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	ret = register_pfn_address_space(&(nvdev->pfn_address_space));
+
+	return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct vfio_device *core_vdev;
+	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
+
+	if (!(vmf->vma->vm_file))
+		goto error_exit;
+
+	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
+
+	if (!core_vdev)
+		goto error_exit;
+
+	nvdev = container_of(core_vdev,
+			struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
+
+	/*
+	 * Check if the page is poisoned.
+	 */
+	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) &&
+		test_bit(mem_offset, nvdev->pfn_bitmap))
+		return VM_FAULT_HWPOISON;
+
+error_exit:
+	return VM_FAULT_ERROR;
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+	.fault = nvgrace_gpu_vfio_pci_fault,
+};
+#endif
+
 static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
 {
 	struct vfio_pci_core_device *vdev =
@@ -46,6 +127,9 @@  static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
 
 	mutex_destroy(&nvdev->memmap_lock);
 
+#ifdef CONFIG_MEMORY_FAILURE
+	unregister_pfn_address_space(&(nvdev->pfn_address_space));
+#endif
 	vfio_pci_core_close_device(core_vdev);
 }
 
@@ -104,8 +188,12 @@  static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
 		return ret;
 
 	vma->vm_pgoff = start_pfn;
+#ifdef CONFIG_MEMORY_FAILURE
+	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
 
-	return 0;
+	ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
+#endif
+	return ret;
 }
 
 static long
@@ -406,6 +494,19 @@  nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
 
 	nvdev->memlength = memlength;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	/*
+	 * A bitmap is maintained to track the pages that are poisoned. Each
+	 * page is represented by a bit. Allocation size in bytes is
+	 * determined by shifting the device memory size by PAGE_SHIFT to
+	 * determine the number of pages; and further shifted by 3 as each
+	 * byte could track 8 pages.
+	 */
+	nvdev->pfn_bitmap
+		= vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
+	if (!nvdev->pfn_bitmap)
+		ret = -ENOMEM;
+#endif
 	return ret;
 }
 
@@ -442,6 +543,10 @@  static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
 	struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	vfree(nvdev->pfn_bitmap);
+#endif
+
 	vfio_pci_core_unregister_device(vdev);
 	vfio_put_device(&vdev->vdev);
 }
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 307e3f29b527..747094503909 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -16,17 +16,6 @@  struct iommufd_ctx;
 struct iommu_group;
 struct vfio_container;
 
-struct vfio_device_file {
-	struct vfio_device *device;
-	struct vfio_group *group;
-
-	u8 access_granted;
-	u32 devid; /* only valid when iommufd is valid */
-	spinlock_t kvm_ref_lock; /* protect kvm field */
-	struct kvm *kvm;
-	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
-};
-
 void vfio_device_put_registration(struct vfio_device *device);
 bool vfio_device_try_get_registration(struct vfio_device *device);
 int vfio_df_open(struct vfio_device_file *df);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 40732e8ed4c6..a7dafd7c64a6 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1309,7 +1309,7 @@  const struct file_operations vfio_device_fops = {
 	.mmap		= vfio_device_fops_mmap,
 };
 
-static struct vfio_device *vfio_device_from_file(struct file *file)
+struct vfio_device *vfio_device_from_file(struct file *file)
 {
 	struct vfio_device_file *df = file->private_data;
 
@@ -1317,6 +1317,7 @@  static struct vfio_device *vfio_device_from_file(struct file *file)
 		return NULL;
 	return df->device;
 }
+EXPORT_SYMBOL_GPL(vfio_device_from_file);
 
 /**
  * vfio_file_is_valid - True if the file is valid vfio file
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 454e9295970c..d88af251e931 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -361,4 +361,19 @@  int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
 		       struct virqfd **pvirqfd, int fd);
 void vfio_virqfd_disable(struct virqfd **pvirqfd);
 
+/*
+ * VFIO device file.
+ */
+struct vfio_device_file {
+	struct vfio_device *device;
+	struct vfio_group *group;
+	u8 access_granted;
+	u32 devid; /* only valid when iommufd is valid */
+	spinlock_t kvm_ref_lock; /* protect kvm field */
+	struct kvm *kvm;
+	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
+};
+
+struct vfio_device *vfio_device_from_file(struct file *file);
+
 #endif /* VFIO_H */