diff mbox series

[RFC,2/4] pagemap: introduce ->memory_failure()

Message ID 20200915101311.144269-3-ruansy.fnst@cn.fujitsu.com (mailing list archive)
State New
Headers show
Series fsdax: introduce fs query to support reflink | expand

Commit Message

Ruan Shiyang Sept. 15, 2020, 10:13 a.m. UTC
When memory-failure occurs, we call this function which is implemented
by each devices.  For fsdax, pmem device implements it.  Pmem device
will find out the block device where the error page located in, gets the
filesystem on this block device, and finally call ->storage_lost() to
handle the error in filesystem layer.

Normally, a pmem device may contain one or more partitions, each
partition contains a block device, each block device contains a
filesystem.  So we are able to find out the filesystem by one offset on
this pmem device.  However, in other cases, such as mapped device, I
didn't find a way to obtain the filesystem laying on it.  It is a
problem need to be fixed.

Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
---
 block/genhd.c            | 12 ++++++++++++
 drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
 include/linux/genhd.h    |  2 ++
 include/linux/memremap.h |  3 +++
 4 files changed, 48 insertions(+)

Comments

Darrick J. Wong Sept. 15, 2020, 4:31 p.m. UTC | #1
On Tue, Sep 15, 2020 at 06:13:09PM +0800, Shiyang Ruan wrote:
> When memory-failure occurs, we call this function which is implemented
> by each devices.  For fsdax, pmem device implements it.  Pmem device
> will find out the block device where the error page located in, gets the
> filesystem on this block device, and finally call ->storage_lost() to
> handle the error in filesystem layer.
> 
> Normally, a pmem device may contain one or more partitions, each
> partition contains a block device, each block device contains a
> filesystem.  So we are able to find out the filesystem by one offset on
> this pmem device.  However, in other cases, such as mapped device, I
> didn't find a way to obtain the filesystem laying on it.  It is a
> problem need to be fixed.
> 
> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
> ---
>  block/genhd.c            | 12 ++++++++++++
>  drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
>  include/linux/genhd.h    |  2 ++
>  include/linux/memremap.h |  3 +++
>  4 files changed, 48 insertions(+)
> 
> diff --git a/block/genhd.c b/block/genhd.c
> index 99c64641c314..e7442b60683e 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1063,6 +1063,18 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
>  }
>  EXPORT_SYMBOL(bdget_disk);
>  
> +struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
> +{
> +	struct block_device *bdev = NULL;
> +	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
> +
> +	if (part)
> +		bdev = bdget(part_devt(part));
> +
> +	return bdev;
> +}
> +EXPORT_SYMBOL(bdget_disk_sector);
> +
>  /*
>   * print a full list of all partitions - intended for places where the root
>   * filesystem can't be mounted and thus to give the victim some idea of what
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index fab29b514372..3ed96486c883 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -364,9 +364,40 @@ static void pmem_release_disk(void *__pmem)
>  	put_disk(pmem->disk);
>  }
>  
> +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
> +		struct mf_recover_controller *mfrc)
> +{
> +	struct pmem_device *pdev;
> +	struct block_device *bdev;
> +	sector_t disk_sector;
> +	loff_t bdev_offset;
> +
> +	pdev = container_of(pgmap, struct pmem_device, pgmap);
> +	if (!pdev->disk)
> +		return -ENXIO;
> +
> +	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;

Ah, I see, looking at the current x86 MCE code, the MCE handler gets a
physical address, which is then rounded down to a PFN, which is then
blown back up into a byte address(?) and then rounded down to sectors.
That is then blown back up into a byte address and passed on to XFS,
which rounds it down to fs blocksize.

/me wishes that wasn't so convoluted, but reforming the whole mm poison
system to have smaller blast radii isn't the purpose of this patch. :)

> +	bdev = bdget_disk_sector(pdev->disk, disk_sector);
> +	if (!bdev)
> +		return -ENXIO;
> +
> +	// TODO what if block device contains a mapped device

Find its dev_pagemap_ops and invoke its memory_failure function? ;)

> +	if (!bdev->bd_super)
> +		goto out;
> +
> +	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
> +			pdev->data_offset;
> +	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);

->storage_lost is required for all filesystems?

--D

> +
> +out:
> +	bdput(bdev);
> +	return 0;
> +}
> +
>  static const struct dev_pagemap_ops fsdax_pagemap_ops = {
>  	.kill			= pmem_pagemap_kill,
>  	.cleanup		= pmem_pagemap_cleanup,
> +	.memory_failure		= pmem_pagemap_memory_failure,
>  };
>  
>  static int pmem_attach_disk(struct device *dev,
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 4ab853461dff..16e9e13e0841 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -303,6 +303,8 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
>  extern void del_gendisk(struct gendisk *gp);
>  extern struct gendisk *get_gendisk(dev_t dev, int *partno);
>  extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
> +extern struct block_device *bdget_disk_sector(struct gendisk *disk,
> +			sector_t sector);
>  
>  extern void set_device_ro(struct block_device *bdev, int flag);
>  extern void set_disk_ro(struct gendisk *disk, int flag);
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 5f5b2df06e61..efebefa70d00 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -6,6 +6,7 @@
>  
>  struct resource;
>  struct device;
> +struct mf_recover_controller;
>  
>  /**
>   * struct vmem_altmap - pre-allocated storage for vmemmap_populate
> @@ -87,6 +88,8 @@ struct dev_pagemap_ops {
>  	 * the page back to a CPU accessible page.
>  	 */
>  	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
> +	int (*memory_failure)(struct dev_pagemap *pgmap,
> +			      struct mf_recover_controller *mfrc);
>  };
>  
>  #define PGMAP_ALTMAP_VALID	(1 << 0)
> -- 
> 2.28.0
> 
> 
>
Ruan Shiyang Sept. 16, 2020, 2:15 a.m. UTC | #2
On 2020/9/16 上午12:31, Darrick J. Wong wrote:
> On Tue, Sep 15, 2020 at 06:13:09PM +0800, Shiyang Ruan wrote:
>> When memory-failure occurs, we call this function which is implemented
>> by each devices.  For fsdax, pmem device implements it.  Pmem device
>> will find out the block device where the error page located in, gets the
>> filesystem on this block device, and finally call ->storage_lost() to
>> handle the error in filesystem layer.
>>
>> Normally, a pmem device may contain one or more partitions, each
>> partition contains a block device, each block device contains a
>> filesystem.  So we are able to find out the filesystem by one offset on
>> this pmem device.  However, in other cases, such as mapped device, I
>> didn't find a way to obtain the filesystem laying on it.  It is a
>> problem need to be fixed.
>>
>> Signed-off-by: Shiyang Ruan <ruansy.fnst@cn.fujitsu.com>
>> ---
>>   block/genhd.c            | 12 ++++++++++++
>>   drivers/nvdimm/pmem.c    | 31 +++++++++++++++++++++++++++++++
>>   include/linux/genhd.h    |  2 ++
>>   include/linux/memremap.h |  3 +++
>>   4 files changed, 48 insertions(+)
>>
>> diff --git a/block/genhd.c b/block/genhd.c
>> index 99c64641c314..e7442b60683e 100644
>> --- a/block/genhd.c
>> +++ b/block/genhd.c
>> @@ -1063,6 +1063,18 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
>>   }
>>   EXPORT_SYMBOL(bdget_disk);
>>   
>> +struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
>> +{
>> +	struct block_device *bdev = NULL;
>> +	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
>> +
>> +	if (part)
>> +		bdev = bdget(part_devt(part));
>> +
>> +	return bdev;
>> +}
>> +EXPORT_SYMBOL(bdget_disk_sector);
>> +
>>   /*
>>    * print a full list of all partitions - intended for places where the root
>>    * filesystem can't be mounted and thus to give the victim some idea of what
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index fab29b514372..3ed96486c883 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -364,9 +364,40 @@ static void pmem_release_disk(void *__pmem)
>>   	put_disk(pmem->disk);
>>   }
>>   
>> +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
>> +		struct mf_recover_controller *mfrc)
>> +{
>> +	struct pmem_device *pdev;
>> +	struct block_device *bdev;
>> +	sector_t disk_sector;
>> +	loff_t bdev_offset;
>> +
>> +	pdev = container_of(pgmap, struct pmem_device, pgmap);
>> +	if (!pdev->disk)
>> +		return -ENXIO;
>> +
>> +	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;
> 
> Ah, I see, looking at the current x86 MCE code, the MCE handler gets a
> physical address, which is then rounded down to a PFN, which is then
> blown back up into a byte address(?) and then rounded down to sectors.
> That is then blown back up into a byte address and passed on to XFS,
> which rounds it down to fs blocksize.
> 
> /me wishes that wasn't so convoluted, but reforming the whole mm poison
> system to have smaller blast radii isn't the purpose of this patch. :)
> 
>> +	bdev = bdget_disk_sector(pdev->disk, disk_sector);
>> +	if (!bdev)
>> +		return -ENXIO;
>> +
>> +	// TODO what if block device contains a mapped device
> 
> Find its dev_pagemap_ops and invoke its memory_failure function? ;)

Thanks for pointing out.  I'll think about it in this way.
> 
>> +	if (!bdev->bd_super)
>> +		goto out;
>> +
>> +	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
>> +			pdev->data_offset;
>> +	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);
> 
> ->storage_lost is required for all filesystems?

I think it is required for filesystems that support fsdax, since the 
owner tracking is moved here.  But anyway, there should have a non-NULL 
judgment.


--
Thanks,
Ruan Shiyang.
> 
> --D
> 
>> +
>> +out:
>> +	bdput(bdev);
>> +	return 0;
>> +}
>> +
>>   static const struct dev_pagemap_ops fsdax_pagemap_ops = {
>>   	.kill			= pmem_pagemap_kill,
>>   	.cleanup		= pmem_pagemap_cleanup,
>> +	.memory_failure		= pmem_pagemap_memory_failure,
>>   };
>>   
>>   static int pmem_attach_disk(struct device *dev,
>> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
>> index 4ab853461dff..16e9e13e0841 100644
>> --- a/include/linux/genhd.h
>> +++ b/include/linux/genhd.h
>> @@ -303,6 +303,8 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk)
>>   extern void del_gendisk(struct gendisk *gp);
>>   extern struct gendisk *get_gendisk(dev_t dev, int *partno);
>>   extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
>> +extern struct block_device *bdget_disk_sector(struct gendisk *disk,
>> +			sector_t sector);
>>   
>>   extern void set_device_ro(struct block_device *bdev, int flag);
>>   extern void set_disk_ro(struct gendisk *disk, int flag);
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index 5f5b2df06e61..efebefa70d00 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -6,6 +6,7 @@
>>   
>>   struct resource;
>>   struct device;
>> +struct mf_recover_controller;
>>   
>>   /**
>>    * struct vmem_altmap - pre-allocated storage for vmemmap_populate
>> @@ -87,6 +88,8 @@ struct dev_pagemap_ops {
>>   	 * the page back to a CPU accessible page.
>>   	 */
>>   	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
>> +	int (*memory_failure)(struct dev_pagemap *pgmap,
>> +			      struct mf_recover_controller *mfrc);
>>   };
>>   
>>   #define PGMAP_ALTMAP_VALID	(1 << 0)
>> -- 
>> 2.28.0
>>
>>
>>
> 
>
diff mbox series

Patch

diff --git a/block/genhd.c b/block/genhd.c
index 99c64641c314..e7442b60683e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1063,6 +1063,18 @@  struct block_device *bdget_disk(struct gendisk *disk, int partno)
 }
 EXPORT_SYMBOL(bdget_disk);
 
+struct block_device *bdget_disk_sector(struct gendisk *disk, sector_t sector)
+{
+	struct block_device *bdev = NULL;
+	struct hd_struct *part = disk_map_sector_rcu(disk, sector);
+
+	if (part)
+		bdev = bdget(part_devt(part));
+
+	return bdev;
+}
+EXPORT_SYMBOL(bdget_disk_sector);
+
 /*
  * print a full list of all partitions - intended for places where the root
  * filesystem can't be mounted and thus to give the victim some idea of what
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fab29b514372..3ed96486c883 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -364,9 +364,40 @@  static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
 
+static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap,
+		struct mf_recover_controller *mfrc)
+{
+	struct pmem_device *pdev;
+	struct block_device *bdev;
+	sector_t disk_sector;
+	loff_t bdev_offset;
+
+	pdev = container_of(pgmap, struct pmem_device, pgmap);
+	if (!pdev->disk)
+		return -ENXIO;
+
+	disk_sector = (PFN_PHYS(mfrc->pfn) - pdev->phys_addr) >> SECTOR_SHIFT;
+	bdev = bdget_disk_sector(pdev->disk, disk_sector);
+	if (!bdev)
+		return -ENXIO;
+
+	// TODO what if block device contains a mapped device
+	if (!bdev->bd_super)
+		goto out;
+
+	bdev_offset = ((disk_sector - get_start_sect(bdev)) << SECTOR_SHIFT) -
+			pdev->data_offset;
+	bdev->bd_super->s_op->storage_lost(bdev->bd_super, bdev_offset, mfrc);
+
+out:
+	bdput(bdev);
+	return 0;
+}
+
 static const struct dev_pagemap_ops fsdax_pagemap_ops = {
 	.kill			= pmem_pagemap_kill,
 	.cleanup		= pmem_pagemap_cleanup,
+	.memory_failure		= pmem_pagemap_memory_failure,
 };
 
 static int pmem_attach_disk(struct device *dev,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4ab853461dff..16e9e13e0841 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -303,6 +303,8 @@  static inline void add_disk_no_queue_reg(struct gendisk *disk)
 extern void del_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
+extern struct block_device *bdget_disk_sector(struct gendisk *disk,
+			sector_t sector);
 
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 5f5b2df06e61..efebefa70d00 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -6,6 +6,7 @@ 
 
 struct resource;
 struct device;
+struct mf_recover_controller;
 
 /**
  * struct vmem_altmap - pre-allocated storage for vmemmap_populate
@@ -87,6 +88,8 @@  struct dev_pagemap_ops {
 	 * the page back to a CPU accessible page.
 	 */
 	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+	int (*memory_failure)(struct dev_pagemap *pgmap,
+			      struct mf_recover_controller *mfrc);
 };
 
 #define PGMAP_ALTMAP_VALID	(1 << 0)