
[v5,5/7] pmem: add pmem_recovery_write() dax op

Message ID 20220128213150.1333552-6-jane.chu@oracle.com (mailing list archive)
State New, archived
Series DAX poison recovery

Commit Message

Jane Chu Jan. 28, 2022, 9:31 p.m. UTC
pmem_recovery_write() consists of clearing poison via DSM, clearing the
page HWPoison bit, re-establishing the _PAGE_PRESENT bit, flushing the
cache, performing the write, and finally clearing the bad-block record.

A competing pread thread is held off during the recovery write by the
presence of the bad-block record. Competing recovery_write threads are
serialized by a lock.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++----
 drivers/nvdimm/pmem.h |  1 +
 2 files changed, 77 insertions(+), 6 deletions(-)

Comments

Christoph Hellwig Feb. 2, 2022, 1:43 p.m. UTC | #1
> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>  __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>  		long nr_pages, void **kaddr, pfn_t *pfn)
>  {
> +	bool bad_pmem;
> +	bool do_recovery = false;
>  	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
>  
> -	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
> -					PFN_PHYS(nr_pages))))
> +	bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
> +				PFN_PHYS(nr_pages));
> +	if (bad_pmem && kaddr)
> +		do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
> +	if (bad_pmem && !do_recovery)
>  		return -EIO;

I find the passing of the recovery flag through the address very
cumbersome.  I remember there was some kind of discussion, but this looks
pretty ugly.

Also no need for the bad_pmem variable:

	if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)) &&
	    (!kaddr || !dax_recovery_started(pmem->dax_dev, kaddr)))
		return -EIO;

Also: the !kaddr check could go into dax_recovery_started().  That way,
even if we stick with the overloading, kaddr could also be used just for
the flag if needed.
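
Something along these lines, perhaps (hypothetical sketch only -- the real
dax_recovery_started() is defined earlier in this series, and how the
recovery flag is actually encoded is elided here):

	/*
	 * Hypothetical sketch: fold the NULL check into the helper so that
	 * __pmem_direct_access() can call it unconditionally, whether or
	 * not the caller asked for a kaddr.
	 */
	bool dax_recovery_started(struct dax_device *dax_dev, void **kaddr)
	{
		if (!kaddr)
			return false;
		/* ... check however the recovery flag is carried for this kaddr ... */
		return __dax_recovery_flag_set(dax_dev, kaddr); /* placeholder name */
	}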
Jane Chu Feb. 2, 2022, 10:13 p.m. UTC | #2
On 2/2/2022 5:43 AM, Christoph Hellwig wrote:
>> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>>   __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>>   		long nr_pages, void **kaddr, pfn_t *pfn)
>>   {
>> +	bool bad_pmem;
>> +	bool do_recovery = false;
>>   	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
>>   
>> -	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
>> -					PFN_PHYS(nr_pages))))
>> +	bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
>> +				PFN_PHYS(nr_pages));
>> +	if (bad_pmem && kaddr)
>> +		do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
>> +	if (bad_pmem && !do_recovery)
>>   		return -EIO;
> 
> I find the passing of the recovery flag through the address very
> cumbersome.  I remember there was some kind of discussion, but this looks
> pretty ugly.
> 
> Also no need for the bad_pmem variable:
> 
> 	if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)) &&
> 	    (!kaddr || !dax_recovery_started(pmem->dax_dev, kaddr)))
> 		return -EIO;

Okay.

>
> Also:  the !kaddr check could go into dax_recovery_started.  That way
> even if we stick with the overloading kaddr could also be used just for
> the flag if needed.

The !kaddr check is in dax_recovery_started(), and that reminded me that
the check should be in dax_prep_recovery() too.

Thanks!
-jane
Dan Williams Feb. 4, 2022, 6:21 a.m. UTC | #3
On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <jane.chu@oracle.com> wrote:
>
> pmem_recovery_write() consists of clearing poison via DSM,
> clearing page HWPoison bit, re-state _PAGE_PRESENT bit,
> cacheflush, write, and finally clearing bad-block record.
>
> A competing pread thread is held off during recovery write
> by the presence of bad-block record. A competing recovery_write
> thread is serialized by a lock.
>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++----
>  drivers/nvdimm/pmem.h |  1 +
>  2 files changed, 77 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 638e64681db9..f2d6b34d48de 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem,
>         }
>  }
>
> +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector,
> +                               long cleared_blks)
> +{
> +       badblocks_clear(&pmem->bb, sector, cleared_blks);
> +       if (pmem->bb_state)
> +               sysfs_notify_dirent(pmem->bb_state);
> +}
> +
>  static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
>                 phys_addr_t offset, unsigned int len)
>  {
> @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
>                 dev_dbg(dev, "%#llx clear %ld sector%s\n",
>                                 (unsigned long long) sector, cleared,
>                                 cleared > 1 ? "s" : "");
> -               badblocks_clear(&pmem->bb, sector, cleared);
> -               if (pmem->bb_state)
> -                       sysfs_notify_dirent(pmem->bb_state);
> +               pmem_clear_badblocks(pmem, sector, cleared);
>         }
>
>         arch_invalidate_pmem(pmem->virt_addr + offset, len);
> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>  __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>                 long nr_pages, void **kaddr, pfn_t *pfn)
>  {
> +       bool bad_pmem;
> +       bool do_recovery = false;
>         resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
>
> -       if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
> -                                       PFN_PHYS(nr_pages))))
> +       bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
> +                               PFN_PHYS(nr_pages));
> +       if (bad_pmem && kaddr)
> +               do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
> +       if (bad_pmem && !do_recovery)
>                 return -EIO;
>
>         if (kaddr)
> @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>         return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
>  }
>
> +/*
> + * The recovery write thread started out as a normal pwrite thread and
> + * when the filesystem was told about potential media error in the
> + * range, filesystem turns the normal pwrite to a dax_recovery_write.
> + *
> + * The recovery write consists of clearing poison via DSM, clearing page
> + * HWPoison bit, reenable page-wide read-write permission, flush the
> + * caches and finally write.  A competing pread thread needs to be held
> + * off during the recovery process since data read back might not be valid.
> + * And that's achieved by placing the badblock records clearing after
> + * the completion of the recovery write.
> + *
> + * Any competing recovery write thread needs to be serialized, and this is
> + * done via pmem device level lock .recovery_lock.
> + */
>  static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
>                 void *addr, size_t bytes, struct iov_iter *i)
>  {
> -       return 0;
> +       size_t rc, len, off;
> +       phys_addr_t pmem_off;
> +       struct pmem_device *pmem = dax_get_private(dax_dev);
> +       struct device *dev = pmem->bb.dev;
> +       sector_t sector;
> +       long cleared, cleared_blk;
> +
> +       mutex_lock(&pmem->recovery_lock);
> +
> +       /* If no poison found in the range, go ahead with write */
> +       off = (unsigned long)addr & ~PAGE_MASK;
> +       len = PFN_PHYS(PFN_UP(off + bytes));
> +       if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> +               rc = _copy_from_iter_flushcache(addr, bytes, i);
> +               goto write_done;
> +       }

is_bad_pmem() takes a seqlock so it should be safe to move the
recovery_lock below this point.
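
Roughly (a sketch only, assuming the rest of pmem_recovery_write() stays as
posted; one may also want to re-check is_bad_pmem() under the lock in case
a concurrent recovery already cleared the range):

	off = (unsigned long)addr & ~PAGE_MASK;
	len = PFN_PHYS(PFN_UP(off + bytes));

	/* lock-free fast path: no poison in the range, just write */
	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len))
		return _copy_from_iter_flushcache(addr, bytes, i);

	mutex_lock(&pmem->recovery_lock);
	/* ... the rest of the recovery path, unchanged ... */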

> +
> +       /* Not page-aligned range cannot be recovered */
> +       if (off || !(PAGE_ALIGNED(bytes))) {
> +               dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
> +                       addr, bytes);

fs/dax.c will prevent this from happening, right? It seems like an
upper layer bug if we get this far into the recovery process without
checking if a full page is being overwritten.

> +               rc = (size_t) -EIO;
> +               goto write_done;
> +       }
> +
> +       pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +       sector = (pmem_off - pmem->data_offset) / 512;
> +       cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len);
> +       cleared_blk = cleared / 512;
> +       if (cleared_blk > 0) {
> +               hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared);
> +       } else {
> +               dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n",
> +                       cleared_blk);
> +               rc = (size_t) -EIO;
> +               goto write_done;
> +       }
> +       arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes);
> +       rc = _copy_from_iter_flushcache(addr, bytes, i);
> +       pmem_clear_badblocks(pmem, sector, cleared_blk);

This duplicates pmem_clear_poison(); can more code be shared between
the two functions?
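
For example, something like this (hypothetical refactor, the helper name is
made up): factor out the common clear-poison / clear-HWPoison / invalidate
sequence, and let each caller decide when to call pmem_clear_badblocks()
(pmem_clear_poison() right away, pmem_recovery_write() only after the write):

	static long __pmem_clear_poison(struct pmem_device *pmem,
			phys_addr_t offset, unsigned int len)
	{
		long cleared = nvdimm_clear_poison(pmem->bb.dev,
					pmem->phys_addr + offset, len);

		if (cleared > 0) {
			hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
			arch_invalidate_pmem(pmem->virt_addr + offset, len);
		}
		return cleared;
	}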


> +
> +write_done:
> +       mutex_unlock(&pmem->recovery_lock);
> +       return rc;
>  }
>
>  static const struct dax_operations pmem_dax_ops = {
> @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev,
>                 goto out_cleanup_dax;
>         dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
>         set_dax_recovery(dax_dev);
> +       mutex_init(&pmem->recovery_lock);
>         pmem->dax_dev = dax_dev;
>
>         rc = device_add_disk(dev, disk, pmem_attribute_groups);
> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
> index 59cfe13ea8a8..971bff7552d6 100644
> --- a/drivers/nvdimm/pmem.h
> +++ b/drivers/nvdimm/pmem.h
> @@ -24,6 +24,7 @@ struct pmem_device {
>         struct dax_device       *dax_dev;
>         struct gendisk          *disk;
>         struct dev_pagemap      pgmap;
> +       struct mutex            recovery_lock;
>  };
>
>  long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
> --
> 2.18.4
>
Jane Chu Feb. 6, 2022, 8:08 a.m. UTC | #4
On 2/3/2022 10:21 PM, Dan Williams wrote:
> On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <jane.chu@oracle.com> wrote:
>>
>> pmem_recovery_write() consists of clearing poison via DSM,
>> clearing page HWPoison bit, re-state _PAGE_PRESENT bit,
>> cacheflush, write, and finally clearing bad-block record.
>>
>> A competing pread thread is held off during recovery write
>> by the presence of bad-block record. A competing recovery_write
>> thread is serialized by a lock.
>>
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>>   drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++----
>>   drivers/nvdimm/pmem.h |  1 +
>>   2 files changed, 77 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index 638e64681db9..f2d6b34d48de 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem,
>>          }
>>   }
>>
>> +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector,
>> +                               long cleared_blks)
>> +{
>> +       badblocks_clear(&pmem->bb, sector, cleared_blks);
>> +       if (pmem->bb_state)
>> +               sysfs_notify_dirent(pmem->bb_state);
>> +}
>> +
>>   static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
>>                  phys_addr_t offset, unsigned int len)
>>   {
>> @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
>>                  dev_dbg(dev, "%#llx clear %ld sector%s\n",
>>                                  (unsigned long long) sector, cleared,
>>                                  cleared > 1 ? "s" : "");
>> -               badblocks_clear(&pmem->bb, sector, cleared);
>> -               if (pmem->bb_state)
>> -                       sysfs_notify_dirent(pmem->bb_state);
>> +               pmem_clear_badblocks(pmem, sector, cleared);
>>          }
>>
>>          arch_invalidate_pmem(pmem->virt_addr + offset, len);
>> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
>>   __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>>                  long nr_pages, void **kaddr, pfn_t *pfn)
>>   {
>> +       bool bad_pmem;
>> +       bool do_recovery = false;
>>          resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
>>
>> -       if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
>> -                                       PFN_PHYS(nr_pages))))
>> +       bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
>> +                               PFN_PHYS(nr_pages));
>> +       if (bad_pmem && kaddr)
>> +               do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
>> +       if (bad_pmem && !do_recovery)
>>                  return -EIO;
>>
>>          if (kaddr)
>> @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>>          return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
>>   }
>>
>> +/*
>> + * The recovery write thread started out as a normal pwrite thread and
>> + * when the filesystem was told about potential media error in the
>> + * range, filesystem turns the normal pwrite to a dax_recovery_write.
>> + *
>> + * The recovery write consists of clearing poison via DSM, clearing page
>> + * HWPoison bit, reenable page-wide read-write permission, flush the
>> + * caches and finally write.  A competing pread thread needs to be held
>> + * off during the recovery process since data read back might not be valid.
>> + * And that's achieved by placing the badblock records clearing after
>> + * the completion of the recovery write.
>> + *
>> + * Any competing recovery write thread needs to be serialized, and this is
>> + * done via pmem device level lock .recovery_lock.
>> + */
>>   static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
>>                  void *addr, size_t bytes, struct iov_iter *i)
>>   {
>> -       return 0;
>> +       size_t rc, len, off;
>> +       phys_addr_t pmem_off;
>> +       struct pmem_device *pmem = dax_get_private(dax_dev);
>> +       struct device *dev = pmem->bb.dev;
>> +       sector_t sector;
>> +       long cleared, cleared_blk;
>> +
>> +       mutex_lock(&pmem->recovery_lock);
>> +
>> +       /* If no poison found in the range, go ahead with write */
>> +       off = (unsigned long)addr & ~PAGE_MASK;
>> +       len = PFN_PHYS(PFN_UP(off + bytes));
>> +       if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +               rc = _copy_from_iter_flushcache(addr, bytes, i);
>> +               goto write_done;
>> +       }
> 
> is_bad_pmem() takes a seqlock so it should be safe to move the
> recovery_lock below this point.

Okay, thanks!

> 
>> +
>> +       /* Not page-aligned range cannot be recovered */
>> +       if (off || !(PAGE_ALIGNED(bytes))) {
>> +               dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
>> +                       addr, bytes);
> 
> fs/dax.c will prevent this from happening, right? It seems like an
> upper layer bug if we get this far into the recovery process without
> checking if a full page is being overwritten.

Yes, at the start of each dax_iomap_iter(), the buffer is page aligned. 
However, the underlying dax_copy_from_iter() is allowed to return a partial 
result, causing the subsequent 'while' loop within dax_iomap_iter() to 
continue at an address that is not page aligned. I ran into this situation 
while experimenting with dax_copy_from_iter(); I am not sure whether a 
partial result is legitimate in practice, so I thought to issue a warning 
should the situation happen.
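
For illustration, the arithmetic in the copy loop looks roughly like this
(a sketch, not the actual fs/dax.c code):

	/* one pass of the iteration */
	xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter);
	pos += xfer;		/* pos was page aligned before this pass */
	length -= xfer;
	done += xfer;
	/*
	 * If xfer < map_len (a partial copy), the next pass resumes in the
	 * middle of a page, which is the case the dev_warn() above guards
	 * against.
	 */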

> 
>> +               rc = (size_t) -EIO;
>> +               goto write_done;
>> +       }
>> +
>> +       pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +       sector = (pmem_off - pmem->data_offset) / 512;
>> +       cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len);
>> +       cleared_blk = cleared / 512;
>> +       if (cleared_blk > 0) {
>> +               hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared);
>> +       } else {
>> +               dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n",
>> +                       cleared_blk);
>> +               rc = (size_t) -EIO;
>> +               goto write_done;
>> +       }
>> +       arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes);
>> +       rc = _copy_from_iter_flushcache(addr, bytes, i);
>> +       pmem_clear_badblocks(pmem, sector, cleared_blk);
> 
> This duplicates pmem_clear_poison() can more code be shared between
> the 2 functions?

I'll look into refactoring pmem_clear_poison().
> 
> 
>> +
>> +write_done:
>> +       mutex_unlock(&pmem->recovery_lock);
>> +       return rc;
>>   }
>>
>>   static const struct dax_operations pmem_dax_ops = {
>> @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev,
>>                  goto out_cleanup_dax;
>>          dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
>>          set_dax_recovery(dax_dev);
>> +       mutex_init(&pmem->recovery_lock);
>>          pmem->dax_dev = dax_dev;
>>
>>          rc = device_add_disk(dev, disk, pmem_attribute_groups);
>> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
>> index 59cfe13ea8a8..971bff7552d6 100644
>> --- a/drivers/nvdimm/pmem.h
>> +++ b/drivers/nvdimm/pmem.h
>> @@ -24,6 +24,7 @@ struct pmem_device {
>>          struct dax_device       *dax_dev;
>>          struct gendisk          *disk;
>>          struct dev_pagemap      pgmap;
>> +       struct mutex            recovery_lock;
>>   };
>>
>>   long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>> --
>> 2.18.4
>>

thanks!
-jane

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 638e64681db9..f2d6b34d48de 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -69,6 +69,14 @@  static void hwpoison_clear(struct pmem_device *pmem,
 	}
 }
 
+static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector,
+				long cleared_blks)
+{
+	badblocks_clear(&pmem->bb, sector, cleared_blks);
+	if (pmem->bb_state)
+		sysfs_notify_dirent(pmem->bb_state);
+}
+
 static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 		phys_addr_t offset, unsigned int len)
 {
@@ -88,9 +96,7 @@  static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
 		dev_dbg(dev, "%#llx clear %ld sector%s\n",
 				(unsigned long long) sector, cleared,
 				cleared > 1 ? "s" : "");
-		badblocks_clear(&pmem->bb, sector, cleared);
-		if (pmem->bb_state)
-			sysfs_notify_dirent(pmem->bb_state);
+		pmem_clear_badblocks(pmem, sector, cleared);
 	}
 
 	arch_invalidate_pmem(pmem->virt_addr + offset, len);
@@ -257,10 +263,15 @@  static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
+	bool bad_pmem;
+	bool do_recovery = false;
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
-	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
+	bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
+				PFN_PHYS(nr_pages));
+	if (bad_pmem && kaddr)
+		do_recovery = dax_recovery_started(pmem->dax_dev, kaddr);
+	if (bad_pmem && !do_recovery)
 		return -EIO;
 
 	if (kaddr)
@@ -301,10 +312,68 @@  static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
 }
 
+/*
+ * The recovery write thread started out as a normal pwrite thread; when
+ * the filesystem was told about a potential media error in the range,
+ * it turned the normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing poison via DSM, clearing the
+ * page HWPoison bit, re-enabling page-wide read-write permission, flushing
+ * the caches, and finally the write itself.  A competing pread thread
+ * needs to be held off during the recovery process since data read back
+ * might not be valid; that is achieved by clearing the badblock records
+ * only after the recovery write completes.
+ *
+ * Competing recovery write threads need to be serialized, and this is
+ * done via the pmem device level lock .recovery_lock.
+ */
 static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i)
 {
-	return 0;
+	size_t rc, len, off;
+	phys_addr_t pmem_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+	sector_t sector;
+	long cleared, cleared_blk;
+
+	mutex_lock(&pmem->recovery_lock);
+
+	/* If no poison found in the range, go ahead with write */
+	off = (unsigned long)addr & ~PAGE_MASK;
+	len = PFN_PHYS(PFN_UP(off + bytes));
+	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+		rc = _copy_from_iter_flushcache(addr, bytes, i);
+		goto write_done;
+	}
+
+	/* Not page-aligned range cannot be recovered */
+	if (off || !(PAGE_ALIGNED(bytes))) {
+		dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
+			addr, bytes);
+		rc = (size_t) -EIO;
+		goto write_done;
+	}
+
+	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+	sector = (pmem_off - pmem->data_offset) / 512;
+	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len);
+	cleared_blk = cleared / 512;
+	if (cleared_blk > 0) {
+		hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared);
+	} else {
+		dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n",
+			cleared_blk);
+		rc = (size_t) -EIO;
+		goto write_done;
+	}
+	arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes);
+	rc = _copy_from_iter_flushcache(addr, bytes, i);
+	pmem_clear_badblocks(pmem, sector, cleared_blk);
+
+write_done:
+	mutex_unlock(&pmem->recovery_lock);
+	return rc;
 }
 
 static const struct dax_operations pmem_dax_ops = {
@@ -495,6 +564,7 @@  static int pmem_attach_disk(struct device *dev,
 		goto out_cleanup_dax;
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	set_dax_recovery(dax_dev);
+	mutex_init(&pmem->recovery_lock);
 	pmem->dax_dev = dax_dev;
 
 	rc = device_add_disk(dev, disk, pmem_attribute_groups);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 59cfe13ea8a8..971bff7552d6 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -24,6 +24,7 @@  struct pmem_device {
 	struct dax_device	*dax_dev;
 	struct gendisk		*disk;
 	struct dev_pagemap	pgmap;
+	struct mutex		recovery_lock;
 };
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,