[v8,7/7] pmem: implement pmem_recovery_write()

Message ID: 20220420020435.90326-8-jane.chu@oracle.com (mailing list archive)
State: Superseded
Series: DAX poison recovery

Commit Message

Jane Chu April 20, 2022, 2:04 a.m. UTC
The recovery write thread starts out as a normal pwrite thread; when
the filesystem is told about a potential media error in the range, it
turns the normal pwrite into a dax_recovery_write.

The recovery write consists of clearing the media poison, clearing the
page HWPoison bit, re-enabling page-wide read-write permission,
flushing the caches and finally performing the write.  A competing
pread thread is held off during recovery since the data read back
might not be valid; this is achieved by clearing the badblock records
only after the recovery write completes. Competing recovery write
threads are serialized by the pmem device-level .recovery_lock.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/pmem.h |  1 +
 2 files changed, 56 insertions(+), 1 deletion(-)

Comments

Dan Williams April 20, 2022, 6:26 a.m. UTC | #1
On Tue, Apr 19, 2022 at 7:06 PM Jane Chu <jane.chu@oracle.com> wrote:
>
> The recovery write thread starts out as a normal pwrite thread; when
> the filesystem is told about a potential media error in the range, it
> turns the normal pwrite into a dax_recovery_write.
>
> The recovery write consists of clearing the media poison, clearing the
> page HWPoison bit, re-enabling page-wide read-write permission,
> flushing the caches and finally performing the write.  A competing
> pread thread is held off during recovery since the data read back
> might not be valid; this is achieved by clearing the badblock records
> only after the recovery write completes. Competing recovery write
> threads are serialized by the pmem device-level .recovery_lock.
>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  drivers/nvdimm/pmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvdimm/pmem.h |  1 +
>  2 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index c3772304d417..134f8909eb65 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -332,10 +332,63 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>         return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
>  }
>
> +/*
> + * The recovery write thread starts out as a normal pwrite thread;
> + * when the filesystem is told about a potential media error in the
> + * range, it turns the normal pwrite into a dax_recovery_write.
> + *
> + * The recovery write consists of clearing the media poison, clearing
> + * the page HWPoison bit, re-enabling page-wide read-write permission,
> + * flushing the caches and finally performing the write.  A competing
> + * pread thread is held off during recovery since the data read back
> + * might not be valid; this is achieved by clearing the badblock
> + * records only after the recovery write completes. Competing recovery
> + * write threads are serialized by the pmem device-level .recovery_lock.
> + */
>  static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
>                 void *addr, size_t bytes, struct iov_iter *i)
>  {
> -       return 0;
> +       struct pmem_device *pmem = dax_get_private(dax_dev);
> +       size_t olen, len, off;
> +       phys_addr_t pmem_off;
> +       struct device *dev = pmem->bb.dev;
> +       long cleared;
> +
> +       off = offset_in_page(addr);
> +       len = PFN_PHYS(PFN_UP(off + bytes));
> +       if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
> +               return _copy_from_iter_flushcache(addr, bytes, i);
> +
> +       /*
> +        * A range that is not page-aligned cannot be recovered. This
> +        * should not happen unless something else went wrong.
> +        */
> +       if (off || !PAGE_ALIGNED(bytes)) {
> +               dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
> +                       addr, bytes);

If this warn stays:

s/dev_warn/dev_warn_ratelimited/

The kernel prints hashed addresses for %p, so I'm not sure printing
@addr or @bytes is useful, because there is nothing actionable that
can be done with that information in the log. @pgoff is probably the
only variable worth printing (after converting it to bytes or
sectors), as it might be possible to reverse-map it back to the
impacted data.
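
(For illustration only: a ratelimited print along those lines, keyed
off @pgoff rather than the hashed pointer, might look like the sketch
below; the exact message wording is hypothetical, not part of the
patch.)

        dev_warn_ratelimited(dev,
                "poison recovery: offset %#llx, bytes %#zx not page aligned\n",
                (unsigned long long)PFN_PHYS(pgoff), bytes);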

> +               return 0;
> +       }
> +
> +       mutex_lock(&pmem->recovery_lock);
> +       pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +       cleared = __pmem_clear_poison(pmem, pmem_off, len);
> +       if (cleared > 0 && cleared < len) {
> +               dev_warn(dev, "poison cleared only %ld out of %lu bytes\n",
> +                       cleared, len);

This looks like dev_dbg() to me, or at minimum the same
dev_warn_ratelimited() print as the one above, to just indicate that a
write to the device at the given offset failed.

> +               mutex_unlock(&pmem->recovery_lock);
> +               return 0;
> +       }
> +       if (cleared < 0) {
> +               dev_warn(dev, "poison clear failed: %ld\n", cleared);

Same feedback here: these should probably all map to the identical
error exit ratelimited print.

> +               mutex_unlock(&pmem->recovery_lock);

It occurs to me that all callers of this are arriving through the
fsdax iomap ops and all of these callers take an exclusive lock to
prevent simultaneous access to the inode. If recovery_write() is only
used through that path then this lock is redundant. Simultaneous reads
are protected by the fact that the badblocks are cleared last. I think
this can wait to add the lock when / if a non-fsdax access path
arrives for dax_recovery_write(), and even then it should probably
enforce the single writer exclusion guarantee of the fsdax path.
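
(As a simplified sketch of that exclusion, taking xfs as the example
fsdax filesystem; this condenses the real xfs_file_dax_write() down to
the relevant calls.)

        xfs_ilock(ip, XFS_IOLOCK_EXCL);
        ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);

The exclusive inode lock taken there already serializes every
dax_recovery_write() caller on the fsdax path.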

> +               return 0;
> +       }
> +
> +       olen = _copy_from_iter_flushcache(addr, bytes, i);
> +       pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
> +
> +       mutex_unlock(&pmem->recovery_lock);
> +       return olen;
>  }
>
>  static const struct dax_operations pmem_dax_ops = {
> @@ -525,6 +578,7 @@ static int pmem_attach_disk(struct device *dev,
>         if (rc)
>                 goto out_cleanup_dax;
>         dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
> +       mutex_init(&pmem->recovery_lock);
>         pmem->dax_dev = dax_dev;
>
>         rc = device_add_disk(dev, disk, pmem_attribute_groups);
> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
> index 392b0b38acb9..91e40f5e3c0e 100644
> --- a/drivers/nvdimm/pmem.h
> +++ b/drivers/nvdimm/pmem.h
> @@ -27,6 +27,7 @@ struct pmem_device {
>         struct dax_device       *dax_dev;
>         struct gendisk          *disk;
>         struct dev_pagemap      pgmap;
> +       struct mutex            recovery_lock;
>  };
>
>  long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
> --
> 2.18.4
>
Jane Chu April 20, 2022, 5:01 p.m. UTC | #2
On 4/19/2022 11:26 PM, Dan Williams wrote:
> On Tue, Apr 19, 2022 at 7:06 PM Jane Chu <jane.chu@oracle.com> wrote:
>>
>> The recovery write thread starts out as a normal pwrite thread; when
>> the filesystem is told about a potential media error in the range, it
>> turns the normal pwrite into a dax_recovery_write.
>>
>> The recovery write consists of clearing the media poison, clearing
>> the page HWPoison bit, re-enabling page-wide read-write permission,
>> flushing the caches and finally performing the write.  A competing
>> pread thread is held off during recovery since the data read back
>> might not be valid; this is achieved by clearing the badblock records
>> only after the recovery write completes. Competing recovery write
>> threads are serialized by the pmem device-level .recovery_lock.
>>
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>>   drivers/nvdimm/pmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
>>   drivers/nvdimm/pmem.h |  1 +
>>   2 files changed, 56 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index c3772304d417..134f8909eb65 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -332,10 +332,63 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>>          return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
>>   }
>>
>> +/*
>> + * The recovery write thread starts out as a normal pwrite thread;
>> + * when the filesystem is told about a potential media error in the
>> + * range, it turns the normal pwrite into a dax_recovery_write.
>> + *
>> + * The recovery write consists of clearing the media poison, clearing
>> + * the page HWPoison bit, re-enabling page-wide read-write permission,
>> + * flushing the caches and finally performing the write.  A competing
>> + * pread thread is held off during recovery since the data read back
>> + * might not be valid; this is achieved by clearing the badblock
>> + * records only after the recovery write completes. Competing recovery
>> + * write threads are serialized by the pmem device-level .recovery_lock.
>> + */
>>   static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
>>                  void *addr, size_t bytes, struct iov_iter *i)
>>   {
>> -       return 0;
>> +       struct pmem_device *pmem = dax_get_private(dax_dev);
>> +       size_t olen, len, off;
>> +       phys_addr_t pmem_off;
>> +       struct device *dev = pmem->bb.dev;
>> +       long cleared;
>> +
>> +       off = offset_in_page(addr);
>> +       len = PFN_PHYS(PFN_UP(off + bytes));
>> +       if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
>> +               return _copy_from_iter_flushcache(addr, bytes, i);
>> +
>> +       /*
>> +        * A range that is not page-aligned cannot be recovered. This
>> +        * should not happen unless something else went wrong.
>> +        */
>> +       if (off || !PAGE_ALIGNED(bytes)) {
>> +               dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
>> +                       addr, bytes);
> 
> If this warn stays:
> 
> s/dev_warn/dev_warn_ratelimited/
> 
> The kernel prints hashed addresses for %p, so I'm not sure printing
> @addr or @bytes is useful, because there is nothing actionable that
> can be done with that information in the log. @pgoff is probably the
> only variable worth printing (after converting it to bytes or
> sectors), as it might be possible to reverse-map it back to the
> impacted data.

The intention with printing @addr and @bytes is to show the
mis-alignment. In the past, when UC- was set on a poisoned dax page,
returning less than a page written would cause dax_iomap_iter to
produce the next iteration with @addr and @bytes not page-aligned.
Although UC- doesn't apply here, I thought it might still be worthwhile
to watch for a similar scenario. That's also why @pgoff isn't helpful.
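
(A rough sketch of that iterator behavior, simplified from
dax_iomap_iter(); the variable names are approximate.)

        xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter);
        pos += xfer;    /* a short xfer leaves pos mid-page ...        */
        length -= xfer; /* ... so the next @addr/@bytes are unaligned  */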

How about s/dev_warn/dev_dbg/ ?

> 
>> +               return 0;
>> +       }
>> +
>> +       mutex_lock(&pmem->recovery_lock);
>> +       pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +       cleared = __pmem_clear_poison(pmem, pmem_off, len);
>> +       if (cleared > 0 && cleared < len) {
>> +               dev_warn(dev, "poison cleared only %ld out of %lu bytes\n",
>> +                       cleared, len);
> 
> This looks like dev_dbg() to me, or at minimum the same
> dev_warn_ratelimited() print as the one above, to just indicate that a
> write to the device at the given offset failed.

Will s/dev_warn/dev_dbg/

> 
>> +               mutex_unlock(&pmem->recovery_lock);
>> +               return 0;
>> +       }
>> +       if (cleared < 0) {
>> +               dev_warn(dev, "poison clear failed: %ld\n", cleared);
> 
> Same feedback here: these should probably all map to the identical
> error exit ratelimited print.

Will s/dev_warn/dev_dbg/

> 
>> +               mutex_unlock(&pmem->recovery_lock);
> 
> It occurs to me that all callers of this are arriving through the
> fsdax iomap ops and all of these callers take an exclusive lock to
> prevent simultaneous access to the inode. If recovery_write() is only
> used through that path then this lock is redundant. Simultaneous reads
> are protected by the fact that the badblocks are cleared last. I think
> this can wait to add the lock when / if a non-fsdax access path
> arrives for dax_recovery_write(), and even then it should probably
> enforce the single writer exclusion guarantee of the fsdax path.
> 

Indeed, the caller of dax_iomap_rw already holds the writer lock.

Will remove .recovery_lock for now.

BTW, how do the other patches look to you?

Thanks!
-jane

>> +               return 0;
>> +       }
>> +
>> +       olen = _copy_from_iter_flushcache(addr, bytes, i);
>> +       pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
>> +
>> +       mutex_unlock(&pmem->recovery_lock);
>> +       return olen;
>>   }
>>
>>   static const struct dax_operations pmem_dax_ops = {
>> @@ -525,6 +578,7 @@ static int pmem_attach_disk(struct device *dev,
>>          if (rc)
>>                  goto out_cleanup_dax;
>>          dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
>> +       mutex_init(&pmem->recovery_lock);
>>          pmem->dax_dev = dax_dev;
>>
>>          rc = device_add_disk(dev, disk, pmem_attribute_groups);
>> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
>> index 392b0b38acb9..91e40f5e3c0e 100644
>> --- a/drivers/nvdimm/pmem.h
>> +++ b/drivers/nvdimm/pmem.h
>> @@ -27,6 +27,7 @@ struct pmem_device {
>>          struct dax_device       *dax_dev;
>>          struct gendisk          *disk;
>>          struct dev_pagemap      pgmap;
>> +       struct mutex            recovery_lock;
>>   };
>>
>>   long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>> --
>> 2.18.4
>>
Dan Williams April 20, 2022, 5:05 p.m. UTC | #3
On Wed, Apr 20, 2022 at 10:02 AM Jane Chu <jane.chu@oracle.com> wrote:
>
> On 4/19/2022 11:26 PM, Dan Williams wrote:
> > On Tue, Apr 19, 2022 at 7:06 PM Jane Chu <jane.chu@oracle.com> wrote:
> >>
> >> The recovery write thread starts out as a normal pwrite thread;
> >> when the filesystem is told about a potential media error in the
> >> range, it turns the normal pwrite into a dax_recovery_write.
> >>
> >> The recovery write consists of clearing the media poison, clearing
> >> the page HWPoison bit, re-enabling page-wide read-write permission,
> >> flushing the caches and finally performing the write.  A competing
> >> pread thread is held off during recovery since the data read back
> >> might not be valid; this is achieved by clearing the badblock
> >> records only after the recovery write completes. Competing recovery
> >> write threads are serialized by the pmem device-level .recovery_lock.
> >>
> >> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> >> ---
> >>   drivers/nvdimm/pmem.c | 56 ++++++++++++++++++++++++++++++++++++++++++-
> >>   drivers/nvdimm/pmem.h |  1 +
> >>   2 files changed, 56 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> >> index c3772304d417..134f8909eb65 100644
> >> --- a/drivers/nvdimm/pmem.c
> >> +++ b/drivers/nvdimm/pmem.c
> >> @@ -332,10 +332,63 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
> >>          return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
> >>   }
> >>
> >> +/*
> >> + * The recovery write thread starts out as a normal pwrite thread;
> >> + * when the filesystem is told about a potential media error in the
> >> + * range, it turns the normal pwrite into a dax_recovery_write.
> >> + *
> >> + * The recovery write consists of clearing the media poison,
> >> + * clearing the page HWPoison bit, re-enabling page-wide read-write
> >> + * permission, flushing the caches and finally performing the
> >> + * write.  A competing pread thread is held off during recovery
> >> + * since the data read back might not be valid; this is achieved by
> >> + * clearing the badblock records only after the recovery write
> >> + * completes. Competing recovery write threads are serialized by
> >> + * the pmem device-level .recovery_lock.
> >> + */
> >>   static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
> >>                  void *addr, size_t bytes, struct iov_iter *i)
> >>   {
> >> -       return 0;
> >> +       struct pmem_device *pmem = dax_get_private(dax_dev);
> >> +       size_t olen, len, off;
> >> +       phys_addr_t pmem_off;
> >> +       struct device *dev = pmem->bb.dev;
> >> +       long cleared;
> >> +
> >> +       off = offset_in_page(addr);
> >> +       len = PFN_PHYS(PFN_UP(off + bytes));
> >> +       if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
> >> +               return _copy_from_iter_flushcache(addr, bytes, i);
> >> +
> >> +       /*
> >> +        * A range that is not page-aligned cannot be recovered.
> >> +        * This should not happen unless something else went wrong.
> >> +        */
> >> +       if (off || !PAGE_ALIGNED(bytes)) {
> >> +               dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
> >> +                       addr, bytes);
> >
> > If this warn stays:
> >
> > s/dev_warn/dev_warn_ratelimited/
> >
> > The kernel prints hashed addresses for %p, so I'm not sure printing
> > @addr or @bytes is useful, because there is nothing actionable that
> > can be done with that information in the log. @pgoff is probably the
> > only variable worth printing (after converting it to bytes or
> > sectors), as it might be possible to reverse-map it back to the
> > impacted data.
>
> The intention with printing @addr and @bytes is to show the
> mis-alignment. In the past, when UC- was set on a poisoned dax page,
> returning less than a page written would cause dax_iomap_iter to
> produce the next iteration with @addr and @bytes not page-aligned.
> Although UC- doesn't apply here, I thought it might still be
> worthwhile to watch for a similar scenario. That's also why @pgoff
> isn't helpful.
>
> How about s/dev_warn/dev_dbg/ ?
>
> >
> >> +               return 0;
> >> +       }
> >> +
> >> +       mutex_lock(&pmem->recovery_lock);
> >> +       pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> >> +       cleared = __pmem_clear_poison(pmem, pmem_off, len);
> >> +       if (cleared > 0 && cleared < len) {
> >> +               dev_warn(dev, "poison cleared only %ld out of %lu bytes\n",
> >> +                       cleared, len);
> >
> > This looks like dev_dbg() to me, or at minimum the same
> > dev_warn_ratelimited() print as the one above, to just indicate that
> > a write to the device at the given offset failed.
>
> Will s/dev_warn/dev_dbg/
>
> >
> >> +               mutex_unlock(&pmem->recovery_lock);
> >> +               return 0;
> >> +       }
> >> +       if (cleared < 0) {
> >> +               dev_warn(dev, "poison clear failed: %ld\n", cleared);
> >
> > Same feedback here: these should probably all map to the identical
> > error exit ratelimited print.
>
> Will s/dev_warn/dev_dbg/
>
> >
> >> +               mutex_unlock(&pmem->recovery_lock);
> >
> > It occurs to me that all callers of this are arriving through the
> > fsdax iomap ops and all of these callers take an exclusive lock to
> > prevent simultaneous access to the inode. If recovery_write() is only
> > used through that path then this lock is redundant. Simultaneous reads
> > are protected by the fact that the badblocks are cleared last. I think
> > this can wait to add the lock when / if a non-fsdax access path
> > arrives for dax_recovery_write(), and even then it should probably
> > enforce the single writer exclusion guarantee of the fsdax path.
> >
>
> Indeed, the caller of dax_iomap_rw already holds the writer lock.
>
> Will remove .recovery_lock for now.
>
> BTW, how do the other patches look to you?

First pass looked good, so I'll do one more look over, but this is
coming together nicely. Thanks for all the effort on this!

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c3772304d417..134f8909eb65 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -332,10 +332,63 @@  static long pmem_dax_direct_access(struct dax_device *dax_dev,
 	return __pmem_direct_access(pmem, pgoff, nr_pages, mode, kaddr, pfn);
 }
 
+/*
+ * The recovery write thread starts out as a normal pwrite thread;
+ * when the filesystem is told about a potential media error in the
+ * range, it turns the normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing the media poison, clearing
+ * the page HWPoison bit, re-enabling page-wide read-write permission,
+ * flushing the caches and finally performing the write.  A competing
+ * pread thread is held off during recovery since the data read back
+ * might not be valid; this is achieved by clearing the badblock
+ * records only after the recovery write completes. Competing recovery
+ * write threads are serialized by the pmem device-level .recovery_lock.
+ */
 static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i)
 {
-	return 0;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	size_t olen, len, off;
+	phys_addr_t pmem_off;
+	struct device *dev = pmem->bb.dev;
+	long cleared;
+
+	off = offset_in_page(addr);
+	len = PFN_PHYS(PFN_UP(off + bytes));
+	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+		return _copy_from_iter_flushcache(addr, bytes, i);
+
+	/*
+	 * A range that is not page-aligned cannot be recovered. This
+	 * should not happen unless something else went wrong.
+	 */
+	if (off || !PAGE_ALIGNED(bytes)) {
+		dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
+			addr, bytes);
+		return 0;
+	}
+
+	mutex_lock(&pmem->recovery_lock);
+	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+	cleared = __pmem_clear_poison(pmem, pmem_off, len);
+	if (cleared > 0 && cleared < len) {
+		dev_warn(dev, "poison cleared only %ld out of %lu bytes\n",
+			cleared, len);
+		mutex_unlock(&pmem->recovery_lock);
+		return 0;
+	}
+	if (cleared < 0) {
+		dev_warn(dev, "poison clear failed: %ld\n", cleared);
+		mutex_unlock(&pmem->recovery_lock);
+		return 0;
+	}
+
+	olen = _copy_from_iter_flushcache(addr, bytes, i);
+	pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
+
+	mutex_unlock(&pmem->recovery_lock);
+	return olen;
 }
 
 static const struct dax_operations pmem_dax_ops = {
@@ -525,6 +578,7 @@  static int pmem_attach_disk(struct device *dev,
 	if (rc)
 		goto out_cleanup_dax;
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
+	mutex_init(&pmem->recovery_lock);
 	pmem->dax_dev = dax_dev;
 
 	rc = device_add_disk(dev, disk, pmem_attribute_groups);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 392b0b38acb9..91e40f5e3c0e 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -27,6 +27,7 @@  struct pmem_device {
 	struct dax_device	*dax_dev;
 	struct gendisk		*disk;
 	struct dev_pagemap	pgmap;
+	struct mutex		recovery_lock;
 };
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,