[v7,6/6] pmem: implement pmem_recovery_write()

Message ID 20220405194747.2386619-7-jane.chu@oracle.com (mailing list archive)
State New, archived
Series DAX poison recovery

Commit Message

Jane Chu April 5, 2022, 7:47 p.m. UTC
The recovery write thread starts out as a normal pwrite thread; when
the filesystem is told about a potential media error in the range, it
turns the normal pwrite into a dax_recovery_write.

The recovery write consists of clearing the media poison, clearing the
page's HWPoison bit, re-enabling page-wide read-write permission,
flushing the caches, and finally performing the write.  A competing
pread thread is held off during the recovery process since data read
back might not be valid; this is achieved by clearing the badblock
records only after the recovery write completes.  Competing recovery
write threads are serialized by the pmem device level .recovery_lock.
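
In other words, the fsdax write path falls back roughly as sketched
below.  This is illustrative only: it uses the access-mode argument
and the DAX_RECOVERY_WRITE mode that this series introduces, with
names as merged upstream, which may differ slightly from the v7
plumbing.

	/*
	 * Sketch: a write whose dax_direct_access() fails with -EIO
	 * (poison in the range) is retried in recovery mode, and the
	 * copy is then routed through dax_recovery_write(), which
	 * lands in pmem_recovery_write() below.
	 */
	map_len = dax_direct_access(dax_dev, pgoff, nr_pages,
			DAX_ACCESS, &kaddr, NULL);
	if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
		map_len = dax_direct_access(dax_dev, pgoff, nr_pages,
				DAX_RECOVERY_WRITE, &kaddr, NULL);
		if (map_len > 0)
			recovery = true;
	}
	...
	if (recovery)
		xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
				map_len, iter);
	else
		xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
				map_len, iter);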

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 60 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/pmem.h |  1 +
 2 files changed, 60 insertions(+), 1 deletion(-)

Comments

Christoph Hellwig April 6, 2022, 5:21 a.m. UTC | #1
On Tue, Apr 05, 2022 at 01:47:47PM -0600, Jane Chu wrote:
> +	off = (unsigned long)addr & ~PAGE_MASK;

offset_in_page()

> +	if (off || !(PAGE_ALIGNED(bytes))) {

No need for the inner braces.

> +	mutex_lock(&pmem->recovery_lock);
> +	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +	cleared = __pmem_clear_poison(pmem, pmem_off, len);
> +	if (cleared > 0 && cleared < len) {
> +		dev_warn(dev, "poison cleared only %ld out of %lu\n",
> +			cleared, len);
> +		mutex_unlock(&pmem->recovery_lock);
> +		return 0;
> +	} else if (cleared < 0) {

No need for an else after a return.
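
Taken together, the three suggestions above amount to something like:

	off = offset_in_page(addr);
	...
	if (off || !PAGE_ALIGNED(bytes)) {
	...
	cleared = __pmem_clear_poison(pmem, pmem_off, len);
	if (cleared > 0 && cleared < len) {
		dev_warn(dev, "poison cleared only %ld out of %lu\n",
			cleared, len);
		mutex_unlock(&pmem->recovery_lock);
		return 0;
	}
	if (cleared < 0) {
		dev_warn(dev, "poison clear failed: %ld\n", cleared);
		mutex_unlock(&pmem->recovery_lock);
		return 0;
	}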
Jane Chu April 6, 2022, 5:33 p.m. UTC | #2
On 4/5/2022 10:21 PM, Christoph Hellwig wrote:
> On Tue, Apr 05, 2022 at 01:47:47PM -0600, Jane Chu wrote:
>> +	off = (unsigned long)addr & ~PAGE_MASK;
> 
> offset_in_page()
> 
>> +	if (off || !(PAGE_ALIGNED(bytes))) {
> 
> No need for the inner braces.
> 
>> +	mutex_lock(&pmem->recovery_lock);
>> +	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +	cleared = __pmem_clear_poison(pmem, pmem_off, len);
>> +	if (cleared > 0 && cleared < len) {
>> +		dev_warn(dev, "poison cleared only %ld out of %lu\n",
>> +			cleared, len);
>> +		mutex_unlock(&pmem->recovery_lock);
>> +		return 0;
>> +	} else if (cleared < 0) {
> 
> No need for an else after a return.


Agreed, will reflect your comments in the next rev.

thanks!
-jane

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 56596be70400..b868a88a0d58 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -340,10 +340,67 @@  static const struct dax_operations pmem_dax_ops = {
 	.zero_page_range = pmem_dax_zero_page_range,
 };
 
+/*
+ * The recovery write thread starts out as a normal pwrite thread; when
+ * the filesystem is told about a potential media error in the range, it
+ * turns the normal pwrite into a dax_recovery_write.
+ *
+ * The recovery write consists of clearing the media poison, clearing the
+ * page's HWPoison bit, re-enabling page-wide read-write permission,
+ * flushing the caches, and finally performing the write.  A competing
+ * pread thread is held off during the recovery process since data read
+ * back might not be valid; this is achieved by clearing the badblock
+ * records only after the recovery write completes.  Competing recovery
+ * write threads are serialized by the pmem device level .recovery_lock.
+ */
 static size_t pmem_recovery_write(struct dev_pagemap *pgmap, pgoff_t pgoff,
 		void *addr, size_t bytes, void *iter)
 {
-	return 0;
+	struct pmem_device *pmem = pgmap->owner;
+	size_t olen, len, off;
+	phys_addr_t pmem_off;
+	struct device *dev;
+	long cleared;
+
+	if (!pmem) {
+		pr_warn("pgmap->owner field not set, cannot recover\n");
+		return 0;
+	}
+	dev = pmem->bb.dev;
+	off = (unsigned long)addr & ~PAGE_MASK;
+	len = PFN_PHYS(PFN_UP(off + bytes));
+	if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) >> SECTOR_SHIFT, len))
+		return _copy_from_iter_flushcache(addr, bytes, iter);
+
+	/*
+	 * A non-page-aligned range cannot be recovered. This should not
+	 * happen unless something else went wrong.
+	 */
+	if (off || !(PAGE_ALIGNED(bytes))) {
+		dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n",
+			addr, bytes);
+		return 0;
+	}
+
+	mutex_lock(&pmem->recovery_lock);
+	pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+	cleared = __pmem_clear_poison(pmem, pmem_off, len);
+	if (cleared > 0 && cleared < len) {
+		dev_warn(dev, "poison cleared only %ld out of %lu\n",
+			cleared, len);
+		mutex_unlock(&pmem->recovery_lock);
+		return 0;
+	} else if (cleared < 0) {
+		dev_warn(dev, "poison clear failed: %ld\n", cleared);
+		mutex_unlock(&pmem->recovery_lock);
+		return 0;
+	}
+
+	olen = _copy_from_iter_flushcache(addr, bytes, iter);
+	pmem_clear_bb(pmem, to_sect(pmem, pmem_off), cleared >> SECTOR_SHIFT);
+
+	mutex_unlock(&pmem->recovery_lock);
+	return olen;
 }
 
 static ssize_t write_cache_show(struct device *dev,
@@ -533,6 +590,7 @@  static int pmem_attach_disk(struct device *dev,
 	if (rc)
 		goto out_cleanup_dax;
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
+	mutex_init(&pmem->recovery_lock);
 	pmem->dax_dev = dax_dev;
 
 	rc = device_add_disk(dev, disk, pmem_attribute_groups);
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index e9c53d42c488..d44f6da34009 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -25,6 +25,7 @@  struct pmem_device {
 	struct dax_device	*dax_dev;
 	struct gendisk		*disk;
 	struct dev_pagemap	pgmap;
+	struct mutex		recovery_lock;
 };
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
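
A note on the length computation in pmem_recovery_write() above:
PFN_PHYS(PFN_UP(off + bytes)) rounds the affected range up to whole
pages, so both the is_bad_pmem() check and the poison clearing cover
full pages.  A stand-alone sketch of the same arithmetic, written as
hypothetical user-space C with the kernel macros redefined locally
(assuming 4K pages):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)
	#define PAGE_MASK	(~(PAGE_SIZE - 1))
	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
	#define PFN_PHYS(x)	((unsigned long)(x) << PAGE_SHIFT)

	int main(void)
	{
		/* e.g. an 8192-byte write at a page-aligned address */
		unsigned long addr = 0x7f0000001000UL;
		size_t bytes = 8192;
		size_t off = addr & ~PAGE_MASK;		/* 0 */
		size_t len = PFN_PHYS(PFN_UP(off + bytes));

		printf("off=%zu len=%zu\n", off, len);	/* off=0 len=8192 */
		return 0;
	}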