diff mbox series

[5/6] dax,pmem: Add data recovery feature to pmem_copy_to/from_iter()

Message ID 20211021001059.438843-6-jane.chu@oracle.com (mailing list archive)
State New, archived
Headers show
Series dax poison recovery with RWF_RECOVERY_DATA flag | expand

Commit Message

Jane Chu Oct. 21, 2021, 12:10 a.m. UTC
When DAXDEV_F_RECOVERY flag is set, pmem_copy_to_iter() shall read
as much data as possible up till the first poisoned page is
encountered, and pmem_copy_from_iter() shall try to clear poison(s)
within the page aligned range prior to writing.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 72 ++++++++++++++++++++++++++++++++++++++++---
 fs/dax.c              |  5 +++
 2 files changed, 72 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Oct. 21, 2021, 11:28 a.m. UTC | #1
> +	if (flags & DAXDEV_F_RECOVERY) {
> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> +					addr, bytes);
> +				return (size_t) -EIO;
> +			}
> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> +					BLK_STS_OK)
> +				return (size_t) -EIO;
> +		}

Shouldn't this just go down in a separe ->clear_poison operation
to make the whole thing a little easier to follow?
Jane Chu Oct. 22, 2021, 12:58 a.m. UTC | #2
On 10/21/2021 4:28 AM, Christoph Hellwig wrote:
>> +	if (flags & DAXDEV_F_RECOVERY) {
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
>> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>> +					addr, bytes);
>> +				return (size_t) -EIO;
>> +			}
>> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>> +					BLK_STS_OK)
>> +				return (size_t) -EIO;
>> +		}
> 
> Shouldn't this just go down in a separe ->clear_poison operation
> to make the whole thing a little easier to follow?
> 

Do you mean to lift or refactor the above to a helper function so as
to improve the readability of the code?  I can do that, just to confirm.
On the same note, would you prefer to refactor the read path as well?

thanks!
-jane
diff mbox series

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e2a1c35108cd..c456f84d2f6f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -305,21 +305,83 @@  static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * However with the introduction of DAXDEV_F_RECOVERY, the 'read'/'write'
+ * range may contain poison(s), so the functions perform explicit check
+ * on poison, and 'read' end up fetching only non-poisoned page(s) up
+ * till  the first poison is encountered while 'write' require the range
+ * is page aligned in order to restore the poisoned page's memory type
+ * back to "rw" after clearing the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (flags & DAXDEV_F_RECOVERY) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !(PAGE_ALIGNED(bytes))) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+					BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
 {
-	return _copy_mc_to_iter(addr, bytes, i);
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (flags & DAXDEV_F_RECOVERY) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			if (bad_pfn == pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+					 pgoff);
+				return -EIO;
+			}
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, (adj_len > bytes),
+					"out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
+	return _copy_mc_to_iter(addr, adj_len, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {
diff --git a/fs/dax.c b/fs/dax.c
index 69433c6cd6c4..b9286668dc46 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1246,6 +1246,11 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
 					map_len, iter, dax_flag);
 
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
+
 		pos += xfer;
 		length -= xfer;
 		done += xfer;