diff mbox series

[5/6] dax, pmem: Add data recovery feature to pmem_copy_to/from_iter()

Message ID 20211021001059.438843-6-jane.chu@oracle.com (mailing list archive)
State Not Applicable, archived
Delegated to: Mike Snitzer
Headers show
Series dax poison recovery with RWF_RECOVERY_DATA flag | expand

Commit Message

Jane Chu Oct. 21, 2021, 12:10 a.m. UTC
When DAXDEV_F_RECOVERY flag is set, pmem_copy_to_iter() shall read
as much data as possible up till the first poisoned page is
encountered, and pmem_copy_from_iter() shall try to clear poison(s)
within the page aligned range prior to writing.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/nvdimm/pmem.c | 72 ++++++++++++++++++++++++++++++++++++++++---
 fs/dax.c              |  5 +++
 2 files changed, 72 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Oct. 21, 2021, 11:28 a.m. UTC | #1
> +	if (flags & DAXDEV_F_RECOVERY) {
> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> +					addr, bytes);
> +				return (size_t) -EIO;
> +			}
> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> +					BLK_STS_OK)
> +				return (size_t) -EIO;
> +		}

Shouldn't this just go down in a separate ->clear_poison operation
to make the whole thing a little easier to follow?

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
Jane Chu Oct. 22, 2021, 12:58 a.m. UTC | #2
On 10/21/2021 4:28 AM, Christoph Hellwig wrote:
>> +	if (flags & DAXDEV_F_RECOVERY) {
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
>> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>> +					addr, bytes);
>> +				return (size_t) -EIO;
>> +			}
>> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>> +					BLK_STS_OK)
>> +				return (size_t) -EIO;
>> +		}
> 
> Shouldn't this just go down in a separate ->clear_poison operation
> to make the whole thing a little easier to follow?
> 

Do you mean to lift or refactor the above to a helper function so as
to improve the readability of the code?  I can do that, just to confirm.
On the same note, would you prefer to refactor the read path as well?

thanks!
-jane


--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
kernel test robot Oct. 22, 2021, 8:03 a.m. UTC | #3
Hi Jane,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on device-mapper-dm/for-next]
[also build test WARNING on nvdimm/libnvdimm-for-next mszeredi-fuse/for-next linus/master v5.15-rc6 next-20211021]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jane-Chu/dax-poison-recovery-with-RWF_RECOVERY_DATA-flag/20211021-081336
base:   https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git for-next
config: i386-debian-10.3 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/a01994a484c54b2f4b6eb32104ab3caf7b9b32a8
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Jane-Chu/dax-poison-recovery-with-RWF_RECOVERY_DATA-flag/20211021-081336
        git checkout a01994a484c54b2f4b6eb32104ab3caf7b9b32a8
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/device.h:15,
                    from include/linux/blk_types.h:11,
                    from include/linux/genhd.h:19,
                    from include/linux/blkdev.h:8,
                    from drivers/nvdimm/pmem.c:10:
   drivers/nvdimm/pmem.c: In function 'pmem_copy_from_iter':
>> drivers/nvdimm/pmem.c:336:19: warning: format '%lx' expects argument of type 'long unsigned int', but argument 4 has type 'size_t' {aka 'unsigned int'} [-Wformat=]
     336 |     dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
         |                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/dev_printk.h:110:16: note: in definition of macro 'dev_printk_index_wrap'
     110 |   _p_func(dev, fmt, ##__VA_ARGS__);   \
         |                ^~~
   include/linux/dev_printk.h:146:54: note: in expansion of macro 'dev_fmt'
     146 |  dev_printk_index_wrap(_dev_warn, KERN_WARNING, dev, dev_fmt(fmt), ##__VA_ARGS__)
         |                                                      ^~~~~~~
   drivers/nvdimm/pmem.c:336:5: note: in expansion of macro 'dev_warn'
     336 |     dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
         |     ^~~~~~~~
   drivers/nvdimm/pmem.c:336:63: note: format string is defined here
     336 |     dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
         |                                                            ~~~^
         |                                                               |
         |                                                               long unsigned int
         |                                                            %#x


vim +336 drivers/nvdimm/pmem.c

   306	
   307	/*
   308	 * Even though the 'no check' versions of copy_from_iter_flushcache()
   309	 * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
   310	 * 'read'/'write' aren't always safe when poison is consumed. They happen
   311	 * to be safe because the 'read'/'write' range has been guaranteed
   312	 * be free of poison(s) by a prior call to dax_direct_access() on the
   313	 * caller stack.
   314	 * However with the introduction of DAXDEV_F_RECOVERY, the 'read'/'write'
   315	 * range may contain poison(s), so the functions perform explicit check
   316	 * on poison, and 'read' end up fetching only non-poisoned page(s) up
   317	 * till  the first poison is encountered while 'write' require the range
   318	 * is page aligned in order to restore the poisoned page's memory type
   319	 * back to "rw" after clearing the poison(s).
   320	 * In the event of poison related failure, (size_t) -EIO is returned and
   321	 * caller may check the return value after casting it to (ssize_t).
   322	 */
   323	static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
   324		void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
   325	{
   326		phys_addr_t pmem_off;
   327		size_t len, lead_off;
   328		struct pmem_device *pmem = dax_get_private(dax_dev);
   329		struct device *dev = pmem->bb.dev;
   330	
   331		if (flags & DAXDEV_F_RECOVERY) {
   332			lead_off = (unsigned long)addr & ~PAGE_MASK;
   333			len = PFN_PHYS(PFN_UP(lead_off + bytes));
   334			if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
   335				if (lead_off || !(PAGE_ALIGNED(bytes))) {
 > 336					dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
   337						addr, bytes);
   338					return (size_t) -EIO;
   339				}
   340				pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
   341				if (pmem_clear_poison(pmem, pmem_off, bytes) !=
   342						BLK_STS_OK)
   343					return (size_t) -EIO;
   344			}
   345		}
   346	
   347		return _copy_from_iter_flushcache(addr, bytes, i);
   348	}
   349	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
kernel test robot Oct. 26, 2021, 10:21 a.m. UTC | #4
Hi Jane,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on device-mapper-dm/for-next]
[also build test WARNING on mszeredi-fuse/for-next linus/master v5.15-rc7 next-20211025]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Jane-Chu/dax-poison-recovery-with-RWF_RECOVERY_DATA-flag/20211021-081336
base:   https://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git for-next
config: i386-randconfig-a013-20211022 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 5dc339d9825f1dbe788cfb69c88210a59bbf8e3a)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/a01994a484c54b2f4b6eb32104ab3caf7b9b32a8
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Jane-Chu/dax-poison-recovery-with-RWF_RECOVERY_DATA-flag/20211021-081336
        git checkout a01994a484c54b2f4b6eb32104ab3caf7b9b32a8
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/nvdimm/pmem.c:337:12: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat]
                                           addr, bytes);
                                                 ^~~~~
   include/linux/dev_printk.h:146:70: note: expanded from macro 'dev_warn'
           dev_printk_index_wrap(_dev_warn, KERN_WARNING, dev, dev_fmt(fmt), ##__VA_ARGS__)
                                                                       ~~~     ^~~~~~~~~~~
   include/linux/dev_printk.h:110:23: note: expanded from macro 'dev_printk_index_wrap'
                   _p_func(dev, fmt, ##__VA_ARGS__);                       \
                                ~~~    ^~~~~~~~~~~
   1 warning generated.


vim +337 drivers/nvdimm/pmem.c

   306	
   307	/*
   308	 * Even though the 'no check' versions of copy_from_iter_flushcache()
   309	 * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
   310	 * 'read'/'write' aren't always safe when poison is consumed. They happen
   311	 * to be safe because the 'read'/'write' range has been guaranteed
   312	 * be free of poison(s) by a prior call to dax_direct_access() on the
   313	 * caller stack.
   314	 * However with the introduction of DAXDEV_F_RECOVERY, the 'read'/'write'
   315	 * range may contain poison(s), so the functions perform explicit check
   316	 * on poison, and 'read' end up fetching only non-poisoned page(s) up
   317	 * till  the first poison is encountered while 'write' require the range
   318	 * is page aligned in order to restore the poisoned page's memory type
   319	 * back to "rw" after clearing the poison(s).
   320	 * In the event of poison related failure, (size_t) -EIO is returned and
   321	 * caller may check the return value after casting it to (ssize_t).
   322	 */
   323	static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
   324		void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
   325	{
   326		phys_addr_t pmem_off;
   327		size_t len, lead_off;
   328		struct pmem_device *pmem = dax_get_private(dax_dev);
   329		struct device *dev = pmem->bb.dev;
   330	
   331		if (flags & DAXDEV_F_RECOVERY) {
   332			lead_off = (unsigned long)addr & ~PAGE_MASK;
   333			len = PFN_PHYS(PFN_UP(lead_off + bytes));
   334			if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
   335				if (lead_off || !(PAGE_ALIGNED(bytes))) {
   336					dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
 > 337						addr, bytes);
   338					return (size_t) -EIO;
   339				}
   340				pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
   341				if (pmem_clear_poison(pmem, pmem_off, bytes) !=
   342						BLK_STS_OK)
   343					return (size_t) -EIO;
   344			}
   345		}
   346	
   347		return _copy_from_iter_flushcache(addr, bytes, i);
   348	}
   349	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel
diff mbox series

Patch

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e2a1c35108cd..c456f84d2f6f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -305,21 +305,83 @@  static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed to
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * However, with the introduction of DAXDEV_F_RECOVERY, the 'read'/'write'
+ * range may contain poison(s), so the functions explicitly check for
+ * poison: 'read' fetches only the non-poisoned page(s) up to the first
+ * poisoned page, while 'write' requires the range to be page aligned
+ * in order to restore the poisoned page's memory type
+ * back to "rw" after clearing the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (flags & DAXDEV_F_RECOVERY) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !(PAGE_ALIGNED(bytes))) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#zx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+					BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	void *addr, size_t bytes, struct iov_iter *i, unsigned long flags)
 {
-	return _copy_mc_to_iter(addr, bytes, i);
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (flags & DAXDEV_F_RECOVERY) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			/* NOTE(review): first_bad may precede sector (confirm); then nothing is readable */
+			if (bad_pfn <= pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n", pgoff);
+				return (size_t) -EIO;
+			}
+			/* Shorten the read so it stops at the first poisoned page. */
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, adj_len > bytes, "out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
+	return _copy_mc_to_iter(addr, adj_len, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {
diff --git a/fs/dax.c b/fs/dax.c
index 69433c6cd6c4..b9286668dc46 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1246,6 +1246,11 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
 					map_len, iter, dax_flag);
 
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
+
 		pos += xfer;
 		length -= xfer;
 		done += xfer;