diff mbox series

[v2,2/2] dax,pmem: Implement pmem based dax data recovery

Message ID 20211106011638.2613039-3-jane.chu@oracle.com (mailing list archive)
State New, archived
Headers show
Series Dax poison recovery | expand

Commit Message

Jane Chu Nov. 6, 2021, 1:16 a.m. UTC
For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
dax_direct_access to translate 'kaddr' over a range that
may contain poison(s); and enable dax_copy_to_iter to
read as much data as possible up till a poisoned page is
encountered; and enable dax_copy_from_iter to clear poison
among a page-aligned range, and then write the good data over.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/md/dm.c       |  2 ++
 drivers/nvdimm/pmem.c | 75 ++++++++++++++++++++++++++++++++++++++++---
 fs/dax.c              | 24 +++++++++++---
 3 files changed, 92 insertions(+), 9 deletions(-)

Comments

Darrick J. Wong Nov. 6, 2021, 2:04 a.m. UTC | #1
On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
> For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
> dax_direct_access to translate 'kaddr' over a range that
> may contain poison(s); and enable dax_copy_to_iter to
> read as much data as possible up till a poisoned page is
> encountered; and enable dax_copy_from_iter to clear poison
> among a page-aligned range, and then write the good data over.
> 
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  drivers/md/dm.c       |  2 ++
>  drivers/nvdimm/pmem.c | 75 ++++++++++++++++++++++++++++++++++++++++---
>  fs/dax.c              | 24 +++++++++++---
>  3 files changed, 92 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index dc354db22ef9..9b3dac916f22 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>  	if (!ti)
>  		goto out;
>  	if (!ti->type->dax_copy_from_iter) {
> +		WARN_ON(mode == DAX_OP_RECOVERY);
>  		ret = copy_from_iter(addr, bytes, i);
>  		goto out;
>  	}
> @@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>  	if (!ti)
>  		goto out;
>  	if (!ti->type->dax_copy_to_iter) {
> +		WARN_ON(mode == DAX_OP_RECOVERY);

Maybe just return -EOPNOTSUPP here?

Warnings are kinda loud.

>  		ret = copy_to_iter(addr, bytes, i);
>  		goto out;
>  	}
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 3dc99e0bf633..8ae6aa678c51 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
>  	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
>  
>  	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
> -					PFN_PHYS(nr_pages))))
> +				 PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
>  		return -EIO;
>  
>  	if (kaddr)
> @@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>  }
>  
>  /*
> - * Use the 'no check' versions of copy_from_iter_flushcache() and
> - * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
> - * checking, both file offset and device offset, is handled by
> - * dax_iomap_actor()
> + * Even though the 'no check' versions of copy_from_iter_flushcache()
> + * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
> + * 'read'/'write' aren't always safe when poison is consumed. They happen
> + * to be safe because the 'read'/'write' range has been guaranteed
> + * be free of poison(s) by a prior call to dax_direct_access() on the
> + * caller stack.
> + * But on a data recovery code path, the 'read'/'write' range is expected
> + * to contain poison(s), and so poison(s) is explicit checked, such that
> + * 'read' can fetch data from clean page(s) up till the first poison is
> + * encountered, and 'write' requires the range be page aligned in order
> + * to restore the poisoned page's memory type back to "rw" after clearing
> + * the poison(s).
> + * In the event of poison related failure, (size_t) -EIO is returned and
> + * caller may check the return value after casting it to (ssize_t).
> + *
> + * TODO: add support for CPUs that support MOVDIR64B instruction for
> + * faster poison clearing, and possibly smaller error blast radius.

I get that it's still early days yet for whatever pmem stuff is going on
for 5.17, but I feel like this ought to be a separate function called by
pmem_copy_from_iter, with this huge comment attached to that recovery
function.

>   */
>  static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>  		void *addr, size_t bytes, struct iov_iter *i, int mode)
>  {
> +	phys_addr_t pmem_off;
> +	size_t len, lead_off;
> +	struct pmem_device *pmem = dax_get_private(dax_dev);
> +	struct device *dev = pmem->bb.dev;
> +
> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> +					addr, bytes);
> +				return (size_t) -EIO;
> +			}
> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> +						BLK_STS_OK)
> +				return (size_t) -EIO;

Looks reasonable enough to me, though you might want to restructure this
to reduce the amount of indent.

FWIW I dislike how is_bad_pmem mixes units (sector_t vs. bytes), that
was seriously confusing.  But I guess that's a weird quirk of the
badblocks API and .... ugh.

(I dunno, can we at least clean up the nvdimm parts and some day replace
the badblocks backend with something that can handle more than 16
records?  interval_tree is more than up to that task, I know, I use it
for xfs online fsck...)

> +		}
> +	}
> +
>  	return _copy_from_iter_flushcache(addr, bytes, i);
>  }
>  
>  static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>  		void *addr, size_t bytes, struct iov_iter *i, int mode)
>  {
> +	int num_bad;
> +	size_t len, lead_off;
> +	unsigned long bad_pfn;
> +	bool bad_pmem = false;
> +	size_t adj_len = bytes;
> +	sector_t sector, first_bad;
> +	struct pmem_device *pmem = dax_get_private(dax_dev);
> +	struct device *dev = pmem->bb.dev;
> +
> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
> +		sector = PFN_PHYS(pgoff) / 512;
> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
> +		if (pmem->bb.count)
> +			bad_pmem = !!badblocks_check(&pmem->bb, sector,
> +					len / 512, &first_bad, &num_bad);
> +		if (bad_pmem) {
> +			bad_pfn = PHYS_PFN(first_bad * 512);
> +			if (bad_pfn == pgoff) {
> +				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
> +					pgoff);
> +				return -EIO;
> +			}
> +			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
> +			dev_WARN_ONCE(dev, (adj_len > bytes),
> +					"out-of-range first_bad?");
> +		}
> +		if (adj_len == 0)
> +			return (size_t) -EIO;

Uh, are we supposed to adjust bytes here or something?

--D

> +	}
> +
>  	return _copy_mc_to_iter(addr, bytes, i);
>  }
>  
> diff --git a/fs/dax.c b/fs/dax.c
> index bea6df1498c3..7640be6b6a97 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1219,6 +1219,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>  		unsigned offset = pos & (PAGE_SIZE - 1);
>  		const size_t size = ALIGN(length + offset, PAGE_SIZE);
>  		const sector_t sector = dax_iomap_sector(iomap, pos);
> +		long nr_page = PHYS_PFN(size);
> +		int dax_mode = DAX_OP_NORMAL;
>  		ssize_t map_len;
>  		pgoff_t pgoff;
>  		void *kaddr;
> @@ -1232,8 +1234,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>  		if (ret)
>  			break;
>  
> -		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> -					    DAX_OP_NORMAL, &kaddr, NULL);
> +		map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
> +					    &kaddr, NULL);
> +		if (unlikely(map_len == -EIO)) {
> +			dax_mode = DAX_OP_RECOVERY;
> +			map_len = dax_direct_access(dax_dev, pgoff, nr_page,
> +						    dax_mode, &kaddr, NULL);
> +		}
>  		if (map_len < 0) {
>  			ret = map_len;
>  			break;
> @@ -1252,11 +1259,20 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>  		 */
>  		if (iov_iter_rw(iter) == WRITE)
>  			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
> -					map_len, iter, DAX_OP_NORMAL);
> +					map_len, iter, dax_mode);
>  		else
>  			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
> -					map_len, iter, DAX_OP_NORMAL);
> +					map_len, iter, dax_mode);
>  
> +		/*
> +		 * If dax data recovery is enacted via DAX_OP_RECOVERY,
> +		 * recovery failure would be indicated by a -EIO return
> +		 * in 'xfer' casted as (size_t).
> +		 */
> +		if ((ssize_t)xfer == -EIO) {
> +			ret = -EIO;
> +			break;
> +		}
>  		pos += xfer;
>  		length -= xfer;
>  		done += xfer;
> -- 
> 2.18.4
>
Jane Chu Nov. 8, 2021, 8:53 p.m. UTC | #2
On 11/5/2021 7:04 PM, Darrick J. Wong wrote:
<snip>
>>
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index dc354db22ef9..9b3dac916f22 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>   	if (!ti)
>>   		goto out;
>>   	if (!ti->type->dax_copy_from_iter) {
>> +		WARN_ON(mode == DAX_OP_RECOVERY);
>>   		ret = copy_from_iter(addr, bytes, i);
>>   		goto out;
>>   	}
>> @@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>   	if (!ti)
>>   		goto out;
>>   	if (!ti->type->dax_copy_to_iter) {
>> +		WARN_ON(mode == DAX_OP_RECOVERY);
> 
> Maybe just return -EOPNOTSUPP here?
> 
> Warnings are kinda loud.
> 

Indeed.  Looks like the
   "if (!ti->type->dax_copy_to_iter) {"
clause was to allow mixed dax targets in dm, such as dcss, fuse and
virtio_fs targets. These targets either don't export
.dax_copy_from/to_iter, or don't need to.
And their .dax_direct_access don't check poison, and can't repair
poison anyway.

I think these targets may safely ignore the flag.  However, returning
-EOPNOTSUPP is helpful to catch future bug, such as someone add a
method to detect poison, but didn't add a method to clear poison, in
that case, we fail the call.

Dan, do you have a preference?

thanks!
-jane
Jane Chu Nov. 8, 2021, 9 p.m. UTC | #3
On 11/5/2021 7:04 PM, Darrick J. Wong wrote:
<snip>
>> @@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
>>   }
>>   
>>   /*
>> - * Use the 'no check' versions of copy_from_iter_flushcache() and
>> - * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
>> - * checking, both file offset and device offset, is handled by
>> - * dax_iomap_actor()
>> + * Even though the 'no check' versions of copy_from_iter_flushcache()
>> + * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
>> + * 'read'/'write' aren't always safe when poison is consumed. They happen
>> + * to be safe because the 'read'/'write' range has been guaranteed
>> + * be free of poison(s) by a prior call to dax_direct_access() on the
>> + * caller stack.
>> + * But on a data recovery code path, the 'read'/'write' range is expected
>> + * to contain poison(s), and so poison(s) is explicit checked, such that
>> + * 'read' can fetch data from clean page(s) up till the first poison is
>> + * encountered, and 'write' requires the range be page aligned in order
>> + * to restore the poisoned page's memory type back to "rw" after clearing
>> + * the poison(s).
>> + * In the event of poison related failure, (size_t) -EIO is returned and
>> + * caller may check the return value after casting it to (ssize_t).
>> + *
>> + * TODO: add support for CPUs that support MOVDIR64B instruction for
>> + * faster poison clearing, and possibly smaller error blast radius.
> 
> I get that it's still early days yet for whatever pmem stuff is going on
> for 5.17, but I feel like this ought to be a separate function called by
> pmem_copy_from_iter, with this huge comment attached to that recovery
> function.

Thanks, will refactor both functions.

> 
>>    */
>>   static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>   		void *addr, size_t bytes, struct iov_iter *i, int mode)
>>   {
>> +	phys_addr_t pmem_off;
>> +	size_t len, lead_off;
>> +	struct pmem_device *pmem = dax_get_private(dax_dev);
>> +	struct device *dev = pmem->bb.dev;
>> +
>> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
>> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>> +					addr, bytes);
>> +				return (size_t) -EIO;
>> +			}
>> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>> +						BLK_STS_OK)
>> +				return (size_t) -EIO;
> 
> Looks reasonable enough to me, though you might want to restructure this
> to reduce the amount of indent.

Agreed.

> 
> FWIW I dislike how is_bad_pmem mixes units (sector_t vs. bytes), that
> was seriously confusing.  But I guess that's a weird quirk of the
> badblocks API and .... ugh.
> 
> (I dunno, can we at least clean up the nvdimm parts and some day replace
> the badblocks backend with something that can handle more than 16
> records?  interval_tree is more than up to that task, I know, I use it
> for xfs online fsck...)

Let me look into this and get back to you.

> 
>> +		}
>> +	}
>> +
>>   	return _copy_from_iter_flushcache(addr, bytes, i);
>>   }
>>   
>>   static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>   		void *addr, size_t bytes, struct iov_iter *i, int mode)
>>   {
>> +	int num_bad;
>> +	size_t len, lead_off;
>> +	unsigned long bad_pfn;
>> +	bool bad_pmem = false;
>> +	size_t adj_len = bytes;
>> +	sector_t sector, first_bad;
>> +	struct pmem_device *pmem = dax_get_private(dax_dev);
>> +	struct device *dev = pmem->bb.dev;
>> +
>> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
>> +		sector = PFN_PHYS(pgoff) / 512;
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (pmem->bb.count)
>> +			bad_pmem = !!badblocks_check(&pmem->bb, sector,
>> +					len / 512, &first_bad, &num_bad);
>> +		if (bad_pmem) {
>> +			bad_pfn = PHYS_PFN(first_bad * 512);
>> +			if (bad_pfn == pgoff) {
>> +				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
>> +					pgoff);
>> +				return -EIO;
>> +			}
>> +			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
>> +			dev_WARN_ONCE(dev, (adj_len > bytes),
>> +					"out-of-range first_bad?");
>> +		}
>> +		if (adj_len == 0)
>> +			return (size_t) -EIO;
> 
> Uh, are we supposed to adjust bytes here or something?

Because we're trying to read as much data as possible...
What is your concern here?

thanks!
-jane

> 
> --D
> 
>> +	}
>> +
>>   	return _copy_mc_to_iter(addr, bytes, i);
>>   }
>>
Christoph Hellwig Nov. 9, 2021, 7:27 a.m. UTC | #4
On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
>  static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>  		void *addr, size_t bytes, struct iov_iter *i, int mode)
>  {
> +	phys_addr_t pmem_off;
> +	size_t len, lead_off;
> +	struct pmem_device *pmem = dax_get_private(dax_dev);
> +	struct device *dev = pmem->bb.dev;
> +
> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> +					addr, bytes);
> +				return (size_t) -EIO;
> +			}
> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> +						BLK_STS_OK)
> +				return (size_t) -EIO;
> +		}
> +	}

This is in the wrong spot.  As seen in my WIP series individual drivers
really should not hook into copying to and from the iter, because it
really is just one way to write to a nvdimm.  How would dm-writecache
clear the errors with this scheme?

So IMHO going back to the separate recovery method as in your previous
patch really is the way to go.  If/when the 64-bit store happens we
need to figure out a good way to clear the bad block list for that.
Dan Williams Nov. 9, 2021, 6:48 p.m. UTC | #5
On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
> >  static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
> >               void *addr, size_t bytes, struct iov_iter *i, int mode)
> >  {
> > +     phys_addr_t pmem_off;
> > +     size_t len, lead_off;
> > +     struct pmem_device *pmem = dax_get_private(dax_dev);
> > +     struct device *dev = pmem->bb.dev;
> > +
> > +     if (unlikely(mode == DAX_OP_RECOVERY)) {
> > +             lead_off = (unsigned long)addr & ~PAGE_MASK;
> > +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
> > +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> > +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
> > +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> > +                                     addr, bytes);
> > +                             return (size_t) -EIO;
> > +                     }
> > +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> > +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> > +                                             BLK_STS_OK)
> > +                             return (size_t) -EIO;
> > +             }
> > +     }
>
> This is in the wrong spot.  As seen in my WIP series individual drivers
> really should not hook into copying to and from the iter, because it
> really is just one way to write to a nvdimm.  How would dm-writecache
> clear the errors with this scheme?
>
> So IMHO going back to the separate recovery method as in your previous
> patch really is the way to go.  If/when the 64-bit store happens we
> need to figure out a good way to clear the bad block list for that.

I think we just make error management a first class citizen of a
dax-device and stop abstracting it behind a driver callback. That way
the driver that registers the dax-device can optionally register error
management as well. Then fsdax path can do:

        rc = dax_direct_access(..., &kaddr, ...);
        if (unlikely(rc)) {
                kaddr = dax_mk_recovery(kaddr);
                dax_direct_access(..., &kaddr, ...);
                return dax_recovery_{read,write}(..., kaddr, ...);
        }
        return copy_{mc_to_iter,from_iter_flushcache}(...);

Where, the recovery version of dax_direct_access() has the opportunity
to change the page permissions / use an alias mapping for the access,
dax_recovery_read() allows reading the good cachelines out of a
poisoned page, and dax_recovery_write() coordinates error list
management and returning a poison page to full write-back caching
operation when no more poisoned cacheline are detected in the page.
Jane Chu Nov. 9, 2021, 7:14 p.m. UTC | #6
On 11/8/2021 11:27 PM, Christoph Hellwig wrote:
> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
>>   static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>   		void *addr, size_t bytes, struct iov_iter *i, int mode)
>>   {
>> +	phys_addr_t pmem_off;
>> +	size_t len, lead_off;
>> +	struct pmem_device *pmem = dax_get_private(dax_dev);
>> +	struct device *dev = pmem->bb.dev;
>> +
>> +	if (unlikely(mode == DAX_OP_RECOVERY)) {
>> +		lead_off = (unsigned long)addr & ~PAGE_MASK;
>> +		len = PFN_PHYS(PFN_UP(lead_off + bytes));
>> +		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>> +			if (lead_off || !(PAGE_ALIGNED(bytes))) {
>> +				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>> +					addr, bytes);
>> +				return (size_t) -EIO;
>> +			}
>> +			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>> +			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>> +						BLK_STS_OK)
>> +				return (size_t) -EIO;
>> +		}
>> +	}
> 
> This is in the wrong spot.  As seen in my WIP series individual drivers
> really should not hook into copying to and from the iter, because it
> really is just one way to write to a nvdimm.  How would dm-writecache
> clear the errors with this scheme?

How does dm-writecache detect and clear errors today?

> 
> So IMHO going back to the separate recovery method as in your previous
> patch really is the way to go.  If/when the 64-bit store happens we
> need to figure out a good way to clear the bad block list for that.
> 
Yeah, the separate .dax_clear_poison API may not be arbitrarily called
though. It must be followed by a 'write' operation, and so, unless we
bind the two operations together, how do we enforce that to avoid
silent data corruption?

thanks!
-jane
Christoph Hellwig Nov. 9, 2021, 7:52 p.m. UTC | #7
On Tue, Nov 09, 2021 at 10:48:51AM -0800, Dan Williams wrote:
> I think we just make error management a first class citizen of a
> dax-device and stop abstracting it behind a driver callback. That way
> the driver that registers the dax-device can optionally register error
> management as well. Then fsdax path can do:

This sound pretty sensible.
Jane Chu Nov. 9, 2021, 7:58 p.m. UTC | #8
On 11/9/2021 10:48 AM, Dan Williams wrote:
> On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
>>
>> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
>>>   static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>>                void *addr, size_t bytes, struct iov_iter *i, int mode)
>>>   {
>>> +     phys_addr_t pmem_off;
>>> +     size_t len, lead_off;
>>> +     struct pmem_device *pmem = dax_get_private(dax_dev);
>>> +     struct device *dev = pmem->bb.dev;
>>> +
>>> +     if (unlikely(mode == DAX_OP_RECOVERY)) {
>>> +             lead_off = (unsigned long)addr & ~PAGE_MASK;
>>> +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
>>> +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>>> +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
>>> +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>>> +                                     addr, bytes);
>>> +                             return (size_t) -EIO;
>>> +                     }
>>> +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>>> +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>>> +                                             BLK_STS_OK)
>>> +                             return (size_t) -EIO;
>>> +             }
>>> +     }
>>
>> This is in the wrong spot.  As seen in my WIP series individual drivers
>> really should not hook into copying to and from the iter, because it
>> really is just one way to write to a nvdimm.  How would dm-writecache
>> clear the errors with this scheme?
>>
>> So IMHO going back to the separate recovery method as in your previous
>> patch really is the way to go.  If/when the 64-bit store happens we
>> need to figure out a good way to clear the bad block list for that.
> 
> I think we just make error management a first class citizen of a
> dax-device and stop abstracting it behind a driver callback. That way
> the driver that registers the dax-device can optionally register error
> management as well. Then fsdax path can do:
> 
>          rc = dax_direct_access(..., &kaddr, ...);
>          if (unlikely(rc)) {
>                  kaddr = dax_mk_recovery(kaddr);

Sorry, what does dax_mk_recovery(kaddr) do?

>                  dax_direct_access(..., &kaddr, ...);
>                  return dax_recovery_{read,write}(..., kaddr, ...);
>          }
>          return copy_{mc_to_iter,from_iter_flushcache}(...);
> 
> Where, the recovery version of dax_direct_access() has the opportunity
> to change the page permissions / use an alias mapping for the access,

again, sorry, what 'page permissions'?  memory_failure_dev_pagemap()
changes the poisoned page mem_type from 'rw' to 'uc-' (should be NP?),
do you mean to reverse the change?

> dax_recovery_read() allows reading the good cachelines out of a
> poisoned page, and dax_recovery_write() coordinates error list
> management and returning a poison page to full write-back caching
> operation when no more poisoned cacheline are detected in the page.
> 

How about to introduce 3 dax_recover_ APIs:
   dax_recover_direct_access(): similar to dax_direct_access except
      it ignores error list and return the kaddr, and hence is also
      optional, exported by device driver that has the ability to
      detect error;
   dax_recovery_read(): optional, supported by pmem driver only,
      reads as much data as possible up to the poisoned page;
   dax_recovery_write(): optional, supported by pmem driver only,
      first clear-poison, then write.

Should we worry about the dm targets?

Both dax_recovery_read/write() are hooked up to dax_iomap_iter().

Thanks,
-jane
Dan Williams Nov. 9, 2021, 9:02 p.m. UTC | #9
On Tue, Nov 9, 2021 at 11:59 AM Jane Chu <jane.chu@oracle.com> wrote:
>
> On 11/9/2021 10:48 AM, Dan Williams wrote:
> > On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
> >>
> >> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
> >>>   static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
> >>>                void *addr, size_t bytes, struct iov_iter *i, int mode)
> >>>   {
> >>> +     phys_addr_t pmem_off;
> >>> +     size_t len, lead_off;
> >>> +     struct pmem_device *pmem = dax_get_private(dax_dev);
> >>> +     struct device *dev = pmem->bb.dev;
> >>> +
> >>> +     if (unlikely(mode == DAX_OP_RECOVERY)) {
> >>> +             lead_off = (unsigned long)addr & ~PAGE_MASK;
> >>> +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
> >>> +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> >>> +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
> >>> +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> >>> +                                     addr, bytes);
> >>> +                             return (size_t) -EIO;
> >>> +                     }
> >>> +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> >>> +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> >>> +                                             BLK_STS_OK)
> >>> +                             return (size_t) -EIO;
> >>> +             }
> >>> +     }
> >>
> >> This is in the wrong spot.  As seen in my WIP series individual drivers
> >> really should not hook into copying to and from the iter, because it
> >> really is just one way to write to a nvdimm.  How would dm-writecache
> >> clear the errors with this scheme?
> >>
> >> So IMHO going back to the separate recovery method as in your previous
> >> patch really is the way to go.  If/when the 64-bit store happens we
> >> need to figure out a good way to clear the bad block list for that.
> >
> > I think we just make error management a first class citizen of a
> > dax-device and stop abstracting it behind a driver callback. That way
> > the driver that registers the dax-device can optionally register error
> > management as well. Then fsdax path can do:
> >
> >          rc = dax_direct_access(..., &kaddr, ...);
> >          if (unlikely(rc)) {
> >                  kaddr = dax_mk_recovery(kaddr);
>
> Sorry, what does dax_mk_recovery(kaddr) do?

I was thinking this just does the hackery to set a flag bit in the
pointer, something like:

return (void *) ((unsigned long) kaddr | DAX_RECOVERY)

>
> >                  dax_direct_access(..., &kaddr, ...);
> >                  return dax_recovery_{read,write}(..., kaddr, ...);
> >          }
> >          return copy_{mc_to_iter,from_iter_flushcache}(...);
> >
> > Where, the recovery version of dax_direct_access() has the opportunity
> > to change the page permissions / use an alias mapping for the access,
>
> again, sorry, what 'page permissions'?  memory_failure_dev_pagemap()
> changes the poisoned page mem_type from 'rw' to 'uc-' (should be NP?),
> do you mean to reverse the change?

Right, the result of the conversation with Boris is that
memory_failure() should mark the page as NP in call cases, so
dax_direct_access() needs to create a UC mapping and
dax_recover_{read,write}() would sink that operation and either return
the page to NP after the access completes, or convert it to WB if the
operation cleared the error.

> > dax_recovery_read() allows reading the good cachelines out of a
> > poisoned page, and dax_recovery_write() coordinates error list
> > management and returning a poison page to full write-back caching
> > operation when no more poisoned cacheline are detected in the page.
> >
>
> How about to introduce 3 dax_recover_ APIs:
>    dax_recover_direct_access(): similar to dax_direct_access except
>       it ignores error list and return the kaddr, and hence is also
>       optional, exported by device driver that has the ability to
>       detect error;
>    dax_recovery_read(): optional, supported by pmem driver only,
>       reads as much data as possible up to the poisoned page;

It wouldn't be a property of the pmem driver, I expect it would be a
flag on the dax device whether to attempt recovery or not. I.e. get
away from this being a pmem callback and make this a native capability
of a dax device.

>    dax_recovery_write(): optional, supported by pmem driver only,
>       first clear-poison, then write.
>
> Should we worry about the dm targets?

The dm targets after Christoph's conversion should be able to do all
the translation at direct access time and then dax_recovery_X can be
done on the resulting already translated kaddr.

> Both dax_recovery_read/write() are hooked up to dax_iomap_iter().

Yes.
Jane Chu Nov. 10, 2021, 6:26 p.m. UTC | #10
On 11/9/2021 1:02 PM, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 11:59 AM Jane Chu <jane.chu@oracle.com> wrote:
>>
>> On 11/9/2021 10:48 AM, Dan Williams wrote:
>>> On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
>>>>
>>>> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
>>>>>    static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>>>>                 void *addr, size_t bytes, struct iov_iter *i, int mode)
>>>>>    {
>>>>> +     phys_addr_t pmem_off;
>>>>> +     size_t len, lead_off;
>>>>> +     struct pmem_device *pmem = dax_get_private(dax_dev);
>>>>> +     struct device *dev = pmem->bb.dev;
>>>>> +
>>>>> +     if (unlikely(mode == DAX_OP_RECOVERY)) {
>>>>> +             lead_off = (unsigned long)addr & ~PAGE_MASK;
>>>>> +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
>>>>> +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>>>>> +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
>>>>> +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>>>>> +                                     addr, bytes);
>>>>> +                             return (size_t) -EIO;
>>>>> +                     }
>>>>> +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>>>>> +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>>>>> +                                             BLK_STS_OK)
>>>>> +                             return (size_t) -EIO;
>>>>> +             }
>>>>> +     }
>>>>
>>>> This is in the wrong spot.  As seen in my WIP series individual drivers
>>>> really should not hook into copying to and from the iter, because it
>>>> really is just one way to write to a nvdimm.  How would dm-writecache
>>>> clear the errors with this scheme?
>>>>
>>>> So IMHO going back to the separate recovery method as in your previous
>>>> patch really is the way to go.  If/when the 64-bit store happens we
>>>> need to figure out a good way to clear the bad block list for that.
>>>
>>> I think we just make error management a first class citizen of a
>>> dax-device and stop abstracting it behind a driver callback. That way
>>> the driver that registers the dax-device can optionally register error
>>> management as well. Then fsdax path can do:
>>>
>>>           rc = dax_direct_access(..., &kaddr, ...);
>>>           if (unlikely(rc)) {
>>>                   kaddr = dax_mk_recovery(kaddr);
>>
>> Sorry, what does dax_mk_recovery(kaddr) do?
> 
> I was thinking this just does the hackery to set a flag bit in the
> pointer, something like:
> 
> return (void *) ((unsigned long) kaddr | DAX_RECOVERY)

Okay, how about call it dax_prep_recovery()?

> 
>>
>>>                   dax_direct_access(..., &kaddr, ...);
>>>                   return dax_recovery_{read,write}(..., kaddr, ...);
>>>           }
>>>           return copy_{mc_to_iter,from_iter_flushcache}(...);
>>>
>>> Where, the recovery version of dax_direct_access() has the opportunity
>>> to change the page permissions / use an alias mapping for the access,
>>
>> again, sorry, what 'page permissions'?  memory_failure_dev_pagemap()
>> changes the poisoned page mem_type from 'rw' to 'uc-' (should be NP?),
>> do you mean to reverse the change?
> 
> Right, the result of the conversation with Boris is that
> memory_failure() should mark the page as NP in call cases, so
> dax_direct_access() needs to create a UC mapping and
> dax_recover_{read,write}() would sink that operation and either return
> the page to NP after the access completes, or convert it to WB if the
> operation cleared the error.

Okay,  will add a patch to fix set_mce_nospec().

How about moving set_memory_uc() and set_memory_np() down to
dax_recovery_read(), so that we don't split the set_memory_X calls
over different APIs, because we can't enforce what follows
dax_direct_access()?

> 
>>> dax_recovery_read() allows reading the good cachelines out of a
>>> poisoned page, and dax_recovery_write() coordinates error list
>>> management and returning a poison page to full write-back caching
>>> operation when no more poisoned cacheline are detected in the page.
>>>
>>
>> How about to introduce 3 dax_recover_ APIs:
>>     dax_recover_direct_access(): similar to dax_direct_access except
>>        it ignores error list and return the kaddr, and hence is also
>>        optional, exported by device driver that has the ability to
>>        detect error;
>>     dax_recovery_read(): optional, supported by pmem driver only,
>>        reads as much data as possible up to the poisoned page;
> 
> It wouldn't be a property of the pmem driver, I expect it would be a
> flag on the dax device whether to attempt recovery or not. I.e. get
> away from this being a pmem callback and make this a native capability
> of a dax device.
> 
>>     dax_recovery_write(): optional, supported by pmem driver only,
>>        first clear-poison, then write.
>>
>> Should we worry about the dm targets?
> 
> The dm targets after Christoph's conversion should be able to do all
> the translation at direct access time and then dax_recovery_X can be
> done on the resulting already translated kaddr.

I'm thinking about the mixed device dm where some provides
dax_recovery_X, others don't, in which case we don't allow
dax recovery because that causes confusion? or should we still
allow recovery for part of the mixed devices?

> 
>> Both dax_recovery_read/write() are hooked up to dax_iomap_iter().
> 
> Yes.
> 

thanks!
-jane
Mike Snitzer Nov. 12, 2021, 3:36 p.m. UTC | #11
On Wed, Nov 10 2021 at  1:26P -0500,
Jane Chu <jane.chu@oracle.com> wrote:

> On 11/9/2021 1:02 PM, Dan Williams wrote:
> > On Tue, Nov 9, 2021 at 11:59 AM Jane Chu <jane.chu@oracle.com> wrote:
> >>
> >> On 11/9/2021 10:48 AM, Dan Williams wrote:
> >>> On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
> >>>>
> >>>> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
> >>>>>    static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
> >>>>>                 void *addr, size_t bytes, struct iov_iter *i, int mode)
> >>>>>    {
> >>>>> +     phys_addr_t pmem_off;
> >>>>> +     size_t len, lead_off;
> >>>>> +     struct pmem_device *pmem = dax_get_private(dax_dev);
> >>>>> +     struct device *dev = pmem->bb.dev;
> >>>>> +
> >>>>> +     if (unlikely(mode == DAX_OP_RECOVERY)) {
> >>>>> +             lead_off = (unsigned long)addr & ~PAGE_MASK;
> >>>>> +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
> >>>>> +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
> >>>>> +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
> >>>>> +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
> >>>>> +                                     addr, bytes);
> >>>>> +                             return (size_t) -EIO;
> >>>>> +                     }
> >>>>> +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
> >>>>> +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
> >>>>> +                                             BLK_STS_OK)
> >>>>> +                             return (size_t) -EIO;
> >>>>> +             }
> >>>>> +     }
> >>>>
> >>>> This is in the wrong spot.  As seen in my WIP series individual drivers
> >>>> really should not hook into copying to and from the iter, because it
> >>>> really is just one way to write to a nvdimm.  How would dm-writecache
> >>>> clear the errors with this scheme?
> >>>>
> >>>> So IMHO going back to the separate recovery method as in your previous
> >>>> patch really is the way to go.  If/when the 64-bit store happens we
> >>>> need to figure out a good way to clear the bad block list for that.
> >>>
> >>> I think we just make error management a first class citizen of a
> >>> dax-device and stop abstracting it behind a driver callback. That way
> >>> the driver that registers the dax-device can optionally register error
> >>> management as well. Then fsdax path can do:
> >>>
> >>>           rc = dax_direct_access(..., &kaddr, ...);
> >>>           if (unlikely(rc)) {
> >>>                   kaddr = dax_mk_recovery(kaddr);
> >>
> >> Sorry, what does dax_mk_recovery(kaddr) do?
> > 
> > I was thinking this just does the hackery to set a flag bit in the
> > pointer, something like:
> > 
> > return (void *) ((unsigned long) kaddr | DAX_RECOVERY)
> 
> Okay, how about call it dax_prep_recovery()?
> 
> > 
> >>
> >>>                   dax_direct_access(..., &kaddr, ...);
> >>>                   return dax_recovery_{read,write}(..., kaddr, ...);
> >>>           }
> >>>           return copy_{mc_to_iter,from_iter_flushcache}(...);
> >>>
> >>> Where, the recovery version of dax_direct_access() has the opportunity
> >>> to change the page permissions / use an alias mapping for the access,
> >>
> >> again, sorry, what 'page permissions'?  memory_failure_dev_pagemap()
> >> changes the poisoned page mem_type from 'rw' to 'uc-' (should be NP?),
> >> do you mean to reverse the change?
> > 
> > Right, the result of the conversation with Boris is that
> > memory_failure() should mark the page as NP in call cases, so
> > dax_direct_access() needs to create a UC mapping and
> > dax_recover_{read,write}() would sink that operation and either return
> > the page to NP after the access completes, or convert it to WB if the
> > operation cleared the error.
> 
> Okay,  will add a patch to fix set_mce_nospec().
> 
> How about moving set_memory_uc() and set_memory_np() down to
> dax_recovery_read(), so that we don't split the set_memory_X calls
> over different APIs, because we can't enforce what follows
> dax_direct_access()?
> 
> > 
> >>> dax_recovery_read() allows reading the good cachelines out of a
> >>> poisoned page, and dax_recovery_write() coordinates error list
> >>> management and returning a poison page to full write-back caching
> >>> operation when no more poisoned cacheline are detected in the page.
> >>>
> >>
> >> How about to introduce 3 dax_recover_ APIs:
> >>     dax_recover_direct_access(): similar to dax_direct_access except
> >>        it ignores error list and return the kaddr, and hence is also
> >>        optional, exported by device driver that has the ability to
> >>        detect error;
> >>     dax_recovery_read(): optional, supported by pmem driver only,
> >>        reads as much data as possible up to the poisoned page;
> > 
> > It wouldn't be a property of the pmem driver, I expect it would be a
> > flag on the dax device whether to attempt recovery or not. I.e. get
> > away from this being a pmem callback and make this a native capability
> > of a dax device.
> > 
> >>     dax_recovery_write(): optional, supported by pmem driver only,
> >>        first clear-poison, then write.
> >>
> >> Should we worry about the dm targets?
> > 
> > The dm targets after Christoph's conversion should be able to do all
> > the translation at direct access time and then dax_recovery_X can be
> > done on the resulting already translated kaddr.
> 
> I'm thinking about the mixed device dm where some provides
> dax_recovery_X, others don't, in which case we don't allow
> dax recovery because that causes confusion? or should we still
> allow recovery for part of the mixed devices?

I really don't like the all or nothing approach if it can be avoided.
I would imagine that if recovery possible it best to support it even
if the DM device happens to span a mix of devices with varying support
for recovery.

Thanks,
Mike
Jane Chu Nov. 12, 2021, 6 p.m. UTC | #12
On 11/12/2021 7:36 AM, Mike Snitzer wrote:
> On Wed, Nov 10 2021 at  1:26P -0500,
> Jane Chu <jane.chu@oracle.com> wrote:
> 
>> On 11/9/2021 1:02 PM, Dan Williams wrote:
>>> On Tue, Nov 9, 2021 at 11:59 AM Jane Chu <jane.chu@oracle.com> wrote:
>>>>
>>>> On 11/9/2021 10:48 AM, Dan Williams wrote:
>>>>> On Mon, Nov 8, 2021 at 11:27 PM Christoph Hellwig <hch@infradead.org> wrote:
>>>>>>
>>>>>> On Fri, Nov 05, 2021 at 07:16:38PM -0600, Jane Chu wrote:
>>>>>>>     static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>>>>>>>                  void *addr, size_t bytes, struct iov_iter *i, int mode)
>>>>>>>     {
>>>>>>> +     phys_addr_t pmem_off;
>>>>>>> +     size_t len, lead_off;
>>>>>>> +     struct pmem_device *pmem = dax_get_private(dax_dev);
>>>>>>> +     struct device *dev = pmem->bb.dev;
>>>>>>> +
>>>>>>> +     if (unlikely(mode == DAX_OP_RECOVERY)) {
>>>>>>> +             lead_off = (unsigned long)addr & ~PAGE_MASK;
>>>>>>> +             len = PFN_PHYS(PFN_UP(lead_off + bytes));
>>>>>>> +             if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
>>>>>>> +                     if (lead_off || !(PAGE_ALIGNED(bytes))) {
>>>>>>> +                             dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
>>>>>>> +                                     addr, bytes);
>>>>>>> +                             return (size_t) -EIO;
>>>>>>> +                     }
>>>>>>> +                     pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
>>>>>>> +                     if (pmem_clear_poison(pmem, pmem_off, bytes) !=
>>>>>>> +                                             BLK_STS_OK)
>>>>>>> +                             return (size_t) -EIO;
>>>>>>> +             }
>>>>>>> +     }
>>>>>>
>>>>>> This is in the wrong spot.  As seen in my WIP series individual drivers
>>>>>> really should not hook into copying to and from the iter, because it
>>>>>> really is just one way to write to a nvdimm.  How would dm-writecache
>>>>>> clear the errors with this scheme?
>>>>>>
>>>>>> So IMHO going back to the separate recovery method as in your previous
>>>>>> patch really is the way to go.  If/when the 64-bit store happens we
>>>>>> need to figure out a good way to clear the bad block list for that.
>>>>>
>>>>> I think we just make error management a first class citizen of a
>>>>> dax-device and stop abstracting it behind a driver callback. That way
>>>>> the driver that registers the dax-device can optionally register error
>>>>> management as well. Then fsdax path can do:
>>>>>
>>>>>            rc = dax_direct_access(..., &kaddr, ...);
>>>>>            if (unlikely(rc)) {
>>>>>                    kaddr = dax_mk_recovery(kaddr);
>>>>
>>>> Sorry, what does dax_mk_recovery(kaddr) do?
>>>
>>> I was thinking this just does the hackery to set a flag bit in the
>>> pointer, something like:
>>>
>>> return (void *) ((unsigned long) kaddr | DAX_RECOVERY)
>>
>> Okay, how about call it dax_prep_recovery()?
>>
>>>
>>>>
>>>>>                    dax_direct_access(..., &kaddr, ...);
>>>>>                    return dax_recovery_{read,write}(..., kaddr, ...);
>>>>>            }
>>>>>            return copy_{mc_to_iter,from_iter_flushcache}(...);
>>>>>
>>>>> Where, the recovery version of dax_direct_access() has the opportunity
>>>>> to change the page permissions / use an alias mapping for the access,
>>>>
>>>> again, sorry, what 'page permissions'?  memory_failure_dev_pagemap()
>>>> changes the poisoned page mem_type from 'rw' to 'uc-' (should be NP?),
>>>> do you mean to reverse the change?
>>>
>>> Right, the result of the conversation with Boris is that
>>> memory_failure() should mark the page as NP in call cases, so
>>> dax_direct_access() needs to create a UC mapping and
>>> dax_recover_{read,write}() would sink that operation and either return
>>> the page to NP after the access completes, or convert it to WB if the
>>> operation cleared the error.
>>
>> Okay,  will add a patch to fix set_mce_nospec().
>>
>> How about moving set_memory_uc() and set_memory_np() down to
>> dax_recovery_read(), so that we don't split the set_memory_X calls
>> over different APIs, because we can't enforce what follows
>> dax_direct_access()?
>>
>>>
>>>>> dax_recovery_read() allows reading the good cachelines out of a
>>>>> poisoned page, and dax_recovery_write() coordinates error list
>>>>> management and returning a poison page to full write-back caching
>>>>> operation when no more poisoned cacheline are detected in the page.
>>>>>
>>>>
>>>> How about to introduce 3 dax_recover_ APIs:
>>>>      dax_recover_direct_access(): similar to dax_direct_access except
>>>>         it ignores error list and return the kaddr, and hence is also
>>>>         optional, exported by device driver that has the ability to
>>>>         detect error;
>>>>      dax_recovery_read(): optional, supported by pmem driver only,
>>>>         reads as much data as possible up to the poisoned page;
>>>
>>> It wouldn't be a property of the pmem driver, I expect it would be a
>>> flag on the dax device whether to attempt recovery or not. I.e. get
>>> away from this being a pmem callback and make this a native capability
>>> of a dax device.
>>>
>>>>      dax_recovery_write(): optional, supported by pmem driver only,
>>>>         first clear-poison, then write.
>>>>
>>>> Should we worry about the dm targets?
>>>
>>> The dm targets after Christoph's conversion should be able to do all
>>> the translation at direct access time and then dax_recovery_X can be
>>> done on the resulting already translated kaddr.
>>
>> I'm thinking about the mixed device dm where some provides
>> dax_recovery_X, others don't, in which case we don't allow
>> dax recovery because that causes confusion? or should we still
>> allow recovery for part of the mixed devices?
> 
> I really don't like the all or nothing approach if it can be avoided.
> I would imagine that if recovery possible it best to support it even
> if the DM device happens to span a mix of devices with varying support
> for recovery.

Got it!

thanks!
-jane

> 
> Thanks,
> Mike
>
diff mbox series

Patch

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc354db22ef9..9b3dac916f22 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1043,6 +1043,7 @@  static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_from_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_from_iter(addr, bytes, i);
 		goto out;
 	}
@@ -1067,6 +1068,7 @@  static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_to_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_to_iter(addr, bytes, i);
 		goto out;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3dc99e0bf633..8ae6aa678c51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@  __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
+				 PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
 		return -EIO;
 
 	if (kaddr)
@@ -303,20 +303,85 @@  static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * But on a data recovery code path, the 'read'/'write' range is expected
+ * to contain poison(s), and so poison(s) is explicit checked, such that
+ * 'read' can fetch data from clean page(s) up till the first poison is
+ * encountered, and 'write' requires the range be page aligned in order
+ * to restore the poisoned page's memory type back to "rw" after clearing
+ * the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
+ *
+ * TODO: add support for CPUs that support MOVDIR64B instruction for
+ * faster poison clearing, and possibly smaller error blast radius.
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !(PAGE_ALIGNED(bytes))) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+						BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			if (bad_pfn == pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+					pgoff);
+				return -EIO;
+			}
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, (adj_len > bytes),
+					"out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
 	return _copy_mc_to_iter(addr, bytes, i);
 }
 
diff --git a/fs/dax.c b/fs/dax.c
index bea6df1498c3..7640be6b6a97 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1219,6 +1219,8 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		const size_t size = ALIGN(length + offset, PAGE_SIZE);
 		const sector_t sector = dax_iomap_sector(iomap, pos);
+		long nr_page = PHYS_PFN(size);
+		int dax_mode = DAX_OP_NORMAL;
 		ssize_t map_len;
 		pgoff_t pgoff;
 		void *kaddr;
@@ -1232,8 +1234,13 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		if (ret)
 			break;
 
-		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-					    DAX_OP_NORMAL, &kaddr, NULL);
+		map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
+					    &kaddr, NULL);
+		if (unlikely(map_len == -EIO)) {
+			dax_mode = DAX_OP_RECOVERY;
+			map_len = dax_direct_access(dax_dev, pgoff, nr_page,
+						    dax_mode, &kaddr, NULL);
+		}
 		if (map_len < 0) {
 			ret = map_len;
 			break;
@@ -1252,11 +1259,20 @@  static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		 */
 		if (iov_iter_rw(iter) == WRITE)
 			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 		else
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 
+		/*
+		 * If dax data recovery is enacted via DAX_OP_RECOVERY,
+		 * recovery failure would be indicated by a -EIO return
+		 * in 'xfer' casted as (size_t).
+		 */
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
 		pos += xfer;
 		length -= xfer;
 		done += xfer;