diff mbox

[v2] libnvdimm: check and clear poison before writing to pmem

Message ID 147879993593.130465.16954973899565924428.stgit@djiang5-desk3.ch.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dave Jiang Nov. 10, 2016, 5:45 p.m. UTC
We need to clear any poison when we are writing to pmem. The granularity
will be sector size. If it's less then we can't do anything about it
barring corruption.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

Comments

Verma, Vishal L Nov. 10, 2016, 7:17 p.m. UTC | #1
On 11/10, Dave Jiang wrote:
> We need to clear any poison when we are writing to pmem. The granularity
> will be sector size. If it's less then we can't do anything about it
> barring corruption.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
> index d5dc80c..306d8fd 100644
> --- a/drivers/nvdimm/claim.c
> +++ b/drivers/nvdimm/claim.c
> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>  		resource_size_t offset, void *buf, size_t size, int rw)
>  {
>  	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
> +	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> +	sector_t sector = offset >> 9;
> +
> +	if (unlikely(!size))
> +		return -EINVAL;
>  
>  	if (unlikely(offset + size > nsio->size)) {
>  		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>  	}
>  
>  	if (rw == READ) {
> -		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> -
> -		if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
> +		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
>  			return -EIO;
>  		return memcpy_from_pmem(buf, nsio->addr + offset, size);
>  	} else {
> +
> +		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
> +			if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
> +				long cleared;
> +
> +				cleared = nvdimm_clear_poison(&ndns->dev,
> +							      offset, size);

Everything above looks good..

> +				if (cleared != size)
> +					return -EIO;

But I think we don't want to return -EIO here. Instead, ...
> +
> +				badblocks_clear(&nsio->bb, sector,
> +						cleared >> 9);
> +			}
> +		}
> +
>  		memcpy_to_pmem(nsio->addr + offset, buf, size);
>  		nvdimm_flush(to_nd_region(ndns->dev.parent));

Do _some_ writes. We have two options:
1. Try to write the whole thing (size), and return success. If the write
touches any remaining poison, it will not complain, but the poison will
be retained, so subsequent reads will hit it and fail.

2. Only write what we were able to clear, i.e. 'cleared' bytes. And
since we didn't complete the write, error out.

I think 1 makes more sense, but I could be convinced otherwise..

One wild idea might be:
rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
time (BTT data writes). This gives us a rather large window where we
could be clearing 4K worth of badblocks all together, and then writing
4K of data. If we crash anywhere in the middle, we're almost guaranteed
silent data corruption.

Instead, what if, for the known-errors case for writes, we break it down
into smaller chunks:
Go cache line by cache line - if it may have poison (based on
badblocks), clear poison, followed by a cache line worth of data write.
When we cross a sector boundary (512), do a badblocks_clear.

I think this gives us the best chance against silent corruption in the
absence of an atomic clear-error-and-write command.

Thoughts?

>  	}
>
Dan Williams Nov. 10, 2016, 9:24 p.m. UTC | #2
On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> On 11/10, Dave Jiang wrote:
>> We need to clear any poison when we are writing to pmem. The granularity
>> will be sector size. If it's less then we can't do anything about it
>> barring corruption.
>>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> ---
>>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
>>  1 file changed, 21 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
>> index d5dc80c..306d8fd 100644
>> --- a/drivers/nvdimm/claim.c
>> +++ b/drivers/nvdimm/claim.c
>> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>>               resource_size_t offset, void *buf, size_t size, int rw)
>>  {
>>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
>> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>> +     sector_t sector = offset >> 9;
>> +
>> +     if (unlikely(!size))
>> +             return -EINVAL;
>>
>>       if (unlikely(offset + size > nsio->size)) {
>>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
>> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>>       }
>>
>>       if (rw == READ) {
>> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>> -
>> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
>>                       return -EIO;
>>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
>>       } else {
>> +
>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
>> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
>> +                             long cleared;
>> +
>> +                             cleared = nvdimm_clear_poison(&ndns->dev,
>> +                                                           offset, size);
>
> Everything above looks good..
>
>> +                             if (cleared != size)
>> +                                     return -EIO;
>
> But I think we don't want to return -EIO here. Instead, ...
>> +
>> +                             badblocks_clear(&nsio->bb, sector,
>> +                                             cleared >> 9);
>> +                     }
>> +             }
>> +
>>               memcpy_to_pmem(nsio->addr + offset, buf, size);
>>               nvdimm_flush(to_nd_region(ndns->dev.parent));
>
> Do _some_ writes. We have two options:
> 1. Try to write the whole thing (size), and return success. If the write
> touches any remaining poison, it will not complain, but the poison will
> be retained, so subsequent reads will hit it and fail.
>
> 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
> since we didn't complete the write, error out.
>
> I think 1 makes more sense, but I could be convinced otherwise..
>
> One wild idea might be:
> rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
> time (BTT data writes). This gives us a rather large window where we
> could be clearing 4K worth of badblocks all together, and then writing
> 4K of data. If we crash anywhere in the middle, we're almost guaranteed
> silent data corruption.
>
> Instead, what if, for the known-errors case for writes, we break it down
> into smaller chunks:
> Go cache line by cache line - if it may have poison (based on
> badblocks), clear poison, followed by a cache line worth of data write.
> When we cross a sector boundary (512), do a badblocks_clear.
>
> I think this gives us the best chance against silent corruption in the
> absence of an atomic clear-error-and-write command.
>
> Thoughts?

This feels like over-engineering a still not perfect solution to a
rare problem.  Outside of atomic-write-and-clear we should just keep
the code best effort and simple.
Verma, Vishal L Nov. 10, 2016, 9:56 p.m. UTC | #3
On 11/10, Dan Williams wrote:
> On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> > On 11/10, Dave Jiang wrote:
> >> We need to clear any poison when we are writing to pmem. The granularity
> >> will be sector size. If it's less then we can't do anything about it
> >> barring corruption.
> >>
> >> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> >> ---
> >>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
> >>  1 file changed, 21 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
> >> index d5dc80c..306d8fd 100644
> >> --- a/drivers/nvdimm/claim.c
> >> +++ b/drivers/nvdimm/claim.c
> >> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >>               resource_size_t offset, void *buf, size_t size, int rw)
> >>  {
> >>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
> >> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >> +     sector_t sector = offset >> 9;
> >> +
> >> +     if (unlikely(!size))
> >> +             return -EINVAL;
> >>
> >>       if (unlikely(offset + size > nsio->size)) {
> >>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
> >> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >>       }
> >>
> >>       if (rw == READ) {
> >> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >> -
> >> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
> >>                       return -EIO;
> >>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
> >>       } else {
> >> +
> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
> >> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
> >> +                             long cleared;
> >> +
> >> +                             cleared = nvdimm_clear_poison(&ndns->dev,
> >> +                                                           offset, size);
> >
> > Everything above looks good..
> >
> >> +                             if (cleared != size)
> >> +                                     return -EIO;
> >
> > But I think we don't want to return -EIO here. Instead, ...
> >> +
> >> +                             badblocks_clear(&nsio->bb, sector,
> >> +                                             cleared >> 9);
> >> +                     }
> >> +             }
> >> +
> >>               memcpy_to_pmem(nsio->addr + offset, buf, size);
> >>               nvdimm_flush(to_nd_region(ndns->dev.parent));
> >
> > Do _some_ writes. We have two options:
> > 1. Try to write the whole thing (size), and return success. If the write
> > touches any remaining poison, it will not complain, but the poison will
> > be retained, so subsequent reads will hit it and fail.
> >
> > 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
> > since we didn't complete the write, error out.
> >
> > I think 1 makes more sense, but I could be convinced otherwise..
> >
> > One wild idea might be:
> > rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
> > time (BTT data writes). This gives us a rather large window where we
> > could be clearing 4K worth of badblocks all together, and then writing
> > 4K of data. If we crash anywhere in the middle, we're almost guaranteed
> > silent data corruption.
> >
> > Instead, what if, for the known-errors case for writes, we break it down
> > into smaller chunks:
> > Go cache line by cache line - if it may have poison (based on
> > badblocks), clear poison, followed by a cache line worth of data write.
> > When we cross a sector boundary (512), do a badblocks_clear.
> >
> > I think this gives us the best chance against silent corruption in the
> > absence of an atomic clear-error-and-write command.
> >
> > Thoughts?
> 
> This feels like over-engineering a still not perfect solution to a
> rare problem.  Outside of atomic-write-and-clear we should just keep
> the code best effort and simple.

Fair enough :) In that case I think the only change needed is to simply
remove the cleared != size check, do badblocks_clear for "cleared >> 9"
(as it is now), and then write "size", and return success. Sound good?
Dan Williams Nov. 10, 2016, 10:02 p.m. UTC | #4
On Thu, Nov 10, 2016 at 1:56 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> On 11/10, Dan Williams wrote:
>> On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
>> > On 11/10, Dave Jiang wrote:
>> >> We need to clear any poison when we are writing to pmem. The granularity
>> >> will be sector size. If it's less then we can't do anything about it
>> >> barring corruption.
>> >>
>> >> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> >> ---
>> >>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
>> >>  1 file changed, 21 insertions(+), 3 deletions(-)
>> >>
>> >> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
>> >> index d5dc80c..306d8fd 100644
>> >> --- a/drivers/nvdimm/claim.c
>> >> +++ b/drivers/nvdimm/claim.c
>> >> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>> >>               resource_size_t offset, void *buf, size_t size, int rw)
>> >>  {
>> >>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
>> >> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>> >> +     sector_t sector = offset >> 9;
>> >> +
>> >> +     if (unlikely(!size))
>> >> +             return -EINVAL;
>> >>
>> >>       if (unlikely(offset + size > nsio->size)) {
>> >>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
>> >> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>> >>       }
>> >>
>> >>       if (rw == READ) {
>> >> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>> >> -
>> >> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
>> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
>> >>                       return -EIO;
>> >>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
>> >>       } else {
>> >> +
>> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
>> >> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
>> >> +                             long cleared;
>> >> +
>> >> +                             cleared = nvdimm_clear_poison(&ndns->dev,
>> >> +                                                           offset, size);
>> >
>> > Everything above looks good..
>> >
>> >> +                             if (cleared != size)
>> >> +                                     return -EIO;
>> >
>> > But I think we don't want to return -EIO here. Instead, ...
>> >> +
>> >> +                             badblocks_clear(&nsio->bb, sector,
>> >> +                                             cleared >> 9);
>> >> +                     }
>> >> +             }
>> >> +
>> >>               memcpy_to_pmem(nsio->addr + offset, buf, size);
>> >>               nvdimm_flush(to_nd_region(ndns->dev.parent));
>> >
>> > Do _some_ writes. We have two options:
>> > 1. Try to write the whole thing (size), and return success. If the write
>> > touches any remaining poison, it will not complain, but the poison will
>> > be retained, so subsequent reads will hit it and fail.
>> >
>> > 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
>> > since we didn't complete the write, error out.
>> >
>> > I think 1 makes more sense, but I could be convinced otherwise..
>> >
>> > One wild idea might be:
>> > rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
>> > time (BTT data writes). This gives us a rather large window where we
>> > could be clearing 4K worth of badblocks all together, and then writing
>> > 4K of data. If we crash anywhere in the middle, we're almost guaranteed
>> > silent data corruption.
>> >
>> > Instead, what if, for the known-errors case for writes, we break it down
>> > into smaller chunks:
>> > Go cache line by cache line - if it may have poison (based on
>> > badblocks), clear poison, followed by a cache line worth of data write.
>> > When we cross a sector boundary (512), do a badblocks_clear.
>> >
>> > I think this gives us the best chance against silent corruption in the
>> > absence of an atomic clear-error-and-write command.
>> >
>> > Thoughts?
>>
>> This feels like over-engineering a still not perfect solution to a
>> rare problem.  Outside of atomic-write-and-clear we should just keep
>> the code best effort and simple.
>
> Fair enough :) In that case I think the only change needed is to simply
> remove the cleared != size check, do badblocks_clear for "cleared >> 9"
> (as it is now), and then write "size", and return success. Sound good?
>

Return success even on an incomplete write? That's not the approach we
took with Toshi's recent change:

ommit 3115bb02b5c23d960df5f1bf551ec394a9bb10ec
Author: Toshi Kani <toshi.kani@hpe.com>
Date:   Thu Oct 13 09:54:21 2016 -0600

    pmem: report error on clear poison failure

    ACPI Clear Uncorrectable Error DSM function may fail or may be
    unsupported on a platform.  pmem_clear_poison() returns without clearing
    badblocks in such cases.  This failure is detected at the next read
    (-EIO).

    This behavior can lead to an issue when user keeps writing but does not
    read immediately.  For instance, flight recorder file may be only read
    when it is necessary for troubleshooting.

    Change pmem_do_bvec() and pmem_clear_poison() to return -EIO so that
    filesystem can log an error message on a write error.

    Cc: Vishal Verma <vishal.l.verma@intel.com>
    Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
    Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Verma, Vishal L Nov. 10, 2016, 10:04 p.m. UTC | #5
On 11/10, Dan Williams wrote:
> On Thu, Nov 10, 2016 at 1:56 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> > On 11/10, Dan Williams wrote:
> >> On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> >> > On 11/10, Dave Jiang wrote:
> >> >> We need to clear any poison when we are writing to pmem. The granularity
> >> >> will be sector size. If it's less then we can't do anything about it
> >> >> barring corruption.
> >> >>
> >> >> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> >> >> ---
> >> >>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
> >> >>  1 file changed, 21 insertions(+), 3 deletions(-)
> >> >>
> >> >> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
> >> >> index d5dc80c..306d8fd 100644
> >> >> --- a/drivers/nvdimm/claim.c
> >> >> +++ b/drivers/nvdimm/claim.c
> >> >> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >> >>               resource_size_t offset, void *buf, size_t size, int rw)
> >> >>  {
> >> >>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
> >> >> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >> >> +     sector_t sector = offset >> 9;
> >> >> +
> >> >> +     if (unlikely(!size))
> >> >> +             return -EINVAL;
> >> >>
> >> >>       if (unlikely(offset + size > nsio->size)) {
> >> >>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
> >> >> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >> >>       }
> >> >>
> >> >>       if (rw == READ) {
> >> >> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >> >> -
> >> >> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
> >> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
> >> >>                       return -EIO;
> >> >>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
> >> >>       } else {
> >> >> +
> >> >> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
> >> >> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
> >> >> +                             long cleared;
> >> >> +
> >> >> +                             cleared = nvdimm_clear_poison(&ndns->dev,
> >> >> +                                                           offset, size);
> >> >
> >> > Everything above looks good..
> >> >
> >> >> +                             if (cleared != size)
> >> >> +                                     return -EIO;
> >> >
> >> > But I think we don't want to return -EIO here. Instead, ...
> >> >> +
> >> >> +                             badblocks_clear(&nsio->bb, sector,
> >> >> +                                             cleared >> 9);
> >> >> +                     }
> >> >> +             }
> >> >> +
> >> >>               memcpy_to_pmem(nsio->addr + offset, buf, size);
> >> >>               nvdimm_flush(to_nd_region(ndns->dev.parent));
> >> >
> >> > Do _some_ writes. We have two options:
> >> > 1. Try to write the whole thing (size), and return success. If the write
> >> > touches any remaining poison, it will not complain, but the poison will
> >> > be retained, so subsequent reads will hit it and fail.
> >> >
> >> > 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
> >> > since we didn't complete the write, error out.
> >> >
> >> > I think 1 makes more sense, but I could be convinced otherwise..
> >> >
> >> > One wild idea might be:
> >> > rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
> >> > time (BTT data writes). This gives us a rather large window where we
> >> > could be clearing 4K worth of badblocks all together, and then writing
> >> > 4K of data. If we crash anywhere in the middle, we're almost guaranteed
> >> > silent data corruption.
> >> >
> >> > Instead, what if, for the known-errors case for writes, we break it down
> >> > into smaller chunks:
> >> > Go cache line by cache line - if it may have poison (based on
> >> > badblocks), clear poison, followed by a cache line worth of data write.
> >> > When we cross a sector boundary (512), do a badblocks_clear.
> >> >
> >> > I think this gives us the best chance against silent corruption in the
> >> > absence of an atomic clear-error-and-write command.
> >> >
> >> > Thoughts?
> >>
> >> This feels like over-engineering a still not perfect solution to a
> >> rare problem.  Outside of atomic-write-and-clear we should just keep
> >> the code best effort and simple.
> >
> > Fair enough :) In that case I think the only change needed is to simply
> > remove the cleared != size check, do badblocks_clear for "cleared >> 9"
> > (as it is now), and then write "size", and return success. Sound good?
> >
> 
> Return success even on an incomplete write? That's not the approach we
> took with Toshi's recent change:

Good point, so maybe do the write, and then,
if (cleared != size)
	return -EIO;

Yeah?

> 
> ommit 3115bb02b5c23d960df5f1bf551ec394a9bb10ec
> Author: Toshi Kani <toshi.kani@hpe.com>
> Date:   Thu Oct 13 09:54:21 2016 -0600
> 
>     pmem: report error on clear poison failure
> 
>     ACPI Clear Uncorrectable Error DSM function may fail or may be
>     unsupported on a platform.  pmem_clear_poison() returns without clearing
>     badblocks in such cases.  This failure is detected at the next read
>     (-EIO).
> 
>     This behavior can lead to an issue when user keeps writing but does not
>     read immediately.  For instance, flight recorder file may be only read
>     when it is necessary for troubleshooting.
> 
>     Change pmem_do_bvec() and pmem_clear_poison() to return -EIO so that
>     filesystem can log an error message on a write error.
> 
>     Cc: Vishal Verma <vishal.l.verma@intel.com>
>     Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
>     Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Kani, Toshi Nov. 10, 2016, 10:04 p.m. UTC | #6
On Thu, 2016-11-10 at 14:56 -0700, Vishal Verma wrote:
> On 11/10, Dan Williams wrote:

 :
> > This feels like over-engineering a still not perfect solution to a

> > rare problem.  Outside of atomic-write-and-clear we should just

> > keep the code best effort and simple.

> 

> Fair enough :) In that case I think the only change needed is to

> simply remove the cleared != size check, do badblocks_clear for

> "cleared >> 9" (as it is now), and then write "size", and return

> success. Sound good?


Can we take the same approach as pmem_do_bvec(), i.e. copy the data,
but return -EIO in case the clear failed?

Thanks,
-Toshi
Dave Jiang Nov. 10, 2016, 10:37 p.m. UTC | #7
On 11/10/2016 03:04 PM, Vishal Verma wrote:
> On 11/10, Dan Williams wrote:
>> On Thu, Nov 10, 2016 at 1:56 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
>>> On 11/10, Dan Williams wrote:
>>>> On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
>>>>> On 11/10, Dave Jiang wrote:
>>>>>> We need to clear any poison when we are writing to pmem. The granularity
>>>>>> will be sector size. If it's less then we can't do anything about it
>>>>>> barring corruption.
>>>>>>
>>>>>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>>>>>> ---
>>>>>>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
>>>>>>  1 file changed, 21 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
>>>>>> index d5dc80c..306d8fd 100644
>>>>>> --- a/drivers/nvdimm/claim.c
>>>>>> +++ b/drivers/nvdimm/claim.c
>>>>>> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>>>>>>               resource_size_t offset, void *buf, size_t size, int rw)
>>>>>>  {
>>>>>>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
>>>>>> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>>>>>> +     sector_t sector = offset >> 9;
>>>>>> +
>>>>>> +     if (unlikely(!size))
>>>>>> +             return -EINVAL;
>>>>>>
>>>>>>       if (unlikely(offset + size > nsio->size)) {
>>>>>>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
>>>>>> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
>>>>>>       }
>>>>>>
>>>>>>       if (rw == READ) {
>>>>>> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
>>>>>> -
>>>>>> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
>>>>>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
>>>>>>                       return -EIO;
>>>>>>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
>>>>>>       } else {
>>>>>> +
>>>>>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
>>>>>> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
>>>>>> +                             long cleared;
>>>>>> +
>>>>>> +                             cleared = nvdimm_clear_poison(&ndns->dev,
>>>>>> +                                                           offset, size);
>>>>>
>>>>> Everything above looks good..
>>>>>
>>>>>> +                             if (cleared != size)
>>>>>> +                                     return -EIO;
>>>>>
>>>>> But I think we don't want to return -EIO here. Instead, ...
>>>>>> +
>>>>>> +                             badblocks_clear(&nsio->bb, sector,
>>>>>> +                                             cleared >> 9);
>>>>>> +                     }
>>>>>> +             }
>>>>>> +
>>>>>>               memcpy_to_pmem(nsio->addr + offset, buf, size);
>>>>>>               nvdimm_flush(to_nd_region(ndns->dev.parent));
>>>>>
>>>>> Do _some_ writes. We have two options:
>>>>> 1. Try to write the whole thing (size), and return success. If the write
>>>>> touches any remaining poison, it will not complain, but the poison will
>>>>> be retained, so subsequent reads will hit it and fail.
>>>>>
>>>>> 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
>>>>> since we didn't complete the write, error out.
>>>>>
>>>>> I think 1 makes more sense, but I could be convinced otherwise..
>>>>>
>>>>> One wild idea might be:
>>>>> rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
>>>>> time (BTT data writes). This gives us a rather large window where we
>>>>> could be clearing 4K worth of badblocks all together, and then writing
>>>>> 4K of data. If we crash anywhere in the middle, we're almost guaranteed
>>>>> silent data corruption.
>>>>>
>>>>> Instead, what if, for the known-errors case for writes, we break it down
>>>>> into smaller chunks:
>>>>> Go cache line by cache line - if it may have poison (based on
>>>>> badblocks), clear poison, followed by a cache line worth of data write.
>>>>> When we cross a sector boundary (512), do a badblocks_clear.
>>>>>
>>>>> I think this gives us the best chance against silent corruption in the
>>>>> absence of an atomic clear-error-and-write command.
>>>>>
>>>>> Thoughts?
>>>>
>>>> This feels like over-engineering a still not perfect solution to a
>>>> rare problem.  Outside of atomic-write-and-clear we should just keep
>>>> the code best effort and simple.
>>>
>>> Fair enough :) In that case I think the only change needed is to simply
>>> remove the cleared != size check, do badblocks_clear for "cleared >> 9"
>>> (as it is now), and then write "size", and return success. Sound good?
>>>
>>
>> Return success even on an incomplete write? That's not the approach we
>> took with Toshi's recent change:
> 
> Good point, so maybe do the write, and then,
> if (cleared != size)
> 	return -EIO;
> 
> Yeah?

I think we also need to return -EIO if the alignment check fails after
we detected bad pmem in that case when we can't clear?

> 
>>
>> ommit 3115bb02b5c23d960df5f1bf551ec394a9bb10ec
>> Author: Toshi Kani <toshi.kani@hpe.com>
>> Date:   Thu Oct 13 09:54:21 2016 -0600
>>
>>     pmem: report error on clear poison failure
>>
>>     ACPI Clear Uncorrectable Error DSM function may fail or may be
>>     unsupported on a platform.  pmem_clear_poison() returns without clearing
>>     badblocks in such cases.  This failure is detected at the next read
>>     (-EIO).
>>
>>     This behavior can lead to an issue when user keeps writing but does not
>>     read immediately.  For instance, flight recorder file may be only read
>>     when it is necessary for troubleshooting.
>>
>>     Change pmem_do_bvec() and pmem_clear_poison() to return -EIO so that
>>     filesystem can log an error message on a write error.
>>
>>     Cc: Vishal Verma <vishal.l.verma@intel.com>
>>     Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
>>     Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Verma, Vishal L Nov. 10, 2016, 11:34 p.m. UTC | #8
On 11/10, Dave Jiang wrote:
> On 11/10/2016 03:04 PM, Vishal Verma wrote:
> > On 11/10, Dan Williams wrote:
> >> On Thu, Nov 10, 2016 at 1:56 PM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> >>> On 11/10, Dan Williams wrote:
> >>>> On Thu, Nov 10, 2016 at 11:17 AM, Vishal Verma <vishal.l.verma@intel.com> wrote:
> >>>>> On 11/10, Dave Jiang wrote:
> >>>>>> We need to clear any poison when we are writing to pmem. The granularity
> >>>>>> will be sector size. If it's less then we can't do anything about it
> >>>>>> barring corruption.
> >>>>>>
> >>>>>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> >>>>>> ---
> >>>>>>  drivers/nvdimm/claim.c |   24 +++++++++++++++++++++---
> >>>>>>  1 file changed, 21 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
> >>>>>> index d5dc80c..306d8fd 100644
> >>>>>> --- a/drivers/nvdimm/claim.c
> >>>>>> +++ b/drivers/nvdimm/claim.c
> >>>>>> @@ -226,6 +226,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >>>>>>               resource_size_t offset, void *buf, size_t size, int rw)
> >>>>>>  {
> >>>>>>       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
> >>>>>> +     unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >>>>>> +     sector_t sector = offset >> 9;
> >>>>>> +
> >>>>>> +     if (unlikely(!size))
> >>>>>> +             return -EINVAL;
> >>>>>>
> >>>>>>       if (unlikely(offset + size > nsio->size)) {
> >>>>>>               dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
> >>>>>> @@ -233,12 +238,25 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
> >>>>>>       }
> >>>>>>
> >>>>>>       if (rw == READ) {
> >>>>>> -             unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
> >>>>>> -
> >>>>>> -             if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
> >>>>>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
> >>>>>>                       return -EIO;
> >>>>>>               return memcpy_from_pmem(buf, nsio->addr + offset, size);
> >>>>>>       } else {
> >>>>>> +
> >>>>>> +             if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
> >>>>>> +                     if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
> >>>>>> +                             long cleared;
> >>>>>> +
> >>>>>> +                             cleared = nvdimm_clear_poison(&ndns->dev,
> >>>>>> +                                                           offset, size);
> >>>>>
> >>>>> Everything above looks good..
> >>>>>
> >>>>>> +                             if (cleared != size)
> >>>>>> +                                     return -EIO;
> >>>>>
> >>>>> But I think we don't want to return -EIO here. Instead, ...
> >>>>>> +
> >>>>>> +                             badblocks_clear(&nsio->bb, sector,
> >>>>>> +                                             cleared >> 9);
> >>>>>> +                     }
> >>>>>> +             }
> >>>>>> +
> >>>>>>               memcpy_to_pmem(nsio->addr + offset, buf, size);
> >>>>>>               nvdimm_flush(to_nd_region(ndns->dev.parent));
> >>>>>
> >>>>> Do _some_ writes. We have two options:
> >>>>> 1. Try to write the whole thing (size), and return success. If the write
> >>>>> touches any remaining poison, it will not complain, but the poison will
> >>>>> be retained, so subsequent reads will hit it and fail.
> >>>>>
> >>>>> 2. Only write what we were able to clear, i.e. 'cleared' bytes. And
> >>>>> since we didn't complete the write, error out.
> >>>>>
> >>>>> I think 1 makes more sense, but I could be convinced otherwise..
> >>>>>
> >>>>> One wild idea might be:
> >>>>> rw_bytes can write arbitrary lengths, and commonly does up to 4K at a
> >>>>> time (BTT data writes). This gives us a rather large window where we
> >>>>> could be clearing 4K worth of badblocks all together, and then writing
> >>>>> 4K of data. If we crash anywhere in the middle, we're almost guaranteed
> >>>>> silent data corruption.
> >>>>>
> >>>>> Instead, what if, for the known-errors case for writes, we break it down
> >>>>> into smaller chunks:
> >>>>> Go cache line by cache line - if it may have poison (based on
> >>>>> badblocks), clear poison, followed by a cache line worth of data write.
> >>>>> When we cross a sector boundary (512), do a badblocks_clear.
> >>>>>
> >>>>> I think this gives us the best chance against silent corruption in the
> >>>>> absence of an atomic clear-error-and-write command.
> >>>>>
> >>>>> Thoughts?
> >>>>
> >>>> This feels like over-engineering a still not perfect solution to a
> >>>> rare problem.  Outside of atomic-write-and-clear we should just keep
> >>>> the code best effort and simple.
> >>>
> >>> Fair enough :) In that case I think the only change needed is to simply
> >>> remove the cleared != size check, do badblocks_clear for "cleared >> 9"
> >>> (as it is now), and then write "size", and return success. Sound good?
> >>>
> >>
> >> Return success even on an incomplete write? That's not the approach we
> >> took with Toshi's recent change:
> > 
> > Good point, so maybe do the write, and then,
> > if (cleared != size)
> > 	return -EIO;
> > 
> > Yeah?
> 
> I think we also need to return -EIO if the alignment check fails after
> we detected bad pmem in that case when we can't clear?
> 
Yes, I think I agree.. It will mean that there will be no way to do
small writes using rw_bytes to a bad sector, and that you have to clear
it by other methods, but that is no different from status quo..

So I think the flow is:
if error
	if aligned
		clear error
		clear (possibly some) badblocks
		if clear error was successful
			write all
			return success
		else
			write partial (or write all anyway?)
			return EIO
	else
		write all (?)
		return EIO
else
	write all
	return success
Elliott, Robert (Servers) Nov. 11, 2016, 4:50 a.m. UTC | #9
> -----Original Message-----
> From: Linux-nvdimm [mailto:linux-nvdimm-bounces@lists.01.org] On
> Behalf Of Vishal Verma
> Sent: Thursday, November 10, 2016 3:56 PM
> To: Dan Williams <dan.j.williams@intel.com>
> Cc: linux-nvdimm@lists.01.org
> Subject: Re: [PATCH v2] libnvdimm: check and clear poison before
> writing to pmem
> 
...
> >
> > This feels like over-engineering a still not perfect solution to a
> > rare problem.  Outside of atomic-write-and-clear we should just keep
> > the code best effort and simple.
> 
> Fair enough :) In that case I think the only change needed is to simply
> remove the cleared != size check, do badblocks_clear for "cleared >> 9"
> (as it is now), and then write "size", and return success. Sound good?

Writing size is dangerous. The CPU might not have big enough write 
instructions to avoid causing reads and trigger more uncorrectable 
errors.  Plus, if the function supports size being less than the number
of bytes covered by ECC (e.g. 8 bytes), then a read modify write 
must be done.  If those 8 bytes currently hold an ECC error, then
this write must not fix that error and silently upgrade an error
into bogus good data.  But, preserving the error would mean the
bytes written (and just declared good) are immediately bad again.

Writing cleared is unnecessary, since the caller is only told that
this write function failed, not that it failed after writing a
certain number of bytes.

This still needs to happen after badblocks_clear (or after the
memcpy_to_pmem and nvdimm_flush, if I haven't convinced you to
skip those steps):

>> +                             if (cleared != size)
>> +                                     return -EIO;


---
Robert Elliott, HPE Persistent Memory
Dan Williams Nov. 11, 2016, 6:18 a.m. UTC | #10
On Thu, Nov 10, 2016 at 8:50 PM, Elliott, Robert (Persistent Memory)
<elliott@hpe.com> wrote:
>
>
>> -----Original Message-----
>> From: Linux-nvdimm [mailto:linux-nvdimm-bounces@lists.01.org] On
>> Behalf Of Vishal Verma
>> Sent: Thursday, November 10, 2016 3:56 PM
>> To: Dan Williams <dan.j.williams@intel.com>
>> Cc: linux-nvdimm@lists.01.org
>> Subject: Re: [PATCH v2] libnvdimm: check and clear poison before
>> writing to pmem
>>
> ...
>> >
>> > This feels like over-engineering a still not perfect solution to a
>> > rare problem.  Outside of atomic-write-and-clear we should just keep
>> > the code best effort and simple.
>>
>> Fair enough :) In that case I think the only change needed is to simply
>> remove the cleared != size check, do badblocks_clear for "cleared >> 9"
>> (as it is now), and then write "size", and return success. Sound good?
>
> Writing size is dangerous. The CPU might not have big enough write
> instructions to avoid causing reads and trigger more uncorrectable
> errors.

The writes are non-temporal and even if there were reads the cpu does
not consume them on a read-modify-write cycle.  They might trigger a
CMCI ("corrected" machine check), but should halt the cpu since the
error only ever appears in the cache.

> Plus, if the function supports size being less than the number
> of bytes covered by ECC (e.g. 8 bytes), then a read modify write
> must be done.  If those 8 bytes currently hold an ECC error, then
> this write must not fix that error and silently upgrade an error
> into bogus good data.  But, preserving the error would mean the
> bytes written (and just declared good) are immediately bad again.

We're only talking about partially completing a multi-sector transfer,
not a sub-sector transfer.  Sub-sector or unaligned transfers will not
attempt to clear errors.

> Writing cleared is unnecessary, since the caller is only told that
> this write function failed, not that it failed after writing a
> certain number of bytes.

This assumes that an application is prepared to handle indeterminate
data after a write error.  It may still be fatal, but trying to push
the state from "old data" to "new" rather than "old data" to
"indeterminate" is less likely to surprise an application.
diff mbox

Patch

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index d5dc80c..306d8fd 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -226,6 +226,11 @@  static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 		resource_size_t offset, void *buf, size_t size, int rw)
 {
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
+	sector_t sector = offset >> 9;
+
+	if (unlikely(!size))
+		return -EINVAL;
 
 	if (unlikely(offset + size > nsio->size)) {
 		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
@@ -233,12 +238,25 @@  static int nsio_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	if (rw == READ) {
-		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
-
-		if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
+		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align)))
 			return -EIO;
 		return memcpy_from_pmem(buf, nsio->addr + offset, size);
 	} else {
+
+		if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
+			if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)) {
+				long cleared;
+
+				cleared = nvdimm_clear_poison(&ndns->dev,
+							      offset, size);
+				if (cleared != size)
+					return -EIO;
+
+				badblocks_clear(&nsio->bb, sector,
+						cleared >> 9);
+			}
+		}
+
 		memcpy_to_pmem(nsio->addr + offset, buf, size);
 		nvdimm_flush(to_nd_region(ndns->dev.parent));
 	}