diff mbox

[RFC,2/5] mtd:fsl_nfc: Add hardware 45 byte BHC-ECC support for 24 bit corrections.

Message ID 1389222441-4322-3-git-send-email-bpringlemeir@nbsps.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bill Pringlemeir Jan. 8, 2014, 11:07 p.m. UTC
Signed-off-by: Bill Pringlemeir <bpringlemeir@nbsps.com>
---
 drivers/mtd/nand/Kconfig   |   5 +-
 drivers/mtd/nand/fsl_nfc.c | 114 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 2 deletions(-)

Comments

Stefan Agner Sept. 17, 2014, 5:02 p.m. UTC | #1
Hi Bill,

On one of our Colibri VF61 modules I discovered an issue using this
driver:

[    0.758327] ECC failed to correct all errors (ebd9fd80)
[    0.767005] ECC failed to correct all errors (ebd9fd80)
[    0.775525] ECC failed to correct all errors (ebd9fd80)
[    0.784004] ECC failed to correct all errors (ebd9fd80)
[    0.791938] UBI error: ubi_io_read: error -74 (ECC error) while
reading 2048 bytes from PEB 28:2048, 
read 2048 bytes

That page supposed to be empty, and I got several of this messages.
Hence I did not believe that they have really ECC errors, so I digged a
bit deeper:

@@ -673,11 +673,13 @@ static int fsl_nfc_check_ecc_status(struct
mtd_info *mtd, u_char *dat)
                return ecc_count;
 
        /* If 'ecc_count' zero or less then buffer is all 0xff or
erased. */
-       flip = count_written_bits(dat, chip->ecc.size, ecc_count);
+       flip = count_written_bits(dat, chip->ecc.size, 100);
 
 
        if (flip > ecc_count) {
-               printk("ECC failed to correct all errors (%08x)\n",
ecc_status);
+               printk("ECC failed to correct all errors (%08x, flips
%d)\n",
+                               ecc_status, flip);
                return -1;
        }


[    0.758624] ECC failed to correct all errors (ebdded80, flips 1)
[    0.768123] ECC failed to correct all errors (ebdded80, flips 1)
[    0.777468] ECC failed to correct all errors (ebdded80, flips 1)
[    0.786773] ECC failed to correct all errors (ebdded80, flips 1)
[    0.795535] UBI error: ubi_io_read: error -74 (ECC error) while
reading 2048 bytes from PEB 28:2048, read 2048 bytes
[    1.462183] ECC failed to correct all errors (ebdded82, flips 3)
[    1.471623] ECC failed to correct all errors (ebdded82, flips 3)
[    1.481025] ECC failed to correct all errors (ebdded82, flips 3)
[    1.490336] ECC failed to correct all errors (ebdded82, flips 3)
[    1.498953] UBI error: ubi_io_read: error -74 (ECC error) while
reading 2048 bytes from PEB 664:2048, read 2048 bytes
[    1.551421] ECC failed to correct all errors (ebdded80, flips 1)
[    1.560616] ECC failed to correct all errors (ebdded80, flips 1)
[    1.569695] ECC failed to correct all errors (ebdded80, flips 1)
[    1.578711] ECC failed to correct all errors (ebdded80, flips 1)
[    1.587192] UBI error: ubi_io_read: error -74 (ECC error) while
reading 2048 bytes from PEB 666:2048, read 2048 bytes
[    1.744612] ECC failed to correct all errors (ebdded81, flips 2)
[    1.753943] ECC failed to correct all errors (ebdded81, flips 2)
[    1.763146] ECC failed to correct all errors (ebdded81, flips 2)
[    1.772247] ECC failed to correct all errors (ebdded81, flips 2)
[    1.780722] UBI error: ubi_io_read: error -74 (ECC error) while
reading 2048 bytes from PEB 776:2048, read 2048 bytes


Interesting thought, the returned ecc_count is exactly one below the
actual flipped bytes counted...

One comment below regarding this...

Am 2014-01-09 00:07, schrieb Bill Pringlemeir:
> Signed-off-by: Bill Pringlemeir <bpringlemeir@nbsps.com>
> ---
>  drivers/mtd/nand/Kconfig   |   5 +-
>  drivers/mtd/nand/fsl_nfc.c | 114 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 117 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
> index 7e0c695..8ac9923 100644
> --- a/drivers/mtd/nand/Kconfig
> +++ b/drivers/mtd/nand/Kconfig
> @@ -458,8 +458,9 @@ config MTD_NAND_FSL_NFC
>  	help
>  	  Enables support for NAND Flash controller on some Freescale
>  	  processors like the MPC5125, VF610, MCF54418, and Kinetis K70.
> -	  The driver supports a maximum 2k page size.
> -	  The driver currently does not support hardware ECC.
> +	  The driver supports a maximum 2k page size.  With 2k pages and
> +	  64 bytes or more of OOB, hardware ECC with 24bit correction is
> +	  supported.  Only MTD_OF (device tree) can enable the hardware ECC.
>  
>  config MTD_NAND_MXC
>  	tristate "MXC NAND support"
> diff --git a/drivers/mtd/nand/fsl_nfc.c b/drivers/mtd/nand/fsl_nfc.c
> index eb4e353..703511d 100644
> --- a/drivers/mtd/nand/fsl_nfc.c
> +++ b/drivers/mtd/nand/fsl_nfc.c
> @@ -17,6 +17,7 @@
>   * - Untested on MPC5125 and M54418.
>   * - DMA not used.
>   * - 2K pages or less.
> + * - Only 2K page w. 64+OOB and hardware ECC.
>   */
>  
>  #include <linux/module.h>
> @@ -68,6 +69,7 @@
>  
>  /* NFC ECC mode define */
>  #define ECC_BYPASS			0
> +#define ECC_45_BYTE			6
>  
>  /*** Register Mask and bit definitions */
>  
> @@ -100,6 +102,8 @@
>  #define STATUS_BYTE1_MASK			0x000000FF
>  
>  /* NFC_FLASH_CONFIG Field */
> +#define CONFIG_ECC_SRAM_ADDR_MASK		0x7FC00000
> +#define CONFIG_ECC_SRAM_ADDR_SHIFT		22
>  #define CONFIG_ECC_SRAM_REQ_BIT			(1<<21)
>  #define CONFIG_DMA_REQ_BIT			(1<<20)
>  #define CONFIG_ECC_MODE_MASK			0x000E0000
> @@ -120,6 +124,11 @@
>  
>  #define NFC_TIMEOUT		(HZ)
>  
> +/* ECC status placed at end of buffers. */
> +#define ECC_SRAM_ADDR	((PAGE_2K+256-8) >> 3)
> +#define ECC_STATUS_MASK	0x80
> +#define ECC_ERR_COUNT	0x3F
> +
>  struct fsl_nfc {
>  	struct mtd_info	   mtd;
>  	struct nand_chip   chip;
> @@ -160,6 +169,19 @@ static struct nand_bbt_descr bbt_mirror_descr = {
>  	.pattern = mirror_pattern,
>  };
>  
> +static struct nand_ecclayout nfc_ecc45 = {
> +	.eccbytes = 45,
> +	.eccpos = {19, 20, 21, 22, 23,
> +		   24, 25, 26, 27, 28, 29, 30, 31,
> +		   32, 33, 34, 35, 36, 37, 38, 39,
> +		   40, 41, 42, 43, 44, 45, 46, 47,
> +		   48, 49, 50, 51, 52, 53, 54, 55,
> +		   56, 57, 58, 59, 60, 61, 62, 63},
> +	.oobfree = {
> +		{.offset = 8,
> +		 .length = 11} }
> +};
> +
>  static u32 nfc_read(struct mtd_info *mtd, uint reg)
>  {
>  	struct fsl_nfc *nfc = mtd_to_nfc(mtd);
> @@ -458,7 +480,61 @@ nfc_select_chip(struct mtd_info *mtd, int chip)
>  #endif
>  }
>  
> +/* Count the number of 0's in buff upto max_bits */
> +static int count_written_bits(uint8_t *buff, int size, int max_bits)
> +{
> +	int k, written_bits = 0;
> +
> +	for (k = 0; k < size; k++) {
> +		written_bits += hweight8(~buff[k]);
> +		if (written_bits > max_bits)
> +			break;
> +	}
> +
> +	return written_bits;
> +}
> +
> +static int nfc_correct_data(struct mtd_info *mtd, u_char *dat,
> +				 u_char *read_ecc, u_char *calc_ecc)
> +{
> +	struct fsl_nfc *nfc = mtd_to_nfc(mtd);
> +	u32 ecc_status;
> +	u8 ecc_count;
> +	int flip;
> +
> +	/* Errata: ECC status is stored at NFC_CFG[ECCADD] +4 for
> +	   little-endian and +7 for big-endian SOC.  Access as 32 bits
> +	   and use low byte.
> +	*/
> +	ecc_status = __raw_readl(nfc->regs + ECC_SRAM_ADDR * 8 + 4);
> +	ecc_count = ecc_status & ECC_ERR_COUNT;
> +	if (!(ecc_status & ECC_STATUS_MASK))
> +		return ecc_count;
> +
> +	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
> +	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);

Also, I could not find that the reference manual states that ecc_count
represents the amount of flipped byte in a empty page. Is this given by
the ECC algorithm?



> +
> +	/* ECC failed. */
> +	if (flip > ecc_count)
> +		return -1;
> +
> +	/* Erased page. */
> +	memset(dat, 0xff, nfc->chip.ecc.size);
> +	return 0;
> +}
> +
> +static int nfc_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
> +				  u_char *ecc_code)
> +{
> +	return 0;
> +}
> +
> +static void nfc_enable_hwecc(struct mtd_info *mtd, int mode)
> +{
> +}
> +
>  struct nfc_config {
> +	int hardware_ecc;
>  	int width;
>  	int flash_bbt;
>  	u32 clkrate;
> @@ -483,6 +559,9 @@ static int __init nfc_probe_dt(struct device *dev,
> struct nfc_config *cfg)
>  	if (!np)
>  		return 1;
>  
> +	if (of_get_nand_ecc_mode(np) >= NAND_ECC_HW)
> +		cfg->hardware_ecc = 1;
> +
>  	cfg->flash_bbt = of_get_nand_on_flash_bbt(np);
>  
>  	if (!of_property_read_u32(np, "clock-frequency", &clkrate))
> @@ -608,6 +687,11 @@ static int nfc_probe(struct platform_device *pdev)
>  	nfc_set_field(mtd, NFC_FLASH_CONFIG, CONFIG_PAGE_CNT_MASK,
>  			CONFIG_PAGE_CNT_SHIFT, 1);
>  
> +	/* Set ECC_STATUS offset */
> +	nfc_set_field(mtd, NFC_FLASH_CONFIG,
> +		      CONFIG_ECC_SRAM_ADDR_MASK,
> +		      CONFIG_ECC_SRAM_ADDR_SHIFT, ECC_SRAM_ADDR);
> +
>  	/* first scan to find the device and get the page size */
>  	if (nand_scan_ident(mtd, 1, NULL)) {
>  		err = -ENXIO;
> @@ -627,6 +711,36 @@ static int nfc_probe(struct platform_device *pdev)
>  	page_sz += cfg.width == 16 ? 1 : 0;
>  	nfc_write(mtd, NFC_SECTOR_SIZE, page_sz);
>  
> +	if (cfg.hardware_ecc) {
> +		if (mtd->writesize != PAGE_2K && mtd->oobsize < 64) {
> +			dev_err(nfc->dev, "Unsupported flash with hwecc\n");
> +			err = -ENXIO;
> +			goto error;
> +		}
> +
> +		chip->ecc.layout = &nfc_ecc45;
> +
> +		/* propagate ecc.layout to mtd_info */
> +		mtd->ecclayout = chip->ecc.layout;
> +		chip->ecc.calculate = nfc_calculate_ecc;
> +		chip->ecc.hwctl = nfc_enable_hwecc;
> +		chip->ecc.correct = nfc_correct_data;
> +		chip->ecc.mode = NAND_ECC_HW;
> +
> +		chip->ecc.bytes = 45;
> +		chip->ecc.size = PAGE_2K;
> +		chip->ecc.strength = 24;
> +
> +		/* set ECC mode to 45 bytes OOB with 24 bits correction */
> +		nfc_set_field(mtd, NFC_FLASH_CONFIG,
> +				CONFIG_ECC_MODE_MASK,
> +				CONFIG_ECC_MODE_SHIFT, ECC_45_BYTE);
> +
> +		/* Enable ECC_STATUS */
> +		nfc_set(mtd, NFC_FLASH_CONFIG, CONFIG_ECC_SRAM_REQ_BIT);
> +
> +	}
> +
>  	/* second phase scan */
>  	if (nand_scan_tail(mtd)) {
>  		err = -ENXIO;


--
Stefan
Bill Pringlemeir Sept. 17, 2014, 6:06 p.m. UTC | #2
On 17 Sep 2014, stefan@agner.ch wrote:

> On one of our Colibri VF61 modules I discovered an issue using this
> driver:

> [    0.758327] ECC failed to correct all errors (ebd9fd80)
> [    0.767005] ECC failed to correct all errors (ebd9fd80)
> [    0.775525] ECC failed to correct all errors (ebd9fd80)
> [    0.784004] ECC failed to correct all errors (ebd9fd80)
> [    0.791938] UBI error: ubi_io_read: error -74 (ECC error) while
> reading 2048 bytes from PEB 28:2048, 
> read 2048 bytes

> That page supposed to be empty, and I got several of this messages.
> Hence I did not believe that they have really ECC errors, so I digged
> a bit deeper:

>>> -673,11 +673,13 @@ static int fsl_nfc_check_ecc_status(struct
> mtd_info *mtd, u_char *dat)
> return ecc_count;

> /* If 'ecc_count' zero or less then buffer is all 0xff or
> erased. */
> -       flip = count_written_bits(dat, chip->ecc.size, ecc_count);
> +       flip = count_written_bits(dat, chip->ecc.size, 100);

> if (flip > ecc_count) {
> -               printk("ECC failed to correct all errors (%08x)\n",
> ecc_status);
> +               printk("ECC failed to correct all errors (%08x, flips
> %d)\n",
> +                               ecc_status, flip);
> return -1;
> }

> [    0.758624] ECC failed to correct all errors (ebdded80, flips 1)
> [    0.768123] ECC failed to correct all errors (ebdded80, flips 1)
> [    0.777468] ECC failed to correct all errors (ebdded80, flips 1)
> [    0.786773] ECC failed to correct all errors (ebdded80, flips 1)
> [    0.795535] UBI error: ubi_io_read: error -74 (ECC error) while
> reading 2048 bytes from PEB 28:2048, read 2048 bytes

[snip]

> Interesting thought, the returned ecc_count is exactly one below the
> actual flipped bytes counted...
>
> One comment below regarding this...

[snip]

>> +
>> +	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
>> +	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);

> Also, I could not find that the reference manual states that ecc_count
> represents the amount of flipped byte in a empty page. Is this given
> by the ECC algorithm?

I was using this information.

   Table 31-18. ECC Status Word Field Definition

   7            0 Page has been successfully corrected
   CORFAIL      1 Page is uncorrectable

   5–0          Number of errors that have been corrected in this page
   ERROR_COUNT

This is from the Vybrid RM, but the MPC5125RM has the same information.

I have definitely tested the detection of 'erased pages'.  However, I
don't know that I ever had actual bit flips.

The ECC controller has no idea whether the page is empty or not.  Are
you saying you have an erased page with bit flips?  I have definitely
not tested this.

>> +	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
>> +	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);

There are two issues.

 1) erased page with some physical flipped bits (zero).
 2) erased page were controller flipped some bits.

Currently, the code is only handling case 2 (that is what the controller
counts).  I believe that your physical page has actual 'stuck at zero'
bits.  The current driver gives up.  If you want to handle this then we
should replace the lines,

> +	/* ECC failed. */
> +	if (flip > ecc_count)
> +		return -1;

With something like,

 +	/* Not completely empty. */
 +	if (flip > ecc_count) {
 +           re_read_page_w_ecc_off();
 +           if (count_written_bits() > strength) /* strength/2 ? */
 +		return -1;
 +      }

There is also a discussion on this here,

http://lists.infradead.org/pipermail/linux-mtd/2014-March/052507.html
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052508.html
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052509.html
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052526.html
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052527.html
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052619.html
... etc.  Use the view thread.

Especially,
http://lists.infradead.org/pipermail/linux-mtd/2014-March/052647.html

Here Brian says that 3b applies to SLC NAND.  I guess this is you :).

It is fairly common to read an erased page.  Doing 'raw reads' all the
time on an erased page will slow the file system.  However, doing
re-reads for an erased page with bit flips is probably fairly uncommon.
A flash in this state maybe near EOL or the sector was bad from the
factory but not marked so.

I am chasing an DDR2 issue on another CPU and haven't had much more time
to the look at the Vybrid nor follow the MTD mailing list.  I don't know
if the 'NAND_ECC_NEED_CHECK_FF' feature has been merged?  It may also
solve the issue.

Regards,
Bill Pringlemeir.
Stefan Agner Sept. 17, 2014, 8:08 p.m. UTC | #3
Am 2014-09-17 20:06, schrieb Bill Pringlemeir:
> On 17 Sep 2014, stefan@agner.ch wrote:
> 
>> On one of our Colibri VF61 modules I discovered an issue using this
>> driver:
> 
>> [    0.758327] ECC failed to correct all errors (ebd9fd80)
>> [    0.767005] ECC failed to correct all errors (ebd9fd80)
>> [    0.775525] ECC failed to correct all errors (ebd9fd80)
>> [    0.784004] ECC failed to correct all errors (ebd9fd80)
>> [    0.791938] UBI error: ubi_io_read: error -74 (ECC error) while
>> reading 2048 bytes from PEB 28:2048,
>> read 2048 bytes
> 
>> That page supposed to be empty, and I got several of this messages.
>> Hence I did not believe that they have really ECC errors, so I digged
>> a bit deeper:
> 
>>>> -673,11 +673,13 @@ static int fsl_nfc_check_ecc_status(struct
>> mtd_info *mtd, u_char *dat)
>> return ecc_count;
> 
>> /* If 'ecc_count' zero or less then buffer is all 0xff or
>> erased. */
>> -       flip = count_written_bits(dat, chip->ecc.size, ecc_count);
>> +       flip = count_written_bits(dat, chip->ecc.size, 100);
> 
>> if (flip > ecc_count) {
>> -               printk("ECC failed to correct all errors (%08x)\n",
>> ecc_status);
>> +               printk("ECC failed to correct all errors (%08x, flips
>> %d)\n",
>> +                               ecc_status, flip);
>> return -1;
>> }
> 
>> [    0.758624] ECC failed to correct all errors (ebdded80, flips 1)
>> [    0.768123] ECC failed to correct all errors (ebdded80, flips 1)
>> [    0.777468] ECC failed to correct all errors (ebdded80, flips 1)
>> [    0.786773] ECC failed to correct all errors (ebdded80, flips 1)
>> [    0.795535] UBI error: ubi_io_read: error -74 (ECC error) while
>> reading 2048 bytes from PEB 28:2048, read 2048 bytes
> 
> [snip]
> 
>> Interesting thought, the returned ecc_count is exactly one below the
>> actual flipped bytes counted...
>>
>> One comment below regarding this...
> 
> [snip]
> 
>>> +
>>> +	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
>>> +	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);
> 
>> Also, I could not find that the reference manual states that ecc_count
>> represents the amount of flipped byte in a empty page. Is this given
>> by the ECC algorithm?
> 
> I was using this information.
> 
>    Table 31-18. ECC Status Word Field Definition
> 
>    7            0 Page has been successfully corrected
>    CORFAIL      1 Page is uncorrectable
> 
>    5–0          Number of errors that have been corrected in this page
>    ERROR_COUNT
> 
> This is from the Vybrid RM, but the MPC5125RM has the same information.
> 
> I have definitely tested the detection of 'erased pages'.  However, I
> don't know that I ever had actual bit flips.
> 
> The ECC controller has no idea whether the page is empty or not.  Are
> you saying you have an erased page with bit flips?  I have definitely
> not tested this.

Yes, that's what it looks like. Note that this output is on first boot
after flashing the device (with the UBI auto-grow option). I guess UBI
is reading all pages once to verify that they are empty.

>>> +	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
>>> +	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);
> 
> There are two issues.
> 
>  1) erased page with some physical flipped bits (zero).
>  2) erased page were controller flipped some bits.

I did not know that issue 2 exists. How can 2 happen on a erased page?
With a 'stuck at zero' in OOB?

> Currently, the code is only handling case 2 (that is what the controller
> counts).  I believe that your physical page has actual 'stuck at zero'
> bits.  The current driver gives up.  If you want to handle this then we
> should replace the lines,
> 

I assumed that your code assumes that the controller returns the flipped
bit count when reading an erase page (case 1), which I did not found in
the documentation. 

>> +	/* ECC failed. */
>> +	if (flip > ecc_count)
>> +		return -1;
> 
> With something like,
> 
>  +	/* Not completely empty. */
>  +	if (flip > ecc_count) {
>  +           re_read_page_w_ecc_off();
>  +           if (count_written_bits() > strength) /* strength/2 ? */
>  +		return -1;
>  +      }
> 
> There is also a discussion on this here,
> 
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052507.html
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052508.html
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052509.html
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052526.html
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052527.html
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052619.html
> ... etc.  Use the view thread.
> 
> Especially,
> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052647.html
> 
> Here Brian says that 3b applies to SLC NAND.  I guess this is you :).

Yes, we are using Macronix SLC NAND.

> 
> It is fairly common to read an erased page.  Doing 'raw reads' all the
> time on an erased page will slow the file system.  However, doing
> re-reads for an erased page with bit flips is probably fairly uncommon.
> A flash in this state maybe near EOL or the sector was bad from the
> factory but not marked so.

This is a new device, but its one out of several dozens. The device had
two factory marked bad page. This four page would then be 6 bad pages. I
would say that your guess is probably the case at hand (should be
considered bad, but were marked by factory).  

> 
> I am chasing an DDR2 issue on another CPU and haven't had much more time
> to the look at the Vybrid nor follow the MTD mailing list.  I don't know
> if the 'NAND_ECC_NEED_CHECK_FF' feature has been merged?  It may also
> solve the issue.

A quick search through 3.17-rc3 didn't found that string.

I will read the pages using raw again and check how many bit flips it
shows. But I guess regarding Brian's comment to 3b, the driver actually
behaves correct and return -1, because we have bit flips on erased page
which is not good...

UBI starts to scrub & torture those 4 pages then, but starts doing this
with other pages too. All the pages start to fail then! I'm still
investigating that issue.

[   30.901345] UBI: scrubbed PEB 1436 (LEB 0:356), data moved to PEB
3965
[   31.100247] UBI: run torture test for PEB 1436
[   31.280845] UBI error: torture_peb: read problems on freshly erased
PEB 1436, must be bad
[   31.300656] UBI error: erase_worker: failed to erase PEB 1436, error
-5
[   31.312626] UBI: mark PEB 1436 as bad
[   31.338216] UBI: 12 PEBs left in the reserve
[   31.519044] UBI: scrubbed PEB 1390 (LEB 0:313), data moved to PEB
3963
[   31.697812] UBI: run torture test for PEB 1390
[   31.880470] UBI error: torture_peb: read problems on freshly erased
PEB 1390, must be bad
[   31.898220] UBI error: erase_worker: failed to erase PEB 1390, error
-5
[   31.909687] UBI: mark PEB 1390 as bad
[   31.931620] UBI: 11 PEBs left in the reserve
[   32.170599] UBI: scrubbed PEB 1470 (LEB 0:389), data moved to PEB
3961
[   32.344564] UBI: run torture test for PEB 1470
[   32.522842] UBI error: torture_peb: read problems on freshly erased
PEB 1470, must be bad
[   32.540587] UBI error: erase_worker: failed to erase PEB 1470, error
-5
[   32.552060] UBI: mark PEB 1470 as bad
Bill Pringlemeir Sept. 17, 2014, 10:21 p.m. UTC | #4
On 17 Sep 2014, stefan@agner.ch wrote:

> Am 2014-09-17 20:06, schrieb Bill Pringlemeir:
>> On 17 Sep 2014, stefan@agner.ch wrote:

>>> On one of our Colibri VF61 modules I discovered an issue using this
>>> driver:

>>> [    0.758327] ECC failed to correct all errors (ebd9fd80)
>>> [    0.767005] ECC failed to correct all errors (ebd9fd80)
>>> [    0.775525] ECC failed to correct all errors (ebd9fd80)
>>> [    0.784004] ECC failed to correct all errors (ebd9fd80)
>>> [    0.791938] UBI error: ubi_io_read: error -74 (ECC error) while
>>> reading 2048 bytes from PEB 28:2048,
>>> read 2048 bytes

[snip]

>>> Also, I could not find that the reference manual states that
>>> ecc_count represents the amount of flipped byte in a empty page. Is
>>> this given by the ECC algorithm?

>> I was using this information.
>>
>> Table 31-18. ECC Status Word Field Definition
>>
>> 7            0 Page has been successfully corrected
>> CORFAIL      1 Page is uncorrectable
>>
>> 5–0          Number of errors that have been corrected in this page
>> ERROR_COUNT
>>
>> This is from the Vybrid RM, but the MPC5125RM has the same
>> information.
>>
>> I have definitely tested the detection of 'erased pages'.  However, I
>> don't know that I ever had actual bit flips.

>> The ECC controller has no idea whether the page is empty or not.  Are
>> you saying you have an erased page with bit flips?  I have definitely
>> not tested this.

> Yes, that's what it looks like. Note that this output is on first boot
> after flashing the device (with the UBI auto-grow option). I guess UBI
> is reading all pages once to verify that they are empty.

>>>> + /* If 'ecc_count' zero or less then buffer is all 0xff or
>>>> erased. */ + flip = count_written_bits(dat, nfc->chip.ecc.size,
>>>> ecc_count);
>>
>> There are two issues.
>>
>> 1) erased page with some physical flipped bits (zero).
>> 2) erased page were controller flipped some bits.

> I did not know that issue 2 exists. How can 2 happen on a erased page?
> With a 'stuck at zero' in OOB?

Issue 2 is always an issue when reading an erased page with hardware
ECC.  So we have,

                    Main         Spare
    Good program:   Data    OOB-data+HW-ECC
    Good erased:    FF      FF  FF
    Stuck erased1:  FF      FF  xx
    Stuck erased2:  xx      xx  FF

For the 'good erased' the controller goes about like it is case 'good
program'.  It reads a sector (raw read or software ECC case) and a the
same time, the error corrector is running.  This examines the 'OOB
data', which is 0xff for the erased sector and it blindly begins to
apply the error correction to the main data section.  The 'correction'
for an erased block will always flip '1' to '0'.  So for 'good erased',
the 'error_count' will equal the flipped bits.

Now, your sectors aren't that bad.  I believe you only have one bit
error on an erase.  However, it is really hairy to know what the ECC
logic will do when you have bit flips in the ECC code.  It is also next
to impossible to examine the buffer and know if zero bits are due to the
ECC engine flipping the bits and/or the actual physical flash having the
stuck bit.  It appears that the ECC engine runs from start to finish.
So most of the ECC corrections should be at the start; use this info for
diagnostics, but not coding!

The best course IMHO is to do what Brian (and other) did and do a 'raw'
read, so you take the ECC engine out of the picture and stop it from
flipping bits.  You can experiment with always doing the raw read, but I
believe you will see a decrease in performance (as others mentioned and
I saw).

>> Currently, the code is only handling case 2 (that is what the
>> controller counts).  I believe that your physical page has actual
>> 'stuck at zero' bits.  The current driver gives up.  If you want to
>> handle this then we should replace the lines,

> I assumed that your code assumes that the controller returns the
> flipped bit count when reading an erase page (case 1), which I did not
> found in the documentation.

>>> +	/* ECC failed. */
>>> +	if (flip > ecc_count)
>>> +		return -1;

>> With something like,
>>
>> +	/* Not completely empty. */
>> +	if (flip > ecc_count) {
>> +           re_read_page_w_ecc_off();
>> +           if (count_written_bits() > strength) /* strength/2 ? */
>> +		return -1;
>> +      }
>>
>> There is also a discussion on this here,
>>
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052507.html
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052508.html
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052509.html
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052526.html
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052527.html
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052619.html
>> ... etc.  Use the view thread.
>>
>> Especially,
>> http://lists.infradead.org/pipermail/linux-mtd/2014-March/052647.html
>>
>> Here Brian says that 3b applies to SLC NAND.  I guess this is you :).

> Yes, we are using Macronix SLC NAND.

Well, it is everyone.  Sorry, I made the assumption this wouldn't
happen, but apparently it does.

>> It is fairly common to read an erased page.  Doing 'raw reads' all
>> the time on an erased page will slow the file system.  However, doing
>> re-reads for an erased page with bit flips is probably fairly
>> uncommon.  A flash in this state maybe near EOL or the sector was bad
>> from the factory but not marked so.

> This is a new device, but its one out of several dozens. The device
> had two factory marked bad page. This four page would then be 6 bad
> pages. I would say that your guess is probably the case at hand
> (should be considered bad, but were marked by factory).

>> I am chasing an DDR2 issue on another CPU and haven't had much more
>> time to the look at the Vybrid nor follow the MTD mailing list.  I
>> don't know if the 'NAND_ECC_NEED_CHECK_FF' feature has been merged?
>> It may also solve the issue.
>
> A quick search through 3.17-rc3 didn't found that string.

> I will read the pages using raw again and check how many bit flips it
> shows. But I guess regarding Brian's comment to 3b, the driver
> actually behaves correct and return -1, because we have bit flips on
> erased page which is not good...

Hmm.  No, it should be ok; I think Brian meant we should accept and
expect this.  We had this conversation after the RFC.  However, we
should probably only accept strength/2 zero values at most; I believe
what to accept was kind of hanging in that conversation.  If the zeros
are below our threshold with a raw read, then we can report the page is
all '\xff' and return a 'bit flip' count as the number of zeros found in
a raw read.  Also, in the 'all FFs' case we should probably memset the
OOB data.  I see that other patches mentioned in the MTD threads above
do this.

 +	/* Not completely empty. */
 +	if (flip > ecc_count) {
 +           re_read_page_w_ecc_off();
 +           ecc_count = count_zero_bits();
 +           if (ecc_count > strength/2)
 +		return -1;
 +      } else
 +          ecc_count = 0;  /* 'normal' case all ff */
 +
 +	/* Erased page. */
 +	memset(dat, 0xff, nfc->chip.ecc.size);
 +	memset(oob, 0xff, ???);  /* ??? */
 +	return ecc_count;

I don't think that UBI/UbiFs use the OOB data at all, but that is
something that I don't think is completely correct in this path?

> UBI starts to scrub & torture those 4 pages then, but starts doing
> this with other pages too. All the pages start to fail then! I'm still
> investigating that issue.

Arg.  I am not quite sure.  It seems ok to have a few 'stuck at one'
errors.  As you move closer to the ECC strength this seems a little
crazy.  At some point the sector/erase block is becoming useless.  I am
not sure if this is an issue with this device; it seems a little
suspect.  Maybe you could code something that emulates the stuck bits on
a good device and see if it still behaves the same; triggers a software
condition.  You might also reduce the NFC bus clock just to see on the
'stuck zero' device.

Hth,
Bill Pringlemeir.
diff mbox

Patch

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 7e0c695..8ac9923 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -458,8 +458,9 @@  config MTD_NAND_FSL_NFC
 	help
 	  Enables support for NAND Flash controller on some Freescale
 	  processors like the MPC5125, VF610, MCF54418, and Kinetis K70.
-	  The driver supports a maximum 2k page size.
-	  The driver currently does not support hardware ECC.
+	  The driver supports a maximum 2k page size.  With 2k pages and
+	  64 bytes or more of OOB, hardware ECC with 24bit correction is
+	  supported.  Only MTD_OF (device tree) can enable the hardware ECC.
 
 config MTD_NAND_MXC
 	tristate "MXC NAND support"
diff --git a/drivers/mtd/nand/fsl_nfc.c b/drivers/mtd/nand/fsl_nfc.c
index eb4e353..703511d 100644
--- a/drivers/mtd/nand/fsl_nfc.c
+++ b/drivers/mtd/nand/fsl_nfc.c
@@ -17,6 +17,7 @@ 
  * - Untested on MPC5125 and M54418.
  * - DMA not used.
  * - 2K pages or less.
+ * - Only 2K page w. 64+OOB and hardware ECC.
  */
 
 #include <linux/module.h>
@@ -68,6 +69,7 @@ 
 
 /* NFC ECC mode define */
 #define ECC_BYPASS			0
+#define ECC_45_BYTE			6
 
 /*** Register Mask and bit definitions */
 
@@ -100,6 +102,8 @@ 
 #define STATUS_BYTE1_MASK			0x000000FF
 
 /* NFC_FLASH_CONFIG Field */
+#define CONFIG_ECC_SRAM_ADDR_MASK		0x7FC00000
+#define CONFIG_ECC_SRAM_ADDR_SHIFT		22
 #define CONFIG_ECC_SRAM_REQ_BIT			(1<<21)
 #define CONFIG_DMA_REQ_BIT			(1<<20)
 #define CONFIG_ECC_MODE_MASK			0x000E0000
@@ -120,6 +124,11 @@ 
 
 #define NFC_TIMEOUT		(HZ)
 
+/* ECC status placed at end of buffers. */
+#define ECC_SRAM_ADDR	((PAGE_2K+256-8) >> 3)
+#define ECC_STATUS_MASK	0x80
+#define ECC_ERR_COUNT	0x3F
+
 struct fsl_nfc {
 	struct mtd_info	   mtd;
 	struct nand_chip   chip;
@@ -160,6 +169,19 @@  static struct nand_bbt_descr bbt_mirror_descr = {
 	.pattern = mirror_pattern,
 };
 
+static struct nand_ecclayout nfc_ecc45 = {
+	.eccbytes = 45,
+	.eccpos = {19, 20, 21, 22, 23,
+		   24, 25, 26, 27, 28, 29, 30, 31,
+		   32, 33, 34, 35, 36, 37, 38, 39,
+		   40, 41, 42, 43, 44, 45, 46, 47,
+		   48, 49, 50, 51, 52, 53, 54, 55,
+		   56, 57, 58, 59, 60, 61, 62, 63},
+	.oobfree = {
+		{.offset = 8,
+		 .length = 11} }
+};
+
 static u32 nfc_read(struct mtd_info *mtd, uint reg)
 {
 	struct fsl_nfc *nfc = mtd_to_nfc(mtd);
@@ -458,7 +480,61 @@  nfc_select_chip(struct mtd_info *mtd, int chip)
 #endif
 }
 
+/* Count the number of 0's in buff upto max_bits */
+static int count_written_bits(uint8_t *buff, int size, int max_bits)
+{
+	int k, written_bits = 0;
+
+	for (k = 0; k < size; k++) {
+		written_bits += hweight8(~buff[k]);
+		if (written_bits > max_bits)
+			break;
+	}
+
+	return written_bits;
+}
+
+static int nfc_correct_data(struct mtd_info *mtd, u_char *dat,
+				 u_char *read_ecc, u_char *calc_ecc)
+{
+	struct fsl_nfc *nfc = mtd_to_nfc(mtd);
+	u32 ecc_status;
+	u8 ecc_count;
+	int flip;
+
+	/* Errata: ECC status is stored at NFC_CFG[ECCADD] +4 for
+	   little-endian and +7 for big-endian SOC.  Access as 32 bits
+	   and use low byte.
+	*/
+	ecc_status = __raw_readl(nfc->regs + ECC_SRAM_ADDR * 8 + 4);
+	ecc_count = ecc_status & ECC_ERR_COUNT;
+	if (!(ecc_status & ECC_STATUS_MASK))
+		return ecc_count;
+
+	/* If 'ecc_count' zero or less then buffer is all 0xff or erased. */
+	flip = count_written_bits(dat, nfc->chip.ecc.size, ecc_count);
+
+	/* ECC failed. */
+	if (flip > ecc_count)
+		return -1;
+
+	/* Erased page. */
+	memset(dat, 0xff, nfc->chip.ecc.size);
+	return 0;
+}
+
+static int nfc_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
+				  u_char *ecc_code)
+{
+	return 0;
+}
+
+static void nfc_enable_hwecc(struct mtd_info *mtd, int mode)
+{
+}
+
 struct nfc_config {
+	int hardware_ecc;
 	int width;
 	int flash_bbt;
 	u32 clkrate;
@@ -483,6 +559,9 @@  static int __init nfc_probe_dt(struct device *dev, struct nfc_config *cfg)
 	if (!np)
 		return 1;
 
+	if (of_get_nand_ecc_mode(np) >= NAND_ECC_HW)
+		cfg->hardware_ecc = 1;
+
 	cfg->flash_bbt = of_get_nand_on_flash_bbt(np);
 
 	if (!of_property_read_u32(np, "clock-frequency", &clkrate))
@@ -608,6 +687,11 @@  static int nfc_probe(struct platform_device *pdev)
 	nfc_set_field(mtd, NFC_FLASH_CONFIG, CONFIG_PAGE_CNT_MASK,
 			CONFIG_PAGE_CNT_SHIFT, 1);
 
+	/* Set ECC_STATUS offset */
+	nfc_set_field(mtd, NFC_FLASH_CONFIG,
+		      CONFIG_ECC_SRAM_ADDR_MASK,
+		      CONFIG_ECC_SRAM_ADDR_SHIFT, ECC_SRAM_ADDR);
+
 	/* first scan to find the device and get the page size */
 	if (nand_scan_ident(mtd, 1, NULL)) {
 		err = -ENXIO;
@@ -627,6 +711,36 @@  static int nfc_probe(struct platform_device *pdev)
 	page_sz += cfg.width == 16 ? 1 : 0;
 	nfc_write(mtd, NFC_SECTOR_SIZE, page_sz);
 
+	if (cfg.hardware_ecc) {
+		if (mtd->writesize != PAGE_2K && mtd->oobsize < 64) {
+			dev_err(nfc->dev, "Unsupported flash with hwecc\n");
+			err = -ENXIO;
+			goto error;
+		}
+
+		chip->ecc.layout = &nfc_ecc45;
+
+		/* propagate ecc.layout to mtd_info */
+		mtd->ecclayout = chip->ecc.layout;
+		chip->ecc.calculate = nfc_calculate_ecc;
+		chip->ecc.hwctl = nfc_enable_hwecc;
+		chip->ecc.correct = nfc_correct_data;
+		chip->ecc.mode = NAND_ECC_HW;
+
+		chip->ecc.bytes = 45;
+		chip->ecc.size = PAGE_2K;
+		chip->ecc.strength = 24;
+
+		/* set ECC mode to 45 bytes OOB with 24 bits correction */
+		nfc_set_field(mtd, NFC_FLASH_CONFIG,
+				CONFIG_ECC_MODE_MASK,
+				CONFIG_ECC_MODE_SHIFT, ECC_45_BYTE);
+
+		/* Enable ECC_STATUS */
+		nfc_set(mtd, NFC_FLASH_CONFIG, CONFIG_ECC_SRAM_REQ_BIT);
+
+	}
+
 	/* second phase scan */
 	if (nand_scan_tail(mtd)) {
 		err = -ENXIO;